medkit.text.preprocessing.char_rules
medkit.text.preprocessing.char_rules
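Data:
| Name | Description |
|---|---|
| ALL_CHAR_RULES | All pre-defined rules for CharReplacer |
| DOT_RULES | Rules for dot chars |
| FRACTION_RULES | Rules for fraction characters |
| LIGATURE_RULES | Rules for ligatures |
| QUOTATION_RULES | replace double and single quotation marks |
| SIGN_RULES | Rules for sign chars |
| SPACE_RULES | Rules for non-standard spaces |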
- ALL_CHAR_RULES = [('…', '...'), ('⋯', '...'), ('¼', '1/4'), ('½', '1/2'), ('¾', '3/4'), ('⅐', '1/7'), ('⅑', '1/9'), ('⅒', '1/10'), ('⅓', '1/3'), ('⅔', '2/3'), ('⅕', '1/5'), ('⅖', '2/5'), ('⅗', '3/5'), ('⅘', '4/5'), ('⅙', '1/6'), ('⅚', '5/6'), ('⅛', '1/8'), ('⅜', '3/8'), ('⅝', '5/8'), ('⅞', '7/8'), ('↉', '0/3'), ('Æ', 'AE'), ('æ', 'ae'), ('Œ', 'OE'), ('œ', 'oe'), ('»', '"'), ('«', '"'), ('“', '"'), ('”', '"'), ('„', '"'), ('‟', '"'), ('‹', '"'), ('›', '"'), ('˵', '"'), ('˶', '"'), ('˝', '"'), ('"', '"'), ('‚', ''), ('‘', "'"), ('’', "'"), ('‛', "'"), ('ˊ', "'"), ('`', "'"), ('ˋ', "'"), ('´', "'"), ('©', ''), ('®', ''), ('™', ''), ('\xa0', ' '), ('\u1680', ' '), ('\u2002', ' '), ('\u2003', ' '), ('\u2004', ' '), ('\u2005', ' '), ('\u2006', ' '), ('\u2007', ' '), ('\u2008', ' '), ('\u2009', ' '), ('\u200a', ' '), ('\u200b', ' '), ('\u202f', ' '), ('\u205f', ' '), ('␠', ' '), ('\u3000', ' '), ('〿', ' '), ('\ufeff', ' ')]#
All pre-defined rules for CharReplacer
- LIGATURE_RULES = [('Æ', 'AE'), ('æ', 'ae'), ('Œ', 'OE'), ('œ', 'oe')]#
Rules for ligatures
- FRACTION_RULES = [('¼', '1/4'), ('½', '1/2'), ('¾', '3/4'), ('⅐', '1/7'), ('⅑', '1/9'), ('⅒', '1/10'), ('⅓', '1/3'), ('⅔', '2/3'), ('⅕', '1/5'), ('⅖', '2/5'), ('⅗', '3/5'), ('⅘', '4/5'), ('⅙', '1/6'), ('⅚', '5/6'), ('⅛', '1/8'), ('⅜', '3/8'), ('⅝', '5/8'), ('⅞', '7/8'), ('↉', '0/3')]#
Rules for fraction characters
- SIGN_RULES = [('©', ''), ('®', ''), ('™', '')]#
Rules for sign chars
- SPACE_RULES = [('\xa0', ' '), ('\u1680', ' '), ('\u2002', ' '), ('\u2003', ' '), ('\u2004', ' '), ('\u2005', ' '), ('\u2006', ' '), ('\u2007', ' '), ('\u2008', ' '), ('\u2009', ' '), ('\u200a', ' '), ('\u200b', ' '), ('\u202f', ' '), ('\u205f', ' '), ('␠', ' '), ('\u3000', ' '), ('〿', ' '), ('\ufeff', ' ')]#
Rules for non-standard spaces
- DOT_RULES = [('…', '...'), ('⋯', '...')]#
Rules for dot chars
- QUOTATION_RULES = [('»', '"'), ('«', '"'), ('“', '"'), ('”', '"'), ('„', '"'), ('‟', '"'), ('‹', '"'), ('›', '"'), ('˵', '"'), ('˶', '"'), ('˝', '"'), ('"', '"'), ('‚', ''), ('‘', "'"), ('’', "'"), ('‛', "'"), ('ˊ', "'"), ('`', "'"), ('ˋ', "'"), ('´', "'")]#
RegexpReplacer quotation marks: replace double and single quotation marks