medkit.text.preprocessing.char_rules#

Data:

ALL_CHAR_RULES

All pre-defined rules for CharReplacer

DOT_RULES

Rules for dot chars

FRACTION_RULES

Rules for fraction characters

LIGATURE_RULES

Rules for ligatures

QUOTATION_RULES

Rules for replacing double and single quotation marks

SIGN_RULES

Rules for sign chars

SPACE_RULES

Rules for non-standard spaces

ALL_CHAR_RULES = [('…', '...'), ('⋯', '...'), ('¼', '1/4'), ('½', '1/2'), ('¾', '3/4'), ('⅐', '1/7'), ('⅑', '1/9'), ('⅒', '1/10'), ('⅓', '1/3'), ('⅔', '2/3'), ('⅕', '1/5'), ('⅖', '2/5'), ('⅗', '3/5'), ('⅘', '4/5'), ('⅙', '1/6'), ('⅚', '5/6'), ('⅛', '1/8'), ('⅜', '3/8'), ('⅝', '5/8'), ('⅞', '7/8'), ('↉', '0/3'), ('Æ', 'AE'), ('æ', 'ae'), ('Œ', 'OE'), ('œ', 'oe'), ('»', '"'), ('«', '"'), ('“', '"'), ('”', '"'), ('„', '"'), ('‟', '"'), ('‹', '"'), ('›', '"'), ('˵', '"'), ('˶', '"'), ('˝', '"'), ('"', '"'), ('‚', ''), ('‘', "'"), ('’', "'"), ('‛', "'"), ('ˊ', "'"), ('`', "'"), ('ˋ', "'"), ('´', "'"), ('©', ''), ('®', ''), ('™', ''), ('\xa0', ' '), ('\u1680', ' '), ('\u2002', ' '), ('\u2003', ' '), ('\u2004', ' '), ('\u2005', ' '), ('\u2006', ' '), ('\u2007', ' '), ('\u2008', ' '), ('\u2009', ' '), ('\u200a', ' '), ('\u200b', ' '), ('\u202f', ' '), ('\u205f', ' '), ('␠', ' '), ('\u3000', ' '), ('〿', ' '), ('\ufeff', ' ')]#

All pre-defined rules for CharReplacer

LIGATURE_RULES = [('Æ', 'AE'), ('æ', 'ae'), ('Œ', 'OE'), ('œ', 'oe')]#

Rules for ligatures

FRACTION_RULES = [('¼', '1/4'), ('½', '1/2'), ('¾', '3/4'), ('⅐', '1/7'), ('⅑', '1/9'), ('⅒', '1/10'), ('⅓', '1/3'), ('⅔', '2/3'), ('⅕', '1/5'), ('⅖', '2/5'), ('⅗', '3/5'), ('⅘', '4/5'), ('⅙', '1/6'), ('⅚', '5/6'), ('⅛', '1/8'), ('⅜', '3/8'), ('⅝', '5/8'), ('⅞', '7/8'), ('↉', '0/3')]#

Rules for fraction characters

SIGN_RULES = [('©', ''), ('®', ''), ('™', '')]#

Rules for sign chars

SPACE_RULES = [('\xa0', ' '), ('\u1680', ' '), ('\u2002', ' '), ('\u2003', ' '), ('\u2004', ' '), ('\u2005', ' '), ('\u2006', ' '), ('\u2007', ' '), ('\u2008', ' '), ('\u2009', ' '), ('\u200a', ' '), ('\u200b', ' '), ('\u202f', ' '), ('\u205f', ' '), ('␠', ' '), ('\u3000', ' '), ('〿', ' '), ('\ufeff', ' ')]#

Rules for non-standard spaces

DOT_RULES = [('…', '...'), ('⋯', '...')]#

Rules for dot chars

QUOTATION_RULES = [('»', '"'), ('«', '"'), ('“', '"'), ('”', '"'), ('„', '"'), ('‟', '"'), ('‹', '"'), ('›', '"'), ('˵', '"'), ('˶', '"'), ('˝', '"'), ('"', '"'), ('‚', ''), ('‘', "'"), ('’', "'"), ('‛', "'"), ('ˊ', "'"), ('`', "'"), ('ˋ', "'"), ('´', "'")]#

Rules for replacing double and single quotation marks

Type

RegexpReplacer quotation marks