Creating a custom text operation#

If you want to initialize a custom text operation from a simple user-defined function, you can take a look at the following examples.

Note

For more details about public APIs, refer to create_text_operation().

Filtering annotations#

In this example, Jane wants to detect some entities (problems) from a raw text.

1. Create medkit document#

from medkit.core.text import TextDocument

text = "The patient has asthma and is using ventoline. The patient has diabetes"
doc = TextDocument(text=text)

2. Init medkit operations#

Jane would like to reuse a colleague’s file containing a list of regular expression rules for detecting entities. For this purpose, she has to split the text into sentences before using the RegexpMatcher component.

from medkit.text.segmentation import SentenceTokenizer

sentence_tokenizer = SentenceTokenizer()

In real life, Jane should load the rules from a path using this instruction:

regexp_rules = RegexpMatcher.load_rules(path_to_rules_file)

But for this example, it is simpler for us to define this set of rules manually.

from medkit.text.ner import RegexpMatcher, RegexpMatcherRule

regexp_rules = [
       RegexpMatcherRule(regexp=r"\basthma\b", label="problem"),
       RegexpMatcherRule(regexp=r"\bventoline\b", label="treatment"),
       RegexpMatcherRule(regexp=r"\bdiabetes\b", label="problem")
       ]
regexp_matcher = RegexpMatcher(rules=regexp_rules)

3. Define filter operation#

As RegexpMatcher is based on her colleague’s file, Jane would like to add a filter operation so that only entities which are problems will be returned.

For that, she has to define her own filter function and use medkit tools to instantiate this custom operation.

from medkit.core.text import Entity

def keep_entities_with_label_problem(entity):
    """Filter predicate: keep only entities labelled as a medical problem."""
    target_label = "problem"
    return target_label == entity.label

from medkit.core.text import CustomTextOpType, create_text_operation

filter_operation = create_text_operation(function=keep_entities_with_label_problem, function_type=CustomTextOpType.FILTER)

# Same behavior as
# filter_operation = create_text_operation(
#   name="keep_entities_with_label_problem",
#   function=keep_entities_with_label_problem,
#   function_type=CustomTextOpType.FILTER)

4. Construct and run the pipeline#

from medkit.core import Pipeline, PipelineStep

steps=[
    PipelineStep(input_keys=["raw_text"], output_keys=["sentences"], operation=sentence_tokenizer),
    PipelineStep(input_keys=["sentences"], output_keys=["entities"], operation=regexp_matcher),
    PipelineStep(input_keys=["entities"], output_keys=["problems"], operation=filter_operation)
]

pipeline = Pipeline(
       steps=steps,
       input_keys=["raw_text"],
       output_keys=["problems"]
)

entities = pipeline.run([doc.raw_segment])

for entity in entities:
    print(entity)
Entity(uid='ad51b838-8e23-11ee-b699-0242ac110002', label='problem', attrs=EntityAttributeContainer(ann_id='ad51b838-8e23-11ee-b699-0242ac110002', attrs=[]), metadata={'rule_id': 0, 'version': None}, keys={'problems'}, spans=[Span(start=16, end=22)], text='asthma')
Entity(uid='ad51bf0e-8e23-11ee-b699-0242ac110002', label='problem', attrs=EntityAttributeContainer(ann_id='ad51bf0e-8e23-11ee-b699-0242ac110002', attrs=[]), metadata={'rule_id': 2, 'version': None}, keys={'problems'}, spans=[Span(start=63, end=71)], text='diabetes')

In this scenario, 2 entities with problem label are returned.

To compare with the intermediate results generated by RegexpMatcher, we’ll use the entities intermediate key. There are 3 results.

IMPORTANT: the following code is only for demo purpose, all pipeline steps are executed, we just select what pipeline outputs

pipeline = Pipeline(
    steps=steps,
    input_keys=["raw_text"],
    output_keys=["entities"]
)

entities = pipeline.run([doc.raw_segment])

for entity in entities:
    print(entity)
Entity(uid='ad535a08-8e23-11ee-b699-0242ac110002', label='problem', attrs=EntityAttributeContainer(ann_id='ad535a08-8e23-11ee-b699-0242ac110002', attrs=[]), metadata={'rule_id': 0, 'version': None}, keys={'entities'}, spans=[Span(start=16, end=22)], text='asthma')
Entity(uid='ad535cb0-8e23-11ee-b699-0242ac110002', label='treatment', attrs=EntityAttributeContainer(ann_id='ad535cb0-8e23-11ee-b699-0242ac110002', attrs=[]), metadata={'rule_id': 1, 'version': None}, keys={'entities'}, spans=[Span(start=36, end=45)], text='ventoline')
Entity(uid='ad5360c0-8e23-11ee-b699-0242ac110002', label='problem', attrs=EntityAttributeContainer(ann_id='ad5360c0-8e23-11ee-b699-0242ac110002', attrs=[]), metadata={'rule_id': 2, 'version': None}, keys={'entities'}, spans=[Span(start=63, end=71)], text='diabetes')

Creating new annotations#

In this example, Jane wants to pre-process the text before detecting entities.

1. Create medkit document#

from medkit.core.text import TextDocument

text = """IRM : Lésion de la CPMI périphérique,
aspect distendu du LCA, kyste poplité."""

doc = TextDocument(text=text)

2. Define custom function#

Jane wants to use a dictionary to convert all abbreviations into their long text. To do so, she may define a custom function and use medkit span_utils to preserve spans during text modifications.

import re
from typing import Dict
from medkit.core.text import Segment, span_utils


# Providing the dictionary of abbreviation mapping
abbrv_mapping = {
    "IRM" : "Imagerie par Résonance Magnétique",
    "CPMI" : "Corne Postérieure du Ménisque Interne",
    "LCA" : "Ligament Croisé Antérieur",
}

# Defining custom function
def translate_abbreviations(segment, abbrv_mapping):
    """Return a new segment where abbreviations are expanded to their long form.

    Parameters
    ----------
    segment:
        Input segment whose text may contain abbreviations.
    abbrv_mapping:
        Mapping from abbreviation (as found in the text) to its long form.

    Returns
    -------
    Segment
        A segment labelled "long_text" whose text has every occurrence of an
        abbreviation replaced, with spans preserved through the modification.
    """
    ranges = []
    replacement_texts = []

    # Escape each abbreviation so that regex metacharacters in a key
    # (e.g. "." or "+") are matched literally instead of being interpreted
    # as regex syntax, which would corrupt the alternation pattern.
    regexp = "|".join(re.escape(abbrv) for abbrv in abbrv_mapping)

    # Detect abbreviations
    for mo in re.finditer(regexp, segment.text):
        ranges.append([mo.start(), mo.end()])
        replacement_texts.append(abbrv_mapping[mo.group()])

    # Replace abbreviations by their long text (preserving spans)
    text, spans = span_utils.replace(
        text=segment.text,
        spans=segment.spans,
        ranges=ranges,
        replacement_texts=replacement_texts,
    )

    return Segment(label="long_text", text=text, spans=spans)


from medkit.core.text import CustomTextOpType, create_text_operation

# Create the medkit operation from our custom function
preprocessing_operation = create_text_operation(
    function=translate_abbreviations,
    function_type=CustomTextOpType.CREATE_ONE_TO_N,
    name="translate_abbreviations",
    args={"abbrv_mapping":abbrv_mapping}
)

3. Run the operation#

After executing the operation on the document raw text, we can observe that the output segment is composed of:

  • a text with abbreviations replaced by their long text,

  • spans, which are a mix of modified spans (for replaced parts of the text) and original spans (for unmodified text).

segments = preprocessing_operation.run([doc.raw_segment])

for segment in segments:
    print(f"Text: {segment.text}\n")
    print(f"Spans:")
    for span in segment.spans:
        print(f"- {span}")
Text: Imagerie par Résonance Magnétique : Lésion de la Corne Postérieure du Ménisque Interne périphérique,
aspect distendu du Ligament Croisé Antérieur, kyste poplité.

Spans:
- ModifiedSpan(length=33, replaced_spans=[Span(start=0, end=3)])
- Span(start=3, end=19)
- ModifiedSpan(length=37, replaced_spans=[Span(start=19, end=23)])
- Span(start=23, end=57)
- ModifiedSpan(length=25, replaced_spans=[Span(start=57, end=60)])
- Span(start=60, end=76)

Extracting annotations#

In this example, Jane wants to count detected UMLS cui on a set of documents.

1. Loading text documents#

In this example, we use translated MTSamples documents. For more info, you may refer to medkit.tools.mtsamples.

from medkit.tools.mtsamples import load_mtsamples

docs = load_mtsamples(nb_max=10)

print(docs[0].text)
SUBJECTIF :, Cette femme blanche de 23 ans se présente avec une plainte d'allergies. Elle avait des allergies lorsqu'elle vivait à Seattle, mais elle pense qu'elles sont pires ici. Dans le passé, elle a essayé Claritin et Zyrtec. Les deux ont fonctionné pendant une courte période, mais ont ensuite semblé perdre de leur efficacité. Elle a également utilisé Allegra. Elle l'a utilisé l'été dernier et elle a recommencé à l'utiliser il y a deux semaines. Cela ne semble pas très bien fonctionner. Elle a utilisé des vaporisateurs en vente libre, mais pas de vaporisateurs nasaux sur ordonnance. Elle souffre d'asthme mais n'a pas besoin de médicaments quotidiens pour cela et ne pense pas qu'elle s'embrase.,MÉDICAMENTS : , Son seul médicament actuellement est Ortho Tri-Cyclen et l'Allegra.,ALLERGIES : , Elle n'a aucune allergie médicamenteuse connue., OBJECTIF :,Vitals : Le poids était de 130 livres et la tension artérielle de 124/78.,HEENT : Sa gorge était légèrement érythémateuse sans exsudat. La muqueuse nasale était érythémateuse et enflée. Seul un drainage clair a été observé. Les MT étaient claires.,Cou : Souple sans adénopathie.,Poumons : Clairs.,BILAN :, Rhinite allergique.,PLAN :,1. Elle essaiera à nouveau Zyrtec au lieu d'Allegra. Une autre option sera d'utiliser la loratadine. Elle ne pense pas avoir une couverture d'ordonnance, ce qui pourrait être moins cher.,2. Échantillons de Nasonex deux pulvérisations dans chaque narine pendant trois semaines. Une ordonnance a également été rédigée.

2. Init our operations#

Let’s initialize the same operations as above (i.e., sentence tokenizer, then regexp matcher with default rules) without the filter operation.

from medkit.text.segmentation import SentenceTokenizer

sentence_tokenizer = SentenceTokenizer()
from medkit.text.ner import RegexpMatcher

regexp_matcher = RegexpMatcher()

3. Defining an extraction function#

The extraction function is defined with a label parameter for filtering entities. Our custom operation allows retrieving only the attributes of entities with the disorder label.

import re
from typing import List
from medkit.core.text import Entity, UMLSNormAttribute

# Defining custom function for extracting umls normalization attributes from entity
def extract_umls_attributes_from_entity(entity, label):
    """Return the UMLS normalization attributes of *entity* if it has *label*.

    Parameters
    ----------
    entity:
        Entity whose normalization attributes are to be extracted.
    label:
        Only entities carrying this label are considered; others yield [].

    Returns
    -------
    list
        The UMLSNormAttribute instances attached to the entity, or an empty
        list when the entity's label does not match.
    """
    # The label check does not depend on the attribute being iterated,
    # so test it once up front instead of re-evaluating it per attribute.
    if entity.label != label:
        return []
    return [attr for attr in entity.attrs.get_norms() if isinstance(attr, UMLSNormAttribute)]


from medkit.core.text import CustomTextOpType, create_text_operation

attr_extraction_operation = create_text_operation(
    function=extract_umls_attributes_from_entity,
    function_type=CustomTextOpType.EXTRACT_ONE_TO_N,
    args={"label":'disorder'}
)

4. Defining and running our pipeline#

When running the pipeline on the set of documents, the output is a list of umls normalization attributes.

from medkit.core import Pipeline, PipelineStep

steps=[
    PipelineStep(input_keys=["raw_text"], output_keys=["sentences"], operation=sentence_tokenizer),
    PipelineStep(input_keys=["sentences"], output_keys=["entities"], operation=regexp_matcher),
    PipelineStep(input_keys=["entities"], output_keys=["umls_attributes"], operation=attr_extraction_operation),
]

pipeline = Pipeline(
       steps=steps,
       input_keys=["raw_text"],
       output_keys=["umls_attributes"] 
)
attrs = pipeline.run([doc.raw_segment for doc in docs])
attrs[:5]
[UMLSNormAttribute(label='NORMALIZATION', value='umls:C0024117', metadata={}, uid='aed6faa6-8e23-11ee-b699-0242ac110002', kb_name='umls', kb_id='C0024117', kb_version='202AB', term=None, score=None, sem_types=None),
 UMLSNormAttribute(label='NORMALIZATION', value='umls:C0024115', metadata={}, uid='aed6fbd2-8e23-11ee-b699-0242ac110002', kb_name='umls', kb_id='C0024115', kb_version='202AB', term=None, score=None, sem_types=None),
 UMLSNormAttribute(label='NORMALIZATION', value='umls:C0032285', metadata={}, uid='aed6fcd6-8e23-11ee-b699-0242ac110002', kb_name='umls', kb_id='C0032285', kb_version='202AB', term=None, score=None, sem_types=None),
 UMLSNormAttribute(label='NORMALIZATION', value='umls:C0020542', metadata={}, uid='aed6fdd0-8e23-11ee-b699-0242ac110002', kb_name='umls', kb_id='C0020542', kb_version='202AB', term=None, score=None, sem_types=None),
 UMLSNormAttribute(label='NORMALIZATION', value='umls:C0004096', metadata={}, uid='aed6fec0-8e23-11ee-b699-0242ac110002', kb_name='umls', kb_id='C0004096', kb_version='202AB', term=None, score=None, sem_types=None)]

5. Analyzing data#

Now, Jane can analyze the number of cuis detected on her set of documents.

import pandas as pd
df = pd.DataFrame.from_records([attr.to_dict() for attr in attrs], columns=["cui", "umls_version"])
print(df)
          cui umls_version
0    C0024117        202AB
1    C0024115        202AB
2    C0032285        202AB
3    C0020542        202AB
4    C0004096        202AB
..        ...          ...
111  C0027651        202AB
112  C1882062        202AB
113  C0006826        202AB
114  C1305855        202AB
115  C0497406        202AB

[116 rows x 2 columns]
df.value_counts(subset="cui")
cui
C0006266    9
C0004096    9
C0003811    8
C0004238    8
C0004239    8
C0034067    8
C0032285    6
C0264492    6
C0020542    6
C0024115    6
C0024117    6
C0497406    4
C1305855    3
C0349782    3
C0340288    3
C0027051    3
C0027651    2
C0038454    2
C0242339    2
C0020473    2
C0020443    2
C0018801    2
C0948008    2
C0006826    2
C1882062    2
C0028754    1
C0011849    1
dtype: int64