I noticed a missing line that is supposed to call mergeIntervals(…). I also found what looks like a bug in it, so I rewrote my own version called better_mergeIntervals.
Entities are demarcated by text[start:end], but the spans can overlap and therefore need to be resolved/merged:
case 1: overlapping entities of the same type: replace the two [start, end] spans by their union.
case 2: overlapping entities of different types: this is ambiguous, and any resolution is somewhat arbitrary (a small usage sketch follows the function below).
def better_mergeIntervals(entities):
    entities = sorted(entities, key=lambda tup: tup[0])  # sort by 'start', the lower bound
    merged_entities = []
    for current_entity in entities:
        if not merged_entities:
            merged_entities.append(current_entity)  # the very first entity, no merging consideration is needed
        else:
            prev_entity = merged_entities[-1]
            if current_entity[0] <= prev_entity[1]:  # overlapped
                if prev_entity[2] == current_entity[2]:  # same type
                    upper_bound = max(prev_entity[1], current_entity[1])
                    merged_entities[-1] = (prev_entity[0], upper_bound, prev_entity[2])  # simple merge by union
                else:
                    if prev_entity[1] >= current_entity[1]:
                        pass  # current entity interval is a subset of the previous one's, so drop the current entity
                    else:
                        # union the spans and adopt the current entity's type (this rule is somewhat arbitrary)
                        merged_entities[-1] = (prev_entity[0], current_entity[1], current_entity[2])
            else:
                merged_entities.append(current_entity)
    return merged_entities
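For a quick sanity check, here is a minimal usage sketch; the spans and the 'SKILL'/'TOOL' labels are made up for illustration:

example_entities = [(0, 5, 'SKILL'), (3, 9, 'SKILL'),     # case 1: same type -> union of spans
                    (12, 20, 'TOOL'), (15, 25, 'SKILL')]  # case 2: different types -> union, current type wins
print(better_mergeIntervals(example_entities))
# expected: [(0, 9, 'SKILL'), (12, 25, 'SKILL')]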
import re
import pandas as pd
from tqdm import tqdm

def generate_ner_labels_from_dataset(sentences_ner_data):
    entity_tagged_sentences = []
    for data in tqdm(sentences_ner_data):  # loop through each sample
        text = data[0]
        entities = data[1]['entities']  # IMPORTANT: assume only 1 entry in the data[1] dict
        # entities = mergeIntervals(entities)  # cleanse bad labels, where entity intervals (start, end) can overlap
        entities = better_mergeIntervals(entities)
        # sort in descending order of 'start': modifying the text in-place shifts subsequent start/end offsets,
        # so working backwards from the end of the string keeps the earlier offsets valid.
        entities = sorted(entities, key=lambda tup: tup[0], reverse=True)
        for entity in entities:
            start, end, name = entity
            name = name.replace('_', '!').replace(' ', '_')  # remove spaces in the entity name (undone later)
            name = f'<*entity*>{name}</*entity*>'  # add an html-like tag so this is recognized as an entity name (not literal text)
            replacement = ' '.join([name] * len(text[start:end].split()))  # one tag per word in the original span
            text = text[:start] + replacement + text[end:]
        texts = text.split()

        # replace any non-entity word with 'Empty';
        # strip <*entity*></*entity*> from tagged words and reverse the _ and ' ' substitution
        def to_entity(word):
            if '<*entity*>' not in word:
                return 'Empty'
            else:
                pattern = r"<\*entity\*>(.*?)<\/\*entity\*>"
                word = re.findall(pattern, word)[0]
                return word.replace('_', ' ').replace('!', '_')

        ents = list(map(to_entity, texts))
        entity_tagged_sentences.append(ents)

    ner_labels_df = pd.DataFrame(data={'sentences_cleaned': entity_tagged_sentences})
    return ner_labels_df
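For reference, this is the input shape I'm assuming: a list of spaCy-style training tuples (text, {'entities': [(start, end, label), ...]}). The sentences and labels below are made up for illustration:

data = [
    ("Alice uses Python and pandas daily",
     {'entities': [(11, 17, 'SKILL'), (22, 28, 'SKILL')]}),
    ("Bob writes SQL",
     {'entities': [(11, 14, 'SKILL')]}),
]
cleanedDF = generate_ner_labels_from_dataset(data)
print(cleanedDF.shape)                    # (2, 1): one row per sentence, one 'sentences_cleaned' column
print(cleanedDF['sentences_cleaned'][0])  # ['Empty', 'Empty', 'SKILL', 'Empty', 'SKILL', 'Empty']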
On my end, cleanedDF = generate_ner_labels_from_dataset(data) now contains sensible output (it is no longer a degenerate (1, 1) DataFrame).
Please try this and let me know.