Skip to content

Output tags in IOB (inside–outside–beginning) format for named-entity recognition (NER) analysis

import pandas as pd
from pathlib import Path
from nestor import keyword as kex
import nestor.datasets as nd
# Load the raw (uncleaned) maintenance work orders (MWOs) for the excavator
# dataset. The loader already formats the dates, and the 'BscStartDate'
# column keeps its original name.
df = nd.load_excavators(cleaned=False)

# Preview the first five work orders.
df.head(n=5)
BscStartDate Asset OriginalShorttext PMType Cost
ID
0 2004-07-01 A BUCKET WON'T OPEN PM01 183.05
1 2005-03-20 A L/H BUCKET CYL LEAKING. PM01 407.40
2 2006-05-05 A SWAP BUCKET PM01 0.00
3 2006-07-11 A FIT BUCKET TOOTH PM01 0.00
4 2006-11-10 A REFIT BUCKET TOOTH PM01 1157.27
# Load the vocabulary for the excavator dataset. Rows without an alias are
# kept (no dropna on the 'alias' column is applied).
vocab = nd.load_vocab('excavators')
vocab
NE alias notes score
tokens
replace S replace NaN 0.033502
bucket I bucket NaN 0.018969
repair S repair NaN 0.017499
grease I grease NaN 0.017377
leak P leak NaN 0.016591
... ... ... ... ...
1boily 19 NaN NaN NaN 0.000046
shd 1fitter NaN NaN NaN 0.000046
19 01 NaN NaN NaN 0.000046
01 10 NaN NaN NaN 0.000046
1fitter 1boily NaN NaN NaN 0.000046

6767 rows × 4 columns

# Run the IOB extractor over the work-order short-text column using the
# vocabulary loaded above, then display the resulting table.
iob = kex.iob_extractor(df['OriginalShorttext'], vocab)
iob
token NE doc_id
0 bucket B-I 0
1 won O 0
2 open O 0
3 bucket B-I 1
4 cyl B-I 1
... ... ... ...
24663 fault B-P 5484
24664 front O 5484
24665 found O 5484
24666 wire B-I 5484
24667 no O 5484

24668 rows × 3 columns