Case Study: Excavator Survival Analysis
Mining Excavator dataset case study, as originally presented in Sexton et al. [^1].
from pathlib import Path
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import nestor
from nestor import keyword as kex
import nestor.datasets as dat
def set_style():
"""This sets reasonable defaults for a figure that will go in a paper"""
sns.set_context("paper")
sns.set(font='serif')
sns.set_style("white", {
"font.family": "serif",
"font.serif": ["Times", "Palatino", "serif"]
})
set_style()
df = dat.load_excavators()
df.head()
vocab = dat.load_vocab('excavators')
vocab
Knowledge Extraction
We already have vocabulary and data, so let's merge them to structure our data into a more useful format.
scikit-learn's `Pipeline` is a convenient way to use the `TagExtractor` to output a more usable format. Let's use the multi-index `binary` format for now. Other options include:
- list-of-tokens `multilabel`
- NER-trainer `iob`
# merge and cleanse NLP-containing columns of the data
from sklearn.pipeline import Pipeline
pipe = Pipeline([
('clean_combine', kex.NLPSelect(columns=["OriginalShorttext"])),
('tag', kex.TagExtractor(
thesaurus=vocab, # load up pre-trained vocab file from [^1]
group_untagged=True, # merge untagged tokens into a misc. `_untagged` label
filter_types=None, # keep all tag types
ngram_range=(1,2), # this vocab file includes compound tags, so make sure we find them
verbose=True, # report how much coverage our vocab file got on proposed tags (tokens)
output_type='binary', # could use `multilabel` or `iob` as well
)),
])
tags = pipe.fit_transform(df)
tags.sum().sort_values(ascending=False)
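For reference, the other output formats can be produced by swapping `output_type` in the same pipeline. A minimal sketch of the list-of-tokens `multilabel` variant (same components as above; only the output type changes):
# hedged sketch: identical pipeline, but emitting one list of tag aliases per work order
pipe_ml = Pipeline([
    ('clean_combine', kex.NLPSelect(columns=["OriginalShorttext"])),
    ('tag', kex.TagExtractor(
        thesaurus=vocab,
        group_untagged=True,
        filter_types=None,
        ngram_range=(1, 2),
        output_type='multilabel',  # or 'iob' for NER-trainer output
    )),
])
tags_ml = pipe_ml.fit_transform(df)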
We can also access the trained steps in the pipeline to make use of convenience functions.
tagger = pipe.named_steps['tag']
tags_read = tagger.tags_as_lists
relation_df = tags.loc[:, ['PI', 'SI']]
tag_df = tags.loc[:, ['I', 'P', 'S', 'U', 'X', 'NA']]
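Because the `binary` output uses a two-level column index (tag type, then alias), any individual tag is a plain column slice away. For example, pulling out two Item tags that appear later in this study:
# occurrence indicators for the `hyd` and `engine` Item tags
tags['I'][['hyd', 'engine']].head()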
Quality of Extracted Keywords
It's worth checking the quality of the extraction: how many tags a typical document gets, how the tags read next to the original text, and how completely the vocabulary covers each work order.
nbins = int(np.percentile(tag_df.sum(axis=1), 90))
print(f'90% of docs have at most {nbins} tags')
tags_read.assign(text=df.OriginalShorttext).sample(10)
# how many instances of each keyword class are there?
print('named entities: ')
print('I\tItem\nP\tProblem\nS\tSolution')
print('U\tUnknown\nX\tStop Word')
print('total tokens: ', vocab.NE.notna().sum())
print('total tags: ', vocab.groupby("NE").nunique().alias.sum())
vocab.groupby("NE").nunique()
# tag-completeness of work-orders?
tagger.report_completeness()
# sns.distplot is deprecated; histplot gives the same counts-per-bin view
sns.histplot(tagger.tag_completeness.dropna(), bins=nbins)
plt.xlim(0.1, 1.0)
plt.xlabel('precision (PPV)')
An Aside: Effectiveness of Tagging
What have we actually gained by using the `TagExtractor`? The `vocab` file functions as a thesaurus, where each default `alias` represents multiple disparate tokens. This means the dimensionality of our resulting matrix can be significantly reduced using this domain knowledge, which can improve model predictability, performance, applicability, etc.
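As a toy illustration of that thesaurus behavior (hypothetical spellings, not actual rows from the `vocab` file), several raw surface forms collapse into one alias:
# hypothetical thesaurus rows: three surface forms -> one canonical Item alias
toy_vocab = pd.DataFrame({
    'tokens': ['hyd', 'hydraulic', 'hydralic'],
    'NE': ['I'] * 3,                 # all tagged as Items
    'alias': ['hydraulic'] * 3,      # one concept, one matrix column
}).set_index('tokens')
toy_vocab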
# original token string frequencies
cts = np.where(tagger.tfidf.todense()>0., 1, 0).sum(axis=0)
sns.histplot(
cts, log_scale=True,
stat='probability',discrete=True,
label='Raw Tokens',
color='grey'
)
# tag frequencies
sns.histplot(
tag_df[['I', 'P', 'S']].sum(), log_scale=True,
stat='probability', discrete=True,
label='Tagged Aliases',
hatch='///',
color='dodgerblue',alpha=0.3,
)
plt.legend()
plt.title('Probability of a Token/Tag\'s frequency')
The entire goal, in some sense, is to remove low-occurrence, unimportant information from our data and to form concept conglomerates that allow more useful statistical inferences to be made. Tags mapped from `nestor-gui`, as the plot shows, have very few instances of 1x-occurrence concepts, compared to several thousand in the raw tokens (this is by design, of course). Additionally, high-occurrence concepts that might have had misspellings or synonyms drastically improve their average occurrence rate.
NOTE: This is without artificial thresholding of minimum tag frequency. Thresholding would simply "cut off" the blue distribution below some value, not change its overall shape.
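If a minimum-frequency cutoff were wanted anyway, it amounts to one filter on the tag counts (the threshold of 5 below is an arbitrary illustration):
# keep only tags that occur at least 5 times across the corpus
tag_counts = tag_df[['I', 'P', 'S']].sum()
tag_counts[tag_counts >= 5].sort_values(ascending=False)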
Survival Analysis
What do we do with tags?
One way is to rely on their ability to normalize raw tokens into consistent aliases, so that estimates of rare-event statistics become possible. Say you wish to know the median time-to-failure of an excavator subsystem (e.g. the engines in your fleet): this might help you understand how often "engine expertise" is needed, to plan hiring or shift schedules, etc.
To get the raw text occurrences into something more consistent for failure-time estimation, one might:
- build a rules-based algorithm that checks for known (a priori) pattern occurrences and categorizes/normalizes when matched (think: regex matching; see the sketch below)
- create aliases for raw tokens (e.g. using suggestions for "important" tokens from `TokenExtractor.thesaurus_template`)
This was done in [^1], and we demonstrate the technique below. See the paper for further details!
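As a sketch of the rules-based flavor (hypothetical patterns, not the actual rules used by Hodkiewicz et al.), regex matching might normalize known surface forms like so:
import re

# hypothetical normalization rules; a production rule set would be far larger
rules = {
    r'\beng(ine)?\b': 'engine',
    r'\bhyd(raulic)?s?\b': 'hydraulic',
}

def normalize(text):
    """Lower-case the text and rewrite any rule matches to their canonical form."""
    text = text.lower()
    for pattern, canon in rules.items():
        text = re.sub(pattern, canon, text)
    return text

normalize('Replace ENG oil filter')  # -> 'replace engine oil filter'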
Rules-Based
From Hodkiewicz et al., a rule-based method was used to estimate failure times for survival analysis. Let's see their data:
df_clean = dat.load_excavators(cleaned=True)
df_clean['SuspSugg'] = pd.to_numeric(df_clean['SuspSugg'], errors='coerce')
df_clean.dropna(subset=['RunningTime', 'SuspSugg'], inplace=True)
df_clean.shape
df_clean.sort_values('BscStartDate').head(10)
We once again turn to the lifelines library as the workhorse for estimating the survival function (in this context, the probability at time t since the previous MWO that a new MWO has not yet occurred).
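lifelines expects durations plus an event flag, where 1 means the failure was observed and 0 means the interval was censored (e.g. a suspension). A toy example with made-up durations:
from lifelines import KaplanMeierFitter

toy_T = [5, 8, 12, 12, 20]  # made-up days between consecutive MWOs
toy_E = [1, 1, 0, 1, 0]     # 1 = failure observed, 0 = censored
toy_km = KaplanMeierFitter().fit(toy_T, event_observed=toy_E)
toy_km.median_survival_time_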
from lifelines import WeibullFitter, ExponentialFitter, KaplanMeierFitter
mask = (df_clean.MajorSystem =='Bucket')
# mask=df_clean.index
def mask_to_ETclean(df_clean, mask, fill_null=1.):
    filter_df = df_clean.loc[mask]
    g = filter_df.sort_values('BscStartDate').groupby('Asset')
    # durations T: days between consecutive MWOs on the same asset
    T = g['BscStartDate'].transform(pd.Series.diff).dt.days
    # events E: 1 = actual failure observed, 0 = suspension (censored)
    E = (~filter_df['SuspSugg'].astype(bool)).astype(int)
    # keep only strictly positive, defined durations
    valid = ~((T <= 0.) | T.isna())
    return T.loc[valid], E.loc[valid]
T, E = mask_to_ETclean(df_clean, mask)
wf = WeibullFitter()
wf.fit(T, E, label='Rule-Based Weibull')
print('{:.3f}'.format(wf.lambda_), '{:.3f}'.format(wf.rho_))
# wf.print_summary()
wf.hazard_.plot()
plt.title('weibull hazard function')
plt.xlim(0,110)
wf.survival_function_.plot()
plt.xlim(0,110)
plt.title('weibull survival function')
print(f'transform: β={wf.rho_:.2f}\tη={1/wf.lambda_:.2f}')
# wf._compute_standard_errors()
to_bounds = lambda row:'±'.join([f'{i:.2g}' for i in row])
wf.summary.iloc[:,:2].apply(to_bounds, 1)
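A note on that transform: judging by the $\eta = 1/\lambda$ conversion, the lifelines version used here parameterizes the Weibull survival function as $S(t) = e^{-(\lambda t)^\rho}$, so the shape parameter is $\beta = \rho$ and the characteristic life is $\eta = 1/\lambda$. The error-propagation step inside `query_experiment` below, $se(\eta) \approx se(\lambda)/\lambda^2$, follows from the same change of variables (first-order delta method).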
Tag-Based Comparison
We now estimate the occurrence of failures using tag occurrences in the raw work orders.
import math
def to_precision(x,p):
"""
returns a string representation of x formatted with a precision of p
Based on the webkit javascript implementation taken from here:
https://code.google.com/p/webkit-mirror/source/browse/JavaScriptCore/kjs/number_object.cpp
"""
x = float(x)
if x == 0.:
return "0." + "0"*(p-1)
out = []
if x < 0:
out.append("-")
x = -x
e = int(math.log10(x))
tens = math.pow(10, e - p + 1)
n = math.floor(x/tens)
if n < math.pow(10, p - 1):
e = e -1
tens = math.pow(10, e - p+1)
n = math.floor(x / tens)
if abs((n + 1.) * tens - x) <= abs(n * tens -x):
n = n + 1
if n >= math.pow(10,p):
n = n / 10.
e = e + 1
m = "%.*g" % (p, n)
if e < -2 or e >= p:
out.append(m[0])
if p > 1:
out.append(".")
out.extend(m[1:p])
out.append('e')
if e > 0:
out.append("+")
out.append(str(e))
elif e == (p -1):
out.append(m)
elif e >= 0:
out.append(m[:e+1])
if e+1 < len(m):
out.append(".")
out.extend(m[e+1:])
else:
out.append("0.")
out.extend(["0"]*-(e+1))
out.append(m)
return "".join(out)
def query_experiment(name, df, df_clean, rule, tag, multi_tag, prnt=False):
def mask_to_ETclean(df_clean, mask, fill_null=1.):
filter_df = df_clean.loc[mask]
g = filter_df.sort_values('BscStartDate').groupby('Asset')
T = g['BscStartDate'].transform(pd.Series.diff).dt.days
E = (~filter_df['SuspSugg'].astype(bool)).astype(int)
return T.loc[~((T<=0.)|(T.isna()))], E.loc[~((T<=0.)|(T.isna()))]
def mask_to_ETraw(df_clean, mask, fill_null=1.):
filter_df = df_clean.loc[mask]
g = filter_df.sort_values('BscStartDate').groupby('Asset')
T = g['BscStartDate'].transform(pd.Series.diff).dt.days
        T_defined = (T > 0.) & T.notna()  # durations must be both positive and defined
T = T[T_defined]
# assume censored when parts replaced (changeout)
E = (~(tag_df.S.changeout>0)).astype(int)[mask]
E = E[T_defined]
return T.loc[~((T<=0.)|(T.isna()))], E.loc[~((T<=0.)|(T.isna()))]
experiment = {
'rules-based': {
'query': rule,
'func': mask_to_ETclean,
'mask': (df_clean.MajorSystem == rule),
'data': df_clean
},
'single-tag': {
'query': tag,
'func': mask_to_ETraw,
'mask': tag_df.I[tag].sum(axis=1)>0,
'data': df
},
'multi-tag': {
'query': multi_tag,
'func': mask_to_ETraw,
'mask': tag_df.I[multi_tag].sum(axis=1)>0,
'data': df
}
}
results = {
('query', 'text/tag'): [],
# ('Weibull Params', r'$\lambda$'): [],
('Weibull Params', r'$\beta$'): [],
        ('Weibull Params', r'$\eta$'): [],
('MTTF', 'Weib.'): [],
('MTTF', 'K-M'): []
}
idx = []
for key, info in experiment.items():
idx += [key]
results[('query','text/tag')] += [info['query']]
if prnt:
print('{}: {}'.format(key, info['query']))
info['T'], info['E'] = info['func'](info['data'], info['mask'])
wf = WeibullFitter()
wf.fit(info['T'], info['E'], label=f'{key} weibull')
        to_bounds = lambda row: r'$\pm$'.join([to_precision(row[0], 2),
                                               to_precision(row[1], 1)])
params = wf.summary.T.iloc[:2]
params['eta_'] = [1/params.lambda_['coef'], # err. propagation
(params.lambda_['se(coef)']/params.lambda_['coef']**2)]
params = params.T.apply(to_bounds, 1)
results[('Weibull Params', r'$\eta$')] += [params['eta_']]
results[('Weibull Params', r'$\beta$')] += [params['rho_']]
if prnt:
print('\tWeibull Params:\n',
'\t\tη = {}\t'.format(params['eta_']),
'β = {}'.format(params['rho_']))
kmf = KaplanMeierFitter()
kmf.fit(info["T"], event_observed=info['E'], label=f'{key} kaplan-meier')
results[('MTTF','Weib.')] += [to_precision(wf.median_survival_time_,3)]
results[('MTTF','K-M')] += [to_precision(kmf.median_survival_time_,3)]
if prnt:
print(f'\tMTTF: \n\t\tWeib \t'+to_precision(wf.median_survival_time_,3)+\
'\n\t\tKM \t'+to_precision(kmf.median_survival_time_,3))
info['kmf'] = kmf
info['wf'] = wf
return experiment, pd.DataFrame(results, index=pd.Index(idx, name=name))
bucket_exp, bucket_res = query_experiment('Bucket', df, df_clean,
'Bucket',
['bucket'],
['bucket', 'tooth', 'lip', 'pin']);
# renamed from `tags` to avoid shadowing the tags DataFrame built earlier
hyd_tags = ['hyd', 'hose', 'pump', 'compressor']
hyd_exp, hyd_res = query_experiment('Hydraulic System', df, df_clean,
'Hydraulic System',
['hyd'],
hyd_tags)
eng_exp, eng_res = query_experiment('Engine', df, df_clean,
'Engine',
['engine'],
['engine', 'filter', 'fan'])
frames = [bucket_res, hyd_res, eng_res]
res = pd.concat(frames, keys = [i.index.name for i in frames],
names=['Major System', 'method'])
res
exp = [bucket_exp, hyd_exp, eng_exp]  # match the order of `res` (Bucket, Hydraulic System, Engine)
f,axes = plt.subplots(nrows=3, figsize=(5,10))
for n, ax in enumerate(axes):
exp[n]['rules-based']['kmf'].plot(ax=ax, color='dodgerblue')
exp[n]['multi-tag']['kmf'].plot(ax=ax, color='xkcd:rust', ls=':')
exp[n]['single-tag']['kmf'].plot(ax=ax, color='xkcd:rust')
ax.set_xlim(0,110)
ax.set_ylim(0,1)
ax.set_title(r"$S(t)$"+f" of {res.index.levels[0][n]}")
sns.despine()
plt.tight_layout()
This next plot gives a better idea of the differences, using a log-log transform: a Weibull survival function appears as a straight line on these axes, so departures are easy to spot. The tags under-estimate death rates a little in the 80-130 day range, probably because there's a failure mode not captured by the [bucket, lip, tooth] tags (because it's rare).
f,axes = plt.subplots(nrows=3, figsize=(5,10))
for n, ax in enumerate(axes):
exp[n]['rules-based']['kmf'].plot_loglogs(ax=ax, c='dodgerblue')
exp[n]['single-tag']['kmf'].plot_loglogs(ax=ax, c='xkcd:rust', ls=':')
exp[n]['multi-tag']['kmf'].plot_loglogs(ax=ax, c='xkcd:rust')
if n != 2:
ax.legend_.remove()
# ax.set_xlim(0,110)
# ax.set_ylim(0,1)
ax.set_title(r"$\log(-\log(S(t)))$"+f" of {res.index.levels[0][n]}")
sns.despine()
plt.tight_layout()
f.savefig('bkt_logKMsurvival.png')