
Case Study: Excavator Survival Analysis

Mining Excavator dataset case study, as originally presented in Sexton et al. [^1].

from pathlib import Path
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import nestor
from nestor import keyword as kex
import nestor.datasets as dat
def set_style():
    """This sets reasonable defaults for a figure that will go in a paper"""
    sns.set_context("paper")
    sns.set(font='serif')
    sns.set_style("white", {
        "font.family": "serif",
        "font.serif": ["Times", "Palatino", "serif"]
    })
set_style()
df = dat.load_excavators()
df.head()
   BscStartDate Asset        OriginalShorttext PMType     Cost
ID
0    2004-07-01     A        BUCKET WON'T OPEN   PM01   183.05
1    2005-03-20     A  L/H BUCKET CYL LEAKING.   PM01   407.40
2    2006-05-05     A              SWAP BUCKET   PM01     0.00
3    2006-07-11     A         FIT BUCKET TOOTH   PM01     0.00
4    2006-11-10     A       REFIT BUCKET TOOTH   PM01  1157.27
vocab = dat.load_vocab('excavators')
vocab
                 NE    alias notes     score
tokens
replace           S  replace   NaN  0.033502
bucket            I   bucket   NaN  0.018969
repair            S   repair   NaN  0.017499
grease            I   grease   NaN  0.017377
leak              P     leak   NaN  0.016591
...             ...      ...   ...       ...
1boily 19       NaN      NaN   NaN  0.000046
shd 1fitter     NaN      NaN   NaN  0.000046
19 01           NaN      NaN   NaN  0.000046
01 10           NaN      NaN   NaN  0.000046
1fitter 1boily  NaN      NaN   NaN  0.000046

6767 rows × 4 columns
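The vocab table is indexed by raw token, with the entity class (NE), the normalized alias, optional notes, and a relevance score. Individual tokens (like the ones shown above) can be looked up directly; a minimal sketch:

# look up the classification and alias of a few raw tokens seen above
vocab.loc[['bucket', 'leak'], ['NE', 'alias', 'score']]

# every raw token that normalizes to the alias "grease"
vocab[vocab.alias == 'grease'].index.tolist()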

Knowledge Extraction

We already have the vocabulary and the data, so let's merge them to structure our data into a more useful format.

scikit-learn's Pipeline

A convenient way to use TagExtractor and output a more usable format. Let's use the multi-index binary format for now; other options include:

  • list-of-tokens multilabel
  • NER-trainer iob

# merge and cleanse NLP-containing columns of the data
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ('clean_combine', kex.NLPSelect(columns=["OriginalShorttext"])),
    ('tag', kex.TagExtractor(
        thesaurus=vocab, # load up pre-trained vocab file from [^1]
        group_untagged=True,  # merge untagged tokens into a misc. `_untagged` label
        filter_types=None,  # keep all tag types
        ngram_range=(1,2),  # this vocab file includes compound tags, so make sure we find them
        verbose=True,  # report how much coverage our vocab file got on proposed tags (tokens)
        output_type='binary', # could use `multilabel` or `iob` as well
    )),
])
tags = pipe.fit_transform(df)

tags.sum().sort_values(ascending=False)
100%|█████████████████████████████████████████████| 8/8 [00:02<00:00,  3.60it/s]
Complete Docs: 1402, or 25.56%
Tag completeness: 0.72 +/- 0.21
Empty Docs: 47, or 0.86%



NA  _untagged           8006.0
S   replace             1364.0
P   leak                 748.0
I   engine               603.0
S   repair               584.0
                         ...  
SI  replace battery        1.0
    replace clamp          1.0
PI  ring leak              1.0
I   step_handrail          1.0
    windscreen_wiper       1.0
Length: 1011, dtype: float64
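If the binary matrix isn't the right shape for a downstream task, the same pipeline can be rebuilt with a different output_type. A minimal sketch (assuming defaults for the remaining options shown above):

# sketch: identical pipeline, but asking TagExtractor for the
# list-of-tokens `multilabel` output instead of the binary matrix
multi_pipe = Pipeline([
    ('clean_combine', kex.NLPSelect(columns=["OriginalShorttext"])),
    ('tag', kex.TagExtractor(
        thesaurus=vocab,
        ngram_range=(1, 2),
        output_type='multilabel',  # one list of tag aliases per document
    )),
])
multi_tags = multi_pipe.fit_transform(df)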

We can also access the trained steps in the pipeline to make use of convenience functions.

tagger = pipe.named_steps['tag']
tags_read = tagger.tags_as_lists

relation_df = tags.loc[:, ['PI', 'SI']]
tag_df = tags.loc[:, ['I', 'P', 'S', 'U', 'X', 'NA']]    

Quality of Extracted Keywords

It's worth checking how well the extracted tags actually cover the work orders before we rely on them.

nbins = int(np.percentile(tag_df.sum(axis=1), 90))
print(f'Docs have at most {nbins} tokens (90th percentile)')
Docs have at most 9 tokens (90th percentile)

tags_read.assign(text=df.OriginalShorttext).sample(10)
I NA P PI S SI U X text
4890 cable emergency, need pull, replace emergency pull cable needs replacing
4947 camera _untagged damage repair REPAIR 3 DAMAGED CAMERAS.
3496 timer idle, working Idle timer not working
3168 grease, link, link_pin, pin, shd _untagged No grease to bottom H-Link Pin SHD0024
544 bucket cracked repair REPAIR CRACKS IN BUCKET
614 bucket, bucket_grease, grease, line _untagged broken broken bucket 2 X BROKEN BUCKET GREASE LINES
752 engine, left_hand blowing, smoke lh eng blowing smoke
2908 grease, rotor, steel, steel_tube, tube leak STEEL TUBE IN ROTOR LEAKING GREASE
695 air cleaners, replace air cleaners, replace air replace air cleaners
3978 boom, boom_cylinder, cylinder replace replace boom replace r/h boom cylinder
# how many instances of each keyword class are there?
print('named entities: ')
print('I\tItem\nP\tProblem\nS\tSolution')
print('U\tUnknown\nX\tStop Word')
print('total tokens: ', vocab.NE.notna().sum())
print('total tags: ', vocab.groupby("NE").nunique().alias.sum())
vocab.groupby("NE").nunique()
named entities: 
I   Item
P   Problem
S   Solution
U   Unknown
X   Stop Word
total tokens:  4060
total tags:  1123

    alias  notes  score
NE
I     633     18   1856
P      55      5    122
PI    150      0    672
S      44      1     97
SI    134      0    446
U      68     56     92
X      39      0    167
# tag-completeness of work-orders?
tagger.report_completeness()

# with sns.axes_style('ticks') as style:
sns.distplot(tagger.tag_completeness.dropna(), 
             kde=False, bins=nbins, 
             kde_kws={'cut':0})
plt.xlim(0.1, 1.0)
plt.xlabel('precision (PPV)')
Complete Docs: 1402, or 25.56%
Tag completeness: 0.72 +/- 0.21
Empty Docs: 47, or 0.86%

/home/tbsexton/miniconda3/envs/nestor-docs/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

Text(0.5, 0, 'precision (PPV)')
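As the FutureWarning above notes, distplot is deprecated in recent seaborn releases; the equivalent plot with the newer axes-level API might look like the following sketch (not part of the original analysis):

# same completeness histogram, using the non-deprecated histplot API
ax = sns.histplot(tagger.tag_completeness.dropna(), bins=nbins)
ax.set_xlim(0.1, 1.0)
ax.set_xlabel('precision (PPV)')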

An Aside: Effectiveness of Tagging

What have we actually gained using the TagExtractor?

The vocab file functions as a thesaurus: a single default alias represents multiple disparate raw tokens. This means the dimensionality of our resulting matrix can be significantly reduced using this domain knowledge, which can improve model predictability, performance, applicability, etc.

# original token string frequencies
cts = np.where(tagger.tfidf.todense()>0., 1, 0).sum(axis=0)
sns.histplot(
    cts, log_scale=True,
    stat='probability',discrete=True,
    label='Raw Tokens',
    color='grey'
)
# tag frequencies
sns.histplot(
    tag_df[['I', 'P', 'S']].sum(), log_scale=True,
    stat='probability', discrete=True,
    label='Tagged Aliases', 
    hatch='///', 
    color='dodgerblue',alpha=0.3,
)
plt.legend()
plt.title('Probability of a Token/Tag\'s frequency')
Text(0.5, 1.0, "Probability of a Token/Tag's frequency")

The entire goal, in some sense, is to remove low-occurrence, unimportant information from our data and form concept conglomerates that allow more useful statistical inferences to be made. Tags mapped from nestor-gui, as the plot shows, have very few 1x-occurrence concepts, compared to several thousand in the raw tokens (this is by design, of course). Additionally, high-occurrence concepts that might have suffered from misspellings or synonyms drastically improve their average occurrence rate.

NOTE: This is without artificially thresholding the minimum tag frequency. Doing so would simply "cut off" the blue distribution below some threshold, without changing its overall shape.
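For reference, such a threshold is a one-liner on the binary tag matrix; a minimal sketch, where min_count is an arbitrary illustrative value not used elsewhere in this notebook:

# keep only tag columns that occur in at least `min_count` work orders
min_count = 5  # illustrative threshold
tag_counts = tag_df.sum(axis=0)
frequent_tags = tag_df.loc[:, tag_counts >= min_count]
print(f'{tag_df.shape[1]} tags -> {frequent_tags.shape[1]} with >= {min_count} occurrences')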

Survival Analysis

What do we do with tags?

One way is to rely on their ability to normalize raw tokens into consistent aliases so that our estimates of rare-event statistics become possible.

Say you wish to know the median time-to-failure of an excavator subsystem (e.g. the engines in your fleet): this might help estimate how often "engine expertise" is needed, to plan hiring or shift schedules, etc.

To get the raw text occurrences into something more consistent for failure-time estimation, one might:

  • make a rules-based algorithm that checks for known (a priori) pattern occurrences and categorizes/normalizes when matched (think: regex matching; see the toy sketch below)
  • create aliases for raw tokens (e.g. using suggestions for "important" tokens from TokenExtractor.thesaurus_template)

This was done in [^1], and we demonstrate the technique below. See the paper for further details!
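As a toy illustration of the first (rules-based) option, a handful of hypothetical regex patterns could map raw short-text to a major system label; the actual rules used in [^1] are far more extensive:

import re

# hypothetical patterns mapping raw short-text to a major system label
rules = {
    'Bucket': re.compile(r'\b(bucket|tooth|lip)\b', re.I),
    'Engine': re.compile(r'\b(eng(ine)?|fan|filter)\b', re.I),
}

def match_major_system(text):
    """Return the first major system whose pattern matches, else None."""
    for system, pattern in rules.items():
        if pattern.search(text):
            return system
    return None

df['OriginalShorttext'].head().apply(match_major_system)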

Rules-Based

Hodkiewicz et al. used a rule-based method to estimate failure times for survival analysis. Let's look at their data:

df_clean = dat.load_excavators(cleaned=True)

df_clean['SuspSugg'] = pd.to_numeric(df_clean['SuspSugg'], errors='coerce')
df_clean.dropna(subset=['RunningTime', 'SuspSugg'], inplace=True)

df_clean.shape
(5289, 16)
df_clean.sort_values('BscStartDate').head(10)
BscStartDate Asset OriginalShorttext PMType Cost RunningTime MajorSystem Part Action Variant FM Location Comments FuncLocation SuspSugg Rule
ID
8 2001-07-19 B REPLACE LIP PM01 1251.52 7.0 Bucket NaN Replace 2V NaN NaN NaN Bucket 0.0 Rule_1_3_78_383_384
1820 2001-09-01 B OIL LEAK L/H TRACK TENSIONER. PM01 0.00 3.0 Hydraulic System Track Minor Maint 18 Leak Left NaN Power Train - Transmission 0.0 Rule_1_3_52_289_347_425_500
1821 2001-09-04 B BAD SOS METAL IN OIL PM01 0.00 3.0 Hydraulic System Slew Gearbox NaN NaN Contamination NaN NaN Sprocket/Drive Compartment Right 0.0 Rule_1_3_52_303_409
5253 2001-09-05 B REPLACE AIRCONDITIONER BELTS PM01 0.00 23.0 NaN Air Conditioning Replace 2V NaN NaN NaN Air Conditioning System 0.0 Rule_1_3_224_227_383_384
3701 2001-09-05 B REPLACE CLAMPS ON CLAM PIPES PM01 0.00 28.0 NaN Mount Replace 2V NaN NaN NaN Oil - Hydraulic 0.0 Rule_1_3_92_181_383_384
1167 2001-09-05 B REPLACE RHS FAN BELT TENSIONER PULLEY PM01 82.09 0.0 NaN Fan Minor Maint_Replace 2V NaN Right NaN +Cooling System 0.0 Rule_1_3_125_347_383_384_509
1168 2001-09-11 B replace fan belt PM01 0.00 6.0 NaN Fan Replace 2V NaN NaN NaN +Cooling System 0.0 Rule_1_3_125_383_384
644 2001-09-15 B replace heads on lhs eng PM01 0.00 33.0 Engine NaN Replace 2V NaN Left NaN Engine Left Cylinder Heads 0.0 Rule_1_3_25_383_384_499
4583 2001-09-26 B REPAIR CABIN DOOR FALLING OFF. PM01 0.00 27.0 NaN Drivers Cabin Repair 1 NaN NaN NaN Operators Cabin 0.0 Rule_1_3_251_284_357
9 2001-10-01 B rebuild lip #3 PM01 0.00 74.0 Bucket NaN Repair 5 NaN NaN NaN Bucket Clam (Lip) 0.0 Rule_1_3_78_362

We once again turn to the lifelines library as the workhorse for estimating the survival function (in this context, the probability that, at time t since the previous MWO, a new MWO has not yet occurred).
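For reference, lifelines' WeibullFitter parametrizes the survival function with a scale $\lambda$ and shape $\rho$; the results below report $\beta = \rho$ and $\eta = 1/\lambda$, and the Weibull median survival time follows directly (a notational note, not output of the code):

$$ S(t) = \exp\left[-\left(\tfrac{t}{\lambda}\right)^{\rho}\right], \qquad t_{\mathrm{median}} = \lambda\,(\ln 2)^{1/\rho} $$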

from lifelines import WeibullFitter, ExponentialFitter, KaplanMeierFitter
mask = (df_clean.MajorSystem =='Bucket')
# mask=df_clean.index
def mask_to_ETclean(df_clean, mask, fill_null=1.):
    """Return inter-MWO durations T (days) and event indicators E for the masked rows."""
    filter_df = df_clean.loc[mask]
    g = filter_df.sort_values('BscStartDate').groupby('Asset')
    T = g['BscStartDate'].transform(pd.Series.diff).dt.days  # days since previous MWO on the same asset
#     T.loc[(T<=0.)|(T.isna())] = fill_null
    E = (~filter_df['SuspSugg'].astype(bool)).astype(int)  # 1 = failure observed, 0 = suspension (censored)
    return T.loc[~((T<=0.)|(T.isna()))], E.loc[~((T<=0.)|(T.isna()))]

T, E = mask_to_ETclean(df_clean, mask)
wf = WeibullFitter()
wf.fit(T, E, label='Rule-Based Weibull')
print('{:.3f}'.format(wf.lambda_), '{:.3f}'.format(wf.rho_))
# wf.print_summary()
wf.hazard_.plot()
plt.title('weibull hazard function')
plt.xlim(0,110)

wf.survival_function_.plot()
plt.xlim(0,110)
plt.title('weibull survival function')
print(f'transform: β={wf.rho_:.2f}\tη={1/wf.lambda_:.2f}')
# wf._compute_standard_errors()
to_bounds = lambda row:'±'.join([f'{i:.2g}' for i in row])
wf.summary.iloc[:,:2].apply(to_bounds, 1)
16.733 0.833
transform: β=0.83   η=0.06

lambda_       17±0.94
rho_       0.83±0.026
dtype: object

Tag Based Comparison

We estimate the occurrence of failures using tag occurrences.
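Selecting work orders by tag is just a boolean mask over the binary tag matrix built earlier; for example, a minimal sketch using the 'engine' item tag:

# work orders carrying the 'engine' item tag at least once
engine_mask = tag_df.I['engine'] > 0
print(f'{engine_mask.sum()} of {len(df)} work orders are tagged "engine"')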

import math


def to_precision(x,p):
    """
    returns a string representation of x formatted with a precision of p

    Based on the webkit javascript implementation taken from here:
    https://code.google.com/p/webkit-mirror/source/browse/JavaScriptCore/kjs/number_object.cpp
    """

    x = float(x)

    if x == 0.:
        return "0." + "0"*(p-1)

    out = []

    if x < 0:
        out.append("-")
        x = -x

    e = int(math.log10(x))
    tens = math.pow(10, e - p + 1)
    n = math.floor(x/tens)

    if n < math.pow(10, p - 1):
        e = e -1
        tens = math.pow(10, e - p+1)
        n = math.floor(x / tens)

    if abs((n + 1.) * tens - x) <= abs(n * tens -x):
        n = n + 1

    if n >= math.pow(10,p):
        n = n / 10.
        e = e + 1

    m = "%.*g" % (p, n)

    if e < -2 or e >= p:
        out.append(m[0])
        if p > 1:
            out.append(".")
            out.extend(m[1:p])
        out.append('e')
        if e > 0:
            out.append("+")
        out.append(str(e))
    elif e == (p -1):
        out.append(m)
    elif e >= 0:
        out.append(m[:e+1])
        if e+1 < len(m):
            out.append(".")
            out.extend(m[e+1:])
    else:
        out.append("0.")
        out.extend(["0"]*-(e+1))
        out.append(m)

    return "".join(out)

def query_experiment(name, df, df_clean, rule, tag, multi_tag, prnt=False):

    def mask_to_ETclean(df_clean, mask, fill_null=1.):
        filter_df = df_clean.loc[mask]
        g = filter_df.sort_values('BscStartDate').groupby('Asset')
        T = g['BscStartDate'].transform(pd.Series.diff).dt.days
        E = (~filter_df['SuspSugg'].astype(bool)).astype(int)
        return T.loc[~((T<=0.)|(T.isna()))], E.loc[~((T<=0.)|(T.isna()))]

    def mask_to_ETraw(df_clean, mask, fill_null=1.):
        filter_df = df_clean.loc[mask]
        g = filter_df.sort_values('BscStartDate').groupby('Asset')
        T = g['BscStartDate'].transform(pd.Series.diff).dt.days
        T_defined = (T > 0.) & T.notna()  # keep only positive, non-null inter-arrival times
        T = T[T_defined]
        # assume censored when parts replaced (changeout)
        E = (~(tag_df.S.changeout>0)).astype(int)[mask]
        E = E[T_defined]
        return T.loc[~((T<=0.)|(T.isna()))], E.loc[~((T<=0.)|(T.isna()))]

    experiment = {
        'rules-based': {
            'query': rule,
            'func': mask_to_ETclean,
            'mask': (df_clean.MajorSystem == rule),
            'data': df_clean
        },
        'single-tag': {
            'query': tag,
            'func': mask_to_ETraw,
            'mask': tag_df.I[tag].sum(axis=1)>0,
            'data': df
        },
        'multi-tag': {
            'query': multi_tag,
            'func': mask_to_ETraw,
            'mask': tag_df.I[multi_tag].sum(axis=1)>0,
            'data': df
        }
    }
    results = {
       ('query', 'text/tag'): [],
#        ('Weibull Params', r'$\lambda$'): [],
       ('Weibull Params', r'$\beta$'): [],
       ('Weibull Params', '$\eta$'): [],
       ('MTTF', 'Weib.'): [],
       ('MTTF', 'K-M'): []
       }
    idx = []

    for key, info in experiment.items():
        idx += [key]
        results[('query','text/tag')] += [info['query']]
        if prnt:
            print('{}: {}'.format(key, info['query']))
        info['T'], info['E'] = info['func'](info['data'], info['mask'])
        wf = WeibullFitter()
        wf.fit(info['T'], info['E'], label=f'{key} weibull')

        to_bounds = lambda row:'$\pm$'.join([to_precision(row[0],2),
                                             to_precision(row[1],1)])

        params = wf.summary.T.iloc[:2]
        params['eta_'] = [1/params.lambda_['coef'],  # err. propagation
                          (params.lambda_['se(coef)']/params.lambda_['coef']**2)]
        params = params.T.apply(to_bounds, 1)

        results[('Weibull Params', r'$\eta$')] += [params['eta_']]
        results[('Weibull Params', r'$\beta$')] += [params['rho_']]
        if prnt:                                     
            print('\tWeibull Params:\n',
                  '\t\tη = {}\t'.format(params['eta_']), 
                  'β = {}'.format(params['rho_']))

        kmf = KaplanMeierFitter()
        kmf.fit(info["T"], event_observed=info['E'], label=f'{key} kaplan-meier')
        results[('MTTF','Weib.')] += [to_precision(wf.median_survival_time_,3)]
        results[('MTTF','K-M')] += [to_precision(kmf.median_survival_time_,3)]
        if prnt:
            print(f'\tMTTF: \n\t\tWeib \t'+to_precision(wf.median_survival_time_,3)+\
                   '\n\t\tKM \t'+to_precision(kmf.median_survival_time_,3))
        info['kmf'] = kmf
        info['wf'] = wf
    return experiment, pd.DataFrame(results, index=pd.Index(idx, name=name))
bucket_exp, bucket_res = query_experiment('Bucket', df, df_clean,
                                          'Bucket',
                                          ['bucket'],
                                          ['bucket', 'tooth', 'lip', 'pin']);
hyd_tags = ['hyd', 'hose', 'pump', 'compressor']  # renamed to avoid shadowing the `tags` DataFrame above
hyd_exp, hyd_res = query_experiment('Hydraulic System', df, df_clean,
                                    'Hydraulic System',
                                    ['hyd'],
                                    hyd_tags)
eng_exp, eng_res = query_experiment('Engine', df, df_clean,
                                    'Engine',
                                    ['engine'],
                                    ['engine', 'filter', 'fan'])
frames = [bucket_res, hyd_res, eng_res]
res = pd.concat(frames, keys = [i.index.name for i in frames],
               names=['Major System', 'method'])
res
                                                        query  Weibull Params                   MTTF
                                                     text/tag         $\beta$          $\eta$  Weib.   K-M
Major System     method
Bucket           rules-based                           Bucket   0.83$\pm$0.03  0.060$\pm$3e-3   10.8  9.00
                 single-tag                          [bucket]   0.83$\pm$0.03  0.038$\pm$3e-3   17.0  15.0
                 multi-tag          [bucket, tooth, lip, pin]   0.82$\pm$0.02  0.060$\pm$3e-3   10.6  9.00
Hydraulic System rules-based                 Hydraulic System   0.86$\pm$0.02  0.072$\pm$3e-3   9.02  8.00
                 single-tag                             [hyd]   0.89$\pm$0.04  0.027$\pm$2e-3   24.3  25.0
                 multi-tag      [hyd, hose, pump, compressor]   0.88$\pm$0.02  0.068$\pm$3e-3   9.71  9.00
Engine           rules-based                           Engine   0.81$\pm$0.02  0.059$\pm$3e-3   10.8  9.00
                 single-tag                          [engine]   0.80$\pm$0.03  0.053$\pm$3e-3   12.0  10.0
                 multi-tag              [engine, filter, fan]   0.81$\pm$0.02  0.068$\pm$4e-3   9.31  8.00
exp = [bucket_exp, eng_exp, hyd_exp]
f,axes = plt.subplots(nrows=3, figsize=(5,10))
for n, ax in enumerate(axes): 
    exp[n]['rules-based']['kmf'].plot(ax=ax, color='dodgerblue')
    exp[n]['multi-tag']['kmf'].plot(ax=ax, color='xkcd:rust', ls=':')
    exp[n]['single-tag']['kmf'].plot(ax=ax, color='xkcd:rust')

    ax.set_xlim(0,110)
    ax.set_ylim(0,1)
    ax.set_title(r"$S(t)$"+f" of {res.index.levels[0][n]}")
    sns.despine()
plt.tight_layout()

This next plot gives a better idea of the differences, using a log-log transform. The tags slightly under-estimate death rates in the 80–130 day range, probably because there is a failure mode not captured by the [bucket, lip, tooth] tags (because it's rare).

f,axes = plt.subplots(nrows=3, figsize=(5,10))
for n, ax in enumerate(axes): 
    exp[n]['rules-based']['kmf'].plot_loglogs(ax=ax, c='dodgerblue')
    exp[n]['single-tag']['kmf'].plot_loglogs(ax=ax, c='xkcd:rust', ls=':')
    exp[n]['multi-tag']['kmf'].plot_loglogs(ax=ax, c='xkcd:rust')
    if n != 2:
        ax.legend_.remove()
#     ax.set_xlim(0,110)
#     ax.set_ylim(0,1)
    ax.set_title(r"$\log(-\log(S(t)))$"+f" of {res.index.levels[0][n]}")
    sns.despine()
plt.tight_layout()
f.savefig('bkt_logKMsurvival.png')
# kmf.plot_loglogs()