Search
[3]:
import sys, os
sys.path.append(os.path.join("..",".."))
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from mika.utils import Data
from mika.ir import search
Prepare Data: NTSB
[5]:
# All data paths in this notebook are relative to the directory two levels above
# the notebook's starting directory (presumably the repository root -- confirm).
# Only move there when the dataset is not already reachable from the current
# working directory, so that re-running this cell does not climb two more
# levels each time and break the relative paths.
ntsb_filepath = os.path.join("data", "NTSB", "ntsb_full.csv")
if not os.path.exists(ntsb_filepath):
    os.chdir(os.path.join("..", ".."))

ntsb_text_columns = ['narr_cause', 'narr_accf'] # narrative accident cause and narrative accident final
ntsb_document_id_col = 'ev_id'
ntsb_database_name = 'NTSB'

# Load the raw (not preprocessed) NTSB records; every column is read as str.
# Copies of the column list are passed -- presumably because Data may modify the
# list it is given (TODO confirm against mika.utils.Data).
ntsb_data = Data()
ntsb_data.load(
    ntsb_filepath,
    preprocessed=False,
    id_col=ntsb_document_id_col,
    text_columns=ntsb_text_columns.copy(),
    name=ntsb_database_name,
    load_kwargs={'dtype': str},
)
ntsb_data.prepare_data(
    create_ids=False,
    combine_columns=ntsb_text_columns.copy(),
    remove_incomplete_rows=False,
)

# filter to years of interest
# The years are strings ('2011' .. '2021') because the file is loaded with dtype=str.
years_of_interest = [str(year) for year in range(2011, 2022)]
in_years_of_interest = ntsb_data.data_df['ev_year'].isin(years_of_interest)
ntsb_data.data_df = (
    ntsb_data.data_df
    .loc[in_years_of_interest]
    # keep the last record, this one has the phase of flight and mishap right before the accident
    .drop_duplicates(subset=ntsb_document_id_col, keep="last")
    .reset_index(drop=True)
)
# Name-mangled call to the private Data.__update_ids method; presumably it
# re-syncs the object's stored document ids with the filtered frame -- confirm.
ntsb_data._Data__update_ids()
Combining Columns…: 100%|██████████| 196535/196535 [00:02<00:00, 75855.79it/s]
data preparation: 0.04 minutes
Perform Search
[10]:
# fine tuned model
model = SentenceTransformer("NASA-AIML/MIKA_Custom_IR")
# To use the pretrained model instead, replace the line above with:
#   model = SentenceTransformer('all-distilroberta-v1')

# Build the search object over the 'narr_cause' column of the prepared NTSB data.
ir_ntsb = search('narr_cause', ntsb_data, model)

# Computing sentence embeddings is the expensive step, so reuse the file from a
# previous run when it exists and only compute when it is missing.
# NOTE(review): this assumes get_sentence_embeddings() writes its result to the
# given path -- confirm against mika.ir.search. Delete the .npy file whenever the
# model or the filtered data set changes, otherwise stale embeddings are loaded.
embeddings_path = os.path.join('data', 'NTSB', 'ntsb_sentence_embeddings_finetune.npy')
if os.path.exists(embeddings_path):
    ir_ntsb.load_sentence_embeddings(embeddings_path)
else:
    ir_ntsb.get_sentence_embeddings(embeddings_path)
[11]:
# Natural-language questions to run against the NTSB search index.
queries = [
    'what components are vulnerable to fatigue crack',
    'what are the consequences of a fuel leak',
    'what are the risks of low visibility',
]

# For each question, echo it and then show the search results returned for it
# (return_k=10 and rank_k=10 are passed through to run_search unchanged).
for query in queries:
    print(query)
    display(
        ir_ntsb.run_search(
            query,
            return_k=10,
            rank_k=10,
        )
    )
what components are vulnerable to fatigue crack
top_hit_doc | top_hit_scores | top_hit_text | |
---|---|---|---|
0 | 20121204X63622 | 0.699882 | The total loss of engine power as a result of ... |
1 | 20120703X63653 | 0.633137 | An undetected fatigue crack on the outer surfa... |
2 | 20180404X13226 | 0.603839 | Extensive fatigue cracking in the left-wing ma... |
3 | 20161027X90950 | 0.600091 | A No. 1 engine fire caused by a fuel manifold ... |
4 | 20110613X64905 | 0.599796 | A quality control failure in the Sungear, Inc.... |
5 | 20140921X93622 | 0.598661 | The fatigue fracture of the No. 3 exhaust valv... |
6 | 20110315X35559 | 0.591568 | The fatigue fracture and subsequent failure of... |
7 | 20170607X22245 | 0.582699 | The PW4056 engine lost power due to the fatigu... |
8 | 20161216X91529 | 0.562310 | A fatigue crack of the crankshaft at the No. 4... |
9 | 20130524X83830 | 0.560072 | Failure of the right inboard foreflap outboard... |
what are the consequences of a fuel leak
top_hit_doc | top_hit_scores | top_hit_text | |
---|---|---|---|
0 | 20110729X01918 | 0.596457 | The loss of engine power due to fuel contamina... |
1 | 20140608X03928 | 0.595382 | The loosening of a B-nut in the fuel system wh... |
2 | 20211004104034 | 0.592871 | A total loss of engine power due to fuel conta... |
3 | 20110923X51306 | 0.592871 | A total loss of engine power due to fuel conta... |
4 | 20150421X33923 | 0.591596 | The partial loss of engine power due to contam... |
5 | 20110621X20741 | 0.585873 | Fuel leaking from the fuel flow transmitter th... |
6 | 20151005X24854 | 0.582271 | The total loss of engine power due to contamin... |
7 | 20150325X74504 | 0.582222 | Loss of engine power due to fuel contamination... |
8 | 20180918X41159 | 0.582222 | Loss of engine power due to fuel contamination... |
9 | 20200601X20239 | 0.575583 | A total loss of engine power due to a fuel lea... |
what are the risks of low visibility
top_hit_doc | top_hit_scores | top_hit_text | |
---|---|---|---|
0 | 20150110X84136 | 0.471297 | The non-instrument rated pilot’s decision to d... |
1 | 20180319X90457 | 0.445092 | The pilot's failure to see and avoid power lin... |
2 | 20170114X83817 | 0.441732 | The pilot's failure to see and avoid power lin... |
3 | 20210909103825 | 0.434227 | The pilot's failure to maintain clearance from... |
4 | 20110401X40531 | 0.431236 | The pilot’s inadequate visual lookout and fail... |
5 | 20131002X40410 | 0.431074 | The pilot’s distracted attention and failure t... |
6 | 20150903X31635 | 0.426771 | The pilot’s inadequate visual lookout and fail... |
7 | 20150514X61132 | 0.425973 | The pilot's failure to see and avoid power lin... |
8 | 20130517X02521 | 0.425482 | The pilot's failure to maintain clearance from... |
9 | 20140430X30514 | 0.425437 | The pilots’ failure to maintain clearance from... |