[1]:

from masskit.utils.tablemap import ArrowLibraryMap
from masskit.data_specs.spectral_library import display_masskit_df
from masskit.test_fixtures.demo_fixtures import cho_uniq_short_parquet
from masskit.data_specs.file_schemas import display_drop_fields

Tables of spectra¶

A spectral library is a table of spectrum objects and associated data (nce, precursor charge, etc.), arranged so that there is one spectrum per row and one column per piece of associated data. In Masskit, spectral libraries are kept in pandas dataframes and arrow tables stored in the parquet format. The arrow tables are wrapped in objects of class ArrowLibraryMap.

Pandas dataframes¶

Load a pandas dataframe from the parquet version of a spectral library¶

[2]:

# Open the demo parquet library and wrap it in an ArrowLibraryMap.
table = ArrowLibraryMap.from_parquet(cho_uniq_short_parquet())

# Convert the wrapped arrow table to pandas, remove the columns named in
# display_drop_fields (errors='ignore' skips any that are absent), and keep
# only the first 10 rows for the rest of the walkthrough.
df = (
    table.table
    .to_pandas()
    .drop(columns=display_drop_fields, errors='ignore')
    .head(10)
)

INFO:root:created chunk 1 with 100 records
INFO:root:processing batch 0 with size 100
INFO:root:created chunk 1 with 0 records

Display the top 10 rows of the dataframe (selected in the previous cell) as HTML¶

[3]:

# Render the 10-row dataframe with masskit's dataframe display helper.
display_masskit_df(df)

id	instrument	instrument_type	instrument_model	ion_mode	ionization	name	casno	synonyms	scan	collision_energy	retention_time	collision_gas	insource_voltage	sample_inlet	ev	nce	charge	precursor_mz	exact_mass	exact_mw	set	composition	peptide	peptide_len	peptide_type	mod_names	mod_positions	protein_id
0	None	None	None	None	None	AAAACALTPGPLADLAAR/2_1(4,C,CAM)	None	None	None	NaN	NaN	None	NaN	None	46.00	35.00	2	855.45	NaN	NaN	NaN	bestof	AAAACALTPGPLADLAAR	18	tryptic	[4]	[4]	None
1	None	None	None	None	None	AAAACALTPGPLADLAAR/2_1(4,C,CAM)	None	None	None	NaN	NaN	None	NaN	None	53.00	35.00	2	855.45	NaN	NaN	NaN	bestof	AAAACALTPGPLADLAAR	18	tryptic	[4]	[4]	None
2	None	None	None	None	None	AAAAGQTGTVPPGAPGALPLPGMAIVK/2	None	None	None	NaN	NaN	None	NaN	None	76.00	35.00	2	1207.17	NaN	NaN	NaN	bestof	AAAAGQTGTVPPGAPGALPLPGMAIVK	27	semitryptic	[]	[]	None
3	None	None	None	None	None	AAAAGSTSVKPIFSR/2	None	None	None	NaN	NaN	None	NaN	None	44.00	34.00	2	731.90	NaN	NaN	NaN	bestof	AAAAGSTSVKPIFSR	15	semitryptic	[]	[]	None
4	None	None	None	None	None	AAAAGSTSVKPIFSR/3	None	None	None	NaN	NaN	None	NaN	None	28.00	34.00	3	488.27	NaN	NaN	NaN	bestof	AAAAGSTSVKPIFSR	15	semitryptic	[]	[]	None
5	None	None	None	None	None	AAAALGSHGSCSSEVEK/2_1(10,C,CAM)	None	None	None	NaN	NaN	None	NaN	None	50.00	34.00	2	830.88	NaN	NaN	NaN	bestof	AAAALGSHGSCSSEVEK	17	tryptic	[4]	[10]	None
6	None	None	None	None	None	AAAALGSHGSCSSEVEK/2_1(10,C,CAM)	None	None	None	NaN	NaN	None	NaN	None	52.00	35.00	2	830.88	NaN	NaN	NaN	bestof	AAAALGSHGSCSSEVEK	17	tryptic	[4]	[10]	None
7	None	None	None	None	None	AAAALGSHGSCSSEVEK/3_1(10,C,CAM)	None	None	None	NaN	NaN	None	NaN	None	32.00	35.00	3	554.26	NaN	NaN	NaN	bestof	AAAALGSHGSCSSEVEK	17	tryptic	[4]	[10]	None
8	None	None	None	None	None	AAAALGSHGSCSSEVEKETQEK/3_1(10,C,CAM)	None	None	None	NaN	NaN	None	NaN	None	32.00	35.00	3	759.35	NaN	NaN	NaN	bestof	AAAALGSHGSCSSEVEKETQEK	22	tryptic	[4]	[10]	None
9	None	None	None	None	None	AAAALGSHGSCSSEVEKETQEK/3_1(10,C,CAM)	None	None	None	NaN	NaN	None	NaN	None	43.00	34.00	3	759.35	NaN	NaN	NaN	bestof	AAAALGSHGSCSSEVEKETQEK	22	tryptic	[4]	[10]	None

List columns¶

[4]:

# Bare last expression: the notebook shows the column Index as the cell output.
df.columns

[4]:

Index(['id', 'instrument', 'instrument_type', 'instrument_model', 'ion_mode',
       'ionization', 'name', 'casno', 'synonyms', 'scan', 'collision_energy',
       'retention_time', 'collision_gas', 'insource_voltage', 'sample_inlet',
       'ev', 'nce', 'charge', 'precursor_mz', 'exact_mass', 'exact_mw', 'set',
       'composition', 'peptide', 'peptide_len', 'peptide_type', 'mod_names',
       'mod_positions', 'protein_id', 'spectrum'],
      dtype='object')

Iterating through a dataframe¶

[5]:

# itertuples() yields one namedtuple per row, so each column value is
# available as an attribute (row.peptide, row.nce).
for row in df.head(10).itertuples():
    print(row.peptide, row.nce)

AAAACALTPGPLADLAAR 35.0
AAAACALTPGPLADLAAR 35.0
AAAAGQTGTVPPGAPGALPLPGMAIVK 35.0
AAAAGSTSVKPIFSR 34.0
AAAAGSTSVKPIFSR 34.0
AAAALGSHGSCSSEVEK 34.0
AAAALGSHGSCSSEVEK 35.0
AAAALGSHGSCSSEVEK 35.0
AAAALGSHGSCSSEVEKETQEK 35.0
AAAALGSHGSCSSEVEKETQEK 34.0

Query the dataframe to get a new dataframe containing only the rows that match, then display the new dataframe¶

[6]:

# Keep only the rows whose 'peptide' column equals this exact sequence;
# query() returns a new dataframe and leaves df unchanged.
df_single_peptide = df.query('peptide == "AAAALGSHGSCSSEVEK"')
# Show the filtered rows with the same display helper used for df.
display_masskit_df(df_single_peptide)

id	instrument	instrument_type	instrument_model	ion_mode	ionization	name	casno	synonyms	scan	collision_energy	retention_time	collision_gas	insource_voltage	sample_inlet	ev	nce	charge	precursor_mz	exact_mass	exact_mw	set	composition	peptide	peptide_len	peptide_type	mod_names	mod_positions	protein_id
5	None	None	None	None	None	AAAALGSHGSCSSEVEK/2_1(10,C,CAM)	None	None	None	NaN	NaN	None	NaN	None	50.00	34.00	2	830.88	NaN	NaN	NaN	bestof	AAAALGSHGSCSSEVEK	17	tryptic	[4]	[10]	None
6	None	None	None	None	None	AAAALGSHGSCSSEVEK/2_1(10,C,CAM)	None	None	None	NaN	NaN	None	NaN	None	52.00	35.00	2	830.88	NaN	NaN	NaN	bestof	AAAALGSHGSCSSEVEK	17	tryptic	[4]	[10]	None
7	None	None	None	None	None	AAAALGSHGSCSSEVEK/3_1(10,C,CAM)	None	None	None	NaN	NaN	None	NaN	None	32.00	35.00	3	554.26	NaN	NaN	NaN	bestof	AAAALGSHGSCSSEVEK	17	tryptic	[4]	[10]	None

Select one spectrum from the new dataframe, filter out peaks below m/z 500, and normalize the base intensity to 1.0¶

[7]:

# Inspect the Python type stored in the 'spectrum' column of the first matching row.
type(df_single_peptide.iloc[0]['spectrum'])

[7]:

masskit.spectrum.spectrum.Spectrum

[8]:

# Take the spectrum from the first matching row, apply filter(min_mz=500),
# then norm(1.0); the result of the chain is bound to new_spectrum.
new_spectrum = (
    df_single_peptide.iloc[0]['spectrum']
    .filter(min_mz=500)
    .norm(1.0)
)
# Bare last expression so the notebook renders the resulting spectrum.
new_spectrum

[8]:

Input/output¶

A spectral library held in a dataframe can be read from and written to a variety of formats.

MSP¶

[9]:

# Write every spectrum in the 'spectrum' column to "peptides.msp" in MSP format,
# with peptide annotations enabled (annotate_peptide=True). The export method is
# reached through the column's underlying array (.array).
df_single_peptide['spectrum'].array.to_msp("peptides.msp", annotate_peptide=True)

Save the single spectrum at row 0 as an MSP file¶

[10]:

# iloc[[0]] (list indexer) keeps a one-row dataframe rather than returning a
# bare row, so ['spectrum'] is still a column whose .array offers to_msp().
df_single_peptide.iloc[[0]]['spectrum'].array.to_msp("single_spectrum.msp")

MGF¶

[11]:

# Write the spectra in MGF format. The output uses the .mgf extension so that
# it matches its contents and does not overwrite "peptides.msp", the MSP
# export written by the earlier MSP cell.
df_single_peptide['spectrum'].array.to_mgf("peptides.mgf")

PKL (python pickle)¶

[12]:

# Serialize the whole filtered dataframe with Python pickle.
# Note: pickle files can execute arbitrary code when loaded, so only read
# back .pkl files that come from a trusted source.
df_single_peptide.to_pickle('single_peptide.pkl')

CSV¶

[13]:

# Write the filtered dataframe as CSV; with pandas defaults the row index is
# written as the first column.
# NOTE(review): the 'spectrum' column holds Spectrum objects; how they are
# rendered as CSV text is not shown here — confirm the output is usable.
df_single_peptide.to_csv('single_peptide.csv')