[1]:
from masskit.utils.tablemap import ArrowLibraryMap
from masskit.data_specs.spectral_library import display_masskit_df
from masskit.test_fixtures.demo_fixtures import cho_uniq_short_parquet
from masskit.data_specs.file_schemas import display_drop_fields

Tables of spectra

A spectral library is a table of spectrum objects and associated data (nce, precursor charge, etc.), arranged so that there is one spectrum per row and one column per piece of associated data. In Masskit, spectral libraries are kept in pandas dataframes and arrow tables stored in the parquet format. The arrow tables are wrapped in objects of class ArrowLibraryMap.

Pandas dataframes

Load the parquet version of a spectral library into a pandas dataframe, dropping the columns listed in display_drop_fields and keeping only the first 10 rows

[2]:
# Open the demo parquet library as an ArrowLibraryMap
table = ArrowLibraryMap.from_parquet(cho_uniq_short_parquet())
# Convert the wrapped arrow table to pandas, drop the display_drop_fields
# columns (errors="ignore" skips any that are absent) and keep the first 10 rows
df = (
    table.table.to_pandas()
    .drop(columns=display_drop_fields, errors="ignore")
    .head(10)
)
INFO:root:created chunk 1 with 100 records
INFO:root:processing batch 0 with size 100
INFO:root:created chunk 1 with 0 records

Display the 10-row dataframe as HTML

[3]:
# Render the 10-row dataframe with masskit's dataframe display helper
display_masskit_df(df)
id instrument instrument_type instrument_model ion_mode ionization name casno synonyms scan collision_energy retention_time collision_gas insource_voltage sample_inlet ev nce charge precursor_mz exact_mass exact_mw set composition peptide peptide_len peptide_type mod_names mod_positions protein_id spectrum
0 None None None None None AAAACALTPGPLADLAAR/2_1(4,C,CAM) None None None NaN NaN None NaN None 46.00 35.00 2 855.45 NaN NaN NaN bestof AAAACALTPGPLADLAAR 18 tryptic [4] [4] None spectrum AAAACALTPGPLADLAAR/2_1(4,C,CAM)
1 None None None None None AAAACALTPGPLADLAAR/2_1(4,C,CAM) None None None NaN NaN None NaN None 53.00 35.00 2 855.45 NaN NaN NaN bestof AAAACALTPGPLADLAAR 18 tryptic [4] [4] None spectrum AAAACALTPGPLADLAAR/2_1(4,C,CAM)
2 None None None None None AAAAGQTGTVPPGAPGALPLPGMAIVK/2 None None None NaN NaN None NaN None 76.00 35.00 2 1207.17 NaN NaN NaN bestof AAAAGQTGTVPPGAPGALPLPGMAIVK 27 semitryptic [] [] None spectrum AAAAGQTGTVPPGAPGALPLPGMAIVK/2
3 None None None None None AAAAGSTSVKPIFSR/2 None None None NaN NaN None NaN None 44.00 34.00 2 731.90 NaN NaN NaN bestof AAAAGSTSVKPIFSR 15 semitryptic [] [] None spectrum AAAAGSTSVKPIFSR/2
4 None None None None None AAAAGSTSVKPIFSR/3 None None None NaN NaN None NaN None 28.00 34.00 3 488.27 NaN NaN NaN bestof AAAAGSTSVKPIFSR 15 semitryptic [] [] None spectrum AAAAGSTSVKPIFSR/3
5 None None None None None AAAALGSHGSCSSEVEK/2_1(10,C,CAM) None None None NaN NaN None NaN None 50.00 34.00 2 830.88 NaN NaN NaN bestof AAAALGSHGSCSSEVEK 17 tryptic [4] [10] None spectrum AAAALGSHGSCSSEVEK/2_1(10,C,CAM)
6 None None None None None AAAALGSHGSCSSEVEK/2_1(10,C,CAM) None None None NaN NaN None NaN None 52.00 35.00 2 830.88 NaN NaN NaN bestof AAAALGSHGSCSSEVEK 17 tryptic [4] [10] None spectrum AAAALGSHGSCSSEVEK/2_1(10,C,CAM)
7 None None None None None AAAALGSHGSCSSEVEK/3_1(10,C,CAM) None None None NaN NaN None NaN None 32.00 35.00 3 554.26 NaN NaN NaN bestof AAAALGSHGSCSSEVEK 17 tryptic [4] [10] None spectrum AAAALGSHGSCSSEVEK/3_1(10,C,CAM)
8 None None None None None AAAALGSHGSCSSEVEKETQEK/3_1(10,C,CAM) None None None NaN NaN None NaN None 32.00 35.00 3 759.35 NaN NaN NaN bestof AAAALGSHGSCSSEVEKETQEK 22 tryptic [4] [10] None spectrum AAAALGSHGSCSSEVEKETQEK/3_1(10,C,CAM)
9 None None None None None AAAALGSHGSCSSEVEKETQEK/3_1(10,C,CAM) None None None NaN NaN None NaN None 43.00 34.00 3 759.35 NaN NaN NaN bestof AAAALGSHGSCSSEVEKETQEK 22 tryptic [4] [10] None spectrum AAAALGSHGSCSSEVEKETQEK/3_1(10,C,CAM)

List columns

[4]:
# Column labels of the dataframe; as the cell's last expression, the Index is shown as output
df.columns
[4]:
Index(['id', 'instrument', 'instrument_type', 'instrument_model', 'ion_mode',
       'ionization', 'name', 'casno', 'synonyms', 'scan', 'collision_energy',
       'retention_time', 'collision_gas', 'insource_voltage', 'sample_inlet',
       'ev', 'nce', 'charge', 'precursor_mz', 'exact_mass', 'exact_mw', 'set',
       'composition', 'peptide', 'peptide_len', 'peptide_type', 'mod_names',
       'mod_positions', 'protein_id', 'spectrum'],
      dtype='object')

Iterating through a dataframe

[5]:
# Walk the first 10 rows as named tuples and print each peptide with its nce value
for record in df.head(10).itertuples():
    peptide, nce = record.peptide, record.nce
    print(peptide, nce)
AAAACALTPGPLADLAAR 35.0
AAAACALTPGPLADLAAR 35.0
AAAAGQTGTVPPGAPGALPLPGMAIVK 35.0
AAAAGSTSVKPIFSR 34.0
AAAAGSTSVKPIFSR 34.0
AAAALGSHGSCSSEVEK 34.0
AAAALGSHGSCSSEVEK 35.0
AAAALGSHGSCSSEVEK 35.0
AAAALGSHGSCSSEVEKETQEK 35.0
AAAALGSHGSCSSEVEKETQEK 34.0

Query the dataframe to get a new dataframe containing only the rows that match, then display the new dataframe

[6]:
# Keep only the rows whose peptide column equals the sequence of interest
peptide_filter = 'peptide == "AAAALGSHGSCSSEVEK"'
df_single_peptide = df.query(peptide_filter)
display_masskit_df(df_single_peptide)
id instrument instrument_type instrument_model ion_mode ionization name casno synonyms scan collision_energy retention_time collision_gas insource_voltage sample_inlet ev nce charge precursor_mz exact_mass exact_mw set composition peptide peptide_len peptide_type mod_names mod_positions protein_id spectrum
5 None None None None None AAAALGSHGSCSSEVEK/2_1(10,C,CAM) None None None NaN NaN None NaN None 50.00 34.00 2 830.88 NaN NaN NaN bestof AAAALGSHGSCSSEVEK 17 tryptic [4] [10] None spectrum AAAALGSHGSCSSEVEK/2_1(10,C,CAM)
6 None None None None None AAAALGSHGSCSSEVEK/2_1(10,C,CAM) None None None NaN NaN None NaN None 52.00 35.00 2 830.88 NaN NaN NaN bestof AAAALGSHGSCSSEVEK 17 tryptic [4] [10] None spectrum AAAALGSHGSCSSEVEK/2_1(10,C,CAM)
7 None None None None None AAAALGSHGSCSSEVEK/3_1(10,C,CAM) None None None NaN NaN None NaN None 32.00 35.00 3 554.26 NaN NaN NaN bestof AAAALGSHGSCSSEVEK 17 tryptic [4] [10] None spectrum AAAALGSHGSCSSEVEK/3_1(10,C,CAM)

Select one spectrum from the new dataframe, filter out low m/z peaks (min_mz=500), and normalize the base intensity to 1.0

[7]:
# Take the first matching row, then inspect the type of the object in its 'spectrum' column
first_row = df_single_peptide.iloc[0]
type(first_row['spectrum'])
[7]:
masskit.spectrum.spectrum.Spectrum
[8]:
# Spectrum object stored in the first matching row
first_spectrum = df_single_peptide.iloc[0]['spectrum']
# Apply the m/z filter (min_mz=500), then normalize with norm(1.0)
new_spectrum = first_spectrum.filter(min_mz=500).norm(1.0)
# Last expression of the cell: the notebook renders the resulting spectrum
new_spectrum
[8]:
_images/demo_peptide_14_0.svg

Input/output

A spectral library held in a dataframe can be read from and written to a variety of formats. The examples below show writing.

MSP

[9]:
# Export every spectrum of the selected peptide to an MSP file, with peptide annotations
peptide_spectra = df_single_peptide['spectrum'].array
peptide_spectra.to_msp("peptides.msp", annotate_peptide=True)
Save the single spectrum at row 0 as an MSP file.
[10]:
# Index with a list ([0]) so the selection stays a one-row dataframe and the
# 'spectrum' column still exposes .array, rather than yielding a bare Spectrum
first_row_spectra = df_single_peptide.iloc[[0]]['spectrum'].array
first_row_spectra.to_msp("single_spectrum.msp")

MGF

[11]:
# Write the spectra of the selected peptide as an MGF file. Use a distinct
# .mgf file name so the MSP file written earlier ("peptides.msp") is not
# overwritten with MGF-formatted content.
df_single_peptide['spectrum'].array.to_mgf("peptides.mgf")

PKL (python pickle)

[12]:
# Serialize the filtered dataframe to a pickle file.
# Pickle files should only be loaded back from trusted sources.
df_single_peptide.to_pickle('single_peptide.pkl')

CSV

[13]:
# Write the filtered dataframe as CSV (the row index is included by default).
# NOTE(review): the 'spectrum' column holds Spectrum objects; confirm how they are rendered in the CSV text.
df_single_peptide.to_csv('single_peptide.csv')