[1]:
from masskit.utils.tablemap import ArrowLibraryMap
from masskit.data_specs.spectral_library import display_masskit_df
from masskit.test_fixtures.demo_fixtures import cho_uniq_short_parquet
from masskit.data_specs.file_schemas import display_drop_fields
Tables of spectra¶
A spectral library is a table of spectrum objects and associated data (nce, precursor charge, etc.), arranged so that there is one spectrum per row and one column per piece of associated data. In Masskit, spectral libraries are kept in pandas dataframes and arrow tables stored in the parquet format. The arrow tables are wrapped in objects of class ArrowLibraryMap.
Pandas dataframes¶
Load a pandas dataframe from the parquet version of a spectral library¶
[2]:
# Open the demo CHO library; ArrowLibraryMap wraps the underlying arrow table.
table = ArrowLibraryMap.from_parquet(cho_uniq_short_parquet())

# Convert the arrow table to pandas, drop the columns listed in
# display_drop_fields (ignoring any that are absent), and keep the first 10 rows.
df = (
    table.table
    .to_pandas()
    .drop(columns=display_drop_fields, errors='ignore')
    .head(10)
)
INFO:root:created chunk 1 with 100 records
INFO:root:processing batch 0 with size 100
INFO:root:created chunk 1 with 0 records
Display the first 10 rows of the dataframe as HTML¶
[3]:
# Render the 10-row dataframe with masskit's dataframe display helper.
display_masskit_df(df)
id | instrument | instrument_type | instrument_model | ion_mode | ionization | name | casno | synonyms | scan | collision_energy | retention_time | collision_gas | insource_voltage | sample_inlet | ev | nce | charge | precursor_mz | exact_mass | exact_mw | set | composition | peptide | peptide_len | peptide_type | mod_names | mod_positions | protein_id | spectrum |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | None | None | None | None | None | AAAACALTPGPLADLAAR/2_1(4,C,CAM) | None | None | None | NaN | NaN | None | NaN | None | 46.00 | 35.00 | 2 | 855.45 | NaN | NaN | NaN | bestof | AAAACALTPGPLADLAAR | 18 | tryptic | [4] | [4] | None | |
1 | None | None | None | None | None | AAAACALTPGPLADLAAR/2_1(4,C,CAM) | None | None | None | NaN | NaN | None | NaN | None | 53.00 | 35.00 | 2 | 855.45 | NaN | NaN | NaN | bestof | AAAACALTPGPLADLAAR | 18 | tryptic | [4] | [4] | None | |
2 | None | None | None | None | None | AAAAGQTGTVPPGAPGALPLPGMAIVK/2 | None | None | None | NaN | NaN | None | NaN | None | 76.00 | 35.00 | 2 | 1207.17 | NaN | NaN | NaN | bestof | AAAAGQTGTVPPGAPGALPLPGMAIVK | 27 | semitryptic | [] | [] | None | |
3 | None | None | None | None | None | AAAAGSTSVKPIFSR/2 | None | None | None | NaN | NaN | None | NaN | None | 44.00 | 34.00 | 2 | 731.90 | NaN | NaN | NaN | bestof | AAAAGSTSVKPIFSR | 15 | semitryptic | [] | [] | None | |
4 | None | None | None | None | None | AAAAGSTSVKPIFSR/3 | None | None | None | NaN | NaN | None | NaN | None | 28.00 | 34.00 | 3 | 488.27 | NaN | NaN | NaN | bestof | AAAAGSTSVKPIFSR | 15 | semitryptic | [] | [] | None | |
5 | None | None | None | None | None | AAAALGSHGSCSSEVEK/2_1(10,C,CAM) | None | None | None | NaN | NaN | None | NaN | None | 50.00 | 34.00 | 2 | 830.88 | NaN | NaN | NaN | bestof | AAAALGSHGSCSSEVEK | 17 | tryptic | [4] | [10] | None | |
6 | None | None | None | None | None | AAAALGSHGSCSSEVEK/2_1(10,C,CAM) | None | None | None | NaN | NaN | None | NaN | None | 52.00 | 35.00 | 2 | 830.88 | NaN | NaN | NaN | bestof | AAAALGSHGSCSSEVEK | 17 | tryptic | [4] | [10] | None | |
7 | None | None | None | None | None | AAAALGSHGSCSSEVEK/3_1(10,C,CAM) | None | None | None | NaN | NaN | None | NaN | None | 32.00 | 35.00 | 3 | 554.26 | NaN | NaN | NaN | bestof | AAAALGSHGSCSSEVEK | 17 | tryptic | [4] | [10] | None | |
8 | None | None | None | None | None | AAAALGSHGSCSSEVEKETQEK/3_1(10,C,CAM) | None | None | None | NaN | NaN | None | NaN | None | 32.00 | 35.00 | 3 | 759.35 | NaN | NaN | NaN | bestof | AAAALGSHGSCSSEVEKETQEK | 22 | tryptic | [4] | [10] | None | |
9 | None | None | None | None | None | AAAALGSHGSCSSEVEKETQEK/3_1(10,C,CAM) | None | None | None | NaN | NaN | None | NaN | None | 43.00 | 34.00 | 3 | 759.35 | NaN | NaN | NaN | bestof | AAAALGSHGSCSSEVEKETQEK | 22 | tryptic | [4] | [10] | None |
List columns¶
[4]:
# Bare last expression: the notebook displays the column Index as the cell output.
df.columns
[4]:
Index(['id', 'instrument', 'instrument_type', 'instrument_model', 'ion_mode',
'ionization', 'name', 'casno', 'synonyms', 'scan', 'collision_energy',
'retention_time', 'collision_gas', 'insource_voltage', 'sample_inlet',
'ev', 'nce', 'charge', 'precursor_mz', 'exact_mass', 'exact_mw', 'set',
'composition', 'peptide', 'peptide_len', 'peptide_type', 'mod_names',
'mod_positions', 'protein_id', 'spectrum'],
dtype='object')
Iterating through a dataframe¶
[5]:
# Walk the rows as namedtuples; each column is available as an attribute.
# `df` already holds only the first 10 rows, so no extra .head(10) is needed.
for row in df.itertuples():
    print(row.peptide, row.nce)
AAAACALTPGPLADLAAR 35.0
AAAACALTPGPLADLAAR 35.0
AAAAGQTGTVPPGAPGALPLPGMAIVK 35.0
AAAAGSTSVKPIFSR 34.0
AAAAGSTSVKPIFSR 34.0
AAAALGSHGSCSSEVEK 34.0
AAAALGSHGSCSSEVEK 35.0
AAAALGSHGSCSSEVEK 35.0
AAAALGSHGSCSSEVEKETQEK 35.0
AAAALGSHGSCSSEVEKETQEK 34.0
Query on the dataframe to return a dataframe containing rows that match the query, then display the new dataframe¶
[6]:
# Keep only the rows whose peptide column equals the target sequence,
# then render the filtered dataframe.
peptide_query = 'peptide == "AAAALGSHGSCSSEVEK"'
df_single_peptide = df.query(peptide_query)
display_masskit_df(df_single_peptide)
id | instrument | instrument_type | instrument_model | ion_mode | ionization | name | casno | synonyms | scan | collision_energy | retention_time | collision_gas | insource_voltage | sample_inlet | ev | nce | charge | precursor_mz | exact_mass | exact_mw | set | composition | peptide | peptide_len | peptide_type | mod_names | mod_positions | protein_id | spectrum |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
5 | None | None | None | None | None | AAAALGSHGSCSSEVEK/2_1(10,C,CAM) | None | None | None | NaN | NaN | None | NaN | None | 50.00 | 34.00 | 2 | 830.88 | NaN | NaN | NaN | bestof | AAAALGSHGSCSSEVEK | 17 | tryptic | [4] | [10] | None | |
6 | None | None | None | None | None | AAAALGSHGSCSSEVEK/2_1(10,C,CAM) | None | None | None | NaN | NaN | None | NaN | None | 52.00 | 35.00 | 2 | 830.88 | NaN | NaN | NaN | bestof | AAAALGSHGSCSSEVEK | 17 | tryptic | [4] | [10] | None | |
7 | None | None | None | None | None | AAAALGSHGSCSSEVEK/3_1(10,C,CAM) | None | None | None | NaN | NaN | None | NaN | None | 32.00 | 35.00 | 3 | 554.26 | NaN | NaN | NaN | bestof | AAAALGSHGSCSSEVEK | 17 | tryptic | [4] | [10] | None |
Select one spectrum from the new dataframe, filter out low m/z peaks (min_mz=500), and normalize the base intensity to 1.0¶
[7]:
# Check the Python type of the object stored in the 'spectrum' column of the first row.
type(df_single_peptide.iloc[0]['spectrum'])
[7]:
masskit.spectrum.spectrum.Spectrum
[8]:
# Take the spectrum from the first matching row, apply the m/z filter
# (min_mz=500) and normalize with norm(1.0), all in one chained expression.
new_spectrum = (
    df_single_peptide.iloc[0]['spectrum']
    .filter(min_mz=500)
    .norm(1.0)
)
# Bare last expression so the notebook renders the resulting spectrum.
new_spectrum
[8]:
Input/output¶
A spectral library held in a dataframe can be read from and written to a variety of formats.
MSP¶
[9]:
# Write every spectrum in the 'spectrum' column to an MSP file,
# with peptide annotations enabled via annotate_peptide=True.
df_single_peptide['spectrum'].array.to_msp("peptides.msp", annotate_peptide=True)
Save the single spectrum at row 0 as an MSP file¶
[10]:
# iloc[[0]] (a list, not a scalar) keeps a one-row dataframe, so the
# 'spectrum' column is still a Series whose .array exposes to_msp.
df_single_peptide.iloc[[0]]['spectrum'].array.to_msp("single_spectrum.msp")
MGF¶
[11]:
# Write the spectra in MGF format. Use an .mgf filename so this export does
# not overwrite the "peptides.msp" file written by the MSP example.
df_single_peptide['spectrum'].array.to_mgf("peptides.mgf")
PKL (python pickle)¶
[12]:
# Serialize the whole dataframe with pandas' pickle writer.
# Pickle files should only ever be loaded back from trusted sources.
df_single_peptide.to_pickle('single_peptide.pkl')
CSV¶
[13]:
# Write the dataframe as CSV; by default pandas also writes the row index
# as the first column.
df_single_peptide.to_csv('single_peptide.csv')