{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "polish-inquiry", "metadata": {}, "outputs": [], "source": [ "from masskit.utils.tablemap import ArrowLibraryMap\n", "from masskit.data_specs.spectral_library import display_masskit_df\n", "from masskit.test_fixtures.demo_fixtures import cho_uniq_short_parquet\n", "from masskit.data_specs.file_schemas import display_drop_fields" ] }, { "attachments": {}, "cell_type": "markdown", "id": "dutch-vehicle", "metadata": {}, "source": [ "# Tables of spectra\n", "A spectral library is a table of spectrum objects and associated data (nce, precursor charge, etc.), arranged so that there is one spectrum per row and one column per piece of associated data. In Masskit, spectral libraries are kept in pandas dataframes and arrow tables stored in the parquet format. The arrow tables are wrapped in objects of class ArrowLibraryMap.\n" ] }, { "attachments": {}, "cell_type": "markdown", "id": "intelligent-commonwealth", "metadata": {}, "source": [ "## Pandas dataframes\n", "### Load a pandas dataframe from the parquet version of a spectral library" ] }, { "cell_type": "code", "execution_count": 2, "id": "economic-attitude", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:root:created chunk 1 with 100 records\n", "INFO:root:processing batch 0 with size 100\n", "INFO:root:created chunk 1 with 0 records\n" ] } ], "source": [ "table = ArrowLibraryMap.from_parquet(cho_uniq_short_parquet())\n", "df = table.table.to_pandas().drop(columns=display_drop_fields, errors='ignore').head(10)" ] }, { "attachments": {}, "cell_type": "markdown", "id": "exclusive-bosnia", "metadata": {}, "source": [ "### Take the top 10 lines of the dataframe and display in html" ] }, { "cell_type": "code", "execution_count": 3, "id": "intelligent-europe", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idinstrumentinstrument_typeinstrument_modelion_modeionizationnamecasnosynonymsscancollision_energyretention_timecollision_gasinsource_voltagesample_inletevncechargeprecursor_mzexact_massexact_mwsetcompositionpeptidepeptide_lenpeptide_typemod_namesmod_positionsprotein_idspectrum
0NoneNoneNoneNoneNoneAAAACALTPGPLADLAAR/2_1(4,C,CAM)NoneNoneNoneNaNNaNNoneNaNNone46.0035.002855.45NaNNaNNaNbestofAAAACALTPGPLADLAAR18tryptic[4][4]None\"spectrum
1NoneNoneNoneNoneNoneAAAACALTPGPLADLAAR/2_1(4,C,CAM)NoneNoneNoneNaNNaNNoneNaNNone53.0035.002855.45NaNNaNNaNbestofAAAACALTPGPLADLAAR18tryptic[4][4]None\"spectrum
2NoneNoneNoneNoneNoneAAAAGQTGTVPPGAPGALPLPGMAIVK/2NoneNoneNoneNaNNaNNoneNaNNone76.0035.0021207.17NaNNaNNaNbestofAAAAGQTGTVPPGAPGALPLPGMAIVK27semitryptic[][]None\"spectrum
3NoneNoneNoneNoneNoneAAAAGSTSVKPIFSR/2NoneNoneNoneNaNNaNNoneNaNNone44.0034.002731.90NaNNaNNaNbestofAAAAGSTSVKPIFSR15semitryptic[][]None\"spectrum
4NoneNoneNoneNoneNoneAAAAGSTSVKPIFSR/3NoneNoneNoneNaNNaNNoneNaNNone28.0034.003488.27NaNNaNNaNbestofAAAAGSTSVKPIFSR15semitryptic[][]None\"spectrum
5NoneNoneNoneNoneNoneAAAALGSHGSCSSEVEK/2_1(10,C,CAM)NoneNoneNoneNaNNaNNoneNaNNone50.0034.002830.88NaNNaNNaNbestofAAAALGSHGSCSSEVEK17tryptic[4][10]None\"spectrum
6NoneNoneNoneNoneNoneAAAALGSHGSCSSEVEK/2_1(10,C,CAM)NoneNoneNoneNaNNaNNoneNaNNone52.0035.002830.88NaNNaNNaNbestofAAAALGSHGSCSSEVEK17tryptic[4][10]None\"spectrum
7NoneNoneNoneNoneNoneAAAALGSHGSCSSEVEK/3_1(10,C,CAM)NoneNoneNoneNaNNaNNoneNaNNone32.0035.003554.26NaNNaNNaNbestofAAAALGSHGSCSSEVEK17tryptic[4][10]None\"spectrum
8NoneNoneNoneNoneNoneAAAALGSHGSCSSEVEKETQEK/3_1(10,C,CAM)NoneNoneNoneNaNNaNNoneNaNNone32.0035.003759.35NaNNaNNaNbestofAAAALGSHGSCSSEVEKETQEK22tryptic[4][10]None\"spectrum
9NoneNoneNoneNoneNoneAAAALGSHGSCSSEVEKETQEK/3_1(10,C,CAM)NoneNoneNoneNaNNaNNoneNaNNone43.0034.003759.35NaNNaNNaNbestofAAAALGSHGSCSSEVEKETQEK22tryptic[4][10]None\"spectrum
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "display_masskit_df(df)" ] }, { "attachments": {}, "cell_type": "markdown", "id": "brave-height", "metadata": {}, "source": [ "### List columns" ] }, { "cell_type": "code", "execution_count": 4, "id": "ranking-pickup", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['id', 'instrument', 'instrument_type', 'instrument_model', 'ion_mode',\n", " 'ionization', 'name', 'casno', 'synonyms', 'scan', 'collision_energy',\n", " 'retention_time', 'collision_gas', 'insource_voltage', 'sample_inlet',\n", " 'ev', 'nce', 'charge', 'precursor_mz', 'exact_mass', 'exact_mw', 'set',\n", " 'composition', 'peptide', 'peptide_len', 'peptide_type', 'mod_names',\n", " 'mod_positions', 'protein_id', 'spectrum'],\n", " dtype='object')" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns" ] }, { "attachments": {}, "cell_type": "markdown", "id": "constant-texas", "metadata": {}, "source": [ "### Iterating through a dataframe" ] }, { "cell_type": "code", "execution_count": 5, "id": "guided-rings", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AAAACALTPGPLADLAAR 35.0\n", "AAAACALTPGPLADLAAR 35.0\n", "AAAAGQTGTVPPGAPGALPLPGMAIVK 35.0\n", "AAAAGSTSVKPIFSR 34.0\n", "AAAAGSTSVKPIFSR 34.0\n", "AAAALGSHGSCSSEVEK 34.0\n", "AAAALGSHGSCSSEVEK 35.0\n", "AAAALGSHGSCSSEVEK 35.0\n", "AAAALGSHGSCSSEVEKETQEK 35.0\n", "AAAALGSHGSCSSEVEKETQEK 34.0\n" ] } ], "source": [ "for row in df.head(10).itertuples():\n", " print (row.peptide, row.nce)" ] }, { "attachments": {}, "cell_type": "markdown", "id": "legislative-tourism", "metadata": {}, "source": [ "### Query on the dataframe to return a dataframe containing rows that match the query, then display the new dataframe" ] }, { "cell_type": "code", "execution_count": 6, "id": "likely-circle", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idinstrumentinstrument_typeinstrument_modelion_modeionizationnamecasnosynonymsscancollision_energyretention_timecollision_gasinsource_voltagesample_inletevncechargeprecursor_mzexact_massexact_mwsetcompositionpeptidepeptide_lenpeptide_typemod_namesmod_positionsprotein_idspectrum
5NoneNoneNoneNoneNoneAAAALGSHGSCSSEVEK/2_1(10,C,CAM)NoneNoneNoneNaNNaNNoneNaNNone50.0034.002830.88NaNNaNNaNbestofAAAALGSHGSCSSEVEK17tryptic[4][10]None\"spectrum
6NoneNoneNoneNoneNoneAAAALGSHGSCSSEVEK/2_1(10,C,CAM)NoneNoneNoneNaNNaNNoneNaNNone52.0035.002830.88NaNNaNNaNbestofAAAALGSHGSCSSEVEK17tryptic[4][10]None\"spectrum
7NoneNoneNoneNoneNoneAAAALGSHGSCSSEVEK/3_1(10,C,CAM)NoneNoneNoneNaNNaNNoneNaNNone32.0035.003554.26NaNNaNNaNbestofAAAALGSHGSCSSEVEK17tryptic[4][10]None\"spectrum
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df_single_peptide = df.query('peptide == \"AAAALGSHGSCSSEVEK\"')\n", "display_masskit_df(df_single_peptide)" ] }, { "attachments": {}, "cell_type": "markdown", "id": "assigned-welcome", "metadata": {}, "source": [ "## Select one spectrum from the new dataframe, filter out low m/z peaks, normalize the base intensity to 1.0" ] }, { "cell_type": "code", "execution_count": 7, "id": "b6baab7f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "masskit.spectrum.spectrum.Spectrum" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(df_single_peptide.iloc[0]['spectrum'])" ] }, { "cell_type": "code", "execution_count": 8, "id": "disturbed-rwanda", "metadata": {}, "outputs": [ { "data": { "image/png": "", "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", " \n", " 2023-07-12T14:16:43.126665\n", " image/svg+xml\n", " \n", " \n", " Matplotlib v3.6.2, https://matplotlib.org/\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n" ], "text/plain": [ "" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_spectrum = df_single_peptide.iloc[0]['spectrum']\n", "new_spectrum = new_spectrum.filter(min_mz=500).norm(1.0)\n", "new_spectrum" ] }, { "attachments": {}, "cell_type": "markdown", "id": "obvious-welsh", "metadata": {}, "source": [ "### Input/output\n", "The spectral library in the form of a dataframe has the ability to read and write a variety of formats." 
] }, { "attachments": {}, "cell_type": "markdown", "id": "decimal-regular", "metadata": {}, "source": [ "#### MSP" ] }, { "cell_type": "code", "execution_count": 9, "id": "cathedral-candidate", "metadata": {}, "outputs": [], "source": [ "# write out msp file with peptide annotations\n", "df_single_peptide['spectrum'].array.to_msp(\"peptides.msp\", annotate_peptide=True)" ] }, { "attachments": {}, "cell_type": "markdown", "id": "prescription-baghdad", "metadata": {}, "source": [ "##### save a single spectrum at row 0 as an msp file" ] }, { "cell_type": "code", "execution_count": 10, "id": "worthy-rochester", "metadata": {}, "outputs": [], "source": [ "df_single_peptide.iloc[[0]]['spectrum'].array.to_msp(\"single_spectrum.msp\")" ] }, { "attachments": {}, "cell_type": "markdown", "id": "scientific-today", "metadata": {}, "source": [ "#### MGF" ] }, { "cell_type": "code", "execution_count": 11, "id": "vocal-failing", "metadata": {}, "outputs": [], "source": [ "# write out mgf file; the .mgf name keeps it from overwriting peptides.msp written above\n", "df_single_peptide['spectrum'].array.to_mgf(\"peptides.mgf\")" ] }, { "attachments": {}, "cell_type": "markdown", "id": "associate-warren", "metadata": {}, "source": [ "#### PKL (python pickle)" ] }, { "cell_type": "code", "execution_count": 12, "id": "lucky-recovery", "metadata": {}, "outputs": [], "source": [ "df_single_peptide.to_pickle('single_peptide.pkl')" ] }, { "attachments": {}, "cell_type": "markdown", "id": "smoking-importance", "metadata": {}, "source": [ "#### CSV" ] }, { "cell_type": "code", "execution_count": 13, "id": "acceptable-insertion", "metadata": {}, "outputs": [], "source": [ "df_single_peptide.to_csv('single_peptide.csv')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.9.10 ('base')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.15" }, "vscode": { 
"interpreter": { "hash": "11d150ef1a59d6ee6bd3538ad9ed751649d8a614c736b8deec7e36a34a38bbb5" } } }, "nbformat": 4, "nbformat_minor": 5 }