{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "polish-inquiry",
"metadata": {},
"outputs": [],
"source": [
"from masskit.utils.tablemap import ArrowLibraryMap\n",
"from masskit.data_specs.spectral_library import display_masskit_df\n",
"from masskit.test_fixtures.demo_fixtures import cho_uniq_short_parquet\n",
"from masskit.data_specs.file_schemas import display_drop_fields"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "dutch-vehicle",
"metadata": {},
"source": [
"# Tables of spectra\n",
"A spectral library is a table of spectrum objects and associated data (nce, precursor charge, etc.), arranged so that there is one spectrum per row and one column per piece of associated data. In Masskit, spectral libraries are kept in pandas dataframes and arrow tables stored in the parquet format. The arrow tables are wrapped in objects of class ArrowLibraryMap.\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "intelligent-commonwealth",
"metadata": {},
"source": [
"## Pandas dataframes\n",
"### Load a pandas dataframe from the parquet version of a spectral library"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "economic-attitude",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:root:created chunk 1 with 100 records\n",
"INFO:root:processing batch 0 with size 100\n",
"INFO:root:created chunk 1 with 0 records\n"
]
}
],
"source": [
"table = ArrowLibraryMap.from_parquet(cho_uniq_short_parquet())\n",
"df = table.table.to_pandas().drop(columns=display_drop_fields, errors='ignore').head(10)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "exclusive-bosnia",
"metadata": {},
"source": [
"### Take the top 10 lines of the dataframe and display in html"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "intelligent-europe",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
" \n",
" \n",
" id | \n",
" instrument | \n",
" instrument_type | \n",
" instrument_model | \n",
" ion_mode | \n",
" ionization | \n",
" name | \n",
" casno | \n",
" synonyms | \n",
" scan | \n",
" collision_energy | \n",
" retention_time | \n",
" collision_gas | \n",
" insource_voltage | \n",
" sample_inlet | \n",
" ev | \n",
" nce | \n",
" charge | \n",
" precursor_mz | \n",
" exact_mass | \n",
" exact_mw | \n",
" set | \n",
" composition | \n",
" peptide | \n",
" peptide_len | \n",
" peptide_type | \n",
" mod_names | \n",
" mod_positions | \n",
" protein_id | \n",
" spectrum | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" AAAACALTPGPLADLAAR/2_1(4,C,CAM) | \n",
" None | \n",
" None | \n",
" None | \n",
" NaN | \n",
" NaN | \n",
" None | \n",
" NaN | \n",
" None | \n",
" 46.00 | \n",
" 35.00 | \n",
" 2 | \n",
" 855.45 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" bestof | \n",
" AAAACALTPGPLADLAAR | \n",
" 18 | \n",
" tryptic | \n",
" [4] | \n",
" [4] | \n",
" None | \n",
"  | \n",
"
\n",
" \n",
" 1 | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" AAAACALTPGPLADLAAR/2_1(4,C,CAM) | \n",
" None | \n",
" None | \n",
" None | \n",
" NaN | \n",
" NaN | \n",
" None | \n",
" NaN | \n",
" None | \n",
" 53.00 | \n",
" 35.00 | \n",
" 2 | \n",
" 855.45 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" bestof | \n",
" AAAACALTPGPLADLAAR | \n",
" 18 | \n",
" tryptic | \n",
" [4] | \n",
" [4] | \n",
" None | \n",
"  | \n",
"
\n",
" \n",
" 2 | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" AAAAGQTGTVPPGAPGALPLPGMAIVK/2 | \n",
" None | \n",
" None | \n",
" None | \n",
" NaN | \n",
" NaN | \n",
" None | \n",
" NaN | \n",
" None | \n",
" 76.00 | \n",
" 35.00 | \n",
" 2 | \n",
" 1207.17 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" bestof | \n",
" AAAAGQTGTVPPGAPGALPLPGMAIVK | \n",
" 27 | \n",
" semitryptic | \n",
" [] | \n",
" [] | \n",
" None | \n",
"  | \n",
"
\n",
" \n",
" 3 | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" AAAAGSTSVKPIFSR/2 | \n",
" None | \n",
" None | \n",
" None | \n",
" NaN | \n",
" NaN | \n",
" None | \n",
" NaN | \n",
" None | \n",
" 44.00 | \n",
" 34.00 | \n",
" 2 | \n",
" 731.90 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" bestof | \n",
" AAAAGSTSVKPIFSR | \n",
" 15 | \n",
" semitryptic | \n",
" [] | \n",
" [] | \n",
" None | \n",
"  | \n",
"
\n",
" \n",
" 4 | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" AAAAGSTSVKPIFSR/3 | \n",
" None | \n",
" None | \n",
" None | \n",
" NaN | \n",
" NaN | \n",
" None | \n",
" NaN | \n",
" None | \n",
" 28.00 | \n",
" 34.00 | \n",
" 3 | \n",
" 488.27 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" bestof | \n",
" AAAAGSTSVKPIFSR | \n",
" 15 | \n",
" semitryptic | \n",
" [] | \n",
" [] | \n",
" None | \n",
"  | \n",
"
\n",
" \n",
" 5 | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" AAAALGSHGSCSSEVEK/2_1(10,C,CAM) | \n",
" None | \n",
" None | \n",
" None | \n",
" NaN | \n",
" NaN | \n",
" None | \n",
" NaN | \n",
" None | \n",
" 50.00 | \n",
" 34.00 | \n",
" 2 | \n",
" 830.88 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" bestof | \n",
" AAAALGSHGSCSSEVEK | \n",
" 17 | \n",
" tryptic | \n",
" [4] | \n",
" [10] | \n",
" None | \n",
"  | \n",
"
\n",
" \n",
" 6 | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" AAAALGSHGSCSSEVEK/2_1(10,C,CAM) | \n",
" None | \n",
" None | \n",
" None | \n",
" NaN | \n",
" NaN | \n",
" None | \n",
" NaN | \n",
" None | \n",
" 52.00 | \n",
" 35.00 | \n",
" 2 | \n",
" 830.88 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" bestof | \n",
" AAAALGSHGSCSSEVEK | \n",
" 17 | \n",
" tryptic | \n",
" [4] | \n",
" [10] | \n",
" None | \n",
"  | \n",
"
\n",
" \n",
" 7 | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" AAAALGSHGSCSSEVEK/3_1(10,C,CAM) | \n",
" None | \n",
" None | \n",
" None | \n",
" NaN | \n",
" NaN | \n",
" None | \n",
" NaN | \n",
" None | \n",
" 32.00 | \n",
" 35.00 | \n",
" 3 | \n",
" 554.26 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" bestof | \n",
" AAAALGSHGSCSSEVEK | \n",
" 17 | \n",
" tryptic | \n",
" [4] | \n",
" [10] | \n",
" None | \n",
"  | \n",
"
\n",
" \n",
" 8 | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" AAAALGSHGSCSSEVEKETQEK/3_1(10,C,CAM) | \n",
" None | \n",
" None | \n",
" None | \n",
" NaN | \n",
" NaN | \n",
" None | \n",
" NaN | \n",
" None | \n",
" 32.00 | \n",
" 35.00 | \n",
" 3 | \n",
" 759.35 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" bestof | \n",
" AAAALGSHGSCSSEVEKETQEK | \n",
" 22 | \n",
" tryptic | \n",
" [4] | \n",
" [10] | \n",
" None | \n",
"  | \n",
"
\n",
" \n",
" 9 | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" AAAALGSHGSCSSEVEKETQEK/3_1(10,C,CAM) | \n",
" None | \n",
" None | \n",
" None | \n",
" NaN | \n",
" NaN | \n",
" None | \n",
" NaN | \n",
" None | \n",
" 43.00 | \n",
" 34.00 | \n",
" 3 | \n",
" 759.35 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" bestof | \n",
" AAAALGSHGSCSSEVEKETQEK | \n",
" 22 | \n",
" tryptic | \n",
" [4] | \n",
" [10] | \n",
" None | \n",
"  | \n",
"
\n",
" \n",
"
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display_masskit_df(df)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "brave-height",
"metadata": {},
"source": [
"### List columns"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "ranking-pickup",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['id', 'instrument', 'instrument_type', 'instrument_model', 'ion_mode',\n",
" 'ionization', 'name', 'casno', 'synonyms', 'scan', 'collision_energy',\n",
" 'retention_time', 'collision_gas', 'insource_voltage', 'sample_inlet',\n",
" 'ev', 'nce', 'charge', 'precursor_mz', 'exact_mass', 'exact_mw', 'set',\n",
" 'composition', 'peptide', 'peptide_len', 'peptide_type', 'mod_names',\n",
" 'mod_positions', 'protein_id', 'spectrum'],\n",
" dtype='object')"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.columns"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "constant-texas",
"metadata": {},
"source": [
"### Iterating through a dataframe"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "guided-rings",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"AAAACALTPGPLADLAAR 35.0\n",
"AAAACALTPGPLADLAAR 35.0\n",
"AAAAGQTGTVPPGAPGALPLPGMAIVK 35.0\n",
"AAAAGSTSVKPIFSR 34.0\n",
"AAAAGSTSVKPIFSR 34.0\n",
"AAAALGSHGSCSSEVEK 34.0\n",
"AAAALGSHGSCSSEVEK 35.0\n",
"AAAALGSHGSCSSEVEK 35.0\n",
"AAAALGSHGSCSSEVEKETQEK 35.0\n",
"AAAALGSHGSCSSEVEKETQEK 34.0\n"
]
}
],
"source": [
"for row in df.head(10).itertuples():\n",
" print (row.peptide, row.nce)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "legislative-tourism",
"metadata": {},
"source": [
"### Query on the dataframe to return a dataframe containing rows that match the query, then display the new dataframe"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "likely-circle",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" \n",
" id | \n",
" instrument | \n",
" instrument_type | \n",
" instrument_model | \n",
" ion_mode | \n",
" ionization | \n",
" name | \n",
" casno | \n",
" synonyms | \n",
" scan | \n",
" collision_energy | \n",
" retention_time | \n",
" collision_gas | \n",
" insource_voltage | \n",
" sample_inlet | \n",
" ev | \n",
" nce | \n",
" charge | \n",
" precursor_mz | \n",
" exact_mass | \n",
" exact_mw | \n",
" set | \n",
" composition | \n",
" peptide | \n",
" peptide_len | \n",
" peptide_type | \n",
" mod_names | \n",
" mod_positions | \n",
" protein_id | \n",
" spectrum | \n",
"
\n",
" \n",
" \n",
" \n",
" 5 | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" AAAALGSHGSCSSEVEK/2_1(10,C,CAM) | \n",
" None | \n",
" None | \n",
" None | \n",
" NaN | \n",
" NaN | \n",
" None | \n",
" NaN | \n",
" None | \n",
" 50.00 | \n",
" 34.00 | \n",
" 2 | \n",
" 830.88 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" bestof | \n",
" AAAALGSHGSCSSEVEK | \n",
" 17 | \n",
" tryptic | \n",
" [4] | \n",
" [10] | \n",
" None | \n",
"  | \n",
"
\n",
" \n",
" 6 | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" AAAALGSHGSCSSEVEK/2_1(10,C,CAM) | \n",
" None | \n",
" None | \n",
" None | \n",
" NaN | \n",
" NaN | \n",
" None | \n",
" NaN | \n",
" None | \n",
" 52.00 | \n",
" 35.00 | \n",
" 2 | \n",
" 830.88 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" bestof | \n",
" AAAALGSHGSCSSEVEK | \n",
" 17 | \n",
" tryptic | \n",
" [4] | \n",
" [10] | \n",
" None | \n",
"  | \n",
"
\n",
" \n",
" 7 | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" AAAALGSHGSCSSEVEK/3_1(10,C,CAM) | \n",
" None | \n",
" None | \n",
" None | \n",
" NaN | \n",
" NaN | \n",
" None | \n",
" NaN | \n",
" None | \n",
" 32.00 | \n",
" 35.00 | \n",
" 3 | \n",
" 554.26 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" bestof | \n",
" AAAALGSHGSCSSEVEK | \n",
" 17 | \n",
" tryptic | \n",
" [4] | \n",
" [10] | \n",
" None | \n",
"  | \n",
"
\n",
" \n",
"
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df_single_peptide = df.query('peptide == \"AAAALGSHGSCSSEVEK\"')\n",
"display_masskit_df(df_single_peptide)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "assigned-welcome",
"metadata": {},
"source": [
"## Select one spectrum from the new dataframe, filter out low m/z peaks, normalize the base intensity to 1.0"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "b6baab7f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"masskit.spectrum.spectrum.Spectrum"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(df_single_peptide.iloc[0]['spectrum'])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "disturbed-rwanda",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAZAAAADICAYAAADGFbfiAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjYuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8o6BhiAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAa70lEQVR4nO3dfVAU9/0H8PepB1iEQ0XgLvKkxsQKPkGqmKrEVAg+R2vVOKi12CHGJ9CaaMcaTRszjlEnVYNOfKI2o2NRo5Ua0KKSEYvyUBUfQkYCFO9EqTx4Kpzc9/cHP6+cHAjL3R4H79fMztx997u7n/1yw5vdW3YVQggBIiKiFupk7wKIiMgxMUCIiEgSBggREUnCACEiIkkYIEREJAkDhIiIJGGAEBGRJAwQIiKShAFCRESSMECIiEgSBggREUnCACEiIkkYIEREJEkXexfgKIxGI+7evQs3NzcoFAp7l0NE1GpCCFRVVUGj0aBTp5YfTzBAmunu3bvw9fW1dxlERFZXXFyM3r17t3g5Bkgzubm5AagbaHd3dztXQ0TUepWVlfD19TX9fmspBkgzPT9t5e7uzgAhonZF6ml5folORESSMECIiEgSBggREUnCACEiIkkcMkAuXLiASZMmQaPRQKFQ4Pjx4y9d5vz58wgJCYGLiwv69OmDhIQE2xdKRNSOOWSA6PV6DB48GNu3b29W/4KCAowfPx6jRo1CTk4O1qxZg6VLlyIpKcnGlRIRtV8OeRlvVFQUoqKimt0/ISEBfn5+2LZtGwBgwIABuHLlCjZv3ozp06fbqEoiovbNIY9AWiojIwMRERFmbZGRkbhy5QoMBoOdqiIicmwOeQTSUjqdDt7e3mZt3t7eePbsGR48eAC1Wt1gmerqalRXV5veV1ZW2rxOIiJH0iGOQICG/2kphLDY/tzGjRuhUqlME++DRe2NXg8oFHWTXm/vasgRdYgA8fHxgU6nM2srLS1Fly5d0LNnT4vLrF69GhUVFaapuLhYjlKJiBxGhziFFRYWhpMnT5q1paSkIDQ0FEql0uIyzs7OcHZ2lqM8IiKH5JBHII8ePUJubi5yc3MB1F2mm5ubi6KiIgB1Rw9z58419Y+NjUVhYSHi4+Nx8+ZN7N27F3v27MHKlSvtUT4RUbvgkEcgV65cwVtvvWV6Hx8fDwCYN28e9u/fD61WawoTAAgMDERycjLi4uKwY8cOaDQafPHFF7yEl4ioFRTi+bfJ1KTKykqoVCpUVFTwdu7ULuj1QLduda8fPQJcXe1bD8mvtb/XHPIUFhER2R8DhIiIJGGAEBGRJAwQIiKShAFCRESSMECIiEgSBggREUnCACEiIkkYIEREJAkDhIiIJGGAEBGRJAwQIiKShAFCRESSMECIiEgSBggREUnCACGidkuvBxSKukmvt3c17Q8DhIiIJGGAEFG7wSMOeckaIPPnz8eFCxfk3CQREdmIrAFSVVWFiIgIvPrqq/j0009RUlIi5+aJiMiKZA2QpKQklJSUYPHixThy5AgCAgIQFRWFv/3tbzAYDHKWQkRErST7dyA9e/bEsmXLkJOTg8zMTPTr1w/R0dHQaDSIi4tDfn6+3CUREZEEdvsSXavVIiUlBSkpKejcuTPGjx+PvLw8/PSnP8XWrVvtVRYRETWTrAFiMBiQlJSEiRMnwt/fH0eOHEFcXBy0Wi0OHDiAlJQU/OUvf8GGDRvkLIuIiCToIufG1Go1jEYjZs+ejczMTAwZMqRBn8jISHh4eMhZFhERSSBrgGzduhUzZsyAi4tLo326d++OgoICGasiIiIpZD2FlZaWZvFqK71ejwULFshZChERtZKsAXLgwAE8efKkQfuTJ0+QmJgoZylERNRKspzCqqyshBACQghUVVWZncKqra1FcnIyvLy85CiFiIisRJYA8fDwgEKhgEKhQP/+/RvMVygUWL9+vRylEBGRlcgSIGlpaRBCYOzYsUhKSkKPHj1M85ycnODv7w+NRiNHKUREZCWyBMiYMWMAAAUFBfDz84NCoZBjs0TUBuj1QLduda8fPQJcXe1bD1mPzQPk6tWrCAoKQqdOnVBRUYFr16412nfQoEG2LoeIiKzE5gEyZMgQ6HQ6eHl5YciQIVAoFBBCNOinUChQW1tr63KIiMhKbB4gBQUF6NWrl+k1ERG1DzYPEH9/f4uviYjIscn+j4SnTp0yvV+1ahU8PDwwcuRIFBYWylkKERG1kqwB8umnn6Jr164AgIyMDGzfvh2bNm2Cp6cn4uLi5CyFiIhaSdabKRYXF6Nfv34AgOPHj+OXv/wlfvvb3+LNN99EeHi4nKUQEVEryXoE0q1bN5SVlQEAUlJS8Itf/AIA4OLiYvEeWURE1HbJegQybtw4xMTEYOjQofj+++8xYcIEAEBeXh4CAgLkLIWIiFpJ1iOQHTt2ICwsDPfv30dSUhJ69uwJAMjKysLs2bPlLIWIiFpJISz9Vx81UFlZCZVKhYqKCri7u9u7HKJWk+sWI3LeyuTFbQG8jUpTWvt7TdZTWABQXl6OzMxMlJaWwmg0mtoVCgWio6PlLoeIiCSSNUBOnjyJOXPmQK/Xw83NzeymigwQIiLHIut3ICtWrMCCBQtQVVWF8vJyPHz40DT997//lbMUIiJqJVkDpKSkBEuXLsVPfvITOTdLREQ2IGuAREZG4sqVK1ZZ186dOxEYGAgXFxeEhIQgPT290b7nzp0zPRGx/nTr1i2r1EJE1BHJ+h3IhAkT8Lvf/Q43btxAcHAwlEql2fzJkyc3az2HDx/G8uXLsXPnTrz55pvYtWsXoqKicOPGDfj5+TW63O3bt82uNHh+l2Aiah/0evPXvOrKtmS9jLdTp8YPeFryPJDhw4dj2LBh+PLLL01tAwYMwNSpU7Fx48YG/c+dO4e33noLDx8+hIeHR4vrBngZL7U/7fEy3tJSwNu77vW9e3Xb4mW8jWvt7zVZT2EZjcZGp+aGR01NDbKyshAREWHWHhERgYsXLza57NChQ6FWq/H2228jLS2tyb7V1dWorKw0m4iI6H9kDZD6nj59Kmm5Bw8eoLa2Ft7P/8z4f97e3tDpdBaXUavV2L17N5KSknD06FG89tprePvtt3HhwoVGt7Nx40aoVCrT5OvrK6leIqL2StYAqa2txSeffIJXXnkF3bp1w507dwAAa9euxZ49e1q0rvr/QwIAQogGbc+99tprWLhwIYYNG4awsDDs3LkTEyZMwObNmxtd/+rVq1FRUWGaiouLW1QfERFQdwpPoaib6n9H0x7IGiB/+tOfsH//fmzatAlOTk6m9uDgYHz11VfNWoenpyc6d+7c4GijtLS0wVFJU0aMGIH8/PxG5zs7O8Pd3d1sIiJqL/R6QKVq3TpkDZDExETs3r0bc+bMQefOnU3tgwYNavYltU5OTggJCUFqaqpZe2pqKkaOHNnsWnJycqBWq5vdn4iIzMl6GW9JSYnpgVL1GY1GGAyGZq8nPj4e0dHRCA0NRVhYGHbv3o2ioiLExsYCqDv9VFJSgsTERADAtm3bEBAQgIEDB6KmpgYHDx5EUlISkpKSrLNjREQdkKwBMnDgQKSnp8Pf39+s/ciRIxg6dGiz1zNz5kyUlZVhw4YN0Gq1CAoKQnJysmm9Wq0WRUVFpv41NTVYuXIlSkpK0LVrVwwcOBCnTp3C+PHjrbNjREQdkKwBsm7dOkRHR6OkpARGoxFHjx7F7du3kZiYiL///e8tWteiRYuwaNEii/P2799v9n7VqlVYtWqV1LKJiMgCWb8DmTRpEg4fPozk5GQoFAr84Q9/wM2bN3Hy5EmMGzdOzlKIWqw9X01DbYOjfcZkfx5IZGQkIiMj5d4sERFZmaxHIH369EFZWVmD9vLycvTp00fOUoionm7dHOMvXntwtKMCOckaID/++KPFW5ZUV1ejpKREzlKIqAN48eaKHYkcwSfLKawTJ06YXn/77bdQ1fvvldraWpw9exYBAQFylEJERFYiS4BMnToVQN3tR+bNm2c2T6lUIiAgAJ9//rkcpRARkZXIEiBGoxEAEBgYiMuXL8PT01OOzRJZTf1bkhNRHVmvwiooKJBzc0REZEOyX8Z79uxZnD17FqWlpaYjk+f27t0rdzlEZGN8SmD7JWuArF+/Hhs2bEBoaCjUanWjt18nIqK2T9YASUhIwP79+xEdHS3nZomIOhS5vrOT9f9AampqWnTLdSIiartkDZCYmBh8/fXXcm6SiIhsRNZTWE+fPsXu3btx5swZDBo0CEql0mz+li1b5CyHiEhW9U8tPXrk+BcUyBogV69exZAhQwAA169fl3PTRERkZbIGSFpampybIyIiG5IlQKZNm/bSPgqFgo+YJSJyILIESP2bJxJJZa/zx7a+JLK9nRd/mY62v+2ZLAGyb98+OTZDJDv+MrQ9jnHbHQNZL+MlIqL2gwFCRO2St3fHe4iU3BggREQkCQOEiIgkYYAQUZOkPFtbjudxk/0xQIiImvDi80yau4xcAWrPsGaAENmZlF9QLy7Pv/bJHhgg1K7xlyt1JN26yftZZ4AQEZEkDBAiIpKEAUJERJIwQIiISBIGCBERScIAISIiSRggREQkCQOEiIgkYYAQEZEkDBAiIpKEAUJERJIwQIiISBIGCBERScIAISIiSRggREQkCQOEiIgkYYAQEZEkDBAiIpKEAUJERJI4bIDs3LkTgYGBcHFxQUhICNLT05vsf/78eYSEhMDFxQV9+vRBQkKCTJUSEbVPDhkghw8fxvLly/H73/8eOTk5GDVqFKKiolBUVGSxf0FBAcaPH49Ro0YhJycHa9aswdKlS5GUlCRz5URE7YdDBsiWLVvwm9/8BjExMRgwYAC2bdsGX19ffPnllxb7JyQkwM/PD9u2bcOAAQMQExODBQsWYPPmzTJXTkTUfjhcgNTU1CArKwsRERFm7REREbh48aLFZTIyMhr0j4yMxJUrV2AwGGxWKxFRe9bF3gW01IMHD1BbWwtvb2+zdm9vb+h0OovL6HQ6i/2fPXuGBw8eQK1WN1imuroa1dXVpvcVFRUAgMrKytbuAkmk1//vdWUlUFtrm2WaWseL67LG+quqzF+7uEivryU1vLhfjS3b2nF/cf/qL9+c9bVk+/W3ZWnbLR1bqetoav+fs7Qvje1rU2Ng6fP54jg0vq2632dCCIv78TIOFyDPKRQKs/dCiAZtL+tvqf25jRs3Yv369Q3afX19W1oq2YBGI88yLVmXNdbfr1/rlm9NDc1ZVsr6Bw/+3+sX96+l62tp/6a2LYWUddRfpv7rl+1LY/ObMwYtHeeqqiqoVKqXr/gFDhcgnp6e6Ny5c4OjjdLS0gZHGc/5+PhY7N+lSxf07NnT4jKrV69GfHy86X15eTn8/f1RVFQkaaDtqbKyEr6+viguLoa7u7u9y2k21i0/R62ddUsjhEBVVRU0Ev/ycLgAcXJyQkhICFJTU/Huu++a2lNTUzFlyhSLy4SFheHkyZNmbSkpKQgNDYVSqbS4jLOzM5ydnRu0q1Qqh/qA1ufu7u6QtbNu+Tlq7ay75VrzB7HDfYkOAPHx8fjqq6+wd+9e3Lx5E3FxcSgqKkJsbCyAuqOHuXPnmvrHxsaisLAQ8fHxuHnzJvbu3Ys9e/Zg5cqV9toFIiKH53BHIAAwc+ZMlJWVYcOGDdBqtQgKCkJycjL8/f0BAFqt1ux/QgIDA5GcnIy4uDjs2LEDGo0GX3zxBaZPn26vXSAicngOGSAAsGjRIixatMjivP379zdoGzNmDLKzsyVvz9nZGevWrbN4Wqutc9TaWbf8HLV21m0fCiH1+i0iIurQHPI7ECIisj8GCBERScIAISIiSRggzdTS28fb0saNG/HGG2/Azc0NXl5emDp1Km7fvm3WZ/78+VAoFGbTiBEjzPpUV1djyZIl8PT0hKurKyZPnoz//Oc/Nq39448/blCXj4+Pab4QAh9//DE0Gg26du2K8PBw5OXl2b3ugICABnUrFAp88MEHANrWeF+4cAGTJk2CRqOBQqHA8ePHzeZba4wfPnyI6OhoqFQqqFQqREdHo7y83CZ1GwwGfPjhhwgODoarqys0Gg3mzp2Lu3fvmq0jPDy8wc9h1qxZdqsbsN5nw9p1WwMDpBlaevt4Wzt//jw++OADXLp0CampqXj27BkiIiKgf+GmOO+88w60Wq1pSk5ONpu/fPlyHDt2DIcOHcJ3332HR48eYeLEiaiVckOnFhg4cKBZXdeuXTPN27RpE7Zs2YLt27fj8uXL8PHxwbhx41BV7+Y+9qj78uXLZjWnpqYCAGbMmGHq01bGW6/XY/Dgwdi+fbvF+dYa4/feew+5ubk4ffo0Tp8+jdzcXERHR9uk7sePHyM7Oxtr165FdnY2jh49iu+//x6TJ09u0HfhwoVmP4ddu3aZzZez7ues8dmwdt1WIeilfvazn4nY2Fizttdff1189NFHdqrIXGlpqQAgzp8/b2qbN2+emDJlSqPLlJeXC6VSKQ4dOmRqKykpEZ06dRKnT5+2Wa3r1q0TgwcPtjjPaDQKHx8f8dlnn5nanj59KlQqlUhISLBr3S9atmyZ6Nu3rzAajUKItjveAMSxY8dM7601xjdu3BAAxKVLl0x9MjIyBABx69Ytq9dtSWZmpgAgCgsLTW1jxowRy5Yta3QZe9Rtjc+GreuWikcgLyHl9vFye36n4B49epi1nzt3Dl5eXujfvz8WLlyI0tJS07ysrCwYDAaz/dJoNAgKCrL5fuXn50Oj0SAwMBCzZs3CnTt3ANQ9+Eun05nV5OzsjDFjxphqsmfdz9XU1ODgwYNYsGCB2c042+p412etMc7IyIBKpcLw4cNNfUaMGAGVSiXb/lRUVEChUMDDw8Os/a9//Ss8PT0xcOBArFy50uzIyl51t/az0RbG2xKH/UdCuUi5fbychBCIj4/Hz3/+cwQFBZnao6KiMGPGDPj7+6OgoABr167F2LFjkZWVBWdnZ+h0Ojg5OaF79+5m67P1fg0fPhyJiYno378/7t27hz/+8Y8YOXIk8vLyTNu1NNaFhYUAYLe66zt+/DjKy8sxf/58U1tbHe8XWWuMdTodvLy8Gqzfy8tLlv15+vQpPvroI7z33ntm95CaM2cOAgMD4ePjg+vXr2P16tX497//bTrlaI+6rfHZsPd4N4YB0kwtvX28XBYvXoyrV6/iu+++M2ufOXOm6XVQUBBCQ0Ph7++PU6dOYdq0aY2uz9b7FRUVZXodHByMsLAw9O3bFwcOHDB9sShlrOX8eezZswdRUVFmdzBtq+PdGGuMsaX+cuyPwWDArFmzYDQasXPnTrN5CxcuNL0OCgrCq6++itDQUGRnZ2PYsGF2qdtanw17jXdTeArrJaTcPl4uS5YswYkTJ5CWlobevXs32VetVsPf3x/5+fkA6m5xX1NTg4cPH5r1k3u/XF1dERwcjPz8fNPVWE2Ntb3rLiwsxJkzZxATE9Nkv7Y63tYaYx8fH9y7d6/B+u/fv2/T/TEYDPjVr36FgoICpKamvvQOtsOGDYNSqTT7Odij7vqkfDbaQt2WMEBeov7t4+tLTU3FyJEj7VKTEAKLFy/G0aNH8c9//hOBgYEvXaasrAzFxcWmpy+GhIRAqVSa7ZdWq8X169dl3a/q6mrcvHkTarXadOqhfk01NTU4f/68qSZ7171v3z54eXlhwoQJTfZrq+NtrTEOCwtDRUUFMjMzTX3+9a9/oaKiwmb78zw88vPzcebMmUaf5VNfXl4eDAaD6edgj7pfJOWz0RbqtsguX907mEOHDgmlUin27Nkjbty4IZYvXy5cXV3Fjz/+aJd63n//faFSqcS5c+eEVqs1TY8fPxZCCFFVVSVWrFghLl68KAoKCkRaWpoICwsTr7zyiqisrDStJzY2VvTu3VucOXNGZGdni7Fjx4rBgweLZ8+e2az2FStWiHPnzok7d+6IS5cuiYkTJwo3NzfTWH722WdCpVKJo0ePimvXronZs2cLtVpt97qFEKK2tlb4+fmJDz/80Ky9rY13VVWVyMnJETk5OQKA2LJli8jJyTFdrWStMX7nnXfEoEGDREZGhsjIyBDBwcFi4sSJNqnbYDCIyZMni969e4vc3Fyzz311dbUQQogffvhBrF+/Xly+fFkUFBSIU6dOiddff10MHTrUbnVb87Nh7bqtgQHSTDt27BD+/v7CyclJDBs2zOySWbkBsDjt27dPCCHE48ePRUREhOjVq5dQKpXCz89PzJs3TxQVFZmt58mTJ2Lx4sWiR48eomvXrmLixIkN+ljbzJkzhVqtFkqlUmg0GjFt2jSRl5dnmm80GsW6deuEj4+PcHZ2FqNHjxbXrl2ze91CCPHtt98KAOL27dtm7W1tvNPS0ix+PubNmyeEsN4Yl5WViTlz5gg3Nzfh5uYm5syZIx4+fGiTugsKChr93KelpQkhhCgqKhKjR48WPXr0EE5OTqJv375i6dKloqyszG51W/OzYe26rYF34yUiIkn4HQgREUnCACEiIkkYIEREJAkDhIiIJGGAEBGRJAwQIiKShAFCRESSMECIiEgSBggREUnCACEiIkkYIERtXGFhIZydnVFZWWnvUojMMECI2rhvvvkG4eHhL332BZHcGCBEMgkPD8eSJUuwfPlydO/eHd7e3ti9ezf0ej1+/etfw83NDX379sU//vEPs+W++eYbTJ48GUDdU+lenAICAuywN0QMECJZHThwAJ6ensjMzMSSJUvw/vvvY8aMGRg5ciSys7MRGRmJ6OhoPH78GABQXl6O9PR0U4BotVrT9MMPP6Bfv34YPXq0PXeJOjDezp1IJuHh4aitrUV6ejoAoLa2FiqVCtOmTUNiYiKAukfNqtVqZGRkYMSIEfj666/x+eefIysry2xdQghMnz4dRUVFSE9PR9euXWXfH6Iu9i6AqCMZNGiQ6XXnzp3Rs2dPBAcHm9qeP9+6tLQUgPnpq/rWrFmDjIwMXL58meFBdsNTWEQyUiqVZu8VCoVZm0KhAAAYjUYYDAacPn0aU6ZMMVvm4MGD2Lp1K44dO4bevXvbvmiiRjBAiNqotLQ0eHh4YMiQIaa2jIwMxMTEYNeuXRgxYoT9iiMCT2ERtVknTpwwO32l0+nw7rvvYtasWYiMjIROpwNQdyqsV69e9iqTOjAegRC1USdOnDA7fXXr1i3cu3cPBw4cgFqtNk1vvPGGHaukjoxXYRG1QdnZ2Rg7dizu37/f4HsToraCRyBEbdCzZ8/w5z//meFBbRqPQIiISBIegRARkSQMECIikoQBQkREkjBAiIhIEgYIERFJwgAhIiJJGCBERCQJA4SIiCRhgBARkSQMECIikoQBQkREkjBAiIhIEgYIERFJwgAhIiJJ/g+MbG/543eNDQAAAABJRU5ErkJggg==",
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"new_spectrum = df_single_peptide.iloc[0]['spectrum']\n",
"new_spectrum = new_spectrum.filter(min_mz=500).norm(1.0)\n",
"new_spectrum"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "obvious-welsh",
"metadata": {},
"source": [
"### Input/output\n",
"The spectral library in the form of a dataframe has the ability to read and write a variety of formats."
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "decimal-regular",
"metadata": {},
"source": [
"#### MSP"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "cathedral-candidate",
"metadata": {},
"outputs": [],
"source": [
"# write out msp file with peptide annotations\n",
"df_single_peptide['spectrum'].array.to_msp(\"peptides.msp\", annotate_peptide=True)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "prescription-baghdad",
"metadata": {},
"source": [
"##### save a single spectrum at row 0 as an msp file"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "worthy-rochester",
"metadata": {},
"outputs": [],
"source": [
"df_single_peptide.iloc[[0]]['spectrum'].array.to_msp(\"single_spectrum.msp\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "scientific-today",
"metadata": {},
"source": [
"#### MGF"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "vocal-failing",
"metadata": {},
"outputs": [],
"source": [
"df_single_peptide['spectrum'].array.to_mgf(\"peptides.msp\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "associate-warren",
"metadata": {},
"source": [
"#### PKL (python pickle)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "lucky-recovery",
"metadata": {},
"outputs": [],
"source": [
"df_single_peptide.to_pickle('single_peptide.pkl')"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "smoking-importance",
"metadata": {},
"source": [
"#### CSV"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "acceptable-insertion",
"metadata": {},
"outputs": [],
"source": [
"df_single_peptide.to_csv('single_peptide.csv')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.10 ('base')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.15"
},
"vscode": {
"interpreter": {
"hash": "11d150ef1a59d6ee6bd3538ad9ed751649d8a614c736b8deec7e36a34a38bbb5"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}