Source code for pgfinder.pgio

"""PG Finder I/O operations"""

import logging
import sqlite3
from datetime import datetime
from importlib.metadata import version
from pathlib import Path, PurePath

import numpy as np
import pandas as pd
import yaml
from yaml.error import YAMLError

try:
    from yaml import CLoader as Loader
except ImportError:
    from yaml import Loader

from pgfinder import COLUMNS
from pgfinder.errors import UserError
from pgfinder.logs.logs import LOGGER_NAME

# Module-level logger, looked up under the name imported as LOGGER_NAME from pgfinder.logs.logs.
LOGGER = logging.getLogger(LOGGER_NAME)


def ms_file_reader(file: str | Path) -> pd.DataFrame:
    """
    Load a mass spec data file, picking the reader from the file suffix.

    Parameters
    ----------
    file : str | Path
        Path to be loaded. A '.ftrs' suffix is passed to ``ftrs_reader`` and a '.txt' suffix to
        ``maxquant_file_reader``.

    Returns
    -------
    pd.DataFrame
        File loaded as a Pandas DataFrame, with the file name (no directory) stored in ``attrs["file"]``.

    Raises
    ------
    UserError
        If the suffix is neither '.ftrs' nor '.txt'.
    """
    path = PurePath(file)
    suffix = path.suffix

    # Reject unsupported extensions up front so the happy path below stays flat.
    if suffix not in (".ftrs", ".txt"):
        raise UserError(
            (
                "The supplied data file was neither a .ftrs nor a .txt file. Please ensure that "
                "you've selected a valid Byos (.ftrs) or MaxQuant (.txt) file."
            )
        )

    reader = ftrs_reader if suffix == ".ftrs" else maxquant_file_reader
    loaded = reader(file)

    loaded.attrs["file"] = path.name
    LOGGER.info(f"Mass spectroscopy file loaded from : {path.name}")
    return loaded
def ftrs_reader(file: str | Path, columns: dict = COLUMNS) -> pd.DataFrame:
    """
    Reads Features file from Byos.

    Parameters
    ----------
    file : str | Path
        Feature file to be read. It is opened as an SQLite database and its 'Features' table is loaded.
    columns : dict
        Dictionary of columns, this defaults to the global COLUMNS which is read from 'config/columns.yaml' and
        simplifies extension to new formats.

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame of features, restricted to and ordered by the PGFinder columns.

    Raises
    ------
    UserError
        If the table's columns match neither the 'ftrs_52' nor the 'ftrs_311' column set.
    """
    # The sqlite3 connection context manager only commits/rolls back the transaction, it does NOT close the
    # connection, so close it explicitly as soon as the table has been read.
    db = sqlite3.connect(file)
    try:
        ff = pd.read_sql("SELECT * FROM Features", db)
    finally:
        db.close()

    # Append the (empty) inferred columns to whatever the file provided
    ff = ff.reindex(columns=[*ff.columns.tolist(), *columns["pgfinder"]["inferred"].values()], fill_value=np.nan)

    # Determine if file is v311 or v52; v52 takes precedence if both sets of columns are present
    if set(columns["ftrs_52"]).issubset(ff.columns):
        source_columns = columns["ftrs_52"]
    elif set(columns["ftrs_311"]).issubset(ff.columns):
        source_columns = columns["ftrs_311"]
    else:
        raise UserError(
            "The supplied FTRS file could not be read. Did it come from an unsupported version of Byos?"
        )
    ff = ff.rename(columns=dict(zip(source_columns, columns["pgfinder"]["input"].values())))

    # Reorder columns in dataframe to desired order, dropping unwanted columns. The same column dictionary is
    # passed on so that a caller-supplied `columns` is honoured throughout.
    return _select_and_order_columns(ff, columns)
def _select_and_order_columns(df: pd.DataFrame, columns: dict = COLUMNS) -> pd.DataFrame:
    """
    Return a copy of the dataframe holding only the PGFinder columns, in the required order.

    Parameters
    ----------
    df : pd.DataFrame
        Full dataframe (columns already renamed) from which the subset is taken.
    columns : dict
        Dictionary of columns, this defaults to the global COLUMNS which is read from 'config/columns.yaml' and
        simplifies extension to new formats.

    Returns
    -------
    pd.DataFrame
        Copy of the input columns followed by the inferred columns, with 'Intensity' moved to the end.
    """
    pgfinder_columns = columns["pgfinder"]
    ordered = [*pgfinder_columns["input"].values(), *pgfinder_columns["inferred"].values()]

    # 'Intensity' has to be the final column: lift it out of its current slot and put it back at the end
    intensity_at = ordered.index("Intensity")
    ordered = ordered[:intensity_at] + ordered[intensity_at + 1 :] + [ordered[intensity_at]]

    subset = df[ordered]
    return subset.copy()
def theo_masses_reader(file: str | Path) -> pd.DataFrame:
    """
    Reads theoretical masses files (csv) returning a Pandas DataFrame.

    Parameters
    ----------
    file : str | Path
        Path to file to be loaded. It must be CSV with 'Structure' and 'Monoisotopic Mass' columns.

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame of theoretical masses with columns 'Inferred structure' and 'Theo (Da)'. The file name
        (no directory) is stored in ``attrs["file"]``.

    Raises
    ------
    UserError
        If the file is not valid CSV, is empty, lacks the required columns, or contains a structure that is
        missing or does not end in '|n' (n being one or more digits).
    """
    cols = ["Structure", "Monoisotopic Mass"]
    # NB: ParserError, UnicodeDecodeError and EmptyDataError are all ValueError subclasses, so the generic
    # ValueError handler (raised by `usecols` when a column is absent) has to come last.
    try:
        theo_masses_df = pd.read_csv(file, usecols=cols)[cols]
    except (pd.errors.ParserError, UnicodeDecodeError) as e:
        raise UserError(
            (
                "The supplied mass database doesn't contain valid CSV. Double-check that you've "
                "selected the correct file and that it's plain CSV."
            )
        ) from e
    except pd.errors.EmptyDataError as e:
        raise UserError(
            (
                "The supplied mass database was empty. Double-check that you've "
                "selected the correct file and that it contains CSV data."
            )
        ) from e
    except ValueError as e:
        raise UserError(
            (
                "The supplied mass database didn't have the correct columns. "
                "Have you checked the format of your database against one of the built-in databases?"
            )
        ) from e
    theo_masses_df.columns = ["Inferred structure", "Theo (Da)"]

    # Check that all structures are followed by "|n" where n is one or more digits. Values are compared as
    # strings and missing values count as failures: otherwise a blank structure yields NaN (which `.all()`
    # skips, so it would pass silently) and a purely numeric column has no `.str` accessor at all.
    has_suffix = theo_masses_df["Inferred structure"].astype(str).str.contains(r"\|\d+$", na=False)
    if not has_suffix.all():
        raise UserError(
            (
                "The supplied mass database contains structures missing the '|n' suffix encoding oligomerisation "
                "state. This should be '|1' for monomers, '|2' for dimers, '|3' for trimers, and so on."
            )
        )

    theo_masses_df.attrs["file"] = PurePath(file).name
    LOGGER.info(f"Theoretical masses loaded from : {file}")
    return theo_masses_df
def maxquant_file_reader(file: str | Path, columns: dict = COLUMNS) -> pd.DataFrame:
    """
    Reads maxquant files and outputs data as a dataframe.

    Parameters
    ----------
    file : str | Path
        Path to a text file.
    columns : dict
        Dictionary of columns, this defaults to the global COLUMNS which is read from 'config/columns.yaml' and
        simplifies extension to new formats.

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame restricted to and ordered by the PGFinder columns.

    Raises
    ------
    UserError
        If the file contains no data, or the expected columns are not present after renaming.
    """
    # reads file into dataframe
    try:
        maxquant_df = pd.read_table(file, low_memory=False)
    except pd.errors.EmptyDataError as e:
        raise UserError(
            (
                "No data was found in the supplied .txt file. Have you checked "
                "you're using the allPeptides.txt file from MaxQuant?"
            )
        ) from e

    # adds empty columns for inferred structure and theoretical mass
    maxquant_df = maxquant_df.reindex(
        columns=[*maxquant_df.columns.tolist(), *columns["pgfinder"]["inferred"].values()], fill_value=np.nan
    )
    # insert dataframe index as a column
    maxquant_df.reset_index(level=0, inplace=True)
    # Renames columns to expected column heading required for data_analysis function
    maxquant_df.rename(
        columns=dict(zip(columns["maxquant"], ["ID", "RT (min)", "Obs (Da)"])),
        inplace=True,
    )

    # Reorder columns in dataframe to desired order. A missing column surfaces as a KeyError from the
    # selection, which means the file is not in the expected MaxQuant layout. The same column dictionary is
    # passed on so that a caller-supplied `columns` is honoured throughout.
    try:
        return _select_and_order_columns(maxquant_df, columns)
    except KeyError as e:
        raise UserError(
            (
                "The supplied MaxQuant file could not be read. Have you checked "
                "you're using the allPeptides.txt file from a supported version of MaxQuant?"
            )
        ) from e
[docs] def dataframe_to_csv_metadata( output_dataframe: pd.DataFrame, save_filepath: str | Path = None, filename: str | Path = None, float_format: str = "%.4f", ) -> str | None: """ Convert dataframe to CSV with metadata. If save_filepath is specified return the relative path of the output file, including the filename, otherwise return the .csv in the form of a string. Parameters ---------- output_dataframe : pd.DataFrame Dataframe to output. save_filepath : str | Path Path to save to. filename : str | Path Filename to save to. float_format : str Format for floating point numbers (default 4 decimal places). Returns ------- str | None Either returns the path to write data to or writes it to CSV. """ release = version("pgfinder") _version = ".".join(release.split("."[:2])) metadata = [ f"file : {str(output_dataframe.attrs['file'])}", f"masses_file : {str(output_dataframe.attrs['masses_file'])}", f"rt_window : {output_dataframe.attrs['rt_window']}", f"modifications : {output_dataframe.attrs['modifications']}", f"ppm : {output_dataframe.attrs['ppm']}", f"consolidation_ppm : {output_dataframe.attrs['consolidation_ppm']}", f"version : {_version}", ] # Add Metadata as first column output_dataframe = pd.concat([pd.DataFrame({"Metadata": metadata}), output_dataframe], axis=1) # Save the file to disk if save_filepath: filename = filename if filename is not None else default_filename() save_filepath = Path(save_filepath) save_filepath.mkdir(parents=True, exist_ok=True) output_dataframe.to_csv(save_filepath / filename, index=False, float_format=float_format) output = str(save_filepath / filename) # Store in memory as a string for returning to Notebook else: output = output_dataframe.to_csv(index=False, float_format=float_format) return output
def default_filename(prefix: str = "results_") -> str:
    """
    Build a timestamped CSV filename from the current local date/time.

    Parameters
    ----------
    prefix : str
        String placed in front of the timestamp, default is 'results_'.

    Returns
    -------
    str
        Filename of the form '<prefix>YYYY-MM-DD_hh-mm-ss.csv'.
    """
    timestamp = f"{datetime.now():%Y-%m-%d_%H-%M-%S}"
    return "".join((prefix, timestamp, ".csv"))
def read_yaml(filename: str | Path) -> dict:
    """
    Read a YAML file.

    Parameters
    ----------
    filename : str | Path
        YAML file to read.

    Returns
    -------
    dict
        Dictionary of the file. An empty dictionary is returned if the file holds no YAML document or if it
        cannot be parsed (in which case the parser error is logged).
    """
    with Path(filename).open() as f:
        try:
            # NOTE(review): `Loader` is the full (C)Loader, which can construct arbitrary Python objects. That is
            # only safe for trusted files - confirm whether the safe loader would be sufficient here.
            loaded = yaml.load(f, Loader=Loader)
        except YAMLError as exception:
            # Best effort: report the problem and carry on with an empty configuration
            LOGGER.error(exception)
            return {}
    # An empty (or comment-only) file is parsed as None; return an empty dict so the declared return type holds
    return {} if loaded is None else loaded