Skip to content

IO Modules

Module for reading and writing files.

create_config(args=None)

Write the default configuration file to disk.

Parameters:

Name Type Description Default
args Namespace | None

Optional arguments to parse.

None
Source code in isoslam/io.py
def create_config(args: argparse.Namespace | None = None) -> None:
    """
    Write the default configuration file to disk.

    Copies the packaged ``default_config.yaml`` to the requested location, prefixed with a
    timestamped header and a pointer to the configuration documentation.

    Parameters
    ----------
    args : argparse.Namespace | None
        Optional arguments to parse. ``args.filename`` and ``args.output_dir`` are read; the
        defaults ``config`` and ``./`` are used when these are ``None``.
    """
    filename = "config" if args.filename is None else args.filename  # type: ignore [union-attr]
    output_dir = Path("./") if args.output_dir is None else Path(args.output_dir)  # type: ignore [union-attr]
    output_dir.mkdir(parents=True, exist_ok=True)
    config_path = resources.files(__package__) / "default_config.yaml"
    config = config_path.read_text()

    # Ensure the output file always carries a YAML extension; append one when the
    # user-supplied filename has neither ``.yaml`` nor ``.yml`` in it.
    if ".yaml" not in str(filename) and ".yml" not in str(filename):
        create_config_path = output_dir / f"{filename}.yaml"
    else:
        create_config_path = output_dir / filename

    with create_config_path.open("w", encoding="utf-8") as f:
        f.write(f"# Config file generated {_get_date_time()}\n")
        f.write(f"{CONFIG_DOCUMENTATION_REFERENCE}")
        f.write(config)
    logger.info(f"A sample configuration file has been written to : {str(create_config_path)}")
    logger.info(CONFIG_DOCUMENTATION_REFERENCE)

data_frame_to_file(data, output_dir='./output/', outfile='summary_counts.tsv', sep='\t', **kwargs)

Write a Pandas DataFrame to disk.

Parameters:

Name Type Description Default
data DataFrame | DataFrame

Pandas DataFrame to write to disk.

required
output_dir str | Path

Location to write the output to, default is ''./output/''.

'./output/'
outfile str

Filename to write data to.

'summary_counts.tsv'
sep str

Separator to use in output file.

'\t'
**kwargs dict[Any, Any]

Dictionary of keyword arguments to pass to ''pandas.DataFrame.to_csv()''.

{}
Source code in isoslam/io.py
def data_frame_to_file(
    data: pd.DataFrame | pl.DataFrame,
    output_dir: str | Path = "./output/",
    outfile: str = "summary_counts.tsv",
    sep: str = "\t",
    **kwargs: dict[Any, Any],
) -> None:
    """
    Write a Pandas or Polars DataFrame to disk.

    The output format is inferred from ``outfile`` : names ending in ``parquet`` are written
    as Parquet, names ending in ``.csv``/``.tsv`` (any ``.?sv``) as delimited text. Any other
    suffix results in a warning and no file being written.

    Parameters
    ----------
    data : pd.DataFrame | pl.DataFrame
        Pandas or Polars DataFrame to write to disk.
    output_dir : str | Path
        Location to write the output to, default is ''./output/''. Created if absent.
    outfile : str
        Filename to write data to.
    sep : str
        Separator to use in output file.
    **kwargs
        Dictionary of keyword arguments to pass to ''pandas.DataFrame.to_csv()'' or the
        Polars/parquet equivalents.

    Raises
    ------
    TypeError
        If ``data`` is neither a Pandas nor a Polars DataFrame.
    """
    outdir_file = Path(output_dir) / f"{outfile}"
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    if isinstance(data, pl.DataFrame):
        if re.search(r"parquet$", str(outfile)):
            data.write_parquet(outdir_file, **kwargs)
        elif re.search(r"\..sv$", str(outfile)):
            data.write_csv(outdir_file, separator=sep, **kwargs)
        else:
            # Nothing was written, so do not log a claim that it was.
            logger.warning(f"Unsupported output file extension, nothing written : {outdir_file}")
            return
        logger.debug(f"File written to : {outdir_file}")
    elif isinstance(data, pd.DataFrame):
        # Anchor the parquet pattern (``parquet$``) to match the Polars branch above.
        if re.search(r"parquet$", str(outfile)):
            data.to_parquet(outdir_file, **kwargs)
        elif re.search(r"\..sv$", str(outfile)):
            data.to_csv(outdir_file, sep=sep, **kwargs)
        else:
            logger.warning(f"Unsupported output file extension, nothing written : {outdir_file}")
            return
        logger.debug(f"File written to : {outdir_file}")
    else:
        raise TypeError(f"Can not write output Pandas or Polar Dataframe object not supplied = {type(data)=}")

load_and_update_config(args)

Load a configuration file to dictionary and update entries with user supplied arguments.

If ''args'' does not contain any value for ''args.config_file'' the default configuration (''isoslam/default_config.yaml'') is loaded, otherwise the user specified configuration is loaded.

Once the configuration is loaded any user specified options update the dictionary.

Parameters:

Name Type Description Default
args Namespace

Arguments supplied by user.

required

Returns:

Type Description
dict[str:Any]

Dictionary of configuration options updated with user specified options.

Source code in isoslam/io.py
def load_and_update_config(args: argparse.Namespace | None) -> dict[str, Any]:
    """
    Load a configuration file to dictionary and update entries with user supplied arguments.

    If ``args`` does not contain any value for ``args.config_file`` the default configuration
    (``isoslam/default_config.yaml``) is loaded, otherwise the user specified configuration is loaded.

    Once the configuration is loaded any user specified options update the dictionary.

    Parameters
    ----------
    args : argparse.Namespace
        Arguments supplied by user.

    Returns
    -------
    dict[str, Any]
        Dictionary of configuration options updated with user specified options.
    """
    arg_dict = vars(args)
    config_file = arg_dict["config_file"]
    # Fall back to the packaged default configuration when no file was supplied.
    config = read_yaml() if config_file is None else read_yaml(config_file)
    config["schema"] = _type_schema(config["schema"])  # type: ignore[index]
    return utils.update_config(config, arg_dict)  # type: ignore[arg-type]

load_file(file_path)

Load files of different types.

Supports the following file types...

  • .bam - The sequence data that is to be analysed.
  • .bed - The locations of introns/splice junctions.
  • .gtf - Transcript structures from which the .bed file is derived.
  • .vcf - Locations of known sequences difference from the reference sequence.

Parameters:

Name Type Description Default
file_path str | Path

Path to file to load.

required

Returns:

Type Description
Any

Returns the loaded file as an object.

Source code in isoslam/io.py
def load_file(file_path: str | Path) -> Any:
    """
    Load files of different types.

    Supports the following file types...

    * ``.bam`` - The sequence data that is to be analysed.
    * ``.bed`` - The locations of introns/splice junctions.
    * ``.gtf`` - Transcript structures from which the ``.bed`` file is derived.
    * ``.vcf`` - Locations of known sequences difference from the reference sequence.

    Parameters
    ----------
    file_path : str | Path
        Path to file to load.

    Returns
    -------
    Any
        Returns the loaded file as an object.
    """
    path = Path(file_path)
    # Gzipped files need the full suffix chain (e.g. ``.vcf.gz``) to select the right loader.
    suffix = "".join(path.suffixes) if path.suffix == ".gz" else path.suffix
    loader = _get_loader(suffix)
    return loader(file_path)

load_output_files(file_ext='.tsv', directory=None)

Read a set of files into a list of Polars DataFrames.

Supports reading ''.parquet'', ''.tsv'' and ''.csv''.

Parameters:

Name Type Description Default
file_ext str

File name pattern to search for.

'.tsv'
directory str | Path | None

Directory to search for files.

None

Returns:

Type Description
dict[str, DataFrame]

A dictionary of Polars DataFrames keyed by file stem, one entry per file found.

Source code in isoslam/io.py
def load_output_files(file_ext: str = ".tsv", directory: str | Path | None = None) -> dict[str, pl.DataFrame]:
    """
    Read a set of files into a list of Polars DataFrames.

    Supports reading ''.parquet'', ''.tsv'' and ``.csv``.

    Parameters
    ----------
    file_ext : str
        File name pattern to search for.
    directory : str | Path | None
        Directory to search for files.

    Returns
    -------
    list[pl.DataFrame]
        A list of Polars DataFrames of each file found.
    """
    # This function could be refactored into a factory method with submethods for each file type
    pattern = f"*{file_ext}"
    if file_ext[file_ext.rfind(".") :] == ".parquet":
        results = {_file.stem: pl.read_parquet(_file) for _file in _find_files(pattern, directory)}
    else:
        if file_ext == ".tsv":
            separator = "\t"
        if file_ext == ".csv":
            separator = ","
        results = {_file.stem: pl.read_csv(_file, separator=separator) for _file in _find_files(pattern, directory)}
    return {key: df.with_columns(filename=pl.lit(key)) for key, df in results.items()}

read_yaml(filename=None)

Read a YAML file.

Parameters:

Name Type Description Default
filename Union[str, Path]

YAML file to read.

None

Returns:

Type Description
Dict

Dictionary of the file.

Source code in isoslam/io.py
def read_yaml(filename: str | Path | None = None) -> dict[str, Any] | None:
    """
    Read a YAML file.

    Parameters
    ----------
    filename : Union[str, Path]
        YAML file to read. Falls back to the packaged ``default_config.yaml`` when ``None``.

    Returns
    -------
    Dict
        Dictionary of the file, or an empty dictionary if parsing fails.
    """
    if filename is None:
        filename = resources.files(__package__) / "default_config.yaml"  # type: ignore[assignment]
    parser = YAML(typ="safe")
    with Path(filename).open(encoding="utf-8") as f:  # type: ignore[arg-type]
        try:
            return parser.load(f)  # type: ignore[no-any-return]
        except YAMLError as exception:
            # Parsing problems are logged rather than raised; callers receive an empty dict.
            logger.error(exception)
            return {}

write_assigned_conversions(assigned_conversions, coverage_counts, read_uid, assignment, outfile, delim)

Write assigned conversions to files.

Combines the ''coverage_counts'' with the ''assigned_conversions'' and outputs to disk at the specified location and filename with configurable delimiter.

Parameters:

Name Type Description Default
assigned_conversions set[list[Any]]

A set of assigned conversions. Each element of the set is a list of key features (CHECK WHAT THESE ARE).

required
coverage_counts dict[str, int]

A dictionary of coverage counts indexed by CHECK.

required
read_uid int

Integer representing the unique read ID.

required
assignment str

Type of assignment, either ''Rep'' or ''Spl'' (for Splice).

required
outfile Any

Open connection to write results to.

required
delim str

Delimiter to be used between fields, typically '','' for ''.csv'' or ''\t'' for ''.tsv'' output.

required
Source code in isoslam/io.py
def write_assigned_conversions(  # pylint: disable=too-many-positional-arguments
    assigned_conversions: set[list[Any]],
    coverage_counts: dict[str, int],
    read_uid: int,
    assignment: str,
    outfile: TextIOWrapper,
    delim: str,
) -> None:
    r"""
    Write assigned conversions to files.

    Combines the ``coverage_counts`` with the ``assigned_conversions`` and outputs to disk at the specified
    location and filename with configurable delimiter.

    Parameters
    ----------
    assigned_conversions : set[list[Any]]
        A set of assigned conversions. Each element is a ``(transcript_id, (start, end, chromosome, strand))``
        pair (CHECK WHAT THESE ARE).
    coverage_counts : dict[str, int]
        A dictionary of coverage counts with ``converted_position``, ``convertible`` and ``coverage`` keys.
    read_uid : int
        Integer representing the unique read ID.
    assignment : str
        Type of assignment, either ``Rep`` or ``Spl`` (for Splice).
    outfile : TextIOWrapper
        Open connection to write results to.
    delim : str
        Delimiter to be used between fields, typically ``,`` for ``.csv`` or ``\t`` for ``.tsv`` output.
    """
    for transcript_id, (start, end, chromosome, strand) in assigned_conversions:
        fields = (
            read_uid,
            transcript_id,
            start,
            end,
            chromosome,
            strand,
            assignment,
            coverage_counts["converted_position"],
            coverage_counts["convertible"],
            coverage_counts["coverage"],
        )
        outfile.write(delim.join(str(field) for field in fields) + "\n")

write_yaml(config, output_dir, config_file='config.yaml', header_message=None)

Write a configuration (stored as a dictionary) to a YAML file.

Parameters:

Name Type Description Default
config dict

Configuration dictionary.

required
output_dir Union[str, Path]

Path to save the dictionary to as a YAML file (it will be called 'config.yaml').

required
config_file str

Filename to write to.

'config.yaml'
header_message str

String to write to the header message of the YAML file.

None
Source code in isoslam/io.py
def write_yaml(
    config: dict,  # type: ignore[type-arg]
    output_dir: str | Path,
    config_file: str = "config.yaml",
    header_message: str | None = None,
) -> None:
    """
    Write a configuration (stored as a dictionary) to a YAML file.

    Parameters
    ----------
    config : dict
        Configuration dictionary.
    output_dir : Union[str, Path]
        Path to save the dictionary to as a YAML file (it will be called 'config.yaml').
    config_file : str
        Filename to write to.
    header_message : str
        String to write to the header message of the YAML file.
    """
    output_config = Path(output_dir) / config_file
    # PosixPath values are not YAML-serialisable; revert them to plain strings first.
    config = _path_to_str(config)

    message = header_message if header_message else "Configuration from IsoSLAM run completed"
    header = f"# {message} : {_get_date_time()}\n" + CONFIG_DOCUMENTATION_REFERENCE
    output_config.write_text(header, encoding="utf-8")

    with output_config.open("a", encoding="utf-8") as f:
        try:
            YAML(typ="safe").dump(config, f)
        except YAMLError as exception:
            # Serialisation problems are logged rather than raised.
            logger.error(exception)