Skip to content

IO Modules

Module for reading and writing files.

create_config(args=None)

Write the default configuration file to disk.

Parameters:

Name Type Description Default
args Namespace | None

Optional arguments to parse.

None
Source code in isoslam/io.py
def create_config(args: argparse.Namespace | None = None) -> None:
    """
    Write the default configuration file to disk.

    Copies the packaged ``default_config.yaml`` to the requested location, prefixed with a
    timestamped header and a pointer to the configuration documentation.

    Parameters
    ----------
    args : argparse.Namespace | None
        Optional arguments to parse. ``args.filename`` and ``args.output_dir`` are read; the
        defaults ``config`` and ``./`` are used when these are ``None``.
    """
    filename = "config" if args.filename is None else args.filename  # type: ignore [union-attr]
    output_dir = Path("./") if args.output_dir is None else Path(args.output_dir)  # type: ignore [union-attr]
    output_dir.mkdir(parents=True, exist_ok=True)
    config_path = resources.files(__package__) / "default_config.yaml"
    config = config_path.read_text()

    # Ensure the output file always carries a YAML extension; append one when the
    # user-supplied filename has neither ``.yaml`` nor ``.yml`` in it.
    if ".yaml" not in str(filename) and ".yml" not in str(filename):
        create_config_path = output_dir / f"{filename}.yaml"
    else:
        create_config_path = output_dir / filename

    with create_config_path.open("w", encoding="utf-8") as f:
        f.write(f"# Config file generated {_get_date_time()}\n")
        f.write(f"{CONFIG_DOCUMENTATION_REFERENCE}")
        f.write(config)
    logger.info(f"A sample configuration file has been written to : {str(create_config_path)}")
    logger.info(CONFIG_DOCUMENTATION_REFERENCE)

data_frame_to_file(data, output_dir='./output/', outfile='summary_counts.tsv', sep='\t', **kwargs)

Write a Pandas DataFrame to disk.

Parameters:

Name Type Description Default
data DataFrame | DataFrame

Pandas DataFrame to write to disk.

required
output_dir str | Path

Location to write the output to, default is ''./output/''.

'./output/'
outfile str

Filename to write data to.

'summary_counts.tsv'
sep str

Separator to use in output file.

'\t'
**kwargs dict[Any, Any]

Dictionary of keyword arguments to pass to ''pandas.DataFrame.to_csv()''.

{}
Source code in isoslam/io.py
def data_frame_to_file(
    data: pd.DataFrame | pl.DataFrame,
    output_dir: str | Path = "./output/",
    outfile: str = "summary_counts.tsv",
    sep: str = "\t",
    **kwargs: dict[Any, Any],
) -> None:
    """
    Write a Pandas or Polars DataFrame to disk.

    The output format is inferred from ``outfile`` : names ending in ``parquet`` are written
    as Parquet, names ending in ``.csv``/``.tsv`` (any ``.?sv``) as delimited text. Any other
    suffix results in a warning and no file being written.

    Parameters
    ----------
    data : pd.DataFrame | pl.DataFrame
        Pandas or Polars DataFrame to write to disk.
    output_dir : str | Path
        Location to write the output to, default is ''./output/''. Created if absent.
    outfile : str
        Filename to write data to.
    sep : str
        Separator to use in output file.
    **kwargs
        Dictionary of keyword arguments to pass to ''pandas.DataFrame.to_csv()'' or the
        Polars/parquet equivalents.

    Raises
    ------
    TypeError
        If ``data`` is neither a Pandas nor a Polars DataFrame.
    """
    outdir_file = Path(output_dir) / f"{outfile}"
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    if isinstance(data, pl.DataFrame):
        if re.search(r"parquet$", str(outfile)):
            data.write_parquet(outdir_file, **kwargs)
        elif re.search(r"\..sv$", str(outfile)):
            data.write_csv(outdir_file, separator=sep, **kwargs)
        else:
            # Nothing was written, so do not log a claim that it was.
            logger.warning(f"Unsupported output file extension, nothing written : {outdir_file}")
            return
        logger.debug(f"File written to : {outdir_file}")
    elif isinstance(data, pd.DataFrame):
        # Anchor the parquet pattern (``parquet$``) to match the Polars branch above.
        if re.search(r"parquet$", str(outfile)):
            data.to_parquet(outdir_file, **kwargs)
        elif re.search(r"\..sv$", str(outfile)):
            data.to_csv(outdir_file, sep=sep, **kwargs)
        else:
            logger.warning(f"Unsupported output file extension, nothing written : {outdir_file}")
            return
        logger.debug(f"File written to : {outdir_file}")
    else:
        raise TypeError(f"Can not write output Pandas or Polar Dataframe object not supplied = {type(data)=}")

load_and_update_config(args)

Load a configuration file to dictionary and update entries with user supplied arguments.

If ''args'' does not contain any value for ''args.config_file'' the default configuration (''isoslam/default_config.yaml'') is loaded, otherwise the user specified configuration is loaded.

Once the configuration is loaded any user specified options update the dictionary.

Parameters:

Name Type Description Default
args Namespace

Arguments supplied by user.

required

Returns:

Type Description
dict[str:Any]

Dictionary of configuration options updated with user specified options.

Source code in isoslam/io.py
def load_and_update_config(args: argparse.Namespace | None) -> dict[str, Any]:
    """
    Load a configuration file to dictionary and update entries with user supplied arguments.

    If ``args`` does not contain any value for ``args.config_file`` the default configuration
    (``isoslam/default_config.yaml``) is loaded, otherwise the user specified configuration is loaded.

    Once the configuration is loaded any user specified options update the dictionary.

    Parameters
    ----------
    args : argparse.Namespace
        Arguments supplied by user.

    Returns
    -------
    dict[str, Any]
        Dictionary of configuration options updated with user specified options.
    """
    arg_dict = vars(args)
    config_file = arg_dict["config_file"]
    # Fall back to the packaged default configuration when no file was supplied.
    config = read_yaml() if config_file is None else read_yaml(config_file)
    config["schema"] = _type_schema(config["schema"])  # type: ignore[index]
    return utils.update_config(config, arg_dict)  # type: ignore[arg-type]

load_file(file_path)

Load files of different types.

Supports the following file types...

  • .bam - The sequence data that is to be analysed.
  • .bed - The locations of introns/splice junctions.
  • .gtf - Transcript structures from which the .bed file is derived.
  • .vcf - Locations of known sequences difference from the reference sequence.

Parameters:

Name Type Description Default
file_path str | Path

Path to file to load.

required

Returns:

Type Description
Any

Returns the loaded file as an object.

Source code in isoslam/io.py
def load_file(file_path: str | Path) -> Any:
    """
    Load files of different types.

    Supports the following file types...

    * ``.bam`` - The sequence data that is to be analysed.
    * ``.bed`` - The locations of introns/splice junctions.
    * ``.gtf`` - Transcript structures from which the ``.bed`` file is derived.
    * ``.vcf`` - Locations of known sequences difference from the reference sequence.

    Parameters
    ----------
    file_path : str | Path
        Path to file to load.

    Returns
    -------
    Any
        Returns the loaded file as an object.
    """
    path = Path(file_path)
    # Gzipped files need the full suffix chain (e.g. ``.vcf.gz``) to select the right loader.
    suffix = "".join(path.suffixes) if path.suffix == ".gz" else path.suffix
    loader = _get_loader(suffix)
    return loader(file_path)

load_output_files(file_ext='.tsv', directory=None)

Read a set of files into a list of Polars DataFrames.

Supports reading ''.parquet'', ''.tsv'' and ''.csv''.

Parameters:

Name Type Description Default
file_ext str

File name pattern to search for.

'.tsv'
directory str | Path | None

Directory to search for files.

None

Returns:

Type Description
dict[str, DataFrame]

A dictionary of Polars DataFrames keyed by file stem, one entry per file found.

Source code in isoslam/io.py
def load_output_files(file_ext: str = ".tsv", directory: str | Path | None = None) -> dict[str, pl.DataFrame]:
    """
    Read a set of files into a list of Polars DataFrames.

    Supports reading ''.parquet'', ''.tsv'' and ``.csv``.

    Parameters
    ----------
    file_ext : str
        File name pattern to search for.
    directory : str | Path | None
        Directory to search for files.

    Returns
    -------
    list[pl.DataFrame]
        A list of Polars DataFrames of each file found.
    """
    # This function could be refactored into a factory method with submethods for each file type
    pattern = f"*{file_ext}"
    if file_ext[file_ext.rfind(".") :] == ".parquet":
        results = {_file.stem: pl.read_parquet(_file) for _file in _find_files(pattern, directory)}
    else:
        if file_ext == ".tsv":
            separator = "\t"
        if file_ext == ".csv":
            separator = ","
        results = {_file.stem: pl.read_csv(_file, separator=separator) for _file in _find_files(pattern, directory)}
    return {key: df.with_columns(filename=pl.lit(key)) for key, df in results.items()}

read_yaml(filename=None)

Read a YAML file.

Parameters:

Name Type Description Default
filename Union[str, Path]

YAML file to read.

None

Returns:

Type Description
Dict

Dictionary of the file.

Source code in isoslam/io.py
def read_yaml(filename: str | Path | None = None) -> dict[str, Any] | None:
    """
    Read a YAML file.

    Parameters
    ----------
    filename : Union[str, Path]
        YAML file to read. Falls back to the packaged ``default_config.yaml`` when ``None``.

    Returns
    -------
    Dict
        Dictionary of the file, or an empty dictionary if parsing fails.
    """
    if filename is None:
        filename = resources.files(__package__) / "default_config.yaml"  # type: ignore[assignment]
    parser = YAML(typ="safe")
    with Path(filename).open(encoding="utf-8") as f:  # type: ignore[arg-type]
        try:
            return parser.load(f)  # type: ignore[no-any-return]
        except YAMLError as exception:
            # Parsing problems are logged rather than raised; callers receive an empty dict.
            logger.error(exception)
            return {}

write_assigned_conversions(assigned_conversions, coverage_counts, read_uid, assignment, outfile, delim)

Write assigned conversions to files.

Combines the ''coverage_counts'' with the ''assigned_conversions'' and outputs to disk at the specified location and filename with configurable delimiter.

Parameters:

Name Type Description Default
assigned_conversions set[list[Any]]

A set of assigned conversions. Each element of the set is a list of key features (CHECK WHAT THESE ARE).

required
coverage_counts dict[str, int]

A dictionary of coverage counts indexed by CHECK.

required
read_uid int

Integer representing the unique read ID.

required
assignment str

Type of assignment, either ''Rep'' or ''Spl'' (for Splice).

required
outfile Any

Open connection to write results to.

required
delim str

Delimiter to be used between fields, typically '','' for ''.csv'' or ''\t'' for ''.tsv'' output.

required
Source code in isoslam/io.py
def write_assigned_conversions(  # pylint: disable=too-many-positional-arguments
    assigned_conversions: set[list[Any]],
    coverage_counts: dict[str, int],
    read_uid: int,
    assignment: str,
    outfile: TextIOWrapper,
    delim: str,
) -> None:
    r"""
    Write assigned conversions to files.

    Combines the ``coverage_counts`` with the ``assigned_conversions`` and outputs to disk at the specified
    location and filename with configurable delimiter.

    Parameters
    ----------
    assigned_conversions : set[list[Any]]
        A set of assigned conversions. Each element is a ``(transcript_id, (start, end, chromosome, strand))``
        pair (CHECK WHAT THESE ARE).
    coverage_counts : dict[str, int]
        A dictionary of coverage counts with ``converted_position``, ``convertible`` and ``coverage`` keys.
    read_uid : int
        Integer representing the unique read ID.
    assignment : str
        Type of assignment, either ``Rep`` or ``Spl`` (for Splice).
    outfile : TextIOWrapper
        Open connection to write results to.
    delim : str
        Delimiter to be used between fields, typically ``,`` for ``.csv`` or ``\t`` for ``.tsv`` output.
    """
    for transcript_id, (start, end, chromosome, strand) in assigned_conversions:
        fields = (
            read_uid,
            transcript_id,
            start,
            end,
            chromosome,
            strand,
            assignment,
            coverage_counts["converted_position"],
            coverage_counts["convertible"],
            coverage_counts["coverage"],
        )
        outfile.write(delim.join(str(field) for field in fields) + "\n")

write_yaml(config, output_dir, config_file='config.yaml', header_message=None)

Write a configuration (stored as a dictionary) to a YAML file.

Parameters:

Name Type Description Default
config dict

Configuration dictionary.

required
output_dir Union[str, Path]

Path to save the dictionary to as a YAML file (it will be called 'config.yaml').

required
config_file str

Filename to write to.

'config.yaml'
header_message str

String to write to the header message of the YAML file.

None
Source code in isoslam/io.py
def write_yaml(
    config: dict,  # type: ignore[type-arg]
    output_dir: str | Path,
    config_file: str = "config.yaml",
    header_message: str | None = None,
) -> None:
    """
    Write a configuration (stored as a dictionary) to a YAML file.

    Parameters
    ----------
    config : dict
        Configuration dictionary.
    output_dir : Union[str, Path]
        Path to save the dictionary to as a YAML file (it will be called 'config.yaml').
    config_file : str
        Filename to write to.
    header_message : str
        String to write to the header message of the YAML file.
    """
    output_config = Path(output_dir) / config_file
    # PosixPath values are not YAML-serialisable; revert them to plain strings first.
    config = _path_to_str(config)

    message = header_message if header_message else "Configuration from IsoSLAM run completed"
    header = f"# {message} : {_get_date_time()}\n" + CONFIG_DOCUMENTATION_REFERENCE
    output_config.write_text(header, encoding="utf-8")

    with output_config.open("a", encoding="utf-8") as f:
        try:
            YAML(typ="safe").dump(config, f)
        except YAMLError as exception:
            # Serialisation problems are logged rather than raised.
            logger.error(exception)