# Source code for cnvfinder.tsvparser.tsvparser
import sys
from collections import defaultdict
from bedhandler.handler import BedFileLoader
from pandas import DataFrame
class CoverageFileParser:
    """
    Parse an amplicon coverage file loaded by BedFileLoader.

    On construction the file is loaded, checked to be an amplicon coverage
    file, and split into two attributes:

    * ``targets``: a DataFrame with the columns
      ``chrom, chromStart, chromEnd, gene, pools`` (one row per line)
    * ``counters``: a list with the ``total_reads`` value of each line,
      converted to int, in the same order as the rows of ``targets``

    If the loaded file is not an amplicon coverage file, a message is
    printed and ``sys.exit(1)`` is called.

    :param str filename: path to amplicon.cov file
    """

    # Column names that define_targets reads from every line.
    _REQUIRED_COLUMNS = ('chrom', 'chrom_start', 'chrom_end',
                         'gene', 'pools', 'total_reads')

    def __init__(self, filename: str):
        bed_file = BedFileLoader(filename)
        print('Loading coverage data from {}'.format(filename))
        # The expected file-type constant is reached through its
        # name-mangled form (BedFileLoader.__amplicon_cov).
        # noinspection PyProtectedMember
        if bed_file.file_type != bed_file._BedFileLoader__amplicon_cov:
            print('{} is not a valid amplicon_coverage file'.format(filename))
            sys.exit(1)
        self.targets, self.counters = self.define_targets(bed_file.expand_columns(), bed_file.columns)

    def define_targets(self, lines, columns) -> tuple:
        """
        Extract the columns of interest from lines and split them in two
        entities: one DataFrame representing actual targets
        [chrom, chromStart, chromEnd, gene, pools] and a list representing
        the number of reads for each target.

        :param list lines: actual data; each item is indexable by column position
        :param list columns: column names, in the same order as the values of each line
        :return: a DataFrame describing the targets and a list of counters
        :raises ValueError: if a required column name is absent from
            ``columns``, or if a start/end/total_reads value is not an integer
        """
        columns_map = self.create_column_map(columns)

        # create_column_map returns a defaultdict(int): without this check a
        # missing column name would silently resolve to index 0 and the
        # first column would be read in its place.
        missing = [name for name in self._REQUIRED_COLUMNS if name not in columns_map]
        if missing:
            raise ValueError('missing required column(s): {}'.format(', '.join(missing)))

        # Resolve the positions once instead of once per line.
        chrom_idx = columns_map['chrom']
        start_idx = columns_map['chrom_start']
        end_idx = columns_map['chrom_end']
        gene_idx = columns_map['gene']
        pools_idx = columns_map['pools']
        reads_idx = columns_map['total_reads']

        targets = []
        counters = []
        # Single pass so that lines is consumed only once.
        for line in lines:
            targets.append([line[chrom_idx],
                            int(line[start_idx]),
                            int(line[end_idx]),
                            str(line[gene_idx]),
                            line[pools_idx]])
            counters.append(int(line[reads_idx]))
        return DataFrame(targets, columns=['chrom', 'chromStart', 'chromEnd', 'gene', 'pools']), counters

    @staticmethod
    def create_column_map(columns) -> defaultdict:
        """
        Create a dict mapping each column name to its position.

        When a name appears more than once, the last position wins. The
        result is a ``defaultdict(int)``, so looking up an unknown name
        yields 0 (and inserts it) rather than raising ``KeyError``; use
        ``in`` to test whether a column is really present.

        :param list columns: list of columns
        :return: a dictionary mapping 'columns' values and indexes
        """
        return defaultdict(int, {column: i for i, column in enumerate(columns)})