Skip to content

Exon counts when there are genes on both strands #253

@skagawa2

Description

@skagawa2

Hello! We were looking at the exon counts again and saw some discrepancies from what we expected. This one comes from reads going on one strand counting as being excluded from the gene on the opposite strand. The example we found was with this gene below- F2RL2 and IQGAP2 (and we presume there are a handful more that get caught under this case). We saw that in exon_counts.tsv, the exclude counts for F2RL2 exons a sample that expressed IQGAP2 and not F2RL2 was high. This wasn't what we expected to happen when an exon is included or excluded since our reads are stranded.

image

We came up with the following patch that we think fixes this problem, but we're not sure if this messes anything else up in unexpected ways (like the transcript classification or other parts of the pipeline). If this was also intended (that exons aren't stranded), please let us know as well!

diff --git a/src/alignment_processor.py b/src/alignment_processor.py
index 8644a47..2833991 100644
--- a/src/alignment_processor.py
+++ b/src/alignment_processor.py
@@ -272,23 +272,23 @@ class AlignmentCollector:
         split_regions = AlignmentCollector.split_coverage_regions(current_region, alignment_storage)
 
         if len(split_regions) == 1:
-            yield self.process_alignments_in_region(current_region, alignment_storage.get_alignments())
+            current_region, alignments, strand = split_regions[0]
+            yield self.process_alignments_in_region(current_region, alignments, strand)
         else:
-            for new_region in split_regions:
-                alignments = alignment_storage.get_alignments(new_region)
-                yield self.process_alignments_in_region(new_region, alignments)
+            for new_region, alignments, strand in split_regions:
+                yield self.process_alignments_in_region(new_region, alignments, strand)
 
-    def process_alignments_in_region(self, current_region, alignment_storage):
-        logger.debug("Processing region %s" % str(current_region))
-        gene_info = self.get_gene_info_for_region(current_region)
+    def process_alignments_in_region(self, current_region, alignment_storage, strand):
+        logger.debug("Processing region %s on strand %s" % (str(current_region), strand))
+        gene_info = self.get_gene_info_for_region(current_region, strand=strand)
         if gene_info.empty():
-            assignment_storage = self.process_intergenic(alignment_storage, current_region)
+            assignment_storage = self.process_intergenic(alignment_storage, current_region, strand)
         else:
-            assignment_storage = self.process_genic(alignment_storage, gene_info, current_region)
+            assignment_storage = self.process_genic(alignment_storage, gene_info, current_region, strand)
 
         return gene_info, assignment_storage
 
-    def process_intergenic(self, alignment_storage, region):
+    def process_intergenic(self, alignment_storage, region, strand):
         assignment_storage = []
         if self.illumina_bam is not None:
             corrector = IlluminaExonCorrector(self.chr_id, region[0], region[1], self.illumina_bam)
@@ -310,6 +310,10 @@ class AlignmentCollector:
                 logger.warning("Read %s has no aligned exons" % read_id)
                 continue
 
+            mapped_strand = "-" if alignment.is_reverse else "+"
+            if strand != '.' and mapped_strand != strand:
+                continue
+
             #if len(alignment_info.read_exons) > 2 and not alignment.is_secondary and \
             #        alignment.mapping_quality < self.params.multi_intron_mapping_quality_cutoff:
             #    continue
@@ -338,7 +342,7 @@ class AlignmentCollector:
             read_assignment.corrected_introns = junctions_from_blocks(read_assignment.corrected_exons)
 
             read_assignment.read_group = self.read_groupper.get_group_id(alignment, self.bam_merger.bam_pairs[bam_index][1])
-            read_assignment.mapped_strand = "-" if alignment.is_reverse else "+"
+            read_assignment.mapped_strand = mapped_strand
             read_assignment.strand = self.get_assignment_strand(read_assignment)
             read_assignment.chr_id = self.chr_id
             read_assignment.multimapper = alignment.is_secondary
@@ -347,7 +351,7 @@ class AlignmentCollector:
             assignment_storage.append(read_assignment)
         return assignment_storage
 
-    def process_genic(self, alignment_storage, gene_info, region):
+    def process_genic(self, alignment_storage, gene_info, region, strand):
         assigner = LongReadAssigner(gene_info, self.params)
         profile_constructor = CombinedProfileConstructor(gene_info, self.params)
         exon_corrector = ExonCorrector(gene_info, self.params, self.chr_record)
@@ -369,6 +373,10 @@ class AlignmentCollector:
                 logger.warning("Read %s has no aligned exons" % read_id)
                 continue
 
+            mapped_strand = "-" if alignment.is_reverse else "+"
+            if strand != '.' and mapped_strand != strand:
+                continue
+
             alignment_info.add_polya_info(self.polya_finder, self.polya_fixer)
             if self.params.cage:
                 alignment_info.add_cage_info(self.cage_finder)
@@ -396,7 +404,7 @@ class AlignmentCollector:
             read_assignment.corrected_introns = junctions_from_blocks(read_assignment.corrected_exons)
 
             read_assignment.read_group = self.read_groupper.get_group_id(alignment, self.bam_merger.bam_pairs[bam_index][1])
-            read_assignment.mapped_strand = "-" if alignment.is_reverse else "+"
+            read_assignment.mapped_strand = mapped_strand
             read_assignment.strand = self.get_assignment_strand(read_assignment)
             AlignmentCollector.check_antisense(read_assignment)
             AlignmentCollector.import_bam_tags(alignment, read_assignment, self.params.bam_tags)
@@ -458,13 +466,13 @@ class AlignmentCollector:
 
         return indel_count, junctions_with_indels
 
-    def get_gene_info_for_region(self, current_region):
+    def get_gene_info_for_region(self, current_region, strand='.'):
         if not self.genedb:
             return GeneInfo.from_region(self.chr_id, current_region[0], current_region[1],
                                         self.params.delta, self.chr_record)
 
         gene_list = list(self.genedb.region(seqid=self.chr_id, start=current_region[0],
-                                            end=current_region[1], featuretype="gene"))
+                                            end=current_region[1], featuretype="gene", strand=strand))
         if not gene_list:
             return GeneInfo.from_region(self.chr_id, current_region[0], current_region[1],
                                         self.params.delta, self.chr_record)
@@ -478,7 +486,19 @@ class AlignmentCollector:
     def split_coverage_regions(genomic_region, alignment_storage):
         if interval_len(genomic_region) < AlignmentCollector.MAX_REGION_LEN and \
                 alignment_storage.get_read_count() < AlignmentCollector.MIN_READS_TO_SPLIT:
-            return [genomic_region]
+            pos_strand_alignments = []
+            neg_strand_alignments = []
+            for alignment in alignment_storage.get_alignments():
+                if not alignment[1].is_reverse:
+                    pos_strand_alignments.append(alignment)
+                else:
+                    neg_strand_alignments.append(alignment)
+            split_regions = []
+            if pos_strand_alignments:
+                split_regions.append((genomic_region, pos_strand_alignments, '+'))
+            if neg_strand_alignments:
+                split_regions.append((genomic_region, neg_strand_alignments, '-'))
+            return split_regions
 
         split_regions = []
         coverage_dict = alignment_storage.coverage_dict
@@ -492,8 +512,17 @@ class AlignmentCollector:
                     coverage_dict[pos] > max(AlignmentCollector.ABS_COV_VALLEY, max_cov * AlignmentCollector.REL_COV_VALLEY):
                 max_cov = max(max_cov, coverage_dict[pos])
                 pos += 1
-            split_regions.append((max(current_start * AbstractAlignmentStorage.COVERAGE_BIN + 1, genomic_region[0]),
-                                  min(pos * AbstractAlignmentStorage.COVERAGE_BIN, genomic_region[1])))
+            split_region = (max(current_start * AbstractAlignmentStorage.COVERAGE_BIN + 1, genomic_region[0]),
+                            min(pos * AbstractAlignmentStorage.COVERAGE_BIN, genomic_region[1]))
+            alignments = list(alignment_storage.get_alignments(split_region))
+            pos_strand_alignments = [(bam_index, aln) for bam_index, aln in alignments if not aln.is_reverse]
+            neg_strand_alignments = [(bam_index, aln) for bam_index, aln in alignments if aln.is_reverse]
+
+            if pos_strand_alignments:
+                split_regions.append((split_region, pos_strand_alignments, '+'))
+            if neg_strand_alignments:
+                split_regions.append((split_region, neg_strand_alignments, '-'))
+
             current_start = pos
             max_cov = coverage_dict[current_start]
             pos = min(current_start + 1, coverage_positions[-1] + 1)

Once again, thank you for creating this software!

Metadata

Metadata

Assignees

Labels

algorithmIssue requires algorithmic improvementenhancementNew feature or requestfixed in devIssue resolved but not released yet

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions