Codebase list aegean / 96c39eb
New upstream version 0.16.0+dfsg Sascha Steinbiss 5 years ago
10 changed file(s) with 115 addition(s) and 56 deletion(s). Raw diff Collapse all Expand all
00 # Change Log
11 All notable changes to this project will be documented in this file.
22 This project adheres to [Semantic Versioning](http://semver.org/).
3
4 ## [0.16.0] - 2016-05-09
5
6 ### Fixed
7 - Outer gene in intron gene cases now designated as a ciLocus.
8 - ciLoci are now exempt from merging into miLoci.
9 - Fixes to how polycistrons and other edge cases are handled.
310
411 ## [0.15.2] - 2016-03-17
512
0 v0.15.2 stable
0 v0.16.0 stable
77 #!annotation-source NCBI Acromyrmex echinatior Annotation Release 100
88 NW_011626563.1 AEGeAn::LocusPocus locus 1190754 1200781 . . . child_gene=1;child_mRNA=2;riil=769;effective_length=10028;iLocus_type=siLocus
99 NW_011626563.1 AEGeAn::LocusPocus locus 1200782 1201550 . . . fg_orient=FR;effective_length=769;iLocus_type=iiLocus
10 NW_011626563.1 AEGeAn::LocusPocus locus 1201551 1566216 . . . effective_length=364666;iiLocus_exception=complex-overlap-3;liil=769;riil=8884;iLocus_type=siLocus;child_gene=1;child_mRNA=1
10 NW_011626563.1 AEGeAn::LocusPocus locus 1201551 1566216 . . . iLocus_type=ciLocus;effective_length=364666;iiLocus_exception=complex-overlap-3;liil=769;riil=8884;child_gene=1;child_mRNA=1
1111 NW_011626563.1 AEGeAn::LocusPocus locus 1216473 1219475 . . . iiLocus_exception=intron-gene;liil=0;riil=0;iLocus_type=siLocus;child_gene=1;child_mRNA=2
1212 NW_011626563.1 AEGeAn::LocusPocus locus 1537022 1540189 . . . liil=0;riil=0;iLocus_type=niLocus;child_gene=1;child_ncRNA=1
1313 NW_011626563.1 AEGeAn::LocusPocus locus 1566217 1575100 . . . fg_orient=RR;effective_length=8884;iLocus_type=iiLocus
55 #!genome-build-accession NCBI_Assembly:GCF_000220905.1
66 #!annotation-date 7 April 2015
77 #!annotation-source NCBI Megachile rotundata Annotation Release 101
8 NW_003797177.1 AEGeAn::LocusPocus locus 9652 19311 . . . effective_length=9660;iLocus_type=siLocus;child_gene=1;child_mRNA=1
8 NW_003797177.1 AEGeAn::LocusPocus locus 9652 19311 . . . iLocus_type=ciLocus;effective_length=9660;child_gene=1;child_mRNA=1
99 NW_003797177.1 AEGeAn::LocusPocus locus 11405 18146 . . . iiLocus_exception=intron-gene;iLocus_type=siLocus;child_gene=1;child_mRNA=4
66 #!annotation-date 3 June 2014
77 #!annotation-source NCBI Nasonia vitripennis Annotation Release 101
88 NC_015867.2 AEGeAn::LocusPocus locus 370926 373152 . . . child_gene=1;child_mRNA=1;right_overlap=262;iiLocus_exception=delta-overlap-delta;riil=0;effective_length=1965;iLocus_type=siLocus
9 NC_015867.2 AEGeAn::LocusPocus locus 372891 380536 . . . effective_length=6758;iLocus_type=siLocus;child_gene=1;child_mRNA=1
9 NC_015867.2 AEGeAn::LocusPocus locus 372891 380536 . . . iLocus_type=ciLocus;effective_length=6758;child_gene=1;child_mRNA=1
1010 NC_015867.2 AEGeAn::LocusPocus locus 374998 378903 . . . iiLocus_exception=intron-gene;iLocus_type=siLocus;child_gene=1;child_mRNA=1
1111 NC_015867.2 AEGeAn::LocusPocus locus 379649 381835 . . . left_overlap=888;liil=0;child_gene=1;child_mRNA=1;effective_length=2187;iLocus_type=siLocus
4242 </tbody>
4343 </table>
4444
45 <h2>Gene loci <span class="tooltip">[?]<span class="tooltip_text">If a gene annotation overlaps with another gene annotation, those annotations are associated with the same gene locus. See <a target="_blank" href="http://aegean.readthedocs.org/en/refactor/loci.html">this page</a> for a formal definition of a locus annotation.</span></span></h2>
45 <h2>Gene loci <span class="tooltip">[?]<span class="tooltip_text">If a gene annotation overlaps with another gene annotation, those annotations are associated with the same gene locus. See <a target="_blank" href="http://aegean.readthedocs.io/en/latest/loci.html">this page</a> for a formal definition of a locus annotation.</span></span></h2>
4646 <table class="table_normal">
4747 <tr><td>shared</td><td>7</td></tr>
4848 <tr><td>unique to reference</td><td>1</td></tr>
1010 import sys
1111
1212
13 class Locus(object):
14 def __init__(self, line):
15 self._rawdata = line
16 self.fields = line.strip().split('\t')
17 assert len(self.fields) == 9
18
19 @property
20 def seqid(self):
21 return self.fields[0]
22
23 @property
24 def start(self):
25 return int(self.fields[3])
26
27 @property
28 def end(self):
29 return int(self.fields[4])
30
31 @property
32 def ilocus_class(self):
33 typematch = re.search('iLocus_type=([^;\n]+)', self.fields[8])
34 assert typematch, 'could not determine iLocus type: ' + self._rawdata
35 return typematch.group(1)
36
37 @property
38 def mergeable(self):
39 if self.ilocus_class not in ['siLocus', 'niLocus']:
40 return False
41 if 'iiLocus_exception=intron-gene' in self.fields[8]:
42 return False
43 return True
44
45 def __len__(self):
46 return self.end - self.start + 1
47
48 def __str__(self):
49 return '\t'.join(self.fields)
50
51 def strip(self):
52 self.fields[8] = re.sub('ID=[^;\n]+;*', '', self.fields[8])
53 self.fields[8] = re.sub('Name=[^;\n]+;*', '', self.fields[8])
54
55
1356 def merge_iloci(loci):
1457 """Merge ajacent or overlapping gene-containing iLoci."""
1558 assert len(loci) > 0
1659 if len(loci) == 1:
17 line = re.sub('ID=[^;\n]+;*', '', loci[0])
18 line = re.sub('Name=[^;\n]+;*', '', line)
19 return line
60 loci[0].strip()
61 return loci[0]
2062
2163 seqid = None
2264 start, end = -1, -1
2365 attrs = {}
2466 for locus in loci:
25 fields = locus.split('\t')
26 assert len(fields) == 9
2767 if seqid:
28 assert fields[0] == seqid
29 seqid = fields[0]
30 lstart = int(fields[3])
31 lend = int(fields[4])
32 if start == -1 or lstart < start:
33 start = lstart
34 end = max(end, lend)
35 numeric_attrs = re.findall('([^;=]+=\d+)', fields[8])
68 assert locus.seqid == seqid
69 seqid = locus.seqid
70 if start == -1 or locus.start < start:
71 start = locus.start
72 end = max(end, locus.end)
73 numeric_attrs = re.findall('([^;=]+=\d+)', locus.fields[8])
3674 for key_value_pair in numeric_attrs:
3775 assert '=' in key_value_pair, \
3876 'malformed key/value pair %s' % key_value_pair
4886 for key in sorted(attrs):
4987 attrstring += ';%s=%d' % (key, attrs[key])
5088 gff3 = [seqid, 'AEGeAn::miloci.py', 'locus', str(start), str(end),
51 '%d' % len(loci), '.', '.', attrstring]
52 return '\t'.join(gff3)
89 str(len(loci)), '.', '.', attrstring]
90 line = '\t'.join(gff3)
91 return Locus(line)
5392
5493
5594 def parse_iloci(fp):
5897 Output: merged iLoci; gene-containing iLoci that are adjacent or
5998 overlapping are combined
6099 """
61 seqid = None
62 prev_loci = []
100 locus_buffer = []
63101 for line in fp:
64 line = line.rstrip()
65102 if '\tlocus\t' not in line:
66103 continue
104 locus = Locus(line)
67105
68 locusseqid = re.match('([^\t]+)', line).group(1)
69 if seqid is None:
70 seqid = locusseqid
71 elif locusseqid != seqid:
72 if len(prev_loci) > 0:
73 yield merge_iloci(prev_loci)
74 prev_loci = []
75 seqid = locusseqid
106 if len(locus_buffer) > 0 and locus.seqid != locus_buffer[0].seqid:
107 yield merge_iloci(locus_buffer)
108 locus_buffer = []
76109
77 if ';child_gene=' in line:
78 prev_loci.append(line)
110 if locus.mergeable:
111 locus_buffer.append(locus)
79112 continue
80113 else:
81 if len(prev_loci) > 0:
82 yield merge_iloci(prev_loci)
83 prev_loci = []
84 line = re.sub('ID=[^;\n]+;*', '', line)
85 line = re.sub('Name=[^;\n]+;*', '', line)
86 yield line
87 if len(prev_loci) > 0:
88 yield merge_iloci(prev_loci)
114 if len(locus_buffer) > 0:
115 yield merge_iloci(locus_buffer)
116 locus_buffer = []
117 locus.strip()
118 yield locus
119
120 if len(locus_buffer) > 0:
121 yield merge_iloci(locus_buffer)
89122
90123
91124 if __name__ == '__main__':
402402 fprintf(outstream,
403403 " <p class=\"footer\">\n"
404404 " Generated by <a href=\"%s\">AEGeAn %s (%s %s)</a>.<br />\n"
405 " Copyright © %s <a href=\"http://aegean.readthedocs.org/en/"
405 " Copyright © %s <a href=\"http://aegean.readthedocs.io/en/"
406406 "latest/contrib.html\">AEGeAn authors</a>.<br />\n"
407407 " See <a href=\"LICENSE\">LICENSE</a> for details."
408408 " </p>\n", AGN_VERSION_LINK, AGN_SEMANTIC_VERSION,
711711 {
712712 GtArray *compclassdata;
713713 GtUword i;
714
714
715715 compclassdata = gt_hashmap_get(rpt->compclassdata, "perfect");
716716 if(gt_array_size(compclassdata) > 0)
717717 {
733733 compare_report_html_seqfile_footer(outstream);
734734 fclose(outstream);
735735 }
736
736
737737 compclassdata = gt_hashmap_get(rpt->compclassdata, "mislabeled");
738738 if(gt_array_size(compclassdata) > 0)
739739 {
756756 compare_report_html_seqfile_footer(outstream);
757757 fclose(outstream);
758758 }
759
759
760760 compclassdata = gt_hashmap_get(rpt->compclassdata, "cds");
761761 if(gt_array_size(compclassdata) > 0)
762762 {
778778 compare_report_html_seqfile_footer(outstream);
779779 fclose(outstream);
780780 }
781
781
782782 compclassdata = gt_hashmap_get(rpt->compclassdata, "exon");
783783 if(gt_array_size(compclassdata) > 0)
784784 {
800800 compare_report_html_seqfile_footer(outstream);
801801 fclose(outstream);
802802 }
803
803
804804 compclassdata = gt_hashmap_get(rpt->compclassdata, "utr");
805805 if(gt_array_size(compclassdata) > 0)
806806 {
822822 compare_report_html_seqfile_footer(outstream);
823823 fclose(outstream);
824824 }
825
825
826826 compclassdata = gt_hashmap_get(rpt->compclassdata, "nonmatch");
827827 if(gt_array_size(compclassdata) > 0)
828828 {
11261126 " <h2>Gene loci <span class=\"tooltip\">[?]<span class=\"tooltip_text\">If a gene "
11271127 "annotation overlaps with another gene annotation, those annotations are associated "
11281128 "with the same gene locus. See <a target=\"_blank\" "
1129 "href=\"http://aegean.readthedocs.org/en/refactor/loci.html\">"
1129 "href=\"http://aegean.readthedocs.io/en/latest/loci.html\">"
11301130 "this page</a> for a formal definition of a locus annotation.</span></span></h2>\n"
11311131 " <table class=\"table_normal\">\n"
11321132 " <tr><td>shared</td><td>%lu</td></tr>\n"
310310 GtStr *seqid = gt_genome_node_get_seqid(*gn1);
311311 AgnLocus *locus = agn_locus_new(seqid);
312312 agn_locus_add_feature(locus, fn1);
313 gt_feature_node_add_attribute((GtFeatureNode *)locus, "iLocus_type",
314 "ciLocus");
313315 gt_genome_node_ref(*gn1);
314316 gt_array_add(iloci, locus);
315317
407409 for(i = 0; i < numloci; i++)
408410 {
409411 GtGenomeNode **gn = gt_array_get(iloci, i);
412 GtFeatureNode *fn = gt_feature_node_cast(*gn);
410413 if(i == 0)
411 coding_status = agn_locus_num_mrnas(*gn) > 0;
414 {
415 coding_status = agn_typecheck_count(fn, agn_typecheck_cds) > 0;
416 }
412417 else
413418 {
414 bool test_status = agn_locus_num_mrnas(*gn) > 0;
419 bool test_status = agn_typecheck_count(fn, agn_typecheck_cds) > 0;
415420 same_coding_status = coding_status == test_status;
416421 if(!same_coding_status)
417422 break;
466471 fprintf(stream->ilenfile, "%s\t0\n", gt_str_get(seqid));
467472 }
468473 }
469 gt_feature_node_add_attribute(fn, "iLocus_type", typestr);
474 if(gt_feature_node_get_attribute(fn, "iLocus_type") == NULL)
475 {
476 gt_feature_node_add_attribute(fn, "iLocus_type", typestr);
477 }
470478 }
471479 if(numloci == 1)
472480 {
488496 GtFeatureNode *fn1 = gt_feature_node_cast(*gn1);
489497 GtFeatureNode *fn2 = gt_feature_node_cast(*gn2);
490498
491 bool cds1 = agn_locus_num_mrnas(*gn1) > 0;
499 bool cds1 = agn_typecheck_count(fn1, agn_typecheck_cds) > 0;;
492500 if(cds1 == true)
493501 {
494502 gt_feature_node_add_attribute(fn1, "iLocus_type", "siLocus");
623631 {
624632 type = "ciLocus";
625633 }
626 gt_feature_node_add_attribute(fn, "iLocus_type", type);
634 if(gt_feature_node_get_attribute(fn, "iLocus_type") == NULL)
635 {
636 gt_feature_node_add_attribute(fn, "iLocus_type", type);
637 }
627638 }
628639 }
629640
268268 if(gt_str_cmp(seqid1, seqid2) != 0)
269269 return false;
270270
271 GtRange r1 = gt_genome_node_get_range(f1);
272 GtRange r2 = gt_genome_node_get_range(f2);
273
271274 if(by_cds)
272275 {
273276 GtRange c1 = agn_feature_node_get_cds_range((GtFeatureNode *)f1);
285288 {
286289 // Both have coding sequences, use those instead of the complete feature
287290 // coordinates.
291
292 if(gt_range_compare(&r1, &r2) == 0)
293 {
294 // Polycistrons belong together
295 return true;
296 }
297
288298 return gt_range_overlap_delta(&c1, &c2, minoverlap);
289299 }
290300 }
291301
292302 // Either we are not in CDS mode, or the features don't have a CDS.
293 GtRange r1 = gt_genome_node_get_range(f1);
294 GtRange r2 = gt_genome_node_get_range(f2);
295303 return gt_range_overlap_delta(&r1, &r2, minoverlap);
296304 }
297305