New upstream version 0.16.0+dfsg
Sascha Steinbiss
5 years ago
0 | 0 | # Change Log |
1 | 1 | All notable changes to this project will be documented in this file. |
2 | 2 | This project adheres to [Semantic Versioning](http://semver.org/). |
3 | ||
4 | ## [0.16.0] - 2016-05-09 | |
5 | ||
6 | ### Fixed | |
7 | - Outer gene in intron gene cases now designated as a ciLocus. | |
8 | - ciLoci are now exempt from merging into miLoci. | |
9 | - Fixes to how polycistrons and other edge cases are handled. | |
3 | 10 | |
4 | 11 | ## [0.15.2] - 2016-03-17 |
5 | 12 |
7 | 7 | #!annotation-source NCBI Acromyrmex echinatior Annotation Release 100 |
8 | 8 | NW_011626563.1 AEGeAn::LocusPocus locus 1190754 1200781 . . . child_gene=1;child_mRNA=2;riil=769;effective_length=10028;iLocus_type=siLocus |
9 | 9 | NW_011626563.1 AEGeAn::LocusPocus locus 1200782 1201550 . . . fg_orient=FR;effective_length=769;iLocus_type=iiLocus |
10 | NW_011626563.1 AEGeAn::LocusPocus locus 1201551 1566216 . . . effective_length=364666;iiLocus_exception=complex-overlap-3;liil=769;riil=8884;iLocus_type=siLocus;child_gene=1;child_mRNA=1 | |
10 | NW_011626563.1 AEGeAn::LocusPocus locus 1201551 1566216 . . . iLocus_type=ciLocus;effective_length=364666;iiLocus_exception=complex-overlap-3;liil=769;riil=8884;child_gene=1;child_mRNA=1 | |
11 | 11 | NW_011626563.1 AEGeAn::LocusPocus locus 1216473 1219475 . . . iiLocus_exception=intron-gene;liil=0;riil=0;iLocus_type=siLocus;child_gene=1;child_mRNA=2 |
12 | 12 | NW_011626563.1 AEGeAn::LocusPocus locus 1537022 1540189 . . . liil=0;riil=0;iLocus_type=niLocus;child_gene=1;child_ncRNA=1 |
13 | 13 | NW_011626563.1 AEGeAn::LocusPocus locus 1566217 1575100 . . . fg_orient=RR;effective_length=8884;iLocus_type=iiLocus |
5 | 5 | #!genome-build-accession NCBI_Assembly:GCF_000220905.1 |
6 | 6 | #!annotation-date 7 April 2015 |
7 | 7 | #!annotation-source NCBI Megachile rotundata Annotation Release 101 |
8 | NW_003797177.1 AEGeAn::LocusPocus locus 9652 19311 . . . effective_length=9660;iLocus_type=siLocus;child_gene=1;child_mRNA=1 | |
8 | NW_003797177.1 AEGeAn::LocusPocus locus 9652 19311 . . . iLocus_type=ciLocus;effective_length=9660;child_gene=1;child_mRNA=1 | |
9 | 9 | NW_003797177.1 AEGeAn::LocusPocus locus 11405 18146 . . . iiLocus_exception=intron-gene;iLocus_type=siLocus;child_gene=1;child_mRNA=4 |
6 | 6 | #!annotation-date 3 June 2014 |
7 | 7 | #!annotation-source NCBI Nasonia vitripennis Annotation Release 101 |
8 | 8 | NC_015867.2 AEGeAn::LocusPocus locus 370926 373152 . . . child_gene=1;child_mRNA=1;right_overlap=262;iiLocus_exception=delta-overlap-delta;riil=0;effective_length=1965;iLocus_type=siLocus |
9 | NC_015867.2 AEGeAn::LocusPocus locus 372891 380536 . . . effective_length=6758;iLocus_type=siLocus;child_gene=1;child_mRNA=1 | |
9 | NC_015867.2 AEGeAn::LocusPocus locus 372891 380536 . . . iLocus_type=ciLocus;effective_length=6758;child_gene=1;child_mRNA=1 | |
10 | 10 | NC_015867.2 AEGeAn::LocusPocus locus 374998 378903 . . . iiLocus_exception=intron-gene;iLocus_type=siLocus;child_gene=1;child_mRNA=1 |
11 | 11 | NC_015867.2 AEGeAn::LocusPocus locus 379649 381835 . . . left_overlap=888;liil=0;child_gene=1;child_mRNA=1;effective_length=2187;iLocus_type=siLocus |
42 | 42 | </tbody> |
43 | 43 | </table> |
44 | 44 | |
45 | <h2>Gene loci <span class="tooltip">[?]<span class="tooltip_text">If a gene annotation overlaps with another gene annotation, those annotations are associated with the same gene locus. See <a target="_blank" href="http://aegean.readthedocs.org/en/refactor/loci.html">this page</a> for a formal definition of a locus annotation.</span></span></h2> | |
45 | <h2>Gene loci <span class="tooltip">[?]<span class="tooltip_text">If a gene annotation overlaps with another gene annotation, those annotations are associated with the same gene locus. See <a target="_blank" href="http://aegean.readthedocs.io/en/latest/loci.html">this page</a> for a formal definition of a locus annotation.</span></span></h2> | |
46 | 46 | <table class="table_normal"> |
47 | 47 | <tr><td>shared</td><td>7</td></tr> |
48 | 48 | <tr><td>unique to reference</td><td>1</td></tr> |
10 | 10 | import sys |
11 | 11 | |
12 | 12 | |
13 | class Locus(object): | |
14 | def __init__(self, line): | |
15 | self._rawdata = line | |
16 | self.fields = line.strip().split('\t') | |
17 | assert len(self.fields) == 9 | |
18 | ||
19 | @property | |
20 | def seqid(self): | |
21 | return self.fields[0] | |
22 | ||
23 | @property | |
24 | def start(self): | |
25 | return int(self.fields[3]) | |
26 | ||
27 | @property | |
28 | def end(self): | |
29 | return int(self.fields[4]) | |
30 | ||
31 | @property | |
32 | def ilocus_class(self): | |
33 | typematch = re.search('iLocus_type=([^;\n]+)', self.fields[8]) | |
34 | assert typematch, 'could not determine iLocus type: ' + self._rawdata | |
35 | return typematch.group(1) | |
36 | ||
37 | @property | |
38 | def mergeable(self): | |
39 | if self.ilocus_class not in ['siLocus', 'niLocus']: | |
40 | return False | |
41 | if 'iiLocus_exception=intron-gene' in self.fields[8]: | |
42 | return False | |
43 | return True | |
44 | ||
45 | def __len__(self): | |
46 | return self.end - self.start + 1 | |
47 | ||
48 | def __str__(self): | |
49 | return '\t'.join(self.fields) | |
50 | ||
51 | def strip(self): | |
52 | self.fields[8] = re.sub('ID=[^;\n]+;*', '', self.fields[8]) | |
53 | self.fields[8] = re.sub('Name=[^;\n]+;*', '', self.fields[8]) | |
54 | ||
55 | ||
13 | 56 | def merge_iloci(loci): |
14 | 57 | """Merge ajacent or overlapping gene-containing iLoci.""" |
15 | 58 | assert len(loci) > 0 |
16 | 59 | if len(loci) == 1: |
17 | line = re.sub('ID=[^;\n]+;*', '', loci[0]) | |
18 | line = re.sub('Name=[^;\n]+;*', '', line) | |
19 | return line | |
60 | loci[0].strip() | |
61 | return loci[0] | |
20 | 62 | |
21 | 63 | seqid = None |
22 | 64 | start, end = -1, -1 |
23 | 65 | attrs = {} |
24 | 66 | for locus in loci: |
25 | fields = locus.split('\t') | |
26 | assert len(fields) == 9 | |
27 | 67 | if seqid: |
28 | assert fields[0] == seqid | |
29 | seqid = fields[0] | |
30 | lstart = int(fields[3]) | |
31 | lend = int(fields[4]) | |
32 | if start == -1 or lstart < start: | |
33 | start = lstart | |
34 | end = max(end, lend) | |
35 | numeric_attrs = re.findall('([^;=]+=\d+)', fields[8]) | |
68 | assert locus.seqid == seqid | |
69 | seqid = locus.seqid | |
70 | if start == -1 or locus.start < start: | |
71 | start = locus.start | |
72 | end = max(end, locus.end) | |
73 | numeric_attrs = re.findall('([^;=]+=\d+)', locus.fields[8]) | |
36 | 74 | for key_value_pair in numeric_attrs: |
37 | 75 | assert '=' in key_value_pair, \ |
38 | 76 | 'malformed key/value pair %s' % key_value_pair |
48 | 86 | for key in sorted(attrs): |
49 | 87 | attrstring += ';%s=%d' % (key, attrs[key]) |
50 | 88 | gff3 = [seqid, 'AEGeAn::miloci.py', 'locus', str(start), str(end), |
51 | '%d' % len(loci), '.', '.', attrstring] | |
52 | return '\t'.join(gff3) | |
89 | str(len(loci)), '.', '.', attrstring] | |
90 | line = '\t'.join(gff3) | |
91 | return Locus(line) | |
53 | 92 | |
54 | 93 | |
55 | 94 | def parse_iloci(fp): |
58 | 97 | Output: merged iLoci; gene-containing iLoci that are adjacent or |
59 | 98 | overlapping are combined |
60 | 99 | """ |
61 | seqid = None | |
62 | prev_loci = [] | |
100 | locus_buffer = [] | |
63 | 101 | for line in fp: |
64 | line = line.rstrip() | |
65 | 102 | if '\tlocus\t' not in line: |
66 | 103 | continue |
104 | locus = Locus(line) | |
67 | 105 | |
68 | locusseqid = re.match('([^\t]+)', line).group(1) | |
69 | if seqid is None: | |
70 | seqid = locusseqid | |
71 | elif locusseqid != seqid: | |
72 | if len(prev_loci) > 0: | |
73 | yield merge_iloci(prev_loci) | |
74 | prev_loci = [] | |
75 | seqid = locusseqid | |
106 | if len(locus_buffer) > 0 and locus.seqid != locus_buffer[0].seqid: | |
107 | yield merge_iloci(locus_buffer) | |
108 | locus_buffer = [] | |
76 | 109 | |
77 | if ';child_gene=' in line: | |
78 | prev_loci.append(line) | |
110 | if locus.mergeable: | |
111 | locus_buffer.append(locus) | |
79 | 112 | continue |
80 | 113 | else: |
81 | if len(prev_loci) > 0: | |
82 | yield merge_iloci(prev_loci) | |
83 | prev_loci = [] | |
84 | line = re.sub('ID=[^;\n]+;*', '', line) | |
85 | line = re.sub('Name=[^;\n]+;*', '', line) | |
86 | yield line | |
87 | if len(prev_loci) > 0: | |
88 | yield merge_iloci(prev_loci) | |
114 | if len(locus_buffer) > 0: | |
115 | yield merge_iloci(locus_buffer) | |
116 | locus_buffer = [] | |
117 | locus.strip() | |
118 | yield locus | |
119 | ||
120 | if len(locus_buffer) > 0: | |
121 | yield merge_iloci(locus_buffer) | |
89 | 122 | |
90 | 123 | |
91 | 124 | if __name__ == '__main__': |
402 | 402 | fprintf(outstream, |
403 | 403 | " <p class=\"footer\">\n" |
404 | 404 | " Generated by <a href=\"%s\">AEGeAn %s (%s %s)</a>.<br />\n" |
405 | " Copyright © %s <a href=\"http://aegean.readthedocs.org/en/" | |
405 | " Copyright © %s <a href=\"http://aegean.readthedocs.io/en/" | |
406 | 406 | "latest/contrib.html\">AEGeAn authors</a>.<br />\n" |
407 | 407 | " See <a href=\"LICENSE\">LICENSE</a> for details." |
408 | 408 | " </p>\n", AGN_VERSION_LINK, AGN_SEMANTIC_VERSION, |
711 | 711 | { |
712 | 712 | GtArray *compclassdata; |
713 | 713 | GtUword i; |
714 | ||
714 | ||
715 | 715 | compclassdata = gt_hashmap_get(rpt->compclassdata, "perfect"); |
716 | 716 | if(gt_array_size(compclassdata) > 0) |
717 | 717 | { |
733 | 733 | compare_report_html_seqfile_footer(outstream); |
734 | 734 | fclose(outstream); |
735 | 735 | } |
736 | ||
736 | ||
737 | 737 | compclassdata = gt_hashmap_get(rpt->compclassdata, "mislabeled"); |
738 | 738 | if(gt_array_size(compclassdata) > 0) |
739 | 739 | { |
756 | 756 | compare_report_html_seqfile_footer(outstream); |
757 | 757 | fclose(outstream); |
758 | 758 | } |
759 | ||
759 | ||
760 | 760 | compclassdata = gt_hashmap_get(rpt->compclassdata, "cds"); |
761 | 761 | if(gt_array_size(compclassdata) > 0) |
762 | 762 | { |
778 | 778 | compare_report_html_seqfile_footer(outstream); |
779 | 779 | fclose(outstream); |
780 | 780 | } |
781 | ||
781 | ||
782 | 782 | compclassdata = gt_hashmap_get(rpt->compclassdata, "exon"); |
783 | 783 | if(gt_array_size(compclassdata) > 0) |
784 | 784 | { |
800 | 800 | compare_report_html_seqfile_footer(outstream); |
801 | 801 | fclose(outstream); |
802 | 802 | } |
803 | ||
803 | ||
804 | 804 | compclassdata = gt_hashmap_get(rpt->compclassdata, "utr"); |
805 | 805 | if(gt_array_size(compclassdata) > 0) |
806 | 806 | { |
822 | 822 | compare_report_html_seqfile_footer(outstream); |
823 | 823 | fclose(outstream); |
824 | 824 | } |
825 | ||
825 | ||
826 | 826 | compclassdata = gt_hashmap_get(rpt->compclassdata, "nonmatch"); |
827 | 827 | if(gt_array_size(compclassdata) > 0) |
828 | 828 | { |
1126 | 1126 | " <h2>Gene loci <span class=\"tooltip\">[?]<span class=\"tooltip_text\">If a gene " |
1127 | 1127 | "annotation overlaps with another gene annotation, those annotations are associated " |
1128 | 1128 | "with the same gene locus. See <a target=\"_blank\" " |
1129 | "href=\"http://aegean.readthedocs.org/en/refactor/loci.html\">" | |
1129 | "href=\"http://aegean.readthedocs.io/en/latest/loci.html\">" | |
1130 | 1130 | "this page</a> for a formal definition of a locus annotation.</span></span></h2>\n" |
1131 | 1131 | " <table class=\"table_normal\">\n" |
1132 | 1132 | " <tr><td>shared</td><td>%lu</td></tr>\n" |
310 | 310 | GtStr *seqid = gt_genome_node_get_seqid(*gn1); |
311 | 311 | AgnLocus *locus = agn_locus_new(seqid); |
312 | 312 | agn_locus_add_feature(locus, fn1); |
313 | gt_feature_node_add_attribute((GtFeatureNode *)locus, "iLocus_type", | |
314 | "ciLocus"); | |
313 | 315 | gt_genome_node_ref(*gn1); |
314 | 316 | gt_array_add(iloci, locus); |
315 | 317 | |
407 | 409 | for(i = 0; i < numloci; i++) |
408 | 410 | { |
409 | 411 | GtGenomeNode **gn = gt_array_get(iloci, i); |
412 | GtFeatureNode *fn = gt_feature_node_cast(*gn); | |
410 | 413 | if(i == 0) |
411 | coding_status = agn_locus_num_mrnas(*gn) > 0; | |
414 | { | |
415 | coding_status = agn_typecheck_count(fn, agn_typecheck_cds) > 0; | |
416 | } | |
412 | 417 | else |
413 | 418 | { |
414 | bool test_status = agn_locus_num_mrnas(*gn) > 0; | |
419 | bool test_status = agn_typecheck_count(fn, agn_typecheck_cds) > 0; | |
415 | 420 | same_coding_status = coding_status == test_status; |
416 | 421 | if(!same_coding_status) |
417 | 422 | break; |
466 | 471 | fprintf(stream->ilenfile, "%s\t0\n", gt_str_get(seqid)); |
467 | 472 | } |
468 | 473 | } |
469 | gt_feature_node_add_attribute(fn, "iLocus_type", typestr); | |
474 | if(gt_feature_node_get_attribute(fn, "iLocus_type") == NULL) | |
475 | { | |
476 | gt_feature_node_add_attribute(fn, "iLocus_type", typestr); | |
477 | } | |
470 | 478 | } |
471 | 479 | if(numloci == 1) |
472 | 480 | { |
488 | 496 | GtFeatureNode *fn1 = gt_feature_node_cast(*gn1); |
489 | 497 | GtFeatureNode *fn2 = gt_feature_node_cast(*gn2); |
490 | 498 | |
491 | bool cds1 = agn_locus_num_mrnas(*gn1) > 0; | |
499 | bool cds1 = agn_typecheck_count(fn1, agn_typecheck_cds) > 0;; | |
492 | 500 | if(cds1 == true) |
493 | 501 | { |
494 | 502 | gt_feature_node_add_attribute(fn1, "iLocus_type", "siLocus"); |
623 | 631 | { |
624 | 632 | type = "ciLocus"; |
625 | 633 | } |
626 | gt_feature_node_add_attribute(fn, "iLocus_type", type); | |
634 | if(gt_feature_node_get_attribute(fn, "iLocus_type") == NULL) | |
635 | { | |
636 | gt_feature_node_add_attribute(fn, "iLocus_type", type); | |
637 | } | |
627 | 638 | } |
628 | 639 | } |
629 | 640 |
268 | 268 | if(gt_str_cmp(seqid1, seqid2) != 0) |
269 | 269 | return false; |
270 | 270 | |
271 | GtRange r1 = gt_genome_node_get_range(f1); | |
272 | GtRange r2 = gt_genome_node_get_range(f2); | |
273 | ||
271 | 274 | if(by_cds) |
272 | 275 | { |
273 | 276 | GtRange c1 = agn_feature_node_get_cds_range((GtFeatureNode *)f1); |
285 | 288 | { |
286 | 289 | // Both have coding sequences, use those instead of the complete feature |
287 | 290 | // coordinates. |
291 | ||
292 | if(gt_range_compare(&r1, &r2) == 0) | |
293 | { | |
294 | // Polycistrons belong together | |
295 | return true; | |
296 | } | |
297 | ||
288 | 298 | return gt_range_overlap_delta(&c1, &c2, minoverlap); |
289 | 299 | } |
290 | 300 | } |
291 | 301 | |
292 | 302 | // Either we are not in CDS mode, or the features don't have a CDS. |
293 | GtRange r1 = gt_genome_node_get_range(f1); | |
294 | GtRange r2 = gt_genome_node_get_range(f2); | |
295 | 303 | return gt_range_overlap_delta(&r1, &r2, minoverlap); |
296 | 304 | } |
297 | 305 |