Codebase list changeo / ac0ca4e
Update upstream source from tag 'upstream/1.2.0' Update to upstream version '1.2.0' with Debian dir a8341ffd330fa5525d7858a3f13b926f2f75bd2f Nilesh Patra 2 years ago
12 changed file(s) with 152 addition(s) and 84 deletion(s). Raw diff Collapse all Expand all
2323 + `SciPy 0.14 <http://scipy.org>`__
2424 + `pandas 0.24 <http://pandas.pydata.org>`__
2525 + `Biopython 1.77 <http://biopython.org>`__
26 + `presto 0.6.2 <http://presto.readthedocs.io>`__
26 + `presto 0.7.0 <http://presto.readthedocs.io>`__
2727 + `airr 1.3.1 <https://docs.airr-community.org>`__
2828
2929 Some tools wrap external applications that are not required for installation.
00 Release Notes
11 ===============================================================================
22
3 Version 1.2.0: October 29, 2021
4 -------------------------------------------------------------------------------
5
6 + Updated dependencies to presto >= v0.7.0.
7
8 AssignGenes:
9
10 + Fixed reporting of IgBLAST output counts when specifying ``--format airr``.
11
12 BuildTrees:
13
14 + Added support for specifying fixed omega and hotness parameters at the
15 commandline.
16
17 CreateGermlines:
18
19 + Will now use the first allele in the reference database when duplicate
20 allele names are provided. This appears to affect only mouse BCR light chains
21 and TCR alleles in the IMGT database, when the same allele name differs by
22 strain.
23
24 MakeDb:
25
26 + Added support for changes in how IMGT/HighV-QUEST v1.8.4 handles special
27 characters in sequence identifiers.
28 + Fixed the ``imgt`` subcommand incorrectly allowing execution without
29 specifying the IMGT/HighV-QUEST output file at the commandline.
30
31 ParseDb:
32
33 + Added reporting of output file sizes to the console log of the ``split``
34 subcommand.
35
36
337 Version 1.1.0: June 21, 2021
438 -------------------------------------------------------------------------------
539
741 + Updated dependencies to biopython >= v1.77, airr >= v1.3.1, PyYAML>=5.1.
842
943 MakeDb:
10
1144 + Added the ``--imgt-id-len`` argument to accommodate changes introduced in how
12 IMGT/HighV-QUEST truncates sequence identifiers as of version 1.8.3 (May 7, 2021).
45 IMGT/HighV-QUEST truncates sequence identifiers as of v1.8.3 (May 7, 2021).
1346 The header lines in the fasta files are now truncated to 49 characters. In
14 IMGT/HighV-QUEST versions older that 1.8.3, they were truncated to 50 characters.
47 IMGT/HighV-QUEST versions older than v1.8.3, they were truncated to 50 characters.
1548 ``--imgt-id-len`` default value is 49. Users should specify ``--imgt-id-len 50``
16 to analyze IMGT results generated with IMGT/HighV-QUEST versions older that 1.8.3.
49 to analyze IMGT results generated with IMGT/HighV-QUEST versions older than v1.8.3.
1750 + Added the ``--infer-junction`` argument to ``MakeDb igblast``, to enable the inference
1851 of the junction sequence when not reported by IgBLAST. Should be used with data from
1952 IgBLAST v1.6.0 or older (released before IgBLAST added IMGT-CDR3 inference).
00 Metadata-Version: 1.1
11 Name: changeo
2 Version: 1.1.0
2 Version: 1.2.0
33 Summary: A bioinformatics toolkit for processing high-throughput lymphocyte receptor sequencing data.
44 Home-page: http://changeo.readthedocs.io
55 Author: Namita Gupta, Jason Anthony Vander Heiden
1313 from pkg_resources import parse_version
1414 from textwrap import dedent
1515 from time import time
16 import re
1617
1718 # Presto imports
1819 from presto.IO import printLog, printMessage, printError, printWarning
9899 vdb=vdb, output=out_file,
99100 threads=nproc, exec=igblast_exec)
100101 printMessage('Done', start_time=start_time, end=True, width=25)
101
102
103 # Get number of processed sequences
104 if (format == 'blast'):
105 with open(out_file, 'rb') as f:
106 f.seek(-2, os.SEEK_END)
107 while f.read(1) != b'\n':
108 f.seek(-2, os.SEEK_CUR)
109 pass_info = f.readline().decode()
110 num_seqs_match = re.search('(# BLAST processed )(\d+)( .*)', pass_info)
111 num_sequences = num_seqs_match.group(2)
112 else:
113 f = open(out_file, 'rb')
114 lines = 0
115 buf_size = 1024 * 1024
116 read_f = f.raw.read
117 buf = read_f(buf_size)
118 while buf:
119 lines += buf.count(b'\n')
120 buf = read_f(buf_size)
121 num_sequences = lines - 1
122
102123 # Print log
103124 log = OrderedDict()
125 log['PASS'] = num_sequences
104126 log['OUTPUT'] = os.path.basename(out_file)
105127 log['END'] = 'AssignGenes'
106128 printLog(log)
998998 if oformat == "tab":
999999 os.rmdir(clone_dir)
10001000 else:
1001 printWarning("Using --clean all with --oformat txt will delete all tree file results.\n"
1001 printWarning("Using --clean all with --oformat txt will not delete all tree file results.\n"
10021002 "You'll have to do that yourself.")
10031003 log = OrderedDict()
10041004 log["END"] = "IgPhyML analysis"
13221322 help="""Optimize combination of topology (t) branch lengths (l) and parameters (r), or
13231323 nothing (n), for IgPhyML.""")
13241324 igphyml_group.add_argument("--omega", action="store", dest="omega", type=str, default="e,e",
1325 choices = ("e", "ce", "e,e", "ce,e", "e,ce", "ce,ce"),
13261325 help="""Omega parameters to estimate for FWR,CDR respectively:
1327 e = estimate, ce = estimate + confidence interval""")
1326 e = estimate, ce = estimate + confidence interval, or numeric value""")
13281327 igphyml_group.add_argument("-t", action="store", dest="kappa", type=str, default="e",
1329 choices=("e", "ce"),
13301328 help="""Kappa parameters to estimate:
1331 e = estimate, ce = estimate + confidence interval""")
1329 e = estimate, ce = estimate + confidence interval, or numeric value""")
13321330 igphyml_group.add_argument("--motifs", action="store", dest="motifs", type=str,
13331331 default="WRC_2:0,GYW_0:1,WA_1:2,TW_0:3,SYC_2:4,GRS_0:5",
13341332 help="""Which motifs to estimate mutability.""")
13351333 igphyml_group.add_argument("--hotness", action="store", dest="hotness", type=str, default="e,e,e,e,e,e",
13361334 help="""Mutability parameters to estimate:
1337 e = estimate, ce = estimate + confidence interval""")
1335 e = estimate, ce = estimate + confidence interval, or numeric value""")
13381336 igphyml_group.add_argument("--oformat", action="store", dest="oformat", type=str, default="tab",
13391337 choices=("tab", "txt"),
13401338 help="""IgPhyML output format.""")
117117 for rec in readSeqFile(seq_file):
118118 if len(rec.description) <= imgt_id_len:
119119 id_key = rec.description
120 else:
121 id_key = re.sub('\||\s|!|&|\*|<|>|\?', '_', rec.description[:imgt_id_len])
120 else: # truncate and replace characters
121 if imgt_id_len == 49: # 28 September 2021 (version 1.8.4)
122 id_key = re.sub('\s|\t', '_', rec.description[:imgt_id_len])
123 else: # older versions
124 id_key = re.sub('\||\s|!|&|\*|<|>|\?', '_', rec.description[:imgt_id_len])
122125 ids.update({id_key: rec.description})
123126
124127 return ids
144147 writer=AIRRWriter, out_file=None, out_args=default_out_args):
145148 """
146149 Writes parsed records to an output file
147
148 Arguments:
150
151 Arguments:
149152 records : a iterator of Receptor objects containing alignment data.
150153 fields : a list of ordered field names to write.
151154 aligner_file : input file name.
354357
355358
356359 def parseIMGT(aligner_file, seq_file=None, repo=None, cellranger_file=None, partial=False, asis_id=True,
357 extended=False, format=default_format, out_file=None, out_args=default_out_args, imgt_id_len=default_imgt_id_len):
360 extended=False, format=default_format, out_file=None, out_args=default_out_args,
361 imgt_id_len=default_imgt_id_len):
358362 """
359363 Main for IMGT aligned sample sequences.
360364
395399
396400 # Get (parsed) IDs from fasta file submitted to IMGT
397401 id_dict = getIDforIMGT(seq_file, imgt_id_len) if seq_file else {}
398
402
399403 # Load supplementary annotation table
400404 if cellranger_file is not None:
401405 f = cellranger_extended if extended else cellranger_base
437441 printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')
438442 germ_iter = (addGermline(x, references) for x in parse_iter)
439443 # Write db
440 output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
444 output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
441445 annotations=annotations, id_dict=id_dict, asis_id=asis_id, partial=partial,
442446 writer=writer, out_file=out_file, out_args=out_args)
443447
534538 with open(aligner_file, 'r') as f:
535539 parse_iter = parser(f, seq_dict, references, regions=regions, asis_calls=asis_calls, infer_junction=infer_junction)
536540 germ_iter = (addGermline(x, references, amino_acid=amino_acid) for x in parse_iter)
537 output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
541 output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
538542 annotations=annotations, amino_acid=amino_acid, partial=partial, asis_id=asis_id,
539543 regions=regions, writer=writer, out_file=out_file, out_args=out_args)
540544
613617 with open(aligner_file, 'r') as f:
614618 parse_iter = IHMMuneReader(f, seq_dict, references)
615619 germ_iter = (addGermline(x, references) for x in parse_iter)
616 output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
620 output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
617621 annotations=annotations, asis_id=asis_id, partial=partial,
618622 writer=writer, out_file=out_file, out_args=out_args)
619623
624628 """
625629 Defines the ArgumentParser.
626630
627 Returns:
631 Returns:
628632 argparse.ArgumentParser
629633 """
630634 fields = dedent(
636640 db-fail
637641 database with records that fail due to no productivity information,
638642 no gene V assignment, no J assignment, or no junction region.
639
643
640644 universal output fields:
641 sequence_id, sequence, sequence_alignment, germline_alignment,
642 rev_comp, productive, stop_codon, vj_in_frame, locus,
643 v_call, d_call, j_call, junction, junction_length, junction_aa,
645 sequence_id, sequence, sequence_alignment, germline_alignment,
646 rev_comp, productive, stop_codon, vj_in_frame, locus,
647 v_call, d_call, j_call, junction, junction_length, junction_aa,
644648 v_sequence_start, v_sequence_end, v_germline_start, v_germline_end,
645649 d_sequence_start, d_sequence_end, d_germline_start, d_germline_end,
646650 j_sequence_start, j_sequence_end, j_germline_start, j_germline_end,
647651 np1_length, np2_length, fwr1, fwr2, fwr3, fwr4, cdr1, cdr2, cdr3
648652
649653 imgt specific output fields:
650 n1_length, n2_length, p3v_length, p5d_length, p3d_length, p5j_length,
651 d_frame, v_score, v_identity, d_score, d_identity, j_score, j_identity
652
654 n1_length, n2_length, p3v_length, p5d_length, p3d_length, p5j_length,
655 d_frame, v_score, v_identity, d_score, d_identity, j_score, j_identity
656
653657 igblast specific output fields:
654 v_score, v_identity, v_support, v_cigar,
655 d_score, d_identity, d_support, d_cigar,
658 v_score, v_identity, v_support, v_cigar,
659 d_score, d_identity, d_support, d_cigar,
656660 j_score, j_identity, j_support, j_cigar
657661
658662 ihmm specific output fields:
659663 vdj_score
660
664
661665 10X specific output fields:
662 cell_id, c_call, consensus_count, umi_count,
666 cell_id, c_call, consensus_count, umi_count,
663667 v_call_10x, d_call_10x, j_call_10x,
664668 junction_10x, junction_10x_aa
665669 ''')
666
670
667671 # Define ArgumentParser
668672 parser = ArgumentParser(description=__doc__, epilog=fields,
669673 formatter_class=CommonHelpFormatter, add_help=False)
685689 help='Process igblastn output.',
686690 description='Process igblastn output.')
687691 group_igblast = parser_igblast.add_argument_group('aligner parsing arguments')
688 group_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_files',
689 required=True,
692 group_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_files', required=True,
690693 help='''IgBLAST output files in format 7 with query sequence
691694 (igblastn argument \'-outfmt "7 std qseq sseq btop"\').''')
692695 group_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
715718 group_igblast.add_argument('--partial', action='store_true', dest='partial',
716719 help='''If specified, include incomplete V(D)J alignments in
717720 the pass file instead of the fail file. An incomplete alignment
718 is defined as a record for which a valid IMGT-gapped sequence
719 cannot be built or that is missing a V gene assignment,
721 is defined as a record for which a valid IMGT-gapped sequence
722 cannot be built or that is missing a V gene assignment,
720723 J gene assignment, junction region, or productivity call.''')
721724 group_igblast.add_argument('--extended', action='store_true', dest='extended',
722 help='''Specify to include additional aligner specific fields in the output.
725 help='''Specify to include additional aligner specific fields in the output.
723726 Adds <vdj>_score, <vdj>_identity, <vdj>_support, <vdj>_cigar,
724727 fwr1, fwr2, fwr3, fwr4, cdr1, cdr2 and cdr3.''')
725728 group_igblast.add_argument('--regions', action='store', dest='regions',
726729 choices=('default', 'rhesus-igl'), default='default',
727730 help='''IMGT CDR and FWR boundary definition to use.''')
728731 group_igblast.add_argument('--infer-junction', action='store_true', dest='infer_junction',
729 help='''Infer the junction sequence. For use with IgBLAST v1.6.0 or older,
732 help='''Infer the junction sequence. For use with IgBLAST v1.6.0 or older,
730733 prior to the addition of IMGT-CDR3 inference.''')
731734 parser_igblast.set_defaults(func=parseIgBLAST, amino_acid=False)
732735
736739 help='Process igblastp output.',
737740 description='Process igblastp output.')
738741 group_igblast_aa = parser_igblast_aa.add_argument_group('aligner parsing arguments')
739 group_igblast_aa.add_argument('-i', nargs='+', action='store', dest='aligner_files',
740 required=True,
742 group_igblast_aa.add_argument('-i', nargs='+', action='store', dest='aligner_files', required=True,
741743 help='''IgBLAST output files in format 7 with query sequence
742744 (igblastp argument \'-outfmt "7 std qseq sseq btop"\').''')
743745 group_igblast_aa.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
762764 the sequence identifiers in the reference sequence set and the IgBLAST
763765 database to be exact string matches.''')
764766 group_igblast_aa.add_argument('--extended', action='store_true', dest='extended',
765 help='''Specify to include additional aligner specific fields in the output.
767 help='''Specify to include additional aligner specific fields in the output.
766768 Adds v_score, v_identity, v_support, v_cigar, fwr1, fwr2, fwr3, cdr1 and cdr2.''')
767769 group_igblast_aa.add_argument('--regions', action='store', dest='regions',
768770 choices=('default', 'rhesus-igl'), default='default',
778780 description='''Process IMGT/HighV-Quest output
779781 (does not work with V-QUEST).''')
780782 group_imgt = parser_imgt.add_argument_group('aligner parsing arguments')
781 group_imgt.add_argument('-i', nargs='+', action='store', dest='aligner_files',
783 group_imgt.add_argument('-i', nargs='+', action='store', dest='aligner_files', required=True,
782784 help='''Either zipped IMGT output files (.zip or .txz) or a
783785 folder containing unzipped IMGT output files (which must
784786 include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences,
785787 and 6_Junction).''')
786788 group_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files', required=False,
787789 help='''List of FASTA files (with .fasta, .fna or .fa
788 extension) that were submitted to IMGT/HighV-QUEST.
790 extension) that were submitted to IMGT/HighV-QUEST.
789791 If unspecified, sequence identifiers truncated by IMGT/HighV-QUEST
790792 will not be corrected.''')
791793 group_imgt.add_argument('-r', nargs='+', action='store', dest='repo', required=False,
792794 help='''List of folders and/or fasta files containing
793 the germline sequence set used by IMGT/HighV-QUEST.
795 the germline sequence set used by IMGT/HighV-QUEST.
794796 These reference sequences must contain IMGT-numbering spacers (gaps)
795 in the V segment. If unspecified, the germline sequence reconstruction
797 in the V segment. If unspecified, the germline sequence reconstruction
796798 will not be included in the output.''')
797799 group_imgt.add_argument('--10x', action='store', nargs='+', dest='cellranger_file',
798800 help='''Table file containing 10X annotations (with .csv or .tsv
806808 group_imgt.add_argument('--partial', action='store_true', dest='partial',
807809 help='''If specified, include incomplete V(D)J alignments in
808810 the pass file instead of the fail file. An incomplete alignment
809 is defined as a record that is missing a V gene assignment,
811 is defined as a record that is missing a V gene assignment,
810812 J gene assignment, junction region, or productivity call.''')
811813 group_imgt.add_argument('--extended', action='store_true', dest='extended',
812 help='''Specify to include additional aligner specific fields in the output.
814 help='''Specify to include additional aligner specific fields in the output.
813815 Adds <vdj>_score, <vdj>_identity>, fwr1, fwr2, fwr3, fwr4,
814 cdr1, cdr2, cdr3, n1_length, n2_length, p3v_length, p5d_length,
816 cdr1, cdr2, cdr3, n1_length, n2_length, p3v_length, p5d_length,
815817 p3d_length, p5j_length and d_frame.''')
816818 group_imgt.add_argument('--imgt-id-len', action='store', dest='imgt_id_len', type=int,
817819 default=default_imgt_id_len,
818 help='''The maximum character length of sequence identifiers reported by IMGT/HighV-QUEST.
819 Specify 50 if the IMGT files (-i) were generated with an IMGT/HighV-QUEST version older
820 help='''The maximum character length of sequence identifiers reported by IMGT/HighV-QUEST.
821 Specify 50 if the IMGT files (-i) were generated with an IMGT/HighV-QUEST version older
820822 than 1.8.3 (May 7, 2021).''')
821823 parser_imgt.set_defaults(func=parseIMGT)
822824
850852 group_ihmm.add_argument('--partial', action='store_true', dest='partial',
851853 help='''If specified, include incomplete V(D)J alignments in
852854 the pass file instead of the fail file. An incomplete alignment
853 is defined as a record for which a valid IMGT-gapped sequence
854 cannot be built or that is missing a V gene assignment,
855 is defined as a record for which a valid IMGT-gapped sequence
856 cannot be built or that is missing a V gene assignment,
855857 J gene assignment, junction region, or productivity call.''')
856858 group_ihmm.add_argument('--extended', action='store_true', dest='extended',
857 help='''Specify to include additional aligner specific fields in the output.
859 help='''Specify to include additional aligner specific fields in the output.
858860 Adds the path score of the iHMMune-Align hidden Markov model as vdj_score;
859861 adds fwr1, fwr2, fwr3, fwr4, cdr1, cdr2 and cdr3.''')
860862 parser_ihmm.set_defaults(func=parseIHMM)
861863
862864 return parser
863
864
865
866
865867 if __name__ == "__main__":
866868 """
867869 Parses command line arguments and calls main
880882 if 'seq_files' in args_dict: del args_dict['seq_files']
881883 if 'out_files' in args_dict: del args_dict['out_files']
882884 if 'command' in args_dict: del args_dict['command']
883 if 'func' in args_dict: del args_dict['func']
885 if 'func' in args_dict: del args_dict['func']
884886
885887 # Call main
886888 for i, f in enumerate(args.__dict__['aligner_files']):
141141 log['OUTPUT%i' % (i + 1)] = os.path.basename(handles_dict[k].name)
142142 log['RECORDS'] = rec_count
143143 log['PARTS'] = len(handles_dict)
144
145 # Close output file handles and log file size
146 db_handle.close()
147 for i, t in enumerate(handles_dict):
148 handles_dict[t].close()
149 log['SIZE%i' % (i + 1)] = countDbFile(handles_dict[t].name)
150
144151 log['END'] = 'ParseDb'
145152 printLog(log)
146
147 # Close output file handles
148 db_handle.close()
149 for t in handles_dict: handles_dict[t].close()
150153
151154 return [handles_dict[t].name for t in handles_dict]
152155
363366 """
364367 Deletes records from a database file
365368
366 Arguments:
369 Arguments:
367370 db_file : the database file name.
368371 fields : a list of fields to check for deletion criteria.
369372 values : a list of values defining deletion targets.
371374 regex : if False do exact full string matches; if True allow partial regex matches.
372375 out_file : output file name. Automatically generated from the input file if None.
373376 out_args : common output argument dictionary from parseCommonArgs.
374
375 Returns:
377
378 Returns:
376379 str : output file name.
377380 """
378381 # Define string match function
427430 rec_count += 1
428431 # Check for deletion values in all fields
429432 delete = _logic_func([_match_func(rec.get(f, False), values) for f in fields])
430
433
431434 # Write sequences
432435 if not delete:
433436 pass_count += 1
434437 pass_writer.writeDict(rec)
435438 else:
436439 fail_count += 1
437
440
438441 # Print counts
439442 printProgress(rec_count, result_count, 0.05, start_time=start_time)
440443 log = OrderedDict()
448451 # Close file handles
449452 pass_handle.close()
450453 db_handle.close()
451
454
452455 return pass_handle.name
453456
454457
866869 """
867870 Defines the ArgumentParser
868871
869 Arguments:
872 Arguments:
870873 None
871
872 Returns:
874
875 Returns:
873876 an ArgumentParser object
874877 """
875878 # Define input and output field help message
887890 required fields:
888891 sequence_id
889892 ''')
890
893
891894 # Define ArgumentParser
892895 parser = ArgumentParser(description=__doc__, epilog=fields,
893896 formatter_class=CommonHelpFormatter, add_help=False)
10261029 description='Merges files.')
10271030 group_merge = parser_merge.add_argument_group('parsing arguments')
10281031 group_merge.add_argument('-o', action='store', dest='out_file', default=None,
1029 help='''Explicit output file name. Note, this argument cannot be used with
1032 help='''Explicit output file name. Note, this argument cannot be used with
10301033 the --failed, --outdir or --outname arguments.''')
10311034 group_merge.add_argument('--drop', action='store_true', dest='drop',
10321035 help='''If specified, drop fields that do not exist in all input files.
1033 Otherwise, include all columns in all files and fill missing data
1036 Otherwise, include all columns in all files and fill missing data
10341037 with empty strings.''')
10351038 parser_merge.set_defaults(func=mergeDbFiles)
10361039
10911094 args_dict['out_file'] = args.__dict__['out_files'][i] \
10921095 if args.__dict__['out_files'] else None
10931096 args.func(**args_dict)
1094
1097
1212 import zipfile
1313 from itertools import chain, groupby, zip_longest
1414 from tempfile import TemporaryDirectory
15 from textwrap import indent
1516 from Bio import SeqIO
1617 from Bio.Seq import Seq
1718
1819 # Presto and changeo imports
19 from presto.IO import getFileType, printError, printWarning
20 from presto.IO import getFileType, printError, printWarning, printDebug
2021 from changeo.Defaults import default_csv_size
2122 from changeo.Gene import getAllele, getLocus, getVAllele, getDAllele, getJAllele
2223 from changeo.Receptor import AIRRSchema, AIRRSchemaAA, ChangeoSchema, ChangeoSchemaAA, Receptor, ReceptorData
21662167 return db
21672168
21682169
2169 def readGermlines(references, asis=False):
2170 def readGermlines(references, asis=False, warn=False):
21702171 """
21712172 Parses germline repositories
21722173
21732174 Arguments:
21742175 references (list): list of strings specifying directories and/or files from which to read germline records.
21752176 asis (bool): if True use sequence ID as record name and do not parse headers for allele names.
2177 warn (bool): print warning messages to standard error if True.
21762178
21772179 Returns:
21782180 dict: Dictionary of germlines in the form {allele: sequence}.
21932195 printError('No valid germline fasta files (.fasta, .fna, .fa) were found at %s.' % ','.join(references))
21942196
21952197 repo_dict = {}
2198 duplicates = []
21962199 for file_name in repo_files:
21972200 with open(file_name, 'rU') as file_handle:
21982201 germlines = SeqIO.parse(file_handle, 'fasta')
21992202 for g in germlines:
22002203 germ_key = getAllele(g.description, 'first') if not asis else g.id
2201 repo_dict[germ_key] = str(g.seq).upper()
2204 if germ_key not in repo_dict:
2205 repo_dict[germ_key] = str(g.seq).upper()
2206 else:
2207 duplicates.append(g.description)
2208
2209 if warn and len(duplicates) > 0:
2210 w = indent('\n'.join(duplicates), ' '*9)
2211 printWarning('Duplicated germline allele names excluded from references:\n%s' % w)
22022212
22032213 return repo_dict
22042214
44 __author__ = 'Namita Gupta, Jason Anthony Vander Heiden'
55 __copyright__ = 'Copyright 2021 Kleinstein Lab, Yale University. All rights reserved.'
66 __license__ = 'GNU Affero General Public License 3 (AGPL-3)'
7 __version__ = '1.1.0'
8 __date__ = '2021.06.21'
7 __version__ = '1.2.0'
8 __date__ = '2021.10.29'
00 Metadata-Version: 1.1
11 Name: changeo
2 Version: 1.1.0
2 Version: 1.2.0
33 Summary: A bioinformatics toolkit for processing high-throughput lymphocyte receptor sequencing data.
44 Home-page: http://changeo.readthedocs.io
55 Author: Namita Gupta, Jason Anthony Vander Heiden
33 biopython>=1.77
44 PyYAML>=5.1
55 setuptools>=2.0
6 presto>=0.6.2
6 presto>=0.7.0
77 airr>=1.3.1
33 biopython>=1.77
44 PyYAML>=5.1
55 setuptools>=2.0
6 presto>=0.6.2
6 presto>=0.7.0
77 airr>=1.3.1