117 | 117 |
for rec in readSeqFile(seq_file):
|
118 | 118 |
if len(rec.description) <= imgt_id_len:
|
119 | 119 |
id_key = rec.description
|
120 | |
else:
|
121 | |
id_key = re.sub('\||\s|!|&|\*|<|>|\?', '_', rec.description[:imgt_id_len])
|
|
120 |
else: # truncate and replace characters
|
|
121 |
if imgt_id_len == 49: # 28 September 2021 (version 1.8.4)
|
|
122 |
id_key = re.sub('\s|\t', '_', rec.description[:imgt_id_len])
|
|
123 |
else: # older versions
|
|
124 |
id_key = re.sub('\||\s|!|&|\*|<|>|\?', '_', rec.description[:imgt_id_len])
|
122 | 125 |
ids.update({id_key: rec.description})
|
123 | 126 |
|
124 | 127 |
return ids
|
|
144 | 147 |
writer=AIRRWriter, out_file=None, out_args=default_out_args):
|
145 | 148 |
"""
|
146 | 149 |
Writes parsed records to an output file
|
147 | |
|
148 | |
Arguments:
|
|
150 |
|
|
151 |
Arguments:
|
149 | 152 |
records : a iterator of Receptor objects containing alignment data.
|
150 | 153 |
fields : a list of ordered field names to write.
|
151 | 154 |
aligner_file : input file name.
|
|
354 | 357 |
|
355 | 358 |
|
356 | 359 |
def parseIMGT(aligner_file, seq_file=None, repo=None, cellranger_file=None, partial=False, asis_id=True,
|
357 | |
extended=False, format=default_format, out_file=None, out_args=default_out_args, imgt_id_len=default_imgt_id_len):
|
|
360 |
extended=False, format=default_format, out_file=None, out_args=default_out_args,
|
|
361 |
imgt_id_len=default_imgt_id_len):
|
358 | 362 |
"""
|
359 | 363 |
Main for IMGT aligned sample sequences.
|
360 | 364 |
|
|
395 | 399 |
|
396 | 400 |
# Get (parsed) IDs from fasta file submitted to IMGT
|
397 | 401 |
id_dict = getIDforIMGT(seq_file, imgt_id_len) if seq_file else {}
|
398 | |
|
|
402 |
|
399 | 403 |
# Load supplementary annotation table
|
400 | 404 |
if cellranger_file is not None:
|
401 | 405 |
f = cellranger_extended if extended else cellranger_base
|
|
437 | 441 |
printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')
|
438 | 442 |
germ_iter = (addGermline(x, references) for x in parse_iter)
|
439 | 443 |
# Write db
|
440 | |
output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
|
|
444 |
output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
|
441 | 445 |
annotations=annotations, id_dict=id_dict, asis_id=asis_id, partial=partial,
|
442 | 446 |
writer=writer, out_file=out_file, out_args=out_args)
|
443 | 447 |
|
|
534 | 538 |
with open(aligner_file, 'r') as f:
|
535 | 539 |
parse_iter = parser(f, seq_dict, references, regions=regions, asis_calls=asis_calls, infer_junction=infer_junction)
|
536 | 540 |
germ_iter = (addGermline(x, references, amino_acid=amino_acid) for x in parse_iter)
|
537 | |
output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
|
|
541 |
output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
|
538 | 542 |
annotations=annotations, amino_acid=amino_acid, partial=partial, asis_id=asis_id,
|
539 | 543 |
regions=regions, writer=writer, out_file=out_file, out_args=out_args)
|
540 | 544 |
|
|
613 | 617 |
with open(aligner_file, 'r') as f:
|
614 | 618 |
parse_iter = IHMMuneReader(f, seq_dict, references)
|
615 | 619 |
germ_iter = (addGermline(x, references) for x in parse_iter)
|
616 | |
output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
|
|
620 |
output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
|
617 | 621 |
annotations=annotations, asis_id=asis_id, partial=partial,
|
618 | 622 |
writer=writer, out_file=out_file, out_args=out_args)
|
619 | 623 |
|
|
624 | 628 |
"""
|
625 | 629 |
Defines the ArgumentParser.
|
626 | 630 |
|
627 | |
Returns:
|
|
631 |
Returns:
|
628 | 632 |
argparse.ArgumentParser
|
629 | 633 |
"""
|
630 | 634 |
fields = dedent(
|
|
636 | 640 |
db-fail
|
637 | 641 |
database with records that fail due to no productivity information,
|
638 | 642 |
no gene V assignment, no J assignment, or no junction region.
|
639 | |
|
|
643 |
|
640 | 644 |
universal output fields:
|
641 | |
sequence_id, sequence, sequence_alignment, germline_alignment,
|
642 | |
rev_comp, productive, stop_codon, vj_in_frame, locus,
|
643 | |
v_call, d_call, j_call, junction, junction_length, junction_aa,
|
|
645 |
sequence_id, sequence, sequence_alignment, germline_alignment,
|
|
646 |
rev_comp, productive, stop_codon, vj_in_frame, locus,
|
|
647 |
v_call, d_call, j_call, junction, junction_length, junction_aa,
|
644 | 648 |
v_sequence_start, v_sequence_end, v_germline_start, v_germline_end,
|
645 | 649 |
d_sequence_start, d_sequence_end, d_germline_start, d_germline_end,
|
646 | 650 |
j_sequence_start, j_sequence_end, j_germline_start, j_germline_end,
|
647 | 651 |
np1_length, np2_length, fwr1, fwr2, fwr3, fwr4, cdr1, cdr2, cdr3
|
648 | 652 |
|
649 | 653 |
imgt specific output fields:
|
650 | |
n1_length, n2_length, p3v_length, p5d_length, p3d_length, p5j_length,
|
651 | |
d_frame, v_score, v_identity, d_score, d_identity, j_score, j_identity
|
652 | |
|
|
654 |
n1_length, n2_length, p3v_length, p5d_length, p3d_length, p5j_length,
|
|
655 |
d_frame, v_score, v_identity, d_score, d_identity, j_score, j_identity
|
|
656 |
|
653 | 657 |
igblast specific output fields:
|
654 | |
v_score, v_identity, v_support, v_cigar,
|
655 | |
d_score, d_identity, d_support, d_cigar,
|
|
658 |
v_score, v_identity, v_support, v_cigar,
|
|
659 |
d_score, d_identity, d_support, d_cigar,
|
656 | 660 |
j_score, j_identity, j_support, j_cigar
|
657 | 661 |
|
658 | 662 |
ihmm specific output fields:
|
659 | 663 |
vdj_score
|
660 | |
|
|
664 |
|
661 | 665 |
10X specific output fields:
|
662 | |
cell_id, c_call, consensus_count, umi_count,
|
|
666 |
cell_id, c_call, consensus_count, umi_count,
|
663 | 667 |
v_call_10x, d_call_10x, j_call_10x,
|
664 | 668 |
junction_10x, junction_10x_aa
|
665 | 669 |
''')
|
666 | |
|
|
670 |
|
667 | 671 |
# Define ArgumentParser
|
668 | 672 |
parser = ArgumentParser(description=__doc__, epilog=fields,
|
669 | 673 |
formatter_class=CommonHelpFormatter, add_help=False)
|
|
685 | 689 |
help='Process igblastn output.',
|
686 | 690 |
description='Process igblastn output.')
|
687 | 691 |
group_igblast = parser_igblast.add_argument_group('aligner parsing arguments')
|
688 | |
group_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_files',
|
689 | |
required=True,
|
|
692 |
group_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_files', required=True,
|
690 | 693 |
help='''IgBLAST output files in format 7 with query sequence
|
691 | 694 |
(igblastn argument \'-outfmt "7 std qseq sseq btop"\').''')
|
692 | 695 |
group_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
|
|
715 | 718 |
group_igblast.add_argument('--partial', action='store_true', dest='partial',
|
716 | 719 |
help='''If specified, include incomplete V(D)J alignments in
|
717 | 720 |
the pass file instead of the fail file. An incomplete alignment
|
718 | |
is defined as a record for which a valid IMGT-gapped sequence
|
719 | |
cannot be built or that is missing a V gene assignment,
|
|
721 |
is defined as a record for which a valid IMGT-gapped sequence
|
|
722 |
cannot be built or that is missing a V gene assignment,
|
720 | 723 |
J gene assignment, junction region, or productivity call.''')
|
721 | 724 |
group_igblast.add_argument('--extended', action='store_true', dest='extended',
|
722 | |
help='''Specify to include additional aligner specific fields in the output.
|
|
725 |
help='''Specify to include additional aligner specific fields in the output.
|
723 | 726 |
Adds <vdj>_score, <vdj>_identity, <vdj>_support, <vdj>_cigar,
|
724 | 727 |
fwr1, fwr2, fwr3, fwr4, cdr1, cdr2 and cdr3.''')
|
725 | 728 |
group_igblast.add_argument('--regions', action='store', dest='regions',
|
726 | 729 |
choices=('default', 'rhesus-igl'), default='default',
|
727 | 730 |
help='''IMGT CDR and FWR boundary definition to use.''')
|
728 | 731 |
group_igblast.add_argument('--infer-junction', action='store_true', dest='infer_junction',
|
729 | |
help='''Infer the junction sequence. For use with IgBLAST v1.6.0 or older,
|
|
732 |
help='''Infer the junction sequence. For use with IgBLAST v1.6.0 or older,
|
730 | 733 |
prior to the addition of IMGT-CDR3 inference.''')
|
731 | 734 |
parser_igblast.set_defaults(func=parseIgBLAST, amino_acid=False)
|
732 | 735 |
|
|
736 | 739 |
help='Process igblastp output.',
|
737 | 740 |
description='Process igblastp output.')
|
738 | 741 |
group_igblast_aa = parser_igblast_aa.add_argument_group('aligner parsing arguments')
|
739 | |
group_igblast_aa.add_argument('-i', nargs='+', action='store', dest='aligner_files',
|
740 | |
required=True,
|
|
742 |
group_igblast_aa.add_argument('-i', nargs='+', action='store', dest='aligner_files', required=True,
|
741 | 743 |
help='''IgBLAST output files in format 7 with query sequence
|
742 | 744 |
(igblastp argument \'-outfmt "7 std qseq sseq btop"\').''')
|
743 | 745 |
group_igblast_aa.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
|
|
762 | 764 |
the sequence identifiers in the reference sequence set and the IgBLAST
|
763 | 765 |
database to be exact string matches.''')
|
764 | 766 |
group_igblast_aa.add_argument('--extended', action='store_true', dest='extended',
|
765 | |
help='''Specify to include additional aligner specific fields in the output.
|
|
767 |
help='''Specify to include additional aligner specific fields in the output.
|
766 | 768 |
Adds v_score, v_identity, v_support, v_cigar, fwr1, fwr2, fwr3, cdr1 and cdr2.''')
|
767 | 769 |
group_igblast_aa.add_argument('--regions', action='store', dest='regions',
|
768 | 770 |
choices=('default', 'rhesus-igl'), default='default',
|
|
778 | 780 |
description='''Process IMGT/HighV-Quest output
|
779 | 781 |
(does not work with V-QUEST).''')
|
780 | 782 |
group_imgt = parser_imgt.add_argument_group('aligner parsing arguments')
|
781 | |
group_imgt.add_argument('-i', nargs='+', action='store', dest='aligner_files',
|
|
783 |
group_imgt.add_argument('-i', nargs='+', action='store', dest='aligner_files', required=True,
|
782 | 784 |
help='''Either zipped IMGT output files (.zip or .txz) or a
|
783 | 785 |
folder containing unzipped IMGT output files (which must
|
784 | 786 |
include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences,
|
785 | 787 |
and 6_Junction).''')
|
786 | 788 |
group_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files', required=False,
|
787 | 789 |
help='''List of FASTA files (with .fasta, .fna or .fa
|
788 | |
extension) that were submitted to IMGT/HighV-QUEST.
|
|
790 |
extension) that were submitted to IMGT/HighV-QUEST.
|
789 | 791 |
If unspecified, sequence identifiers truncated by IMGT/HighV-QUEST
|
790 | 792 |
will not be corrected.''')
|
791 | 793 |
group_imgt.add_argument('-r', nargs='+', action='store', dest='repo', required=False,
|
792 | 794 |
help='''List of folders and/or fasta files containing
|
793 | |
the germline sequence set used by IMGT/HighV-QUEST.
|
|
795 |
the germline sequence set used by IMGT/HighV-QUEST.
|
794 | 796 |
These reference sequences must contain IMGT-numbering spacers (gaps)
|
795 | |
in the V segment. If unspecified, the germline sequence reconstruction
|
|
797 |
in the V segment. If unspecified, the germline sequence reconstruction
|
796 | 798 |
will not be included in the output.''')
|
797 | 799 |
group_imgt.add_argument('--10x', action='store', nargs='+', dest='cellranger_file',
|
798 | 800 |
help='''Table file containing 10X annotations (with .csv or .tsv
|
|
806 | 808 |
group_imgt.add_argument('--partial', action='store_true', dest='partial',
|
807 | 809 |
help='''If specified, include incomplete V(D)J alignments in
|
808 | 810 |
the pass file instead of the fail file. An incomplete alignment
|
809 | |
is defined as a record that is missing a V gene assignment,
|
|
811 |
is defined as a record that is missing a V gene assignment,
|
810 | 812 |
J gene assignment, junction region, or productivity call.''')
|
811 | 813 |
group_imgt.add_argument('--extended', action='store_true', dest='extended',
|
812 | |
help='''Specify to include additional aligner specific fields in the output.
|
|
814 |
help='''Specify to include additional aligner specific fields in the output.
|
813 | 815 |
Adds <vdj>_score, <vdj>_identity>, fwr1, fwr2, fwr3, fwr4,
|
814 | |
cdr1, cdr2, cdr3, n1_length, n2_length, p3v_length, p5d_length,
|
|
816 |
cdr1, cdr2, cdr3, n1_length, n2_length, p3v_length, p5d_length,
|
815 | 817 |
p3d_length, p5j_length and d_frame.''')
|
816 | 818 |
group_imgt.add_argument('--imgt-id-len', action='store', dest='imgt_id_len', type=int,
|
817 | 819 |
default=default_imgt_id_len,
|
818 | |
help='''The maximum character length of sequence identifiers reported by IMGT/HighV-QUEST.
|
819 | |
Specify 50 if the IMGT files (-i) were generated with an IMGT/HighV-QUEST version older
|
|
820 |
help='''The maximum character length of sequence identifiers reported by IMGT/HighV-QUEST.
|
|
821 |
Specify 50 if the IMGT files (-i) were generated with an IMGT/HighV-QUEST version older
|
820 | 822 |
than 1.8.3 (May 7, 2021).''')
|
821 | 823 |
parser_imgt.set_defaults(func=parseIMGT)
|
822 | 824 |
|
|
850 | 852 |
group_ihmm.add_argument('--partial', action='store_true', dest='partial',
|
851 | 853 |
help='''If specified, include incomplete V(D)J alignments in
|
852 | 854 |
the pass file instead of the fail file. An incomplete alignment
|
853 | |
is defined as a record for which a valid IMGT-gapped sequence
|
854 | |
cannot be built or that is missing a V gene assignment,
|
|
855 |
is defined as a record for which a valid IMGT-gapped sequence
|
|
856 |
cannot be built or that is missing a V gene assignment,
|
855 | 857 |
J gene assignment, junction region, or productivity call.''')
|
856 | 858 |
group_ihmm.add_argument('--extended', action='store_true', dest='extended',
|
857 | |
help='''Specify to include additional aligner specific fields in the output.
|
|
859 |
help='''Specify to include additional aligner specific fields in the output.
|
858 | 860 |
Adds the path score of the iHMMune-Align hidden Markov model as vdj_score;
|
859 | 861 |
adds fwr1, fwr2, fwr3, fwr4, cdr1, cdr2 and cdr3.''')
|
860 | 862 |
parser_ihmm.set_defaults(func=parseIHMM)
|
861 | 863 |
|
862 | 864 |
return parser
|
863 | |
|
864 | |
|
|
865 |
|
|
866 |
|
865 | 867 |
if __name__ == "__main__":
|
866 | 868 |
"""
|
867 | 869 |
Parses command line arguments and calls main
|
|
880 | 882 |
if 'seq_files' in args_dict: del args_dict['seq_files']
|
881 | 883 |
if 'out_files' in args_dict: del args_dict['out_files']
|
882 | 884 |
if 'command' in args_dict: del args_dict['command']
|
883 | |
if 'func' in args_dict: del args_dict['func']
|
|
885 |
if 'func' in args_dict: del args_dict['func']
|
884 | 886 |
|
885 | 887 |
# Call main
|
886 | 888 |
for i, f in enumerate(args.__dict__['aligner_files']):
|