Commit ac0ca4e14ce3d964e0c388fa5f887dfc331e7f0f - changeo

Update upstream source from tag 'upstream/1.2.0'. Update to upstream version '1.2.0' with Debian dir a8341ffd330fa5525d7858a3f13b926f2f75bd2f. Nilesh Patra, 2 years ago

12 changed file(s) with 152 addition(s) and 84 deletion(s). Raw diff Collapse all Expand all

-1

INSTALL.rst less more

23	23	+ `SciPy 0.14 <http://scipy.org>`__
24	24	+ `pandas 0.24 <http://pandas.pydata.org>`__
25	25	+ `Biopython 1.77 <http://biopython.org>`__
26		+ `presto 0.6.2 <http://presto.readthedocs.io>`__
	26	+ `presto 0.7.0 <http://presto.readthedocs.io>`__
27	27	+ `airr 1.3.1 <https://docs.airr-community.org>`__
28	28
29	29	Some tools wrap external applications that are not required for installation.

+37

-4

NEWS.rst less more

0	0	Release Notes
1	1	===============================================================================
2	2
	3	Version 1.2.0: October 29, 2021
	4	-------------------------------------------------------------------------------
	5
	6	+ Updated dependencies to presto >= v0.7.0.
	7
	8	AssignGenes:
	9
	10	+ Fixed reporting of IgBLAST output counts when specifying ``--format airr``.
	11
	12	BuildTrees:
	13
	14	+ Added support for specifying fixed omega and hotness parameters at the
	15	commandline.
	16
	17	CreateGermlines:
	18
	19	+ Will now use the first allele in the reference database when duplicate
	20	allele names are provided. This appears to affect only mouse BCR light chains
	21	and TCR alleles in the IMGT database when the same allele name differs by
	22	strain.
	23
	24	MakeDb:
	25
	26	+ Added support for changes in how IMGT/HighV-QUEST v1.8.4 handles special
	27	characters in sequence identifiers.
	28	+ Fixed the ``imgt`` subcommand incorrectly allowing execution without
	29	specifying the IMGT/HighV-QUEST output file at the commandline.
	30
	31	ParseDb:
	32
	33	+ Added reporting of output file sizes to the console log of the ``split``
	34	subcommand.
	35
	36
3	37	Version 1.1.0: June 21, 2021
4	38	-------------------------------------------------------------------------------
5	39

7	41	+ Updated dependencies to biopython >= v1.77, airr >= v1.3.1, PyYAML>=5.1.
8	42
9	43	MakeDb:
10
11	44	+ Added the ``--imgt-id-len`` argument to accommodate changes introduced in how
12		IMGT/HighV-QUEST truncates sequence identifiers as of version 1.8.3 (May 7, 2021).
	45	IMGT/HighV-QUEST truncates sequence identifiers as of v1.8.3 (May 7, 2021).
13	46	The header lines in the fasta files are now truncated to 49 characters. In
14		IMGT/HighV-QUEST versions older that 1.8.3, they were truncated to 50 characters.
	47	IMGT/HighV-QUEST versions older than v1.8.3, they were truncated to 50 characters.
15	48	``--imgt-id-len`` default value is 49. Users should specify ``--imgt-id-len 50``
16		to analyze IMGT results generated with IMGT/HighV-QUEST versions older that 1.8.3.
	49	to analyze IMGT results generated with IMGT/HighV-QUEST versions older than v1.8.3.
17	50	+ Added the ``--infer-junction`` argument to ``MakeDb igblast``, to enable the inference
18	51	of the junction sequence when not reported by IgBLAST. Should be used with data from
19	52	IgBLAST v1.6.0 or older, i.e. versions released before IgBLAST added IMGT-CDR3 inference.

-1

PKG-INFO less more

0	0	Metadata-Version: 1.1
1	1	Name: changeo
2		Version: 1.1.0
	2	Version: 1.2.0
3	3	Summary: A bioinformatics toolkit for processing high-throughput lymphocyte receptor sequencing data.
4	4	Home-page: http://changeo.readthedocs.io
5	5	Author: Namita Gupta, Jason Anthony Vander Heiden

+23

-1

bin/AssignGenes.py less more

13	13	from pkg_resources import parse_version
14	14	from textwrap import dedent
15	15	from time import time
	16	import re
16	17
17	18	# Presto imports
18	19	from presto.IO import printLog, printMessage, printError, printWarning

98	99	vdb=vdb, output=out_file,
99	100	threads=nproc, exec=igblast_exec)
100	101	printMessage('Done', start_time=start_time, end=True, width=25)
101
	102
	103	# Get number of processed sequences
	104	if (format == 'blast'):
	105	with open(out_file, 'rb') as f:
	106	f.seek(-2, os.SEEK_END)
	107	while f.read(1) != b'\n':
	108	f.seek(-2, os.SEEK_CUR)
	109	pass_info = f.readline().decode()
	110	num_seqs_match = re.search('(# BLAST processed )(\d+)( .*)', pass_info)
	111	num_sequences = num_seqs_match.group(2)
	112	else:
	113	f = open(out_file, 'rb')
	114	lines = 0
	115	buf_size = 1024 * 1024
	116	read_f = f.raw.read
	117	buf = read_f(buf_size)
	118	while buf:
	119	lines += buf.count(b'\n')
	120	buf = read_f(buf_size)
	121	num_sequences = lines - 1
	122
102	123	# Print log
103	124	log = OrderedDict()
	125	log['PASS'] = num_sequences
104	126	log['OUTPUT'] = os.path.basename(out_file)
105	127	log['END'] = 'AssignGenes'
106	128	printLog(log)

-6

bin/BuildTrees.py less more

998	998	if oformat == "tab":
999	999	os.rmdir(clone_dir)
1000	1000	else:
1001		printWarning("Using --clean all with --oformat txt will delete all tree file results.\n"
	1001	printWarning("Using --clean all with --oformat txt will not delete all tree file results.\n"
1002	1002	"You'll have to do that yourself.")
1003	1003	log = OrderedDict()
1004	1004	log["END"] = "IgPhyML analysis"

1322	1322	help="""Optimize combination of topology (t) branch lengths (l) and parameters (r), or
1323	1323	nothing (n), for IgPhyML.""")
1324	1324	igphyml_group.add_argument("--omega", action="store", dest="omega", type=str, default="e,e",
1325		choices = ("e", "ce", "e,e", "ce,e", "e,ce", "ce,ce"),
1326	1325	help="""Omega parameters to estimate for FWR,CDR respectively:
1327		e = estimate, ce = estimate + confidence interval""")
	1326	e = estimate, ce = estimate + confidence interval, or numeric value""")
1328	1327	igphyml_group.add_argument("-t", action="store", dest="kappa", type=str, default="e",
1329		choices=("e", "ce"),
1330	1328	help="""Kappa parameters to estimate:
1331		e = estimate, ce = estimate + confidence interval""")
	1329	e = estimate, ce = estimate + confidence interval, or numeric value""")
1332	1330	igphyml_group.add_argument("--motifs", action="store", dest="motifs", type=str,
1333	1331	default="WRC_2:0,GYW_0:1,WA_1:2,TW_0:3,SYC_2:4,GRS_0:5",
1334	1332	help="""Which motifs to estimate mutability.""")
1335	1333	igphyml_group.add_argument("--hotness", action="store", dest="hotness", type=str, default="e,e,e,e,e,e",
1336	1334	help="""Mutability parameters to estimate:
1337		e = estimate, ce = estimate + confidence interval""")
	1335	e = estimate, ce = estimate + confidence interval, or numeric value""")
1338	1336	igphyml_group.add_argument("--oformat", action="store", dest="oformat", type=str, default="tab",
1339	1337	choices=("tab", "txt"),
1340	1338	help="""IgPhyML output format.""")

+48

-46

bin/MakeDb.py less more

117	117	for rec in readSeqFile(seq_file):
118	118	if len(rec.description) <= imgt_id_len:
119	119	id_key = rec.description
120		else:
121		id_key = re.sub('\\|\|\s\|!\|&\|\*\|<\|>\|\?', '_', rec.description[:imgt_id_len])
	120	else: # truncate and replace characters
	121	if imgt_id_len == 49: # 28 September 2021 (version 1.8.4)
	122	id_key = re.sub('\s\|\t', '_', rec.description[:imgt_id_len])
	123	else: # older versions
	124	id_key = re.sub('\\|\|\s\|!\|&\|\*\|<\|>\|\?', '_', rec.description[:imgt_id_len])
122	125	ids.update({id_key: rec.description})
123	126
124	127	return ids

144	147	writer=AIRRWriter, out_file=None, out_args=default_out_args):
145	148	"""
146	149	Writes parsed records to an output file
147
148		Arguments:
	150
	151	Arguments:
149	152	records : an iterator of Receptor objects containing alignment data.
150	153	fields : a list of ordered field names to write.
151	154	aligner_file : input file name.

354	357
355	358
356	359	def parseIMGT(aligner_file, seq_file=None, repo=None, cellranger_file=None, partial=False, asis_id=True,
357		extended=False, format=default_format, out_file=None, out_args=default_out_args, imgt_id_len=default_imgt_id_len):
	360	extended=False, format=default_format, out_file=None, out_args=default_out_args,
	361	imgt_id_len=default_imgt_id_len):
358	362	"""
359	363	Main for IMGT aligned sample sequences.
360	364

395	399
396	400	# Get (parsed) IDs from fasta file submitted to IMGT
397	401	id_dict = getIDforIMGT(seq_file, imgt_id_len) if seq_file else {}
398
	402
399	403	# Load supplementary annotation table
400	404	if cellranger_file is not None:
401	405	f = cellranger_extended if extended else cellranger_base

437	441	printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')
438	442	germ_iter = (addGermline(x, references) for x in parse_iter)
439	443	# Write db
440		output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
	444	output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
441	445	annotations=annotations, id_dict=id_dict, asis_id=asis_id, partial=partial,
442	446	writer=writer, out_file=out_file, out_args=out_args)
443	447

534	538	with open(aligner_file, 'r') as f:
535	539	parse_iter = parser(f, seq_dict, references, regions=regions, asis_calls=asis_calls, infer_junction=infer_junction)
536	540	germ_iter = (addGermline(x, references, amino_acid=amino_acid) for x in parse_iter)
537		output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
	541	output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
538	542	annotations=annotations, amino_acid=amino_acid, partial=partial, asis_id=asis_id,
539	543	regions=regions, writer=writer, out_file=out_file, out_args=out_args)
540	544

613	617	with open(aligner_file, 'r') as f:
614	618	parse_iter = IHMMuneReader(f, seq_dict, references)
615	619	germ_iter = (addGermline(x, references) for x in parse_iter)
616		output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
	620	output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
617	621	annotations=annotations, asis_id=asis_id, partial=partial,
618	622	writer=writer, out_file=out_file, out_args=out_args)
619	623

624	628	"""
625	629	Defines the ArgumentParser.
626	630
627		Returns:
	631	Returns:
628	632	argparse.ArgumentParser
629	633	"""
630	634	fields = dedent(

636	640	db-fail
637	641	database with records that fail due to no productivity information,
638	642	no gene V assignment, no J assignment, or no junction region.
639
	643
640	644	universal output fields:
641		sequence_id, sequence, sequence_alignment, germline_alignment,
642		rev_comp, productive, stop_codon, vj_in_frame, locus,
643		v_call, d_call, j_call, junction, junction_length, junction_aa,
	645	sequence_id, sequence, sequence_alignment, germline_alignment,
	646	rev_comp, productive, stop_codon, vj_in_frame, locus,
	647	v_call, d_call, j_call, junction, junction_length, junction_aa,
644	648	v_sequence_start, v_sequence_end, v_germline_start, v_germline_end,
645	649	d_sequence_start, d_sequence_end, d_germline_start, d_germline_end,
646	650	j_sequence_start, j_sequence_end, j_germline_start, j_germline_end,
647	651	np1_length, np2_length, fwr1, fwr2, fwr3, fwr4, cdr1, cdr2, cdr3
648	652
649	653	imgt specific output fields:
650		n1_length, n2_length, p3v_length, p5d_length, p3d_length, p5j_length,
651		d_frame, v_score, v_identity, d_score, d_identity, j_score, j_identity
652
	654	n1_length, n2_length, p3v_length, p5d_length, p3d_length, p5j_length,
	655	d_frame, v_score, v_identity, d_score, d_identity, j_score, j_identity
	656
653	657	igblast specific output fields:
654		v_score, v_identity, v_support, v_cigar,
655		d_score, d_identity, d_support, d_cigar,
	658	v_score, v_identity, v_support, v_cigar,
	659	d_score, d_identity, d_support, d_cigar,
656	660	j_score, j_identity, j_support, j_cigar
657	661
658	662	ihmm specific output fields:
659	663	vdj_score
660
	664
661	665	10X specific output fields:
662		cell_id, c_call, consensus_count, umi_count,
	666	cell_id, c_call, consensus_count, umi_count,
663	667	v_call_10x, d_call_10x, j_call_10x,
664	668	junction_10x, junction_10x_aa
665	669	''')
666
	670
667	671	# Define ArgumentParser
668	672	parser = ArgumentParser(description=__doc__, epilog=fields,
669	673	formatter_class=CommonHelpFormatter, add_help=False)

685	689	help='Process igblastn output.',
686	690	description='Process igblastn output.')
687	691	group_igblast = parser_igblast.add_argument_group('aligner parsing arguments')
688		group_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_files',
689		required=True,
	692	group_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_files', required=True,
690	693	help='''IgBLAST output files in format 7 with query sequence
691	694	(igblastn argument \'-outfmt "7 std qseq sseq btop"\').''')
692	695	group_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True,

715	718	group_igblast.add_argument('--partial', action='store_true', dest='partial',
716	719	help='''If specified, include incomplete V(D)J alignments in
717	720	the pass file instead of the fail file. An incomplete alignment
718		is defined as a record for which a valid IMGT-gapped sequence
719		cannot be built or that is missing a V gene assignment,
	721	is defined as a record for which a valid IMGT-gapped sequence
	722	cannot be built or that is missing a V gene assignment,
720	723	J gene assignment, junction region, or productivity call.''')
721	724	group_igblast.add_argument('--extended', action='store_true', dest='extended',
722		help='''Specify to include additional aligner specific fields in the output.
	725	help='''Specify to include additional aligner specific fields in the output.
723	726	Adds <vdj>_score, <vdj>_identity, <vdj>_support, <vdj>_cigar,
724	727	fwr1, fwr2, fwr3, fwr4, cdr1, cdr2 and cdr3.''')
725	728	group_igblast.add_argument('--regions', action='store', dest='regions',
726	729	choices=('default', 'rhesus-igl'), default='default',
727	730	help='''IMGT CDR and FWR boundary definition to use.''')
728	731	group_igblast.add_argument('--infer-junction', action='store_true', dest='infer_junction',
729		help='''Infer the junction sequence. For use with IgBLAST v1.6.0 or older,
	732	help='''Infer the junction sequence. For use with IgBLAST v1.6.0 or older,
730	733	prior to the addition of IMGT-CDR3 inference.''')
731	734	parser_igblast.set_defaults(func=parseIgBLAST, amino_acid=False)
732	735

736	739	help='Process igblastp output.',
737	740	description='Process igblastp output.')
738	741	group_igblast_aa = parser_igblast_aa.add_argument_group('aligner parsing arguments')
739		group_igblast_aa.add_argument('-i', nargs='+', action='store', dest='aligner_files',
740		required=True,
	742	group_igblast_aa.add_argument('-i', nargs='+', action='store', dest='aligner_files', required=True,
741	743	help='''IgBLAST output files in format 7 with query sequence
742	744	(igblastp argument \'-outfmt "7 std qseq sseq btop"\').''')
743	745	group_igblast_aa.add_argument('-r', nargs='+', action='store', dest='repo', required=True,

762	764	the sequence identifiers in the reference sequence set and the IgBLAST
763	765	database to be exact string matches.''')
764	766	group_igblast_aa.add_argument('--extended', action='store_true', dest='extended',
765		help='''Specify to include additional aligner specific fields in the output.
	767	help='''Specify to include additional aligner specific fields in the output.
766	768	Adds v_score, v_identity, v_support, v_cigar, fwr1, fwr2, fwr3, cdr1 and cdr2.''')
767	769	group_igblast_aa.add_argument('--regions', action='store', dest='regions',
768	770	choices=('default', 'rhesus-igl'), default='default',

778	780	description='''Process IMGT/HighV-Quest output
779	781	(does not work with V-QUEST).''')
780	782	group_imgt = parser_imgt.add_argument_group('aligner parsing arguments')
781		group_imgt.add_argument('-i', nargs='+', action='store', dest='aligner_files',
	783	group_imgt.add_argument('-i', nargs='+', action='store', dest='aligner_files', required=True,
782	784	help='''Either zipped IMGT output files (.zip or .txz) or a
783	785	folder containing unzipped IMGT output files (which must
784	786	include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences,
785	787	and 6_Junction).''')
786	788	group_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files', required=False,
787	789	help='''List of FASTA files (with .fasta, .fna or .fa
788		extension) that were submitted to IMGT/HighV-QUEST.
	790	extension) that were submitted to IMGT/HighV-QUEST.
789	791	If unspecified, sequence identifiers truncated by IMGT/HighV-QUEST
790	792	will not be corrected.''')
791	793	group_imgt.add_argument('-r', nargs='+', action='store', dest='repo', required=False,
792	794	help='''List of folders and/or fasta files containing
793		the germline sequence set used by IMGT/HighV-QUEST.
	795	the germline sequence set used by IMGT/HighV-QUEST.
794	796	These reference sequences must contain IMGT-numbering spacers (gaps)
795		in the V segment. If unspecified, the germline sequence reconstruction
	797	in the V segment. If unspecified, the germline sequence reconstruction
796	798	will not be included in the output.''')
797	799	group_imgt.add_argument('--10x', action='store', nargs='+', dest='cellranger_file',
798	800	help='''Table file containing 10X annotations (with .csv or .tsv

806	808	group_imgt.add_argument('--partial', action='store_true', dest='partial',
807	809	help='''If specified, include incomplete V(D)J alignments in
808	810	the pass file instead of the fail file. An incomplete alignment
809		is defined as a record that is missing a V gene assignment,
	811	is defined as a record that is missing a V gene assignment,
810	812	J gene assignment, junction region, or productivity call.''')
811	813	group_imgt.add_argument('--extended', action='store_true', dest='extended',
812		help='''Specify to include additional aligner specific fields in the output.
	814	help='''Specify to include additional aligner specific fields in the output.
813	815	Adds <vdj>_score, <vdj>_identity>, fwr1, fwr2, fwr3, fwr4,
814		cdr1, cdr2, cdr3, n1_length, n2_length, p3v_length, p5d_length,
	816	cdr1, cdr2, cdr3, n1_length, n2_length, p3v_length, p5d_length,
815	817	p3d_length, p5j_length and d_frame.''')
816	818	group_imgt.add_argument('--imgt-id-len', action='store', dest='imgt_id_len', type=int,
817	819	default=default_imgt_id_len,
818		help='''The maximum character length of sequence identifiers reported by IMGT/HighV-QUEST.
819		Specify 50 if the IMGT files (-i) were generated with an IMGT/HighV-QUEST version older
	820	help='''The maximum character length of sequence identifiers reported by IMGT/HighV-QUEST.
	821	Specify 50 if the IMGT files (-i) were generated with an IMGT/HighV-QUEST version older
820	822	than 1.8.3 (May 7, 2021).''')
821	823	parser_imgt.set_defaults(func=parseIMGT)
822	824

850	852	group_ihmm.add_argument('--partial', action='store_true', dest='partial',
851	853	help='''If specified, include incomplete V(D)J alignments in
852	854	the pass file instead of the fail file. An incomplete alignment
853		is defined as a record for which a valid IMGT-gapped sequence
854		cannot be built or that is missing a V gene assignment,
	855	is defined as a record for which a valid IMGT-gapped sequence
	856	cannot be built or that is missing a V gene assignment,
855	857	J gene assignment, junction region, or productivity call.''')
856	858	group_ihmm.add_argument('--extended', action='store_true', dest='extended',
857		help='''Specify to include additional aligner specific fields in the output.
	859	help='''Specify to include additional aligner specific fields in the output.
858	860	Adds the path score of the iHMMune-Align hidden Markov model as vdj_score;
859	861	adds fwr1, fwr2, fwr3, fwr4, cdr1, cdr2 and cdr3.''')
860	862	parser_ihmm.set_defaults(func=parseIHMM)
861	863
862	864	return parser
863
864
	865
	866
865	867	if __name__ == "__main__":
866	868	"""
867	869	Parses command line arguments and calls main

880	882	if 'seq_files' in args_dict: del args_dict['seq_files']
881	883	if 'out_files' in args_dict: del args_dict['out_files']
882	884	if 'command' in args_dict: del args_dict['command']
883		if 'func' in args_dict: del args_dict['func']
	885	if 'func' in args_dict: del args_dict['func']
884	886
885	887	# Call main
886	888	for i, f in enumerate(args.__dict__['aligner_files']):

+20

-17

bin/ParseDb.py less more

141	141	log['OUTPUT%i' % (i + 1)] = os.path.basename(handles_dict[k].name)
142	142	log['RECORDS'] = rec_count
143	143	log['PARTS'] = len(handles_dict)
	144
	145	# Close output file handles and log file size
	146	db_handle.close()
	147	for i, t in enumerate(handles_dict):
	148	handles_dict[t].close()
	149	log['SIZE%i' % (i + 1)] = countDbFile(handles_dict[t].name)
	150
144	151	log['END'] = 'ParseDb'
145	152	printLog(log)
146
147		# Close output file handles
148		db_handle.close()
149		for t in handles_dict: handles_dict[t].close()
150	153
151	154	return [handles_dict[t].name for t in handles_dict]
152	155

363	366	"""
364	367	Deletes records from a database file
365	368
366		Arguments:
	369	Arguments:
367	370	db_file : the database file name.
368	371	fields : a list of fields to check for deletion criteria.
369	372	values : a list of values defining deletion targets.

371	374	regex : if False do exact full string matches; if True allow partial regex matches.
372	375	out_file : output file name. Automatically generated from the input file if None.
373	376	out_args : common output argument dictionary from parseCommonArgs.
374
375		Returns:
	377
	378	Returns:
376	379	str : output file name.
377	380	"""
378	381	# Define string match function

427	430	rec_count += 1
428	431	# Check for deletion values in all fields
429	432	delete = _logic_func([_match_func(rec.get(f, False), values) for f in fields])
430
	433
431	434	# Write sequences
432	435	if not delete:
433	436	pass_count += 1
434	437	pass_writer.writeDict(rec)
435	438	else:
436	439	fail_count += 1
437
	440
438	441	# Print counts
439	442	printProgress(rec_count, result_count, 0.05, start_time=start_time)
440	443	log = OrderedDict()

448	451	# Close file handles
449	452	pass_handle.close()
450	453	db_handle.close()
451
	454
452	455	return pass_handle.name
453	456
454	457

866	869	"""
867	870	Defines the ArgumentParser
868	871
869		Arguments:
	872	Arguments:
870	873	None
871
872		Returns:
	874
	875	Returns:
873	876	an ArgumentParser object
874	877	"""
875	878	# Define input and output field help message

887	890	required fields:
888	891	sequence_id
889	892	''')
890
	893
891	894	# Define ArgumentParser
892	895	parser = ArgumentParser(description=__doc__, epilog=fields,
893	896	formatter_class=CommonHelpFormatter, add_help=False)

1026	1029	description='Merges files.')
1027	1030	group_merge = parser_merge.add_argument_group('parsing arguments')
1028	1031	group_merge.add_argument('-o', action='store', dest='out_file', default=None,
1029		help='''Explicit output file name. Note, this argument cannot be used with
	1032	help='''Explicit output file name. Note, this argument cannot be used with
1030	1033	the --failed, --outdir or --outname arguments.''')
1031	1034	group_merge.add_argument('--drop', action='store_true', dest='drop',
1032	1035	help='''If specified, drop fields that do not exist in all input files.
1033		Otherwise, include all columns in all files and fill missing data
	1036	Otherwise, include all columns in all files and fill missing data
1034	1037	with empty strings.''')
1035	1038	parser_merge.set_defaults(func=mergeDbFiles)
1036	1039

1091	1094	args_dict['out_file'] = args.__dict__['out_files'][i] \
1092	1095	if args.__dict__['out_files'] else None
1093	1096	args.func(**args_dict)
1094
	1097

+13

-3

changeo/IO.py less more

12	12	import zipfile
13	13	from itertools import chain, groupby, zip_longest
14	14	from tempfile import TemporaryDirectory
	15	from textwrap import indent
15	16	from Bio import SeqIO
16	17	from Bio.Seq import Seq
17	18
18	19	# Presto and changeo imports
19		from presto.IO import getFileType, printError, printWarning
	20	from presto.IO import getFileType, printError, printWarning, printDebug
20	21	from changeo.Defaults import default_csv_size
21	22	from changeo.Gene import getAllele, getLocus, getVAllele, getDAllele, getJAllele
22	23	from changeo.Receptor import AIRRSchema, AIRRSchemaAA, ChangeoSchema, ChangeoSchemaAA, Receptor, ReceptorData

2166	2167	return db
2167	2168
2168	2169
2169		def readGermlines(references, asis=False):
	2170	def readGermlines(references, asis=False, warn=False):
2170	2171	"""
2171	2172	Parses germline repositories
2172	2173
2173	2174	Arguments:
2174	2175	references (list): list of strings specifying directories and/or files from which to read germline records.
2175	2176	asis (bool): if True use sequence ID as record name and do not parse headers for allele names.
	2177	warn (bool): print warning messages to standard error if True.
2176	2178
2177	2179	Returns:
2178	2180	dict: Dictionary of germlines in the form {allele: sequence}.

2193	2195	printError('No valid germline fasta files (.fasta, .fna, .fa) were found at %s.' % ','.join(references))
2194	2196
2195	2197	repo_dict = {}
	2198	duplicates = []
2196	2199	for file_name in repo_files:
2197	2200	with open(file_name, 'rU') as file_handle:
2198	2201	germlines = SeqIO.parse(file_handle, 'fasta')
2199	2202	for g in germlines:
2200	2203	germ_key = getAllele(g.description, 'first') if not asis else g.id
2201		repo_dict[germ_key] = str(g.seq).upper()
	2204	if germ_key not in repo_dict:
	2205	repo_dict[germ_key] = str(g.seq).upper()
	2206	else:
	2207	duplicates.append(g.description)
	2208
	2209	if warn and len(duplicates) > 0:
	2210	w = indent('\n'.join(duplicates), ' '*9)
	2211	printWarning('Duplicated germline allele names excluded from references:\n%s' % w)
2202	2212
2203	2213	return repo_dict
2204	2214

-2

changeo/Version.py less more

4	4	__author__ = 'Namita Gupta, Jason Anthony Vander Heiden'
5	5	__copyright__ = 'Copyright 2021 Kleinstein Lab, Yale University. All rights reserved.'
6	6	__license__ = 'GNU Affero General Public License 3 (AGPL-3)'
7		__version__ = '1.1.0'
8		__date__ = '2021.06.21'
	7	__version__ = '1.2.0'
	8	__date__ = '2021.10.29'

-1

changeo.egg-info/PKG-INFO less more

0	0	Metadata-Version: 1.1
1	1	Name: changeo
2		Version: 1.1.0
	2	Version: 1.2.0
3	3	Summary: A bioinformatics toolkit for processing high-throughput lymphocyte receptor sequencing data.
4	4	Home-page: http://changeo.readthedocs.io
5	5	Author: Namita Gupta, Jason Anthony Vander Heiden

-1

changeo.egg-info/requires.txt less more

3	3	biopython>=1.77
4	4	PyYAML>=5.1
5	5	setuptools>=2.0
6		presto>=0.6.2
	6	presto>=0.7.0
7	7	airr>=1.3.1

-1

requirements.txt less more

3	3	biopython>=1.77
4	4	PyYAML>=5.1
5	5	setuptools>=2.0
6		presto>=0.6.2
	6	presto>=0.7.0
7	7	airr>=1.3.1