Codebase list r-cran-tigger / 798a4d9
New upstream version 0.3.1 Andreas Tille 5 years ago
53 changed file(s) with 3234 addition(s) and 1940 deletion(s). Raw diff Collapse all Expand all
00 Package: tigger
11 Type: Package
2 Version: 0.2.11
3 Date: 2017-09-21
2 Version: 0.3.1
3 Date: 2018-10-19
44 Authors@R: c(person("Daniel", "Gadala-Maria", role=c("aut"),
55 email="daniel.gadala-maria@yale.edu"),
6 person("Susanna", "Marquez", role=c("aut"),
7 email="susanna.marquez@yale.edu"),
8 person("Moriah", "Cohen", role=c("aut"),
9 email="moriah.cohen@biu.ac.il"),
10 person("Gur", "Yaari", role=c("aut"),
11 email="gur.yaari@biu.ac.il"),
612 person("Jason", "Vander Heiden", role=c("ctb", "cre"),
713 email="jason.vanderheiden@yale.edu"),
814 person("Steven", "Kleinstein", role=c("aut", "cph"),
915 email="steven.kleinstein@yale.edu"))
10 Title: R Tools for Inferring New Immunoglobulin Alleles from Rep-Seq
11 Data
16 Title: Infers Novel Immunoglobulin Alleles from Sequencing Data
1217 Description: Infers the V genotype of an individual from immunoglobulin (Ig)
13 repertoire-sequencing (Rep-Seq) data, including detection of any novel
14 alleles. This information is then used to correct existing V allele calls
15 from among the sample sequences.
18 repertoire sequencing data (AIRR-Seq, Rep-Seq). Includes detection of
19 any novel alleles. This information is then used to correct existing V
20 allele calls from among the sample sequences.
21 Citations:
22 Gadala-Maria, et al (2015) <doi:10.1073/pnas.1417683112>.
1623 License: CC BY-SA 4.0
1724 URL: http://tigger.readthedocs.io
1825 BugReports: https://bitbucket.org/kleinstein/tigger/issues
1926 LazyData: true
2027 BuildVignettes: true
2128 VignetteBuilder: knitr
29 Encoding: UTF-8
2230 Depends: R (>= 3.2.5), ggplot2 (>= 2.0.0)
23 Imports: alakazam (>= 0.2.6), tidyr, dplyr (>= 0.5.0), doParallel,
24 foreach, graphics, grid, iterators, lazyeval, parallel, stats
31 Imports: alakazam (>= 0.2.11), dplyr (>= 0.7.0), doParallel, foreach,
32 graphics, gridExtra, gtools, iterators, lazyeval, parallel,
33 rlang, shazam (>= 0.1.10), stats, stringi, tidyr
2534 Suggests: knitr, testthat
26 RoxygenNote: 6.0.1
35 RoxygenNote: 6.1.0
2736 NeedsCompilation: no
28 Packaged: 2017-09-21 17:56:29 UTC; jason
37 Packaged: 2018-10-19 15:56:11 UTC; susanna
2938 Author: Daniel Gadala-Maria [aut],
39 Susanna Marquez [aut],
40 Moriah Cohen [aut],
41 Gur Yaari [aut],
3042 Jason Vander Heiden [ctb, cre],
3143 Steven Kleinstein [aut, cph]
3244 Maintainer: Jason Vander Heiden <jason.vanderheiden@yale.edu>
3345 Repository: CRAN
34 Date/Publication: 2017-09-21 18:36:16 UTC
46 Date/Publication: 2018-10-19 18:30:08 UTC
+43
-39
MD5 less more
0 e2485e1eb82326bf1a87a26d31bca302 *DESCRIPTION
1 b93c4f7f082c0a113a3b14ce72ad6fb2 *NAMESPACE
2 7603efe9dc5bd19429a3dd8fe493db5e *NEWS.md
3 cf81dc763706d5af4b6338587de99e07 *R/data.R
4 7591807e25a344f4a340b111ae2bd6c0 *R/functions.R
5 f206efd53bac36d5cda16f38126de62f *R/tigger.R
6 f4692fbe495ed0cbf93b5572bcf5d1c0 *README.md
7 83f16ce2648803eb619b5c6bd7c13b48 *build/vignette.rds
8 895ced14d12f95482b10d7ea7006c5e7 *data/genotype.rda
9 e6bf4ae95811336b57a6a4c161d84b49 *data/germline_ighv.rda
10 9f888cd0ed3b8029c1544798c2a0626e *data/novel_df.rda
11 8281d239e0e8ec4898b72a3c0c96f815 *data/sample_db.rda
0 7883ea65ca4dba8729ebe5521726f920 *DESCRIPTION
1 bf68ba350c86248c0c49cb21b9e9ac93 *NAMESPACE
2 e5ed4b12e1d369b23b0e60a0ce717d08 *NEWS.md
3 fd8970214a6f891f3f8630eeae6490da *R/bayesian.R
4 26641da5adf037bb742328f8aff43aa2 *R/data.R
5 a9b2c160a920976ad3dc10f2b693df00 *R/evidence.R
6 ee415813b86d23820983f8ba0ee66a11 *R/functions.R
7 0e7ef247b95b0f031cc4ea70f4835af3 *R/tigger.R
8 dd36e57af6a27547f887c01d7f7ea110 *README.md
9 06d0733b1e053ed668e7f17c1f975725 *build/vignette.rds
10 3e3cb573afe36d5c5ca64289396c180f *data/GermlineIGHV.rda
11 b65f789115c28c50e855e042dd8773b0 *data/SampleDb.rda
12 5b91397303d4a843360e11172f52fbd2 *data/SampleGenotype.rda
13 996de70aa93a09176f85d473e31d143e *data/SampleNovel.rda
14 a22b2561c478e1579d686230a39e67f5 *data/datalist
1215 de6a4346597304f77270ce7f43877cde *inst/CITATION
13 d449c31440bb88f31affd3d3bd9c2f3d *inst/doc/Tigger-Vignette.R
14 9af1046ef91efdb469d1b6e96836abb8 *inst/doc/Tigger-Vignette.Rmd
15 3352ec0d31c7fcc8672426f2f20bf3bf *inst/doc/Tigger-Vignette.pdf
16 a0ddc22d3a55eb331b3796dd6eb789c0 *inst/markr/build.R
17 aac471649aead0fc15cfd550b45fd049 *man/cleanSeqs.Rd
18 10c535405f44d35b11fbbb24441f65c8 *man/findNovelAlleles.Rd
19 dee7ae50bce0ce5c4b2eb9249064b0db *man/findUnmutatedCalls.Rd
20 4085c0bd803087104a44f0017a4a50af *man/genotype.Rd
21 1cc5dc16d32e2b0448171993ae787080 *man/genotypeFasta.Rd
22 52a7865ed7ecc697424ee208d338fb55 *man/germline_ighv.Rd
23 12bbad6df7fa988067c9eaa56d536ab6 *man/getMutCount.Rd
24 63dabc7cd596b16388f81216f0c17e66 *man/getMutatedPositions.Rd
25 be9136e88f9dadd4d7b08ca36f98ea53 *man/getPopularMutationCount.Rd
26 e3e9b6c1d4eb11abd8181e9782aa62a0 *man/inferGenotype.Rd
27 148b730525b1631e20baa47971bbf879 *man/insertPolymorphisms.Rd
28 12b6f73cd643edb19b6eafabb4e2f65c *man/novel_df.Rd
29 e62237f96f7801f6f24c92ead258fce8 *man/plotGenotype.Rd
30 25ca722b523385793c9e7c7f87008b8f *man/plotNovel.Rd
31 6bc3636e5c5505ba2fd63260e4e732ae *man/readIgFasta.Rd
32 6503ecee104c9ff753290c7d975d80dc *man/reassignAlleles.Rd
33 ebb8f5373091db562d8205e58a3f8984 *man/sample_db.Rd
34 2ead201dab9226a33bc56982df4b6c71 *man/selectNovel.Rd
35 d484cb96d7479059da49c218a81617db *man/sortAlleles.Rd
36 701723880ed5d0d8dd02b45112f1f42c *man/tigger.Rd
37 d76e9252cf5af39d91abdc7f4bded2ad *man/updateAlleleNames.Rd
38 f0b33cf048dd355a7e4aa6e725b40795 *man/writeFasta.Rd
39 9af1046ef91efdb469d1b6e96836abb8 *vignettes/Tigger-Vignette.Rmd
16 41f63ff830855d88ac107ea1de1c73bc *inst/doc/Tigger-Vignette.R
17 2d3b4a189816c543e285b79a82af7e39 *inst/doc/Tigger-Vignette.Rmd
18 01ff9a6cb57c035d7b6410b599195500 *inst/doc/Tigger-Vignette.pdf
19 0d10fc4f29c4fd44dbcd056d562e6e7d *man/GermlineIGHV.Rd
20 4449a3b926f140109fed29bdfcb2ea21 *man/SampleDb.Rd
21 ee6f174d98cdac6c8e920cffe93b8762 *man/SampleGenotype.Rd
22 887b6cdecd67bce3a4b18c3b68b8ed6c *man/SampleNovel.Rd
23 65be86d49f6538199afe8a396d78e46d *man/cleanSeqs.Rd
24 a0ea6946c659258fa37d9c9e6e313113 *man/findNovelAlleles.Rd
25 862a43c6d9ff0a343ad365c5627ba52c *man/findUnmutatedCalls.Rd
26 f1ca61511a381cde50708fb801db0541 *man/generateEvidence.Rd
27 f771057e2965efe11994cb4501633b43 *man/genotypeFasta.Rd
28 6f01fbc67804c7f34b46098a24711c91 *man/getMutCount.Rd
29 b985f5ee9c7e41a4343177f1fe73116f *man/getMutatedPositions.Rd
30 40eb4612eb9d4f4f7871124875cbe430 *man/getPopularMutationCount.Rd
31 7140a2237a7c22f48bc6480669492861 *man/inferGenotype.Rd
32 46fc21b7809cea2fcbb9ea3f08bed22e *man/inferGenotypeBayesian.Rd
33 29fc1168511d72e6c691accdba47c7fb *man/insertPolymorphisms.Rd
34 4888ee94f43ae2bc8b44493c102158b8 *man/plotGenotype.Rd
35 be5190e0acbc1873d5cdcba4aa13c458 *man/plotNovel.Rd
36 fe039528bfc29103382b6cfbb92c7447 *man/readIgFasta.Rd
37 12d8f40948568c68985316f7df0bf5fe *man/reassignAlleles.Rd
38 b955f370f2897613ab16833edd89397b *man/selectNovel.Rd
39 72ecb30aceba37af5b1bf37fc6951944 *man/sortAlleles.Rd
40 96b614e74d00fad9219dcde24e456149 *man/tigger.Rd
41 dc2623df8768f04d7f6f7d65bb51648e *man/updateAlleleNames.Rd
42 0ed3d8709b7dc07e58a4ac931bf905b0 *man/writeFasta.Rd
43 2d3b4a189816c543e285b79a82af7e39 *vignettes/Tigger-Vignette.Rmd
22 export(cleanSeqs)
33 export(findNovelAlleles)
44 export(findUnmutatedCalls)
5 export(generateEvidence)
56 export(genotypeFasta)
67 export(getMutCount)
78 export(getMutatedPositions)
89 export(getPopularMutationCount)
910 export(inferGenotype)
11 export(inferGenotypeBayesian)
1012 export(insertPolymorphisms)
1113 export(plotGenotype)
1214 export(plotNovel)
2123 importFrom(alakazam,getAllele)
2224 importFrom(alakazam,getFamily)
2325 importFrom(alakazam,getGene)
26 importFrom(alakazam,translateDNA)
2427 importFrom(doParallel,registerDoParallel)
2528 importFrom(dplyr,"%>%")
2629 importFrom(dplyr,arrange)
4043 importFrom(dplyr,glimpse)
4144 importFrom(dplyr,group_by)
4245 importFrom(dplyr,group_by_)
46 importFrom(dplyr,inner_join)
4347 importFrom(dplyr,mutate)
4448 importFrom(dplyr,mutate_)
4549 importFrom(dplyr,n)
5862 importFrom(foreach,foreach)
5963 importFrom(foreach,registerDoSEQ)
6064 importFrom(graphics,plot)
61 importFrom(grid,grid.layout)
62 importFrom(grid,grid.newpage)
63 importFrom(grid,pushViewport)
64 importFrom(grid,viewport)
65 importFrom(gridExtra,arrangeGrob)
66 importFrom(gtools,ddirichlet)
6567 importFrom(iterators,icount)
6668 importFrom(lazyeval,interp)
6769 importFrom(parallel,clusterEvalQ)
6870 importFrom(parallel,clusterExport)
6971 importFrom(parallel,makeCluster)
7072 importFrom(parallel,stopCluster)
73 importFrom(rlang,.data)
74 importFrom(shazam,calcObservedMutations)
7175 importFrom(stats,confint)
7276 importFrom(stats,cor)
7377 importFrom(stats,cov)
7882 importFrom(stats,na.omit)
7983 importFrom(stats,sd)
8084 importFrom(stats,setNames)
85 importFrom(stringi,stri_length)
8186 importFrom(tidyr,gather)
8287 importFrom(tidyr,gather_)
8388 importFrom(tidyr,spread)
8489 importFrom(tidyr,spread_)
90 importFrom(tidyr,unnest)
0 Version 0.3.1 October 19, 2018
1 -------------------------------------------------------------------------------
2
3 + Fixed a fatal error in `reassignAlleles` with non-existent `v_call` column.
4 + Fixed bug in `generateEvidence` that was reporting amino acid mutations as
5 NA instead of gaps.
6
7
8 Version 0.3.0 October 3, 2018
9 -------------------------------------------------------------------------------
10
11 Bug Fixes:
12
13 + Fixed a bug in `reassignAlleles` occurring with single match genotypes.
14 + Fixed `selectNovel` improperly removing all identical novel alleles, rather
15 than keeping a single entry.
16 + `genotypeFasta` will now retain IMGT-numbering spacers as `.` characters
17 instead of converting them to `-` characters.
18 + Fixed a bug in `findNovelAlleles` causing overly aggressive minimum sequence
19 threshold filtering.
20 + Fixed a bug in the grouping behavior of `getPopularMutationCount`.
21
22 New Features:
23
24 + Added a Bayesian approach to genotype inference as the
25 `inferGenotypeBayesian` function.
26 + Added the function `generateEvidence` to build a complete evidence table
27 from the results of `findNovelAlleles`, `inferGenotype`,
28 `inferGenotypeBayesian`, and `reassignAlleles`.
29 + Added multiple new evidence columns to the output of `findNovelAlleles`
30 and adjusted the definitions/names of some existing columns.
31 + Added behavior to the `keep_gene` argument of `reassignAlleles` to provide
32 options for maintaining reassignments at the gene (previous `TRUE` behavior),
33 family, or repertoire level.
34 + Improved tie resolution in `findNovelAlleles`.
35
36 Backwards Incompatible Refactors:
37
38 + Renamed sample data from `germline_ighv`, `sample_db`, `genotype` and
39 `novel_df` to `GermlineIGHV`, `SampleDb`, `SampleGenotype` and `SampleNovel`,
40 respectively.
41 + Renamed the `novel_df` argument to `novel` in `selectNovel`, `inferGenotype`,
42 and `genotypeFasta`.
43 + Renamed the `novel_df_row` argument to `novel_row` in `plotNovel`.
44 + Argument order in `inferGenotype` was altered for clarity.
45 + Changed the return behavior of `reassignAlleles` so that it returns the
46 input data.frame with the `V_CALL_GENOTYPED` column appended or overwritten.
47 + `cleanSeqs` will no longer replace `.` characters with `-`.
48
49
050 Version 0.2.11 September 21, 2017
151 -------------------------------------------------------------------------------
252
0 #' Infer a subject-specific genotype using a Bayesian approach
1 #'
2 #' \code{inferGenotypeBayesian} infers a subject's genotype by applying a Bayesian framework
3 #' with a Dirichlet prior for the multinomial distribution. Up to four distinct alleles are
4 #' allowed in an individual’s genotype. Four likelihood distributions were generated by
5 #' empirically fitting three high coverage genotypes from three individuals
6 #' (Laserson and Vigneault et al, 2014). A posterior probability is calculated for the
7 #' four most common alleles. The certainty of the highest probability model was
8 #' calculated using a Bayes factor (the most likely model divided by second-most likely model).
9 #' The larger the Bayes factor (K), the greater the certainty in the model.
10 #'
11 #' @details
12 #' Allele calls representing cases where multiple alleles have been
13 #' assigned to a single sample sequence are rare among unmutated
14 #' sequences but may result if nucleotides for certain positions are
15 #' not available. Calls containing multiple alleles are treated as
16 #' belonging to all groups. If \code{novel} is provided, all
17 #' sequences that are assigned to the same starting allele as any
18 #' novel germline allele will have the novel germline allele appended
19 #' to their assignment prior to searching for unmutated sequences.
20 #'
21 #' @param data a \code{data.frame} containing V allele
22 #' calls from a single subject. If \code{find_unmutated}
23 #' is \code{TRUE}, then the sample IMGT-gapped V(D)J sequence
24 #' should be provided in a column \code{"SEQUENCE_IMGT"}
25 #' @param v_call column in \code{data} with V allele calls.
26 #' Default is \code{"V_CALL"}.
27 #' @param find_unmutated if \code{TRUE}, use \code{germline_db} to
28 #' find which samples are unmutated. Not needed
29 #' if \code{allele_calls} only represent
30 #' unmutated samples.
31 #' @param germline_db named vector of sequences containing the
32 #' germline sequences named in \code{allele_calls}.
33 #' Only required if \code{find_unmutated} is \code{TRUE}.
34 #' @param novel an optional \code{data.frame} of the type
35 #' novel returned by \link{findNovelAlleles} containing
36 #' germline sequences that will be utilized if
37 #' \code{find_unmutated} is \code{TRUE}. See Details.
38 #' @param priors a numeric vector of priors for the multinomial distribution.
39 #' The \code{priors} vector must be nine values that define
40 #' the priors for the heterozygous (two allele),
41 #' trizygous (three allele), and quadrozygous (four allele)
42 #' distributions. The first two values of \code{priors} define
43 #' the prior for the heterozygous case, the next three values are for
44 #' the trizygous case, and the final four values are for the
45 #' quadrozygous case. Each set of priors should sum to one.
46 #' Note, each distribution prior is actually defined internally
47 #' by set of four numbers, with the unspecified final values
48 #' assigned to \code{0}; e.g., the heterozygous case is
49 #' \code{c(priors[1], priors[2], 0, 0)}. The prior for the
50 #' homozygous distribution is fixed at \code{c(1, 0, 0, 0)}.
51 #'
52 #' @return
53 #' A \code{data.frame} of alleles denoting the genotype of the subject with the log10
54 #' of the likelihood of each model and the log10 of the Bayes factor. The output
55 #' contains the following columns:
56 #'
57 #' \itemize{
58 #' \item \code{GENE}: The gene name without allele.
59 #' \item \code{ALLELES}: Comma separated list of alleles for the given \code{GENE}.
60 #' \item \code{COUNTS}: Comma separated list of observed sequences for each
61 #' corresponding allele in the \code{ALLELES} list.
62 #' \item \code{TOTAL}: The total count of observed sequences for the given \code{GENE}.
63 #' \item \code{NOTE}: Any comments on the inference.
64 #' \item \code{KH}: log10 likelihood that the \code{GENE} is homozygous.
65 #' \item \code{KD}: log10 likelihood that the \code{GENE} is heterozygous.
66 #' \item \code{KT}: log10 likelihood that the \code{GENE} is trizygous
67 #' \item \code{KQ}: log10 likelihood that the \code{GENE} is quadrozygous.
68 #' \item \code{K_DIFF}: log10 ratio of the highest to second-highest zygosity likelihoods.
69 #' }
70 #'
71 #' @note
72 #' This method works best with data derived from blood, where a large
73 #' portion of sequences are expected to be unmutated. Ideally, there
74 #' should be hundreds of allele calls per gene in the input.
75 #'
76 #' @seealso \link{plotGenotype} for a colorful visualization and
77 #' \link{genotypeFasta} to convert the genotype to nucleotide sequences.
78 #' See \link{inferGenotype} to infer a subject-specific genotype using
79 #' a frequency method
80 #'
81 #' @references
82 #' \enumerate{
83 #' \item Laserson U and Vigneault F, et al. High-resolution antibody dynamics of
84 #' vaccine-induced immune responses. PNAS. 2014 111(13):4928-33.
85 #' }
86 #'
87 #' @examples
88 #' # Infer IGHV genotype, using only unmutated sequences, including novel alleles
89 #' inferGenotypeBayesian(SampleDb, germline_db=GermlineIGHV, novel=SampleNovel,
90 #' find_unmutated=TRUE)
91 #'
92 #' @export
inferGenotypeBayesian <- function(data, germline_db=NA, novel=NA,
                                  v_call="V_CALL", find_unmutated=TRUE,
                                  priors=c(0.6, 0.4, 0.4, 0.35, 0.25, 0.25, 0.25, 0.25, 0.25)){
    # Visibility hack: `.` is used as the pipe placeholder further down
    . <- NULL

    allele_calls <- getAllele(data[,v_call], first=FALSE, strip_d=FALSE)
    # Find the unmutated subset, if requested
    if(find_unmutated){
        if(is.na(germline_db[1])){
            stop("germline_db needed if find_unmutated is TRUE")
        }
        if(!is.null(nrow(novel))){
            novel <- filter_(novel, ~!is.na(POLYMORPHISM_CALL)) %>%
                select_(~GERMLINE_CALL, ~POLYMORPHISM_CALL, ~NOVEL_IMGT)
            if(nrow(novel) > 0){
                # Extract novel alleles if any and add them to germline_db
                novel_gl <- novel$NOVEL_IMGT
                names(novel_gl) <- novel$POLYMORPHISM_CALL
                germline_db <- c(germline_db, novel_gl)
                # Add the novel allele calls to allele calls of the same starting allele
                for(r in seq_len(nrow(novel))){
                    ind <- grep(novel$GERMLINE_CALL[r], allele_calls, fixed=TRUE)
                    allele_calls[ind] <- allele_calls[ind] %>%
                        sapply(paste, novel$POLYMORPHISM_CALL[r], sep=",")
                }
            }
        }
        # Find unmutated sequences
        allele_calls <- findUnmutatedCalls(allele_calls,
                                           as.character(data$SEQUENCE_IMGT),
                                           germline_db)
        if(length(allele_calls) == 0){
            stop("No unmutated sequences found! Set 'find_unmutated' to 'FALSE'.")
        }
    }

    # Find which rows' calls contain which genes
    gene_regex <- allele_calls %>% strsplit(",") %>% unlist() %>%
        getGene(strip_d=FALSE) %>% unique() %>% paste("\\*", sep="")
    gene_groups <- sapply(gene_regex, grep, allele_calls, simplify=FALSE)
    names(gene_groups) <- gsub("\\*", "", gene_regex, fixed=TRUE)
    gene_groups <- gene_groups[sortAlleles(names(gene_groups))]

    # Make a table (character matrix, one row per gene) to store the resulting genotype
    GENE <- names(gene_groups)
    ALLELES <- COUNTS <- KH <- KD <- KT <- KQ <- K_DIFF <- NOTE <- rep("", length(GENE))
    TOTAL <- sapply(gene_groups, length)
    genotype <- cbind(GENE, ALLELES, COUNTS, TOTAL, NOTE, KH, KD, KT, KQ, K_DIFF)

    # For each gene, find which alleles to include
    for (g in GENE){
        # Keep only the part of the allele calls that uses the gene being analyzed
        ac <- allele_calls[gene_groups[[g]]] %>%
            strsplit(",") %>%
            lapply(function(x) x[grep(paste(g, "\\*", sep=""), x)]) %>%
            sapply(paste, collapse=",")
        t_ac <- table(ac) # table of allele calls
        potentials <- unique(unlist(strsplit(names(t_ac),","))) # potential alleles

        # One regex per potential allele, matching it at the end of a call
        # or immediately followed by a comma
        regexpotentials <- paste(gsub("\\*","\\\\*", potentials),"$",sep="")
        regexpotentials <-
            paste(regexpotentials,gsub("\\$",",",regexpotentials),sep="|")
        tmat <-
            sapply(regexpotentials, function(x) grepl(x, names(t_ac),fixed=FALSE))

        # Scalar comparison, so use the short-circuit operator
        if (length(potentials) == 1 || length(t_ac) == 1){
            seqs_expl <- t(as.data.frame(apply(t(as.matrix(tmat)), 2, function(x) x *
                                                   t_ac)))
            rownames(seqs_expl) <- names(t_ac)[1]
        }else{
            seqs_expl <- as.data.frame(apply(tmat, 2, function(x) x *
                                                 t_ac))
        }
        colnames(seqs_expl) <- potentials

        # Add low (fake) counts for alleles that have no row of their own.
        # The column names are captured once, before any row is appended.
        for (allele in colnames(seqs_expl)){
            if(sum(rownames(seqs_expl) %in% paste(allele)) == 0){
                seqs_expl <- rbind(seqs_expl, rep(0, ncol(seqs_expl)))
                rownames(seqs_expl)[nrow(seqs_expl)] <- paste(allele)
                seqs_expl[rownames(seqs_expl) %in% paste(allele), paste(allele)] <- 0.01
            }
        }

        # Build ratio dependent allele count distribution of multi assigned reads
        seqs_expl_single <- seqs_expl[grep(',',rownames(seqs_expl),invert = TRUE),]

        seqs_expl_multi <- seqs_expl[grep(',',rownames(seqs_expl),invert = FALSE),]
        if(is.null(nrow(seqs_expl_multi))){
            # The subset collapsed to a plain vector; restore a one-row table
            seqs_expl_multi <- t(as.data.frame(seqs_expl_multi))
            rownames(seqs_expl_multi) <- grep(',',rownames(seqs_expl),invert = FALSE,value = TRUE)
        }

        if(!is.null(nrow(seqs_expl_single)) && nrow(seqs_expl_single) !=0 && nrow(seqs_expl_single) != nrow(seqs_expl)){
            if(nrow(seqs_expl_multi)>1){
                # Process the shortest multi-allele row names first
                seqs_expl_multi <- seqs_expl_multi[order(nchar(row.names(seqs_expl_multi))),]
            }
            for (m in seq_len(nrow(seqs_expl_multi))){
                genes <- unlist(strsplit(row.names(seqs_expl_multi)[m],','))
                counts <- seqs_expl_single[rownames(seqs_expl_single) %in% genes,genes]
                counts <- colSums(counts)
                counts_to_distribute <- seqs_expl_multi[m,genes]

                # Split the multi-assigned counts in proportion to the
                # current single-allele counts
                new_counts <- counts+((counts_to_distribute*counts)/sum(counts))
                for(i in seq_along(new_counts)){
                    gene_tmp <- names(new_counts)[i]
                    seqs_expl_single[rownames(seqs_expl_single) %in% gene_tmp,gene_tmp] <- new_counts[i]
                }
            }
        }

        # Fall back to the full table when no single-allele rows are available
        seqs_expl <- if(is.null(nrow(seqs_expl_single)) || nrow(seqs_expl_single) ==0 ){seqs_expl}else{seqs_expl_single}
        seqs_expl <- round(seqs_expl)
        if(sum(rowSums(seqs_expl) == 0 ) != 0){
            seqs_expl <- seqs_expl[rowSums(seqs_expl)!= 0, ]
        }

        allele_tot <- sort(apply(seqs_expl, 2, sum),decreasing=TRUE)
        len <- min(length(allele_tot),4)
        # Pad to four counts so that all four zygosity models can be scored
        probs <- get_probabilites_with_priors(sort(c(allele_tot,rep(0,4-len)),decreasing = TRUE)[1:4],priors = priors)
        probs[probs==-Inf] <- -1000
        names(probs) <- c('H','D','T','Q')

        k <- sort(as.numeric(probs),decreasing = TRUE)

        # Difference between the best and second-best log10 likelihoods
        probs <- c(probs,k[1]-k[2])
        names(probs)[5] <- "K_DIFF"

        gene_row <- genotype[, "GENE"] == g
        genotype[gene_row, "ALLELES"] <- paste(gsub("[^d\\*]*[d\\*]",
                                                    "", names(allele_tot)[seq_len(len)]), collapse = ",")
        genotype[gene_row, "COUNTS"] <- paste(as.numeric(allele_tot)[seq_len(len)],
                                              collapse = ",")
        genotype[gene_row, "KH"] <- probs[1]
        genotype[gene_row, "KD"] <- probs[2]
        genotype[gene_row, "KT"] <- probs[3]
        genotype[gene_row, "KQ"] <- probs[4]
        genotype[gene_row, "K_DIFF"] <- probs[5]
    }

    geno <- as.data.frame(genotype, stringsAsFactors = FALSE)

    # Check for indistinguishable calls
    if(find_unmutated == TRUE){
        seqs <- genotypeFasta(geno, germline_db)
        dist_mat <- seqs %>%
            sapply(function(x) sapply((getMutatedPositions(seqs, x)), length))
        rownames(dist_mat) <- colnames(dist_mat)
        # Blank the diagonal so a sequence is never flagged against itself
        for (i in seq_len(nrow(dist_mat))){ dist_mat[i,i] <- NA }
        same <- which(dist_mat == 0, arr.ind=TRUE)
        if (nrow(same) > 0 ) {
            for (r in seq_len(nrow(same))) {
                inds <- as.vector(same[r,])
                geno[getGene(rownames(dist_mat)[inds][1]),]$NOTE <-
                    paste(rownames(dist_mat)[inds], collapse=" and ") %>%
                    paste("Cannot distinguish", .)
            }
        }
    }
    rownames(geno) <- NULL
    return(geno)
}
266
267
# Score the four zygosity models for a vector of allele counts
#
# Each model is a length-four vector of expected allele proportions built from
# `priors` (the homozygous model is fixed at c(1, 0, 0, 0)). Every model is
# smoothed with `epsilon`, renormalised, and evaluated with `ddirichlet` using
# the parameters `alpha_dirichlet + X`.
#
# @param X                a vector of counts
# @param alpha_dirichlet  alpha parameter for dirichlet distribution
# @param epsilon          value added to every model entry before renormalising
# @param priors           a vector of nine priors: two for the heterozygous,
#                         three for the trizygous and four for the
#                         quadrozygous model
#
# @return log10 of the four model likelihoods, in the order
#         homozygous, heterozygous, trizygous, quadrozygous
get_probabilites_with_priors <- function(X, alpha_dirichlet=c(0.5,0.5,0.5,0.5)*2,
                                         epsilon=0.01,
                                         priors=c(0.5,0.5,0.33,0.33,0.33,0.25,0.25,0.25,0.25)){
    observed <- sort(X, decreasing=TRUE)

    # Hypotheses, from one to four alleles
    models <- list(c(1, 0, 0, 0),
                   c(priors[1], priors[2], 0, 0),
                   c(priors[3], priors[4], priors[5], 0),
                   c(priors[6], priors[7], priors[8], priors[9]))
    smoothed <- lapply(models, function(h) (h + epsilon) / sum(h + epsilon))

    # Likelihood of every model for a given vector of counts
    score_models <- function(obs) {
        vapply(smoothed,
               function(p) ddirichlet(p, alpha_dirichlet + obs),
               numeric(1))
    }

    lik <- score_models(observed)
    # While the second-best model evaluates to exactly zero, shrink the
    # counts tenfold and rescore so that a ratio can still be formed
    while (sort(lik, decreasing=TRUE)[2] == 0) {
        observed <- observed / 10
        lik <- score_models(observed)
    }

    return(log10(lik))
}
0 #' Human IGHV germlines
1 #'
2 #' A \code{character} vector of all 344 human IGHV germline gene segment alleles
3 #' in IMGT Gene-db release 201408-4.
4 #'
5 #' @name germline_ighv
6 #' @docType data
7 #' @format Values correspond to IMGT-gaped nuceltoide sequences (with
8 #' nucleotides capitalized and gaps represented by ".") while names correspond
9 #' to stripped-down IMGT allele names (e.g. "IGHV1-18*01").
10 #'
11 #' @references Xochelli \emph{et al}. (2014) Immunoglobulin heavy variable
12 #' (IGHV) genes and alleles: new entities, new names and implications for
13 #' research and prognostication in chronic lymphocytic leukaemia.
14 #' \emph{Immunogenetics}. 67(1):61-6.
15 #' @keywords data
16 NULL
17
18
19 #' Example human Rep-Seq data
20 #'
21 #' Example VDJ-rearranged immunoglobulin Rep-Seq sequences derived from a single
22 #' individual (PGP1), sequenced on the Roche 454 platform, and thought by
23 #' IMGT/V-QUEST to utilize IGHV1 family alleles.
24 #'
25 #' @name sample_db
26 #' @docType data
27 #' @format A \code{data.frame} where rows correspond to unique VDJ sequences and
28 #' columns include:
29 #' \itemize{
30 #' \item IMGT-gapped nucleotide sequence (\code{"SEQUENCE_IMGT"})
31 #' \item IMGT/V-QUEST allele calls (\code{"V_CALL"}, \code{"D_CALL"}, and
32 #' \code{"J_CALL"})
33 #' \item Junction length (\code{"JUNCTION_LENGTH"})
34 #' }
35 #'
36 #' @references Gadala-Maria \emph{et al}. (2015) Automated analysis of
37 #' high-throughput B cell sequencing data reveals a high frequency of novel
38 #' immunoglobulin V gene segment alleles. \emph{PNAS}. 112(8):E862-70.
39 #' @keywords data
40 NULL
41
42 #' Example of Analyzed Rep-Seq data
43 #'
44 #' Example VDJ-rearranged immunoglobulin Rep-Seq sequences derived from a single
45 #' individual (PGP1), sequenced on the Roche 454 platform, and thought by
46 #' IMGT/V-QUEST to utilize IGHV1 family alleles, as processed by
47 #' \link{findNovelAlleles}.
48 #'
49 #' @name novel_df
50 #' @docType data
51 #' @format A \code{data.frame} where rows correspond to alleles checked for
52 #' polymorphisms and columns give results as well as paramaters used to run
53 #' the test.
54 #'
55 #' @references Gadala-Maria \emph{et al}. (2015) Automated analysis of
56 #' high-throughput B cell sequencing data reveals a high frequency of novel
57 #' immunoglobulin V gene segment alleles. \emph{PNAS}. 112(8):E862-70.
58 #' @keywords data
59 NULL
60
61 #' Example of an Inferred Genotype
62 #'
63 #' Example VDJ-rearranged immunoglobulin Rep-Seq sequences derived from a single
64 #' individual (PGP1), sequenced on the Roche 454 platform, and thought by
65 #' IMGT/V-QUEST to utilize IGHV1 family alleles, as processed by
66 #' \link{findNovelAlleles} and \link{inferGenotype}
67 #'
68 #' @name genotype
69 #' @docType data
70 #' @format A \code{data.frame} where rows correspond to genes carried by an
71 #' individual and columns lists the alleles of those genes and their counts.
72 #'
73 #' @references Gadala-Maria \emph{et al}. (2015) Automated analysis of
74 #' high-throughput B cell sequencing data reveals a high frequency of novel
75 #' immunoglobulin V gene segment alleles. \emph{PNAS}. 112(8):E862-70.
76 #' @keywords data
0 #' Human IGHV germlines
1 #'
2 #' A \code{character} vector of all 344 human IGHV germline gene segment alleles
3 #' in IMGT/GENE-DB release 201408-4.
4 #'
5 #' @name GermlineIGHV
6 #' @docType data
7 #' @format Values correspond to IMGT-gapped nucleotide sequences (with
8 #' nucleotides capitalized and gaps represented by ".") while names correspond
9 #' to stripped-down IMGT allele names (e.g. "IGHV1-18*01").
10 #'
11 #' @references
12 #' \enumerate{
13 #' \item Xochelli, et al. (2014) Immunoglobulin heavy variable (IGHV) genes and
14 #' alleles: new entities, new names and implications for research and
15 #' prognostication in chronic lymphocytic leukaemia. Immunogenetics. 67(1):61-6.
16 #' }
17 #'
18 #' @keywords data
19 NULL
20
21
22 #' Example human immune repertoire data
23 #'
24 #' A \code{data.frame} of example V(D)J immunoglobulin sequences derived from a
25 #' single individual (PGP1), sequenced on the Roche 454 platform, and assigned by
26 #' IMGT/HighV-QUEST to IGHV1 family alleles.
27 #'
28 #' @name SampleDb
29 #' @docType data
30 #' @format A \code{data.frame} where rows correspond to unique V(D)J sequences and
31 #' columns include:
32 #' \itemize{
33 #' \item \code{"SEQUENCE_IMGT"}: IMGT-gapped V(D)J nucleotide sequence.
34 #' \item \code{"V_CALL"}: IMGT/HighV-QUEST V segment allele calls.
35 #' \item \code{"D_CALL"}: IMGT/HighV-QUEST D segment allele calls.
36 #' \item \code{"J_CALL"}: IMGT/HighV-QUEST J segment allele calls.
37 #' \item \code{"JUNCTION_LENGTH"}: Junction region length.
38 #' }
39 #'
40 #' @references
41 #' \enumerate{
42 #' \item Gadala-Maria, et al. (2015) Automated analysis of high-throughput B cell
43 #' sequencing data reveals a high frequency of novel immunoglobulin V gene
44 #' segment alleles. PNAS. 112(8):E862-70.
45 #' }
46 #'
47 #' @keywords data
48 NULL
49
50 #' Example novel allele detection results
51 #'
52 #' A \code{data.frame} of novel allele detection results from \link{findNovelAlleles}.
53 #' Source data was a collection of V(D)J immunoglobulin sequences derived from a single
54 #' individual (PGP1), sequenced on the Roche 454 platform, and assigned by
55 #' IMGT/HighV-QUEST to IGHV1 family alleles.
56 #'
57 #' @name SampleNovel
58 #' @docType data
59 #' @format A \code{data.frame} where rows correspond to alleles checked for
60 #' polymorphisms and columns give results as well as parameters used to run
61 #' the test.
62 #'
63 #' @seealso See \link{findNovelAlleles} for detailed column descriptions.
64 #'
65 #' @references
66 #' \enumerate{
67 #' \item Gadala-Maria, et al. (2015) Automated analysis of high-throughput B cell
68 #' sequencing data reveals a high frequency of novel immunoglobulin V gene
69 #' segment alleles. PNAS. 112(8):E862-70.
70 #' }
71 #'
72 #' @keywords data
73 NULL
74
75 #' Example genotype inference results
76 #'
77 #' A \code{data.frame} of genotype inference results from \link{inferGenotype}
78 #' after novel allele detection via \link{findNovelAlleles}.
79 #' Source data was a collection of V(D)J immunoglobulin sequences derived from a single
80 #' individual (PGP1), sequenced on the Roche 454 platform, and assigned by
81 #' IMGT/HighV-QUEST to IGHV1 family alleles.
82 #'
83 #' @name SampleGenotype
84 #' @docType data
85 #' @format A \code{data.frame} where rows correspond to genes carried by an
86 #' individual and columns lists the alleles of those genes and their counts.
87 #'
88 #' @seealso See \link{inferGenotype} for detailed column descriptions.
89 #'
90 #' @references
91 #' \enumerate{
92 #' \item Gadala-Maria, et al. (2015) Automated analysis of high-throughput B cell
93 #' sequencing data reveals a high frequency of novel immunoglobulin V gene
94 #' segment alleles. PNAS. 112(8):E862-70.
95 #' }
96 #'
97 #' @keywords data
7798 NULL
# Find non triplet gaps in a nucleotide sequence
#
# The sequence is cut into consecutive three-character codons starting at
# position 1 (the last codon may be shorter), and the gap characters
# ("." and "-") in each codon are counted.
#
# @param seq a single nucleotide sequence string
#
# @return TRUE if any codon holds a number of gap characters that is not a
#         multiple of three, FALSE otherwise. An empty string returns FALSE.
hasNonImgtGaps <- function (seq) {
    # An empty sequence has no codons; without this guard seq(1, -2, 3)
    # below fails with "wrong sign in 'by' argument"
    if (isTRUE(nchar(seq) == 0L)) {
        return(FALSE)
    }
    # Round the length up to a whole number of codons
    len <- ceiling(nchar(seq)/3)*3
    codons <- substring(seq, seq(1, len-2, 3), seq(3, len, 3))
    # Strip everything except gap characters and count what is left
    gaps_lengths <- nchar(gsub("[^\\.\\-]", "", codons))
    any(gaps_lengths %% 3 != 0)
}
11
# Compare two IMGT gapped sequences and find AA mutations
#
# Both nucleotide sequences are translated with alakazam::translateDNA and
# compared residue by residue.
#
# @param ref_imgt   reference nucleotide sequence (single string, no "N").
# @param novel_imgt novel nucleotide sequence (single string, no "N").
# @return a character vector of mutations formatted as
#         "<position><reference residue>><novel residue>", or NULL when the
#         translations are identical. Positions present in only one of the
#         two translations are reported with "-" for the missing residue.
# Stops if either input contains "N"; warns if either input contains gaps
# that are not codon-aligned.
getMutatedAA <- function(ref_imgt, novel_imgt) {
    if (grepl("N", ref_imgt)) {
        stop("Unexpected N in ref_imgt")
    }
    if (grepl("N", novel_imgt)) {
        stop("Unexpected N in novel_imgt")
    }

    if (hasNonImgtGaps(ref_imgt)) {
        warning("Non IMGT gaps found in ref_imgt")
    }

    if (hasNonImgtGaps(novel_imgt)) {
        warning("Non IMGT gaps found in novel_imgt")
    }

    ref_aa <- strsplit(alakazam::translateDNA(ref_imgt), "")[[1]]
    novel_aa <- strsplit(alakazam::translateDNA(novel_imgt), "")[[1]]

    # Compare only the overlapping residues. Comparing vectors of unequal
    # length directly would recycle the shorter one and match trailing
    # positions against unrelated residues.
    n_common <- min(length(ref_aa), length(novel_aa))
    n_total <- max(length(ref_aa), length(novel_aa))
    common <- seq_len(n_common)
    diff_idx <- which(ref_aa[common] != novel_aa[common])
    # Every position beyond the end of the shorter translation is a difference
    if (n_total > n_common) {
        diff_idx <- c(diff_idx, seq.int(n_common + 1L, n_total))
    }

    mutations <- c()
    if (length(diff_idx) > 0) {
        ref_res <- ref_aa[diff_idx]
        novel_res <- novel_aa[diff_idx]
        # Indexing past the end yields NA; show those as a gap
        mutations <- paste0(diff_idx,
                            replace(ref_res, is.na(ref_res), "-"),
                            ">",
                            replace(novel_res, is.na(novel_res), "-"))
    }
    mutations
}
39
40
41 #' Generate evidence
42 #'
43 #' \code{generateEvidence} builds a table of evidence metrics for the final novel V
44 #' allele detection and genotyping inferences.
45 #'
46 #' @param data a \code{data.frame} containing sequence data that has been
47 #' passed through \link{reassignAlleles} to correct the allele
48 #' assignments.
49 #' @param novel the \code{data.frame} returned by \link{findNovelAlleles}.
50 #' @param genotype the \code{data.frame} of alleles generated with \link{inferGenotype}
51 #' denoting the genotype of the subject.
52 #' @param genotype_db a vector of named nucleotide germline sequences in the genotype.
53 #' Returned by \link{genotypeFasta}.
54 #' @param germline_db the original uncorrected germline database used by
55 #' \link{findNovelAlleles} to identify novel alleles.
56 #' @param fields character vector of column names used to split the data to
57 #' identify novel alleles, if any. If \code{NULL} then the data is
58 #' not divided by grouping variables.
59 #'
60 #' @return
61 #' Returns the \code{genotype} input \code{data.frame} with the following additional columns
62 #' providing supporting evidence for each inferred allele:
63 #'
64 #' \itemize{
65 #' \item \code{FIELD_ID}: Data subset identifier, defined with the input parameter \code{fields}.
66 #' \item A variable number of columns, specified with the input parameter \code{fields}.
67 #' \item \code{POLYMORPHISM_CALL}: The novel allele call.
68 #' \item \code{NOVEL_IMGT}: The novel allele sequence.
69 #' \item \code{CLOSEST_REFERENCE}: The closest reference gene and allele in
70 #' the \code{germline_db} database.
71 #' \item \code{CLOSEST_REFERENCE_IMGT}: Sequence of the closest reference gene and
72 #' allele in the \code{germline_db} database.
73 #' \item \code{GERMLINE_CALL}: The input (uncorrected) V call.
74 #' \item \code{GERMLINE_IMGT}: Germline sequence for \code{GERMLINE_CALL}.
75 #' \item \code{NT_DIFF}: Number of nucleotides that differ between the new allele and
76 #' the closest reference (\code{CLOSEST_REFERENCE}) in the \code{germline_db} database.
77 #' \item \code{NT_SUBSTITUTIONS}: A comma separated list of specific nucleotide
78 #' differences (e.g. \code{112G>A}) in the novel allele.
79 #' \item \code{AA_DIFF}: Number of amino acids that differ between the new allele and the closest
80 #' reference (\code{CLOSEST_REFERENCE}) in the \code{germline_db} database.
81 #' \item \code{AA_SUBSTITUTIONS}: A comma separated list with specific amino acid
82 #' differences (e.g. \code{96A>N}) in the novel allele.
83 #' \item \code{SEQUENCES}: Number of sequences unambiguously assigned to this allele.
84 #' \item \code{UNMUTATED_SEQUENCES}: Number of records with the unmutated novel allele sequence.
85 #' \item \code{UNMUTATED_FREQUENCY}: Proportion of records with the unmutated novel allele
86 #' sequence (\code{UNMUTATED_SEQUENCES / SEQUENCES}).
87 #' \item \code{ALLELIC_PERCENTAGE}: Percentage at which the (unmutated) allele is observed
88 #' in the sequence dataset compared to other (unmutated) alleles.
89 #' \item \code{UNIQUE_JS}: Number of unique J sequences found associated with the
90 #' novel allele. The sequences are those that have been unambiguously assigned
91 #' to the novel allele (\code{POLYMORPHISM_CALL}).
92 #' \item \code{UNIQUE_CDR3S}: Number of unique CDR3s associated with the inferred allele.
93 #' The sequences are those that have been unambiguously assigned to the
94 #' novel allele (\code{POLYMORPHISM_CALL}).
95 #' \item \code{MUT_MIN}: Minimum mutation considered by the algorithm.
96 #' \item \code{MUT_MAX}: Maximum mutation considered by the algorithm.
97 #' \item \code{POS_MIN}: First position of the sequence considered by the algorithm (IMGT numbering).
98 #' \item \code{POS_MAX}: Last position of the sequence considered by the algorithm (IMGT numbering).
99 #' \item \code{Y_INTERCEPT}: The y-intercept above which positions were considered
100 #' potentially polymorphic.
101 #' \item \code{ALPHA}: Significance threshold to be used when constructing the
102 #' confidence interval for the y-intercept.
103 #' \item \code{MIN_SEQS}: Input \code{min_seqs}. The minimum number of total sequences
104 #' (within the desired mutational range and nucleotide range) required
105 #' for the samples to be considered.
106 #' \item \code{J_MAX}: Input \code{j_max}. The maximum fraction of sequences perfectly
107 #' aligning to a potential novel allele that are allowed to utilize a particular
108 #' combination of junction length and J gene.
109 #' \item \code{MIN_FRAC}: Input \code{min_frac}. The minimum fraction of sequences that must
110 #' have usable nucleotides in a given position for that position to be considered.
111 #' \item \code{NOTE}: Comments regarding the novel allele inference.
112 #' }
113 #'
114 #' @seealso
115 #' See \link{findNovelAlleles}, \link{inferGenotype} and \link{genotypeFasta}
116 #' for generating the required input.
117 #'
118 #' @examples
119 #' \donttest{
120 #' # Generate input data
121 #' novel <- findNovelAlleles(SampleDb, GermlineIGHV)
122 #' genotype <- inferGenotype(SampleDb, find_unmutated=TRUE, germline_db=GermlineIGHV,
123 #' novel=novel)
124 #' genotype_db <- genotypeFasta(genotype, GermlineIGHV, novel)
125 #' data_db <- reassignAlleles(SampleDb, genotype_db)
126 #'
127 #' # Assemble evidence table
128 #' evidence <- generateEvidence(data_db, novel, genotype, genotype_db, GermlineIGHV)
129 #' }
130 #'
131 #' @export
generateEvidence <- function(data, novel, genotype, genotype_db,
                             germline_db, fields=NULL) {
    # Visibility hack
    . <- NULL

    # Define set of sequences containing genotype and uncorrected calls.
    # Entries of genotype_db take precedence over same-named germline_db entries.
    germline_set <- c(germline_db[!names(germline_db) %in% names(genotype_db)],
                      genotype_db)

    # Find closest reference
    #
    # seq          : single named sequence to compare.
    # allele_calls : names of the candidate references in ref_germ.
    # ref_germ     : named vector of reference sequences.
    # exclude_self : drop the reference carrying the same name as seq.
    # multiple     : if FALSE, stop when ties remain after all tie-breakers.
    # Returns the name(s) of the reference(s) with the fewest mutations.
    .findClosestReference <- function(seq, allele_calls, ref_germ,
                                      exclude_self=FALSE, multiple=FALSE) {
        closest <- getMutCount(seq,
                               paste(allele_calls, collapse=","),
                               ref_germ)
        mut_counts <- unlist(closest)
        min_dist <- min(mut_counts)
        closest_idx <- which(mut_counts == min_dist)
        closest_names <- unique(allele_calls[closest_idx])
        # Scalar condition: use short-circuit && rather than elementwise &
        if (exclude_self && names(seq) %in% closest_names) {
            warning("Excluding self")
            closest_names <- closest_names[closest_names!=names(seq)] # not self
        }
        if (length(closest_names) > 1) {
            warning(paste0("More than one closest reference found for ",
                           names(seq),": ",
                           paste(closest_names, collapse=",")))
            # Keep the one with less mutated positions
            # (counted as the number of "_" characters in the allele name)
            mut_pos_count <- sapply(gsub("[^_]","",closest_names), nchar)
            closest_names <- closest_names[mut_pos_count==min(mut_pos_count)]
            # Pick same length
            # NOTE(review): the target length is looked up as
            # ref_germ[names(seq)]; if that name is absent from ref_germ the
            # comparison is NA and this tie-breaker is skipped. Confirm
            # whether nchar(seq) was intended.
            if (length(closest_names) > 1 ) {
                idx <- which(
                    sapply(ref_germ[closest_names],nchar) == nchar(ref_germ[names(seq)])
                )
                if (length(idx) > 0 ) {
                    closest_names <- closest_names[idx]
                }
            }
            # Pick same allele
            if (length(closest_names) > 1 ) {
                idx <- which(
                    getAllele(closest_names) == gsub("_.+", "", getAllele(names(seq)))
                )
                if (length(idx) > 0 ) {
                    closest_names <- closest_names[idx]
                }
            }
            # Pick not duplicated
            if (length(closest_names) > 1 ) {
                idx <- !grepl("D\\*", closest_names)
                if (any(idx)) {
                    closest_names <- closest_names[idx]
                }
            }
            # If still more than one, err and TODO
            if (length(closest_names) > 1 && !multiple) {
                msg <- paste0("Multiple closest reference found for ",
                              names(seq),":\n",
                              paste(closest_names, collapse=","))
                stop(msg)
            }
            warning(paste0("Use: ",
                           paste(closest_names, collapse=","),
                           " (less mutated positions, not D, same length, same allele)"))

        }
        closest_names
    }

    # Subset to novel alleles: one row per gene/allele pair of the genotype,
    # keeping only the alleles reported in `novel`
    final_gt <- genotype %>%
        dplyr::group_by(.data$GENE) %>%
        dplyr::filter(!duplicated(.data$ALLELES)) %>%
        dplyr::ungroup() %>%
        dplyr::mutate(ALLELES=strsplit(as.character(.data$ALLELES), ","),
                      COUNTS=strsplit(as.character(.data$COUNTS), ",")) %>%
        tidyr::unnest(.data$ALLELES, .data$COUNTS) %>%
        dplyr::mutate(POLYMORPHISM_CALL=paste0(.data$GENE, "*" , .data$ALLELES)) %>%
        dplyr::filter(.data$POLYMORPHISM_CALL %in% novel$POLYMORPHISM_CALL) %>%
        dplyr::rename(ALLELE="ALLELES")


    # Add info from novel; the genotype NOTE is kept apart as NOTE_GT so it
    # can be merged with the novel NOTE at the end
    final_gt <- dplyr::inner_join(dplyr::rename(final_gt, NOTE_GT="NOTE"),
                                  novel,
                                  by=c(fields, "POLYMORPHISM_CALL"))

    # Add message if the same novel img sequence found from
    # different starting alleles, these will be novel imgt sequences
    # with more than one polymorphism call
    final_gt <- final_gt %>%
        dplyr::group_by(.data$NOVEL_IMGT) %>%
        dplyr::mutate(NUM_CALLS=length(unique(.data$POLYMORPHISM_CALL))) %>%
        dplyr::ungroup()
    idx_mult <- which(final_gt$NUM_CALLS > 1)
    final_gt$NUM_CALLS <- NULL
    if (length(idx_mult) > 0) {
        final_gt$NOTE_GT[idx_mult] <- paste0(
            final_gt$NOTE_GT[idx_mult],
            " Found multiple polymorphism calls for the same NOVEL_IMGT.")
    }


    if (nrow(final_gt)>0) {

        # Compute the evidence columns for a single row of final_gt
        .addEvidence <- function(df, germline_set, germline_db) {
            polymorphism <- df[['POLYMORPHISM_CALL']]
            novel_imgt <- df[["NOVEL_IMGT"]]
            names(novel_imgt) <- polymorphism

            v_call_genotyped <- data[["V_CALL_GENOTYPED"]]

            # Rows without a genotyped call (NA) are not counted; otherwise a
            # single NA would turn the count into NA and break the
            # SEQUENCES > 0 test below
            SEQUENCES <- sum(v_call_genotyped == polymorphism, na.rm=TRUE)
            df[["SEQUENCES"]] <- SEQUENCES
            closest_ref_input <- .findClosestReference(novel_imgt,
                                                       names(germline_db),
                                                       germline_db,
                                                       exclude_self=FALSE)
            closest_ref <- .findClosestReference(novel_imgt,
                                                 names(germline_set),
                                                 germline_set,
                                                 exclude_self=FALSE, multiple=TRUE)

            if (all(getGene(closest_ref_input) != getGene(closest_ref))) {
                warning("closest reference gene difference")
            }

            if (all(closest_ref != polymorphism)) {
                # closest_ref may hold several names (multiple=TRUE);
                # collapse them so a single message is emitted
                warning(paste0("closest reference allele (",
                               paste(closest_ref, collapse=",")
                               ,") different from POLYMORPHISM_CALL allele (",
                               polymorphism,")"))
            }

            ## TODO: this still not clear.
            ## Any diff using sequence_imgt instead of germline[[polymorphism]]?
            df[["CLOSEST_REFERENCE"]] <- closest_ref_input

            closest_ref_seq <- germline_set[[closest_ref_input]]

            nt_diff <- unlist(getMutatedPositions(novel_imgt, closest_ref_seq))
            nt_diff_string <- ""
            # Positions of the reference beyond the end of the novel allele
            # are counted as differences
            if (nchar(novel_imgt) < nchar(closest_ref_seq)) {
                nt_diff <- c(nt_diff, (nchar(novel_imgt)+1):nchar(closest_ref_seq))
            }
            if (length(nt_diff) > 0 ) {
                ref_nt <- strsplit(closest_ref_seq,"")[[1]][nt_diff]
                novel_nt <- strsplit(germline_set[[polymorphism]],"")[[1]][nt_diff]
                nt_diff_string <- paste(paste(
                    nt_diff,
                    ref_nt,
                    ">",
                    replace(novel_nt, is.na(novel_nt), "-"),
                    sep=""), collapse=",")
            }

            df[["NT_DIFF"]] <- length(nt_diff)
            df[["NT_SUBSTITUTIONS"]] <- nt_diff_string

            diff_aa <- getMutatedAA(closest_ref_seq, germline_set[[polymorphism]])

            if (length(diff_aa)>0) {
                df[["AA_DIFF"]] <- length(diff_aa)
                df[["AA_SUBSTITUTIONS"]] <- paste(diff_aa,collapse=",")
            } else {
                df[["AA_DIFF"]] <- 0
                df[["AA_SUBSTITUTIONS"]] <- ""
            }

            df[["COUNTS"]] <- as.numeric(df[["COUNTS"]])
            df[["TOTAL"]] <- as.numeric(df[["TOTAL"]])
            df[["UNMUTATED_SEQUENCES"]] <- as.numeric(df[["COUNTS"]])
            df[["UNMUTATED_FREQUENCY"]] <- as.numeric(df[["COUNTS"]])/SEQUENCES

            df[["ALLELIC_PERCENTAGE"]] <- 100*df[["UNMUTATED_SEQUENCES"]]/as.numeric(df[["TOTAL"]])

            if (SEQUENCES > 0) {
                df[["UNIQUE_JS"]] <- data %>%
                    dplyr::filter(.data$V_CALL_GENOTYPED == polymorphism) %>%
                    dplyr::distinct(.data$J_CALL) %>%
                    nrow()
                df[["UNIQUE_CDR3S"]] <- data %>%
                    dplyr::filter(.data$V_CALL_GENOTYPED == polymorphism) %>%
                    dplyr::distinct(translateDNA(.data$JUNCTION, trim=TRUE)) %>%
                    nrow()
            } else {
                df[["UNIQUE_JS"]] <- NA
                df[["UNIQUE_CDR3S"]] <- NA
            }

            # Add closest germline
            df[["CLOSEST_REFERENCE_IMGT"]] <- cleanSeqs(closest_ref_seq)

            data.frame(df, stringsAsFactors=FALSE)
        }

        # Apply row by row, then merge the genotype and novel notes
        final_gt <- final_gt %>%
            dplyr::rowwise() %>%
            do(.addEvidence(., germline_set=germline_set, germline_db=germline_db)) %>%
            dplyr::mutate(NOTE=trimws(paste(.data$NOTE_GT, .data$NOTE, sep=" "))) %>%
            dplyr::select(-c("NOTE_GT"))
    }

    return(final_gt)
}
66 #' align to each germline allele in order to determine which positions
77 #' might be polymorphic.
88 #'
9 #' @details A \code{data.frame} in Change-O format contains the following
10 #' columns:
11 #' \itemize{
12 #' \item \code{"SEQUENCE_IMGT"} containing the IMGT-gapped nucleotide sequence
13 #' \item \code{"V_CALL"} containing the IMGT/V-QUEST V allele call(s)
14 #' \item \code{"J_CALL"} containing the IMGT/V-QUEST J allele call(s)
15 #' \item \code{"JUNCTION_LENGTH"} containing the junction length
16 #' }
179 #' The TIgGER allele-finding algorithm, briefly, works as follows:
1810 #' Mutations are determined through comparison to the provided germline.
1911 #' Mutation frequency at each *position* is determined as a function of
2315 #' against by ensuring that sequences perfectly matching the potential novel
2416 #' allele utilize a wide range of combinations of J gene and junction length.
2517 #'
26 #' @param clip_db a \code{data.frame} in Change-O format. See details.
18 #' @param data a \code{data.frame} in Change-O format. See details.
2719 #' @param germline_db a vector of named nucleotide germline sequences
28 #' matching the V calls in \code{clip_db}
29 #' @param v_call name of the column in clip_db with V allele calls.
20 #' matching the V calls in \code{data}.
21 #' @param v_call name of the column in \code{data} with V allele calls.
3022 #' Default is V_CALL.
3123 #' @param germline_min the minimum number of sequences that must have a
3224 #' particular germline allele call for the allele to
3931 #' be considered by the algorithm
4032 #' @param pos_range the range of IMGT-numbered positions that should be
4133 #' considered by the algorithm
42 #' @param alpha the alpha cutoff to be used when constructing the
43 #' confidence interval for the y-intercept
44 #' @param y_intercept the y-intercept above which positions should be
34 #' @param alpha the alpha value used for determining whether the
35 #' fit y-intercept is greater than the \code{y_intercept}
36 #' threshold
37 #' @param y_intercept the y-intercept threshold above which positions should be
4538 #' considered potentially polymorphic
4639 #' @param j_max the maximum fraction of sequences perfectly aligning
4740 #' to a potential novel allele that are allowed to
5548 #' position to be considered
5649 #' @param nproc the number of processors to use
5750 #'
58 #' @return a \code{data.frame} with a row for each known allele analyzed.
51 #' @return
52 #' A \code{data.frame} with a row for each known allele analyzed.
5953 #' Besides metadata on the parameters used in the search, each row will have
6054 #' either a note as to where the polymorphism-finding algorithm exited or a
61 #' nucleotide sequence for the predicted novel allele.
55 #' nucleotide sequence for the predicted novel allele, along with columns providing
56 #' additional evidence.
57 #'
58 #' The output contains the following columns:
59 #' \itemize{
60 #' \item \code{GERMLINE_CALL}: The input (uncorrected) V call.
61 #' \item \code{NOTE}: Comments regarding the inference.
62 #' \item \code{POLYMORPHISM_CALL}: The novel allele call.
63 #' \item \code{NT_SUBSTITUTIONS}: Mutations identified in the novel allele, relative
64 #' to the reference germline (\code{GERMLINE_CALL})
65 #' \item \code{NOVEL_IMGT}: The novel allele sequence.
66 #' \item \code{NOVEL_IMGT_COUNT}: The number of times the sequence \code{NOVEL_IMGT}
67 #' is found in the input data. Considers the subsequence of \code{NOVEL_IMGT}
68 #' in the \code{pos_range}.
69 #' \item \code{NOVEL_IMGT_UNIQUE_J}: Number of distinct J calls associated to \code{NOVEL_IMGT}
70 #' in the input data. Considers the subsequence of \code{NOVEL_IMGT} in the \code{pos_range}.
71 #' \item \code{NOVEL_IMGT_UNIQUE_CDR3}: Number of distinct CDR3 sequences associated
72 #' with \code{NOVEL_IMGT} in the input data. Considers the subsequence of \code{NOVEL_IMGT}
73 #' in the \code{pos_range}.
74 #' \item \code{PERFECT_MATCH_COUNT}: Final number of sequences retained to call the new
75 #' allele. These are unique sequences that have V segments that perfectly match
76 #' the predicted germline in the \code{pos_range}.
77 #' \item \code{PERFECT_MATCH_FREQ}: \code{PERFECT_MATCH_COUNT / GERMLINE_CALL_COUNT}
78 #' \item \code{GERMLINE_CALL_COUNT}: The number of sequences with the \code{GERMLINE_CALL}
79 #' in the input data that were initially considered for the analysis.
80 #' \item \code{GERMLINE_CALL_FREQ}: The fraction of sequences with the \code{GERMLINE_CALL}
81 #' in the input data initially considered for the analysis.
82 #' \item \code{GERMLINE_IMGT}: Germline sequence for \code{GERMLINE_CALL}.
83 #' \item \code{GERMLINE_IMGT_COUNT}: The number of times the \code{GERMLINE_IMGT}
84 #' sequence is found in the input data.
85 #' \item \code{MUT_MIN}: Minimum mutation considered by the algorithm.
86 #' \item \code{MUT_MAX}: Maximum mutation considered by the algorithm.
87 #' \item \code{MUT_PASS_COUNT}: Number of sequences in the mutation range.
88 #' \item \code{POS_MIN}: First position of the sequence considered by the algorithm (IMGT numbering).
89 #' \item \code{POS_MAX}: Last position of the sequence considered by the algorithm (IMGT numbering).
90 #' \item \code{Y_INTERCEPT}: The y-intercept above which positions were considered
91 #' potentially polymorphic.
92 #' \item \code{Y_INTERCEPT_PASS}: Number of positions that pass the \code{Y_INTERCEPT} threshold.
93 #' \item \code{SNP_PASS}: Number of sequences that pass the \code{Y_INTERCEPT} threshold and are
94 #' within the desired nucleotide range (\code{min_seqs}).
95 #' \item \code{UNMUTATED_COUNT}: Number of unmutated sequences.
96 #' \item \code{UNMUTATED_FREQ}: Number of unmutated sequences over \code{GERMLINE_IMGT_COUNT}.
97 #' \item \code{UNMUTATED_SNP_J_GENE_LENGTH_COUNT}: Number of distinct combinations
98 #' of SNP, J gene, and junction length.
99 #' \item \code{SNP_MIN_SEQS_J_MAX_PASS}: Number of SNPs that pass both the \code{min_seqs}
100 #' and \code{j_max} thresholds.
101 #' \item \code{ALPHA}: Significance threshold to be used when constructing the
102 #' confidence interval for the y-intercept.
103 #' \item \code{MIN_SEQS}: Input \code{min_seqs}. The minimum number of total sequences
104 #' (within the desired mutational range and nucleotide range) required
105 #' for the samples to be considered.
106 #' \item \code{J_MAX}: Input \code{j_max}. The maximum fraction of sequences perfectly
107 #' aligning to a potential novel allele that are allowed to utilize a particular
108 #' combination of junction length and J gene.
109 #' \item \code{MIN_FRAC}: Input \code{min_frac}. The minimum fraction of sequences that must
110 #' have usable nucleotides in a given position for that position to be considered.
111 #' }
112 #'
113 #' The following comments can appear in the \code{NOTE} column:
114 #'
115 #' \itemize{
116 #' \item \emph{Novel allele found}: A novel allele was detected.
117 #' \item \emph{Plurality sequence too rare}: No sequence is frequent enough to pass
118 #' the J test (\code{j_max}).
119 #' \item \emph{A J-junction combination is too prevalent}: Not enough J diversity (\code{j_max}).
120 #' \item \emph{No positions pass y-intercept test}: No positions above \code{y_intercept}.
121 #' \item \emph{Insufficient sequences in desired mutational range}:
122 #' \code{mut_range} and \code{pos_range}.
123 #' \item \emph{Not enough sequences}: Not enough sequences in the desired mutational
124 #' range and nucleotide range (\code{min_seqs}).
125 #' \item \emph{No unmutated versions of novel allele found}: All observed variants of the
126 #' allele are mutated.
127 #' }
62128 #'
63129 #' @seealso \link{plotNovel} to visualize the data supporting any
64130 #' novel alleles hypothesized to be present in the data and
65131 #' \link{inferGenotype} to determine if the novel alleles are frequent
66 #' enought to be included in the subject's genotype
132 #' enough to be included in the subject's genotype.
67133 #'
68134 #' @examples
69 #' # Load example data and germlines
70 #' data(sample_db)
71 #' data(germline_ighv)
72 #'
135 #' \donttest{
73136 #' # Find novel alleles and return relevant data
74 #' \dontrun{novel_df = findNovelAlleles(sample_db, germline_ighv)}
137 #' novel <- findNovelAlleles(SampleDb, GermlineIGHV)
138 #' }
75139 #'
76140 #' @export
77 findNovelAlleles <- function(clip_db, germline_db,
78 v_call="V_CALL",
79 germline_min = 200,
80 min_seqs = 50,
81 auto_mutrange = TRUE,
82 mut_range = 1:10,
83 pos_range = 1:312,
84 y_intercept = 0.125,
85 alpha = 0.05,
86 j_max = 0.15,
87 min_frac = 0.75,
88 nproc = 1) {
89 . = idx = NULL
90
91 # Keep only the db columns needed
92 clip_db <- clip_db %>%
93 dplyr::select_('SEQUENCE_IMGT', v_call, 'J_CALL', 'JUNCTION_LENGTH')
94
95 # Keep only the columns we need and clean up the sequences
96 missing = c("SEQUENCE_IMGT", v_call, "J_CALL", "JUNCTION_LENGTH") %>%
97 setdiff(colnames(clip_db))
98 if (length(missing) != 0) {
99 stop("Could not find required columns in clip_db:\n ",
100 paste(missing, collapse="\n "))
101 }
102 empty_junctions = sum(clip_db$JUNCTION_LENGTH == 0, na.rm=TRUE)
103 if (empty_junctions > 0) {
104 stop(empty_junctions, " sequences have junction ", "length of zero. ",
105 "Please remove these sequences.")
106 }
107 germlines = cleanSeqs(germline_db)
108 names(germlines) = getAllele(names(germlines), first=FALSE, strip_d=FALSE)
109 clip_db$SEQUENCE_IMGT = cleanSeqs(clip_db$SEQUENCE_IMGT)
110
111
112 # Find which rows' calls contain which germline alleles
113 cutoff =
114 ifelse(germline_min < 1, round(nrow(clip_db)*germline_min), germline_min)
115 allele_groups = sapply(names(germlines), grep, clip_db[[v_call]], fixed=TRUE,
116 simplify=FALSE)
117 names(allele_groups) = names(germlines)
118 allele_groups = allele_groups[sapply(allele_groups, length) >= cutoff]
119 if(length(allele_groups) == 0){
120 stop_message <- paste("Not enough sample sequences were assigned to any germline:\n",
121 " (1) germline_min is too large or\n",
122 " (2) sequences names don't match germlines.")
123 stop(stop_message)
124 }
125 allele_groups = allele_groups[sortAlleles(names(allele_groups))]
126
127 # Prepare for parallel processing
128 nproc = ifelse(Sys.info()['sysname'] == "Windows",
129 Sys.getenv('NUMBER_OF_PROCESSORS'),
130 ifelse(Sys.info()['sysname'] == "Darwin",
131 system("sysctl -n hw.ncpu", intern=TRUE),
132 system("nproc", intern=TRUE))) %>%
133 as.numeric() %>%
134 min(nproc, . - 1) %>%
135 max(1, .)
136 if(nproc == 1) {
137 foreach::registerDoSEQ()
138 } else {
139 cluster_type = ifelse(Sys.info()['sysname'] == "Windows",
140 "PSOCK", "FORK")
141 cluster <- parallel::makeCluster(nproc, type="PSOCK")
142 parallel::clusterExport(cluster, list("allele_groups",
143 "germlines",
144 "clip_db",
145 "min_seqs",
146 "auto_mutrange",
147 "mut_range",
148 "pos_range",
149 "y_intercept",
150 "alpha",
151 "j_max",
152 "germline_min",
153 "min_frac",
154 "findLowerY",
155 "mutationRangeSubset",
156 "positionMutations",
157 "superSubstring"),
158 envir=environment())
159 doParallel::registerDoParallel(cluster)
160 }
161
162 out_list <- foreach(idx=iterators::icount(length(allele_groups))) %dopar% {
163 # out_list <- lapply(1:length(allele_groups), function(idx) {
164 gc()
165 # message(paste0("idx=",idx))
166 # Subset of data being analyzed
167 allele_name = names(allele_groups)[idx]
168 germline = germlines[allele_name]
169 indicies = allele_groups[[allele_name]]
170 db_subset = clip_db[indicies, ]
171
172 # If mutrange is auto, find most popular mutation count and start from there
173 gpm = db_subset %>%
174 dplyr::mutate_(V_CALL = ~allele_name) %>%
175 getPopularMutationCount(germline,
176 gene_min=0, seq_min=min_seqs,
177 seq_p_of_max=1/8, full_return=TRUE)
178
179 # Determine the mutation range(s) to scan
180 mut_mins = min(mut_range)
181 if(auto_mutrange & sum(gpm$MUTATION_COUNT > 0) > 0 ){
182 mut_mins = c(mut_mins, gpm$MUTATION_COUNT[gpm$MUTATION_COUNT > 0]) %>%
183 unique() %>%
184 sort()
185 }
186
187 # Create the run's return object
188 df_run_empty = data.frame(GERMLINE_CALL = names(germline),
189 NOTE = "",
190 POLYMORPHISM_CALL = NA,
191 NOVEL_IMGT = NA,
192 PERFECT_MATCH_COUNT = NA,
193 GERMLINE_CALL_COUNT = length(indicies),
194 MUT_MIN = NA,
195 MUT_MAX = NA,
196 GERMLINE_IMGT = as.character(germline),
197 POS_MIN = min(pos_range),
198 POS_MAX = max(pos_range),
199 Y_INTERCEPT = y_intercept,
200 ALPHA = alpha,
201 MIN_SEQS = min_seqs,
202 J_MAX = j_max,
203 MIN_FRAC = min_frac,
204 stringsAsFactors = FALSE)
205 for (mut_min in rev(mut_mins)) {
206 gc()
207 # message(paste0("|-- mut_min=",mut_min))
208 if (mut_min == rev(mut_mins)[1]){
209 df_run = df_run_empty
210 } else {
211 df_run = dplyr::bind_rows(df_run_empty, df_run)
212 }
213 mut_max = mut_min + diff(range(mut_range))
214 df_run$MUT_MIN[1] = mut_min
215 df_run$MUT_MAX[1] = mut_max
216
217 # If no sequence is frequent enough to pass the J test, give up now
218 if(nrow(gpm) < 1) {
219 df_run$NOTE[1] = "Plurality sequence too rare."
220 if(mut_mins[1] == mut_min){
221 return(df_run)
222 } else {
223 next
224 }
225 }
226
227 # Add a mutation count column and filter out sequences not in our range
228 db_subset_mm = mutationRangeSubset(db_subset, germline,
229 mut_min:mut_max, pos_range)
230
231 if(nrow(db_subset_mm) < germline_min){
232 df_run$NOTE[1] = "Insufficient sequences in desired mutational range."
233 if(mut_mins[1] == mut_min){
234 return(df_run)
235 } else {
236 next
237 }
238 }
239
240 # Duplicate each sequence for all the positions to be analyzed
241 # and find which positions are mutated
242 pos_db = positionMutations(db_subset_mm, germline, pos_range)
243
244 # Find positional mut freq vs seq mut count
245 pos_muts = pos_db %>%
246 dplyr::group_by_(~POSITION) %>%
247 dplyr::mutate_(PASS = ~mean(OBSERVED) >= min_frac) %>%
248 dplyr::group_by_(~MUT_COUNT, ~POSITION) %>%
249 dplyr::summarise_(POS_MUT_RATE = ~ mean(MUTATED)*unique(PASS) ) %>%
250 dplyr::ungroup()
251
252 rm(pos_db)
253 gc()
254
255 # Calculate y intercepts, find which pass the test
256 pass_y = pos_muts %>%
257 dplyr::group_by_(~POSITION) %>%
258 dplyr::summarise_(Y_INT_MIN = ~findLowerY(POS_MUT_RATE, MUT_COUNT,
259 mut_min, alpha)) %>%
260 dplyr::filter_(~Y_INT_MIN > y_intercept)
261
262 if(nrow(pass_y) < 1){
263 df_run$NOTE[1] = "No positions pass y-intercept test."
264 if(mut_mins[1] == mut_min){
265 return(df_run)
266 } else {
267 next
268 }
269 }
270
271 gl_substring = superSubstring(germline, pass_y$POSITION)
272 gl_minus_substring = insertPolymorphisms(germline, pass_y$POSITION,
273 rep("N", nrow(pass_y)))
274
275 # Find the potential SNP positions and remove anything that matches
276 # the germline at all those positions or any combo that is too rare
277 db_y_subset_mm = db_subset_mm %>%
278 dplyr::group_by(1:n()) %>%
279 dplyr::mutate_(SNP_STRING = ~superSubstring(SEQUENCE_IMGT,
280 pass_y$POSITION)) %>%
281 dplyr::filter_(~SNP_STRING != gl_substring) %>%
282 dplyr::group_by_(~SNP_STRING) %>%
283 dplyr::mutate_(STRING_COUNT = ~n()) %>%
284 dplyr::filter_(~STRING_COUNT >= min_seqs)
285
286 if (nrow(db_y_subset_mm) < 1 ){
287 df_run$NOTE[1] = paste("Position(s) passed y-intercept (",
288 paste(pass_y$POSITION, collapse = ","),
289 ") but the plurality sequence is too rare.",
290 sep="")
291 if(mut_mins[1] == mut_min){
292 return(df_run)
293 } else {
294 next
295 }
296 }
297
298 # Get mutation count at all positions that are not potential SNPs
299 pads = paste(rep("-", min(pos_range)-1), collapse="")
300 db_y_subset_mm$MUT_COUNT_MINUS_SUBSTRING = db_y_subset_mm$SEQUENCE_IMGT %>%
301 substring(min(pos_range), max(pos_range)) %>%
302 paste(pads, ., sep="") %>%
303 getMutatedPositions(gl_minus_substring) %>%
304 sapply(length)
305
306 # Keep only unmutated seqences and then find the counts of J and
307 # junction length for each of the SNP strings, and then check to
308 # see which pass the j/junction and count requirements
309 db_y_summary0 = db_y_subset_mm %>%
310 dplyr::filter_(~MUT_COUNT_MINUS_SUBSTRING == 0) %>%
311 dplyr::mutate_(J_GENE = ~getGene(J_CALL)) %>%
312 dplyr::group_by_(~SNP_STRING, ~J_GENE, ~JUNCTION_LENGTH) %>%
313 dplyr::summarise_(COUNT = ~n()) %>%
314 dplyr::group_by_(~SNP_STRING) %>%
315 dplyr::mutate_(FRACTION = ~COUNT/sum(COUNT)) %>%
316 dplyr::summarise_(TOTAL_COUNT = ~sum(COUNT), MAX_FRAC = ~max(FRACTION))
141 findNovelAlleles <- function(data, germline_db,
142 v_call="V_CALL",
143 germline_min=200,
144 min_seqs=50,
145 auto_mutrange=TRUE,
146 mut_range=1:10,
147 pos_range=1:312,
148 y_intercept=0.125,
149 alpha=0.05,
150 j_max=0.15,
151 min_frac=0.75,
152 nproc=1) {
153 . = idx = NULL
154
155 # Keep only the db columns needed
156 data <- data %>%
157 dplyr::select_('SEQUENCE_IMGT', v_call, 'J_CALL', 'JUNCTION_LENGTH', 'JUNCTION')
158
159 # Keep only the columns we need and clean up the sequences
160 missing = c("SEQUENCE_IMGT", v_call, "J_CALL", "JUNCTION_LENGTH") %>%
161 setdiff(colnames(data))
162 if (length(missing) != 0) {
163 stop("Could not find required columns in the input data:\n ",
164 paste(missing, collapse="\n "))
165 }
166 empty_junctions = sum(data$JUNCTION_LENGTH == 0, na.rm=TRUE)
167 if (empty_junctions > 0) {
168 stop(empty_junctions, " sequences have junction ", "length of zero. ",
169 "Please remove these sequences.")
170 }
171 germlines = cleanSeqs(germline_db)
172 names(germlines) = getAllele(names(germlines), first=FALSE, strip_d=FALSE)
173 data$SEQUENCE_IMGT = cleanSeqs(data$SEQUENCE_IMGT)
174
175
176 # Find which rows' calls contain which germline alleles
177 cutoff =
178 ifelse(germline_min < 1, round(nrow(data)*germline_min), germline_min)
179 allele_groups = sapply(names(germlines), grep, data[[v_call]], fixed=TRUE,
180 simplify=FALSE)
181 names(allele_groups) = names(germlines)
182 allele_groups = allele_groups[sapply(allele_groups, length) >= cutoff]
183 if(length(allele_groups) == 0){
184 stop_message <- paste("Not enough sample sequences were assigned to any germline:\n",
185 " (1) germline_min is too large or\n",
186 " (2) sequences names don't match germlines.")
187 stop(stop_message)
188 }
189 allele_groups = allele_groups[sortAlleles(names(allele_groups))]
190
191 # Prepare for parallel processing
192 nproc = ifelse(Sys.info()['sysname'] == "Windows",
193 Sys.getenv('NUMBER_OF_PROCESSORS'),
194 ifelse(Sys.info()['sysname'] == "Darwin",
195 system("sysctl -n hw.ncpu", intern=TRUE),
196 system("nproc", intern=TRUE))) %>%
197 as.numeric() %>%
198 min(nproc, . - 1) %>%
199 max(1, .)
200 if(nproc == 1) {
201 foreach::registerDoSEQ()
202 } else {
203 #cluster_type = ifelse(Sys.info()['sysname'] == "Windows", "PSOCK", "FORK")
204 cluster <- parallel::makeCluster(nproc, type="PSOCK")
205 parallel::clusterExport(cluster, list("allele_groups",
206 "germlines",
207 "data",
208 "min_seqs",
209 "auto_mutrange",
210 "mut_range",
211 "pos_range",
212 "y_intercept",
213 "alpha",
214 "j_max",
215 "germline_min",
216 "min_frac",
217 "findLowerY",
218 "mutationRangeSubset",
219 "positionMutations",
220 "superSubstring"),
221 envir=environment())
222 doParallel::registerDoParallel(cluster)
223 }
224
225 out_list <- foreach(idx=iterators::icount(length(allele_groups))) %dopar% {
226 # out_list <- lapply(1:length(allele_groups), function(idx) {
227 gc()
228 # message(paste0("idx=",idx))
229 # Subset of data being analyzed
230 allele_name = names(allele_groups)[idx]
231 germline = germlines[allele_name]
232 indicies = allele_groups[[allele_name]]
233 db_subset = data[indicies, ]
317234
318 if(nrow(db_y_summary0) < 1){
319 df_run$NOTE[1] = paste("Position(s) passed y-intercept (",
320 paste(pass_y$POSITION, collapse = ","),
321 ") but no unmutated versions of novel allele",
322 " found.", sep="")
323 if(mut_mins[1] == mut_min){
324 return(df_run)
325 } else {
326 next
327 }
328 }
329
330 # db_y_summary = db_y_summary0 %>%
331 # filter_(~TOTAL_COUNT >= min_seqs & MAX_FRAC <= j_max)
332
333 min_seqs_pass <- db_y_summary0$TOTAL_COUNT >= min_seqs
334 j_max_pass <- db_y_summary0$MAX_FRAC <= j_max
335
336 db_y_summary <- db_y_summary0[min_seqs_pass & j_max_pass, , drop=FALSE]
337
338 if(nrow(db_y_summary) < 1){
339 msg <- c(NA, NA)
340 names(msg) <- c("j_max", "min_seqs")
341
342 if (sum(min_seqs_pass) == 0) {
343 msg['min_seqs'] <- paste0("not enough sequences (maximum total count is ",
344 max(db_y_summary0$TOTAL_COUNT),
345 ")")
235 # If mutrange is auto, find most popular mutation count and start from there
236 gpm = db_subset %>%
237 dplyr::mutate_(V_CALL = ~allele_name) %>%
238 getPopularMutationCount(germline,
239 gene_min=0, seq_min=min_seqs,
240 seq_p_of_max=1/8, full_return=TRUE)
241
242 # Determine the mutation range(s) to scan
243 mut_mins = min(mut_range)
244 if(auto_mutrange & sum(gpm$MUTATION_COUNT > 0) > 0 ){
245 mut_mins = c(mut_mins, gpm$MUTATION_COUNT[gpm$MUTATION_COUNT > 0]) %>%
246 unique() %>%
247 sort()
346248 }
347249
348 if (sum(j_max_pass) == 0) {
349 msg['j_max'] <- paste0("a J-junction combination is too prevalent (",
350 round(100*max(db_y_summary0$MAX_FRAC),1),"% of sequences)")
250 # Create the run's return object
251 df_run_empty = data.frame(GERMLINE_CALL = names(germline),
252 NOTE = "",
253 POLYMORPHISM_CALL = NA,
254 NT_SUBSTITUTIONS=NA,
255 NOVEL_IMGT = NA,
256 NOVEL_IMGT_COUNT=NA,
257 NOVEL_IMGT_UNIQUE_J=NA,
258 NOVEL_IMGT_UNIQUE_CDR3=NA,
259 PERFECT_MATCH_COUNT = NA,
260 PERFECT_MATCH_FREQ = NA,
261 GERMLINE_CALL_COUNT = length(indicies),
262 GERMLINE_CALL_FREQ = round(length(indicies)/nrow(data), 3),
263 MUT_MIN = NA,
264 MUT_MAX = NA,
265 MUT_PASS_COUNT=NA,
266 GERMLINE_IMGT = as.character(germline),
267 GERMLINE_IMGT_COUNT=NA,
268 POS_MIN = min(pos_range),
269 POS_MAX = max(pos_range),
270 Y_INTERCEPT = y_intercept,
271 Y_INTERCEPT_PASS = NA,
272 SNP_PASS=NA,
273 UNMUTATED_COUNT=NA,
274 UNMUTATED_FREQ=NA,
275 UNMUTATED_SNP_J_GENE_LENGTH_COUNT=NA,
276 SNP_MIN_SEQS_J_MAX_PASS=NA,
277 ALPHA = alpha,
278 MIN_SEQS = min_seqs,
279 J_MAX = j_max,
280 MIN_FRAC = min_frac,
281 stringsAsFactors = FALSE)
282 for (mut_min in rev(mut_mins)) {
283 gc()
284 # message(paste0("|-- mut_min=",mut_min))
285 if (mut_min == rev(mut_mins)[1]){
286 df_run = df_run_empty
287 } else {
288 df_run = dplyr::bind_rows(df_run_empty, df_run)
289 }
290 mut_max = mut_min + diff(range(mut_range))
291 df_run$MUT_MIN[1] = mut_min
292 df_run$MUT_MAX[1] = mut_max
293
294 # If no sequence is frequent enough to pass the J test, give up now
295 if(nrow(gpm) < 1) {
296 df_run$NOTE[1] = "Plurality sequence too rare."
297 if(mut_mins[1] == mut_min){
298 return(df_run)
299 } else {
300 next
301 }
302 }
303
304 # Add a mutation count column and filter out sequences not in our range
305 db_subset_mm = mutationRangeSubset(db_subset, germline,
306 mut_min:mut_max, pos_range)
307 df_run$MUT_PASS_COUNT[1] <- nrow(db_subset_mm)
308
309 if(nrow(db_subset_mm) < min_seqs){
310 df_run$NOTE[1] = paste0("Insufficient sequences (",nrow(db_subset_mm),") in desired mutational range.")
311 if(mut_mins[1] == mut_min){
312 return(df_run)
313 } else {
314 next
315 }
316 }
317
318 # Duplicate each sequence for all the positions to be analyzed
319 # and find which positions are mutated
320 pos_db = positionMutations(db_subset_mm, germline, pos_range)
321
322 # Find positional mut freq vs seq mut count
323 pos_muts = pos_db %>%
324 dplyr::group_by_(~POSITION) %>%
325 dplyr::mutate_(PASS = ~mean(OBSERVED) >= min_frac) %>%
326 dplyr::group_by_(~MUT_COUNT, ~POSITION) %>%
327 dplyr::summarise_(POS_MUT_RATE = ~ mean(MUTATED)*unique(PASS) ) %>%
328 dplyr::ungroup()
329
330 rm(pos_db)
331 gc()
332
333 # Calculate y intercepts, find which pass the test
334 pass_y = pos_muts %>%
335 dplyr::group_by_(~POSITION) %>%
336 dplyr::summarise_(Y_INT_MIN = ~findLowerY(POS_MUT_RATE, MUT_COUNT,
337 mut_min, alpha)) %>%
338 dplyr::filter_(~Y_INT_MIN > y_intercept)
339
340 df_run$Y_INTERCEPT_PASS[1] <- nrow(pass_y)
341
342 if(nrow(pass_y) < 1){
343 df_run$NOTE[1] = "No positions pass y-intercept test."
344 if(mut_mins[1] == mut_min){
345 return(df_run)
346 } else {
347 next
348 }
349 }
350
351 gl_substring = superSubstring(germline, pass_y$POSITION)
352 gl_minus_substring = insertPolymorphisms(germline, pass_y$POSITION,
353 rep("N", nrow(pass_y)))
354
355 # Find the potential SNP positions and remove anything that matches
356 # the germline at all those positions or any combo that is too rare
357 db_y_subset_mm = db_subset_mm %>%
358 dplyr::group_by(1:n()) %>%
359 dplyr::mutate_(SNP_STRING = ~superSubstring(SEQUENCE_IMGT,
360 pass_y$POSITION)) %>%
361 dplyr::filter_(~SNP_STRING != gl_substring) %>%
362 dplyr::group_by_(~SNP_STRING) %>%
363 dplyr::mutate_(STRING_COUNT = ~n()) %>%
364 dplyr::filter_(~STRING_COUNT >= min_seqs)
365
366 df_run$SNP_PASS[1] <- nrow(db_y_subset_mm)
367
368 if (nrow(db_y_subset_mm) < 1 ){
369 df_run$NOTE[1] = paste("Position(s) passed y-intercept (",
370 paste(pass_y$POSITION, collapse = ","),
371 ") but the plurality sequence is too rare.",
372 sep="")
373 if(mut_mins[1] == mut_min){
374 return(df_run)
375 } else {
376 next
377 }
378 }
379
380 # Get mutation count at all positions that are not potential SNPs
381 pads = paste(rep("-", min(pos_range)-1), collapse="")
382 db_y_subset_mm$MUT_COUNT_MINUS_SUBSTRING = db_y_subset_mm$SEQUENCE_IMGT %>%
383 substring(min(pos_range), max(pos_range)) %>%
384 paste(pads, ., sep="") %>%
385 getMutatedPositions(gl_minus_substring) %>%
386 sapply(length)
387
388 # Keep only unmutated seqences and then find the counts of J and
389 # junction length for each of the SNP strings, and then check to
390 # see which pass the j/junction and count requirements
391 db_y_summary0 = db_y_subset_mm %>%
392 dplyr::filter_(~MUT_COUNT_MINUS_SUBSTRING == 0)
393
394 df_run$UNMUTATED_COUNT[1] <- nrow(db_y_summary0)
395
396 db_y_summary0 <- db_y_summary0 %>%
397 dplyr::mutate_(J_GENE = ~getGene(J_CALL)) %>%
398 dplyr::group_by_(~SNP_STRING, ~J_GENE, ~JUNCTION_LENGTH) %>%
399 dplyr::summarise_(COUNT = ~n())
400
401 df_run$UNMUTATED_SNP_J_GENE_LENGTH_COUNT[1] <- nrow(db_y_summary0)
402
403 db_y_summary0 <- db_y_summary0 %>%
404 dplyr::group_by_(~SNP_STRING) %>%
405 dplyr::mutate_(FRACTION = ~COUNT/sum(COUNT)) %>%
406 dplyr::summarise_(TOTAL_COUNT = ~sum(COUNT), MAX_FRAC = ~max(FRACTION))
407
408 if(nrow(db_y_summary0) < 1){
409 df_run$NOTE[1] = paste("Position(s) passed y-intercept (",
410 paste(pass_y$POSITION, collapse = ","),
411 ") but no unmutated versions of novel allele",
412 " found.", sep="")
413 if(mut_mins[1] == mut_min){
414 return(df_run)
415 } else {
416 next
417 }
418 }
419
420 # db_y_summary = db_y_summary0 %>%
421 # filter_(~TOTAL_COUNT >= min_seqs & MAX_FRAC <= j_max)
422
423 min_seqs_pass <- db_y_summary0$TOTAL_COUNT >= min_seqs
424 j_max_pass <- db_y_summary0$MAX_FRAC <= j_max
425
426 db_y_summary <- db_y_summary0[min_seqs_pass & j_max_pass, , drop=FALSE]
427
428 df_run$SNP_MIN_SEQS_J_MAX_PASS[1] <- nrow(db_y_summary)
429
430 if(nrow(db_y_summary) < 1){
431 msg <- c(NA, NA)
432 names(msg) <- c("j_max", "min_seqs")
433
434 if (sum(min_seqs_pass) == 0) {
435 msg['min_seqs'] <- paste0("Not enough sequences (maximum total count is ",
436 max(db_y_summary0$TOTAL_COUNT),
437 ").")
438 }
439
440 if (sum(j_max_pass) == 0) {
441 msg['j_max'] <- paste0("A J-junction combination is too prevalent (",
442 round(100*max(db_y_summary0$MAX_FRAC),1),"% of sequences).")
443 }
444
445 msg <- paste(na.omit(msg), collapse=" and ")
446 df_run$NOTE[1] = paste("Position(s) passed y-intercept (",
447 paste(pass_y$POSITION, collapse = ","),
448 ") but ",
449 msg,".", sep="")
450 df_run$PERFECT_MATCH_COUNT[1] = max(db_y_summary0$TOTAL_COUNT)
451 df_run$PERFECT_MATCH_FREQ[1] <- df_run$PERFECT_MATCH_COUNT[1]/df_run$GERMLINE_CALL_COUNT[1]
452 if(mut_mins[1] == mut_min){
453 return(df_run)
454 } else {
455 next
456 }
457 }
458
459 germ_nts = unlist(strsplit(gl_substring,""))
460 for (r in 1:nrow(db_y_summary)) {
461 if (r > 1){
462 df_run = dplyr::bind_rows(df_run[1,], df_run)
463 }
464 # Create the new germline
465 snp_nts = unlist(strsplit(db_y_summary$SNP_STRING[r],""))
466 remain_mut = db_y_summary$SNP_STRING[r] %>%
467 getMutatedPositions(gl_substring) %>%
468 unlist() %>%
469 unique()
470 germ = insertPolymorphisms(germline, pass_y$POSITION, snp_nts)
471 is_known_allele <- germ == germlines
472 if (sum(is_known_allele) == 0 ) {
473 names(germ) = mapply(paste, germ_nts[remain_mut],
474 pass_y$POSITION[remain_mut],
475 snp_nts[remain_mut], sep="") %>%
476 paste(collapse="_") %>%
477 paste(names(germline), ., sep="_")
478 } else {
479 # If the match is with duplicated sequences in the reference germlines,
480 # use the first
481 known_allele_names <- sortAlleles(names(germlines)[is_known_allele],
482 method="position")
483 names(germ) = known_allele_names[1]
484 }
485 # Save the new germline to our data frame
486 df_run$POLYMORPHISM_CALL[1] = names(germ)
487 df_run$NOVEL_IMGT[1] = as.character(germ)
488 df_run$PERFECT_MATCH_COUNT[1] = db_y_summary$TOTAL_COUNT[r]
489 df_run$PERFECT_MATCH_FREQ[1] <- df_run$PERFECT_MATCH_COUNT[1]/df_run$GERMLINE_CALL_COUNT[1]
490 df_run$NOTE[1] = "Novel allele found!"
491 }
492
493 } # end for each starting mutation counts
494 return(df_run)
495
496 } # end foreach allele
497
498 if(nproc > 1) { stopCluster(cluster) }
499 out_df <- dplyr::bind_rows(out_list)
500 getMuSpec <- function(poly_call, germ_call) {
501 sapply(1:length(poly_call), function(i){
502 p <- gsub(germ_call[i], "", poly_call[i], fixed = T)
503 p <- strsplit(p,"_")[[1]][-1]
504 m <- gsub("([[:alpha:]])([[:digit:]]*)([[:alpha:]])", "\\2\\1>\\3", p)
505 paste(m, collapse=",")
506 })
507 }
508
509 # The number of records in the sequence dataset matching
510 # each exact NOVEL_IMGT sequence
511 getDbMatch <- function(novel_imgt) {
512 sapply(novel_imgt, function(n) {
513 n <- substr(n, min(pos_range), max(pos_range))
514 sum(grepl(gsub("[-\\.]","",n),
515 gsub("[-\\.]","",data$SEQUENCE_IMGT)))
516 })
517 }
518
519 # The number of distinct J in the sequence dataset associated
520 # with the exact NOVEL_IMGT sequence
521 getNumJ <- function(novel_imgt) {
522 sapply(novel_imgt, function(n) {
523 n <- substr(n, min(pos_range), max(pos_range))
524 imgt_idx <- grepl(gsub("[-\\.]","",n),
525 gsub("[-\\.]","",data$SEQUENCE_IMGT))
526 length(unique(getGene(data[['J_CALL']][imgt_idx])))
527 })
528 }
529
530
531 # The number of distinct CDR3 in the sequence dataset associated
532 # with the exact NOVEL_IMGT sequence
533 getNumCDR3 <- function(novel_imgt) {
534 sapply(novel_imgt, function(n) {
535 n <- substr(n, min(pos_range), max(pos_range))
536 imgt_idx <- grepl(gsub("[-\\.]","",n),
537 gsub("[-\\.]","",data$SEQUENCE_IMGT))
538 seq <- data[['JUNCTION']][imgt_idx]
539 seq <- substr(seq, 4, stringi::stri_length(seq) - 3)
540 length(unique(seq))
541 })
542 }
543
544 idx <- which(!is.na(out_df$NOVEL_IMGT))
545 if (length(idx)>0) {
546 out_df$NT_SUBSTITUTIONS[idx] <- getMuSpec(out_df$POLYMORPHISM_CALL[idx],
547 out_df$GERMLINE_CALL[idx])
548 out_df$NOVEL_IMGT_COUNT[idx] <- getDbMatch(out_df$NOVEL_IMGT[idx])
549 out_df$NOVEL_IMGT_UNIQUE_J[idx] <- getNumJ(out_df$NOVEL_IMGT[idx])
550 if ("JUNCTION" %in% colnames(data)) {
551 out_df$NOVEL_IMGT_UNIQUE_CDR3[idx] <- getNumCDR3(out_df$NOVEL_IMGT[idx])
351552 }
352
353 msg <- paste(na.omit(msg), collapse=" and ")
354 df_run$NOTE[1] = paste("Position(s) passed y-intercept (",
355 paste(pass_y$POSITION, collapse = ","),
356 ") but ",
357 msg,".", sep="")
358 df_run$PERFECT_MATCH_COUNT[1] = max(db_y_summary0$TOTAL_COUNT)
359 if(mut_mins[1] == mut_min){
360 return(df_run)
361 } else {
362 next
363 }
364 }
365
366 germ_nts = unlist(strsplit(gl_substring,""))
367 for (r in 1:nrow(db_y_summary)) {
368 if (r > 1){
369 df_run = dplyr::bind_rows(df_run[1,], df_run)
370 }
371 # Create the new germline
372 snp_nts = unlist(strsplit(db_y_summary$SNP_STRING[r],""))
373 remain_mut = db_y_summary$SNP_STRING[r] %>%
374 getMutatedPositions(gl_substring) %>%
375 unlist() %>%
376 unique()
377 germ = insertPolymorphisms(germline, pass_y$POSITION, snp_nts)
378 names(germ) = mapply(paste, germ_nts[remain_mut],
379 pass_y$POSITION[remain_mut],
380 snp_nts[remain_mut], sep="") %>%
381 paste(collapse="_") %>%
382 paste(names(germline), ., sep="_")
383 # Save the new germline to our data frame
384 df_run$POLYMORPHISM_CALL[1] = names(germ)
385 df_run$NOVEL_IMGT[1] = as.character(germ)
386 df_run$PERFECT_MATCH_COUNT[1] = db_y_summary$TOTAL_COUNT[r]
387 df_run$NOTE[1] = "Novel allele found!"
388 }
389
390 } # end for each starting mutation counts
391 return(df_run)
392
393 } # end foreach allele
394
395 if(nproc > 1) { stopCluster(cluster) }
396 rm(clip_db)
397 gc()
398 out_df <- dplyr::bind_rows(out_list)
399 return(out_df)
553 }
554 out_df$GERMLINE_IMGT_COUNT <- getDbMatch(out_df$GERMLINE_IMGT)
555 out_df$UNMUTATED_FREQ = out_df$UNMUTATED_COUNT/out_df$GERMLINE_CALL_COUNT
556 rm(data)
557 gc()
558
559 return(out_df)
400560 }
401561
402562 #' Select rows containing novel alleles
404564 #' \code{selectNovel} takes the result from \link{findNovelAlleles} and
405565 #' selects only the rows containing unique, novel alleles.
406566 #'
407 #' @param novel_df A \code{data.frame} of the type returned by
408 #' \link{findNovelAlleles}
409 #' @param keep_alleles A \code{logical} indicating if different alleles
410 #' leading to the same novel sequence should be kept.
411 #' See details.
412 #'
413 #' @details If, for instance, subject has in his genome IGHV1-2*02 and a novel
414 #' allele equally close to IGHV1-2*02 and IGHV1-2*05, the novel allele may be
567 #' @details
568 #' If, for instance, a subject has in their genome \code{IGHV1-2*02} and a novel
569 #' allele equally close to \code{IGHV1-2*02} and \code{IGHV1-2*05}, the novel allele may be
415570 #' detected by analyzing sequences that best align to either of these alleles.
416571 #' If \code{keep_alleles} is \code{TRUE}, both polymorphic allele calls will
417572 #' be retained. In the case that multiple mutation ranges are checked for the
418573 #' same allele, only one mutation range will be kept in the output.
574 #'
575 #' @param novel a \code{data.frame} of the type returned by
576 #' \link{findNovelAlleles}.
577 #' @param keep_alleles a \code{logical} indicating if different alleles
578 #' leading to the same novel sequence should be kept.
579 #' See Details.
419580 #'
420581 #' @return A \code{data.frame} containing only unique, novel alleles (if any)
421582 #' that were in the input.
422583 #'
423584 #' @examples
424 #' data(novel_df)
425 #' novel = selectNovel(novel_df)
585 #' novel <- selectNovel(SampleNovel)
426586 #'
427587 #' @export
428 selectNovel <- function(novel_df, keep_alleles=FALSE) {
588 selectNovel <- function(novel, keep_alleles=FALSE) {
429589 # Remove non-novel rows
430 novel_df = filter_(novel_df, ~!is.na(NOVEL_IMGT))
431
590 novel = filter_(novel, ~!is.na(NOVEL_IMGT))
591
432592 if (keep_alleles) {
433 novel_df = novel_df %>%
593 novel = novel %>%
434594 group_by_(~GERMLINE_CALL)
435595 }
436 novel = novel_df %>%
596 novel_set = novel %>%
437597 distinct_(~NOVEL_IMGT, .keep_all=TRUE) %>%
438598 ungroup()
439
440 return(novel)
599
600 return(novel_set)
441601 }
442602
443603 #' Visualize evidence of novel V alleles
444604 #'
445605 #' \code{plotNovel} is used to visualize the evidence of any novel V
446 #' alleles found using \link{findNovelAlleles}.
447 #'
448 #' @param clip_db a \code{data.frame} in Change-O format. See
606 #' alleles found using \link{findNovelAlleles}. It can also be used to
607 #' visualize the results for alleles that did not yield a novel allele.
608 #'
609 #' @details
610 #' The first panel in the plot shows, for all sequences which align to a particular
611 #' germline allele, the mutation frequency at each position along the aligned
612 #' sequence as a function of the sequence-wide mutation count. Sequences that pass
613 #' the novel allele test are colored red, while sequences that don't pass
614 #' the test are colored yellow. The second panel shows the nucleotide usage at the
615 #' positions as a function of sequence-wide mutation count.
616 #'
617 #' To avoid cases where a clonal expansion might lead to a false positive, tigger examines
618 #' the combinations of J gene and junction length among sequences which perfectly
619 #' match the proposed germline allele.
620 #'
621 #' @param data a \code{data.frame} in Change-O format. See
449622 #' \link{findNovelAlleles} for details.
450 #' @param novel_df_row a single row from a data frame as output by
623 #' @param novel_row a single row from a data frame as output by
451624 #' \link{findNovelAlleles} that contains a
452625 #' polymorphism-containing germline allele
626 #' @param v_call name of the column in \code{data} with V allele
627 #' calls. Default is "V_CALL".
453628 #' @param ncol number of columns to use when laying out the plots
454 #' @param v_call name of the column in \code{clip_db} with V allele
455 #' calls. Default is "V_CALL"
456 #' @return NULL
457629 #'
458630 #' @examples
459 #' # Load example data and germlines
460 #' data(sample_db)
461 #' data(germline_ighv)
462 #'
463 #' # Find novel alleles and return relevant data
464 #' \dontrun{novel_df = findNovelAlleles(sample_db, germline_ighv)}
465 #' data(novel_df)
466631 #' # Plot the evidence for the first (and only) novel allele in the example data
467 #' novel = selectNovel(novel_df)
468 #' plotNovel(sample_db, novel[1,])
632 #' novel <- selectNovel(SampleNovel)
633 #' plotNovel(SampleDb, novel[1, ])
469634 #'
470635 #' @export
471 plotNovel <- function(clip_db, novel_df_row, ncol = 1, v_call="V_CALL"){
472 . = NULL
473
474 # Use the data frame
475 if(length(novel_df_row) > 0){
476 if(is.data.frame(novel_df_row) & nrow(novel_df_row) == 1){
477 pos_range = novel_df_row$POS_MIN:novel_df_row$POS_MAX
478 germline = novel_df_row$GERMLINE_IMGT
479 names(germline) = novel_df_row$GERMLINE_CALL
480 mut_range = novel_df_row$MUT_MIN[1]:novel_df_row$MUT_MAX[1]
481 novel_imgt = novel_df_row$NOVEL_IMGT
482 names(novel_imgt) = novel_df_row$POLYMORPHISM_CALL
483 min_frac = novel_df_row$MIN_FRAC
484 note = novel_df_row$NOTE
636 plotNovel <- function(data, novel_row, v_call="V_CALL", ncol=1) {
637 . = NULL
638
639 # Use the data frame
640 if(length(novel_row) > 0) {
641 if(is.data.frame(novel_row) & nrow(novel_row) == 1) {
642 pos_range = novel_row$POS_MIN:novel_row$POS_MAX
643 germline = novel_row$GERMLINE_IMGT
644 names(germline) = novel_row$GERMLINE_CALL
645 mut_range = novel_row$MUT_MIN[1]:novel_row$MUT_MAX[1]
646 novel_imgt = novel_row$NOVEL_IMGT
647 names(novel_imgt) = novel_row$POLYMORPHISM_CALL
648 min_frac = novel_row$MIN_FRAC
649 note = novel_row$NOTE
650 } else {
651 stop("novel_row is not a data frame with only one row.")
652 }
653 }
654
655 germline = cleanSeqs(germline)
656 data$SEQUENCE_IMGT = cleanSeqs(data$SEQUENCE_IMGT)
657
658 # Extract sequences assigned to the germline, determine which
659 # have an appropriate range of mutations, and find the mutation
660 # frequency of each position
661 db_subset = data %>%
662 select_(~SEQUENCE_IMGT, v_call, ~J_CALL, ~JUNCTION_LENGTH) %>%
663 filter_(~grepl(names(germline), data[[v_call]], fixed=TRUE))
664 pos_db = db_subset %>%
665 mutationRangeSubset(germline, mut_range, pos_range)
666 if (nrow(pos_db) == 0) {
667 warning(paste0("Insufficient sequences (",nrow(pos_db),") in desired mutational range."))
668 return (invisible(NULL))
669 }
670 pos_db <- pos_db %>%
671 positionMutations(germline, pos_range)
672 pos_muts = pos_db %>%
673 group_by_(~POSITION) %>%
674 mutate_(PASS = ~mean(OBSERVED) >= min_frac) %>%
675 group_by_(~MUT_COUNT, ~POSITION) %>%
676 summarise_(POS_MUT_RATE = ~mean(MUTATED)*unique(PASS) ) %>%
677 ungroup()
678
679 # Label the polymorphic positions as such
680 pass_y = unlist(strsplit(names(novel_imgt), "_"))[-1] %>%
681 gsub("[^0-9]", "", .) %>%
682 as.numeric()
683 p_y_f = unlist(strsplit(names(novel_imgt), "_"))[-1] %>%
684 gsub("[0-9]+.", "", .)
685 p_y_t = unlist(strsplit(names(novel_imgt), "_"))[-1] %>%
686 gsub(".[0-9]+", "", .)
687 # Parse the note to find positions that passed y intercept if no novel found
688 if(length(pass_y) == 0 & grepl("Position\\(s\\) passed y-intercept", note)){
689 pass_y = note %>% gsub("Position\\(s\\) passed y-intercept \\(", "", .) %>%
690 gsub("\\).*", "", .) %>% strsplit(",") %>% unlist %>% as.numeric
691 p_y_f = sapply(pass_y, function (x) substring(germline, x, x))
692 p_y_t = gsub(".", "?", p_y_f)
693 }
694
695 to_from = paste(paste("Position", pass_y), paste(paste(p_y_f, "->"), p_y_t))
696 names(to_from) = pass_y
697 pos_muts = pos_muts %>%
698 mutate_(Polymorphic = ~ifelse(POSITION %in% pass_y, "True", "False"))
699
700 pads = paste(rep("-", min(pos_range)-1), collapse="")
701 db_subset$MUT_COUNT_NOVEL = db_subset$SEQUENCE_IMGT %>%
702 substring(min(pos_range), max(pos_range)) %>%
703 paste(pads, ., sep="") %>%
704 getMutatedPositions(novel_imgt) %>%
705 sapply(length)
706 db_subset = db_subset %>%
707 filter_(~MUT_COUNT_NOVEL == 0) %>%
708 mutate_(J_GENE = ~getGene(J_CALL))
709 if (nrow(db_subset) == 0) {
710 warning(paste0("Insufficient sequences (",nrow(db_subset),") with MUT_COUNT_NOVEL == 0."))
711 return (invisible(NULL))
712 }
713 db_subset$JUNCTION_LENGTH = db_subset$JUNCTION_LENGTH %>%
714 factor(levels=min(db_subset$JUNCTION_LENGTH):max(db_subset$JUNCTION_LENGTH))
715 pos_muts$Polymorphic = pos_muts$Polymorphic %>%
716 factor(levels = c("False", "True"))
717 pos_db$NT = pos_db$NT %>%
718 factor(levels = names(DNA_COLORS))
719 pos_muts$GERMLINE = names(germline)
720
721 # MAKE THE FIRST PLOT
722 if(!is.na(novel_imgt)){
723 POLYCOLORS = setNames(DNA_COLORS[c(4,3)], c("False", "True"))
724 p1 = ggplot(pos_muts, aes_(~factor(MUT_COUNT), ~POS_MUT_RATE, group=~POSITION,
725 color=~Polymorphic)) +
726 geom_line(size = 0.75) +
727 facet_grid(GERMLINE ~ .) +
728 scale_color_manual(values = POLYCOLORS) +
729 ylim(0,1) +
730 xlab("Mutation Count (Sequence)") +
731 ylab("Mutation Frequency (Position)") +
732 theme_bw() +
733 theme(legend.position=c(0.5,0.9), legend.justification=c(0.5,1),
734 legend.background=element_rect(fill = "transparent")) +
735 guides(color = guide_legend(ncol = 2, reverse = TRUE))
736 } else{
737 POLYCOLORS = setNames(DNA_COLORS[c(4,2)], c("False", "True"))
738 p1 = ggplot(pos_muts, aes_(~factor(MUT_COUNT), ~POS_MUT_RATE, group=~POSITION,
739 color=~Polymorphic)) +
740 geom_line(size = 0.75) +
741 facet_grid(GERMLINE ~ .) +
742 scale_color_manual(values = POLYCOLORS) +
743 ylim(0,1) +
744 xlab("Mutation Count (Sequence)") +
745 ylab("Mutation Frequency (Position)") +
746 theme_bw() +
747 theme(legend.position=c(0.5,0.9), legend.justification=c(0.5,1),
748 legend.background=element_rect(fill = "transparent")) +
749 guides(color = guide_legend("Passed y-intercept test",
750 ncol = 2, reverse = TRUE))
751 }
752 # MAKE THE SECOND PLOT
753 p2_data = mutate_(filter_(pos_db, ~POSITION %in% pass_y),
754 POSITION = ~to_from[as.character(POSITION)])
755 if (nrow(p2_data)) {
756 p2 = ggplot(p2_data,
757 aes_(~factor(MUT_COUNT), fill=~NT)) +
758 geom_bar(width=0.9) +
759 guides(fill = guide_legend("Nucleotide", ncol = 4)) +
760 facet_grid(POSITION ~ .) +
761 xlab("Mutation Count (Sequence)") + ylab("Sequence Count") +
762 scale_fill_manual(values = DNA_COLORS, breaks=names(DNA_COLORS),
763 drop=FALSE) +
764 theme_bw() +
765 theme(legend.position=c(1,1), legend.justification=c(1,1),
766 legend.background=element_rect(fill = "transparent"))
485767 } else {
486 stop("novel_df_row is not a data frame with only one row.")
487 }
488 }
489
490 germline = cleanSeqs(germline)
491 clip_db$SEQUENCE_IMGT = cleanSeqs(clip_db$SEQUENCE_IMGT)
492
493 # Extract sequences assigned to the germline, determine which
494 # have an appropriate range of mutations, and find the mutation
495 # frequency of each position
496 db_subset = clip_db %>%
497 select_(~SEQUENCE_IMGT, v_call, ~J_CALL, ~JUNCTION_LENGTH) %>%
498 filter_(~grepl(names(germline), clip_db[[v_call]], fixed=TRUE))
499 pos_db = db_subset %>%
500 mutationRangeSubset(germline, mut_range, pos_range)
501 if (nrow(pos_db) == 0) {
502 warning("Insufficient sequences in desired mutational range")
503 return (invisible(NULL))
504 }
505 pos_db <- pos_db %>%
506 positionMutations(germline, pos_range)
507 pos_muts = pos_db %>%
508 group_by_(~POSITION) %>%
509 mutate_(PASS = ~mean(OBSERVED) >= min_frac) %>%
510 group_by_(~MUT_COUNT, ~POSITION) %>%
511 summarise_(POS_MUT_RATE = ~mean(MUTATED)*unique(PASS) ) %>%
512 ungroup()
513
514 # Label the polymorphic positions as such
515 pass_y = unlist(strsplit(names(novel_imgt), "_"))[-1] %>%
516 gsub("[^0-9]", "", .) %>%
517 as.numeric()
518 p_y_f = unlist(strsplit(names(novel_imgt), "_"))[-1] %>%
519 gsub("[0-9]+.", "", .)
520 p_y_t = unlist(strsplit(names(novel_imgt), "_"))[-1] %>%
521 gsub(".[0-9]+", "", .)
522 # Parse the note to find positions that passed y intercept if no novel found
523 if(length(pass_y) == 0 & grepl("Position\\(s\\) passed y-intercept", note)){
524 pass_y = note %>% gsub("Position\\(s\\) passed y-intercept \\(", "", .) %>%
525 gsub("\\).*", "", .) %>% strsplit(",") %>% unlist %>% as.numeric
526 p_y_f = sapply(pass_y, function (x) substring(germline, x, x))
527 p_y_t = gsub(".", "?", p_y_f)
528 }
529
530 to_from = paste(paste("Position", pass_y), paste(paste(p_y_f, "->"), p_y_t))
531 names(to_from) = pass_y
532 pos_muts = pos_muts %>%
533 mutate_(Polymorphic = ~ifelse(POSITION %in% pass_y, "True", "False"))
534
535 pads = paste(rep("-", min(pos_range)-1), collapse="")
536 db_subset$MUT_COUNT_NOVEL = db_subset$SEQUENCE_IMGT %>%
537 substring(min(pos_range), max(pos_range)) %>%
538 paste(pads, ., sep="") %>%
539 getMutatedPositions(novel_imgt) %>%
540 sapply(length)
541 db_subset = db_subset %>%
542 filter_(~MUT_COUNT_NOVEL == 0) %>%
543 mutate_(J_GENE = ~getGene(J_CALL))
544 db_subset$JUNCTION_LENGTH = db_subset$JUNCTION_LENGTH %>%
545 factor(levels=min(db_subset$JUNCTION_LENGTH):max(db_subset$JUNCTION_LENGTH))
546 pos_muts$Polymorphic = pos_muts$Polymorphic %>%
547 factor(levels = c("False", "True"))
548 pos_db$NT = pos_db$NT %>%
549 factor(levels = names(DNA_COLORS))
550 pos_muts$GERMLINE = names(germline)
551
552 # MAKE THE FIRST PLOT
553 if(!is.na(novel_imgt)){
554 POLYCOLORS = setNames(DNA_COLORS[c(4,3)], c("False", "True"))
555 p1 = ggplot(pos_muts, aes_(~factor(MUT_COUNT), ~POS_MUT_RATE, group=~POSITION,
556 color=~Polymorphic)) +
557 geom_line(size = 0.75) +
558 facet_grid(GERMLINE ~ .) +
559 scale_color_manual(values = POLYCOLORS) +
560 ylim(0,1) +
561 xlab("Mutation Count (Sequence)") +
562 ylab("Mutation Frequency (Position)") +
563 theme_bw() +
564 theme(legend.position=c(0.5,0.9), legend.justification=c(0.5,1),
565 legend.background=element_rect(fill = "transparent")) +
566 guides(color = guide_legend(ncol = 2, reverse = TRUE))
567 } else{
568 POLYCOLORS = setNames(DNA_COLORS[c(4,2)], c("False", "True"))
569 p1 = ggplot(pos_muts, aes_(~factor(MUT_COUNT), ~POS_MUT_RATE, group=~POSITION,
570 color=~Polymorphic)) +
571 geom_line(size = 0.75) +
572 facet_grid(GERMLINE ~ .) +
573 scale_color_manual(values = POLYCOLORS) +
574 ylim(0,1) +
575 xlab("Mutation Count (Sequence)") +
576 ylab("Mutation Frequency (Position)") +
577 theme_bw() +
578 theme(legend.position=c(0.5,0.9), legend.justification=c(0.5,1),
579 legend.background=element_rect(fill = "transparent")) +
580 guides(color = guide_legend("Passed y-intercept test",
581 ncol = 2, reverse = TRUE))
582 }
583 # MAKE THE SECOND PLOT
584 p2_data = mutate_(filter_(pos_db, ~POSITION %in% pass_y),
585 POSITION = ~to_from[as.character(POSITION)])
586 if (nrow(p2_data)) {
587 p2 = ggplot(p2_data,
588 aes_(~factor(MUT_COUNT), fill=~NT)) +
589 geom_bar(width=0.9) +
590 guides(fill = guide_legend("Nucleotide", ncol = 4)) +
591 facet_grid(POSITION ~ .) +
592 xlab("Mutation Count (Sequence)") + ylab("Sequence Count") +
593 scale_fill_manual(values = DNA_COLORS, breaks=names(DNA_COLORS),
594 drop=FALSE) +
595 theme_bw() +
596 theme(legend.position=c(1,1), legend.justification=c(1,1),
597 legend.background=element_rect(fill = "transparent"))
598 } else {
599 p2_data = mutate_(filter_(pos_db,
600 ~POSITION %in% names(which.max(table(pos_db$POSITION)))),
601 POSITION = ~"No positions pass y-intercept test.")
602 p2 = ggplot(p2_data, aes_(~factor(MUT_COUNT))) +
603 geom_bar(width=0.9) +
604 facet_grid(POSITION ~ .) +
605 xlab("Mutation Count (Sequence)") + ylab("Sequence Count") +
606 theme_bw() +
607 theme(legend.position=c(1,1), legend.justification=c(1,1),
608 legend.background=element_rect(fill = "transparent"))
609 }
610 # MAKE THE THIRD PLOT
611 p3 = ggplot(db_subset, aes_(~JUNCTION_LENGTH, fill=~factor(J_GENE))) +
612 geom_bar(width=0.9) +
613 guides(fill = guide_legend("J Gene", ncol = 2)) +
614 xlab("Junction Length") + ylab("Unmutated Sequence Count") +
615 theme_bw() +
616 theme(legend.position=c(1,1), legend.justification=c(1,1),
617 legend.background=element_rect(fill = "transparent"))
618
619 p2_height = length(unique(p2_data$POSITION))
620 if (p2_height>1) { p2_height = 0.5 * p2_height}
621 heights = c(1, p2_height, 1)
622 multiplot(p1, p2, p3, cols = ncol, heights=heights)
623 }
624
625 #' Infer a subject-specific genotype
768 p2_data = mutate_(filter_(pos_db,
769 ~POSITION %in% names(which.max(table(pos_db$POSITION)))),
770 POSITION = ~"No positions pass y-intercept test.")
771 p2 = ggplot(p2_data, aes_(~factor(MUT_COUNT))) +
772 geom_bar(width=0.9) +
773 facet_grid(POSITION ~ .) +
774 xlab("Mutation Count (Sequence)") + ylab("Sequence Count") +
775 theme_bw() +
776 theme(legend.position=c(1,1), legend.justification=c(1,1),
777 legend.background=element_rect(fill = "transparent"))
778 }
779 # MAKE THE THIRD PLOT
780 p3 = ggplot(db_subset, aes_(~JUNCTION_LENGTH, fill=~factor(J_GENE))) +
781 geom_bar(width=0.9) +
782 guides(fill = guide_legend("J Gene", ncol = 2)) +
783 xlab("Junction Length") + ylab("Unmutated Sequence Count") +
784 theme_bw() +
785 theme(legend.position=c(1,1), legend.justification=c(1,1),
786 legend.background=element_rect(fill = "transparent"))
787
788 p2_height = length(unique(p2_data$POSITION))
789 if (p2_height>1) { p2_height = 0.5 * p2_height}
790 heights = c(1, p2_height, 1)
791 multiplot(p1, p2, p3, cols = ncol, heights=heights)
792 }
793
794 #' Infer a subject-specific genotype using a frequency method
626795 #'
627 #' \code{inferGenotype} infers an subject's genotype by finding the minimum
628 #' number set of alleles that can explain the majority of each gene's calls. The
629 #' most common allele of each gene is included in the genotype first, and the
630 #' next most common allele is added until the desired fraction of alleles can be
631 #' explained. In this way, mistaken allele calls (resulting from sequences which
796 #' \code{inferGenotype} infers a subject's genotype using a frequency method.
797 #' The genotype is inferred by finding the minimum set of alleles that
798 #' can explain the majority of each gene's calls. The most common allele of
799 #' each gene is included in the genotype first, and the next most common allele
800 #' is added until the desired fraction of alleles can be explained. In this
801 #' way, mistaken allele calls (resulting from sequences which
632802 #' by chance have been mutated to look like another allele) can be removed.
633803 #'
634 #' @param clip_db a \code{data.frame} containing V allele
804 #' @details
805 #' Allele calls representing cases where multiple alleles have been
806 #' assigned to a single sample sequence are rare among unmutated
807 #' sequences but may result if nucleotides for certain positions are
808 #' not available. Calls containing multiple alleles are treated as
809 #' belonging to all groups. If \code{novel} is provided, all
810 #' sequences that are assigned to the same starting allele as any
811 #' novel germline allele will have the novel germline allele appended
812 #' to their assignment prior to searching for unmutated sequences.
813 #'
814 #' @param data a \code{data.frame} containing V allele
635815 #' calls from a single subject. If
636816 #' \code{find_unmutated} is \code{TRUE}, then
637817 #' the sample IMGT-gapped V(D)J sequence should
638 #' @param v_call column in \code{clip_db} with V allele calls.
639 #' Default is \code{"V_CALL"}
818 #' @param germline_db named vector of sequences containing the
819 #' germline sequences named in
820 #' \code{allele_calls}. Only required if
821 #' \code{find_unmutated} is \code{TRUE}.
822 #' @param novel an optional \code{data.frame} of the type
823 #' novel returned by
824 #' \link{findNovelAlleles} containing
825 #' germline sequences that will be utilized if
826 #' \code{find_unmutated} is \code{TRUE}. See
827 #' Details.
828 #' @param v_call column in \code{data} with V allele calls.
829 #' Default is \code{"V_CALL"}.
640830 #' be provided in a column \code{"SEQUENCE_IMGT"}
641831 #' @param fraction_to_explain the portion of each gene that must be
642832 #' explained by the alleles that will be included
643 #' in the genotype
833 #' in the genotype.
644834 #' @param gene_cutoff either a number of sequences or a fraction of
645835 #' the length of \code{allele_calls} denoting the
646836 #' minimum number of times a gene must be
647837 #' observed in \code{allele_calls} to be included
648 #' in the genotype
838 #' in the genotype.
649839 #' @param find_unmutated if \code{TRUE}, use \code{germline_db} to
650840 #' find which samples are unmutated. Not needed
651841 #' if \code{allele_calls} only represent
652842 #' unmutated samples.
653 #' @param germline_db named vector of sequences containing the
654 #' germline sequences named in
655 #' \code{allele_calls}. Only required if
656 #' \code{find_unmutated} is \code{TRUE}.
657 #' @param novel_df an optional \code{data.frame} of the type
658 #' novel returned by
659 #' \link{findNovelAlleles} containing
660 #' germline sequences that will be utilized if
661 #' \code{find_unmutated} is \code{TRUE}. See
662 #' details.
663 #' @details Allele calls representing cases where multiple alleles have been
664 #' assigned to a single sample sequence are rare among unmutated
665 #' sequences but may result if nucleotides for certain positions are
666 #' not available. Calls containing multiple alleles are treated as
667 #' belonging to all groups. If \code{novel_df} is provided, all
668 #' sequences that are assigned to the same starting allele as any
669 #' novel germline allele will have the novel germline allele appended
670 #' to their assignent prior to searching for unmutated sequences.
671 #'
672 #' @return A table of alleles denoting the genotype of the subject
673 #'
674 #' @note This method works best with data derived from blood, where a large
675 #' portion of sequences are expected to be unmutated. Ideally, there
676 #' should be hundreds of allele calls per gene in the input.
677 #'
678 #' @examples
679 #' # Infer the IGHV genotype, using only unmutated sequences, including any
680 #' # novel alleles
681 #' data(sample_db)
682 #' data(germline_ighv)
683 #' data(novel_df)
684 #' inferGenotype(sample_db, find_unmutated = TRUE, germline_db = germline_ighv,
685 #' novel_df = novel_df)
843 #'
844 #' @return
845 #' A \code{data.frame} of alleles denoting the genotype of the subject containing
846 #' the following columns:
847 #'
848 #' \itemize{
849 #' \item \code{GENE}: The gene name without allele.
850 #' \item \code{ALLELES}: Comma separated list of alleles for the given \code{GENE}.
851 #' \item \code{COUNTS}: Comma separated list of observed sequences for each
852 #' corresponding allele in the \code{ALLELES} list.
853 #' \item \code{TOTAL}: The total count of observed sequences for the given \code{GENE}.
854 #' \item \code{NOTE}: Any comments on the inference.
855 #' }
856 #'
857 #' @note
858 #' This method works best with data derived from blood, where a large
859 #' portion of sequences are expected to be unmutated. Ideally, there
860 #' should be hundreds of allele calls per gene in the input.
686861 #'
687862 #' @seealso \link{plotGenotype} for a colorful visualization and
688863 #' \link{genotypeFasta} to convert the genotype to nucleotide sequences.
864 #' See \link{inferGenotypeBayesian} to infer a subject-specific genotype
865 #' using a Bayesian approach.
866 #'
867 #' @examples
868 #' # Infer IGHV genotype, using only unmutated sequences, including novel alleles
869 #' inferGenotype(SampleDb, germline_db=GermlineIGHV, novel=SampleNovel,
870 #' find_unmutated=TRUE)
689871 #'
690872 #' @export
691 inferGenotype <- function(clip_db, v_call="V_CALL", fraction_to_explain = 0.875,
692 gene_cutoff = 1e-4, find_unmutated = TRUE,
693 germline_db = NA, novel_df = NA){
694
695 . = NULL
696 allele_calls = getAllele(clip_db[[v_call]], first=FALSE, strip_d=FALSE)
697 # Find the unmutated subset, if requested
698 if(find_unmutated){
699 if(is.na(germline_db[1])){
700 stop("germline_db needed if find_unmutated is TRUE")
701 }
702 if(!is.null(nrow(novel_df))){
703 novel_df = filter_(novel_df, ~!is.na(POLYMORPHISM_CALL)) %>%
704 select_(~GERMLINE_CALL, ~POLYMORPHISM_CALL, ~NOVEL_IMGT)
705 if(nrow(novel_df) > 0){
706 # Extract novel alleles if any and add them to germline_db
707 novel_gl = novel_df$NOVEL_IMGT
708 names(novel_gl) = novel_df$POLYMORPHISM_CALL
709 germline_db = c(germline_db, novel_gl)
710 # Add the novel allele calls to allele calls of the same starting allele
711 for(r in 1:nrow(novel_df)){
712 ind = grep(novel_df$GERMLINE_CALL[r], allele_calls, fixed=TRUE)
713 allele_calls[ind] = allele_calls[ind] %>%
714 sapply(paste, novel_df$POLYMORPHISM_CALL[r], sep=",")
873 inferGenotype <- function(data, germline_db=NA, novel=NA, v_call="V_CALL",
874 fraction_to_explain=0.875, gene_cutoff=1e-4,
875 find_unmutated=TRUE) {
876
877 . = NULL
878 allele_calls = getAllele(data[[v_call]], first=FALSE, strip_d=FALSE)
879 # Find the unmutated subset, if requested
880 if(find_unmutated){
881 if(is.na(germline_db[1])){
882 stop("germline_db needed if find_unmutated is TRUE")
715883 }
716 }
717 }
718 # Find unmutated sequences
719 allele_calls = findUnmutatedCalls(allele_calls,
720 as.character(clip_db$SEQUENCE_IMGT),
721 germline_db)
722 if(length(allele_calls) == 0){
723 stop("No unmutated sequences found! Set 'find_unmutated' to 'FALSE'.")
724 }
725 }
726
727 # Find which rows' calls contain which genes
728 cutoff = ifelse(gene_cutoff < 1, length(allele_calls)*gene_cutoff, gene_cutoff)
729 gene_regex = allele_calls %>% strsplit(",") %>% unlist() %>%
730 getGene(strip_d=FALSE) %>% unique() %>% paste("\\*", sep="")
731 gene_groups = sapply(gene_regex, grep, allele_calls, simplify=FALSE)
732 names(gene_groups) = gsub("\\*", "", gene_regex, fixed=TRUE)
733 gene_groups = gene_groups[sapply(gene_groups, length) >= cutoff]
734 gene_groups = gene_groups[sortAlleles(names(gene_groups))]
735
736 # Make a table to store the resulting genotype
737 GENE = names(gene_groups)
738 ALLELES = COUNTS = NOTE = rep("", length(GENE))
739 TOTAL = sapply(gene_groups, length)
740 genotype = cbind(GENE, ALLELES, COUNTS, TOTAL, NOTE)
741
742 # For each gene, find which alleles to include
743 for (g in GENE){
744 # Keep only the part of the allele calls that uses the gene being analyzed
745 ac = allele_calls[gene_groups[[g]]] %>%
746 strsplit(",") %>%
747 lapply(function(x) x[grep(paste(g, "\\*", sep=""), x)]) %>%
748 sapply(paste, collapse=",")
749 target = ceiling(fraction_to_explain*length(ac)) # how many we need to explain
750 t_ac = table(ac) # table of allele calls
751 potentials = unique(unlist(strsplit(names(t_ac),","))) # potential alleles
752 # One allele? Easy!
753 if (length(potentials) == 1 | length(t_ac) == 1){
754 genotype[genotype[,"GENE"]==g,"ALLELES"] =
755 gsub("[^d\\*]*[d\\*]","",potentials )[1]
756 genotype[genotype[,"GENE"]==g,"COUNTS"] = t_ac
757 } else {
758 # More alleles? Let's find the fewest that can explain the needed fraction
759 # Make a table of which alleles can explain which calls
760 regexpotentials = paste(gsub("\\*","\\\\*", potentials),"$",sep="")
761 regexpotentials =
762 paste(regexpotentials,gsub("\\$",",",regexpotentials),sep="|")
763 tmat =
764 sapply(regexpotentials, function(x) grepl(x, names(t_ac),fixed=FALSE))
765 seqs_expl = as.data.frame(apply(tmat, 2, function(x) x*t_ac))
766 colnames(seqs_expl) = potentials
767
768 # Cycle through the table, including alleles to explain more sequences,
769 # until we explain enough sequences
770 included = counts = character(0)
771 tot_expl = 0
772 while(tot_expl < target){
773 allele_tot = apply(seqs_expl, 2, sum)
774 included = c(included, names(which.max(allele_tot)))
775 counts = c(counts, max(allele_tot))
776 tot_expl = max(allele_tot) + tot_expl
777 seqs_expl = seqs_expl[which(seqs_expl[,which.max(allele_tot)]==0),]
778 }
779 genotype[genotype[,"GENE"]==g,"ALLELES"] =
780 paste(gsub("[^d\\*]*[d\\*]","",included ),collapse=",")
781 genotype[genotype[,"GENE"]==g,"COUNTS"] =
782 paste(counts,collapse=",")
783 }
784
785 }
786 geno = as.data.frame(genotype, stringsAsFactors = FALSE)
787
788 # Check for indistinguishable calls
789 if(find_unmutated == TRUE){
790 seqs = genotypeFasta(geno, germline_db)
791 dist_mat = seqs %>%
792 sapply(function(x) sapply((getMutatedPositions(seqs, x)), length)) %>%
793 as.matrix
794 rownames(dist_mat) = colnames(dist_mat)
795 for (i in 1:nrow(dist_mat)){ dist_mat[i,i] = NA }
796 same = which(dist_mat == 0, arr.ind=TRUE)
797 if (nrow(same) > 0 ) {
798 for (r in 1:nrow(same)) {
799 inds = as.vector(same[r,])
800 geno[getGene(rownames(dist_mat)[inds][1]),]$NOTE =
801 paste(rownames(dist_mat)[inds], collapse=" and ") %>%
802 paste("Cannot distinguish", .)
803 }
804 }
805 }
806 rownames(geno) = NULL
807 return(geno)
808 }
884 if(!is.null(nrow(novel))){
885 novel = filter_(novel, ~!is.na(POLYMORPHISM_CALL)) %>%
886 select_(~GERMLINE_CALL, ~POLYMORPHISM_CALL, ~NOVEL_IMGT)
887 if(nrow(novel) > 0){
888 # Extract novel alleles if any and add them to germline_db
889 novel_gl = novel$NOVEL_IMGT
890 names(novel_gl) = novel$POLYMORPHISM_CALL
891 germline_db = c(germline_db, novel_gl)
892 # Add the novel allele calls to allele calls of the same starting allele
893 for(r in 1:nrow(novel)){
894 ind = grep(novel$GERMLINE_CALL[r], allele_calls, fixed=TRUE)
895 allele_calls[ind] = allele_calls[ind] %>%
896 sapply(paste, novel$POLYMORPHISM_CALL[r], sep=",")
897 }
898 }
899 }
900 # Find unmutated sequences
901 allele_calls = findUnmutatedCalls(allele_calls,
902 as.character(data$SEQUENCE_IMGT),
903 germline_db)
904 if(length(allele_calls) == 0){
905 stop("No unmutated sequences found! Set 'find_unmutated' to 'FALSE'.")
906 }
907 }
908
909 # Find which rows' calls contain which genes
910 cutoff = ifelse(gene_cutoff < 1, length(allele_calls)*gene_cutoff, gene_cutoff)
911 gene_regex = allele_calls %>% strsplit(",") %>% unlist() %>%
912 getGene(strip_d=FALSE) %>% unique() %>% paste("\\*", sep="")
913 gene_groups = sapply(gene_regex, grep, allele_calls, simplify=FALSE)
914 names(gene_groups) = gsub("\\*", "", gene_regex, fixed=TRUE)
915 gene_groups = gene_groups[sapply(gene_groups, length) >= cutoff]
916 gene_groups = gene_groups[sortAlleles(names(gene_groups))]
917
918 # Make a table to store the resulting genotype
919 GENE = names(gene_groups)
920 ALLELES = COUNTS = NOTE = rep("", length(GENE))
921 TOTAL = sapply(gene_groups, length)
922 genotype = cbind(GENE, ALLELES, COUNTS, TOTAL, NOTE)
923
924 # For each gene, find which alleles to include
925 for (g in GENE) {
926 # Keep only the part of the allele calls that uses the gene being analyzed
927 ac = allele_calls[gene_groups[[g]]] %>%
928 strsplit(",") %>%
929 lapply(function(x) x[grep(paste(g, "\\*", sep=""), x)]) %>%
930 sapply(paste, collapse=",")
931 target = ceiling(fraction_to_explain*length(ac)) # how many we need to explain
932 t_ac = table(ac) # table of allele calls
933 potentials = unique(unlist(strsplit(names(t_ac),","))) # potential alleles
934 # One allele? Easy!
935 if (length(potentials) == 1 | length(t_ac) == 1) {
936 genotype[genotype[,"GENE"]==g,"ALLELES"] = gsub("[^d\\*]*[d\\*]","",potentials )[1]
937 genotype[genotype[,"GENE"]==g,"COUNTS"] = t_ac
938 } else {
939 # More alleles? Let's find the fewest that can explain the needed fraction
940 # Make a table of which alleles can explain which calls
941 regexpotentials = paste(gsub("\\*","\\\\*", potentials),"$",sep="")
942 regexpotentials =
943 paste(regexpotentials,gsub("\\$",",",regexpotentials),sep="|")
944 tmat =
945 sapply(regexpotentials, function(x) grepl(x, names(t_ac),fixed=FALSE))
946 seqs_expl = as.data.frame(apply(tmat, 2, function(x) x*t_ac))
947 colnames(seqs_expl) = potentials
948
949 # Cycle through the table, including alleles to explain more sequences,
950 # until we explain enough sequences
951 included = counts = character(0)
952 tot_expl = 0
953 while(tot_expl < target){
954 allele_tot = apply(seqs_expl, 2, sum)
955 included = c(included, names(which.max(allele_tot)))
956 counts = c(counts, max(allele_tot))
957 tot_expl = max(allele_tot) + tot_expl
958 seqs_expl = seqs_expl[which(seqs_expl[,which.max(allele_tot)]==0),]
959 }
960 genotype[genotype[,"GENE"]==g,"ALLELES"] =
961 paste(gsub("[^d\\*]*[d\\*]","",included ),collapse=",")
962 genotype[genotype[,"GENE"]==g,"COUNTS"] =
963 paste(counts,collapse=",")
964 }
965 }
966
967 geno = as.data.frame(genotype, stringsAsFactors = FALSE)
968
969 # Check for indistinguishable calls
970 if (find_unmutated == TRUE) {
971 seqs = genotypeFasta(geno, germline_db)
972 dist_mat = seqs %>%
973 sapply(function(x) sapply((getMutatedPositions(seqs, x)), length)) %>%
974 as.matrix
975 rownames(dist_mat) = colnames(dist_mat)
976 for (i in 1:nrow(dist_mat)){ dist_mat[i,i] = NA }
977 same = which(dist_mat == 0, arr.ind=TRUE)
978 if (nrow(same) > 0 ) {
979 for (r in 1:nrow(same)) {
980 inds = as.vector(same[r,])
981 geno[getGene(rownames(dist_mat)[inds][1]),]$NOTE =
982 paste(rownames(dist_mat)[inds], collapse=" and ") %>%
983 paste("Cannot distinguish", .)
984 }
985 }
986 }
987 rownames(geno) = NULL
988
989 return(geno)
990 }
991
809992
810993 #' Show a colorful representation of a genotype
811994 #'
812995 #' \code{plotGenotype} plots a genotype table.
813996 #'
814 #' @param genotype a table of alleles denoting a genotype, as returned by
815 #' \link{inferGenotype}
997 #' @param genotype a \code{data.frame} of alleles denoting a genotype,
998 #' as returned by \link{inferGenotype}.
816999 #' @param facet_by a column name in \code{genotype} to facet the plot by.
8171000 #' If \code{NULL}, then do not facet the plot.
8181001 #' @param gene_sort a string defining the method to use when sorting alleles.
8191002 #' If \code{"name"} then sort in lexicographic order. If
8201003 #' \code{"position"} then sort by position in the locus, as
8211004 #' determined by the final two numbers in the gene name.
822 #' @param text_size the point size of the plotted text
1005 #' @param text_size the point size of the plotted text.
8231006 #' @param silent if \code{TRUE} do not draw the plot and just return the ggplot
8241007 #' object; if \code{FALSE} draw the plot.
8251008 #' @param ... additional arguments to pass to ggplot2::theme.
8291012 #' @seealso \link{inferGenotype}
8301013 #'
8311014 #' @examples
832 #' # Load example data
833 #' data(novel_df)
834 #' data(genotype)
835 #'
8361015 #' # Plot genotype
837 #' plotGenotype(genotype)
1016 #' plotGenotype(SampleGenotype)
8381017 #'
8391018 #' # Facet by subject
840 #' genotypea = genotypeb = genotype
841 #' genotypea$SUBJECT = "A"
842 #' genotypeb$SUBJECT = "B"
843 #' geno_sub = rbind(genotypea, genotypeb)
1019 #' genotype_a <- genotype_b <- SampleGenotype
1020 #' genotype_a$SUBJECT <- "A"
1021 #' genotype_b$SUBJECT <- "B"
1022 #' geno_sub <- rbind(genotype_a, genotype_b)
8441023 #' plotGenotype(geno_sub, facet_by="SUBJECT", gene_sort="pos")
8451024 #'
8461025 #' @export
847 plotGenotype = function(genotype, facet_by=NULL, gene_sort=c("name", "position"),
848 text_size=12, silent=FALSE, ...) {
849 # Check arguments
850 gene_sort <- match.arg(gene_sort)
851
852 # Split genes' alleles into their own rows
853 alleles = strsplit(genotype$ALLELES, ",")
854 geno2 = genotype
855 r = 1
856 for (g in 1:nrow(genotype)){
857 for(a in 1:length(alleles[[g]])) {
858 geno2[r, ] = genotype[g, ]
859 geno2[r, ]$ALLELES = alleles[[g]][a]
860 r = r + 1
861 }
862 }
863
864 # Set the gene order
865 geno2$GENE = factor(geno2$GENE,
866 levels=rev(sortAlleles(unique(geno2$GENE), method=gene_sort)))
867
868 # Create the base plot
869 p = ggplot(geno2, aes_(x=~GENE, fill=~ALLELES)) +
870 theme_bw() +
871 theme(axis.ticks=element_blank(),
872 axis.text.x=element_blank(),
873 panel.grid.major=element_blank(),
874 panel.grid.minor=element_blank(),
875 text=element_text(size=text_size),
876 strip.background=element_blank(),
877 strip.text=element_text(face="bold")) +
878 geom_bar(position="fill") +
879 coord_flip() + xlab("Gene") + ylab("") +
880 scale_fill_hue(name="Allele", h=c(0, 270), h.start=10)
881
882 # Plot, with facets by SUBJECT if that column is present
883 if (!is.null(facet_by)) {
884 p = p + facet_grid(paste0(".~", facet_by))
885 }
886
887 # Add additional theme elements
888 p = p + do.call(theme, list(...))
889
890 # Plot
891 if (!silent) { plot(p) }
892
893 invisible(p)
1026 plotGenotype <- function(genotype, facet_by=NULL, gene_sort=c("name", "position"),
1027 text_size=12, silent=FALSE, ...) {
1028 # Check arguments
1029 gene_sort <- match.arg(gene_sort)
1030
1031 # Split genes' alleles into their own rows
1032 alleles = strsplit(genotype$ALLELES, ",")
1033 geno2 = genotype
1034 r = 1
1035 for (g in 1:nrow(genotype)){
1036 for(a in 1:length(alleles[[g]])) {
1037 geno2[r, ] = genotype[g, ]
1038 geno2[r, ]$ALLELES = alleles[[g]][a]
1039 r = r + 1
1040 }
1041 }
1042
1043 # Set the gene order
1044 geno2$GENE = factor(geno2$GENE,
1045 levels=rev(sortAlleles(unique(geno2$GENE), method=gene_sort)))
1046
1047 # Create the base plot
1048 p = ggplot(geno2, aes_(x=~GENE, fill=~ALLELES)) +
1049 theme_bw() +
1050 theme(axis.ticks=element_blank(),
1051 axis.text.x=element_blank(),
1052 panel.grid.major=element_blank(),
1053 panel.grid.minor=element_blank(),
1054 text=element_text(size=text_size),
1055 strip.background=element_blank(),
1056 strip.text=element_text(face="bold")) +
1057 geom_bar(position="fill") +
1058 coord_flip() + xlab("Gene") + ylab("") +
1059 scale_fill_hue(name="Allele", h=c(0, 270), h.start=10)
1060
1061 # Plot, with facets by SUBJECT if that column is present
1062 if (!is.null(facet_by)) {
1063 p = p + facet_grid(paste0(".~", facet_by))
1064 }
1065
1066 # Add additional theme elements
1067 p = p + do.call(theme, list(...))
1068
1069 # Plot
1070 if (!silent) { plot(p) }
1071
1072 invisible(p)
8941073 }
8951074
8961075 #' Return the nucleotide sequences of a genotype
8981077 #' \code{genotypeFasta} converts a genotype table into a vector of nucleotide
8991078 #' sequences.
9001079 #'
901 #' @param genotype a table of alleles denoting a genotype, as returned by
902 #' \link{inferGenotype}
1080 #' @param genotype a \code{data.frame} of alleles denoting a genotype,
1081 #' as returned by \link{inferGenotype}.
9031082 #' @param germline_db a vector of named nucleotide germline sequences
904 #' matching the alleles detailed in \code{genotype}
905 #' @param novel_df an optional \code{data.frame} containing putative
1083 #' matching the alleles detailed in \code{genotype}.
1084 #' @param novel an optional \code{data.frame} containing putative
9061085 #' novel alleles of the type returned by
907 #' \link{findNovelAlleles}
1086 #' \link{findNovelAlleles}.
9081087 #'
9091088 #' @return A named vector of strings containing the germline nucleotide
910 #' sequences of the alleles in the provided genotype
1089 #' sequences of the alleles in the provided genotype.
9111090 #'
9121091 #' @seealso \link{inferGenotype}
9131092 #'
9141093 #' @examples
915 #' # Load example data
916 #' data(germline_ighv)
917 #' data(novel_df)
918 #' data(genotype)
919 #'
9201094 #' # Find the sequences that correspond to the genotype
921 #' genotype_seqs = genotypeFasta(genotype, germline_ighv, novel_df)
922 #'
1095 #' genotype_db <- genotypeFasta(SampleGenotype, GermlineIGHV, SampleNovel)
9231096 #'
9241097 #' @export
925 genotypeFasta <- function(genotype, germline_db, novel_df=NA){
926 if(!is.null(nrow(novel_df))){
927 # Extract novel alleles if any and add them to germline_db
928 novel_df = filter_(novel_df, ~!is.na(POLYMORPHISM_CALL)) %>%
929 select_(~GERMLINE_CALL, ~POLYMORPHISM_CALL, ~NOVEL_IMGT)
930 if(nrow(novel_df) > 0){
931 novel_gl = novel_df$NOVEL_IMGT
932 names(novel_gl) = novel_df$POLYMORPHISM_CALL
933 germline_db = c(germline_db, novel_gl)
934 }
935 }
936
937 genotype$GENE = gsub("D$|d$","",genotype$GENE)
938
939 g_names = names(germline_db)
940 names(g_names) = gsub("D", "", names(germline_db))
941 table_calls = mapply(paste, genotype$GENE, strsplit(genotype$ALLELES, ","),
942 sep="*")
943 seqs = germline_db[as.vector(g_names[unlist(table_calls)])]
944 if(sum(is.na(seqs)) > 0){
945 stop("The following genotype alleles were not found in germline_db: ",
946 paste(unlist(table_calls)[which(is.na(seqs))], collapse = ", "))
947 }
948 return(seqs)
1098 genotypeFasta <- function(genotype, germline_db, novel=NA){
1099 if(!is.null(nrow(novel))){
1100 # Extract novel alleles if any and add them to germline_db
1101 novel = filter_(novel, ~!is.na(POLYMORPHISM_CALL)) %>%
1102 select_(~GERMLINE_CALL, ~POLYMORPHISM_CALL, ~NOVEL_IMGT)
1103 if(nrow(novel) > 0){
1104 novel_gl = novel$NOVEL_IMGT
1105 names(novel_gl) = novel$POLYMORPHISM_CALL
1106 germline_db = c(germline_db, novel_gl)
1107 }
1108 }
1109
1110 genotype$GENE = gsub("D$|d$","",genotype$GENE)
1111
1112 g_names = names(germline_db)
1113 names(g_names) = gsub("D", "", names(germline_db))
1114 table_calls = mapply(paste, genotype$GENE, strsplit(genotype$ALLELES, ","),
1115 sep="*")
1116 seqs = germline_db[as.vector(g_names[unlist(table_calls)])]
1117 if(sum(is.na(seqs)) > 0){
1118 stop("The following genotype alleles were not found in germline_db: ",
1119 paste(unlist(table_calls)[which(is.na(seqs))], collapse = ", "))
1120 }
1121 return(seqs)
9491122 }
9501123
9511124 #' Correct allele calls based on a personalized genotype
9541127 #' correct preliminary allele assignments of a set of sequences derived
9551128 #' from a single subject.
9561129 #'
957 #' @details In order to save time, initial gene assignments are preserved and
1130 #' @details
1131 #' In order to save time, initial gene assignments are preserved and
9581132 #' the allele calls are chosen from among those provided in \code{genotype_db},
9591133 #' based on a simple alignment to the sample sequence.
9601134 #'
961 #' @param clip_db a \code{data.frame} containing V allele calls from a
962 #' single subject and the sample
963 #' IMGT-gapped V(D)J sequences under
964 #' \code{"SEQUENCE_IMGT"}
1135 #' @param data a \code{data.frame} containing V allele calls from a
1136 #' single subject and the sample IMGT-gapped V(D)J sequences under
1137 #' \code{"SEQUENCE_IMGT"}.
9651138 #' @param genotype_db a vector of named nucleotide germline sequences
9661139 #' matching the calls detailed in \code{allele_calls}
9671140 #' and personalized to the subject
968 #' @param v_call name of the column in \code{clip_db} with V allele
969 #' calls. Default is \code{"V_CALL"}
1141 #' @param v_call name of the column in \code{data} with V allele
1142 #' calls. Default is \code{"V_CALL"}.
9701143 #' @param method the method to be used when realigning sequences to
971 #' the genotype_db sequences. Currently only "hammming"
1144 #' the genotype_db sequences. Currently, only \code{"hamming"}
9721145 #' (for Hamming distance) is implemented.
9731146 #' @param path directory containing the tool used in the
9741147 #' realignment method, if needed. Hamming distance does
9751148 #' not require a path to a tool.
976 #' @param keep_gene logical indicating if gene assignments should be
977 #' maintained when possible. Increases speed by
978 #' minimizing required number of alignments. Currently
979 #' only "TRUE" is implemented.
980 #'
981 #' @return a single-column \code{data.frame} corresponding to \code{clip.db}
982 #' and containing the best allele call from among the sequences
983 #' listed in \code{genotype_db}
1149 #' @param keep_gene a string indicating if the gene (\code{"gene"}),
1150 #' family (\code{"family"}) or complete repertoire
1151 #' (\code{"repertoire"}) assignments should be performed.
1152 #' Use of \code{"gene"} increases speed by minimizing required number of
1153 #' alignments, as gene level assignments will be maintained when possible.
1154 #'
1155 #' @return A modified input \code{data.frame} containing the best allele call from
1156 #' among the sequences listed in \code{genotype_db} in the
1157 #' \code{V_CALL_GENOTYPED} column.
9841158 #'
9851159 #' @examples
986 #' # Load example data
987 #' data(germline_ighv)
988 #' data(sample_db)
989 #' data(genotype)
990 #' data(novel_df)
991 #'
9921160 #' # Extract the database sequences that correspond to the genotype
993 #' genotype_seqs = genotypeFasta(genotype, germline_ighv, novel_df)
1161 #' genotype_db <- genotypeFasta(SampleGenotype, GermlineIGHV, novel=SampleNovel)
9941162 #'
9951163 #' # Use the personalized genotype to determine corrected allele assignments
996 #' V_CALL_GENOTYPED = reassignAlleles(sample_db, genotype_seqs)
997 #' sample_db = cbind(sample_db, V_CALL_GENOTYPED)
1164 #' output_db <- reassignAlleles(SampleDb, genotype_db)
9981165 #'
9991166 #' @export
1000 reassignAlleles <- function(clip_db, genotype_db, v_call="V_CALL",
1167 reassignAlleles <- function(data, genotype_db, v_call="V_CALL",
10011168 method="hamming", path=NA,
1002 keep_gene=TRUE){
1003
1004 # Extract data subset and prepare output vector
1005 v_sequences = as.character(clip_db$SEQUENCE_IMGT)
1006 v_calls = getAllele(clip_db[[v_call]], first=FALSE, strip_d=FALSE)
1007 v_genes = getGene(v_calls, first = TRUE, strip_d=FALSE)
1008 V_CALL_GENOTYPED = rep("", length(v_calls))
1009
1010
1011 if(keep_gene){
1012 # Find which genotype genes are homozygous and assign those alleles first
1013 geno_genes = getGene(names(genotype_db),strip_d=TRUE)
1014 names(geno_genes) = names(genotype_db)
1015 hetero_genes = unique(geno_genes[which(duplicated(geno_genes))])
1016 homo_genes = geno_genes[!(geno_genes %in% hetero_genes)]
1017 homo_alleles = names(homo_genes); names(homo_alleles) = homo_genes
1018 homo_calls_i = which(v_genes %in% homo_genes)
1019 V_CALL_GENOTYPED[homo_calls_i] = homo_alleles[v_genes[homo_calls_i]]
1169 keep_gene=c("gene", "family", "repertoire")){
1170 # Check arguments
1171 keep_gene <- match.arg(keep_gene)
1172
1173 # Extract data subset and prepare output vector
1174 v_sequences = as.character(data$SEQUENCE_IMGT)
1175 v_calls = getAllele(data[[v_call]], first=FALSE, strip_d=FALSE)
1176 v_call_genotyped = rep("", length(v_calls))
1177
1178 if (keep_gene == "gene") {
1179 v = getGene(v_calls, first = TRUE, strip_d=FALSE)
1180 geno = getGene(names(genotype_db),strip_d=TRUE)
1181 names(geno) = names(genotype_db)
1182 } else if (keep_gene == "family") {
1183 v <- getFamily(v_calls, first = TRUE, strip_d = FALSE)
1184 geno = getFamily(names(genotype_db),strip_d=TRUE)
1185 names(geno) = names(genotype_db)
1186 } else if (keep_gene == "repertoire") {
1187 v <- rep(v_call, length(v_calls))
1188 geno = rep(v_call, length(genotype_db))
1189 names(geno) = names(genotype_db)
1190 } else {
1191 stop("Unknown keep_gene value: ", keep_gene)
1192 }
1193
1194 # keep_gene == FALSE
1195 # Find which genotype genes/families are homozygous and assign those alleles first
1196 hetero = unique(geno[which(duplicated(geno))])
1197 homo = geno[!(geno %in% hetero)]
1198 homo_alleles = names(homo)
1199 names(homo_alleles) = homo
1200 homo_calls_i = which(v %in% homo)
1201 v_call_genotyped[homo_calls_i] = homo_alleles[v[homo_calls_i]]
10201202
10211203 # Now realign the heterozygote sequences to each allele of that gene
1022 for (het_gene in hetero_genes){
1023 ind = which(v_genes %in% het_gene)
1024 if (length(ind) > 0){
1025 het_alleles = names(geno_genes[which(geno_genes == het_gene)])
1026 het_seqs = genotype_db[het_alleles]
1027 if(method == "hamming"){
1028 dists = lapply(het_seqs, function(x)
1029 sapply(getMutatedPositions(v_sequences[ind], x, match_instead=FALSE),
1030 length))
1031 dist_mat = matrix(unlist(dists), ncol = length(het_seqs))
1204 for (het in hetero){
1205 ind = which(v %in% het)
1206 if (length(ind) > 0){
1207 het_alleles = names(geno[which(geno == het)])
1208 het_seqs = genotype_db[het_alleles]
1209 if(method == "hamming"){
1210 dists = lapply(het_seqs, function(x)
1211 sapply(getMutatedPositions(v_sequences[ind], x, match_instead=FALSE),
1212 length))
1213 dist_mat = matrix(unlist(dists), ncol = length(het_seqs))
1214 } else {
1215 stop("Only Hamming distance is currently supported as a method.")
1216 }
1217 # The sapply-apply approach could become problematic when nrow(dist_mat)
1218 # is 1 and min(best_match) has multiple values, due to the fact that R
1219 # does not always keep data structures unchanged
1220 # Explicitly specifying a list and subsequently keeping it as a list by
1221 # using lapply avoids that problem
1222 best_match = vector("list", length=nrow(dist_mat))
1223 for (i in 1:nrow(dist_mat)) {
1224 best_match[[i]] = which(dist_mat[i, ]==min(dist_mat[i, ]))
1225 }
1226 best_alleles = lapply(best_match, function(x) het_alleles[x])
1227 v_call_genotyped[ind] = unlist(lapply(best_alleles, paste, collapse=","))
1228 }
1229 }
1230
1231 # Now realign the gene-not-in-genotype calls to every genotype allele
1232 hetero_calls_i = which(v %in% hetero)
1233 not_called = setdiff(1:length(v), c(homo_calls_i, hetero_calls_i))
1234 if(length(not_called)>1){
1235 if(method == "hamming"){
1236 dists = lapply(genotype_db, function(x)
1237 sapply(getMutatedPositions(v_sequences[not_called], x, match_instead=FALSE),
1238 length))
1239 dist_mat = matrix(unlist(dists), ncol = length(genotype_db))
10321240 } else {
1033 stop("Only Hamming distance is currently supported as a method.")
1241 stop("Only Hamming distance is currently supported as a method.")
10341242 }
1035 best_match = apply(dist_mat, 1, function(x) which(x == min(x)))
1036 best_alleles = sapply(best_match, function(x) het_alleles[x])
1037 V_CALL_GENOTYPED[ind] = sapply(best_alleles, paste, collapse=",")
1038 }
1039 }
1040
1041 # Now realign the gene-not-in-genotype calls to every genotype allele
1042 hetero_calls_i = which(v_genes %in% hetero_genes)
1043 not_called = setdiff(1:length(v_genes), c(homo_calls_i, hetero_calls_i))
1044 if(length(not_called)>1){
1045 if(method == "hamming"){
1046 dists = lapply(genotype_db, function(x)
1047 sapply(getMutatedPositions(v_sequences[not_called], x, match_instead=FALSE),
1048 length))
1049 dist_mat = matrix(unlist(dists), ncol = length(genotype_db))
1050 } else {
1051 stop("Only Hamming distance is currently supported as a method.")
1052 }
1053 best_match = apply(dist_mat, 1, function(x) which(x == min(x)))
1054 best_alleles = sapply(best_match, function(x) names(genotype_db[x]))
1055 V_CALL_GENOTYPED[not_called] = sapply(best_alleles, paste, collapse=",")
1056 }
1057 } else {
1058 stop("Complete realignment is currently not supported.")
1059 }
1060
1061 return(data.frame(V_CALL_GENOTYPED,stringsAsFactors=FALSE))
1243 # The sapply-apply approach could become problematic when nrow(dist_mat)
1244 # is 1 and min(best_match) has multiple values, due to the fact that R
1245 # does not always keep data structures unmutable
1246 # Explicitly specifying a list and subsequently keeping it as a list by
1247 # using lapply avoids that problem
1248 best_match = vector("list", length=nrow(dist_mat))
1249 for (i in 1:nrow(dist_mat)) {
1250 best_match[[i]] = which(dist_mat[i, ]==min(dist_mat[i, ]))
1251 }
1252 best_alleles = lapply(best_match, function(x) names(genotype_db[x]))
1253 v_call_genotyped[not_called] = unlist(lapply(best_alleles, paste, collapse=","))
1254 }
1255
1256 if (all(v_call_genotyped == data[[v_call]])) {
1257 msg <- ("No allele assignment corrections made.")
1258 if (all(v %in% homo) & length(hetero) > 0) {
1259 keep_opt <- eval(formals(reassignAlleles)$keep_gene)
1260 i <- match(keep_gene, keep_opt)
1261 rec_opt <- paste(keep_opt[(i+1):length(keep_opt)], collapse = ", ")
1262 msg <- paste(msg, "Consider setting keep_gene to one of:", rec_opt)
1263 }
1264 warning(msg)
1265 }
1266
1267 data$V_CALL_GENOTYPED <- v_call_genotyped
1268
1269 return(data)
10621270 }
10631271
10641272
10851293 #'
10861294 #' @examples
10871295 #' # Create strings to act as a sample sequences and a reference sequence
1088 #' seqs = c("----GATA","GAGAGAGA","TANA")
1089 #' ref = "GATAGATA"
1296 #' seqs <- c("----GATA", "GAGAGAGA", "TANA")
1297 #' ref <- "GATAGATA"
10901298 #'
10911299 #' # Find the differences between the two
10921300 #' getMutatedPositions(seqs, ref)
10941302 #' @export
10951303 getMutatedPositions <- function(samples, germlines, ignored_regex="[\\.N-]",
10961304 match_instead=FALSE) {
1097
1098 # If only one germline sequence is given, use it for all the sample seqs
1099 if(length(germlines) == 1){ germlines = rep(germlines, length(samples)) }
1100 if(length(samples) != length(germlines)) {
1101 stop("Number of input sequences does not match number of germlines.")
1102 }
1103
1104 # Truncate each pair of sequences to the length of the shorter
1105 germ_mins = lapply(germlines, nchar)
1106 samp_mins = lapply(samples, nchar)
1107 min_lens = mapply(min, germ_mins, samp_mins)
1108 germ = toupper(mapply(substr, germlines, 1, min_lens, SIMPLIFY=FALSE))
1109 samp = toupper(mapply(substr, samples, 1, min_lens, SIMPLIFY=FALSE))
1110
1111 # Calculate poisitions of mutations (or matches), ignoring gaps, Ns, and CDR3
1112 samp_char = strsplit(samp,"")
1113 germ_char = strsplit(germ,"")
1114 if(!match_instead){
1115 muts = lapply(mapply("!=", samp_char, germ_char, SIMPLIFY=FALSE), which)
1116 } else {
1117 muts = lapply(mapply("==", samp_char, germ_char, SIMPLIFY=FALSE), which)
1118 }
1119 ignore_germ = gregexpr(ignored_regex, germ)
1120 ignore_samp = gregexpr(ignored_regex, samp)
1121 ignore = mapply(c, ignore_germ, ignore_samp, SIMPLIFY=FALSE)
1122
1123 muts = mapply(function(x, y) x[!x%in%y], muts, ignore, SIMPLIFY=FALSE)
1124 return(muts)
1305
1306 # If only one germline sequence is given, use it for all the sample seqs
1307 if(length(germlines) == 1){ germlines = rep(germlines, length(samples)) }
1308 if(length(samples) != length(germlines)) {
1309 stop("Number of input sequences does not match number of germlines.")
1310 }
1311
1312 # Truncate each pair of sequences to the length of the shorter
1313 germ_mins = lapply(germlines, nchar)
1314 samp_mins = lapply(samples, nchar)
1315 min_lens = mapply(min, germ_mins, samp_mins)
1316 germ = toupper(mapply(substr, germlines, 1, min_lens, SIMPLIFY=FALSE))
1317 samp = toupper(mapply(substr, samples, 1, min_lens, SIMPLIFY=FALSE))
1318
1319 # Calculate positions of mutations (or matches), ignoring gaps, Ns, and CDR3
1320 samp_char = strsplit(samp,"")
1321 germ_char = strsplit(germ,"")
1322 if(!match_instead){
1323 muts = lapply(mapply("!=", samp_char, germ_char, SIMPLIFY=FALSE), which)
1324 } else {
1325 muts = lapply(mapply("==", samp_char, germ_char, SIMPLIFY=FALSE), which)
1326 }
1327 ignore_germ = gregexpr(ignored_regex, germ)
1328 ignore_samp = gregexpr(ignored_regex, samp)
1329 ignore = mapply(c, ignore_germ, ignore_samp, SIMPLIFY=FALSE)
1330
1331 muts = mapply(function(x, y) x[!x%in%y], muts, ignore, SIMPLIFY=FALSE)
1332 return(muts)
11251333 }
11261334
11271335
11431351 #' each element of \code{samples}
11441352 #'
11451353 #' @examples
1146 #' # Load germline database
1147 #' data(germline_ighv)
1148 #'
1149 #' # Use createGermlines to insert a mutation into a germline sequence
1150 #' #sample_seqs = c(germline_ighv[2],
1151 #' # createGermlines(germline_ighv[1], 103, "G"),
1152 #' # createGermlines(germline_ighv[1], 107, "C"))
1354 #' # Insert a mutation into a germline sequence
1355 #' s2 <- s3 <- GermlineIGHV[1]
1356 #' stringi::stri_sub(s2, 103, 103) <- "G"
1357 #' stringi::stri_sub(s3, 107, 107) <- "C"
1358 #'
1359 #' sample_seqs <- c(GermlineIGHV[2], s2, s3)
11531360 #'
11541361 #' # Pretend that one sample sequence has received an ambiguous allele call
1155 #' #sample_alleles = c(paste(names(germline_ighv[1:2]), collapse=","),
1156 #' # names(germline_ighv[2]),
1157 #' # names(germline_ighv[1]))
1362 #' sample_alleles <- c(paste(names(GermlineIGHV[1:2]), collapse=","),
1363 #' names(GermlineIGHV[2]),
1364 #' names(GermlineIGHV[1]))
11581365 #'
11591366 #' # Compare each sequence to its assigned germline(s) to determine the distance
1160 #' #getMutCount(sample_seqs, sample_alleles, germline_ighv)
1367 #' getMutCount(sample_seqs, sample_alleles, GermlineIGHV)
11611368 #'
11621369 #' @export
11631370 getMutCount <- function(samples, allele_calls, germline_db){
1164
1165 call_list = strsplit(allele_calls, ",")
1166
1167 germline_list = lapply(call_list, function(x) germline_db[x])
1168
1169 mut_pos_list = list()
1170 mut_count_list = list()
1171 # First, find mutations of all sequences with call count of 1
1172 call_count = sapply(germline_list, length)
1173 cc1 = which(call_count == 1)
1174 if (length(cc1) > 0) {
1175 mut_pos_list[cc1] = getMutatedPositions(samples[cc1],
1176 unlist(germline_list[cc1]))
1177 mut_count_list[cc1] = lapply(mut_pos_list[cc1], length)
1178 }
1179 # Then find mutations of all sequences with call count > 1
1180 ccm = which(call_count > 1)
1181 if (length(ccm) > 0){
1182 mut_pos_list[ccm] = mapply(getMutatedPositions,
1183 germline_list[ccm], samples[ccm],
1184 SIMPLIFY=FALSE)
1185 mut_count_list[ccm] = lapply(mut_pos_list[ccm],
1186 function(x) lapply(x,length))
1187 }
1188
1189 return(mut_count_list)
1371
1372 call_list = strsplit(allele_calls, ",")
1373
1374 germline_list = lapply(call_list, function(x) germline_db[x])
1375
1376 mut_pos_list = list()
1377 mut_count_list = list()
1378 # First, find mutations of all sequences with call count of 1
1379 call_count = sapply(germline_list, length)
1380 cc1 = which(call_count == 1)
1381 if (length(cc1) > 0) {
1382 mut_pos_list[cc1] = getMutatedPositions(samples[cc1],
1383 unlist(germline_list[cc1]))
1384 mut_count_list[cc1] = lapply(mut_pos_list[cc1], length)
1385 }
1386 # Then find mutations of all sequences with call count > 1
1387 ccm = which(call_count > 1)
1388 if (length(ccm) > 0){
1389 mut_pos_list[ccm] = mapply(getMutatedPositions,
1390 germline_list[ccm], samples[ccm],
1391 SIMPLIFY=FALSE)
1392 mut_count_list[ccm] = lapply(mut_pos_list[ccm],
1393 function(x) lapply(x,length))
1394 }
1395
1396 return(mut_count_list)
11901397 }
11911398
11921399 #' Determine which calls represent an unmutated allele
11971404 #' sequence, only the subset that would represent a perfect match is returned.
11981405 #'
11991406 #' @param allele_calls a vector of strings representing Ig allele calls,
1200 #' where multiple calls are separated by a comma
1407 #' where multiple calls are separated by a comma.
12011408 #' @param germline_db a vector of named nucleotide germline sequences
12021409 #' @param sample_seqs V(D)J-rearranged sample sequences matching the order
1203 #' of the given \code{allele_calls}
1410 #' of the given \code{allele_calls}.
12041411 #'
12051412 #' @return A vector of strings containing the members of \code{allele_calls}
1206 #' that represent unmutated sequences
1413 #' that represent unmutated sequences.
12071414 #'
12081415 #' @examples
1209 #' # Load data
1210 #' data(germline_ighv)
1211 #' data(sample_db)
1212 #'
12131416 #' # Find which of the sample alleles are unmutated
1214 #' calls <- findUnmutatedCalls(sample_db$V_CALL, sample_db$SEQUENCE_IMGT,
1215 #' germline_db=germline_ighv)
1417 #' calls <- findUnmutatedCalls(SampleDb$V_CALL, SampleDb$SEQUENCE_IMGT,
1418 #' germline_db=GermlineIGHV)
12161419 #'
12171420 #' @export
12181421 findUnmutatedCalls <- function(allele_calls, sample_seqs, germline_db){
1219 . = NULL
1220 allele_calls = getAllele(allele_calls, first = FALSE)
1221 sample_seqs = as.character(sample_seqs)
1222
1223 # Remove calls not in germline_db
1224 not_in_db = allele_calls %>%
1225 strsplit(",") %>%
1226 unlist %>%
1227 setdiff(names(germline_db))
1228 no_call = which(allele_calls == "")
1229 in_db = not_in_db %>%
1230 sapply(grep, allele_calls, fixed=TRUE) %>%
1231 unlist() %>%
1232 c(no_call) %>%
1233 unique() %>%
1234 setdiff(1:length(allele_calls), .)
1235 allele_calls = allele_calls[in_db]
1236 sample_seqs = sample_seqs[in_db]
1237
1238 mut_counts = getMutCount(sample_seqs, allele_calls, germline_db)
1239
1240 # Find which seqs are unmutated and which of the allele calls that represents
1241 unmut_i = which(sapply(mut_counts, function(x) min(unlist(x))) == 0)
1242 which_no_muts = sapply(mut_counts, function(x) grep("^0$", unlist(x)) )
1243 unmut_alleles = rep("", length(allele_calls))
1244
1245 # How many alleles represent perfect matches?
1246 n_gl_unmut = sapply(which_no_muts, length)
1247
1248 one_unmut = which(n_gl_unmut == 1)
1249 split_names = strsplit(allele_calls, ",")
1250 if (length(one_unmut) > 0){
1251 inds = unlist(which_no_muts[one_unmut])
1252 unmut_alleles[one_unmut] = mapply("[", split_names[one_unmut], inds)
1253 }
1254
1255 more_unmut = which(n_gl_unmut > 1)
1256 if (length(more_unmut) > 0){
1257 inds = which_no_muts[more_unmut]
1258 unmut_multi = mapply(function(x,y) x[unlist(y)], split_names[more_unmut],
1259 inds, SIMPLIFY = FALSE)
1260 unmut_alleles[more_unmut] = sapply(unmut_multi, paste, collapse=",")
1261 }
1262
1263 unmut_alleles = unmut_alleles[unmut_i]
1264
1265 return(unmut_alleles)
1266
1267 }
1268
1269 #' Find Frequent Sequences' Mutation Counts
1422 . = NULL
1423 allele_calls = getAllele(allele_calls, first = FALSE)
1424 sample_seqs = as.character(sample_seqs)
1425
1426 # Remove calls not in germline_db
1427 not_in_db = allele_calls %>%
1428 strsplit(",") %>%
1429 unlist %>%
1430 setdiff(names(germline_db))
1431 no_call = which(allele_calls == "")
1432 in_db = not_in_db %>%
1433 sapply(grep, allele_calls, fixed=TRUE) %>%
1434 unlist() %>%
1435 c(no_call) %>%
1436 unique() %>%
1437 setdiff(1:length(allele_calls), .)
1438 allele_calls = allele_calls[in_db]
1439 sample_seqs = sample_seqs[in_db]
1440
1441 mut_counts = getMutCount(sample_seqs, allele_calls, germline_db)
1442
1443 # Find which seqs are unmutated and which of the allele calls that represents
1444 unmut_i = which(sapply(mut_counts, function(x) min(unlist(x))) == 0)
1445 which_no_muts = sapply(mut_counts, function(x) grep("^0$", unlist(x)) )
1446 unmut_alleles = rep("", length(allele_calls))
1447
1448 # How many alleles represent perfect matches?
1449 n_gl_unmut = sapply(which_no_muts, length)
1450
1451 one_unmut = which(n_gl_unmut == 1)
1452 split_names = strsplit(allele_calls, ",")
1453 if (length(one_unmut) > 0){
1454 inds = unlist(which_no_muts[one_unmut])
1455 unmut_alleles[one_unmut] = mapply("[", split_names[one_unmut], inds)
1456 }
1457
1458 more_unmut = which(n_gl_unmut > 1)
1459 if (length(more_unmut) > 0){
1460 inds = which_no_muts[more_unmut]
1461 unmut_multi = mapply(function(x,y) x[unlist(y)], split_names[more_unmut],
1462 inds, SIMPLIFY = FALSE)
1463 unmut_alleles[more_unmut] = sapply(unmut_multi, paste, collapse=",")
1464 }
1465
1466 unmut_alleles = unmut_alleles[unmut_i]
1467
1468 return(unmut_alleles)
1469
1470 }
1471
1472 #' Find mutation counts for frequent sequences
12701473 #'
12711474 #' \code{getPopularMutationCount} determines which sequences occur frequently
12721475 #' for each V gene and returns the mutation count of those sequences.
12731476 #'
1274 #' @param sample_db A Change-O db data frame. See
1477 #' @param data a \code{data.frame} in the Change-O format. See
12751478 #' \link{findNovelAlleles} for a list of required
12761479 #' columns.
12771480 #' @param germline_db A named list of IMGT-gapped germline sequences.
12811484 #' to avoid exclusion.
12821485 #' @param seq_p_of_max For each gene, fraction of the most common V sequence's
12831486 #' count that a sequence must meet to avoid exclusion.
1284 #' @param full_return If true, will return all \code{sample_db} columns and
1487 #' @param full_return If \code{TRUE}, will return all \code{data} columns and
12851488 #' will include sequences with mutation count < 1.
12861489 #'
12871490 #' @return A data frame of genes that have a frequent sequence mutation count
12911494 #' of a set of sequences are mutated.
12921495 #'
12931496 #' @examples
1294 #' data(sample_db, germline_ighv)
1295 #' getPopularMutationCount(sample_db, germline_ighv)
1497 #' getPopularMutationCount(SampleDb, GermlineIGHV)
12961498 #'
12971499 #' @export
1298 getPopularMutationCount <- function(sample_db, germline_db, gene_min = 1e-03,
1500 getPopularMutationCount <- function(data, germline_db, gene_min = 1e-03,
12991501 seq_min = 50, seq_p_of_max = 1/8,
13001502 full_return = FALSE){
1301 modified_db = sample_db %>%
1302 mutate_(V_GENE = ~getGene(V_CALL)) %>%
1303 group_by_(~1:n()) %>%
1304 mutate_(V_SEQUENCE_IMGT = ~substring(SEQUENCE_IMGT, 1, 312)) %>%
1305 # Count occurence of each unique IMGT-gapped V sequence
1306 group_by_(~V_GENE, ~V_SEQUENCE_IMGT) %>%
1307 mutate_(V_SEQUENCE_IMGT_N = ~n()) %>%
1308 # Count occurence of each gene and determine count of most common sequence
1309 mutate_(V_GENE_N = ~n()) %>%
1310 mutate_(V_SEQUENCE_IMGT_N_MAX = ~max(V_SEQUENCE_IMGT_N)) %>%
1311 # Remove rare V genes, rare sequences, and sequences not making up a
1312 # sufficient proportion of sequences as compared to the most common
1313 ungroup %>%
1314 distinct_(~V_SEQUENCE_IMGT, .keep_all = TRUE) %>%
1315 filter_(~V_GENE_N >= (nrow(sample_db)*gene_min)) %>%
1316 filter_(~V_SEQUENCE_IMGT_N >= seq_min) %>%
1317 mutate_(V_SEQUENCE_IMGT_P_MAX = ~V_SEQUENCE_IMGT_N/V_SEQUENCE_IMGT_N_MAX) %>%
1318 filter_(~V_SEQUENCE_IMGT_P_MAX >= seq_p_of_max)
1319 # Determine the mutation counts of the V sequences and append them to the db
1320 MUTATION_COUNT = getMutCount(modified_db$V_SEQUENCE_IMGT,
1321 modified_db$V_CALL,
1322 germline_db) %>%
1323 sapply(function(x) min(unlist(x)))
1324 if (length(MUTATION_COUNT)==0){
1325 MUTATION_COUNT = integer(0)
1326 }
1327 merged_db = bind_cols(modified_db, data.frame(MUTATION_COUNT))
1328 # Strip down the data frame before returning it
1329 if (!full_return) {
1330 merged_db = merged_db %>%
1331 filter_(~MUTATION_COUNT > 0) %>%
1332 select_(~V_GENE, ~MUTATION_COUNT)
1333 }
1334 return(merged_db)
1503 modified_db = data %>%
1504 mutate_(V_GENE = ~getGene(V_CALL)) %>%
1505 group_by_(~V_GENE) %>%
1506 mutate_(V_GENE_N = ~n()) %>%
1507 group_by_(~1:n()) %>%
1508 mutate_(V_SEQUENCE_IMGT = ~substring(SEQUENCE_IMGT, 1, 312)) %>%
1509 # Count occurrence of each unique IMGT-gapped V sequence
1510 group_by_(~V_GENE, ~V_SEQUENCE_IMGT) %>%
1511 mutate_(V_SEQUENCE_IMGT_N = ~n()) %>%
1512 # Determine count of most common sequence
1513 group_by_(~V_GENE) %>%
1514 mutate_(V_SEQUENCE_IMGT_N_MAX = ~max(V_SEQUENCE_IMGT_N)) %>%
1515 # Remove rare V genes, rare sequences, and sequences not making up a
1516 # sufficient proportion of sequences as compared to the most common
1517 ungroup %>%
1518 distinct_(~V_SEQUENCE_IMGT, .keep_all = TRUE) %>%
1519 filter_(~V_GENE_N >= (nrow(data)*gene_min)) %>%
1520 filter_(~V_SEQUENCE_IMGT_N >= seq_min) %>%
1521 mutate_(V_SEQUENCE_IMGT_P_MAX = ~V_SEQUENCE_IMGT_N/V_SEQUENCE_IMGT_N_MAX) %>%
1522 filter_(~V_SEQUENCE_IMGT_P_MAX >= seq_p_of_max)
1523 # Determine the mutation counts of the V sequences and append them to the db
1524 MUTATION_COUNT = getMutCount(modified_db$V_SEQUENCE_IMGT,
1525 modified_db$V_CALL,
1526 germline_db) %>%
1527 sapply(function(x) min(unlist(x)))
1528 if (length(MUTATION_COUNT)==0){
1529 MUTATION_COUNT = integer(0)
1530 }
1531 merged_db = bind_cols(modified_db, data.frame(MUTATION_COUNT))
1532 # Strip down the data frame before returning it
1533 if (!full_return) {
1534 merged_db = merged_db %>%
1535 filter_(~MUTATION_COUNT > 0) %>%
1536 select_(~V_GENE, ~MUTATION_COUNT)
1537 }
1538 return(merged_db)
13351539 }
13361540
13371541 #' Insert polymorphisms into a nucleotide sequence
13391543 #' \code{insertPolymorphisms} replaces nucleotides in the desired locations of a
13401544 #' provided sequence.
13411545 #'
1342 #'
1343 #' @param sequence the starting nucletide sequence
1344 #' @param positions a vector of positions which to be changed
1345 #' @param nucleotides a vector of nucletides to which to change the
1346 #' positions
1347 #' @return a sequence with the desired nucleotides in provided locations
1546 #' @param sequence starting nucleotide sequence.
1547 #' @param positions numeric vector of positions to be changed.
1548 #' @param nucleotides character vector of nucleotides to which to change the
1549 #' positions.
1550 #'
1551 #' @return A sequence with the desired nucleotides in the provided locations.
13481552 #'
13491553 #' @examples
1350 #' insertPolymorphisms("hugged", c(1,6,2), c("t","r","i"))
1554 #' insertPolymorphisms("HUGGED", c(1, 6, 2), c("T", "R", "I"))
13511555 #'
13521556 #' @export
1353 insertPolymorphisms <- function(sequence, positions, nucleotides){
1354
1355 if(length(positions) != length(nucleotides)){
1356 stop("Number of nucleotides and number of positions do not match.")
1357 }
1358 names(positions) = nucleotides
1359 for (i in 1:length(positions)){
1360 substr(sequence, positions[i], positions[i]) = names(positions[i])
1361 }
1362
1363 return(sequence)
1557 insertPolymorphisms <- function(sequence, positions, nucleotides) {
1558
1559 if(length(positions) != length(nucleotides)){
1560 stop("Number of nucleotides and number of positions do not match.")
1561 }
1562 names(positions) = nucleotides
1563 for (i in 1:length(positions)){
1564 substr(sequence, positions[i], positions[i]) = names(positions[i])
1565 }
1566
1567 return(sequence)
13641568 }
13651569
13661570 # Formatting and Cleanup --------------------------------------------------
13701574 #' \code{readIgFasta} reads a fasta-formatted file of immunoglobulin (Ig)
13711575 #' sequences and returns a named vector of those sequences.
13721576 #'
1373 #' @param fasta_file fasta-formatted file of immunoglobuling sequences
1577 #' @param fasta_file fasta-formatted file of immunoglobulin sequences.
13741578 #' @param strip_down_name if \code{TRUE}, will extract only the allele name
1375 #' from the strings fasta file's sequence names
1579 #' from the fasta file's sequence names.
13761580 #' @param force_caps if \code{TRUE}, will force nucleotides to
1377 #' uppercase
1378 #' @return a named vector of strings respresenting Ig alleles
1581 #' uppercase.
1582 #'
1583 #' @return Named vector of strings representing Ig alleles.
13791584 #'
13801585 #' @seealso \link{writeFasta} to do the inverse.
13811586 #'
13821587 #' @export
1383 readIgFasta <- function(fasta_file,
1384 strip_down_name = TRUE,
1385 force_caps = TRUE){
1386 all_char = readChar(fasta_file, file.info(fasta_file)$size)
1387 split_by_sequence = strsplit(all_char, "[ \t\r\n\v\f]?>")
1388 add_name_break = sapply(split_by_sequence, function(x) sub("[\r\n]",">",x))
1389 cleaned_up = sapply(add_name_break, function(x) gsub("[ \t\r\n\v\f]", "", x))
1390 broken_names = sapply(cleaned_up, strsplit, ">")
1391 seqs = sapply(broken_names, "[", 2)
1392 seq_names = sapply(broken_names, "[", 1)
1393 if(force_caps){ seqs = toupper(seqs) }
1394 if(strip_down_name){ seq_names = getAllele(seq_names, strip_d=FALSE) }
1395 names(seqs) = seq_names
1396 return(seqs[which(!is.na(seqs))])
1588 readIgFasta <- function(fasta_file, strip_down_name=TRUE, force_caps=TRUE) {
1589 all_char = readChar(fasta_file, file.info(fasta_file)$size)
1590 split_by_sequence = strsplit(all_char, "[ \t\r\n\v\f]?>")
1591 add_name_break = sapply(split_by_sequence, function(x) sub("[\r\n]",">",x))
1592 cleaned_up = sapply(add_name_break, function(x) gsub("[ \t\r\n\v\f]", "", x))
1593 broken_names = sapply(cleaned_up, strsplit, ">")
1594
1595 seqs = sapply(broken_names, "[", 2)
1596 seq_names = sapply(broken_names, "[", 1)
1597 if(force_caps) { seqs = toupper(seqs) }
1598 if(strip_down_name){ seq_names = getAllele(seq_names, strip_d=FALSE) }
1599 names(seqs) = seq_names
1600
1601 return(seqs[which(!is.na(seqs))])
13971602 }
13981603
13991604 #' Write to a fasta file
14021607 #' format.
14031608 #'
14041609 #' @param named_sequences a named vector of strings representing sequences
1405 #' @param file the name of the output file
1610 #' @param file the name of the output file.
14061611 #' @param width the number of characters to be printed per line.
1407 #' If not between 1 and 255, width with be infinite.
1612 #' if not between 1 and 255, width will be infinite.
14081613 #' @param append \code{logical} indicating if the output should be
14091614 #' appended to \code{file} instead of overwriting it
14101615 #'
1411 #' @return a named vector of strings respresenting Ig alleles
1616 #' @return A named vector of strings representing Ig alleles.
14121617 #'
14131618 #' @seealso \link{readIgFasta} to do the inverse.
14141619 #'
14151620 #' @export
14161621 writeFasta <- function(named_sequences, file, width=60, append=FALSE){
1417 . = NULL
1418 seq_names = names(named_sequences) %>%
1419 paste(">", ., "\n", sep="")
1420 seqs = as.character(named_sequences)
1421 if(is.numeric(width) & width > 0 & width < 256){
1422 width_regex = paste("(.{", width, ",", width, "})", sep="")
1423 seqs = gsub(width_regex, "\\1\n", seqs)
1424 }
1425 seqs = seqs %>%
1426 paste("\n", sep="") %>%
1427 gsub("\n\n", "\n", .)
1428 paste(seq_names, seqs, sep="", collapse="") %>%
1429 cat(file=file, append=append)
1622 . = NULL
1623 seq_names = names(named_sequences) %>%
1624 paste(">", ., "\n", sep="")
1625 seqs = as.character(named_sequences)
1626 if(is.numeric(width) & width > 0 & width < 256){
1627 width_regex = paste("(.{", width, ",", width, "})", sep="")
1628 seqs = gsub(width_regex, "\\1\n", seqs)
1629 }
1630 seqs = seqs %>%
1631 paste("\n", sep="") %>%
1632 gsub("\n\n", "\n", .)
1633 paste(seq_names, seqs, sep="", collapse="") %>%
1634 cat(file=file, append=append)
14301635 }
14311636
14321637 #' Update IGHV allele names
14331638 #'
14341639 #' \code{updateAlleleNames} takes a set of IGHV allele calls and replaces any
14351640 #' outdated names (e.g. IGHV1-f) with the new IMGT names.
1436 #' @details The updated allele names are based on IMGT release 201408-4.
1437 #' @note IGMT has removed IGHV2-5*10 and IGHV2-5*07 as it has determined they
1438 #' are actually alleles *02 and *04, respectively.
1439 #'
1440 #' @param allele_calls a vector of strings respresenting IGHV allele names
1441 #'
1442 #' @return vector of strings respresenting updated IGHV allele names
1443 #'
1444 #' @references Xochelli et al. (2014) Immunoglobulin heavy variable (IGHV) genes
1445 #' and alleles: new entities, new names and implications for research and
1446 #' prognostication in chronic lymphocytic leukaemia. Immunogenetics. 67(1):61-6
1641 #'
1642 #' @param allele_calls a vector of strings representing IGHV allele names.
1643 #'
1644 #' @return Vector of strings representing updated IGHV allele names.
1645 #'
1646 #' @note
1647 #' IMGT has removed \code{IGHV2-5*10} and \code{IGHV2-5*07} as it has determined they
1648 #' are actually alleles \code{02} and \code{04}, respectively. The updated allele
1649 #' names are based on IMGT release 201408-4.
1650 #'
1651 #' @references
1652 #' \enumerate{
1653 #' \item Xochelli et al. (2014) Immunoglobulin heavy variable (IGHV) genes
1654 #' and alleles: new entities, new names and implications for research and
1655 #' prognostication in chronic lymphocytic leukaemia. Immunogenetics. 67(1):61-6
1656 #' }
14471657 #'
14481658 #' @seealso Like \code{updateAlleleNames}, \link{sortAlleles} can help
14491659 #' format a list of allele names.
14501660 #'
14511661 #' @examples
14521662 #' # Create a vector that uses old gene/allele names.
1453 #' alleles = c("IGHV1-c*01", "IGHV1-f*02", "IGHV2-5*07")
1663 #' alleles <- c("IGHV1-c*01", "IGHV1-f*02", "IGHV2-5*07")
14541664 #'
14551665 #' # Update the alleles to the new names
14561666 #' updateAlleleNames(alleles)
14571667 #'
14581668 #' @export
1459 updateAlleleNames <- function(allele_calls){
1460 . = NULL
1461 temporary_names = c("IGHV1-c*",
1462 "IGHV1-f*",
1463 "IGHV3-d*",
1464 "IGHV3-h*",
1465 "IGHV4-b*",
1466 "IGHV5-a*",
1467 "IGHV2-5*10",
1468 "IGHV2-5*07")
1469 definitive_names = c("IGHV1-38-4*",
1470 "IGHV1-69-2*",
1471 "IGHV3-38-3*",
1472 "IGHV3-69-1*",
1473 "IGHV4-38-2*",
1474 "IGHV5-10-1*",
1475 "IGHV2-5*02",
1476 "IGHV2-5*04")
1477 for (i in 1:length(temporary_names)){
1478 allele_calls = allele_calls %>%
1479 gsub(temporary_names[i], definitive_names[i], ., fixed = TRUE)
1480 }
1481 return(allele_calls)
1669 updateAlleleNames <- function(allele_calls) {
1670 . = NULL
1671 temporary_names = c("IGHV1-c*",
1672 "IGHV1-f*",
1673 "IGHV3-d*",
1674 "IGHV3-h*",
1675 "IGHV4-b*",
1676 "IGHV5-a*",
1677 "IGHV2-5*10",
1678 "IGHV2-5*07")
1679 definitive_names = c("IGHV1-38-4*",
1680 "IGHV1-69-2*",
1681 "IGHV3-38-3*",
1682 "IGHV3-69-1*",
1683 "IGHV4-38-2*",
1684 "IGHV5-10-1*",
1685 "IGHV2-5*02",
1686 "IGHV2-5*04")
1687 for (i in 1:length(temporary_names)){
1688 allele_calls = allele_calls %>%
1689 gsub(temporary_names[i], definitive_names[i], ., fixed = TRUE)
1690 }
1691 return(allele_calls)
14821692 }
14831693
14841694 #' Sort allele names
14861696 #' \code{sortAlleles} returns a sorted vector of strings representing Ig allele
14871697 #' names. Names are first sorted by gene family, then by gene, then by allele.
14881698 #' Duplicated genes have their alleles sorted as if they were part of their
1489 #' non-duplicated counterparts (e.g. IGHV1-69D*01 comes after IGHV1-69*01 but
1490 #' before IGHV1-69*02), and non-localized genes (e.g. IGHV1-NL1*01) come last
1491 #' within their gene family.
1492 #'
1493 #' @param allele_calls a vector of strings respresenting Ig allele names
1699 #' non-duplicated counterparts (e.g. \code{IGHV1-69D*01} comes after \code{IGHV1-69*01}
1700 #' but before \code{IGHV1-69*02}), and non-localized genes (e.g. \code{IGHV1-NL1*01})
1701 #' come last within their gene family.
1702 #'
1703 #' @param allele_calls a vector of strings representing Ig allele names.
14941704 #' @param method a string defining the method to use when sorting alleles.
14951705 #' If \code{"name"} then sort in lexicographic order. If
14961706 #' \code{"position"} then sort by position in the locus, as
14971707 #' determined by the final two numbers in the gene name.
1498 #' @return A sorted vector of strings respresenting Ig allele names
1708 #' @return A sorted vector of strings representing Ig allele names.
14991709 #'
15001710 #' @seealso Like \code{sortAlleles}, \link{updateAlleleNames} can help
15011711 #' format a list of allele names.
15021712 #'
15031713 #' @examples
15041714 #' # Create a list of allele names
1505 #' alleles = c("IGHV1-69D*01","IGHV1-69*01","IGHV1-2*01","IGHV1-69-2*01",
1506 #' "IGHV2-5*01","IGHV1-NL1*01", "IGHV1-2*01,IGHV1-2*05",
1507 #' "IGHV1-2", "IGHV1-2*02", "IGHV1-69*02")
1715 #' alleles <- c("IGHV1-69D*01","IGHV1-69*01","IGHV1-2*01","IGHV1-69-2*01",
1716 #' "IGHV2-5*01","IGHV1-NL1*01", "IGHV1-2*01,IGHV1-2*05",
1717 #' "IGHV1-2", "IGHV1-2*02", "IGHV1-69*02")
15081718 #'
15091719 #' # Sort the alleles by name
15101720 #' sortAlleles(alleles)
15141724 #'
15151725 #' @export
15161726 sortAlleles <- function(allele_calls, method=c("name", "position")) {
1517 # Check arguments
1518 method <- match.arg(method)
1519
1520 # Standardize format of submitted alleles, first
1521 SUBMITTED_CALLS = getAllele(allele_calls, first = FALSE, strip_d= FALSE) %>%
1522 sort()
1523 allele_df = data.frame(SUBMITTED_CALLS,stringsAsFactors = FALSE) %>%
1524 # Determine the family
1525 mutate_(FAMILY = ~getFamily(SUBMITTED_CALLS)) %>%
1526 # Determine the gene (exclude family); convert letters to numbers for sort
1527 mutate_(GENE = ~getGene(SUBMITTED_CALLS)) %>%
1528 mutate_(GENE1 = ~gsub("[^-]+[-S]([^-\\*D]+).*","\\1",SUBMITTED_CALLS)) %>%
1529 mutate_(GENE1 = ~as.numeric(gsub("[^0-9]+", "99", GENE1))) %>%
1530 # If there is a second gene number, determine that, too
1531 mutate_(GENE2 = ~gsub("[^-]+[-S][^-]+-?","",GENE)) %>%
1532 mutate_(GENE2 = ~as.numeric(gsub("[^0-9]+", "99", GENE2))) %>%
1533 mutate_(ALLELE = ~getAllele(SUBMITTED_CALLS)) %>%
1534 mutate_(ALLELE = ~(sub("[^\\*]+\\*|[^\\*]+$","",
1535 ALLELE))) %>%
1536 mutate_(ALLELE = ~as.numeric(sub("_.+$","",
1537 ALLELE)))
1538 # Convert missing values to 0, sort data frame
1539 allele_df[is.na(allele_df)] = 0
1540 if (method == "name") {
1541 sorted_df = arrange_(allele_df, ~FAMILY, ~GENE1, ~GENE2, ~ALLELE)
1542 } else if (method == "position") {
1543 sorted_df = arrange_(allele_df, ~desc(GENE1), ~desc(GENE2), ~FAMILY, ~ALLELE)
1544 }
1545
1546 return(sorted_df$SUBMITTED_CALLS)
1727 # Check arguments
1728 method <- match.arg(method)
1729
1730 # Standardize format of submitted alleles, first
1731 SUBMITTED_CALLS = getAllele(allele_calls, first = FALSE, strip_d= FALSE) %>%
1732 sort()
1733 allele_df = data.frame(SUBMITTED_CALLS,stringsAsFactors = FALSE) %>%
1734 # Determine the family
1735 mutate_(FAMILY = ~getFamily(SUBMITTED_CALLS)) %>%
1736 # Determine the gene (exclude family); convert letters to numbers for sort
1737 mutate_(GENE = ~getGene(SUBMITTED_CALLS)) %>%
1738 mutate_(GENE1 = ~gsub("[^-]+[-S]([^-\\*D]+).*","\\1",SUBMITTED_CALLS)) %>%
1739 mutate_(GENE1 = ~as.numeric(gsub("[^0-9]+", "99", GENE1))) %>%
1740 # If there is a second gene number, determine that, too
1741 mutate_(GENE2 = ~gsub("[^-]+[-S][^-]+-?","",GENE)) %>%
1742 mutate_(GENE2 = ~as.numeric(gsub("[^0-9]+", "99", GENE2))) %>%
1743 mutate_(ALLELE = ~getAllele(SUBMITTED_CALLS)) %>%
1744 mutate_(ALLELE = ~(sub("[^\\*]+\\*|[^\\*]+$","",
1745 ALLELE))) %>%
1746 mutate_(ALLELE = ~as.numeric(sub("_.+$","",
1747 ALLELE)))
1748 # Convert missing values to 0, sort data frame
1749 allele_df[is.na(allele_df)] = 0
1750 if (method == "name") {
1751 sorted_df = arrange_(allele_df, ~FAMILY, ~GENE1, ~GENE2, ~ALLELE)
1752 } else if (method == "position") {
1753 sorted_df = arrange_(allele_df, ~desc(GENE1), ~desc(GENE2), ~FAMILY, ~ALLELE)
1754 }
1755
1756 return(sorted_df$SUBMITTED_CALLS)
15471757 }
15481758
15491759 #' Clean up nucleotide sequences
15501760 #'
1551 #' \code{cleanSeqs} capitalizes nucleotides, replaces "." with "-", and then
1552 #' replaces all characters besides ACGT- with "N".
1553 #'
1554 #' @param seqs a vector of nucleotide sequences
1555 #' @return A vector of nucleotide sequences
1761 #' \code{cleanSeqs} capitalizes nucleotides and replaces all characters
1762 #' besides \code{c("A", "C", "G", "T", "-", ".")} with \code{"N"}.
1763 #'
1764 #' @param seqs a vector of nucleotide sequences.
1765 #'
1766 #' @return A modified vector of nucleotide sequences.
15561767 #'
15571768 #' @seealso \link{sortAlleles} and \link{updateAlleleNames} can
15581769 #' help format a list of allele names.
15591770 #'
15601771 #' @examples
1561 #' # Create messy nucleotide sequences
1562 #' seqs = c("AGAT.taa-GAG...ATA",
1563 #' "GATACAGTXXXXXAGNNNPPPACA")
1564 #' # Clean them up
1772 #' # Clean messy nucleotide sequences
1773 #' seqs <- c("AGAT.taa-GAG...ATA", "GATACAGTXXZZAGNNPPACA")
15651774 #' cleanSeqs(seqs)
15661775 #'
15671776 #' @export
1568 cleanSeqs <- function(seqs){
1569 . = NULL
1570 seqs %>%
1571 toupper %>%
1572 gsub(".", "-", . , fixed = TRUE) %>%
1573 gsub("[^ACGT-]", "N", .) %>%
1574 return
1777 cleanSeqs <- function(seqs) {
1778 # . = NULL
1779 # seqs %>%
1780 # toupper %>%
1781 # gsub(".", "-", . , fixed = TRUE) %>%
1782 # gsub("[^ACGT-]", "N", .) %>%
1783 # return
1784
1785 return (gsub("[^ACGT\\.\\-]", "N", toupper(seqs)))
15751786 }
15761787
15771788
15831794 # position to be analyzed and determines if each sample is mutated at that
15841795 # position
15851796 #
1586 # @param clip_db A Change-O db data frame. See
1797 # @param data a Change-O db data.frame. See
15871798 # \link{findNovelAlleles} for a list of required
15881799 # columns.
1589 # @param germline The germline to which all the sequences should be
1800 # @param germline the germline to which all the sequences should be
15901801 # compared
1591 # @param pos_range The range of positions within the sequence for which
1802 # @param pos_range the range of positions within the sequence for which
15921803 # the rows should be duplicated and checked for mutation
15931804 #
15941805 # @return A data frame with rows duplicated for all the positions to be
15951806 # analyzed and a column indicating whether the position is mutated in
15961807 # comparison to the germline
15971808 #
1598 positionMutations <- function(clip_db, germline, pos_range){
1599 . = NULL
1600 pos_db = pos_range %>%
1601 length() %>%
1602 rep("clip_db", .) %>%
1603 paste(collapse=",") %>%
1604 paste("bind_rows(",., ")") %>%
1605 parse(text=.) %>%
1606 eval()
1607 pos_db$POSITION = c(sapply(pos_range, rep, nrow(clip_db)))
1608 # Find which positions are mutated
1609 pos_db = pos_db %>%
1610 mutate_(NT = ~substring(SEQUENCE_IMGT, POSITION, POSITION)) %>%
1611 mutate_(GERM_NT = ~substring(germline, POSITION, POSITION)) %>%
1612 mutate_(MUTATED = ~(NT != GERM_NT & NT != "N" & NT != "-" & NT != "")) %>%
1613 mutate_(OBSERVED = ~(NT != "-" & NT != ""))
1614 return(pos_db)
1809 positionMutations <- function(data, germline, pos_range){
1810 . = NULL
1811 pos_db = pos_range %>%
1812 length() %>%
1813 rep("data", .) %>%
1814 paste(collapse=",") %>%
1815 paste("bind_rows(",., ")") %>%
1816 parse(text=.) %>%
1817 eval()
1818 pos_db$POSITION = c(sapply(pos_range, rep, nrow(data)))
1819 # Find which positions are mutated
1820 pos_db = pos_db %>%
1821 mutate_(NT = ~substring(SEQUENCE_IMGT, POSITION, POSITION)) %>%
1822 mutate_(GERM_NT = ~substring(germline, POSITION, POSITION)) %>%
1823 mutate_(MUTATED = ~(NT != GERM_NT & NT != "N" & NT != "-" & NT != "")) %>%
1824 mutate_(OBSERVED = ~(NT != "-" & NT != ""))
1825 return(pos_db)
16151826 }
16161827
16171828 # Find sequences carrying certain levels of mutation
16201831 # sequences and returns the subset of sequences that meet the given mutation
16211832 # count limits
16221833 #
1623 # @param clip_db A Change-O db data frame. See
1834 # @param data a Change-O db data frame. See
16241835 # \link{findNovelAlleles} for a list of required
16251836 # columns.
1626 # @param germline The germline to which all the sequences should be
1837 # @param germline the germline to which all the sequences should be
16271838 # compared
1628 # @param pos_range The range of positions within the sequences that should
1839 # @param pos_range the range of positions within the sequences that should
16291840 # be analyzed for mutations
1630 # @param pos_range The range of mutation counts that sequences can have
1841 # @param mut_range the range of mutation counts that sequences can have
16311842 # and still be included
16321843 #
1633 # @return A data frame containing only the subset carrying the desired levels
1844 # @return
1845 # A data.frame containing only the subset carrying the desired levels
16341846 # of mutation
16351847 #
1636 mutationRangeSubset <- function(clip_db, germline, mut_range, pos_range){
1637 . = NULL
1638 pads = paste(rep("-", min(pos_range)-1), collapse="")
1639 clip_db$MUT_COUNT = clip_db$SEQUENCE_IMGT %>%
1640 substring(min(pos_range), max(pos_range)) %>%
1641 paste(pads, ., sep="") %>%
1642 getMutatedPositions(germline) %>%
1643 sapply(length)
1644 clip_db = clip_db %>%
1645 filter_(~MUT_COUNT %in% mut_range)
1646 return(clip_db)
1848 mutationRangeSubset <- function(data, germline, mut_range, pos_range){
1849 . = NULL
1850 pads = paste(rep("-", min(pos_range)-1), collapse="")
1851 data$MUT_COUNT = data$SEQUENCE_IMGT %>%
1852 substring(min(pos_range), max(pos_range)) %>%
1853 paste(pads, ., sep="") %>%
1854 getMutatedPositions(germline) %>%
1855 sapply(length)
1856 data = data %>%
1857 filter_(~MUT_COUNT %in% mut_range)
1858 return(data)
16471859 }
16481860
16491861 # Find lower range of y-intercept confidence interval
16501862 #
16511863 # \code{findLowerY} finds the lower range of y-intercept confidence interval
16521864 #
1865 # @details If mut_min is 1, a y-intercept will be searched for at 0. If
1866 # mut_min is above 1, then the "y-intercept" will be found at x = mut_min - 1.
1867 #
16531868 # @param x A vector of x values
16541869 # @param y A vector of y values
16551870 # @param mut_min The value where the lowest mutation count should be
16571872 # @param alpha The alpha cutoff to be used in constructing the
16581873 # confidence interval
16591874 #
1660 # @details If mut_min is 1, a y-intercept will be searched for at 0. If
1661 # mut_min is above 1, then the "y-intercept" will be found at x = mut_min - 1.
1662 #
16631875 # @return The lower bound of the confidence interval for the intercept
16641876 # of the fitted linear model \code{lm(x ~ y)}
16651877 #
16661878 findLowerY = function(x, y, mut_min, alpha){
1667 y = y+1-mut_min
1668 lowerY = suppressWarnings(confint(lm(x ~ y),level=1-2*alpha)[[1]])
1669 return(lowerY)
1879 y = y + 1 - mut_min
1880 lowerY = suppressWarnings(confint(lm(x ~ y), level=1 - 2*alpha)[[1]])
1881 return(lowerY)
16701882 }
16711883
16721884 # Enhanced substring extraction
16801892 # @return a substring
16811893 #
16821894 superSubstring = function(string, positions){
1683 if(length(string) != 1){ stop("Please submit only one string.") }
1684 chars = sapply(positions, function(x) substring(string, x, x))
1685 return(paste(chars, collapse=""))
1895 if(length(string) != 1){ stop("Please submit only one string.") }
1896 chars = sapply(positions, function(x) substring(string, x, x))
1897 return(paste(chars, collapse=""))
16861898 }
16871899
16881900
17011913 # describing the heights of the rows in the layout. Will
17021914 # be passed to grid.layout. Default is all plots have
17031915 # the same height.
1704 multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL, heights=NULL) {
1705
1706 # Make a list from the ... arguments and plotlist
1707 plots <- c(list(...), plotlist)
1708
1709 numPlots = length(plots)
1710 if (is.null(heights)) { heights = rep(1,numPlots) }
1711
1712 # If layout is NULL, then use 'cols' to determine layout
1713 if (is.null(layout)) {
1714 # Make the panel
1715 # ncol: Number of columns of plots
1716 # nrow: Number of rows needed, calculated from # of cols
1717 layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
1718 ncol = cols, nrow = ceiling(numPlots/cols))
1719 }
1720
1721 if (numPlots==1) {
1722 print(plots[[1]])
1723
1724 } else {
1725 # Set up the page
1726 grid.newpage()
1727 pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout),
1728 heights=heights)))
1729
1730 # Make each plot, in the correct location
1731 for (i in 1:numPlots) {
1732 # Get the i,j matrix positions of the regions that contain this subplot
1733 matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
1734
1735 print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
1736 layout.pos.col = matchidx$col))
1737 }
1738 }
1739 }
1740
1916 multiplot <- function(..., plotlist=NULL, cols=1, layout=NULL, heights=NULL) {
1917 # Make a list from the ... arguments and plotlist
1918 plots <- c(list(...), plotlist)
1919 numPlots <- length(plots)
1920 ncol <- cols
1921 nrow <- ceiling(numPlots/cols)
1922 if (is.null(heights)) { heights = rep(1,nrow) }
1923 if (is.null(layout)) {
1924 # Make the panel
1925 # ncol: Number of columns of plots
1926 # nrow: Number of rows needed, calculated from # of cols
1927 layout <- matrix(seq(1, cols * nrow),
1928 ncol = cols, nrow = nrow)
1929 }
1930 grob <- gridExtra::arrangeGrob(grobs=plots,
1931 nrow=nrow, ncol=ncol, layout_matrix = layout,
1932 heights=heights)
1933 p <- ggplot() +
1934 layer(data = data.frame(x = NA),
1935 stat = StatIdentity,
1936 position = PositionIdentity,
1937 # geom = GeomDrawGrob,
1938 geom = GeomCustomAnn,
1939 inherit.aes = FALSE,
1940 params = list(grob = grob,
1941 xmin = 0,
1942 xmax = 1,
1943 ymin = 0,
1944 ymax = 1)) +
1945 scale_x_continuous(expand=c(0,0)) +
1946 scale_y_continuous(expand=c(0,0))
1947 p
1948 }
22 # @author Daniel Gadala-Maria
33 # @copyright Copyright 2016 Kleinstein Lab, Yale University. All rights reserved
44 # @license Creative Commons Attribution-NonCommercial-ShareAlike 4.0 Unported
5 # @version 0.3.0
6 # @date 2017.05.29
75
86
97 #' tigger
108 #'
119 #' Here we provide a \strong{T}ool for \strong{I}mmuno\strong{g}lobulin
12 #' \strong{G}enotype \strong{E}lucidation via
13 #' \strong{R}ep-Seq (TIgGER). TIgGER inferrs the set of Ig alleles carried by an
10 #' \strong{G}enotype \strong{E}lucidation via \strong{R}ep-Seq (TIgGER).
11 #' TIgGER infers the set of Ig alleles carried by an
1412 #' individual (including any novel alleles) and then uses this set of alleles to
1513 #' correct the initial assignments given to sample sequences by existing tools.
1614 #'
17 #' @details Immunoglobulin Repertoire-Sequencing (Rep-Seq) data is currently the
15 #' @details
16 #' Immunoglobulin repertoire sequencing (AIRR-Seq, Rep-Seq) data is currently the
1817 #' subject of much study. A key step in analyzing these data involves assigning
1918 #' the closest known V(D)J germline alleles to the (often somatically mutated)
2019 #' sample sequences using a tool such as IMGT/HighV-QUEST. However, if the
2120 #' sample utilizes alleles not in the germline database used for alignment, this
2221 #' step will fail. Additionally, this alignment has an associated error rate of
23 #' ~5 percent, notably among sequences carrying a large number of somatic
22 #' ~5%, notably among sequences carrying a large number of somatic
2423 #' mutations. The purpose of TIgGER is to address these issues.
2524 #'
26 #' @section Core tigger functions:
25 #' @section Allele detection and genotyping:
2726 #' \itemize{
28 #' \item \link{findNovelAlleles}: Detect novel alleles
29 #' \item \link{plotNovel}: Plot evidence of novel alleles
30 #' \item \link{inferGenotype}: Infer an Ig genotype
31 #' \item \link{plotGenotype}: A colorful genotype visualization
32 #' \item \link{genotypeFasta}: Convert a genotype to sequences
33 #' \item \link{reassignAlleles}: Correct allele calls
27 #' \item \link{findNovelAlleles}: Detect novel alleles.
28 #' \item \link{plotNovel}: Plot evidence of novel alleles.
29 #' \item \link{inferGenotype}: Infer an Ig genotype using a frequency approach.
30 #' \item \link{inferGenotypeBayesian}: Infer an Ig genotype using a Bayesian approach.
31 #' \item \link{plotGenotype}: A colorful genotype visualization.
32 #' \item \link{genotypeFasta}: Convert a genotype to sequences.
33 #' \item \link{reassignAlleles}: Correct allele calls.
34 #' \item \link{generateEvidence}: Generate evidence for the genotype and
35 #' allele detection inference.
3436 #' }
3537 #'
36 #' @section Mutation-related functions:
38 #' @section Mutation handling:
3739 #' \itemize{
38 #' \item \link{getMutatedPositions}: Find mutation locations
39 #' \item \link{getMutCount}: Find distance from germline
40 #' \item \link{findUnmutatedCalls}: Subset unmutated sequences
40 #' \item \link{getMutatedPositions}: Find mutation locations.
41 #' \item \link{getMutCount}: Find distance from germline.
42 #' \item \link{findUnmutatedCalls}: Subset unmutated sequences.
4143 #' \item \link{getPopularMutationCount}: Find most common sequence's
42 #' mutation count
43 #' \item \link{insertPolymorphisms}: Insert SNPs into a sequence
44 #' mutation count.
45 #' \item \link{insertPolymorphisms}: Insert SNPs into a sequence.
4446 #' }
4547 #'
46 #' @section Input and formatting:
48 #' @section Input, output and formatting:
4749 #' \itemize{
48 #' \item \link{readIgFasta}: Read a fasta file of Ig sequences
49 #' \item \link{updateAlleleNames}: Correct outdated allele names
50 #' \item \link{sortAlleles}: Sort allele names intelligently
51 #' \item \link{cleanSeqs}: Standardize sequence format
50 #' \item \link{readIgFasta}: Read a fasta file of Ig sequences.
51 #' \item \link{updateAlleleNames}: Correct outdated allele names.
52 #' \item \link{sortAlleles}: Sort allele names intelligently.
53 #' \item \link{cleanSeqs}: Standardize sequence format.
5254 #' }
5355 #'
5456 #' @name tigger
5557 #' @docType package
56 #' @references Gadala-Maria \emph{et al}. (2015) Automated analysis of
57 #' high-throughput B cell sequencing data reveals a high frequency of novel
58 #' immunoglobulin V gene segment alleles. \emph{PNAS}. 112(8):E862-70.
58 #' @references
59 #' \enumerate{
60 #' \item Gadala-Maria, et al. (2015) Automated analysis of high-throughput B cell
61 #' sequencing data reveals a high frequency of novel immunoglobulin V gene
62 #' segment alleles. PNAS. 112(8):E862-70.
63 #' }
5964 #'
6065 #' @import ggplot2
61 #' @importFrom alakazam getAllele getGene getFamily DNA_COLORS
66 #' @importFrom alakazam getAllele getGene getFamily translateDNA DNA_COLORS
6267 #' @importFrom doParallel registerDoParallel
6368 #' @importFrom dplyr do n desc %>%
6469 #' glimpse distinct distinct_
6570 #' as_data_frame data_frame data_frame_
66 #' bind_cols bind_rows combine
71 #' bind_cols bind_rows combine inner_join
6772 #' filter filter_ select select_ arrange arrange_
6873 #' group_by group_by_ ungroup
6974 #' mutate mutate_ transmute transmute_
7176 #' slice slice_
7277 #' @importFrom foreach foreach %dopar% registerDoSEQ
7378 #' @importFrom graphics plot
74 #' @importFrom grid grid.layout grid.newpage pushViewport viewport
79 #' @importFrom gridExtra arrangeGrob
80 #' @importFrom gtools ddirichlet
7581 #' @importFrom iterators icount
7682 #' @importFrom lazyeval interp
7783 #' @importFrom parallel clusterEvalQ clusterExport makeCluster stopCluster
84 #' @importFrom rlang .data
85 #' @importFrom shazam calcObservedMutations
7886 #' @importFrom stats na.omit setNames ecdf sd cor cov median mad
7987 #' confint lm
80 #' @importFrom tidyr gather gather_ spread spread_
88 #' @importFrom stringi stri_length
89 #' @importFrom tidyr gather gather_ spread spread_ unnest
8190 NULL
11
22 High-throughput sequencing of B cell immunoglobulin receptors is providing unprecedented insight into adaptive immunity. A key step in analyzing these data involves assignment of the germline V, D and J gene segment alleles that comprise each immunoglobulin sequence by matching them against a database of known V(D)J alleles. However, this process will fail for sequences that utilize previously undetected alleles, whose frequency in the population is unclear.
33
4 **TIgGER is a computational method that significantly improves V(D)J allele assignments by first determining the complete set of gene segments carried by an individual (including novel alleles) from V(D)J-rearrange sequences. TIgGER can then infer a subject's genotype from these sequences, and use this genotype to correct the initial V(D)J allele assignments.**
4 TIgGER is a computational method that significantly improves V(D)J allele assignments by first determining the complete set of gene segments carried by an individual (including novel alleles) from V(D)J-rearranged sequences. TIgGER can then infer a subject's genotype from these sequences, and use this genotype to correct the initial V(D)J allele assignments.
55
66 The application of TIgGER continues to identify a surprisingly high frequency of novel alleles in humans, highlighting the critical need for this approach. (TIgGER, however, can and has been used with data from other species.)
77
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
0 GermlineIGHV
1 SampleDb
2 SampleGenotype
3 SampleNovel
data/genotype.rda less more
Binary diff not shown
data/germline_ighv.rda less more
Binary diff not shown
data/novel_df.rda less more
Binary diff not shown
data/sample_db.rda less more
Binary diff not shown
00 ## ---- eval=TRUE, message=FALSE, warning=FALSE----------------------------
1 # Load packages required for this example
12 library(tigger)
23 library(dplyr)
3 # Load example sequence data and example germline database
4 data(sample_db, germline_ighv)
54
65 ## ---- eval=TRUE, warning=FALSE-------------------------------------------
76 # Detect novel alleles
8 novel_df <- findNovelAlleles(sample_db, germline_ighv, nproc=1)
7 novel <- findNovelAlleles(SampleDb, GermlineIGHV, nproc=1)
98
109 ## ---- eval=TRUE, warning=FALSE-------------------------------------------
1110 # Extract and view the rows that contain successful novel allele calls
12 novel <- selectNovel(novel_df)
13 novel[1:3]
11 novel_rows <- selectNovel(novel)
12 novel_rows[1:3]
1413
1514 ## ---- eval=TRUE, warning=FALSE, fig.width=6, fig.height=8----------------
1615 # Plot evidence of the first (and only) novel allele from the example data
17 plotNovel(sample_db, novel[1, ])
16 plotNovel(SampleDb, novel[1, ])
1817
1918 ## ---- eval=TRUE, warning=FALSE, fig.width=4, fig.height=3----------------
2019 # Infer the individual's genotype, using only unmutated sequences and checking
2120 # for the use of the novel alleles inferred in the earlier step.
22 geno <- inferGenotype(sample_db, find_unmutated = TRUE,
23 germline_db = germline_ighv, novel_df = novel_df)
21 geno <- inferGenotype(SampleDb, germline_db=GermlineIGHV, novel=novel,
22 find_unmutated=TRUE)
2423 # Save the genotype sequences to a vector
25 genotype_seqs <- genotypeFasta(geno, germline_ighv, novel_df)
24 genotype_db <- genotypeFasta(geno, GermlineIGHV, novel)
2625 # Visualize the genotype and sequence counts
2726 print(geno)
2827 # Make a colorful visualization. Bars indicate presence, not proportion.
2928 plotGenotype(geno, text_size = 10)
3029
30 ## ---- eval=TRUE, warning=FALSE, fig.width=4, fig.height=3----------------
31 # Infer the individual's genotype, using the bayesian method
32 geno_bayesian <- inferGenotypeBayesian(SampleDb, germline_db=GermlineIGHV,
33 novel=novel, find_unmutated=TRUE)
34 # Visualize the genotype and sequence counts
35 print(geno_bayesian)
36 # Make a colorful visualization. Bars indicate presence, not proportion.
37 plotGenotype(geno_bayesian, text_size=10)
3138
3239 ## ---- eval=TRUE, warning=FALSE-------------------------------------------
3340 # Use the personalized genotype to determine corrected allele assignments
34 V_CALL_GENOTYPED <- reassignAlleles(sample_db, genotype_seqs)
35 # Append the corrected calls to the original data.frame
36 sample_db <- bind_cols(sample_db, V_CALL_GENOTYPED)
41 # Updated genotype will be placed in the V_CALL_GENOTYPED column
42 sample_db <- reassignAlleles(SampleDb, genotype_db)
3743
3844 ## ---- eval=TRUE, warning=FALSE-------------------------------------------
3945 # Find the set of alleles in the original calls that were not in the genotype
4046 not_in_genotype <- sample_db$V_CALL %>%
41 strsplit(",") %>%
42 unlist() %>%
43 unique() %>%
44 setdiff(names(genotype_seqs))
47 strsplit(",") %>%
48 unlist() %>%
49 unique() %>%
50 setdiff(names(genotype_db))
4551
4652 # Determine the fraction of calls that were ambiguous before/after correction
4753 # and the fraction that contained original calls to non-genotype alleles. Note
4854 # that by design, only genotype alleles are allowed in "after" calls.
49 data.frame(Ambiguous = c(mean(grepl(",",sample_db$V_CALL)),
50 mean(grepl(",",sample_db$V_CALL_GENOTYPED))),
51 NotInGenotype = c(mean(sample_db$V_CALL %in% not_in_genotype),
52 mean(sample_db$V_CALL_GENOTYPED %in% not_in_genotype)),
53 row.names = c("Before", "After")) %>%
55 data.frame(Ambiguous=c(mean(grepl(",", sample_db$V_CALL)),
56 mean(grepl(",", sample_db$V_CALL_GENOTYPED))),
57 NotInGenotype=c(mean(sample_db$V_CALL %in% not_in_genotype),
58 mean(sample_db$V_CALL_GENOTYPED %in% not_in_genotype)),
59 row.names=c("Before", "After")) %>%
5460 t() %>% round(3)
5561
56
2929
3030 ## Introduction
3131
32 Immunoglobulin Repertoire-Sequencing (Rep-Seq) data is currently the subject of
33 much study. A key step in analyzing these data involves assigning the closest
34 known V(D)J germline alleles to the (often somatically mutated) sample sequences
35 using a tool such as IMGT/HighV-QUEST ([[1]][1]). However, if the sample utilizes
36 alleles not in the germline database used for alignment, this step will fail.
37 Additionally, this alignment has an associated error rate of ~5% ([[2]][2]),
38 notably among sequences carrying a large number of somatic mutations.
32 Adaptive immune receptor repertoire sequencing (AIRR-Seq, Rep-Seq) data is
33 currently the subject of much study. A key step in analyzing these data involves
34 assigning the closest known V(D)J germline alleles to the (often somatically mutated)
35 sample sequences using a tool such as IMGT/HighV-QUEST ([[1]][1]). However,
36 if the sample utilizes alleles not in the germline database used for alignment,
37 this step will fail. Additionally, this alignment has an associated error rate
38 of ~5% ([[2]][2]), notably among sequences carrying a large number of somatic
39 mutations.
3940
4041 Here we provide a **T**ool for **I**mmuno**g**lobulin **G**enotype
4142 **E**lucidation via **R**ep-Seq (TIgGER). TIgGER addresses these issues by
42 inferring the set of Ig alleles carried by an individual (including any novel
43 alleles) and then using this set of alleles to correct the initial assignments
44 given to sample sequences by existing tools.
45
46 Additional information is available in:
43 inferring the set of Immunoglobulin (Ig) alleles carried by an individual
44 (including any novel alleles) and then using this set of alleles to correct
45 the initial assignments given to sample sequences by existing tools.
46
47 This vignette covers the following tasks:
48
49 1. Inferring the presence of novel IGHV alleles not in the germline database.
50 2. Inferring the personalized IGHV genotype of a sample.
51 3. Correcting the IGHV allele calls of a sample based on the IGHV genotype.
52
53 Additional information about the methods used by TIgGER is available in:
4754
4855 [Gadala-Maria D, Yaari G, Uduman M, Kleinstein SH (2015) Automated analysis of
4956 high-throughput B cell sequencing data reveals a high frequency of novel
5057 immunoglobulin V gene segment alleles. *PNAS*
5158 112(8):E862-70](http://www.pnas.org/content/early/2015/02/05/1417683112).
5259
53
5460 ## Input
5561
5662 TIgGER requires two main inputs:
5763
58 1. Pre-processed Rep-Seq data
64 1. Pre-processed Ig sequence data
5965 2. Database germline sequences
6066
61 Rep-seq data is input as a data frame where each row represents a unique
62 observation and and columns represent data about that observation. The required
63 names of the required columns are provided below along with a description of
64 each.
67 AIRR-seq data is input as a data frame following the Change-O standard where
68 each row represents a unique observation and columns represent data about
69 that observation. The names of the required columns are provided below
70 along with a description of each.
6571
6672 Column Name | Description
6773 ----------------------|---------------------------------------------------------
7076 `J_CALL` | (Comma separated) name(s) of the nearest J allele(s)
7177 `JUNCTION_LENGTH` | Length of the junction region of the V(D)J sample
7278
73 An example dataset is provided with the `tigger` package. It contains unique
74 functional sequences assigned to IGHV1 family genes isolated from individual
75 PGP1 (referenced in Gadala-Maria *et al.* 2015).
79 An example dataset is provided with the `tigger` package as `SampleDb`. It
80 contains unique functional sequences assigned to IGHV1 family genes isolated
81 from individual PGP1 (referenced in Gadala-Maria *et al.* 2015).
7682
7783 The database of germline sequences should be provided in FASTA format with
7884 sequences gapped according to the IMGT numbering scheme ([[3]][3]). IGHV alleles in
79 the IMGT database (build 201408-4) are provided with this package. You may read
80 in your own fasta file using `readIgFasta`.
85 the IMGT database (build 201408-4) are provided with this package as `GermlineIGHV`.
86 You may read in your own fasta file using `readIgFasta`.
8187
8288 ```{r, eval=TRUE, message=FALSE, warning=FALSE}
89 # Load packages required for this example
8390 library(tigger)
8491 library(dplyr)
85 # Load example sequence data and example germline database
86 data(sample_db, germline_ighv)
87 ```
88
89 ## Running TIgGER
90
91 The functions provided by this package can be used to perform any combination of
92 the following:
93
94 1. Infer the presence of novel IGHV alleles not in the germline database
95 2. Infer the individual's IGHV genotype
96 3. Correct the IGHV allele calls of the samples based on the IGHV genotype
97
98 ### Novel Alleles
92 ```
93
94 ## Novel allele detection
9995
10096 Potential novel alleles can be detected by TIgGER. Some of these may be included
10197 in the genotype later (see below). `findNovelAlleles` will return a `data.frame`
108104
109105 ```{r, eval=TRUE, warning=FALSE}
110106 # Detect novel alleles
111 novel_df <- findNovelAlleles(sample_db, germline_ighv, nproc=1)
107 novel <- findNovelAlleles(SampleDb, GermlineIGHV, nproc=1)
112108 ```
113109
114110 ```{r, eval=TRUE, warning=FALSE}
115111 # Extract and view the rows that contain successful novel allele calls
116 novel <- selectNovel(novel_df)
117 novel[1:3]
112 novel_rows <- selectNovel(novel)
113 novel_rows[1:3]
118114 ```
119115
120116 The TIgGER procedure for identifying novel alleles (see citation above) involves
147143
148144 ```{r, eval=TRUE, warning=FALSE, fig.width=6, fig.height=8}
149145 # Plot evidence of the first (and only) novel allele from the example data
150 plotNovel(sample_db, novel[1, ])
151 ```
152
153 ### Genotype
154 An individual's genotype can be inferred using the function `inferGenotype`.
155 This function will remove from the genotype rare/erroneous allele calls which
156 may result from mutations in allele-differentiating regions. This is done by
157 determining the fewest alleles that account for nearly all (default is 7/8) of
158 the allele calls made. The user may opt to only use sequences which perfectly
159 match germline alleles, and may opt to include potential novel alleles.
160 (The genotype output is designed to be human readable, though `plotGenotype`
161 can be used to make a colorful visualization.) For each allele, the
162 number of sequences which match the germline are listed in the same order as
163 the alleles are listed. The total number of sequences that match any allele of
164 that gene is also given. To output these alleles as a names vector of nucleotide
165 sequences, the user may use the function `genotypeFasta`. To save this vector to
166 a fasta file, `writeFasta` may be used.
146 plotNovel(SampleDb, novel[1, ])
147 ```
148
149 ## Inferring genotypes
150
151 An individual's genotype can be inferred using the functions `inferGenotype` or
152 `inferGenotypeBayesian`. Either function removes from the genotype any
153 rare/erroneous allele calls, which may result from mutations in
154 allele-differentiating regions. `inferGenotype` uses a frequency method to
155 decide which alleles belong to the subject's genotype, whereas
156 `inferGenotypeBayesian` infers a subject's genotype by applying a Bayesian
157 framework and provides a confidence estimate associated with
158 the genotype calls.
159
160
161 ### Frequency genotyping approach
162
163 `inferGenotype` identifies the fewest alleles that account for
164 nearly all (default is 7/8) of the allele calls made. The user may opt to only
165 use sequences which perfectly match germline alleles, and may opt to include
166 potential novel alleles. (The genotype output is designed to be human readable,
167 though `plotGenotype` can be used to make a colorful visualization.) For each
168 allele, the number of sequences which match the germline are listed in the same
169 order as the alleles are listed. The total number of sequences that match any
170 allele of that gene is also given. To output these alleles as a named vector of
171 nucleotide sequences, the user may use the function `genotypeFasta`. To save
172 this vector to a fasta file, `writeFasta` may be used.
167173
168174 ```{r, eval=TRUE, warning=FALSE, fig.width=4, fig.height=3}
169175 # Infer the individual's genotype, using only unmutated sequences and checking
170176 # for the use of the novel alleles inferred in the earlier step.
171 geno <- inferGenotype(sample_db, find_unmutated = TRUE,
172 germline_db = germline_ighv, novel_df = novel_df)
177 geno <- inferGenotype(SampleDb, germline_db=GermlineIGHV, novel=novel,
178 find_unmutated=TRUE)
173179 # Save the genotype sequences to a vector
174 genotype_seqs <- genotypeFasta(geno, germline_ighv, novel_df)
180 genotype_db <- genotypeFasta(geno, GermlineIGHV, novel)
175181 # Visualize the genotype and sequence counts
176182 print(geno)
177183 # Make a colorful visualization. Bars indicate presence, not proportion.
178184 plotGenotype(geno, text_size = 10)
179
180 ```
181
182 ### Corrected Allele Calls
185 ```
186
187 ### Bayesian genotyping approach
188
189 The method `inferGenotypeBayesian` analyzes the posterior probabilities of
190 possible allele distributions, considering up to four distinct alleles per
191 V gene, corresponding to a gene duplication with both loci being heterozygous
192 (i.e., homozygous, heterozygous with one copy of each allele, etc.). The
193 posterior probabilities for these four possible models are compared and a Bayes
194 factor is calculated for the two most probable models. This Bayes factor
195 reflects the confidence in the genotyping call of the method. The bayesian
196 method doesn't use the strict cutoff criterion `fraction_to_explain` that
197 `inferGenotype` uses wherein only the minimum set of alleles explaining
198 88% (7/8) of apparently-unmutated sequences are included in the genotype.
199
200
201 ```{r, eval=TRUE, warning=FALSE, fig.width=4, fig.height=3}
202 # Infer the individual's genotype, using the bayesian method
203 geno_bayesian <- inferGenotypeBayesian(SampleDb, germline_db=GermlineIGHV,
204 novel=novel, find_unmutated=TRUE)
205 # Visualize the genotype and sequence counts
206 print(geno_bayesian)
207 # Make a colorful visualization. Bars indicate presence, not proportion.
208 plotGenotype(geno_bayesian, text_size=10)
209 ```
210
211 ## Correcting allele calls
183212
184213 Finally, the original V allele calls may be limited to only those within the
185214 inferred genotype. This can be done by using the function `reassignAlleles`.
186 By corrected the calls in this manner, the user can greatly reduce the numer of
215 By correcting the calls in this manner, the user can greatly reduce the number of
187216 ambiguous allele calls (where a single sample sequence is assigned to multiple
188217 V alleles, thus preventing the mutation analysis of allele-differentiating
189218 positions). Additionally, assignments to erroneous not-in-genotype alleles
191220
192221 ```{r, eval=TRUE, warning=FALSE}
193222 # Use the personalized genotype to determine corrected allele assignments
194 V_CALL_GENOTYPED <- reassignAlleles(sample_db, genotype_seqs)
195 # Append the corrected calls to the original data.frame
196 sample_db <- bind_cols(sample_db, V_CALL_GENOTYPED)
223 # Updated genotype will be placed in the V_CALL_GENOTYPED column
224 sample_db <- reassignAlleles(SampleDb, genotype_db)
197225 ```
198226
199227 From here, one may proceed with further downstream analyses, but with the
200228 advantage of having much-improved allele calls. Besides having discovered
201 alleles not in the IGMT database, the calls made by IMGT have been tailored to
229 alleles not in the IMGT database, the calls made by IMGT have been tailored to
202230 the subject's genotype, greatly reducing the number of problematic calls, as
203231 can be seen below.
204232
205233 ```{r, eval=TRUE, warning=FALSE}
206234 # Find the set of alleles in the original calls that were not in the genotype
207235 not_in_genotype <- sample_db$V_CALL %>%
208 strsplit(",") %>%
209 unlist() %>%
210 unique() %>%
211 setdiff(names(genotype_seqs))
236 strsplit(",") %>%
237 unlist() %>%
238 unique() %>%
239 setdiff(names(genotype_db))
212240
213241 # Determine the fraction of calls that were ambiguous before/after correction
214242 # and the fraction that contained original calls to non-genotype alleles. Note
215243 # that by design, only genotype alleles are allowed in "after" calls.
216 data.frame(Ambiguous = c(mean(grepl(",",sample_db$V_CALL)),
217 mean(grepl(",",sample_db$V_CALL_GENOTYPED))),
218 NotInGenotype = c(mean(sample_db$V_CALL %in% not_in_genotype),
219 mean(sample_db$V_CALL_GENOTYPED %in% not_in_genotype)),
220 row.names = c("Before", "After")) %>%
244 data.frame(Ambiguous=c(mean(grepl(",", sample_db$V_CALL)),
245 mean(grepl(",", sample_db$V_CALL_GENOTYPED))),
246 NotInGenotype=c(mean(sample_db$V_CALL %in% not_in_genotype),
247 mean(sample_db$V_CALL_GENOTYPED %in% not_in_genotype)),
248 row.names=c("Before", "After")) %>%
221249 t() %>% round(3)
222
223 ```
224
250 ```
225251
226252 ## References
227253
232258 [1]: http://www.imgt.org/IMGTindex/IMGTHighV-QUEST.html "Alamyar et al. (2010)"
233259 [2]: http://www.ncbi.nlm.nih.gov/pubmed/20147303 "Munshaw and Kepler (2010)"
234260 [3]: http://www.ncbi.nlm.nih.gov/pubmed/12477501 "Lefranc et al. (2003)"
261
+0
-9
inst/markr/build.R less more
0 library(markr)
1 library(tigger)
2
3 # Directories
4 pkg_path <- "."
5 doc_path <- "./docs"
6
7 # Build
8 build_mkdocs(pkg_path, doc_path=doc_path, yaml=F)
0 % Generated by roxygen2: do not edit by hand
1 % Please edit documentation in R/data.R
2 \docType{data}
3 \name{GermlineIGHV}
4 \alias{GermlineIGHV}
5 \title{Human IGHV germlines}
6 \format{Values correspond to IMGT-gapped nucleotide sequences (with
7 nucleotides capitalized and gaps represented by ".") while names correspond
8 to stripped-down IMGT allele names (e.g. "IGHV1-18*01").}
9 \description{
10 A \code{character} vector of all 344 human IGHV germline gene segment alleles
11 in IMGT/GENE-DB release 201408-4.
12 }
13 \references{
14 \enumerate{
15 \item Xochelli, et al. (2014) Immunoglobulin heavy variable (IGHV) genes and
16 alleles: new entities, new names and implications for research and
17 prognostication in chronic lymphocytic leukaemia. Immunogenetics. 67(1):61-6.
18 }
19 }
20 \keyword{data}
0 % Generated by roxygen2: do not edit by hand
1 % Please edit documentation in R/data.R
2 \docType{data}
3 \name{SampleDb}
4 \alias{SampleDb}
5 \title{Example human immune repertoire data}
6 \format{A \code{data.frame} where rows correspond to unique V(D)J sequences and
7 columns include:
8 \itemize{
9 \item \code{"SEQUENCE_IMGT"}: IMGT-gapped V(D)J nucleotide sequence.
10 \item \code{"V_CALL"}: IMGT/HighV-QUEST V segment allele calls.
11 \item \code{"D_CALL"}: IMGT/HighV-QUEST D segment allele calls.
12 \item \code{"J_CALL"}: IMGT/HighV-QUEST J segment allele calls.
13 \item \code{"JUNCTION_LENGTH"}: Junction region length.
14 }}
15 \description{
16 A \code{data.frame} of example V(D)J immunoglobulin sequences derived from a
17 single individual (PGP1), sequenced on the Roche 454 platform, and assigned by
18 IMGT/HighV-QUEST to IGHV1 family alleles.
19 }
20 \references{
21 \enumerate{
22 \item Gadala-Maria, et al. (2015) Automated analysis of high-throughput B cell
23 sequencing data reveals a high frequency of novel immunoglobulin V gene
24 segment alleles. PNAS. 112(8):E862-70.
25 }
26 }
27 \keyword{data}
0 % Generated by roxygen2: do not edit by hand
1 % Please edit documentation in R/data.R
2 \docType{data}
3 \name{SampleGenotype}
4 \alias{SampleGenotype}
5 \title{Example genotype inference results}
6 \format{A \code{data.frame} where rows correspond to genes carried by an
7 individual and columns lists the alleles of those genes and their counts.}
8 \description{
9 A \code{data.frame} of genotype inference results from \link{inferGenotype}
10 after novel allele detection via \link{findNovelAlleles}.
11 Source data was a collection of V(D)J immunoglobulin sequences derived from a single
12 individual (PGP1), sequenced on the Roche 454 platform, and assigned by
13 IMGT/HighV-QUEST to IGHV1 family alleles.
14 }
15 \references{
16 \enumerate{
17 \item Gadala-Maria, et al. (2015) Automated analysis of high-throughput B cell
18 sequencing data reveals a high frequency of novel immunoglobulin V gene
19 segment alleles. PNAS. 112(8):E862-70.
20 }
21 }
22 \seealso{
23 See \link{inferGenotype} for detailed column descriptions.
24 }
25 \keyword{data}
0 % Generated by roxygen2: do not edit by hand
1 % Please edit documentation in R/data.R
2 \docType{data}
3 \name{SampleNovel}
4 \alias{SampleNovel}
5 \title{Example novel allele detection results}
6 \format{A \code{data.frame} where rows correspond to alleles checked for
7 polymorphisms and columns give results as well as parameters used to run
8 the test.}
9 \description{
10 A \code{data.frame} of novel allele detection results from \link{findNovelAlleles}.
11 Source data was a collection of V(D)J immunoglobulin sequences derived from a single
12 individual (PGP1), sequenced on the Roche 454 platform, and assigned by
13 IMGT/HighV-QUEST to IGHV1 family alleles.
14 }
15 \references{
16 \enumerate{
17 \item Gadala-Maria, et al. (2015) Automated analysis of high-throughput B cell
18 sequencing data reveals a high frequency of novel immunoglobulin V gene
19 segment alleles. PNAS. 112(8):E862-70.
20 }
21 }
22 \seealso{
23 See \link{findNovelAlleles} for detailed column descriptions.
24 }
25 \keyword{data}
66 cleanSeqs(seqs)
77 }
88 \arguments{
9 \item{seqs}{a vector of nucleotide sequences}
9 \item{seqs}{a vector of nucleotide sequences.}
1010 }
1111 \value{
12 A vector of nucleotide sequences
12 A modified vector of nucleotide sequences.
1313 }
1414 \description{
15 \code{cleanSeqs} capitalizes nucleotides, replaces "." with "-", and then
16 replaces all characters besides ACGT- with "N".
15 \code{cleanSeqs} capitalizes nucleotides and replaces all characters
16 besides \code{c("A", "C", "G", "T", "-", ".")} with \code{"N"}.
1717 }
1818 \examples{
19 # Create messy nucleotide sequences
20 seqs = c("AGAT.taa-GAG...ATA",
21 "GATACAGTXXXXXAGNNNPPPACA")
22 # Clean them up
19 # Clean messy nucleotide sequences
20 seqs <- c("AGAT.taa-GAG...ATA", "GATACAGTXXZZAGNNPPACA")
2321 cleanSeqs(seqs)
2422
2523 }
33 \alias{findNovelAlleles}
44 \title{Find novel alleles from repertoire sequencing data}
55 \usage{
6 findNovelAlleles(clip_db, germline_db, v_call = "V_CALL",
6 findNovelAlleles(data, germline_db, v_call = "V_CALL",
77 germline_min = 200, min_seqs = 50, auto_mutrange = TRUE,
88 mut_range = 1:10, pos_range = 1:312, y_intercept = 0.125,
99 alpha = 0.05, j_max = 0.15, min_frac = 0.75, nproc = 1)
1010 }
1111 \arguments{
12 \item{clip_db}{a \code{data.frame} in Change-O format. See details.}
12 \item{data}{a \code{data.frame} in Change-O format. See details.}
1313
1414 \item{germline_db}{a vector of named nucleotide germline sequences
15 matching the V calls in \code{clip_db}}
15 matching the V calls in \code{data}.}
1616
17 \item{v_call}{name of the column in clip_db with V allele calls.
17 \item{v_call}{name of the column in \code{data} with V allele calls.
1818 Default is V_CALL.}
1919
2020 \item{germline_min}{the minimum number of sequences that must have a
3636 \item{pos_range}{the range of IMGT-numbered positions that should be
3737 considered by the algorithm}
3838
39 \item{y_intercept}{the y-intercept above which positions should be
39 \item{y_intercept}{the y-intercept threshold above which positions should be
4040 considered potentially polymorphic}
4141
42 \item{alpha}{the alpha cutoff to be used when constructing the
43 confidence interval for the y-intercept}
42 \item{alpha}{the alpha value used for determining whether the
43 fit y-intercept is greater than the \code{y_intercept}
44 threshold}
4445
4546 \item{j_max}{the maximum fraction of sequences perfectly aligning
4647 to a potential novel allele that are allowed to
5455 \item{nproc}{the number of processors to use}
5556 }
5657 \value{
57 a \code{data.frame} with a row for each known allele analyzed.
58 A \code{data.frame} with a row for each known allele analyzed.
5859 Besides metadata on the parameters used in the search, each row will have
5960 either a note as to where the polymorphism-finding algorithm exited or a
60 nucleotide sequence for the predicted novel allele.
61 nucleotide sequence for the predicted novel allele, along with columns providing
62 additional evidence.
63
64 The output contains the following columns:
65 \itemize{
66 \item \code{GERMLINE_CALL}: The input (uncorrected) V call.
67 \item \code{NOTE}: Comments regarding the inference.
68 \item \code{POLYMORPHISM_CALL}: The novel allele call.
69 \item \code{NT_SUBSTITUTIONS}: Mutations identified in the novel allele, relative
70 to the reference germline (\code{GERMLINE_CALL})
71 \item \code{NOVEL_IMGT}: The novel allele sequence.
72 \item \code{NOVEL_IMGT_COUNT}: The number of times the sequence \code{NOVEL_IMGT}
73 is found in the input data. Considers the subsequence of \code{NOVEL_IMGT}
74 in the \code{pos_range}.
75 \item \code{NOVEL_IMGT_UNIQUE_J}: Number of distinct J calls associated to \code{NOVEL_IMGT}
76 in the input data. Considers the subsequence of \code{NOVEL_IMGT} in the \code{pos_range}.
77 \item \code{NOVEL_IMGT_UNIQUE_CDR3}: Number of distinct CDR3 sequences associated
78 with \code{NOVEL_IMGT} in the input data. Considers the subsequence of \code{NOVEL_IMGT}
79 in the \code{pos_range}.
80 \item \code{PERFECT_MATCH_COUNT}: Final number of sequences retained to call the new
81 allele. These are unique sequences that have V segments that perfectly match
82 the predicted germline in the \code{pos_range}.
83 \item \code{PERFECT_MATCH_FREQ}: \code{PERFECT_MATCH_COUNT / GERMLINE_CALL_COUNT}
84 \item \code{GERMLINE_CALL_COUNT}: The number of sequences with the \code{GERMLINE_CALL}
85 in the input data that were initially considered for the analysis.
86 \item \code{GERMLINE_CALL_FREQ}: The fraction of sequences with the \code{GERMLINE_CALL}
87 in the input data initially considered for the analysis.
88 \item \code{GERMLINE_IMGT}: Germline sequence for \code{GERMLINE_CALL}.
89 \item \code{GERMLINE_IMGT_COUNT}: The number of times the \code{GERMLINE_IMGT}
90 sequence is found in the input data.
91 \item \code{MUT_MIN}: Minimum mutation considered by the algorithm.
92 \item \code{MUT_MAX}: Maximum mutation considered by the algorithm.
93 \item \code{MUT_PASS_COUNT}: Number of sequences in the mutation range.
94 \item \code{POS_MIN}: First position of the sequence considered by the algorithm (IMGT numbering).
95 \item \code{POS_MAX}: Last position of the sequence considered by the algorithm (IMGT numbering).
96 \item \code{Y_INTERCEPT}: The y-intercept above which positions were considered
97 potentially polymorphic.
98 \item \code{Y_INTERCEPT_PASS}: Number of positions that pass the \code{Y_INTERCEPT} threshold.
99 \item \code{SNP_PASS}: Number of sequences that pass the \code{Y_INTERCEPT} threshold and are
100 within the desired nucleotide range (\code{min_seqs}).
101 \item \code{UNMUTATED_COUNT}: Number of unmutated sequences.
102 \item \code{UNMUTATED_FREQ}: Number of unmutated sequences over \code{GERMLINE_IMGT_COUNT}.
103 \item \code{UNMUTATED_SNP_J_GENE_LENGTH_COUNT}: Number of distinct combinations
104 of SNP, J gene, and junction length.
105 \item \code{SNP_MIN_SEQS_J_MAX_PASS}: Number of SNPs that pass both the \code{min_seqs}
106 and \code{j_max} thresholds.
107 \item \code{ALPHA}: Significance threshold to be used when constructing the
108 confidence interval for the y-intercept.
109 \item \code{MIN_SEQS}: Input \code{min_seqs}. The minimum number of total sequences
110 (within the desired mutational range and nucleotide range) required
111 for the samples to be considered.
112 \item \code{J_MAX}: Input \code{j_max}. The maximum fraction of sequences perfectly
113 aligning to a potential novel allele that are allowed to utilize a particular
114 combination of junction length and J gene.
115 \item \code{MIN_FRAC}: Input \code{min_frac}. The minimum fraction of sequences that must
116 have usable nucleotides in a given position for that position to be considered.
117 }
118
119 The following comments can appear in the \code{NOTE} column:
120
121 \itemize{
122 \item \emph{Novel allele found}: A novel allele was detected.
123 \item \emph{Plurality sequence too rare}: No sequence is frequent enough to pass
124 the J test (\code{j_max}).
125 \item \emph{A J-junction combination is too prevalent}: Not enough J diversity (\code{j_max}).
126 \item \emph{No positions pass y-intercept test}: No positions above \code{y_intercept}.
127 \item \emph{Insufficient sequences in desired mutational range}:
128 \code{mut_range} and \code{pos_range}.
129 \item \emph{Not enough sequences}: Not enough sequences in the desired mutational
130 range and nucleotide range (\code{min_seqs}).
131 \item \emph{No unmutated versions of novel allele found}: All observed variants of the
132 allele are mutated.
133 }
61134 }
62135 \description{
63136 \code{findNovelAlleles} analyzes mutation patterns in sequences thought to
65138 might be polymorphic.
66139 }
67140 \details{
68 A \code{data.frame} in Change-O format contains the following
69 columns:
70 \itemize{
71 \item \code{"SEQUENCE_IMGT"} containing the IMGT-gapped nucleotide sequence
72 \item \code{"V_CALL"} containing the IMGT/V-QUEST V allele call(s)
73 \item \code{"J_CALL"} containing the IMGT/V-QUEST J allele call(s)
74 \item \code{"JUNCTION_LENGTH"} containing the junction length
75 }
76141 The TIgGER allele-finding algorithm, briefly, works as follows:
77142 Mutations are determined through comparison to the provided germline.
78143 Mutation frequency at each *position* is determined as a function of
83148 allele utilize a wide range of combinations of J gene and junction length.
84149 }
85150 \examples{
86 # Load example data and germlines
87 data(sample_db)
88 data(germline_ighv)
89
151 \donttest{
90152 # Find novel alleles and return relevant data
91 \dontrun{novel_df = findNovelAlleles(sample_db, germline_ighv)}
153 novel <- findNovelAlleles(SampleDb, GermlineIGHV)
154 }
92155
93156 }
94157 \seealso{
95158 \link{plotNovel} to visualize the data supporting any
96159 novel alleles hypothesized to be present in the data and
97160 \link{inferGenotype} to determine if the novel alleles are frequent
98 enought to be included in the subject's genotype
161 enough to be included in the subject's genotype.
99162 }
77 }
88 \arguments{
99 \item{allele_calls}{a vector of strings representing Ig allele calls,
10 where multiple calls are separated by a comma}
10 where multiple calls are separated by a comma.}
1111
1212 \item{sample_seqs}{V(D)J-rearranged sample sequences matching the order
13 of the given \code{allele_calls}}
13 of the given \code{allele_calls}.}
1414
1515 \item{germline_db}{a vector of named nucleotide germline sequences}
1616 }
1717 \value{
1818 A vector of strings containing the members of \code{allele_calls}
19 that represent unmutated sequences
19 that represent unmutated sequences.
2020 }
2121 \description{
2222 \code{findUnmutatedCalls} determines which allele calls would represent a
2525 sequence, only the subset that would represent a perfect match is returned.
2626 }
2727 \examples{
28 # Load data
29 data(germline_ighv)
30 data(sample_db)
31
3228 # Find which of the sample alleles are unmutated
33 calls <- findUnmutatedCalls(sample_db$V_CALL, sample_db$SEQUENCE_IMGT,
34 germline_db=germline_ighv)
29 calls <- findUnmutatedCalls(SampleDb$V_CALL, SampleDb$SEQUENCE_IMGT,
30 germline_db=GermlineIGHV)
3531
3632 }
0 % Generated by roxygen2: do not edit by hand
1 % Please edit documentation in R/evidence.R
2 \name{generateEvidence}
3 \alias{generateEvidence}
4 \title{Generate evidence}
5 \usage{
6 generateEvidence(data, novel, genotype, genotype_db, germline_db,
7 fields = NULL)
8 }
9 \arguments{
10 \item{data}{a \code{data.frame} containing sequence data that has been
11 passed through \link{reassignAlleles} to correct the allele
12 assignments.}
13
14 \item{novel}{the \code{data.frame} returned by \link{findNovelAlleles}.}
15
16 \item{genotype}{the \code{data.frame} of alleles generated with \link{inferGenotype}
17 denoting the genotype of the subject.}
18
19 \item{genotype_db}{a vector of named nucleotide germline sequences in the genotype.
20 Returned by \link{genotypeFasta}.}
21
22 \item{germline_db}{the original uncorrected germline database used by
23 \link{findNovelAlleles} to identify novel alleles.}
24
25 \item{fields}{character vector of column names used to split the data to
26 identify novel alleles, if any. If \code{NULL} then the data is
27 not divided by grouping variables.}
28 }
29 \value{
30 Returns the \code{genotype} input \code{data.frame} with the following additional columns
31 providing supporting evidence for each inferred allele:
32
33 \itemize{
34 \item \code{FIELD_ID}: Data subset identifier, defined with the input parameter \code{fields}.
35 \item A variable number of columns, specified with the input parameter \code{fields}.
36 \item \code{POLYMORPHISM_CALL}: The novel allele call.
37 \item \code{NOVEL_IMGT}: The novel allele sequence.
38 \item \code{CLOSEST_REFERENCE}: The closest reference gene and allele in
39 the \code{germline_db} database.
40 \item \code{CLOSEST_REFERENCE_IMGT}: Sequence of the closest reference gene and
41 allele in the \code{germline_db} database.
42 \item \code{GERMLINE_CALL}: The input (uncorrected) V call.
43 \item \code{GERMLINE_IMGT}: Germline sequence for \code{GERMLINE_CALL}.
44 \item \code{NT_DIFF}: Number of nucleotides that differ between the new allele and
45 the closest reference (\code{CLOSEST_REFERENCE}) in the \code{germline_db} database.
46 \item \code{NT_SUBSTITUTIONS}: A comma separated list of specific nucleotide
47 differences (e.g. \code{112G>A}) in the novel allele.
48 \item \code{AA_DIFF}: Number of amino acids that differ between the new allele and the closest
49 reference (\code{CLOSEST_REFERENCE}) in the \code{germline_db} database.
50 \item \code{AA_SUBSTITUTIONS}: A comma separated list with specific amino acid
51 differences (e.g. \code{96A>N}) in the novel allele.
52 \item \code{SEQUENCES}: Number of sequences unambiguously assigned to this allele.
53 \item \code{UNMUTATED_SEQUENCES}: Number of records with the unmutated novel allele sequence.
54 \item \code{UNMUTATED_FREQUENCY}: Proportion of records with the unmutated novel allele
55 sequence (\code{UNMUTATED_SEQUENCES / SEQUENCES}).
56 \item \code{ALLELIC_PERCENTAGE}: Percentage at which the (unmutated) allele is observed
57 in the sequence dataset compared to other (unmutated) alleles.
58 \item \code{UNIQUE_JS}: Number of unique J sequences found associated with the
59 novel allele. The sequences are those that have been unambiguously assigned
60 to the novel allele (\code{POLYMORPHISM_CALL}).
61 \item \code{UNIQUE_CDR3S}: Number of unique CDR3s associated with the inferred allele.
62 The sequences are those that have been unambiguously assigned to the
63 novel allele (\code{POLYMORPHISM_CALL}).
64 \item \code{MUT_MIN}: Minimum mutation considered by the algorithm.
65 \item \code{MUT_MAX}: Maximum mutation considered by the algorithm.
66 \item \code{POS_MIN}: First position of the sequence considered by the algorithm (IMGT numbering).
67 \item \code{POS_MAX}: Last position of the sequence considered by the algorithm (IMGT numbering).
68 \item \code{Y_INTERCEPT}: The y-intercept above which positions were considered
69 potentially polymorphic.
70 \item \code{ALPHA}: Significance threshold to be used when constructing the
71 confidence interval for the y-intercept.
72 \item \code{MIN_SEQS}: Input \code{min_seqs}. The minimum number of total sequences
73 (within the desired mutational range and nucleotide range) required
74 for the samples to be considered.
75 \item \code{J_MAX}: Input \code{j_max}. The maximum fraction of sequences perfectly
76 aligning to a potential novel allele that are allowed to utilize a particular
77 combination of junction length and J gene.
78 \item \code{MIN_FRAC}: Input \code{min_frac}. The minimum fraction of sequences that must
79 have usable nucleotides in a given position for that position to be considered.
80 \item \code{NOTE}: Comments regarding the novel allele inference.
81 }
82 }
83 \description{
84 \code{generateEvidence} builds a table of evidence metrics for the final novel V
85 allele detection and genotyping inferences.
86 }
87 \examples{
88 \donttest{
89 # Generate input data
90 novel <- findNovelAlleles(SampleDb, GermlineIGHV)
91 genotype <- inferGenotype(SampleDb, find_unmutated=TRUE, germline_db=GermlineIGHV,
92 novel=novel)
93 genotype_db <- genotypeFasta(genotype, GermlineIGHV, novel)
94 data_db <- reassignAlleles(SampleDb, genotype_db)
95
96 # Assemble evidence table
97 evidence <- generateEvidence(data_db, novel, genotype, genotype_db, GermlineIGHV)
98 }
99
100 }
101 \seealso{
102 See \link{findNovelAlleles}, \link{inferGenotype} and \link{genotypeFasta}
103 for generating the required input.
104 }
+0
-20
man/genotype.Rd less more
0 % Generated by roxygen2: do not edit by hand
1 % Please edit documentation in R/data.R
2 \docType{data}
3 \name{genotype}
4 \alias{genotype}
5 \title{Example of an Inferred Genotype}
6 \format{A \code{data.frame} where rows correspond to genes carried by an
7 individual and columns lists the alleles of those genes and their counts.}
8 \description{
9 Example VDJ-rearranged immunoglobulin Rep-Seq sequences derived from a single
10 individual (PGP1), sequenced on the Roche 454 platform, and thought by
11 IMGT/V-QUEST to utilize IGHV1 family alleles, as processed by
12 \link{findNovelAlleles} and \link{inferGenotype}
13 }
14 \references{
15 Gadala-Maria \emph{et al}. (2015) Automated analysis of
16 high-throughput B cell sequencing data reveals a high frequency of novel
17 immunoglobulin V gene segment alleles. \emph{PNAS}. 112(8):E862-70.
18 }
19 \keyword{data}
33 \alias{genotypeFasta}
44 \title{Return the nucleotide sequences of a genotype}
55 \usage{
6 genotypeFasta(genotype, germline_db, novel_df = NA)
6 genotypeFasta(genotype, germline_db, novel = NA)
77 }
88 \arguments{
9 \item{genotype}{a table of alleles denoting a genotype, as returned by
10 \link{inferGenotype}}
9 \item{genotype}{a \code{data.frame} of alleles denoting a genotype,
10 as returned by \link{inferGenotype}.}
1111
1212 \item{germline_db}{a vector of named nucleotide germline sequences
13 matching the alleles detailed in \code{genotype}}
13 matching the alleles detailed in \code{genotype}.}
1414
15 \item{novel_df}{an optional \code{data.frame} containing putative
15 \item{novel}{an optional \code{data.frame} containing putative
1616 novel alleles of the type returned by
17 \link{findNovelAlleles}}
17 \link{findNovelAlleles}.}
1818 }
1919 \value{
2020 A named vector of strings containing the germline nucleotide
21 sequences of the alleles in the provided genotype
21 sequences of the alleles in the provided genotype.
2222 }
2323 \description{
2424 \code{genotypeFasta} converts a genotype table into a vector of nucleotide
2525 sequences.
2626 }
2727 \examples{
28 # Load example data
29 data(germline_ighv)
30 data(novel_df)
31 data(genotype)
32
3328 # Find the sequences that correspond to the genotype
34 genotype_seqs = genotypeFasta(genotype, germline_ighv, novel_df)
35
29 genotype_db <- genotypeFasta(SampleGenotype, GermlineIGHV, SampleNovel)
3630
3731 }
3832 \seealso{
+0
-20
man/germline_ighv.Rd less more
0 % Generated by roxygen2: do not edit by hand
1 % Please edit documentation in R/data.R
2 \docType{data}
3 \name{germline_ighv}
4 \alias{germline_ighv}
5 \title{Human IGHV germlines}
6 \format{Values correspond to IMGT-gaped nuceltoide sequences (with
7 nucleotides capitalized and gaps represented by ".") while names correspond
8 to stripped-down IMGT allele names (e.g. "IGHV1-18*01").}
9 \description{
10 A \code{character} vector of all 344 human IGHV germline gene segment alleles
11 in IMGT Gene-db release 201408-4.
12 }
13 \references{
14 Xochelli \emph{et al}. (2014) Immunoglobulin heavy variable
15 (IGHV) genes and alleles: new entities, new names and implications for
16 research and prognostication in chronic lymphocytic leukaemia.
17 \emph{Immunogenetics}. 67(1):61-6.
18 }
19 \keyword{data}
2626 contained within the call
2727 }
2828 \examples{
29 # Load germline database
30 data(germline_ighv)
29 # Insert a mutation into a germline sequence
30 s2 <- s3 <- GermlineIGHV[1]
31 stringi::stri_sub(s2, 103, 103) <- "G"
32 stringi::stri_sub(s3, 107, 107) <- "C"
3133
32 # Use createGermlines to insert a mutation into a germline sequence
33 #sample_seqs = c(germline_ighv[2],
34 # createGermlines(germline_ighv[1], 103, "G"),
35 # createGermlines(germline_ighv[1], 107, "C"))
34 sample_seqs <- c(GermlineIGHV[2], s2, s3)
3635
3736 # Pretend that one sample sequence has received an ambiguous allele call
38 #sample_alleles = c(paste(names(germline_ighv[1:2]), collapse=","),
39 # names(germline_ighv[2]),
40 # names(germline_ighv[1]))
37 sample_alleles <- c(paste(names(GermlineIGHV[1:2]), collapse=","),
38 names(GermlineIGHV[2]),
39 names(GermlineIGHV[1]))
4140
4241 # Compare each sequence to its assigned germline(s) to determine the distance
43 #getMutCount(sample_seqs, sample_alleles, germline_ighv)
42 getMutCount(sample_seqs, sample_alleles, GermlineIGHV)
4443
4544 }
3232 }
3333 \examples{
3434 # Create strings to act as a sample sequences and a reference sequence
35 seqs = c("----GATA","GAGAGAGA","TANA")
36 ref = "GATAGATA"
35 seqs <- c("----GATA", "GAGAGAGA", "TANA")
36 ref <- "GATAGATA"
3737
3838 # Find the differences between the two
3939 getMutatedPositions(seqs, ref)
11 % Please edit documentation in R/functions.R
22 \name{getPopularMutationCount}
33 \alias{getPopularMutationCount}
4 \title{Find Frequent Sequences' Mutation Counts}
4 \title{Find mutation counts for frequent sequences}
55 \usage{
6 getPopularMutationCount(sample_db, germline_db, gene_min = 0.001,
6 getPopularMutationCount(data, germline_db, gene_min = 0.001,
77 seq_min = 50, seq_p_of_max = 1/8, full_return = FALSE)
88 }
99 \arguments{
10 \item{sample_db}{A Change-O db data frame. See
10 \item{data}{a \code{data.frame} in the Change-O format. See
1111 \link{findNovelAlleles} for a list of required
1212 columns.}
1313
2222 \item{seq_p_of_max}{For each gene, fraction of the most common V sequence's
2323 count that a sequence must meet to avoid exclusion.}
2424
25 \item{full_return}{If true, will return all \code{sample_db} columns and
25 \item{full_return}{If \code{TRUE}, will return all \code{data} columns and
2626 will include sequences with mutation count < 1.}
2727 }
2828 \value{
3434 for each V gene and returns the mutation count of those sequences.
3535 }
3636 \examples{
37 data(sample_db, germline_ighv)
38 getPopularMutationCount(sample_db, germline_ighv)
37 getPopularMutationCount(SampleDb, GermlineIGHV)
3938
4039 }
4140 \seealso{
11 % Please edit documentation in R/functions.R
22 \name{inferGenotype}
33 \alias{inferGenotype}
4 \title{Infer a subject-specific genotype}
4 \title{Infer a subject-specific genotype using a frequency method}
55 \usage{
6 inferGenotype(clip_db, v_call = "V_CALL", fraction_to_explain = 0.875,
7 gene_cutoff = 1e-04, find_unmutated = TRUE, germline_db = NA,
8 novel_df = NA)
6 inferGenotype(data, germline_db = NA, novel = NA, v_call = "V_CALL",
7 fraction_to_explain = 0.875, gene_cutoff = 1e-04,
8 find_unmutated = TRUE)
99 }
1010 \arguments{
11 \item{clip_db}{a \code{data.frame} containing V allele
11 \item{data}{a \code{data.frame} containing V allele
1212 calls from a single subject. If
1313 \code{find_unmutated} is \code{TRUE}, then
1414 the sample IMGT-gapped V(D)J sequence should be provided in a column \code{"SEQUENCE_IMGT"}.}
15
16 \item{v_call}{column in \code{clip_db} with V allele calls.
17 Default is \code{"V_CALL"}
18 be provided in a column \code{"SEQUENCE_IMGT"}}
19
20 \item{fraction_to_explain}{the portion of each gene that must be
21 explained by the alleles that will be included
22 in the genotype}
23
24 \item{gene_cutoff}{either a number of sequences or a fraction of
25 the length of \code{allele_calls} denoting the
26 minimum number of times a gene must be
27 observed in \code{allele_calls} to be included
28 in the genotype}
29
30 \item{find_unmutated}{if \code{TRUE}, use \code{germline_db} to
31 find which samples are unmutated. Not needed
32 if \code{allele_calls} only represent
33 unmutated samples.}
3415
3516 \item{germline_db}{named vector of sequences containing the
3617 germline sequences named in
3718 \code{allele_calls}. Only required if
3819 \code{find_unmutated} is \code{TRUE}.}
3920
40 \item{novel_df}{an optional \code{data.frame} of the type
21 \item{novel}{an optional \code{data.frame} of the type
4122 novel returned by
4223 \link{findNovelAlleles} containing
4324 germline sequences that will be utilized if
4425 \code{find_unmutated} is \code{TRUE}. See
45 details.}
26 Details.}
27
28 \item{v_call}{column in \code{data} with V allele calls.
29 Default is \code{"V_CALL"}. If \code{find_unmutated} is \code{TRUE}, the sample
30 IMGT-gapped V(D)J sequence should be provided in a column \code{"SEQUENCE_IMGT"}.}
31
32 \item{fraction_to_explain}{the portion of each gene that must be
33 explained by the alleles that will be included
34 in the genotype.}
35
36 \item{gene_cutoff}{either a number of sequences or a fraction of
37 the length of \code{allele_calls} denoting the
38 minimum number of times a gene must be
39 observed in \code{allele_calls} to be included
40 in the genotype.}
41
42 \item{find_unmutated}{if \code{TRUE}, use \code{germline_db} to
43 find which samples are unmutated. Not needed
44 if \code{allele_calls} only represent
45 unmutated samples.}
4646 }
4747 \value{
48 A table of alleles denoting the genotype of the subject
48 A \code{data.frame} of alleles denoting the genotype of the subject containing
49 the following columns:
50
51 \itemize{
52 \item \code{GENE}: The gene name without allele.
53 \item \code{ALLELES}: Comma separated list of alleles for the given \code{GENE}.
54 \item \code{COUNTS}: Comma separated list of observed sequences for each
55 corresponding allele in the \code{ALLELES} list.
56 \item \code{TOTAL}: The total count of observed sequences for the given \code{GENE}.
57 \item \code{NOTE}: Any comments on the inference.
58 }
4959 }
5060 \description{
51 \code{inferGenotype} infers an subject's genotype by finding the minimum
52 number set of alleles that can explain the majority of each gene's calls. The
53 most common allele of each gene is included in the genotype first, and the
54 next most common allele is added until the desired fraction of alleles can be
55 explained. In this way, mistaken allele calls (resulting from sequences which
61 \code{inferGenotype} infers a subject's genotype using a frequency method.
62 The genotype is inferred by finding the minimum set of alleles that
63 can explain the majority of each gene's calls. The most common allele of
64 each gene is included in the genotype first, and the next most common allele
65 is added until the desired fraction of alleles can be explained. In this
66 way, mistaken allele calls (resulting from sequences which
5667 by chance have been mutated to look like another allele) can be removed.
5768 }
5869 \details{
5970 Allele calls representing cases where multiple alleles have been
60 assigned to a single sample sequence are rare among unmutated
61 sequences but may result if nucleotides for certain positions are
62 not available. Calls containing multiple alleles are treated as
63 belonging to all groups. If \code{novel_df} is provided, all
64 sequences that are assigned to the same starting allele as any
65 novel germline allele will have the novel germline allele appended
66 to their assignent prior to searching for unmutated sequences.
71 assigned to a single sample sequence are rare among unmutated
72 sequences but may result if nucleotides for certain positions are
73 not available. Calls containing multiple alleles are treated as
74 belonging to all groups. If \code{novel} is provided, all
75 sequences that are assigned to the same starting allele as any
76 novel germline allele will have the novel germline allele appended
77 to their assignment prior to searching for unmutated sequences.
6778 }
6879 \note{
6980 This method works best with data derived from blood, where a large
70 portion of sequences are expected to be unmutated. Ideally, there
71 should be hundreds of allele calls per gene in the input.
81 portion of sequences are expected to be unmutated. Ideally, there
82 should be hundreds of allele calls per gene in the input.
7283 }
7384 \examples{
74 # Infer the IGHV genotype, using only unmutated sequences, including any
75 # novel alleles
76 data(sample_db)
77 data(germline_ighv)
78 data(novel_df)
79 inferGenotype(sample_db, find_unmutated = TRUE, germline_db = germline_ighv,
80 novel_df = novel_df)
85 # Infer IGHV genotype, using only unmutated sequences, including novel alleles
86 inferGenotype(SampleDb, germline_db=GermlineIGHV, novel=SampleNovel,
87 find_unmutated=TRUE)
8188
8289 }
8390 \seealso{
8491 \link{plotGenotype} for a colorful visualization and
8592 \link{genotypeFasta} to convert the genotype to nucleotide sequences.
93 See \link{inferGenotypeBayesian} to infer a subject-specific genotype
94 using a Bayesian approach.
8695 }
0 % Generated by roxygen2: do not edit by hand
1 % Please edit documentation in R/bayesian.R
2 \name{inferGenotypeBayesian}
3 \alias{inferGenotypeBayesian}
4 \title{Infer a subject-specific genotype using a Bayesian approach}
5 \usage{
6 inferGenotypeBayesian(data, germline_db = NA, novel = NA,
7 v_call = "V_CALL", find_unmutated = TRUE, priors = c(0.6, 0.4, 0.4,
8 0.35, 0.25, 0.25, 0.25, 0.25, 0.25))
9 }
10 \arguments{
11 \item{data}{a \code{data.frame} containing V allele
12 calls from a single subject. If \code{find_unmutated}
13 is \code{TRUE}, then the sample IMGT-gapped V(D)J sequence
14 should be provided in a column \code{"SEQUENCE_IMGT"}}
15
16 \item{germline_db}{named vector of sequences containing the
17 germline sequences named in \code{allele_calls}.
18 Only required if \code{find_unmutated} is \code{TRUE}.}
19
20 \item{novel}{an optional \code{data.frame} of the type
21 novel returned by \link{findNovelAlleles} containing
22 germline sequences that will be utilized if
23 \code{find_unmutated} is \code{TRUE}. See Details.}
24
25 \item{v_call}{column in \code{data} with V allele calls.
26 Default is \code{"V_CALL"}.}
27
28 \item{find_unmutated}{if \code{TRUE}, use \code{germline_db} to
29 find which samples are unmutated. Not needed
30 if \code{allele_calls} only represent
31 unmutated samples.}
32
33 \item{priors}{a numeric vector of priors for the multinomial distribution.
34 The \code{priors} vector must contain nine values that define
35 the priors for the heterozygous (two allele),
36 trizygous (three allele), and quadrozygous (four allele)
37 distributions. The first two values of \code{priors} define
38 the prior for the heterozygous case, the next three values are for
39 the trizygous case, and the final four values are for the
40 quadrozygous case. Each set of priors should sum to one.
41 Note, each distribution prior is actually defined internally
42 by a set of four numbers, with the unspecified final values
43 assigned to \code{0}; e.g., the heterozygous case is
44 \code{c(priors[1], priors[2], 0, 0)}. The prior for the
45 homozygous distribution is fixed at \code{c(1, 0, 0, 0)}.}
46 }
47 \value{
48 A \code{data.frame} of alleles denoting the genotype of the subject with the log10
49 of the likelihood of each model and the log10 of the Bayes factor. The output
50 contains the following columns:
51
52 \itemize{
53 \item \code{GENE}: The gene name without allele.
54 \item \code{ALLELES}: Comma separated list of alleles for the given \code{GENE}.
55 \item \code{COUNTS}: Comma separated list of observed sequences for each
56 corresponding allele in the \code{ALLELES} list.
57 \item \code{TOTAL}: The total count of observed sequences for the given \code{GENE}.
58 \item \code{NOTE}: Any comments on the inference.
59 \item \code{KH}: log10 likelihood that the \code{GENE} is homozygous.
60 \item \code{KD}: log10 likelihood that the \code{GENE} is heterozygous.
61 \item \code{KT}: log10 likelihood that the \code{GENE} is trizygous.
62 \item \code{KQ}: log10 likelihood that the \code{GENE} is quadrozygous.
63 \item \code{K_DIFF}: log10 ratio of the highest to second-highest zygosity likelihoods.
64 }
65 }
66 \description{
67 \code{inferGenotypeBayesian} infers a subject's genotype by applying a Bayesian framework
68 with a Dirichlet prior for the multinomial distribution. Up to four distinct alleles are
69 allowed in an individual’s genotype. Four likelihood distributions were generated by
70 empirically fitting three high coverage genotypes from three individuals
71 (Laserson and Vigneault et al, 2014). A posterior probability is calculated for the
72 four most common alleles. The certainty of the highest probability model was
73 calculated using a Bayes factor (the most likely model divided by second-most likely model).
74 The larger the Bayes factor (K), the greater the certainty in the model.
75 }
76 \details{
77 Allele calls representing cases where multiple alleles have been
78 assigned to a single sample sequence are rare among unmutated
79 sequences but may result if nucleotides for certain positions are
80 not available. Calls containing multiple alleles are treated as
81 belonging to all groups. If \code{novel} is provided, all
82 sequences that are assigned to the same starting allele as any
83 novel germline allele will have the novel germline allele appended
84 to their assignment prior to searching for unmutated sequences.
85 }
86 \note{
87 This method works best with data derived from blood, where a large
88 portion of sequences are expected to be unmutated. Ideally, there
89 should be hundreds of allele calls per gene in the input.
90 }
91 \examples{
92 # Infer IGHV genotype, using only unmutated sequences, including novel alleles
93 inferGenotypeBayesian(SampleDb, germline_db=GermlineIGHV, novel=SampleNovel,
94 find_unmutated=TRUE)
95
96 }
97 \references{
98 \enumerate{
99 \item Laserson U and Vigneault F, et al. High-resolution antibody dynamics of
100 vaccine-induced immune responses. PNAS. 2014 111(13):4928-33.
101 }
102 }
103 \seealso{
104 \link{plotGenotype} for a colorful visualization and
105 \link{genotypeFasta} to convert the genotype to nucleotide sequences.
106 See \link{inferGenotype} to infer a subject-specific genotype using
107 a frequency method.
108 }
66 insertPolymorphisms(sequence, positions, nucleotides)
77 }
88 \arguments{
9 \item{sequence}{the starting nucletide sequence}
9 \item{sequence}{starting nucleotide sequence.}
1010
11 \item{positions}{a vector of positions which to be changed}
11 \item{positions}{numeric vector of positions to be changed.}
1212
13 \item{nucleotides}{a vector of nucletides to which to change the
14 positions}
13 \item{nucleotides}{character vector of nucleotides to which to change the
14 positions.}
1515 }
1616 \value{
17 a sequence with the desired nucleotides in provided locations
17 A sequence with the desired nucleotides in the provided locations.
1818 }
1919 \description{
2020 \code{insertPolymorphisms} replaces nucleotides in the desired locations of a
2121 provided sequence.
2222 }
2323 \examples{
24 insertPolymorphisms("hugged", c(1,6,2), c("t","r","i"))
24 insertPolymorphisms("HUGGED", c(1, 6, 2), c("T", "R", "I"))
2525
2626 }
+0
-21
man/novel_df.Rd less more
0 % Generated by roxygen2: do not edit by hand
1 % Please edit documentation in R/data.R
2 \docType{data}
3 \name{novel_df}
4 \alias{novel_df}
5 \title{Example of Analyzed Rep-Seq data}
6 \format{A \code{data.frame} where rows correspond to alleles checked for
7 polymorphisms and columns give results as well as paramaters used to run
8 the test.}
9 \description{
10 Example VDJ-rearranged immunoglobulin Rep-Seq sequences derived from a single
11 individual (PGP1), sequenced on the Roche 454 platform, and thought by
12 IMGT/V-QUEST to utilize IGHV1 family alleles, as processed by
13 \link{findNovelAlleles}.
14 }
15 \references{
16 Gadala-Maria \emph{et al}. (2015) Automated analysis of
17 high-throughput B cell sequencing data reveals a high frequency of novel
18 immunoglobulin V gene segment alleles. \emph{PNAS}. 112(8):E862-70.
19 }
20 \keyword{data}
33 \alias{plotGenotype}
44 \title{Show a colorful representation of a genotype}
55 \usage{
6 plotGenotype(genotype, facet_by = NULL, gene_sort = c("name", "position"),
7 text_size = 12, silent = FALSE, ...)
6 plotGenotype(genotype, facet_by = NULL, gene_sort = c("name",
7 "position"), text_size = 12, silent = FALSE, ...)
88 }
99 \arguments{
10 \item{genotype}{a table of alleles denoting a genotype, as returned by
11 \link{inferGenotype}}
10 \item{genotype}{a \code{data.frame} of alleles denoting a genotype,
11 as returned by \link{inferGenotype}.}
1212
1313 \item{facet_by}{a column name in \code{genotype} to facet the plot by.
1414 If \code{NULL}, then do not facet the plot.}
1818 \code{"position"} then sort by position in the locus, as
1919 determined by the final two numbers in the gene name.}
2020
21 \item{text_size}{the point size of the plotted text}
21 \item{text_size}{the point size of the plotted text.}
2222
2323 \item{silent}{if \code{TRUE} do not draw the plot and just return the ggplot
2424 object; if \code{FALSE} draw the plot.}
3232 \code{plotGenotype} plots a genotype table.
3333 }
3434 \examples{
35 # Load example data
36 data(novel_df)
37 data(genotype)
38
3935 # Plot genotype
40 plotGenotype(genotype)
36 plotGenotype(SampleGenotype)
4137
4238 # Facet by subject
43 genotypea = genotypeb = genotype
44 genotypea$SUBJECT = "A"
45 genotypeb$SUBJECT = "B"
46 geno_sub = rbind(genotypea, genotypeb)
39 genotype_a <- genotype_b <- SampleGenotype
40 genotype_a$SUBJECT <- "A"
41 genotype_b$SUBJECT <- "B"
42 geno_sub <- rbind(genotype_a, genotype_b)
4743 plotGenotype(geno_sub, facet_by="SUBJECT", gene_sort="pos")
4844
4945 }
33 \alias{plotNovel}
44 \title{Visualize evidence of novel V alleles}
55 \usage{
6 plotNovel(clip_db, novel_df_row, ncol = 1, v_call = "V_CALL")
6 plotNovel(data, novel_row, v_call = "V_CALL", ncol = 1)
77 }
88 \arguments{
9 \item{clip_db}{a \code{data.frame} in Change-O format. See
9 \item{data}{a \code{data.frame} in Change-O format. See
1010 \link{findNovelAlleles} for details.}
1111
12 \item{novel_df_row}{a single row from a data frame as output by
12 \item{novel_row}{a single row from a data frame as output by
1313 \link{findNovelAlleles} that contains a
1414 polymorphism-containing germline allele}
1515
16 \item{v_call}{name of the column in \code{data} with V allele
17 calls. Default is "V_CALL".}
18
1619 \item{ncol}{number of columns to use when laying out the plots}
17
18 \item{v_call}{name of the column in \code{clip_db} with V allele
19 calls. Default is "V_CALL"}
2020 }
2121 \description{
2222 \code{plotNovel} is used to visualize the evidence of any novel V
23 alleles found using \link{findNovelAlleles}.
23 alleles found using \link{findNovelAlleles}. It can also be used to
24 visualize the results for alleles that did not pass the novel allele test.
25 }
26 \details{
27 The first panel in the plot shows, for all sequences which align to a particular
28 germline allele, the mutation frequency at each position along the aligned
29 sequence as a function of the sequence-wide mutation. Sequences that pass
30 the novel allele test are colored red, while sequences that don't pass
31 the test are colored yellow. The second panel shows the nucleotide usage at the
32 positions as a function of sequence-wide mutation count.
33
34 To avoid cases where a clonal expansion might lead to a false positive, tigger examines
35 the combinations of J gene and junction length among sequences which perfectly
36 match the proposed germline allele.
2437 }
2538 \examples{
26 # Load example data and germlines
27 data(sample_db)
28 data(germline_ighv)
29
30 # Find novel alleles and return relevant data
31 \dontrun{novel_df = findNovelAlleles(sample_db, germline_ighv)}
32 data(novel_df)
3339 # Plot the evidence for the first (and only) novel allele in the example data
34 novel = selectNovel(novel_df)
35 plotNovel(sample_db, novel[1,])
40 novel <- selectNovel(SampleNovel)
41 plotNovel(SampleDb, novel[1, ])
3642
3743 }
66 readIgFasta(fasta_file, strip_down_name = TRUE, force_caps = TRUE)
77 }
88 \arguments{
9 \item{fasta_file}{fasta-formatted file of immunoglobuling sequences}
9 \item{fasta_file}{fasta-formatted file of immunoglobulin sequences.}
1010
1111 \item{strip_down_name}{if \code{TRUE}, will extract only the allele name
12 from the strings fasta file's sequence names}
12 from the strings fasta file's sequence names.}
1313
1414 \item{force_caps}{if \code{TRUE}, will force nucleotides to
15 uppercase}
15 uppercase.}
1616 }
1717 \value{
18 a named vector of strings respresenting Ig alleles
18 Named vector of strings representing Ig alleles.
1919 }
2020 \description{
2121 \code{readIgFasta} reads a fasta-formatted file of immunoglobulin (Ig)
33 \alias{reassignAlleles}
44 \title{Correct allele calls based on a personalized genotype}
55 \usage{
6 reassignAlleles(clip_db, genotype_db, v_call = "V_CALL", method = "hamming",
7 path = NA, keep_gene = TRUE)
6 reassignAlleles(data, genotype_db, v_call = "V_CALL",
7 method = "hamming", path = NA, keep_gene = c("gene", "family",
8 "repertoire"))
89 }
910 \arguments{
10 \item{clip_db}{a \code{data.frame} containing V allele calls from a
11 single subject and the sample
12 IMGT-gapped V(D)J sequences under
13 \code{"SEQUENCE_IMGT"}}
11 \item{data}{a \code{data.frame} containing V allele calls from a
12 single subject and the sample IMGT-gapped V(D)J sequences under
13 \code{"SEQUENCE_IMGT"}.}
1414
1515 \item{genotype_db}{a vector of named nucleotide germline sequences
1616 matching the calls detailed in \code{allele_calls}
1717 and personalized to the subject}
1818
19 \item{v_call}{name of the column in \code{clip_db} with V allele
20 calls. Default is \code{"V_CALL"}}
19 \item{v_call}{name of the column in \code{data} with V allele
20 calls. Default is \code{"V_CALL"}.}
2121
2222 \item{method}{the method to be used when realigning sequences to
23 the genotype_db sequences. Currently only "hammming"
23 the genotype_db sequences. Currently, only \code{"hamming"}
2424 (for Hamming distance) is implemented.}
2525
2626 \item{path}{directory containing the tool used in the
2727 realignment method, if needed. Hamming distance does
2828 not require a path to a tool.}
2929
30 \item{keep_gene}{logical indicating if gene assignments should be
31 maintained when possible. Increases speed by
32 minimizing required number of alignments. Currently
33 only "TRUE" is implemented.}
30 \item{keep_gene}{a string indicating if the gene (\code{"gene"}),
31 family (\code{"family"}) or complete repertoire
32 (\code{"repertoire"}) assignments should be performed.
33 Use of \code{"gene"} increases speed by minimizing required number of
34 alignments, as gene level assignments will be maintained when possible.}
3435 }
3536 \value{
36 a single-column \code{data.frame} corresponding to \code{clip.db}
37 and containing the best allele call from among the sequences
38 listed in \code{genotype_db}
37 A modified input \code{data.frame} containing the best allele call from
38 among the sequences listed in \code{genotype_db} in the
39 \code{V_CALL_GENOTYPED} column.
3940 }
4041 \description{
4142 \code{reassignAlleles} uses a subject-specific genotype to correct
4849 based on a simple alignment to the sample sequence.
4950 }
5051 \examples{
51 # Load example data
52 data(germline_ighv)
53 data(sample_db)
54 data(genotype)
55 data(novel_df)
56
5752 # Extract the database sequences that correspond to the genotype
58 genotype_seqs = genotypeFasta(genotype, germline_ighv, novel_df)
53 genotype_db <- genotypeFasta(SampleGenotype, GermlineIGHV, novel=SampleNovel)
5954
6055 # Use the personalized genotype to determine corrected allele assignments
61 V_CALL_GENOTYPED = reassignAlleles(sample_db, genotype_seqs)
62 sample_db = cbind(sample_db, V_CALL_GENOTYPED)
56 output_db <- reassignAlleles(SampleDb, genotype_db)
6357
6458 }
+0
-25
man/sample_db.Rd less more
0 % Generated by roxygen2: do not edit by hand
1 % Please edit documentation in R/data.R
2 \docType{data}
3 \name{sample_db}
4 \alias{sample_db}
5 \title{Example human Rep-Seq data}
6 \format{A \code{data.frame} where rows correspond to unique VDJ sequences and
7 columns include:
8 \itemize{
9 \item IMGT-gapped nucleotide sequence (\code{"SEQUENCE_IMGT"})
10 \item IMGT/V-QUEST allele calls (\code{"V_CALL"}, \code{"D_CALL"}, and
11 \code{"J_CALL"})
12 \item Junction length (\code{"JUNCTION_LENGTH"})
13 }}
14 \description{
15 Example VDJ-rearranged immunoglobulin Rep-Seq sequences derived from a single
16 individual (PGP1), sequenced on the Roche 454 platform, and thought by
17 IMGT/V-QUEST to utilize IGHV1 family alleles.
18 }
19 \references{
20 Gadala-Maria \emph{et al}. (2015) Automated analysis of
21 high-throughput B cell sequencing data reveals a high frequency of novel
22 immunoglobulin V gene segment alleles. \emph{PNAS}. 112(8):E862-70.
23 }
24 \keyword{data}
33 \alias{selectNovel}
44 \title{Select rows containing novel alleles}
55 \usage{
6 selectNovel(novel_df, keep_alleles = FALSE)
6 selectNovel(novel, keep_alleles = FALSE)
77 }
88 \arguments{
9 \item{novel_df}{A \code{data.frame} of the type returned by
10 \link{findNovelAlleles}}
9 \item{novel}{a \code{data.frame} of the type returned by
10 \link{findNovelAlleles}.}
1111
12 \item{keep_alleles}{A \code{logical} indicating if different alleles
12 \item{keep_alleles}{a \code{logical} indicating if different alleles
1313 leading to the same novel sequence should be kept.
14 See details.}
14 See Details.}
1515 }
1616 \value{
1717 A \code{data.frame} containing only unique, novel alleles (if any)
2222 selects only the rows containing unique, novel alleles.
2323 }
2424 \details{
25 If, for instance, subject has in his genome IGHV1-2*02 and a novel
26 allele equally close to IGHV1-2*02 and IGHV1-2*05, the novel allele may be
25 If, for instance, subject has in his genome \code{IGHV1-2*02} and a novel
26 allele equally close to \code{IGHV1-2*02} and \code{IGHV1-2*05}, the novel allele may be
2727 detected by analyzing sequences that best align to either of these alleles.
2828 If \code{keep_alleles} is \code{TRUE}, both polymorphic allele calls will
2929 be retained. In the case that multiple mutation ranges are checked for the
3030 same allele, only one mutation range will be kept in the output.
3131 }
3232 \examples{
33 data(novel_df)
34 novel = selectNovel(novel_df)
33 novel <- selectNovel(SampleNovel)
3534
3635 }
66 sortAlleles(allele_calls, method = c("name", "position"))
77 }
88 \arguments{
9 \item{allele_calls}{a vector of strings respresenting Ig allele names}
9 \item{allele_calls}{a vector of strings representing Ig allele names.}
1010
1111 \item{method}{a string defining the method to use when sorting alleles.
1212 If \code{"name"} then sort in lexicographic order. If
1414 determined by the final two numbers in the gene name.}
1515 }
1616 \value{
17 A sorted vector of strings respresenting Ig allele names
17 A sorted vector of strings representing Ig allele names.
1818 }
1919 \description{
2020 \code{sortAlleles} returns a sorted vector of strings representing Ig allele
2121 names. Names are first sorted by gene family, then by gene, then by allele.
2222 Duplicated genes have their alleles sorted as if they were part of their
23 non-duplicated counterparts (e.g. IGHV1-69D*01 comes after IGHV1-69*01 but
24 before IGHV1-69*02), and non-localized genes (e.g. IGHV1-NL1*01) come last
25 within their gene family.
23 non-duplicated counterparts (e.g. \code{IGHV1-69D*01} comes after \code{IGHV1-69*01}
24 but before \code{IGHV1-69*02}), and non-localized genes (e.g. \code{IGHV1-NL1*01})
25 come last within their gene family.
2626 }
2727 \examples{
2828 # Create a list of allele names
29 alleles = c("IGHV1-69D*01","IGHV1-69*01","IGHV1-2*01","IGHV1-69-2*01",
30 "IGHV2-5*01","IGHV1-NL1*01", "IGHV1-2*01,IGHV1-2*05",
31 "IGHV1-2", "IGHV1-2*02", "IGHV1-69*02")
29 alleles <- c("IGHV1-69D*01","IGHV1-69*01","IGHV1-2*01","IGHV1-69-2*01",
30 "IGHV2-5*01","IGHV1-NL1*01", "IGHV1-2*01,IGHV1-2*05",
31 "IGHV1-2", "IGHV1-2*02", "IGHV1-69*02")
3232
3333 # Sort the alleles by name
3434 sortAlleles(alleles)
66 \title{tigger}
77 \description{
88 Here we provide a \strong{T}ool for \strong{I}mmuno\strong{g}lobulin
9 \strong{G}enotype \strong{E}lucidation via
10 \strong{R}ep-Seq (TIgGER). TIgGER inferrs the set of Ig alleles carried by an
9 \strong{G}enotype \strong{E}lucidation via \strong{R}ep-Seq (TIgGER).
10 TIgGER infers the set of Ig alleles carried by an
1111 individual (including any novel alleles) and then uses this set of alleles to
1212 correct the initial assignments given to sample sequences by existing tools.
1313 }
1414 \details{
15 Immunoglobulin Repertoire-Sequencing (Rep-Seq) data is currently the
15 Immunoglobulin repertoire sequencing (AIRR-Seq, Rep-Seq) data is currently the
1616 subject of much study. A key step in analyzing these data involves assigning
1717 the closest known V(D)J germline alleles to the (often somatically mutated)
1818 sample sequences using a tool such as IMGT/HighV-QUEST. However, if the
1919 sample utilizes alleles not in the germline database used for alignment, this
2020 step will fail. Additionally, this alignment has an associated error rate of
21 ~5 percent, notably among sequences carrying a large number of somatic
21 ~5%, notably among sequences carrying a large number of somatic
2222 mutations. The purpose of TIgGER is to address these issues.
2323 }
24 \section{Core tigger functions}{
24 \section{Allele detection and genotyping}{
2525
2626 \itemize{
27 \item \link{findNovelAlleles}: Detect novel alleles
28 \item \link{plotNovel}: Plot evidence of novel alleles
29 \item \link{inferGenotype}: Infer an Ig genotype
30 \item \link{plotGenotype}: A colorful genotype visualization
31 \item \link{genotypeFasta}: Convert a genotype to sequences
32 \item \link{reassignAlleles}: Correct allele calls
27 \item \link{findNovelAlleles}: Detect novel alleles.
28 \item \link{plotNovel}: Plot evidence of novel alleles.
29 \item \link{inferGenotype}: Infer an Ig genotype using a frequency approach.
30 \item \link{inferGenotypeBayesian}: Infer an Ig genotype using a Bayesian approach.
31 \item \link{plotGenotype}: A colorful genotype visualization.
32 \item \link{genotypeFasta}: Convert a genotype to sequences.
33 \item \link{reassignAlleles}: Correct allele calls.
34 \item \link{generateEvidence}: Generate evidence for the genotype and
35 allele detection inference.
3336 }
3437 }
3538
36 \section{Mutation-related functions}{
39 \section{Mutation handling}{
3740
3841 \itemize{
39 \item \link{getMutatedPositions}: Find mutation locations
40 \item \link{getMutCount}: Find distance from germline
41 \item \link{findUnmutatedCalls}: Subset unmutated sequences
42 \item \link{getMutatedPositions}: Find mutation locations.
43 \item \link{getMutCount}: Find distance from germline.
44 \item \link{findUnmutatedCalls}: Subset unmutated sequences.
4245 \item \link{getPopularMutationCount}: Find most common sequence's
43 mutation count
44 \item \link{insertPolymorphisms}: Insert SNPs into a sequence
46 mutation count.
47 \item \link{insertPolymorphisms}: Insert SNPs into a sequence.
4548 }
4649 }
4750
48 \section{Input and formatting}{
51 \section{Input, output and formatting}{
4952
5053 \itemize{
51 \item \link{readIgFasta}: Read a fasta file of Ig sequences
52 \item \link{updateAlleleNames}: Correct outdated allele names
53 \item \link{sortAlleles}: Sort allele names intelligently
54 \item \link{cleanSeqs}: Standardize sequence format
54 \item \link{readIgFasta}: Read a fasta file of Ig sequences.
55 \item \link{updateAlleleNames}: Correct outdated allele names.
56 \item \link{sortAlleles}: Sort allele names intelligently.
57 \item \link{cleanSeqs}: Standardize sequence format.
5558 }
5659 }
5760
5861 \references{
59 Gadala-Maria \emph{et al}. (2015) Automated analysis of
60 high-throughput B cell sequencing data reveals a high frequency of novel
61 immunoglobulin V gene segment alleles. \emph{PNAS}. 112(8):E862-70.
62 \enumerate{
63 \item Gadala-Maria, et al. (2015) Automated analysis of high-throughput B cell
64 sequencing data reveals a high frequency of novel immunoglobulin V gene
65 segment alleles. PNAS. 112(8):E862-70.
6266 }
67 }
66 updateAlleleNames(allele_calls)
77 }
88 \arguments{
9 \item{allele_calls}{a vector of strings respresenting IGHV allele names}
9 \item{allele_calls}{a vector of strings representing IGHV allele names.}
1010 }
1111 \value{
12 vector of strings respresenting updated IGHV allele names
12 Vector of strings representing updated IGHV allele names.
1313 }
1414 \description{
1515 \code{updateAlleleNames} takes a set of IGHV allele calls and replaces any
1616 outdated names (e.g. IGHV1-f) with the new IMGT names.
1717 }
18 \details{
19 The updated allele names are based on IMGT release 201408-4.
20 }
2118 \note{
22 IGMT has removed IGHV2-5*10 and IGHV2-5*07 as it has determined they
23 are actually alleles *02 and *04, respectively.
19 IMGT has removed \code{IGHV2-5*10} and \code{IGHV2-5*07} as it has determined they
20 are actually alleles \code{02} and \code{04}, respectively. The updated allele
21 names are based on IMGT release 201408-4.
2422 }
2523 \examples{
2624 # Create a vector that uses old gene/allele names.
27 alleles = c("IGHV1-c*01", "IGHV1-f*02", "IGHV2-5*07")
25 alleles <- c("IGHV1-c*01", "IGHV1-f*02", "IGHV2-5*07")
2826
2927 # Update the alleles to the new names
3028 updateAlleleNames(alleles)
3129
3230 }
3331 \references{
34 Xochelli et al. (2014) Immunoglobulin heavy variable (IGHV) genes
35 and alleles: new entities, new names and implications for research and
36 prognostication in chronic lymphocytic leukaemia. Immunogenetics. 67(1):61-6
32 \enumerate{
33 \item Xochelli et al. (2014) Immunoglobulin heavy variable (IGHV) genes
34 and alleles: new entities, new names and implications for research and
35 prognostication in chronic lymphocytic leukaemia. Immunogenetics. 67(1):61-6
36 }
3737 }
3838 \seealso{
3939 Like \code{updateAlleleNames}, \link{sortAlleles} can help
88 \arguments{
99 \item{named_sequences}{a vector of named string representing sequences}
1010
11 \item{file}{the name of the output file}
11 \item{file}{the name of the output file.}
1212
1313 \item{width}{the number of characters to be printed per line.
14 If not between 1 and 255, width with be infinite.}
14 If not between 1 and 255, width will be infinite.}
1515
1616 \item{append}{\code{logical} indicating if the output should be
1717 appended to \code{file} instead of overwriting it}
1818 }
1919 \value{
20 a named vector of strings respresenting Ig alleles
20 A named vector of strings representing Ig alleles.
2121 }
2222 \description{
2323 \code{writeFasta} writes a named vector of sequences to a file in fasta
2929
3030 ## Introduction
3131
32 Immunoglobulin Repertoire-Sequencing (Rep-Seq) data is currently the subject of
33 much study. A key step in analyzing these data involves assigning the closest
34 known V(D)J germline alleles to the (often somatically mutated) sample sequences
35 using a tool such as IMGT/HighV-QUEST ([[1]][1]). However, if the sample utilizes
36 alleles not in the germline database used for alignment, this step will fail.
37 Additionally, this alignment has an associated error rate of ~5% ([[2]][2]),
38 notably among sequences carrying a large number of somatic mutations.
32 Adaptive immune receptor repertoire sequencing (AIRR-Seq, Rep-Seq) data is
33 currently the subject of much study. A key step in analyzing these data involves
34 assigning the closest known V(D)J germline alleles to the (often somatically mutated)
35 sample sequences using a tool such as IMGT/HighV-QUEST ([[1]][1]). However,
36 if the sample utilizes alleles not in the germline database used for alignment,
37 this step will fail. Additionally, this alignment has an associated error rate
38 of ~5% ([[2]][2]), notably among sequences carrying a large number of somatic
39 mutations.
3940
4041 Here we provide a **T**ool for **I**mmuno**g**lobulin **G**enotype
4142 **E**lucidation via **R**ep-Seq (TIgGER). TIgGER addresses these issues by
42 inferring the set of Ig alleles carried by an individual (including any novel
43 alleles) and then using this set of alleles to correct the initial assignments
44 given to sample sequences by existing tools.
45
46 Additional information is available in:
43 inferring the set of Immunoglobulin (Ig) alleles carried by an individual
44 (including any novel alleles) and then using this set of alleles to correct
45 the initial assignments given to sample sequences by existing tools.
46
47 This vignette covers the following tasks:
48
49 1. Inferring the presence of novel IGHV alleles not in the germline database.
50 2. Inferring the personalized IGHV genotype of a sample.
51 3. Correcting the IGHV allele calls of a sample based on the IGHV genotype.
52
53 Additional information about the methods used by TIgGER is available in:
4754
4855 [Gadala-Maria D, Yaari G, Uduman M, Kleinstein SH (2015) Automated analysis of
4956 high-throughput B cell sequencing data reveals a high frequency of novel
5057 immunoglobulin V gene segment alleles. *PNAS*
5158 112(8):E862-70](http://www.pnas.org/content/early/2015/02/05/1417683112).
5259
53
5460 ## Input
5561
5662 TIgGER requires two main inputs:
5763
58 1. Pre-processed Rep-Seq data
64 1. Pre-processed Ig sequence data
5965 2. Database germline sequences
6066
61 Rep-seq data is input as a data frame where each row represents a unique
62 observation and and columns represent data about that observation. The required
63 names of the required columns are provided below along with a description of
64 each.
67 AIRR-seq data is input as a data frame following the Change-O standard where
68 each row represents a unique observation and columns represent data about
69 that observation. The names of the required columns are provided below
70 along with a description of each.
6571
6672 Column Name | Description
6773 ----------------------|---------------------------------------------------------
7076 `J_CALL` | (Comma separated) name(s) of the nearest J allele(s)
7177 `JUNCTION_LENGTH` | Length of the junction region of the V(D)J sample
7278
73 An example dataset is provided with the `tigger` package. It contains unique
74 functional sequences assigned to IGHV1 family genes isolated from individual
75 PGP1 (referenced in Gadala-Maria *et al.* 2015).
79 An example dataset is provided with the `tigger` package as `SampleDb`. It
80 contains unique functional sequences assigned to IGHV1 family genes isolated
81 from individual PGP1 (referenced in Gadala-Maria *et al.* 2015).
7682
7783 The database of germline sequences should be provided in FASTA format with
7884 sequences gapped according to the IMGT numbering scheme ([[3]][3]). IGHV alleles in
79 the IMGT database (build 201408-4) are provided with this package. You may read
80 in your own fasta file using `readIgFasta`.
85 the IMGT database (build 201408-4) are provided with this package as `GermlineIGHV`.
86 You may read in your own fasta file using `readIgFasta`.
8187
8288 ```{r, eval=TRUE, message=FALSE, warning=FALSE}
89 # Load packages required for this example
8390 library(tigger)
8491 library(dplyr)
85 # Load example sequence data and example germline database
86 data(sample_db, germline_ighv)
87 ```
88
89 ## Running TIgGER
90
91 The functions provided by this package can be used to perform any combination of
92 the following:
93
94 1. Infer the presence of novel IGHV alleles not in the germline database
95 2. Infer the individual's IGHV genotype
96 3. Correct the IGHV allele calls of the samples based on the IGHV genotype
97
98 ### Novel Alleles
92 ```
93
94 ## Novel allele detection
9995
10096 Potential novel alleles can be detected by TIgGER. Some of these may be included
10197 in the genotype later (see below). `findNovelAlleles` will return a `data.frame`
108104
109105 ```{r, eval=TRUE, warning=FALSE}
110106 # Detect novel alleles
111 novel_df <- findNovelAlleles(sample_db, germline_ighv, nproc=1)
107 novel <- findNovelAlleles(SampleDb, GermlineIGHV, nproc=1)
112108 ```
113109
114110 ```{r, eval=TRUE, warning=FALSE}
115111 # Extract and view the rows that contain successful novel allele calls
116 novel <- selectNovel(novel_df)
117 novel[1:3]
112 novel_rows <- selectNovel(novel)
113 novel_rows[1:3]
118114 ```
119115
120116 The TIgGER procedure for identifying novel alleles (see citation above) involves
147143
148144 ```{r, eval=TRUE, warning=FALSE, fig.width=6, fig.height=8}
149145 # Plot evidence of the first (and only) novel allele from the example data
150 plotNovel(sample_db, novel[1, ])
151 ```
152
153 ### Genotype
154 An individual's genotype can be inferred using the function `inferGenotype`.
155 This function will remove from the genotype rare/erroneous allele calls which
156 may result from mutations in allele-differentiating regions. This is done by
157 determining the fewest alleles that account for nearly all (default is 7/8) of
158 the allele calls made. The user may opt to only use sequences which perfectly
159 match germline alleles, and may opt to include potential novel alleles.
160 (The genotype output is designed to be human readable, though `plotGenotype`
161 can be used to make a colorful visualization.) For each allele, the
162 number of sequences which match the germline are listed in the same order as
163 the alleles are listed. The total number of sequences that match any allele of
164 that gene is also given. To output these alleles as a names vector of nucleotide
165 sequences, the user may use the function `genotypeFasta`. To save this vector to
166 a fasta file, `writeFasta` may be used.
146 plotNovel(SampleDb, novel[1, ])
147 ```
148
149 ## Inferring genotypes
150
151 An individual's genotype can be inferred using the functions `inferGenotype` or
152 `inferGenotypeBayesian`. Using one of these functions removes from the
153 genotype rare/erroneous allele calls which may result from mutations in
154 allele-differentiating regions. `inferGenotype` uses a frequency method to
155 decide which alleles belong to the subject's genotype, whereas
156 `inferGenotypeBayesian` infers a subject's genotype by applying a Bayesian
157 framework and provides a confidence estimate associated with
158 the genotype calls.
159
160
161 ### Frequency genotyping approach
162
163 `inferGenotype` identifies the fewest alleles that account for
164 nearly all (default is 7/8) of the allele calls made. The user may opt to only
165 use sequences which perfectly match germline alleles, and may opt to include
166 potential novel alleles. (The genotype output is designed to be human readable,
167 though `plotGenotype` can be used to make a colorful visualization.) For each
168 allele, the number of sequences which match the germline are listed in the same
169 order as the alleles are listed. The total number of sequences that match any
170 allele of that gene is also given. To output these alleles as a named vector of
171 nucleotide sequences, the user may use the function `genotypeFasta`. To save
172 this vector to a fasta file, `writeFasta` may be used.
167173
168174 ```{r, eval=TRUE, warning=FALSE, fig.width=4, fig.height=3}
169175 # Infer the individual's genotype, using only unmutated sequences and checking
170176 # for the use of the novel alleles inferred in the earlier step.
171 geno <- inferGenotype(sample_db, find_unmutated = TRUE,
172 germline_db = germline_ighv, novel_df = novel_df)
177 geno <- inferGenotype(SampleDb, germline_db=GermlineIGHV, novel=novel,
178 find_unmutated=TRUE)
173179 # Save the genotype sequences to a vector
174 genotype_seqs <- genotypeFasta(geno, germline_ighv, novel_df)
180 genotype_db <- genotypeFasta(geno, GermlineIGHV, novel)
175181 # Visualize the genotype and sequence counts
176182 print(geno)
177183 # Make a colorful visualization. Bars indicate presence, not proportion.
178184 plotGenotype(geno, text_size = 10)
179
180 ```
181
182 ### Corrected Allele Calls
185 ```
186
187 ### Bayesian genotyping approach
188
189 The method `inferGenotypeBayesian` analyzes the posterior probabilities of
190 possible allele distributions, considering up to four distinct alleles per
191 V gene, corresponding to a gene duplication with both loci being heterozygous
192 (i.e., homozygous, heterozygous with one copy of each allele, etc.). The
193 posterior probabilities for these four possible models are compared and a Bayes
194 factor is calculated for the two most probable models. This Bayes factor
195 reflects the confidence in the genotyping call of the method. The Bayesian
196 method doesn't use the strict cutoff criterion `fraction_to_explain` that
197 `inferGenotype` uses wherein only the minimum set of alleles explaining
198 88% (7/8) of apparently-unmutated sequences are included in the genotype.
199
200
201 ```{r, eval=TRUE, warning=FALSE, fig.width=4, fig.height=3}
202 # Infer the individual's genotype, using the bayesian method
203 geno_bayesian <- inferGenotypeBayesian(SampleDb, germline_db=GermlineIGHV,
204 novel=novel, find_unmutated=TRUE)
205 # Visualize the genotype and sequence counts
206 print(geno_bayesian)
207 # Make a colorful visualization. Bars indicate presence, not proportion.
208 plotGenotype(geno_bayesian, text_size=10)
209 ```
210
211 ## Correcting allele calls
183212
184213 Finally, the original V allele calls may be limited to only those within the
185214 inferred genotype. This can be done by using the function `reassignAlleles`.
186 By corrected the calls in this manner, the user can greatly reduce the numer of
215 By correcting the calls in this manner, the user can greatly reduce the number of
187216 ambiguous allele calls (where a single sample sequence is assigned to multiple
188217 V alleles, thus preventing the mutation analysis of allele-differentiating
189218 positions). Additionally, assignments to erroneous not-in-genotype alleles
191220
192221 ```{r, eval=TRUE, warning=FALSE}
193222 # Use the personalized genotype to determine corrected allele assignments
194 V_CALL_GENOTYPED <- reassignAlleles(sample_db, genotype_seqs)
195 # Append the corrected calls to the original data.frame
196 sample_db <- bind_cols(sample_db, V_CALL_GENOTYPED)
223 # Updated genotype will be placed in the V_CALL_GENOTYPED column
224 sample_db <- reassignAlleles(SampleDb, genotype_db)
197225 ```
198226
199227 From here, one may proceed with further downstream analyses, but with the
200228 advantage of having much-improved allele calls. Besides having discovered
201 alleles not in the IGMT database, the calls made by IMGT have been tailored to
229 alleles not in the IMGT database, the calls made by IMGT have been tailored to
202230 the subject's genotype, greatly reducing the number of problematic calls, as
203231 can be seen below.
204232
205233 ```{r, eval=TRUE, warning=FALSE}
206234 # Find the set of alleles in the original calls that were not in the genotype
207235 not_in_genotype <- sample_db$V_CALL %>%
208 strsplit(",") %>%
209 unlist() %>%
210 unique() %>%
211 setdiff(names(genotype_seqs))
236 strsplit(",") %>%
237 unlist() %>%
238 unique() %>%
239 setdiff(names(genotype_db))
212240
213241 # Determine the fraction of calls that were ambiguous before/after correction
214242 # and the fraction that contained original calls to non-genotype alleles. Note
215243 # that by design, only genotype alleles are allowed in "after" calls.
216 data.frame(Ambiguous = c(mean(grepl(",",sample_db$V_CALL)),
217 mean(grepl(",",sample_db$V_CALL_GENOTYPED))),
218 NotInGenotype = c(mean(sample_db$V_CALL %in% not_in_genotype),
219 mean(sample_db$V_CALL_GENOTYPED %in% not_in_genotype)),
220 row.names = c("Before", "After")) %>%
244 data.frame(Ambiguous=c(mean(grepl(",", sample_db$V_CALL)),
245 mean(grepl(",", sample_db$V_CALL_GENOTYPED))),
246 NotInGenotype=c(mean(sample_db$V_CALL %in% not_in_genotype),
247 mean(sample_db$V_CALL_GENOTYPED %in% not_in_genotype)),
248 row.names=c("Before", "After")) %>%
221249 t() %>% round(3)
222
223 ```
224
250 ```
225251
226252 ## References
227253
232258 [1]: http://www.imgt.org/IMGTindex/IMGTHighV-QUEST.html "Alamyar et al. (2010)"
233259 [2]: http://www.ncbi.nlm.nih.gov/pubmed/20147303 "Munshaw and Kepler (2010)"
234260 [3]: http://www.ncbi.nlm.nih.gov/pubmed/12477501 "Lefranc et al. (2003)"
261