New upstream version 0.3.1
Andreas Tille
5 years ago
0 | 0 | Package: tigger |
1 | 1 | Type: Package |
2 | Version: 0.2.11 | |
3 | Date: 2017-09-21 | |
2 | Version: 0.3.1 | |
3 | Date: 2018-10-19 | |
4 | 4 | Authors@R: c(person("Daniel", "Gadala-Maria", role=c("aut"), |
5 | 5 | email="daniel.gadala-maria@yale.edu"), |
6 | person("Susanna", "Marquez", role=c("aut"), | |
7 | email="susanna.marquez@yale.edu"), | |
8 | person("Moriah", "Cohen", role=c("aut"), | |
9 | email="moriah.cohen@biu.ac.il"), | |
10 | person("Gur", "Yaari", role=c("aut"), | |
11 | email="gur.yaari@biu.ac.il"), | |
6 | 12 | person("Jason", "Vander Heiden", role=c("ctb", "cre"), |
7 | 13 | email="jason.vanderheiden@yale.edu"), |
8 | 14 | person("Steven", "Kleinstein", role=c("aut", "cph"), |
9 | 15 | email="steven.kleinstein@yale.edu")) |
10 | Title: R Tools for Inferring New Immunoglobulin Alleles from Rep-Seq | |
11 | Data | |
16 | Title: Infers Novel Immunoglobulin Alleles from Sequencing Data | |
12 | 17 | Description: Infers the V genotype of an individual from immunoglobulin (Ig) |
13 | repertoire-sequencing (Rep-Seq) data, including detection of any novel | |
14 | alleles. This information is then used to correct existing V allele calls | |
15 | from among the sample sequences. | |
18 | repertoire sequencing data (AIRR-Seq, Rep-Seq). Includes detection of | |
19 | any novel alleles. This information is then used to correct existing V | |
20 | allele calls from among the sample sequences. | |
21 | Citations: | |
22 | Gadala-Maria, et al (2015) <doi:10.1073/pnas.1417683112>. | |
16 | 23 | License: CC BY-SA 4.0 |
17 | 24 | URL: http://tigger.readthedocs.io |
18 | 25 | BugReports: https://bitbucket.org/kleinstein/tigger/issues |
19 | 26 | LazyData: true |
20 | 27 | BuildVignettes: true |
21 | 28 | VignetteBuilder: knitr |
29 | Encoding: UTF-8 | |
22 | 30 | Depends: R (>= 3.2.5), ggplot2 (>= 2.0.0) |
23 | Imports: alakazam (>= 0.2.6), tidyr, dplyr (>= 0.5.0), doParallel, | |
24 | foreach, graphics, grid, iterators, lazyeval, parallel, stats | |
31 | Imports: alakazam (>= 0.2.11), dplyr (>= 0.7.0), doParallel, foreach, | |
32 | graphics, gridExtra, gtools, iterators, lazyeval, parallel, | |
33 | rlang, shazam (>= 0.1.10), stats, stringi, tidyr | |
25 | 34 | Suggests: knitr, testthat |
26 | RoxygenNote: 6.0.1 | |
35 | RoxygenNote: 6.1.0 | |
27 | 36 | NeedsCompilation: no |
28 | Packaged: 2017-09-21 17:56:29 UTC; jason | |
37 | Packaged: 2018-10-19 15:56:11 UTC; susanna | |
29 | 38 | Author: Daniel Gadala-Maria [aut], |
39 | Susanna Marquez [aut], | |
40 | Moriah Cohen [aut], | |
41 | Gur Yaari [aut], | |
30 | 42 | Jason Vander Heiden [ctb, cre], |
31 | 43 | Steven Kleinstein [aut, cph] |
32 | 44 | Maintainer: Jason Vander Heiden <jason.vanderheiden@yale.edu> |
33 | 45 | Repository: CRAN |
34 | Date/Publication: 2017-09-21 18:36:16 UTC | |
46 | Date/Publication: 2018-10-19 18:30:08 UTC |
0 | e2485e1eb82326bf1a87a26d31bca302 *DESCRIPTION | |
1 | b93c4f7f082c0a113a3b14ce72ad6fb2 *NAMESPACE | |
2 | 7603efe9dc5bd19429a3dd8fe493db5e *NEWS.md | |
3 | cf81dc763706d5af4b6338587de99e07 *R/data.R | |
4 | 7591807e25a344f4a340b111ae2bd6c0 *R/functions.R | |
5 | f206efd53bac36d5cda16f38126de62f *R/tigger.R | |
6 | f4692fbe495ed0cbf93b5572bcf5d1c0 *README.md | |
7 | 83f16ce2648803eb619b5c6bd7c13b48 *build/vignette.rds | |
8 | 895ced14d12f95482b10d7ea7006c5e7 *data/genotype.rda | |
9 | e6bf4ae95811336b57a6a4c161d84b49 *data/germline_ighv.rda | |
10 | 9f888cd0ed3b8029c1544798c2a0626e *data/novel_df.rda | |
11 | 8281d239e0e8ec4898b72a3c0c96f815 *data/sample_db.rda | |
0 | 7883ea65ca4dba8729ebe5521726f920 *DESCRIPTION | |
1 | bf68ba350c86248c0c49cb21b9e9ac93 *NAMESPACE | |
2 | e5ed4b12e1d369b23b0e60a0ce717d08 *NEWS.md | |
3 | fd8970214a6f891f3f8630eeae6490da *R/bayesian.R | |
4 | 26641da5adf037bb742328f8aff43aa2 *R/data.R | |
5 | a9b2c160a920976ad3dc10f2b693df00 *R/evidence.R | |
6 | ee415813b86d23820983f8ba0ee66a11 *R/functions.R | |
7 | 0e7ef247b95b0f031cc4ea70f4835af3 *R/tigger.R | |
8 | dd36e57af6a27547f887c01d7f7ea110 *README.md | |
9 | 06d0733b1e053ed668e7f17c1f975725 *build/vignette.rds | |
10 | 3e3cb573afe36d5c5ca64289396c180f *data/GermlineIGHV.rda | |
11 | b65f789115c28c50e855e042dd8773b0 *data/SampleDb.rda | |
12 | 5b91397303d4a843360e11172f52fbd2 *data/SampleGenotype.rda | |
13 | 996de70aa93a09176f85d473e31d143e *data/SampleNovel.rda | |
14 | a22b2561c478e1579d686230a39e67f5 *data/datalist | |
12 | 15 | de6a4346597304f77270ce7f43877cde *inst/CITATION |
13 | d449c31440bb88f31affd3d3bd9c2f3d *inst/doc/Tigger-Vignette.R | |
14 | 9af1046ef91efdb469d1b6e96836abb8 *inst/doc/Tigger-Vignette.Rmd | |
15 | 3352ec0d31c7fcc8672426f2f20bf3bf *inst/doc/Tigger-Vignette.pdf | |
16 | a0ddc22d3a55eb331b3796dd6eb789c0 *inst/markr/build.R | |
17 | aac471649aead0fc15cfd550b45fd049 *man/cleanSeqs.Rd | |
18 | 10c535405f44d35b11fbbb24441f65c8 *man/findNovelAlleles.Rd | |
19 | dee7ae50bce0ce5c4b2eb9249064b0db *man/findUnmutatedCalls.Rd | |
20 | 4085c0bd803087104a44f0017a4a50af *man/genotype.Rd | |
21 | 1cc5dc16d32e2b0448171993ae787080 *man/genotypeFasta.Rd | |
22 | 52a7865ed7ecc697424ee208d338fb55 *man/germline_ighv.Rd | |
23 | 12bbad6df7fa988067c9eaa56d536ab6 *man/getMutCount.Rd | |
24 | 63dabc7cd596b16388f81216f0c17e66 *man/getMutatedPositions.Rd | |
25 | be9136e88f9dadd4d7b08ca36f98ea53 *man/getPopularMutationCount.Rd | |
26 | e3e9b6c1d4eb11abd8181e9782aa62a0 *man/inferGenotype.Rd | |
27 | 148b730525b1631e20baa47971bbf879 *man/insertPolymorphisms.Rd | |
28 | 12b6f73cd643edb19b6eafabb4e2f65c *man/novel_df.Rd | |
29 | e62237f96f7801f6f24c92ead258fce8 *man/plotGenotype.Rd | |
30 | 25ca722b523385793c9e7c7f87008b8f *man/plotNovel.Rd | |
31 | 6bc3636e5c5505ba2fd63260e4e732ae *man/readIgFasta.Rd | |
32 | 6503ecee104c9ff753290c7d975d80dc *man/reassignAlleles.Rd | |
33 | ebb8f5373091db562d8205e58a3f8984 *man/sample_db.Rd | |
34 | 2ead201dab9226a33bc56982df4b6c71 *man/selectNovel.Rd | |
35 | d484cb96d7479059da49c218a81617db *man/sortAlleles.Rd | |
36 | 701723880ed5d0d8dd02b45112f1f42c *man/tigger.Rd | |
37 | d76e9252cf5af39d91abdc7f4bded2ad *man/updateAlleleNames.Rd | |
38 | f0b33cf048dd355a7e4aa6e725b40795 *man/writeFasta.Rd | |
39 | 9af1046ef91efdb469d1b6e96836abb8 *vignettes/Tigger-Vignette.Rmd | |
16 | 41f63ff830855d88ac107ea1de1c73bc *inst/doc/Tigger-Vignette.R | |
17 | 2d3b4a189816c543e285b79a82af7e39 *inst/doc/Tigger-Vignette.Rmd | |
18 | 01ff9a6cb57c035d7b6410b599195500 *inst/doc/Tigger-Vignette.pdf | |
19 | 0d10fc4f29c4fd44dbcd056d562e6e7d *man/GermlineIGHV.Rd | |
20 | 4449a3b926f140109fed29bdfcb2ea21 *man/SampleDb.Rd | |
21 | ee6f174d98cdac6c8e920cffe93b8762 *man/SampleGenotype.Rd | |
22 | 887b6cdecd67bce3a4b18c3b68b8ed6c *man/SampleNovel.Rd | |
23 | 65be86d49f6538199afe8a396d78e46d *man/cleanSeqs.Rd | |
24 | a0ea6946c659258fa37d9c9e6e313113 *man/findNovelAlleles.Rd | |
25 | 862a43c6d9ff0a343ad365c5627ba52c *man/findUnmutatedCalls.Rd | |
26 | f1ca61511a381cde50708fb801db0541 *man/generateEvidence.Rd | |
27 | f771057e2965efe11994cb4501633b43 *man/genotypeFasta.Rd | |
28 | 6f01fbc67804c7f34b46098a24711c91 *man/getMutCount.Rd | |
29 | b985f5ee9c7e41a4343177f1fe73116f *man/getMutatedPositions.Rd | |
30 | 40eb4612eb9d4f4f7871124875cbe430 *man/getPopularMutationCount.Rd | |
31 | 7140a2237a7c22f48bc6480669492861 *man/inferGenotype.Rd | |
32 | 46fc21b7809cea2fcbb9ea3f08bed22e *man/inferGenotypeBayesian.Rd | |
33 | 29fc1168511d72e6c691accdba47c7fb *man/insertPolymorphisms.Rd | |
34 | 4888ee94f43ae2bc8b44493c102158b8 *man/plotGenotype.Rd | |
35 | be5190e0acbc1873d5cdcba4aa13c458 *man/plotNovel.Rd | |
36 | fe039528bfc29103382b6cfbb92c7447 *man/readIgFasta.Rd | |
37 | 12d8f40948568c68985316f7df0bf5fe *man/reassignAlleles.Rd | |
38 | b955f370f2897613ab16833edd89397b *man/selectNovel.Rd | |
39 | 72ecb30aceba37af5b1bf37fc6951944 *man/sortAlleles.Rd | |
40 | 96b614e74d00fad9219dcde24e456149 *man/tigger.Rd | |
41 | dc2623df8768f04d7f6f7d65bb51648e *man/updateAlleleNames.Rd | |
42 | 0ed3d8709b7dc07e58a4ac931bf905b0 *man/writeFasta.Rd | |
43 | 2d3b4a189816c543e285b79a82af7e39 *vignettes/Tigger-Vignette.Rmd |
2 | 2 | export(cleanSeqs) |
3 | 3 | export(findNovelAlleles) |
4 | 4 | export(findUnmutatedCalls) |
5 | export(generateEvidence) | |
5 | 6 | export(genotypeFasta) |
6 | 7 | export(getMutCount) |
7 | 8 | export(getMutatedPositions) |
8 | 9 | export(getPopularMutationCount) |
9 | 10 | export(inferGenotype) |
11 | export(inferGenotypeBayesian) | |
10 | 12 | export(insertPolymorphisms) |
11 | 13 | export(plotGenotype) |
12 | 14 | export(plotNovel) |
21 | 23 | importFrom(alakazam,getAllele) |
22 | 24 | importFrom(alakazam,getFamily) |
23 | 25 | importFrom(alakazam,getGene) |
26 | importFrom(alakazam,translateDNA) | |
24 | 27 | importFrom(doParallel,registerDoParallel) |
25 | 28 | importFrom(dplyr,"%>%") |
26 | 29 | importFrom(dplyr,arrange) |
40 | 43 | importFrom(dplyr,glimpse) |
41 | 44 | importFrom(dplyr,group_by) |
42 | 45 | importFrom(dplyr,group_by_) |
46 | importFrom(dplyr,inner_join) | |
43 | 47 | importFrom(dplyr,mutate) |
44 | 48 | importFrom(dplyr,mutate_) |
45 | 49 | importFrom(dplyr,n) |
58 | 62 | importFrom(foreach,foreach) |
59 | 63 | importFrom(foreach,registerDoSEQ) |
60 | 64 | importFrom(graphics,plot) |
61 | importFrom(grid,grid.layout) | |
62 | importFrom(grid,grid.newpage) | |
63 | importFrom(grid,pushViewport) | |
64 | importFrom(grid,viewport) | |
65 | importFrom(gridExtra,arrangeGrob) | |
66 | importFrom(gtools,ddirichlet) | |
65 | 67 | importFrom(iterators,icount) |
66 | 68 | importFrom(lazyeval,interp) |
67 | 69 | importFrom(parallel,clusterEvalQ) |
68 | 70 | importFrom(parallel,clusterExport) |
69 | 71 | importFrom(parallel,makeCluster) |
70 | 72 | importFrom(parallel,stopCluster) |
73 | importFrom(rlang,.data) | |
74 | importFrom(shazam,calcObservedMutations) | |
71 | 75 | importFrom(stats,confint) |
72 | 76 | importFrom(stats,cor) |
73 | 77 | importFrom(stats,cov) |
78 | 82 | importFrom(stats,na.omit) |
79 | 83 | importFrom(stats,sd) |
80 | 84 | importFrom(stats,setNames) |
85 | importFrom(stringi,stri_length) | |
81 | 86 | importFrom(tidyr,gather) |
82 | 87 | importFrom(tidyr,gather_) |
83 | 88 | importFrom(tidyr,spread) |
84 | 89 | importFrom(tidyr,spread_) |
90 | importFrom(tidyr,unnest) |
0 | Version 0.3.1 October 19, 2018 | |
1 | ------------------------------------------------------------------------------- | |
2 | ||
3 | + Fixed a fatal error in `reassignAlleles` with non-existent `v_call` column. | |
4 | + Fixed bug in `generateEvidence` that was reporting amino acid mutations as | |
5 | NA instead of gaps. | |
6 | ||
7 | ||
8 | Version 0.3.0 October 3, 2018 | |
9 | ------------------------------------------------------------------------------- | |
10 | ||
11 | Bug Fixes: | |
12 | ||
13 | + Fixed a bug in `reassignAlleles` occurring with single match genotypes. | |
14 | + Fixed `selectNovel` improperly removing all identical novel alleles, rather | |
15 | than keeping a single entry. | |
16 | + `genotypeFasta` will now retain IMGT-numbering spacers as `.` characters | |
17 | instead of converting them to `-` characters. | |
18 | + Fixed a bug in `findNovelAlleles` causing overly aggressive minimum sequence | |
19 | threshold filtering. | |
20 | + Fixed a bug in the grouping behavior of `getPopularMutationCount`. | |
21 | ||
22 | New Features: | |
23 | ||
24 | + Added a Bayesian approach to genotype inference as the | |
25 | `inferGenotypeBayesian` function. | |
26 | + Added the function `generateEvidence` to build a complete evidence table | |
27 | from the results of `findNovelAlleles`, `inferGenotype`, | |
28 | `inferGenotypeBayesian`, and `reassignAlleles`. | |
29 | + Added multiple new evidence columns to the output of `findNovelAlleles` | |
30 | and adjusted the definitions/names of some existing columns. | |
31 | + Added behavior to the `keep_gene` argument of `reassignAlleles` to provide | |
32 | options for maintaining reassignments at the gene (previous `TRUE` behavior), | |
33 | family, or repertoire level. | |
34 | + Improved tie resolution in `findNovelAlleles`. | |
35 | ||
36 | Backwards Incompatible Refactors: | |
37 | ||
38 | + Renamed sample data from `germline_ighv`, `sample_db`, `genotype` and | |
39 | `novel_df` to `GermlineIGHV`, `SampleDb`, `SampleGenotype` and `SampleNovel`, | |
40 | respectively. | |
41 | + Renamed the `novel_df` argument to `novel` in `selectNovel`, `inferGenotype`, | |
42 | and `genotypeFasta`. | |
43 | + Renamed the `novel_df_row` argument to `novel_row` in `plotNovel`. | |
44 | + Argument order in `inferGenotype` was altered for clarity. | |
45 | + Changed the return behavior of `reassignAlleles` so that it returns the | |
46 | input data.frame with the `V_CALL_GENOTYPED` column appended or overwritten. | |
47 | + `cleanSeqs` will no longer replace `.` characters with `-`. | |
48 | ||
49 | ||
0 | 50 | Version 0.2.11 September 21, 2017 |
1 | 51 | ------------------------------------------------------------------------------- |
2 | 52 |
0 | #' Infer a subject-specific genotype using a Bayesian approach | |
1 | #' | |
2 | #' \code{inferGenotypeBayesian} infers a subject's genotype by applying a Bayesian framework | |
3 | #' with a Dirichlet prior for the multinomial distribution. Up to four distinct alleles are | |
4 | #' allowed in an individual’s genotype. Four likelihood distributions were generated by | |
5 | #' empirically fitting three high coverage genotypes from three individuals | |
6 | #' (Laserson and Vigneault et al, 2014). A posterior probability is calculated for the | |
7 | #' four most common alleles. The certainty of the highest probability model was | |
8 | #' calculated using a Bayes factor (the most likely model divided by second-most likely model). | |
9 | #' The larger the Bayes factor (K), the greater the certainty in the model. | |
10 | #' | |
11 | #' @details | |
12 | #' Allele calls representing cases where multiple alleles have been | |
13 | #' assigned to a single sample sequence are rare among unmutated | |
14 | #' sequences but may result if nucleotides for certain positions are | |
15 | #' not available. Calls containing multiple alleles are treated as | |
16 | #' belonging to all groups. If \code{novel} is provided, all | |
17 | #' sequences that are assigned to the same starting allele as any | |
18 | #' novel germline allele will have the novel germline allele appended | |
19 | #' to their assignment prior to searching for unmutated sequences. | |
20 | #' | |
21 | #' @param data a \code{data.frame} containing V allele | |
22 | #' calls from a single subject. If \code{find_unmutated} | |
23 | #' is \code{TRUE}, then the sample IMGT-gapped V(D)J sequence | |
24 | #' should be provided in a column \code{"SEQUENCE_IMGT"} | |
25 | #' @param v_call column in \code{data} with V allele calls. | |
26 | #' Default is \code{"V_CALL"}. | |
27 | #' @param find_unmutated if \code{TRUE}, use \code{germline_db} to | |
28 | #' find which samples are unmutated. Not needed | |
29 | #' if \code{allele_calls} only represent | |
30 | #' unmutated samples. | |
31 | #' @param germline_db named vector of sequences containing the | |
32 | #' germline sequences named in \code{allele_calls}. | |
33 | #' Only required if \code{find_unmutated} is \code{TRUE}. | |
34 | #' @param novel an optional \code{data.frame} of the type | |
35 | #' novel returned by \link{findNovelAlleles} containing | |
36 | #' germline sequences that will be utilized if | |
37 | #' \code{find_unmutated} is \code{TRUE}. See Details. | |
38 | #' @param priors a numeric vector of priors for the multinomial distribution. | |
39 | #' The \code{priors} vector must be nine values that define | |
40 | #' the priors for the heterozygous (two allele), | |
41 | #' trizygous (three allele), and quadrozygous (four allele) | |
42 | #' distributions. The first two values of \code{priors} define | |
43 | #' the prior for the heterozygous case, the next three values are for | |
44 | #' the trizygous case, and the final four values are for the | |
45 | #' quadrozygous case. Each set of priors should sum to one. | |
46 | #' Note, each distribution prior is actually defined internally | |
47 | #' by a set of four numbers, with the unspecified final values | |
48 | #' assigned to \code{0}; e.g., the heterozygous case is | |
49 | #' \code{c(priors[1], priors[2], 0, 0)}. The prior for the | |
50 | #' homozygous distribution is fixed at \code{c(1, 0, 0, 0)}. | |
51 | #' | |
52 | #' @return | |
53 | #' A \code{data.frame} of alleles denoting the genotype of the subject with the log10 | |
54 | #' of the likelihood of each model and the log10 of the Bayes factor. The output | |
55 | #' contains the following columns: | |
56 | #' | |
57 | #' \itemize{ | |
58 | #' \item \code{GENE}: The gene name without allele. | |
59 | #' \item \code{ALLELES}: Comma separated list of alleles for the given \code{GENE}. | |
60 | #' \item \code{COUNTS}: Comma separated list of observed sequences for each | |
61 | #' corresponding allele in the \code{ALLELES} list. | |
62 | #' \item \code{TOTAL}: The total count of observed sequences for the given \code{GENE}. | |
63 | #' \item \code{NOTE}: Any comments on the inference. | |
64 | #' \item \code{KH}: log10 likelihood that the \code{GENE} is homozygous. | |
65 | #' \item \code{KD}: log10 likelihood that the \code{GENE} is heterozygous. | |
66 | #' \item \code{KT}: log10 likelihood that the \code{GENE} is trizygous. | |
67 | #' \item \code{KQ}: log10 likelihood that the \code{GENE} is quadrozygous. | |
68 | #' \item \code{K_DIFF}: log10 ratio of the highest to second-highest zygosity likelihoods. | |
69 | #' } | |
70 | #' | |
71 | #' @note | |
72 | #' This method works best with data derived from blood, where a large | |
73 | #' portion of sequences are expected to be unmutated. Ideally, there | |
74 | #' should be hundreds of allele calls per gene in the input. | |
75 | #' | |
76 | #' @seealso \link{plotGenotype} for a colorful visualization and | |
77 | #' \link{genotypeFasta} to convert the genotype to nucleotide sequences. | |
78 | #' See \link{inferGenotype} to infer a subject-specific genotype using | |
79 | #' a frequency method | |
80 | #' | |
81 | #' @references | |
82 | #' \enumerate{ | |
83 | #' \item Laserson U and Vigneault F, et al. High-resolution antibody dynamics of | |
84 | #' vaccine-induced immune responses. PNAS. 2014 111(13):4928-33. | |
85 | #' } | |
86 | #' | |
87 | #' @examples | |
88 | #' # Infer IGHV genotype, using only unmutated sequences, including novel alleles | |
89 | #' inferGenotypeBayesian(SampleDb, germline_db=GermlineIGHV, novel=SampleNovel, | |
90 | #' find_unmutated=TRUE) | |
91 | #' | |
92 | #' @export | |
93 | inferGenotypeBayesian <- function(data, germline_db=NA, novel=NA, | |
94 | v_call="V_CALL", find_unmutated=TRUE, | |
95 | priors=c(0.6, 0.4, 0.4, 0.35, 0.25, 0.25, 0.25, 0.25, 0.25)){ | |
96 | # Visibility hack: bind `.` locally; it is used as the pipe placeholder in the final paste() call (presumably to silence R CMD check notes -- confirm) | |
97 | . <- NULL | |
98 | ||
99 | allele_calls = getAllele(data[,v_call], first=FALSE, strip_d=FALSE) | |
100 | # Restrict allele_calls to unmutated sequences when find_unmutated is TRUE; germline_db is mandatory in that case | |
101 | if(find_unmutated){ | |
102 | if(is.na(germline_db[1])){ | |
103 | stop("germline_db needed if find_unmutated is TRUE") | |
104 | } | |
105 | if(!is.null(nrow(novel))){ | |
106 | novel = filter_(novel, ~!is.na(POLYMORPHISM_CALL)) %>% | |
107 | select_(~GERMLINE_CALL, ~POLYMORPHISM_CALL, ~NOVEL_IMGT) | |
108 | if(nrow(novel) > 0){ | |
109 | # Append the novel germlines (NOVEL_IMGT, named by POLYMORPHISM_CALL) to germline_db | |
110 | novel_gl = novel$NOVEL_IMGT | |
111 | names(novel_gl) = novel$POLYMORPHISM_CALL | |
112 | germline_db = c(germline_db, novel_gl) | |
113 | # For every call containing the novel allele's GERMLINE_CALL (fixed-string match), append its POLYMORPHISM_CALL after a comma | |
114 | for(r in 1:nrow(novel)){ | |
115 | ind = grep(novel$GERMLINE_CALL[r], allele_calls, fixed=TRUE) | |
116 | allele_calls[ind] = allele_calls[ind] %>% | |
117 | sapply(paste, novel$POLYMORPHISM_CALL[r], sep=",") | |
118 | } | |
119 | } | |
120 | } | |
121 | # Replace allele_calls with the result of findUnmutatedCalls on SEQUENCE_IMGT and germline_db; stop if nothing remains | |
122 | allele_calls = findUnmutatedCalls(allele_calls, | |
123 | as.character(data$SEQUENCE_IMGT), | |
124 | germline_db) | |
125 | if(length(allele_calls) == 0){ | |
126 | stop("No unmutated sequences found! Set 'find_unmutated' to 'FALSE'.") | |
127 | } | |
128 | } | |
129 | ||
130 | # For each distinct gene build the pattern <gene>\* and record which elements of allele_calls match it | |
131 | gene_regex = allele_calls %>% strsplit(",") %>% unlist() %>% | |
132 | getGene(strip_d=FALSE) %>% unique() %>% paste("\\*", sep="") | |
133 | gene_groups = sapply(gene_regex, grep, allele_calls, simplify=FALSE) | |
134 | names(gene_groups) = gsub("\\*", "", gene_regex, fixed=TRUE) | |
135 | gene_groups = gene_groups[sortAlleles(names(gene_groups))] | |
136 | ||
137 | # Character matrix with one row per gene; ALLELES, COUNTS, NOTE and the K* columns start as "" and are filled in the loop below | |
138 | GENE = names(gene_groups) | |
139 | # ALLELES = COUNTS = NOTE = rep("", length(GENE)) | |
140 | # TOTAL = sapply(gene_groups, length) | |
141 | # genotype = cbind(GENE, ALLELES, COUNTS, TOTAL, NOTE) | |
142 | ALLELES = COUNTS = KH = KD = KT = KQ = K_DIFF = NOTE = rep("", length(GENE)) | |
143 | TOTAL = sapply(gene_groups, length) | |
144 | genotype = cbind(GENE, ALLELES, COUNTS, TOTAL, NOTE, KH, KD, KT, KQ, K_DIFF) | |
145 | ||
146 | # For each gene: tabulate its calls, share out multi-allele counts, then score the four zygosity models | |
147 | for (g in GENE){ | |
148 | # Keep only the part of the allele calls that uses the gene being analyzed | |
149 | ac = allele_calls[gene_groups[[g]]] %>% | |
150 | strsplit(",") %>% | |
151 | lapply(function(x) x[grep(paste(g, "\\*", sep=""), x)]) %>% | |
152 | sapply(paste, collapse=",") | |
153 | t_ac = table(ac) # table of allele calls | |
154 | potentials = unique(unlist(strsplit(names(t_ac),","))) # potential alleles | |
155 | ||
156 | regexpotentials = paste(gsub("\\*","\\\\*", potentials),"$",sep="") | |
157 | regexpotentials = | |
158 | paste(regexpotentials,gsub("\\$",",",regexpotentials),sep="|") | |
159 | tmat = | |
160 | sapply(regexpotentials, function(x) grepl(x, names(t_ac),fixed=FALSE)) | |
161 | ||
162 | if (length(potentials) == 1 | length(t_ac) == 1){ | | # NOTE(review): both operands are length-one; `||` would be the idiomatic scalar operator here
163 | seqs_expl = t(as.data.frame(apply(t(as.matrix(tmat)), 2, function(x) x * | |
164 | t_ac))) | |
165 | rownames(seqs_expl)<-names(t_ac)[1] | |
166 | }else{ | |
167 | seqs_expl = as.data.frame(apply(tmat, 2, function(x) x * | |
168 | t_ac)) | |
169 | } | |
170 | # seqs_expl = as.data.frame(apply(tmat, 2, function(x) x*t_ac)) | |
171 | colnames(seqs_expl) = potentials | |
172 | # Add low (fake) counts: an allele with no row named exactly after it gets a new row holding 0.01 in its own column (sapply is used only for its `<<-` side effects) | |
173 | sapply(colnames(seqs_expl), function(x){if(sum(rownames(seqs_expl) %in% paste(x)) == 0){ | |
174 | seqs_expl <<- rbind(seqs_expl,rep(0,ncol(seqs_expl))); | |
175 | rownames(seqs_expl)[nrow(seqs_expl)] <<- paste(x) | |
176 | seqs_expl[rownames(seqs_expl) %in% paste(x),paste(x)] <<- 0.01 | |
177 | ||
178 | }}) | |
179 | ||
180 | # Split rows into single-allele calls (no comma in the row name) and multi-allele calls (comma), then add each multi-allele count to the single-allele counts in proportion to them | |
181 | seqs_expl_single <- seqs_expl[grep(',',rownames(seqs_expl),invert = T),] | |
182 | ||
183 | seqs_expl_multi <- seqs_expl[grep(',',rownames(seqs_expl),invert = F),] | |
184 | if(is.null(nrow(seqs_expl_multi))){ | |
185 | seqs_expl_multi <- t(as.data.frame(seqs_expl_multi)) | |
186 | rownames(seqs_expl_multi) <- grep(',',rownames(seqs_expl),invert = F,value = T) | |
187 | } | |
188 | ||
189 | if(!is.null(nrow(seqs_expl_single)) && nrow(seqs_expl_single) !=0 && nrow(seqs_expl_single) != nrow(seqs_expl)){ | |
190 | if(nrow(seqs_expl_multi)>1){ | |
191 | seqs_expl_multi <- seqs_expl_multi[order(nchar(row.names(seqs_expl_multi))),] | |
192 | } | |
193 | sapply(1:nrow(seqs_expl_multi),function(x){ | |
194 | genes <- unlist(strsplit(row.names(seqs_expl_multi)[x],',')); | |
195 | counts <- seqs_expl_single[rownames(seqs_expl_single) %in% genes,genes] | |
196 | counts <- colSums(counts) | |
197 | counts_to_distribute <- seqs_expl_multi[x,genes] | |
198 | ||
199 | new_counts <- counts+((counts_to_distribute*counts)/sum(counts)) | |
200 | for(i in 1:length(new_counts)){ | |
201 | gene_tmp <- names(new_counts)[i] | |
202 | seqs_expl_single[rownames(seqs_expl_single) %in% gene_tmp,gene_tmp] <<- new_counts[i] | |
203 | } | |
204 | }) | |
205 | } | |
206 | ||
207 | # Use the single-allele table when it exists and is non-empty (otherwise the full table), | |
208 | # round it to whole counts and drop rows that sum to zero | |
209 | #included = counts = character(0) | |
210 | #tot_expl = 0 | |
211 | ||
212 | seqs_expl <- if(is.null(nrow(seqs_expl_single)) || nrow(seqs_expl_single) ==0 ){seqs_expl}else{seqs_expl_single} | |
213 | seqs_expl <- round(seqs_expl) | |
214 | if(sum(rowSums(seqs_expl) == 0 ) != 0){ | |
215 | seqs_expl <- seqs_expl[rowSums(seqs_expl)!= 0, ] | |
216 | } | |
217 | ||
218 | allele_tot = sort(apply(seqs_expl, 2, sum),decreasing=TRUE) | |
219 | len=min(length(allele_tot),4); | |
220 | #print(priors) | |
221 | probs <-get_probabilites_with_priors(sort(c(allele_tot,rep(0,4-len)),decreasing = T)[1:4],priors = priors) | |
222 | probs[probs==-Inf] <- -1000 | |
223 | names(probs) <- c('H','D','T','Q') | |
224 | ||
225 | k <- sort(as.numeric(probs),decreasing = T); | |
226 | ||
227 | probs<-c(probs,k[1]-k[2]) | |
228 | names(probs)[5] <- "K_DIFF" | |
229 | ||
230 | genotype[genotype[, "GENE"] == g, "ALLELES"] = paste(gsub("[^d\\*]*[d\\*]", | |
231 | "", names(allele_tot)[1:len]), collapse = ",") | |
232 | genotype[genotype[, "GENE"] == g, "COUNTS"] = paste(as.numeric(allele_tot)[1:len], | |
233 | collapse = ",") | |
234 | genotype[genotype[, "GENE"] == g, "KH"] =probs[1]; | |
235 | genotype[genotype[, "GENE"] == g, "KD"] =probs[2]; | |
236 | genotype[genotype[, "GENE"] == g, "KT"] =probs[3]; | |
237 | genotype[genotype[, "GENE"] == g, "KQ"] =probs[4]; | |
238 | genotype[genotype[, "GENE"] == g, "K_DIFF"] =probs[5]; | |
239 | # } | |
240 | ||
241 | } | |
242 | ||
243 | ||
244 | geno = as.data.frame(genotype, stringsAsFactors = FALSE) | |
245 | ||
246 | # Check for indistinguishable calls: pairs of genotype sequences (from genotypeFasta) with zero mutated positions between them get a "Cannot distinguish A and B" NOTE | |
247 | if(find_unmutated == TRUE){ | |
248 | seqs = genotypeFasta(geno, germline_db) | |
249 | dist_mat = seqs %>% | |
250 | sapply(function(x) sapply((getMutatedPositions(seqs, x)), length)) | |
251 | rownames(dist_mat) = colnames(dist_mat) | |
252 | for (i in 1:nrow(dist_mat)){ dist_mat[i,i] = NA } | |
253 | same = which(dist_mat == 0, arr.ind=TRUE) | |
254 | if (nrow(same) > 0 ) { | |
255 | for (r in 1:nrow(same)) { | |
256 | inds = as.vector(same[r,]) | |
257 | geno[getGene(rownames(dist_mat)[inds][1]),]$NOTE = | |
258 | paste(rownames(dist_mat)[inds], collapse=" and ") %>% | |
259 | paste("Cannot distinguish", .) | |
260 | } | |
261 | } | |
262 | } | |
263 | rownames(geno) = NULL | |
264 | return(geno) | |
265 | } | |
266 | ||
267 | ||
# Calculate model likelihoods
#
# Scores four zygosity hypotheses (one, two, three or four alleles) against a
# vector of allele counts. Each hypothesis is a probability vector built from
# `priors`, smoothed by `epsilon` and renormalised, and evaluated with
# `ddirichlet` under the parameters `alpha_dirichlet + X`.
#
# @param X                a vector of counts; it is sorted in decreasing order
#                         before use. The visible caller passes four values.
# @param alpha_dirichlet  alpha parameter for the Dirichlet distribution.
# @param epsilon          constant added to every hypothesis entry before
#                         renormalising, so that no entry is exactly zero.
# @param priors           a vector of nine priors: values 1-2 define the
#                         two-allele hypothesis, 3-5 the three-allele
#                         hypothesis and 6-9 the four-allele hypothesis.
#                         The one-allele hypothesis is fixed at c(1, 0, 0, 0).
#
# @return log10 of the four densities, in the order one-, two-, three-,
#         four-allele hypothesis.
get_probabilites_with_priors <- function(X, alpha_dirichlet=c(0.5,0.5,0.5,0.5)*2,
                                         epsilon=0.01,
                                         priors=c(0.5,0.5,0.33,0.33,0.33,0.25,0.25,0.25,0.25)){
    # A short or NA-containing `priors` would otherwise index past the end of
    # the vector and silently yield NA likelihoods.
    if (!is.numeric(priors) || length(priors) != 9 || anyNA(priors)) {
        stop("'priors' must be a numeric vector of nine non-missing values")
    }

    X <- sort(X, decreasing=TRUE)

    ## Hypotheses, smoothed by epsilon and renormalised to sum to one
    hypotheses <- list(c(1, 0, 0, 0),
                       c(priors[1], priors[2], 0, 0),
                       c(priors[3], priors[4], priors[5], 0),
                       c(priors[6], priors[7], priors[8], priors[9]))
    hypotheses <- lapply(hypotheses, function(h) (h + epsilon) / sum(h + epsilon))

    # Density of every hypothesis for a given count vector
    densities <- function(counts) {
        vapply(hypotheses,
               function(h) ddirichlet(h, alpha_dirichlet + counts),
               numeric(1))
    }

    E <- densities(X)
    # While the second-largest density is exactly zero, shrink the counts
    # tenfold and re-evaluate, so that the two best models can be compared.
    while (sort(E, decreasing=TRUE)[2] == 0) {
        X <- X / 10
        E <- densities(X)
    }

    return(log10(E))
}
0 | #' Human IGHV germlines | |
1 | #' | |
2 | #' A \code{character} vector of all 344 human IGHV germline gene segment alleles | |
3 | #' in IMGT Gene-db release 201408-4. | |
4 | #' | |
5 | #' @name germline_ighv | |
6 | #' @docType data | |
7 | #' @format Values correspond to IMGT-gaped nuceltoide sequences (with | |
8 | #' nucleotides capitalized and gaps represented by ".") while names correspond | |
9 | #' to stripped-down IMGT allele names (e.g. "IGHV1-18*01"). | |
10 | #' | |
11 | #' @references Xochelli \emph{et al}. (2014) Immunoglobulin heavy variable | |
12 | #' (IGHV) genes and alleles: new entities, new names and implications for | |
13 | #' research and prognostication in chronic lymphocytic leukaemia. | |
14 | #' \emph{Immunogenetics}. 67(1):61-6. | |
15 | #' @keywords data | |
16 | NULL | |
17 | ||
18 | ||
19 | #' Example human Rep-Seq data | |
20 | #' | |
21 | #' Example VDJ-rearranged immunoglobulin Rep-Seq sequences derived from a single | |
22 | #' individual (PGP1), sequenced on the Roche 454 platform, and thought by | |
23 | #' IMGT/V-QUEST to utilize IGHV1 family alleles. | |
24 | #' | |
25 | #' @name sample_db | |
26 | #' @docType data | |
27 | #' @format A \code{data.frame} where rows correspond to unique VDJ sequences and | |
28 | #' columns include: | |
29 | #' \itemize{ | |
30 | #' \item IMGT-gapped nucleotide sequence (\code{"SEQUENCE_IMGT"}) | |
31 | #' \item IMGT/V-QUEST allele calls (\code{"V_CALL"}, \code{"D_CALL"}, and | |
32 | #' \code{"J_CALL"}) | |
33 | #' \item Junction length (\code{"JUNCTION_LENGTH"}) | |
34 | #' } | |
35 | #' | |
36 | #' @references Gadala-Maria \emph{et al}. (2015) Automated analysis of | |
37 | #' high-throughput B cell sequencing data reveals a high frequency of novel | |
38 | #' immunoglobulin V gene segment alleles. \emph{PNAS}. 112(8):E862-70. | |
39 | #' @keywords data | |
40 | NULL | |
41 | ||
42 | #' Example of Analyzed Rep-Seq data | |
43 | #' | |
44 | #' Example VDJ-rearranged immunoglobulin Rep-Seq sequences derived from a single | |
45 | #' individual (PGP1), sequenced on the Roche 454 platform, and thought by | |
46 | #' IMGT/V-QUEST to utilize IGHV1 family alleles, as processed by | |
47 | #' \link{findNovelAlleles}. | |
48 | #' | |
49 | #' @name novel_df | |
50 | #' @docType data | |
51 | #' @format A \code{data.frame} where rows correspond to alleles checked for | |
52 | #' polymorphisms and columns give results as well as paramaters used to run | |
53 | #' the test. | |
54 | #' | |
55 | #' @references Gadala-Maria \emph{et al}. (2015) Automated analysis of | |
56 | #' high-throughput B cell sequencing data reveals a high frequency of novel | |
57 | #' immunoglobulin V gene segment alleles. \emph{PNAS}. 112(8):E862-70. | |
58 | #' @keywords data | |
59 | NULL | |
60 | ||
61 | #' Example of an Inferred Genotype | |
62 | #' | |
63 | #' Example VDJ-rearranged immunoglobulin Rep-Seq sequences derived from a single | |
64 | #' individual (PGP1), sequenced on the Roche 454 platform, and thought by | |
65 | #' IMGT/V-QUEST to utilize IGHV1 family alleles, as processed by | |
66 | #' \link{findNovelAlleles} and \link{inferGenotype} | |
67 | #' | |
68 | #' @name genotype | |
69 | #' @docType data | |
70 | #' @format A \code{data.frame} where rows correspond to genes carried by an | |
71 | #' individual and columns lists the alleles of those genes and their counts. | |
72 | #' | |
73 | #' @references Gadala-Maria \emph{et al}. (2015) Automated analysis of | |
74 | #' high-throughput B cell sequencing data reveals a high frequency of novel | |
75 | #' immunoglobulin V gene segment alleles. \emph{PNAS}. 112(8):E862-70. | |
76 | #' @keywords data | |
0 | #' Human IGHV germlines | |
1 | #' | |
2 | #' A \code{character} vector of all 344 human IGHV germline gene segment alleles | |
3 | #' in IMGT/GENE-DB release 201408-4. | |
4 | #' | |
5 | #' @name GermlineIGHV | |
6 | #' @docType data | |
7 | #' @format Values correspond to IMGT-gapped nucleotide sequences (with | |
8 | #' nucleotides capitalized and gaps represented by ".") while names correspond | |
9 | #' to stripped-down IMGT allele names (e.g. "IGHV1-18*01"). | |
10 | #' | |
11 | #' @references | |
12 | #' \enumerate{ | |
13 | #' \item Xochelli, et al. (2014) Immunoglobulin heavy variable (IGHV) genes and | |
14 | #' alleles: new entities, new names and implications for research and | |
15 | #' prognostication in chronic lymphocytic leukaemia. Immunogenetics. 67(1):61-6. | |
16 | #' } | |
17 | #' | |
18 | #' @keywords data | |
19 | NULL | |
20 | ||
21 | ||
22 | #' Example human immune repertoire data | |
23 | #' | |
24 | #' A \code{data.frame} of example V(D)J immunoglobulin sequences derived from a | |
25 | #' single individual (PGP1), sequenced on the Roche 454 platform, and assigned by | |
26 | #' IMGT/HighV-QUEST to IGHV1 family alleles. | |
27 | #' | |
28 | #' @name SampleDb | |
29 | #' @docType data | |
30 | #' @format A \code{data.frame} where rows correspond to unique V(D)J sequences and | |
31 | #' columns include: | |
32 | #' \itemize{ | |
33 | #' \item \code{"SEQUENCE_IMGT"}: IMGT-gapped V(D)J nucleotide sequence. | |
34 | #' \item \code{"V_CALL"}: IMGT/HighV-QUEST V segment allele calls. | |
35 | #' \item \code{"D_CALL"}: IMGT/HighV-QUEST D segment allele calls. | |
36 | #' \item \code{"J_CALL"}: IMGT/HighV-QUEST J segment allele calls. | |
37 | #' \item \code{"JUNCTION_LENGTH"}: Junction region length. | |
38 | #' } | |
39 | #' | |
40 | #' @references | |
41 | #' \enumerate{ | |
42 | #' \item Gadala-Maria, et al. (2015) Automated analysis of high-throughput B cell | |
43 | #' sequencing data reveals a high frequency of novel immunoglobulin V gene | |
44 | #' segment alleles. PNAS. 112(8):E862-70. | |
45 | #' } | |
46 | #' | |
47 | #' @keywords data | |
48 | NULL | |
49 | ||
50 | #' Example novel allele detection results | |
51 | #' | |
52 | #' A \code{data.frame} of novel allele detection results from \link{findNovelAlleles}. | |
53 | #' Source data was a collection of V(D)J immunoglobulin sequences derived from a single | |
54 | #' individual (PGP1), sequenced on the Roche 454 platform, and assigned by | |
55 | #' IMGT/HighV-QUEST to IGHV1 family alleles. | |
56 | #' | |
57 | #' @name SampleNovel | |
58 | #' @docType data | |
59 | #' @format A \code{data.frame} where rows correspond to alleles checked for | |
60 | #' polymorphisms and columns give results as well as parameters used to run | |
61 | #' the test. | |
62 | #' | |
63 | #' @seealso See \link{findNovelAlleles} for detailed column descriptions. | |
64 | #' | |
65 | #' @references | |
66 | #' \enumerate{ | |
67 | #' \item Gadala-Maria, et al. (2015) Automated analysis of high-throughput B cell | |
68 | #' sequencing data reveals a high frequency of novel immunoglobulin V gene | |
69 | #' segment alleles. PNAS. 112(8):E862-70. | |
70 | #' } | |
71 | #' | |
72 | #' @keywords data | |
73 | NULL | |
74 | ||
75 | #' Example genotype inference results | |
76 | #' | |
77 | #' A \code{data.frame} of genotype inference results from \link{inferGenotype} | |
78 | #' after novel allele detection via \link{findNovelAlleles}. | |
79 | #' Source data was a collection of V(D)J immunoglobulin sequences derived from a single | |
80 | #' individual (PGP1), sequenced on the Roche 454 platform, and assigned by | |
81 | #' IMGT/HighV-QUEST to IGHV1 family alleles. | |
82 | #' | |
83 | #' @name SampleGenotype | |
84 | #' @docType data | |
85 | #' @format A \code{data.frame} where rows correspond to genes carried by an | |
86 | #' individual and columns list the alleles of those genes and their counts. | |
87 | #' | |
88 | #' @seealso See \link{inferGenotype} for detailed column descriptions. | |
89 | #' | |
90 | #' @references | |
91 | #' \enumerate{ | |
92 | #' \item Gadala-Maria, et al. (2015) Automated analysis of high-throughput B cell | |
93 | #' sequencing data reveals a high frequency of novel immunoglobulin V gene | |
94 | #' segment alleles. PNAS. 112(8):E862-70. | |
95 | #' } | |
96 | #' | |
97 | #' @keywords data | |
77 | 98 | NULL⏎ |
# Find non triplet gaps in a nucleotide sequence
#
# Splits a single sequence into consecutive codons (the last codon may be
# partial) and reports whether any codon contains a number of gap characters
# ("." or "-") that is not a multiple of three, i.e. a gap that does not span
# a whole codon.
#
# @param   seq  a single nucleotide sequence (character string).
# @return  TRUE if at least one codon holds a non-triplet gap, FALSE otherwise.
#          An empty string has no codons and therefore yields FALSE.
hasNonImgtGaps <- function (seq) {
    # An empty string would make the codon boundaries below run from 1 to -2
    # with a positive step, which is an error in seq(); it contains no gaps.
    if (length(seq) == 1 && !is.na(seq) && !nzchar(seq)) {
        return(FALSE)
    }
    # Round the length up to a whole number of codons
    len <- ceiling(nchar(seq)/3)*3
    codons <- substring(seq, seq(1, len-2, 3), seq(3, len, 3))
    # Number of gap characters left in each codon once nucleotides are removed
    gaps_lengths <- nchar(gsub("[^\\.\\-]", "", codons))
    any(gaps_lengths %% 3 != 0)
}
11 | ||
# Compare two IMGT gapped sequences and find AA mutations
#
# Both sequences are translated with alakazam::translateDNA and compared
# residue by residue.
#
# @param   ref_imgt    reference nucleotide sequence (single string).
# @param   novel_imgt  novel allele nucleotide sequence (single string).
# @return  A character vector with one entry per differing amino acid
#          position, formatted as <position><reference residue>><novel residue>
#          (e.g. "96A>N"), or NULL when the translations are identical.
#          When one translation is shorter than the other, the missing
#          residue is reported as "-".
#
# Stops if either sequence contains an "N" and warns if either sequence
# contains gaps that do not span whole codons.
getMutatedAA <- function(ref_imgt, novel_imgt) {
    if (grepl("N", ref_imgt)) {
        stop("Unexpected N in ref_imgt")
    }
    if (grepl("N", novel_imgt)) {
        stop("Unexpected N in novel_imgt")
    }

    if (hasNonImgtGaps(ref_imgt)) {
        warning("Non IMGT gaps found in ref_imgt")
    }

    if (hasNonImgtGaps(novel_imgt)) {
        warning("Non IMGT gaps found in novel_imgt")
    }

    ref_aa <- strsplit(alakazam::translateDNA(ref_imgt), "")[[1]]
    novel_aa <- strsplit(alakazam::translateDNA(novel_imgt), "")[[1]]

    # Pad the shorter translation with NA so that the comparison below is
    # position-wise instead of silently recycling the shorter vector
    aa_len <- max(length(ref_aa), length(novel_aa))
    length(ref_aa) <- aa_len
    length(novel_aa) <- aa_len

    # A position differs when the residues differ or when exactly one of the
    # two translations has no residue there
    one_missing <- xor(is.na(ref_aa), is.na(novel_aa))
    diff_idx <- which(one_missing | ref_aa != novel_aa)

    mutations <- c()
    if (length(diff_idx) > 0) {
        ref_res <- ref_aa[diff_idx]
        novel_res <- novel_aa[diff_idx]
        mutations <- paste0(diff_idx,
                            replace(ref_res, is.na(ref_res), "-"),
                            ">",
                            replace(novel_res, is.na(novel_res), "-"))
    }
    mutations
}
39 | ||
40 | ||
#' Generate evidence
#'
#' \code{generateEvidence} builds a table of evidence metrics for the final novel V
#' allele detection and genotyping inferences.
#'
#' @param    data         a \code{data.frame} containing sequence data that has been
#'                        passed through \link{reassignAlleles} to correct the allele
#'                        assignments.
#' @param    novel        the \code{data.frame} returned by \link{findNovelAlleles}.
#' @param    genotype     the \code{data.frame} of alleles generated with \link{inferGenotype}
#'                        denoting the genotype of the subject.
#' @param    genotype_db  a vector of named nucleotide germline sequences in the genotype.
#'                        Returned by \link{genotypeFasta}.
#' @param    germline_db  the original uncorrected germline database used by
#'                        \link{findNovelAlleles} to identify novel alleles.
#' @param    fields       character vector of column names used to split the data to
#'                        identify novel alleles, if any. If \code{NULL} then the data is
#'                        not divided by grouping variables.
#'
#' @return
#' Returns the \code{genotype} input \code{data.frame} with the following additional columns
#' providing supporting evidence for each inferred allele:
#'
#' \itemize{
#'   \item \code{FIELD_ID}: Data subset identifier, defined with the input parameter \code{fields}.
#'   \item A variable number of columns, specified with the input parameter \code{fields}.
#'   \item \code{POLYMORPHISM_CALL}: The novel allele call.
#'   \item \code{NOVEL_IMGT}: The novel allele sequence.
#'   \item \code{CLOSEST_REFERENCE}: The closest reference gene and allele in
#'         the \code{germline_db} database.
#'   \item \code{CLOSEST_REFERENCE_IMGT}: Sequence of the closest reference gene and
#'         allele in the \code{germline_db} database.
#'   \item \code{GERMLINE_CALL}: The input (uncorrected) V call.
#'   \item \code{GERMLINE_IMGT}: Germline sequence for \code{GERMLINE_CALL}.
#'   \item \code{NT_DIFF}: Number of nucleotides that differ between the new allele and
#'         the closest reference (\code{CLOSEST_REFERENCE}) in the \code{germline_db} database.
#'   \item \code{NT_SUBSTITUTIONS}: A comma separated list of specific nucleotide
#'         differences (e.g. \code{112G>A}) in the novel allele.
#'   \item \code{AA_DIFF}: Number of amino acids that differ between the new allele and the closest
#'         reference (\code{CLOSEST_REFERENCE}) in the \code{germline_db} database.
#'   \item \code{AA_SUBSTITUTIONS}: A comma separated list with specific amino acid
#'         differences (e.g. \code{96A>N}) in the novel allele.
#'   \item \code{SEQUENCES}: Number of sequences unambiguously assigned to this allele.
#'   \item \code{UNMUTATED_SEQUENCES}: Number of records with the unmutated novel allele sequence.
#'   \item \code{UNMUTATED_FREQUENCY}: Proportion of records with the unmutated novel allele
#'         sequence (\code{UNMUTATED_SEQUENCES / SEQUENCES}).
#'   \item \code{ALLELIC_PERCENTAGE}: Percentage at which the (unmutated) allele is observed
#'         in the sequence dataset compared to other (unmutated) alleles.
#'   \item \code{UNIQUE_JS}: Number of unique J sequences found associated with the
#'         novel allele. The sequences are those that have been unambiguously assigned
#'         to the novel allele (\code{POLYMORPHISM_CALL}).
#'   \item \code{UNIQUE_CDR3S}: Number of unique CDR3s associated with the inferred allele.
#'         The sequences are those that have been unambiguously assigned to the
#'         novel allele (\code{POLYMORPHISM_CALL}).
#'   \item \code{MUT_MIN}: Minimum mutation considered by the algorithm.
#'   \item \code{MUT_MAX}: Maximum mutation considered by the algorithm.
#'   \item \code{POS_MIN}: First position of the sequence considered by the algorithm (IMGT numbering).
#'   \item \code{POS_MAX}: Last position of the sequence considered by the algorithm (IMGT numbering).
#'   \item \code{Y_INTERCEPT}: The y-intercept above which positions were considered
#'         potentially polymorphic.
#'   \item \code{ALPHA}: Significance threshold to be used when constructing the
#'         confidence interval for the y-intercept.
#'   \item \code{MIN_SEQS}: Input \code{min_seqs}. The minimum number of total sequences
#'         (within the desired mutational range and nucleotide range) required
#'         for the samples to be considered.
#'   \item \code{J_MAX}: Input \code{j_max}. The maximum fraction of sequences perfectly
#'         aligning to a potential novel allele that are allowed to utilize to a particular
#'         combination of junction length and J gene.
#'   \item \code{MIN_FRAC}: Input \code{min_frac}. The minimum fraction of sequences that must
#'         have usable nucleotides in a given position for that position to be considered.
#'   \item \code{NOTE}: Comments regarding the novel allele inference.
#' }
#'
#' @seealso
#' See \link{findNovelAlleles}, \link{inferGenotype} and \link{genotypeFasta}
#' for generating the required input.
#'
#' @examples
#' \donttest{
#' # Generate input data
#' novel <- findNovelAlleles(SampleDb, GermlineIGHV)
#' genotype <- inferGenotype(SampleDb, find_unmutated=TRUE, germline_db=GermlineIGHV,
#'                           novel=novel)
#' genotype_db <- genotypeFasta(genotype, GermlineIGHV, novel)
#' data_db <- reassignAlleles(SampleDb, genotype_db)
#'
#' # Assemble evidence table
#' evidence <- generateEvidence(data_db, novel, genotype, genotype_db, GermlineIGHV)
#' }
#'
#' @export
generateEvidence <- function(data, novel, genotype, genotype_db,
                             germline_db, fields=NULL) {
    # Visibility hack
    . <- NULL

    # Define set of sequences containing genotype and uncorrected calls.
    # Names present in both databases are taken from genotype_db.
    germline_set <- c(germline_db[!names(germline_db) %in% names(genotype_db)],
                      genotype_db)

    # Find closest reference
    #
    # Returns the name(s) in allele_calls with the lowest mutation count
    # (as reported by getMutCount) relative to the named sequence seq.
    # Ties are narrowed down in turn by: fewest "_" in the name, same
    # sequence length, same allele, and names without "D*". If several
    # candidates remain and multiple is FALSE, an error is raised.
    .findClosestReference <- function(seq, allele_calls, ref_germ,
                                      exclude_self=FALSE, multiple=FALSE) {
        closest <- getMutCount(seq,
                               paste(allele_calls, collapse=","),
                               ref_germ)
        min_dist <- min(unlist(closest))
        closest_idx <- which(unlist(closest) == min_dist)
        closest_names <- unique(allele_calls[closest_idx])
        if (exclude_self && names(seq) %in% closest_names) {
            warning("Excluding self")
            closest_names <- closest_names[closest_names!=names(seq)] # not self
        }
        if (length(closest_names) > 1) {
            warning(paste0("More than one closest reference found for ",
                           names(seq),": ",
                           paste(closest_names, collapse=",")))
            # Keep the one with less mutated positions
            # (counted as the number of "_" in the allele name)
            mut_pos_count <- nchar(gsub("[^_]", "", closest_names))
            closest_names <- closest_names[mut_pos_count==min(mut_pos_count)]
            # Pick same length
            if (length(closest_names) > 1) {
                idx <- which(
                    nchar(ref_germ[closest_names]) == nchar(ref_germ[names(seq)])
                )
                if (length(idx) > 0) {
                    closest_names <- closest_names[idx]
                }
            }
            # Pick same allele
            if (length(closest_names) > 1) {
                idx <- which(
                    getAllele(closest_names) == gsub("_.+", "", getAllele(names(seq)))
                )
                if (length(idx) > 0) {
                    closest_names <- closest_names[idx]
                }
            }
            # Pick not duplicated
            if (length(closest_names) > 1) {
                idx <- !grepl("D\\*", closest_names)
                if (any(idx)) {
                    closest_names <- closest_names[idx]
                }
            }
            # If still more than one, err and TODO
            if (length(closest_names) > 1 && !multiple) {
                msg <- paste0("Multiple closest reference found for ",
                              names(seq),":\n",
                              paste(closest_names, collapse=","))
                stop(msg)
            }
            warning(paste0("Use: ",
                           paste(closest_names, collapse=","),
                           " (less mutated positions, not D, same length, same allele)"))

        }
        closest_names
    }

    # Subset to novel alleles
    final_gt <- genotype %>%
        dplyr::group_by(.data$GENE) %>%
        dplyr::filter(!duplicated(.data$ALLELES)) %>%
        dplyr::ungroup() %>%
        dplyr::mutate(ALLELES=strsplit(as.character(.data$ALLELES), ","),
                      COUNTS=strsplit(as.character(.data$COUNTS), ",")) %>%
        tidyr::unnest(.data$ALLELES, .data$COUNTS) %>%
        dplyr::mutate(POLYMORPHISM_CALL=paste0(.data$GENE, "*" , .data$ALLELES)) %>%
        dplyr::filter(.data$POLYMORPHISM_CALL %in% novel$POLYMORPHISM_CALL) %>%
        dplyr::rename(ALLELE="ALLELES")


    # Add info from novel
    final_gt <- dplyr::inner_join(dplyr::rename(final_gt, NOTE_GT="NOTE"),
                                  novel,
                                  by=c(fields, "POLYMORPHISM_CALL"))

    # Add message if the same novel imgt sequence found from
    # different starting alleles, these will be novel imgt sequences
    # with more than one polymorphism call
    final_gt <- final_gt %>%
        dplyr::group_by(.data$NOVEL_IMGT) %>%
        dplyr::mutate(NUM_CALLS=length(unique(.data$POLYMORPHISM_CALL))) %>%
        dplyr::ungroup()
    idx_mult <- which(final_gt$NUM_CALLS > 1)
    final_gt$NUM_CALLS <- NULL
    if (length(idx_mult) > 0) {
        final_gt$NOTE_GT[idx_mult] <- paste0(
            final_gt$NOTE_GT[idx_mult],
            " Found multiple polymorphism calls for the same NOVEL_IMGT.")
    }


    if (nrow(final_gt)>0) {

        # Compute the evidence columns for a single row (one novel allele).
        # Reads the sequence table `data` from the enclosing function.
        .addEvidence <- function(df, germline_set, germline_db) {
            polymorphism <- df[['POLYMORPHISM_CALL']]
            novel_imgt <- df[["NOVEL_IMGT"]]
            names(novel_imgt) <- polymorphism

            v_call_genotyped <- data[["V_CALL_GENOTYPED"]]

            # Missing calls cannot match; without na.rm a single NA would turn
            # the count into NA and break the `SEQUENCES > 0` test below
            SEQUENCES <- sum(v_call_genotyped == polymorphism, na.rm=TRUE)
            df[["SEQUENCES"]] <- SEQUENCES
            closest_ref_input <- .findClosestReference(novel_imgt,
                                                       names(germline_db),
                                                       germline_db,
                                                       exclude_self=FALSE)
            closest_ref <- .findClosestReference(novel_imgt,
                                                 names(germline_set),
                                                 germline_set,
                                                 exclude_self=FALSE, multiple=TRUE)

            if (all(getGene(closest_ref_input) != getGene(closest_ref))) {
                warning("closest reference gene difference")
            }

            if (all(closest_ref != polymorphism)) {
                # closest_ref may hold several names (multiple=TRUE), so
                # collapse it to keep the warning a single readable message
                warning(paste0("closest reference allele (",
                               paste(closest_ref, collapse=","),
                               ") different from POLYMORPHISM_CALL allele (",
                               polymorphism,")"))
            }

            ## TODO: this still not clear.
            ## Any diff using sequence_imgt instead of germline[[polymorphism]]?
            df[["CLOSEST_REFERENCE"]] <- closest_ref_input

            closest_ref_seq <- germline_set[[closest_ref_input]]

            nt_diff <- unlist(getMutatedPositions(novel_imgt, closest_ref_seq))
            nt_diff_string <- ""
            # Positions of the reference beyond the end of a shorter novel
            # sequence are counted as differences
            if (nchar(novel_imgt) < nchar(closest_ref_seq)) {
                nt_diff <- c(nt_diff, (nchar(novel_imgt)+1):nchar(closest_ref_seq))
            }
            if (length(nt_diff) > 0) {
                ref_nt <- strsplit(closest_ref_seq,"")[[1]][nt_diff]
                novel_nt <- strsplit(germline_set[[polymorphism]],"")[[1]][nt_diff]
                nt_diff_string <- paste(paste0(
                    nt_diff,
                    ref_nt,
                    ">",
                    replace(novel_nt, is.na(novel_nt), "-")), collapse=",")
            }

            df[["NT_DIFF"]] <- length(nt_diff)
            df[["NT_SUBSTITUTIONS"]] <- nt_diff_string

            diff_aa <- getMutatedAA(closest_ref_seq, germline_set[[polymorphism]])

            if (length(diff_aa)>0) {
                df[["AA_DIFF"]] <- length(diff_aa)
                df[["AA_SUBSTITUTIONS"]] <- paste(diff_aa,collapse=",")
            } else {
                df[["AA_DIFF"]] <- 0
                df[["AA_SUBSTITUTIONS"]] <- ""
            }

            df[["COUNTS"]] <- as.numeric(df[["COUNTS"]])
            df[["TOTAL"]] <- as.numeric(df[["TOTAL"]])
            df[["UNMUTATED_SEQUENCES"]] <- as.numeric(df[["COUNTS"]])
            df[["UNMUTATED_FREQUENCY"]] <- as.numeric(df[["COUNTS"]])/SEQUENCES

            df[["ALLELIC_PERCENTAGE"]] <- 100*df[["UNMUTATED_SEQUENCES"]]/as.numeric(df[["TOTAL"]])

            if (SEQUENCES > 0) {
                df[["UNIQUE_JS"]] <- data %>%
                    dplyr::filter(.data$V_CALL_GENOTYPED == polymorphism) %>%
                    dplyr::distinct(.data$J_CALL) %>%
                    nrow()
                df[["UNIQUE_CDR3S"]] <- data %>%
                    dplyr::filter(.data$V_CALL_GENOTYPED == polymorphism) %>%
                    dplyr::distinct(translateDNA(.data$JUNCTION, trim=TRUE)) %>%
                    nrow()
            } else {
                df[["UNIQUE_JS"]] <- NA
                df[["UNIQUE_CDR3S"]] <- NA
            }

            # Add closest germline
            df[["CLOSEST_REFERENCE_IMGT"]] <- cleanSeqs(closest_ref_seq)

            data.frame(df, stringsAsFactors=FALSE)
        }

        final_gt <- final_gt %>%
            dplyr::rowwise() %>%
            do(.addEvidence(., germline_set=germline_set, germline_db=germline_db)) %>%
            dplyr::mutate(NOTE=trimws(paste(.data$NOTE_GT, .data$NOTE, sep=" "))) %>%
            dplyr::select(-c("NOTE_GT"))
    }

    return(final_gt)
}
6 | 6 | #' align to each germline allele in order to determine which positions |
7 | 7 | #' might be polymorphic. |
8 | 8 | #' |
9 | #' @details A \code{data.frame} in Change-O format contains the following | |
10 | #' columns: | |
11 | #' \itemize{ | |
12 | #' \item \code{"SEQUENCE_IMGT"} containing the IMGT-gapped nucleotide sequence | |
13 | #' \item \code{"V_CALL"} containing the IMGT/V-QUEST V allele call(s) | |
14 | #' \item \code{"J_CALL"} containing the IMGT/V-QUEST J allele call(s) | |
15 | #' \item \code{"JUNCTION_LENGTH"} containing the junction length | |
16 | #' } | |
17 | 9 | #' The TIgGER allele-finding algorithm, briefly, works as follows: |
18 | 10 | #' Mutations are determined through comparison to the provided germline. |
19 | 11 | #' Mutation frequency at each *position* is determined as a function of |
23 | 15 | #' against by ensuring that sequences perfectly matching the potential novel |
24 | 16 | #' allele utilize a wide range of combinations of J gene and junction length. |
25 | 17 | #' |
26 | #' @param clip_db a \code{data.frame} in Change-O format. See details. | |
18 | #' @param data a \code{data.frame} in Change-O format. See details. | |
27 | 19 | #' @param germline_db a vector of named nucleotide germline sequences |
28 | #' matching the V calls in \code{clip_db} | |
29 | #' @param v_call name of the column in clip_db with V allele calls. | |
20 | #' matching the V calls in \code{data}. | |
21 | #' @param v_call name of the column in \code{data} with V allele calls. | |
30 | 22 | #' Default is V_CALL. |
31 | 23 | #' @param germline_min the minimum number of sequences that must have a |
32 | 24 | #' particular germline allele call for the allele to |
39 | 31 | #' be considered by the algorithm |
40 | 32 | #' @param pos_range the range of IMGT-numbered positions that should be |
41 | 33 | #' considered by the algorithm |
42 | #' @param alpha the alpha cutoff to be used when constructing the | |
43 | #' confidence interval for the y-intercept | |
44 | #' @param y_intercept the y-intercept above which positions should be | |
34 | #' @param alpha the alpha value used for determining whether the | |
35 | #' fit y-intercept is greater than the \code{y_intercept} | |
36 | #' threshold | |
37 | #' @param y_intercept the y-intercept threshold above which positions should be | |
45 | 38 | #' considered potentially polymorphic |
46 | 39 | #' @param j_max the maximum fraction of sequences perfectly aligning |
47 | 40 | #' to a potential novel allele that are allowed to |
55 | 48 | #' position to be considered |
56 | 49 | #' @param nproc the number of processors to use |
57 | 50 | #' |
58 | #' @return a \code{data.frame} with a row for each known allele analyzed. | |
51 | #' @return | |
52 | #' A \code{data.frame} with a row for each known allele analyzed. | |
59 | 53 | #' Besides metadata on the parameters used in the search, each row will have |
60 | 54 | #' either a note as to where the polymorphism-finding algorithm exited or a |
61 | #' nucleotide sequence for the predicted novel allele. | |
55 | #' nucleotide sequence for the predicted novel allele, along with columns providing | |
56 | #' additional evidence. | |
57 | #' | |
58 | #' The output contains the following columns: | |
59 | #' \itemize{ | |
60 | #' \item \code{GERMLINE_CALL}: The input (uncorrected) V call. | |
61 | #' \item \code{NOTE}: Comments regarding the inference. | |
62 | #' \item \code{POLYMORPHISM_CALL}: The novel allele call. | |
63 | #' \item \code{NT_SUBSTITUTIONS}: Mutations identified in the novel allele, relative | |
64 | #' to the reference germline (\code{GERMLINE_CALL}) | |
65 | #' \item \code{NOVEL_IMGT}: The novel allele sequence. | |
66 | #' \item \code{NOVEL_IMGT_COUNT}: The number of times the sequence \code{NOVEL_IMGT} | |
67 | #' is found in the input data. Considers the subsequence of \code{NOVEL_IMGT} | |
68 | #' in the \code{pos_range}. | |
69 | #' \item \code{NOVEL_IMGT_UNIQUE_J}: Number of distinct J calls associated to \code{NOVEL_IMGT} | |
70 | #' in the input data. Considers the subsequence of \code{NOVEL_IMGT} in the \code{pos_range}. | |
71 | #' \item \code{NOVEL_IMGT_UNIQUE_CDR3}: Number of distinct CDR3 sequences associated | |
72 | #' with \code{NOVEL_IMGT} in the input data. Considers the subsequence of \code{NOVEL_IMGT} | |
73 | #' in the \code{pos_range}. | |
74 | #' \item \code{PERFECT_MATCH_COUNT}: Final number of sequences retained to call the new | |
75 | #' allele. These are unique sequences that have V segments that perfectly match | |
76 | #' the predicted germline in the \code{pos_range}. | |
77 | #' \item \code{PERFECT_MATCH_FREQ}: \code{PERFECT_MATCH_COUNT / GERMLINE_CALL_COUNT} | |
78 | #' \item \code{GERMLINE_CALL_COUNT}: The number of sequences with the \code{GERMLINE_CALL} | |
79 | #' in the input data that were initially considered for the analysis. | |
80 | #' \item \code{GERMLINE_CALL_FREQ}: The fraction of sequences with the \code{GERMLINE_CALL} | |
81 | #' in the input data initially considered for the analysis. | |
82 | #' \item \code{GERMLINE_IMGT}: Germline sequence for \code{GERMLINE_CALL}. | |
83 | #' \item \code{GERMLINE_IMGT_COUNT}: The number of times the \code{GERMLINE_IMGT} | |
84 | #' sequence is found in the input data. | |
85 | #' \item \code{MUT_MIN}: Minimum mutation considered by the algorithm. | |
86 | #' \item \code{MUT_MAX}: Maximum mutation considered by the algorithm. | |
87 | #' \item \code{MUT_PASS_COUNT}: Number of sequences in the mutation range. | |
88 | #' \item \code{POS_MIN}: First position of the sequence considered by the algorithm (IMGT numbering). | |
89 | #' \item \code{POS_MAX}: Last position of the sequence considered by the algorithm (IMGT numbering). | |
90 | #' \item \code{Y_INTERCEPT}: The y-intercept above which positions were considered | |
91 | #' potentially polymorphic. | |
92 | #' \item \code{Y_INTERCEPT_PASS}: Number of positions that pass the \code{Y_INTERCEPT} threshold. | |
93 | #' \item \code{SNP_PASS}: Number of sequences that pass the \code{Y_INTERCEPT} threshold and are | |
94 | #' within the desired nucleotide range (\code{min_seqs}). | |
95 | #' \item \code{UNMUTATED_COUNT}: Number of unmutated sequences. | |
96 | #' \item \code{UNMUTATED_FREQ}: Number of unmutated sequences over \code{GERMLINE_IMGT_COUNT}. | |
97 | #' \item \code{UNMUTATED_SNP_J_GENE_LENGTH_COUNT}: Number of distinct combinations | |
98 | #' of SNP, J gene, and junction length. | |
99 | #' \item \code{SNP_MIN_SEQS_J_MAX_PASS}: Number of SNPs that pass both the \code{min_seqs} | |
100 | #' and \code{j_max} thresholds. | |
101 | #' \item \code{ALPHA}: Significance threshold to be used when constructing the | |
102 | #' confidence interval for the y-intercept. | |
103 | #' \item \code{MIN_SEQS}: Input \code{min_seqs}. The minimum number of total sequences | |
104 | #' (within the desired mutational range and nucleotide range) required | |
105 | #' for the samples to be considered. | |
106 | #' \item \code{J_MAX}: Input \code{j_max}. The maximum fraction of sequences perfectly | |
107 | #' aligning to a potential novel allele that are allowed to utilize to a particular | |
108 | #' combination of junction length and J gene. | |
109 | #' \item \code{MIN_FRAC}: Input \code{min_frac}. The minimum fraction of sequences that must | |
110 | #' have usable nucleotides in a given position for that position to be considered. | |
111 | #' } | |
112 | #' | |
113 | #' The following comments can appear in the \code{NOTE} column: | |
114 | #' | |
115 | #' \itemize{ | |
116 | #' \item \emph{Novel allele found}: A novel allele was detected. | |
117 | #' \item \emph{Plurality sequence too rare}: No sequence is frequent enough to pass | |
118 | #' the J test (\code{j_max}). | |
119 | #' \item \emph{A J-junction combination is too prevalent}: Not enough J diversity (\code{j_max}). | |
120 | #' \item \emph{No positions pass y-intercept test}: No positions above \code{y_intercept}. | |
121 | #' \item \emph{Insufficient sequences in desired mutational range}: | |
122 | #' \code{mut_range} and \code{pos_range}. | |
123 | #' \item \emph{Not enough sequences}: Not enough sequences in the desired mutational | |
124 | #' range and nucleotide range (\code{min_seqs}). | |
125 | #' \item \emph{No unmutated versions of novel allele found}: All observed variants of the | |
126 | #' allele are mutated. | |
127 | #' } | |
62 | 128 | #' |
63 | 129 | #' @seealso \link{plotNovel} to visualize the data supporting any |
64 | 130 | #' novel alleles hypothesized to be present in the data and |
65 | 131 | #' \link{inferGenotype} to determine if the novel alleles are frequent |
66 | #' enough to be included in the subject's genotype | |
132 | #' enough to be included in the subject's genotype. | |
67 | 133 | #' |
68 | 134 | #' @examples |
69 | #' # Load example data and germlines | |
70 | #' data(sample_db) | |
71 | #' data(germline_ighv) | |
72 | #' | |
135 | #' \donttest{ | |
73 | 136 | #' # Find novel alleles and return relevant data |
74 | #' \dontrun{novel_df = findNovelAlleles(sample_db, germline_ighv)} | |
137 | #' novel <- findNovelAlleles(SampleDb, GermlineIGHV) | |
138 | #' } | |
75 | 139 | #' |
76 | 140 | #' @export |
77 | findNovelAlleles <- function(clip_db, germline_db, | |
78 | v_call="V_CALL", | |
79 | germline_min = 200, | |
80 | min_seqs = 50, | |
81 | auto_mutrange = TRUE, | |
82 | mut_range = 1:10, | |
83 | pos_range = 1:312, | |
84 | y_intercept = 0.125, | |
85 | alpha = 0.05, | |
86 | j_max = 0.15, | |
87 | min_frac = 0.75, | |
88 | nproc = 1) { | |
89 | . = idx = NULL | |
90 | ||
91 | # Keep only the db columns needed | |
92 | clip_db <- clip_db %>% | |
93 | dplyr::select_('SEQUENCE_IMGT', v_call, 'J_CALL', 'JUNCTION_LENGTH') | |
94 | ||
95 | # Keep only the columns we need and clean up the sequences | |
96 | missing = c("SEQUENCE_IMGT", v_call, "J_CALL", "JUNCTION_LENGTH") %>% | |
97 | setdiff(colnames(clip_db)) | |
98 | if (length(missing) != 0) { | |
99 | stop("Could not find required columns in clip_db:\n ", | |
100 | paste(missing, collapse="\n ")) | |
101 | } | |
102 | empty_junctions = sum(clip_db$JUNCTION_LENGTH == 0, na.rm=TRUE) | |
103 | if (empty_junctions > 0) { | |
104 | stop(empty_junctions, " sequences have junction ", "length of zero. ", | |
105 | "Please remove these sequences.") | |
106 | } | |
107 | germlines = cleanSeqs(germline_db) | |
108 | names(germlines) = getAllele(names(germlines), first=FALSE, strip_d=FALSE) | |
109 | clip_db$SEQUENCE_IMGT = cleanSeqs(clip_db$SEQUENCE_IMGT) | |
110 | ||
111 | ||
112 | # Find which rows' calls contain which germline alleles | |
113 | cutoff = | |
114 | ifelse(germline_min < 1, round(nrow(clip_db)*germline_min), germline_min) | |
115 | allele_groups = sapply(names(germlines), grep, clip_db[[v_call]], fixed=TRUE, | |
116 | simplify=FALSE) | |
117 | names(allele_groups) = names(germlines) | |
118 | allele_groups = allele_groups[sapply(allele_groups, length) >= cutoff] | |
119 | if(length(allele_groups) == 0){ | |
120 | stop_message <- paste("Not enough sample sequences were assigned to any germline:\n", | |
121 | " (1) germline_min is too large or\n", | |
122 | " (2) sequences names don't match germlines.") | |
123 | stop(stop_message) | |
124 | } | |
125 | allele_groups = allele_groups[sortAlleles(names(allele_groups))] | |
126 | ||
127 | # Prepare for parallel processing | |
128 | nproc = ifelse(Sys.info()['sysname'] == "Windows", | |
129 | Sys.getenv('NUMBER_OF_PROCESSORS'), | |
130 | ifelse(Sys.info()['sysname'] == "Darwin", | |
131 | system("sysctl -n hw.ncpu", intern=TRUE), | |
132 | system("nproc", intern=TRUE))) %>% | |
133 | as.numeric() %>% | |
134 | min(nproc, . - 1) %>% | |
135 | max(1, .) | |
136 | if(nproc == 1) { | |
137 | foreach::registerDoSEQ() | |
138 | } else { | |
139 | cluster_type = ifelse(Sys.info()['sysname'] == "Windows", | |
140 | "PSOCK", "FORK") | |
141 | cluster <- parallel::makeCluster(nproc, type="PSOCK") | |
142 | parallel::clusterExport(cluster, list("allele_groups", | |
143 | "germlines", | |
144 | "clip_db", | |
145 | "min_seqs", | |
146 | "auto_mutrange", | |
147 | "mut_range", | |
148 | "pos_range", | |
149 | "y_intercept", | |
150 | "alpha", | |
151 | "j_max", | |
152 | "germline_min", | |
153 | "min_frac", | |
154 | "findLowerY", | |
155 | "mutationRangeSubset", | |
156 | "positionMutations", | |
157 | "superSubstring"), | |
158 | envir=environment()) | |
159 | doParallel::registerDoParallel(cluster) | |
160 | } | |
161 | ||
162 | out_list <- foreach(idx=iterators::icount(length(allele_groups))) %dopar% { | |
163 | # out_list <- lapply(1:length(allele_groups), function(idx) { | |
164 | gc() | |
165 | # message(paste0("idx=",idx)) | |
166 | # Subset of data being analyzed | |
167 | allele_name = names(allele_groups)[idx] | |
168 | germline = germlines[allele_name] | |
169 | indicies = allele_groups[[allele_name]] | |
170 | db_subset = clip_db[indicies, ] | |
171 | ||
172 | # If mutrange is auto, find most popular mutation count and start from there | |
173 | gpm = db_subset %>% | |
174 | dplyr::mutate_(V_CALL = ~allele_name) %>% | |
175 | getPopularMutationCount(germline, | |
176 | gene_min=0, seq_min=min_seqs, | |
177 | seq_p_of_max=1/8, full_return=TRUE) | |
178 | ||
179 | # Determine the mutation range(s) to scan | |
180 | mut_mins = min(mut_range) | |
181 | if(auto_mutrange & sum(gpm$MUTATION_COUNT > 0) > 0 ){ | |
182 | mut_mins = c(mut_mins, gpm$MUTATION_COUNT[gpm$MUTATION_COUNT > 0]) %>% | |
183 | unique() %>% | |
184 | sort() | |
185 | } | |
186 | ||
187 | # Create the run's return object | |
188 | df_run_empty = data.frame(GERMLINE_CALL = names(germline), | |
189 | NOTE = "", | |
190 | POLYMORPHISM_CALL = NA, | |
191 | NOVEL_IMGT = NA, | |
192 | PERFECT_MATCH_COUNT = NA, | |
193 | GERMLINE_CALL_COUNT = length(indicies), | |
194 | MUT_MIN = NA, | |
195 | MUT_MAX = NA, | |
196 | GERMLINE_IMGT = as.character(germline), | |
197 | POS_MIN = min(pos_range), | |
198 | POS_MAX = max(pos_range), | |
199 | Y_INTERCEPT = y_intercept, | |
200 | ALPHA = alpha, | |
201 | MIN_SEQS = min_seqs, | |
202 | J_MAX = j_max, | |
203 | MIN_FRAC = min_frac, | |
204 | stringsAsFactors = FALSE) | |
205 | for (mut_min in rev(mut_mins)) { | |
206 | gc() | |
207 | # message(paste0("|-- mut_min=",mut_min)) | |
208 | if (mut_min == rev(mut_mins)[1]){ | |
209 | df_run = df_run_empty | |
210 | } else { | |
211 | df_run = dplyr::bind_rows(df_run_empty, df_run) | |
212 | } | |
213 | mut_max = mut_min + diff(range(mut_range)) | |
214 | df_run$MUT_MIN[1] = mut_min | |
215 | df_run$MUT_MAX[1] = mut_max | |
216 | ||
217 | # If no sequence is frequent enough to pass the J test, give up now | |
218 | if(nrow(gpm) < 1) { | |
219 | df_run$NOTE[1] = "Plurality sequence too rare." | |
220 | if(mut_mins[1] == mut_min){ | |
221 | return(df_run) | |
222 | } else { | |
223 | next | |
224 | } | |
225 | } | |
226 | ||
227 | # Add a mutation count column and filter out sequences not in our range | |
228 | db_subset_mm = mutationRangeSubset(db_subset, germline, | |
229 | mut_min:mut_max, pos_range) | |
230 | ||
231 | if(nrow(db_subset_mm) < germline_min){ | |
232 | df_run$NOTE[1] = "Insufficient sequences in desired mutational range." | |
233 | if(mut_mins[1] == mut_min){ | |
234 | return(df_run) | |
235 | } else { | |
236 | next | |
237 | } | |
238 | } | |
239 | ||
240 | # Duplicate each sequence for all the positions to be analyzed | |
241 | # and find which positions are mutated | |
242 | pos_db = positionMutations(db_subset_mm, germline, pos_range) | |
243 | ||
244 | # Find positional mut freq vs seq mut count | |
245 | pos_muts = pos_db %>% | |
246 | dplyr::group_by_(~POSITION) %>% | |
247 | dplyr::mutate_(PASS = ~mean(OBSERVED) >= min_frac) %>% | |
248 | dplyr::group_by_(~MUT_COUNT, ~POSITION) %>% | |
249 | dplyr::summarise_(POS_MUT_RATE = ~ mean(MUTATED)*unique(PASS) ) %>% | |
250 | dplyr::ungroup() | |
251 | ||
252 | rm(pos_db) | |
253 | gc() | |
254 | ||
255 | # Calculate y intercepts, find which pass the test | |
256 | pass_y = pos_muts %>% | |
257 | dplyr::group_by_(~POSITION) %>% | |
258 | dplyr::summarise_(Y_INT_MIN = ~findLowerY(POS_MUT_RATE, MUT_COUNT, | |
259 | mut_min, alpha)) %>% | |
260 | dplyr::filter_(~Y_INT_MIN > y_intercept) | |
261 | ||
262 | if(nrow(pass_y) < 1){ | |
263 | df_run$NOTE[1] = "No positions pass y-intercept test." | |
264 | if(mut_mins[1] == mut_min){ | |
265 | return(df_run) | |
266 | } else { | |
267 | next | |
268 | } | |
269 | } | |
270 | ||
271 | gl_substring = superSubstring(germline, pass_y$POSITION) | |
272 | gl_minus_substring = insertPolymorphisms(germline, pass_y$POSITION, | |
273 | rep("N", nrow(pass_y))) | |
274 | ||
275 | # Find the potential SNP positions and remove anything that matches | |
276 | # the germline at all those positions or any combo that is too rare | |
277 | db_y_subset_mm = db_subset_mm %>% | |
278 | dplyr::group_by(1:n()) %>% | |
279 | dplyr::mutate_(SNP_STRING = ~superSubstring(SEQUENCE_IMGT, | |
280 | pass_y$POSITION)) %>% | |
281 | dplyr::filter_(~SNP_STRING != gl_substring) %>% | |
282 | dplyr::group_by_(~SNP_STRING) %>% | |
283 | dplyr::mutate_(STRING_COUNT = ~n()) %>% | |
284 | dplyr::filter_(~STRING_COUNT >= min_seqs) | |
285 | ||
286 | if (nrow(db_y_subset_mm) < 1 ){ | |
287 | df_run$NOTE[1] = paste("Position(s) passed y-intercept (", | |
288 | paste(pass_y$POSITION, collapse = ","), | |
289 | ") but the plurality sequence is too rare.", | |
290 | sep="") | |
291 | if(mut_mins[1] == mut_min){ | |
292 | return(df_run) | |
293 | } else { | |
294 | next | |
295 | } | |
296 | } | |
297 | ||
298 | # Get mutation count at all positions that are not potential SNPs | |
299 | pads = paste(rep("-", min(pos_range)-1), collapse="") | |
300 | db_y_subset_mm$MUT_COUNT_MINUS_SUBSTRING = db_y_subset_mm$SEQUENCE_IMGT %>% | |
301 | substring(min(pos_range), max(pos_range)) %>% | |
302 | paste(pads, ., sep="") %>% | |
303 | getMutatedPositions(gl_minus_substring) %>% | |
304 | sapply(length) | |
305 | ||
306 | # Keep only unmutated seqences and then find the counts of J and | |
307 | # junction length for each of the SNP strings, and then check to | |
308 | # see which pass the j/junction and count requirements | |
309 | db_y_summary0 = db_y_subset_mm %>% | |
310 | dplyr::filter_(~MUT_COUNT_MINUS_SUBSTRING == 0) %>% | |
311 | dplyr::mutate_(J_GENE = ~getGene(J_CALL)) %>% | |
312 | dplyr::group_by_(~SNP_STRING, ~J_GENE, ~JUNCTION_LENGTH) %>% | |
313 | dplyr::summarise_(COUNT = ~n()) %>% | |
314 | dplyr::group_by_(~SNP_STRING) %>% | |
315 | dplyr::mutate_(FRACTION = ~COUNT/sum(COUNT)) %>% | |
316 | dplyr::summarise_(TOTAL_COUNT = ~sum(COUNT), MAX_FRAC = ~max(FRACTION)) | |
141 | findNovelAlleles <- function(data, germline_db, | |
142 | v_call="V_CALL", | |
143 | germline_min=200, | |
144 | min_seqs=50, | |
145 | auto_mutrange=TRUE, | |
146 | mut_range=1:10, | |
147 | pos_range=1:312, | |
148 | y_intercept=0.125, | |
149 | alpha=0.05, | |
150 | j_max=0.15, | |
151 | min_frac=0.75, | |
152 | nproc=1) { | |
153 | . = idx = NULL | |
154 | ||
155 | # Keep only the db columns needed | |
156 | data <- data %>% | |
157 | dplyr::select_('SEQUENCE_IMGT', v_call, 'J_CALL', 'JUNCTION_LENGTH', 'JUNCTION') | |
158 | ||
159 | # Keep only the columns we need and clean up the sequences | |
160 | missing = c("SEQUENCE_IMGT", v_call, "J_CALL", "JUNCTION_LENGTH") %>% | |
161 | setdiff(colnames(data)) | |
162 | if (length(missing) != 0) { | |
163 | stop("Could not find required columns in the input data:\n ", | |
164 | paste(missing, collapse="\n ")) | |
165 | } | |
166 | empty_junctions = sum(data$JUNCTION_LENGTH == 0, na.rm=TRUE) | |
167 | if (empty_junctions > 0) { | |
168 | stop(empty_junctions, " sequences have junction ", "length of zero. ", | |
169 | "Please remove these sequences.") | |
170 | } | |
171 | germlines = cleanSeqs(germline_db) | |
172 | names(germlines) = getAllele(names(germlines), first=FALSE, strip_d=FALSE) | |
173 | data$SEQUENCE_IMGT = cleanSeqs(data$SEQUENCE_IMGT) | |
174 | ||
175 | ||
176 | # Find which rows' calls contain which germline alleles | |
177 | cutoff = | |
178 | ifelse(germline_min < 1, round(nrow(data)*germline_min), germline_min) | |
179 | allele_groups = sapply(names(germlines), grep, data[[v_call]], fixed=TRUE, | |
180 | simplify=FALSE) | |
181 | names(allele_groups) = names(germlines) | |
182 | allele_groups = allele_groups[sapply(allele_groups, length) >= cutoff] | |
183 | if(length(allele_groups) == 0){ | |
184 | stop_message <- paste("Not enough sample sequences were assigned to any germline:\n", | |
185 | " (1) germline_min is too large or\n", | |
186 | " (2) sequences names don't match germlines.") | |
187 | stop(stop_message) | |
188 | } | |
189 | allele_groups = allele_groups[sortAlleles(names(allele_groups))] | |
190 | ||
191 | # Prepare for parallel processing | |
192 | nproc = ifelse(Sys.info()['sysname'] == "Windows", | |
193 | Sys.getenv('NUMBER_OF_PROCESSORS'), | |
194 | ifelse(Sys.info()['sysname'] == "Darwin", | |
195 | system("sysctl -n hw.ncpu", intern=TRUE), | |
196 | system("nproc", intern=TRUE))) %>% | |
197 | as.numeric() %>% | |
198 | min(nproc, . - 1) %>% | |
199 | max(1, .) | |
200 | if(nproc == 1) { | |
201 | foreach::registerDoSEQ() | |
202 | } else { | |
203 | #cluster_type = ifelse(Sys.info()['sysname'] == "Windows", "PSOCK", "FORK") | |
204 | cluster <- parallel::makeCluster(nproc, type="PSOCK") | |
205 | parallel::clusterExport(cluster, list("allele_groups", | |
206 | "germlines", | |
207 | "data", | |
208 | "min_seqs", | |
209 | "auto_mutrange", | |
210 | "mut_range", | |
211 | "pos_range", | |
212 | "y_intercept", | |
213 | "alpha", | |
214 | "j_max", | |
215 | "germline_min", | |
216 | "min_frac", | |
217 | "findLowerY", | |
218 | "mutationRangeSubset", | |
219 | "positionMutations", | |
220 | "superSubstring"), | |
221 | envir=environment()) | |
222 | doParallel::registerDoParallel(cluster) | |
223 | } | |
224 | ||
225 | out_list <- foreach(idx=iterators::icount(length(allele_groups))) %dopar% { | |
226 | # out_list <- lapply(1:length(allele_groups), function(idx) { | |
227 | gc() | |
228 | # message(paste0("idx=",idx)) | |
229 | # Subset of data being analyzed | |
230 | allele_name = names(allele_groups)[idx] | |
231 | germline = germlines[allele_name] | |
232 | indicies = allele_groups[[allele_name]] | |
233 | db_subset = data[indicies, ] | |
317 | 234 | |
318 | if(nrow(db_y_summary0) < 1){ | |
319 | df_run$NOTE[1] = paste("Position(s) passed y-intercept (", | |
320 | paste(pass_y$POSITION, collapse = ","), | |
321 | ") but no unmutated versions of novel allele", | |
322 | " found.", sep="") | |
323 | if(mut_mins[1] == mut_min){ | |
324 | return(df_run) | |
325 | } else { | |
326 | next | |
327 | } | |
328 | } | |
329 | ||
330 | # db_y_summary = db_y_summary0 %>% | |
331 | # filter_(~TOTAL_COUNT >= min_seqs & MAX_FRAC <= j_max) | |
332 | ||
333 | min_seqs_pass <- db_y_summary0$TOTAL_COUNT >= min_seqs | |
334 | j_max_pass <- db_y_summary0$MAX_FRAC <= j_max | |
335 | ||
336 | db_y_summary <- db_y_summary0[min_seqs_pass & j_max_pass, , drop=FALSE] | |
337 | ||
338 | if(nrow(db_y_summary) < 1){ | |
339 | msg <- c(NA, NA) | |
340 | names(msg) <- c("j_max", "min_seqs") | |
341 | ||
342 | if (sum(min_seqs_pass) == 0) { | |
343 | msg['min_seqs'] <- paste0("not enough sequences (maximum total count is ", | |
344 | max(db_y_summary0$TOTAL_COUNT), | |
345 | ")") | |
235 | # If mutrange is auto, find most popular mutation count and start from there | |
236 | gpm = db_subset %>% | |
237 | dplyr::mutate_(V_CALL = ~allele_name) %>% | |
238 | getPopularMutationCount(germline, | |
239 | gene_min=0, seq_min=min_seqs, | |
240 | seq_p_of_max=1/8, full_return=TRUE) | |
241 | ||
242 | # Determine the mutation range(s) to scan | |
243 | mut_mins = min(mut_range) | |
244 | if(auto_mutrange & sum(gpm$MUTATION_COUNT > 0) > 0 ){ | |
245 | mut_mins = c(mut_mins, gpm$MUTATION_COUNT[gpm$MUTATION_COUNT > 0]) %>% | |
246 | unique() %>% | |
247 | sort() | |
346 | 248 | } |
347 | 249 | |
348 | if (sum(j_max_pass) == 0) { | |
349 | msg['j_max'] <- paste0("a J-junction combination is too prevalent (", | |
350 | round(100*max(db_y_summary0$MAX_FRAC),1),"% of sequences)") | |
250 | # Create the run's return object | |
251 | df_run_empty = data.frame(GERMLINE_CALL = names(germline), | |
252 | NOTE = "", | |
253 | POLYMORPHISM_CALL = NA, | |
254 | NT_SUBSTITUTIONS=NA, | |
255 | NOVEL_IMGT = NA, | |
256 | NOVEL_IMGT_COUNT=NA, | |
257 | NOVEL_IMGT_UNIQUE_J=NA, | |
258 | NOVEL_IMGT_UNIQUE_CDR3=NA, | |
259 | PERFECT_MATCH_COUNT = NA, | |
260 | PERFECT_MATCH_FREQ = NA, | |
261 | GERMLINE_CALL_COUNT = length(indicies), | |
262 | GERMLINE_CALL_FREQ = round(length(indicies)/nrow(data), 3), | |
263 | MUT_MIN = NA, | |
264 | MUT_MAX = NA, | |
265 | MUT_PASS_COUNT=NA, | |
266 | GERMLINE_IMGT = as.character(germline), | |
267 | GERMLINE_IMGT_COUNT=NA, | |
268 | POS_MIN = min(pos_range), | |
269 | POS_MAX = max(pos_range), | |
270 | Y_INTERCEPT = y_intercept, | |
271 | Y_INTERCEPT_PASS = NA, | |
272 | SNP_PASS=NA, | |
273 | UNMUTATED_COUNT=NA, | |
274 | UNMUTATED_FREQ=NA, | |
275 | UNMUTATED_SNP_J_GENE_LENGTH_COUNT=NA, | |
276 | SNP_MIN_SEQS_J_MAX_PASS=NA, | |
277 | ALPHA = alpha, | |
278 | MIN_SEQS = min_seqs, | |
279 | J_MAX = j_max, | |
280 | MIN_FRAC = min_frac, | |
281 | stringsAsFactors = FALSE) | |
282 | for (mut_min in rev(mut_mins)) { | |
283 | gc() | |
284 | # message(paste0("|-- mut_min=",mut_min)) | |
285 | if (mut_min == rev(mut_mins)[1]){ | |
286 | df_run = df_run_empty | |
287 | } else { | |
288 | df_run = dplyr::bind_rows(df_run_empty, df_run) | |
289 | } | |
290 | mut_max = mut_min + diff(range(mut_range)) | |
291 | df_run$MUT_MIN[1] = mut_min | |
292 | df_run$MUT_MAX[1] = mut_max | |
293 | ||
294 | # If no sequence is frequent enough to pass the J test, give up now | |
295 | if(nrow(gpm) < 1) { | |
296 | df_run$NOTE[1] = "Plurality sequence too rare." | |
297 | if(mut_mins[1] == mut_min){ | |
298 | return(df_run) | |
299 | } else { | |
300 | next | |
301 | } | |
302 | } | |
303 | ||
304 | # Add a mutation count column and filter out sequences not in our range | |
305 | db_subset_mm = mutationRangeSubset(db_subset, germline, | |
306 | mut_min:mut_max, pos_range) | |
307 | df_run$MUT_PASS_COUNT[1] <- nrow(db_subset_mm) | |
308 | ||
309 | if(nrow(db_subset_mm) < min_seqs){ | |
310 | df_run$NOTE[1] = paste0("Insufficient sequences (",nrow(db_subset_mm),") in desired mutational range.") | |
311 | if(mut_mins[1] == mut_min){ | |
312 | return(df_run) | |
313 | } else { | |
314 | next | |
315 | } | |
316 | } | |
317 | ||
318 | # Duplicate each sequence for all the positions to be analyzed | |
319 | # and find which positions are mutated | |
320 | pos_db = positionMutations(db_subset_mm, germline, pos_range) | |
321 | ||
322 | # Find positional mut freq vs seq mut count | |
323 | pos_muts = pos_db %>% | |
324 | dplyr::group_by_(~POSITION) %>% | |
325 | dplyr::mutate_(PASS = ~mean(OBSERVED) >= min_frac) %>% | |
326 | dplyr::group_by_(~MUT_COUNT, ~POSITION) %>% | |
327 | dplyr::summarise_(POS_MUT_RATE = ~ mean(MUTATED)*unique(PASS) ) %>% | |
328 | dplyr::ungroup() | |
329 | ||
330 | rm(pos_db) | |
331 | gc() | |
332 | ||
333 | # Calculate y intercepts, find which pass the test | |
334 | pass_y = pos_muts %>% | |
335 | dplyr::group_by_(~POSITION) %>% | |
336 | dplyr::summarise_(Y_INT_MIN = ~findLowerY(POS_MUT_RATE, MUT_COUNT, | |
337 | mut_min, alpha)) %>% | |
338 | dplyr::filter_(~Y_INT_MIN > y_intercept) | |
339 | ||
340 | df_run$Y_INTERCEPT_PASS[1] <- nrow(pass_y) | |
341 | ||
342 | if(nrow(pass_y) < 1){ | |
343 | df_run$NOTE[1] = "No positions pass y-intercept test." | |
344 | if(mut_mins[1] == mut_min){ | |
345 | return(df_run) | |
346 | } else { | |
347 | next | |
348 | } | |
349 | } | |
350 | ||
351 | gl_substring = superSubstring(germline, pass_y$POSITION) | |
352 | gl_minus_substring = insertPolymorphisms(germline, pass_y$POSITION, | |
353 | rep("N", nrow(pass_y))) | |
354 | ||
355 | # Find the potential SNP positions and remove anything that matches | |
356 | # the germline at all those positions or any combo that is too rare | |
357 | db_y_subset_mm = db_subset_mm %>% | |
358 | dplyr::group_by(1:n()) %>% | |
359 | dplyr::mutate_(SNP_STRING = ~superSubstring(SEQUENCE_IMGT, | |
360 | pass_y$POSITION)) %>% | |
361 | dplyr::filter_(~SNP_STRING != gl_substring) %>% | |
362 | dplyr::group_by_(~SNP_STRING) %>% | |
363 | dplyr::mutate_(STRING_COUNT = ~n()) %>% | |
364 | dplyr::filter_(~STRING_COUNT >= min_seqs) | |
365 | ||
366 | df_run$SNP_PASS[1] <- nrow(db_y_subset_mm) | |
367 | ||
368 | if (nrow(db_y_subset_mm) < 1 ){ | |
369 | df_run$NOTE[1] = paste("Position(s) passed y-intercept (", | |
370 | paste(pass_y$POSITION, collapse = ","), | |
371 | ") but the plurality sequence is too rare.", | |
372 | sep="") | |
373 | if(mut_mins[1] == mut_min){ | |
374 | return(df_run) | |
375 | } else { | |
376 | next | |
377 | } | |
378 | } | |
379 | ||
380 | # Get mutation count at all positions that are not potential SNPs | |
381 | pads = paste(rep("-", min(pos_range)-1), collapse="") | |
382 | db_y_subset_mm$MUT_COUNT_MINUS_SUBSTRING = db_y_subset_mm$SEQUENCE_IMGT %>% | |
383 | substring(min(pos_range), max(pos_range)) %>% | |
384 | paste(pads, ., sep="") %>% | |
385 | getMutatedPositions(gl_minus_substring) %>% | |
386 | sapply(length) | |
387 | ||
388 | # Keep only unmutated sequences and then find the counts of J and | |
389 | # junction length for each of the SNP strings, and then check to | |
390 | # see which pass the j/junction and count requirements | |
391 | db_y_summary0 = db_y_subset_mm %>% | |
392 | dplyr::filter_(~MUT_COUNT_MINUS_SUBSTRING == 0) | |
393 | ||
394 | df_run$UNMUTATED_COUNT[1] <- nrow(db_y_summary0) | |
395 | ||
396 | db_y_summary0 <- db_y_summary0 %>% | |
397 | dplyr::mutate_(J_GENE = ~getGene(J_CALL)) %>% | |
398 | dplyr::group_by_(~SNP_STRING, ~J_GENE, ~JUNCTION_LENGTH) %>% | |
399 | dplyr::summarise_(COUNT = ~n()) | |
400 | ||
401 | df_run$UNMUTATED_SNP_J_GENE_LENGTH_COUNT[1] <- nrow(db_y_summary0) | |
402 | ||
403 | db_y_summary0 <- db_y_summary0 %>% | |
404 | dplyr::group_by_(~SNP_STRING) %>% | |
405 | dplyr::mutate_(FRACTION = ~COUNT/sum(COUNT)) %>% | |
406 | dplyr::summarise_(TOTAL_COUNT = ~sum(COUNT), MAX_FRAC = ~max(FRACTION)) | |
407 | ||
408 | if(nrow(db_y_summary0) < 1){ | |
409 | df_run$NOTE[1] = paste("Position(s) passed y-intercept (", | |
410 | paste(pass_y$POSITION, collapse = ","), | |
411 | ") but no unmutated versions of novel allele", | |
412 | " found.", sep="") | |
413 | if(mut_mins[1] == mut_min){ | |
414 | return(df_run) | |
415 | } else { | |
416 | next | |
417 | } | |
418 | } | |
419 | ||
420 | # db_y_summary = db_y_summary0 %>% | |
421 | # filter_(~TOTAL_COUNT >= min_seqs & MAX_FRAC <= j_max) | |
422 | ||
423 | min_seqs_pass <- db_y_summary0$TOTAL_COUNT >= min_seqs | |
424 | j_max_pass <- db_y_summary0$MAX_FRAC <= j_max | |
425 | ||
426 | db_y_summary <- db_y_summary0[min_seqs_pass & j_max_pass, , drop=FALSE] | |
427 | ||
428 | df_run$SNP_MIN_SEQS_J_MAX_PASS[1] <- nrow(db_y_summary) | |
429 | ||
430 | if(nrow(db_y_summary) < 1){ | |
431 | msg <- c(NA, NA) | |
432 | names(msg) <- c("j_max", "min_seqs") | |
433 | ||
434 | if (sum(min_seqs_pass) == 0) { | |
435 | msg['min_seqs'] <- paste0("Not enough sequences (maximum total count is ", | |
436 | max(db_y_summary0$TOTAL_COUNT), | |
437 | ").") | |
438 | } | |
439 | ||
440 | if (sum(j_max_pass) == 0) { | |
441 | msg['j_max'] <- paste0("A J-junction combination is too prevalent (", | |
442 | round(100*max(db_y_summary0$MAX_FRAC),1),"% of sequences).") | |
443 | } | |
444 | ||
445 | msg <- paste(na.omit(msg), collapse=" and ") | |
446 | df_run$NOTE[1] = paste("Position(s) passed y-intercept (", | |
447 | paste(pass_y$POSITION, collapse = ","), | |
448 | ") but ", | |
449 | msg,".", sep="") | |
450 | df_run$PERFECT_MATCH_COUNT[1] = max(db_y_summary0$TOTAL_COUNT) | |
451 | df_run$PERFECT_MATCH_FREQ[1] <- df_run$PERFECT_MATCH_COUNT[1]/df_run$GERMLINE_CALL_COUNT[1] | |
452 | if(mut_mins[1] == mut_min){ | |
453 | return(df_run) | |
454 | } else { | |
455 | next | |
456 | } | |
457 | } | |
458 | ||
459 | germ_nts = unlist(strsplit(gl_substring,"")) | |
460 | for (r in 1:nrow(db_y_summary)) { | |
461 | if (r > 1){ | |
462 | df_run = dplyr::bind_rows(df_run[1,], df_run) | |
463 | } | |
464 | # Create the new germline | |
465 | snp_nts = unlist(strsplit(db_y_summary$SNP_STRING[r],"")) | |
466 | remain_mut = db_y_summary$SNP_STRING[r] %>% | |
467 | getMutatedPositions(gl_substring) %>% | |
468 | unlist() %>% | |
469 | unique() | |
470 | germ = insertPolymorphisms(germline, pass_y$POSITION, snp_nts) | |
471 | is_known_allele <- germ == germlines | |
472 | if (sum(is_known_allele) == 0 ) { | |
473 | names(germ) = mapply(paste, germ_nts[remain_mut], | |
474 | pass_y$POSITION[remain_mut], | |
475 | snp_nts[remain_mut], sep="") %>% | |
476 | paste(collapse="_") %>% | |
477 | paste(names(germline), ., sep="_") | |
478 | } else { | |
479 | # If the match is with duplicated sequences in the reference germlines, | |
480 | # use the first | |
481 | known_allele_names <- sortAlleles(names(germlines)[is_known_allele], | |
482 | method="position") | |
483 | names(germ) = known_allele_names[1] | |
484 | } | |
485 | # Save the new germline to our data frame | |
486 | df_run$POLYMORPHISM_CALL[1] = names(germ) | |
487 | df_run$NOVEL_IMGT[1] = as.character(germ) | |
488 | df_run$PERFECT_MATCH_COUNT[1] = db_y_summary$TOTAL_COUNT[r] | |
489 | df_run$PERFECT_MATCH_FREQ[1] <- df_run$PERFECT_MATCH_COUNT[1]/df_run$GERMLINE_CALL_COUNT[1] | |
490 | df_run$NOTE[1] = "Novel allele found!" | |
491 | } | |
492 | ||
493 | } # end for each starting mutation counts | |
494 | return(df_run) | |
495 | ||
496 | } # end foreach allele | |
497 | ||
498 | if(nproc > 1) { stopCluster(cluster) } | |
499 | out_df <- dplyr::bind_rows(out_list) | |
500 | getMuSpec <- function(poly_call, germ_call) { | |
501 | sapply(1:length(poly_call), function(i){ | |
502 | p <- gsub(germ_call[i], "", poly_call[i], fixed = T) | |
503 | p <- strsplit(p,"_")[[1]][-1] | |
504 | m <- gsub("([[:alpha:]])([[:digit:]]*)([[:alpha:]])", "\\2\\1>\\3", p) | |
505 | paste(m, collapse=",") | |
506 | }) | |
507 | } | |
508 | ||
509 | # The number of records in the sequence dataset matching | |
510 | # each exact NOVEL_IMGT sequence | |
511 | getDbMatch <- function(novel_imgt) { | |
512 | sapply(novel_imgt, function(n) { | |
513 | n <- substr(n, min(pos_range), max(pos_range)) | |
514 | sum(grepl(gsub("[-\\.]","",n), | |
515 | gsub("[-\\.]","",data$SEQUENCE_IMGT))) | |
516 | }) | |
517 | } | |
518 | ||
519 | # The number of distinct J in the sequence dataset associated | |
520 | # with the exact NOVEL_IMGT sequence | |
521 | getNumJ <- function(novel_imgt) { | |
522 | sapply(novel_imgt, function(n) { | |
523 | n <- substr(n, min(pos_range), max(pos_range)) | |
524 | imgt_idx <- grepl(gsub("[-\\.]","",n), | |
525 | gsub("[-\\.]","",data$SEQUENCE_IMGT)) | |
526 | length(unique(getGene(data[['J_CALL']][imgt_idx]))) | |
527 | }) | |
528 | } | |
529 | ||
530 | ||
531 | # The number of distinct CDR3 in the sequence dataset associated | |
532 | # with the exact NOVEL_IMGT sequence | |
533 | getNumCDR3 <- function(novel_imgt) { | |
534 | sapply(novel_imgt, function(n) { | |
535 | n <- substr(n, min(pos_range), max(pos_range)) | |
536 | imgt_idx <- grepl(gsub("[-\\.]","",n), | |
537 | gsub("[-\\.]","",data$SEQUENCE_IMGT)) | |
538 | seq <- data[['JUNCTION']][imgt_idx] | |
539 | seq <- substr(seq, 4, stringi::stri_length(seq) - 3) | |
540 | length(unique(seq)) | |
541 | }) | |
542 | } | |
543 | ||
544 | idx <- which(!is.na(out_df$NOVEL_IMGT)) | |
545 | if (length(idx)>0) { | |
546 | out_df$NT_SUBSTITUTIONS[idx] <- getMuSpec(out_df$POLYMORPHISM_CALL[idx], | |
547 | out_df$GERMLINE_CALL[idx]) | |
548 | out_df$NOVEL_IMGT_COUNT[idx] <- getDbMatch(out_df$NOVEL_IMGT[idx]) | |
549 | out_df$NOVEL_IMGT_UNIQUE_J[idx] <- getNumJ(out_df$NOVEL_IMGT[idx]) | |
550 | if ("JUNCTION" %in% colnames(data)) { | |
551 | out_df$NOVEL_IMGT_UNIQUE_CDR3[idx] <- getNumCDR3(out_df$NOVEL_IMGT[idx]) | |
351 | 552 | } |
352 | ||
353 | msg <- paste(na.omit(msg), collapse=" and ") | |
354 | df_run$NOTE[1] = paste("Position(s) passed y-intercept (", | |
355 | paste(pass_y$POSITION, collapse = ","), | |
356 | ") but ", | |
357 | msg,".", sep="") | |
358 | df_run$PERFECT_MATCH_COUNT[1] = max(db_y_summary0$TOTAL_COUNT) | |
359 | if(mut_mins[1] == mut_min){ | |
360 | return(df_run) | |
361 | } else { | |
362 | next | |
363 | } | |
364 | } | |
365 | ||
366 | germ_nts = unlist(strsplit(gl_substring,"")) | |
367 | for (r in 1:nrow(db_y_summary)) { | |
368 | if (r > 1){ | |
369 | df_run = dplyr::bind_rows(df_run[1,], df_run) | |
370 | } | |
371 | # Create the new germline | |
372 | snp_nts = unlist(strsplit(db_y_summary$SNP_STRING[r],"")) | |
373 | remain_mut = db_y_summary$SNP_STRING[r] %>% | |
374 | getMutatedPositions(gl_substring) %>% | |
375 | unlist() %>% | |
376 | unique() | |
377 | germ = insertPolymorphisms(germline, pass_y$POSITION, snp_nts) | |
378 | names(germ) = mapply(paste, germ_nts[remain_mut], | |
379 | pass_y$POSITION[remain_mut], | |
380 | snp_nts[remain_mut], sep="") %>% | |
381 | paste(collapse="_") %>% | |
382 | paste(names(germline), ., sep="_") | |
383 | # Save the new germline to our data frame | |
384 | df_run$POLYMORPHISM_CALL[1] = names(germ) | |
385 | df_run$NOVEL_IMGT[1] = as.character(germ) | |
386 | df_run$PERFECT_MATCH_COUNT[1] = db_y_summary$TOTAL_COUNT[r] | |
387 | df_run$NOTE[1] = "Novel allele found!" | |
388 | } | |
389 | ||
390 | } # end for each starting mutation counts | |
391 | return(df_run) | |
392 | ||
393 | } # end foreach allele | |
394 | ||
395 | if(nproc > 1) { stopCluster(cluster) } | |
396 | rm(clip_db) | |
397 | gc() | |
398 | out_df <- dplyr::bind_rows(out_list) | |
399 | return(out_df) | |
553 | } | |
554 | out_df$GERMLINE_IMGT_COUNT <- getDbMatch(out_df$GERMLINE_IMGT) | |
555 | out_df$UNMUTATED_FREQ = out_df$UNMUTATED_COUNT/out_df$GERMLINE_CALL_COUNT | |
556 | rm(data) | |
557 | gc() | |
558 | ||
559 | return(out_df) | |
400 | 560 | } |
401 | 561 | |
402 | 562 | #' Select rows containing novel alleles |
404 | 564 | #' \code{selectNovel} takes the result from \link{findNovelAlleles} and |
405 | 565 | #' selects only the rows containing unique, novel alleles. |
406 | 566 | #' |
407 | #' @param novel_df A \code{data.frame} of the type returned by | |
408 | #' \link{findNovelAlleles} | |
409 | #' @param keep_alleles A \code{logical} indicating if different alleles | |
410 | #' leading to the same novel sequence should be kept. | |
411 | #' See details. | |
412 | #' | |
413 | #' @details If, for instance, subject has in his genome IGHV1-2*02 and a novel | |
414 | #' allele equally close to IGHV1-2*02 and IGHV1-2*05, the novel allele may be | |
567 | #' @details | |
568 | #' If, for instance, a subject has in his genome \code{IGHV1-2*02} and a novel | |
569 | #' allele equally close to \code{IGHV1-2*02} and \code{IGHV1-2*05}, the novel allele may be | |
415 | 570 | #' detected by analyzing sequences that best align to either of these alleles. |
416 | 571 | #' If \code{keep_alleles} is \code{TRUE}, both polymorphic allele calls will |
417 | 572 | #' be retained. In the case that multiple mutation ranges are checked for the |
418 | 573 | #' same allele, only one mutation range will be kept in the output. |
574 | #' | |
575 | #' @param novel a \code{data.frame} of the type returned by | |
576 | #' \link{findNovelAlleles}. | |
577 | #' @param keep_alleles a \code{logical} indicating if different alleles | |
578 | #' leading to the same novel sequence should be kept. | |
579 | #' See Details. | |
419 | 580 | #' |
420 | 581 | #' @return A \code{data.frame} containing only unique, novel alleles (if any) |
421 | 582 | #' that were in the input. |
422 | 583 | #' |
423 | 584 | #' @examples |
424 | #' data(novel_df) | |
425 | #' novel = selectNovel(novel_df) | |
585 | #' novel <- selectNovel(SampleNovel) | |
426 | 586 | #' |
427 | 587 | #' @export |
428 | selectNovel <- function(novel_df, keep_alleles=FALSE) { | |
588 | selectNovel <- function(novel, keep_alleles=FALSE) { | |
429 | 589 | # Remove non-novel rows |
430 | novel_df = filter_(novel_df, ~!is.na(NOVEL_IMGT)) | |
431 | ||
590 | novel = filter_(novel, ~!is.na(NOVEL_IMGT)) | |
591 | ||
432 | 592 | if (keep_alleles) { |
433 | novel_df = novel_df %>% | |
593 | novel = novel %>% | |
434 | 594 | group_by_(~GERMLINE_CALL) |
435 | 595 | } |
436 | novel = novel_df %>% | |
596 | novel_set = novel %>% | |
437 | 597 | distinct_(~NOVEL_IMGT, .keep_all=TRUE) %>% |
438 | 598 | ungroup() |
439 | ||
440 | return(novel) | |
599 | ||
600 | return(novel_set) | |
441 | 601 | } |
442 | 602 | |
443 | 603 | #' Visualize evidence of novel V alleles |
444 | 604 | #' |
445 | 605 | #' \code{plotNovel} is used to visualize the evidence of any novel V |
446 | #' alleles found using \link{findNovelAlleles}. | |
447 | #' | |
448 | #' @param clip_db a \code{data.frame} in Change-O format. See | |
606 | #' alleles found using \link{findNovelAlleles}. It can also be used to | |
607 | #' visualize the results for alleles that did not produce a novel allele call. | |
608 | #' | |
609 | #' @details | |
610 | #' The first panel in the plot shows, for all sequences which align to a particular | |
611 | #' germline allele, the mutation frequency at each position along the aligned | |
612 | #' sequence as a function of the sequence-wide mutation count. Sequences that pass | |
613 | #' the novel allele test are colored red, while sequences that don't pass | |
614 | #' the test are colored yellow. The second panel shows the nucleotide usage at the | |
615 | #' positions as a function of sequence-wide mutation count. | |
616 | #' | |
617 | #' To avoid cases where a clonal expansion might lead to a false positive, tigger examines | |
618 | #' the combinations of J gene and junction length among sequences which perfectly | |
619 | #' match the proposed germline allele. | |
620 | #' | |
621 | #' @param data a \code{data.frame} in Change-O format. See | |
449 | 622 | #' \link{findNovelAlleles} for details. |
450 | #' @param novel_df_row a single row from a data frame as output by | |
623 | #' @param novel_row a single row from a data frame as output by | |
451 | 624 | #' \link{findNovelAlleles} that contains a |
452 | 625 | #' polymorphism-containing germline allele |
626 | #' @param v_call name of the column in \code{data} with V allele | |
627 | #' calls. Default is "V_CALL". | |
453 | 628 | #' @param ncol number of columns to use when laying out the plots |
454 | #' @param v_call name of the column in \code{clip_db} with V allele | |
455 | #' calls. Default is "V_CALL" | |
456 | #' @return NULL | |
457 | 629 | #' |
458 | 630 | #' @examples |
459 | #' # Load example data and germlines | |
460 | #' data(sample_db) | |
461 | #' data(germline_ighv) | |
462 | #' | |
463 | #' # Find novel alleles and return relevant data | |
464 | #' \dontrun{novel_df = findNovelAlleles(sample_db, germline_ighv)} | |
465 | #' data(novel_df) | |
466 | 631 | #' # Plot the evidence for the first (and only) novel allele in the example data |
467 | #' novel = selectNovel(novel_df) | |
468 | #' plotNovel(sample_db, novel[1,]) | |
632 | #' novel <- selectNovel(SampleNovel) | |
633 | #' plotNovel(SampleDb, novel[1, ]) | |
469 | 634 | #' |
470 | 635 | #' @export |
471 | plotNovel <- function(clip_db, novel_df_row, ncol = 1, v_call="V_CALL"){ | |
472 | . = NULL | |
473 | ||
474 | # Use the data frame | |
475 | if(length(novel_df_row) > 0){ | |
476 | if(is.data.frame(novel_df_row) & nrow(novel_df_row) == 1){ | |
477 | pos_range = novel_df_row$POS_MIN:novel_df_row$POS_MAX | |
478 | germline = novel_df_row$GERMLINE_IMGT | |
479 | names(germline) = novel_df_row$GERMLINE_CALL | |
480 | mut_range = novel_df_row$MUT_MIN[1]:novel_df_row$MUT_MAX[1] | |
481 | novel_imgt = novel_df_row$NOVEL_IMGT | |
482 | names(novel_imgt) = novel_df_row$POLYMORPHISM_CALL | |
483 | min_frac = novel_df_row$MIN_FRAC | |
484 | note = novel_df_row$NOTE | |
636 | plotNovel <- function(data, novel_row, v_call="V_CALL", ncol=1) { | |
637 | . = NULL | |
638 | ||
639 | # Use the data frame | |
640 | if(length(novel_row) > 0) { | |
641 | if(is.data.frame(novel_row) & nrow(novel_row) == 1) { | |
642 | pos_range = novel_row$POS_MIN:novel_row$POS_MAX | |
643 | germline = novel_row$GERMLINE_IMGT | |
644 | names(germline) = novel_row$GERMLINE_CALL | |
645 | mut_range = novel_row$MUT_MIN[1]:novel_row$MUT_MAX[1] | |
646 | novel_imgt = novel_row$NOVEL_IMGT | |
647 | names(novel_imgt) = novel_row$POLYMORPHISM_CALL | |
648 | min_frac = novel_row$MIN_FRAC | |
649 | note = novel_row$NOTE | |
650 | } else { | |
651 | stop("novel_row is not a data frame with only one row.") | |
652 | } | |
653 | } | |
654 | ||
655 | germline = cleanSeqs(germline) | |
656 | data$SEQUENCE_IMGT = cleanSeqs(data$SEQUENCE_IMGT) | |
657 | ||
658 | # Extract sequences assigned to the germline, determine which | |
659 | # have an appropriate range of mutations, and find the mutation | |
660 | # frequency of each position | |
661 | db_subset = data %>% | |
662 | select_(~SEQUENCE_IMGT, v_call, ~J_CALL, ~JUNCTION_LENGTH) %>% | |
663 | filter_(~grepl(names(germline), data[[v_call]], fixed=TRUE)) | |
664 | pos_db = db_subset %>% | |
665 | mutationRangeSubset(germline, mut_range, pos_range) | |
666 | if (nrow(pos_db) == 0) { | |
667 | warning(paste0("Insufficient sequences (",nrow(pos_db),") in desired mutational range.")) | |
668 | return (invisible(NULL)) | |
669 | } | |
670 | pos_db <- pos_db %>% | |
671 | positionMutations(germline, pos_range) | |
672 | pos_muts = pos_db %>% | |
673 | group_by_(~POSITION) %>% | |
674 | mutate_(PASS = ~mean(OBSERVED) >= min_frac) %>% | |
675 | group_by_(~MUT_COUNT, ~POSITION) %>% | |
676 | summarise_(POS_MUT_RATE = ~mean(MUTATED)*unique(PASS) ) %>% | |
677 | ungroup() | |
678 | ||
679 | # Label the polymorphic positions as such | |
680 | pass_y = unlist(strsplit(names(novel_imgt), "_"))[-1] %>% | |
681 | gsub("[^0-9]", "", .) %>% | |
682 | as.numeric() | |
683 | p_y_f = unlist(strsplit(names(novel_imgt), "_"))[-1] %>% | |
684 | gsub("[0-9]+.", "", .) | |
685 | p_y_t = unlist(strsplit(names(novel_imgt), "_"))[-1] %>% | |
686 | gsub(".[0-9]+", "", .) | |
687 | # Parse the note to find positions that passed y intercept if no novel found | |
688 | if(length(pass_y) == 0 & grepl("Position\\(s\\) passed y-intercept", note)){ | |
689 | pass_y = note %>% gsub("Position\\(s\\) passed y-intercept \\(", "", .) %>% | |
690 | gsub("\\).*", "", .) %>% strsplit(",") %>% unlist %>% as.numeric | |
691 | p_y_f = sapply(pass_y, function (x) substring(germline, x, x)) | |
692 | p_y_t = gsub(".", "?", p_y_f) | |
693 | } | |
694 | ||
695 | to_from = paste(paste("Position", pass_y), paste(paste(p_y_f, "->"), p_y_t)) | |
696 | names(to_from) = pass_y | |
697 | pos_muts = pos_muts %>% | |
698 | mutate_(Polymorphic = ~ifelse(POSITION %in% pass_y, "True", "False")) | |
699 | ||
700 | pads = paste(rep("-", min(pos_range)-1), collapse="") | |
701 | db_subset$MUT_COUNT_NOVEL = db_subset$SEQUENCE_IMGT %>% | |
702 | substring(min(pos_range), max(pos_range)) %>% | |
703 | paste(pads, ., sep="") %>% | |
704 | getMutatedPositions(novel_imgt) %>% | |
705 | sapply(length) | |
706 | db_subset = db_subset %>% | |
707 | filter_(~MUT_COUNT_NOVEL == 0) %>% | |
708 | mutate_(J_GENE = ~getGene(J_CALL)) | |
709 | if (nrow(db_subset) == 0) { | |
710 | warning(paste0("Insufficient sequences (",nrow(db_subset),") with MUT_COUNT_NOVEL == 0.")) | |
711 | return (invisible(NULL)) | |
712 | } | |
713 | db_subset$JUNCTION_LENGTH = db_subset$JUNCTION_LENGTH %>% | |
714 | factor(levels=min(db_subset$JUNCTION_LENGTH):max(db_subset$JUNCTION_LENGTH)) | |
715 | pos_muts$Polymorphic = pos_muts$Polymorphic %>% | |
716 | factor(levels = c("False", "True")) | |
717 | pos_db$NT = pos_db$NT %>% | |
718 | factor(levels = names(DNA_COLORS)) | |
719 | pos_muts$GERMLINE = names(germline) | |
720 | ||
721 | # MAKE THE FIRST PLOT | |
722 | if(!is.na(novel_imgt)){ | |
723 | POLYCOLORS = setNames(DNA_COLORS[c(4,3)], c("False", "True")) | |
724 | p1 = ggplot(pos_muts, aes_(~factor(MUT_COUNT), ~POS_MUT_RATE, group=~POSITION, | |
725 | color=~Polymorphic)) + | |
726 | geom_line(size = 0.75) + | |
727 | facet_grid(GERMLINE ~ .) + | |
728 | scale_color_manual(values = POLYCOLORS) + | |
729 | ylim(0,1) + | |
730 | xlab("Mutation Count (Sequence)") + | |
731 | ylab("Mutation Frequency (Position)") + | |
732 | theme_bw() + | |
733 | theme(legend.position=c(0.5,0.9), legend.justification=c(0.5,1), | |
734 | legend.background=element_rect(fill = "transparent")) + | |
735 | guides(color = guide_legend(ncol = 2, reverse = TRUE)) | |
736 | } else{ | |
737 | POLYCOLORS = setNames(DNA_COLORS[c(4,2)], c("False", "True")) | |
738 | p1 = ggplot(pos_muts, aes_(~factor(MUT_COUNT), ~POS_MUT_RATE, group=~POSITION, | |
739 | color=~Polymorphic)) + | |
740 | geom_line(size = 0.75) + | |
741 | facet_grid(GERMLINE ~ .) + | |
742 | scale_color_manual(values = POLYCOLORS) + | |
743 | ylim(0,1) + | |
744 | xlab("Mutation Count (Sequence)") + | |
745 | ylab("Mutation Frequency (Position)") + | |
746 | theme_bw() + | |
747 | theme(legend.position=c(0.5,0.9), legend.justification=c(0.5,1), | |
748 | legend.background=element_rect(fill = "transparent")) + | |
749 | guides(color = guide_legend("Passed y-intercept test", | |
750 | ncol = 2, reverse = TRUE)) | |
751 | } | |
752 | # MAKE THE SECOND PLOT | |
753 | p2_data = mutate_(filter_(pos_db, ~POSITION %in% pass_y), | |
754 | POSITION = ~to_from[as.character(POSITION)]) | |
755 | if (nrow(p2_data)) { | |
756 | p2 = ggplot(p2_data, | |
757 | aes_(~factor(MUT_COUNT), fill=~NT)) + | |
758 | geom_bar(width=0.9) + | |
759 | guides(fill = guide_legend("Nucleotide", ncol = 4)) + | |
760 | facet_grid(POSITION ~ .) + | |
761 | xlab("Mutation Count (Sequence)") + ylab("Sequence Count") + | |
762 | scale_fill_manual(values = DNA_COLORS, breaks=names(DNA_COLORS), | |
763 | drop=FALSE) + | |
764 | theme_bw() + | |
765 | theme(legend.position=c(1,1), legend.justification=c(1,1), | |
766 | legend.background=element_rect(fill = "transparent")) | |
485 | 767 | } else { |
486 | stop("novel_df_row is not a data frame with only one row.") | |
487 | } | |
488 | } | |
489 | ||
490 | germline = cleanSeqs(germline) | |
491 | clip_db$SEQUENCE_IMGT = cleanSeqs(clip_db$SEQUENCE_IMGT) | |
492 | ||
493 | # Extract sequences assigned to the germline, determine which | |
494 | # have an appropriate range of mutations, and find the mutation | |
495 | # frequency of each position | |
496 | db_subset = clip_db %>% | |
497 | select_(~SEQUENCE_IMGT, v_call, ~J_CALL, ~JUNCTION_LENGTH) %>% | |
498 | filter_(~grepl(names(germline), clip_db[[v_call]], fixed=TRUE)) | |
499 | pos_db = db_subset %>% | |
500 | mutationRangeSubset(germline, mut_range, pos_range) | |
501 | if (nrow(pos_db) == 0) { | |
502 | warning("Insufficient sequences in desired mutational range") | |
503 | return (invisible(NULL)) | |
504 | } | |
505 | pos_db <- pos_db %>% | |
506 | positionMutations(germline, pos_range) | |
507 | pos_muts = pos_db %>% | |
508 | group_by_(~POSITION) %>% | |
509 | mutate_(PASS = ~mean(OBSERVED) >= min_frac) %>% | |
510 | group_by_(~MUT_COUNT, ~POSITION) %>% | |
511 | summarise_(POS_MUT_RATE = ~mean(MUTATED)*unique(PASS) ) %>% | |
512 | ungroup() | |
513 | ||
514 | # Label the polymorphic positions as such | |
515 | pass_y = unlist(strsplit(names(novel_imgt), "_"))[-1] %>% | |
516 | gsub("[^0-9]", "", .) %>% | |
517 | as.numeric() | |
518 | p_y_f = unlist(strsplit(names(novel_imgt), "_"))[-1] %>% | |
519 | gsub("[0-9]+.", "", .) | |
520 | p_y_t = unlist(strsplit(names(novel_imgt), "_"))[-1] %>% | |
521 | gsub(".[0-9]+", "", .) | |
522 | # Parse the note to find positions that passed y intercept if no novel found | |
523 | if(length(pass_y) == 0 & grepl("Position\\(s\\) passed y-intercept", note)){ | |
524 | pass_y = note %>% gsub("Position\\(s\\) passed y-intercept \\(", "", .) %>% | |
525 | gsub("\\).*", "", .) %>% strsplit(",") %>% unlist %>% as.numeric | |
526 | p_y_f = sapply(pass_y, function (x) substring(germline, x, x)) | |
527 | p_y_t = gsub(".", "?", p_y_f) | |
528 | } | |
529 | ||
530 | to_from = paste(paste("Position", pass_y), paste(paste(p_y_f, "->"), p_y_t)) | |
531 | names(to_from) = pass_y | |
532 | pos_muts = pos_muts %>% | |
533 | mutate_(Polymorphic = ~ifelse(POSITION %in% pass_y, "True", "False")) | |
534 | ||
535 | pads = paste(rep("-", min(pos_range)-1), collapse="") | |
536 | db_subset$MUT_COUNT_NOVEL = db_subset$SEQUENCE_IMGT %>% | |
537 | substring(min(pos_range), max(pos_range)) %>% | |
538 | paste(pads, ., sep="") %>% | |
539 | getMutatedPositions(novel_imgt) %>% | |
540 | sapply(length) | |
541 | db_subset = db_subset %>% | |
542 | filter_(~MUT_COUNT_NOVEL == 0) %>% | |
543 | mutate_(J_GENE = ~getGene(J_CALL)) | |
544 | db_subset$JUNCTION_LENGTH = db_subset$JUNCTION_LENGTH %>% | |
545 | factor(levels=min(db_subset$JUNCTION_LENGTH):max(db_subset$JUNCTION_LENGTH)) | |
546 | pos_muts$Polymorphic = pos_muts$Polymorphic %>% | |
547 | factor(levels = c("False", "True")) | |
548 | pos_db$NT = pos_db$NT %>% | |
549 | factor(levels = names(DNA_COLORS)) | |
550 | pos_muts$GERMLINE = names(germline) | |
551 | ||
552 | # MAKE THE FIRST PLOT | |
553 | if(!is.na(novel_imgt)){ | |
554 | POLYCOLORS = setNames(DNA_COLORS[c(4,3)], c("False", "True")) | |
555 | p1 = ggplot(pos_muts, aes_(~factor(MUT_COUNT), ~POS_MUT_RATE, group=~POSITION, | |
556 | color=~Polymorphic)) + | |
557 | geom_line(size = 0.75) + | |
558 | facet_grid(GERMLINE ~ .) + | |
559 | scale_color_manual(values = POLYCOLORS) + | |
560 | ylim(0,1) + | |
561 | xlab("Mutation Count (Sequence)") + | |
562 | ylab("Mutation Frequency (Position)") + | |
563 | theme_bw() + | |
564 | theme(legend.position=c(0.5,0.9), legend.justification=c(0.5,1), | |
565 | legend.background=element_rect(fill = "transparent")) + | |
566 | guides(color = guide_legend(ncol = 2, reverse = TRUE)) | |
567 | } else{ | |
568 | POLYCOLORS = setNames(DNA_COLORS[c(4,2)], c("False", "True")) | |
569 | p1 = ggplot(pos_muts, aes_(~factor(MUT_COUNT), ~POS_MUT_RATE, group=~POSITION, | |
570 | color=~Polymorphic)) + | |
571 | geom_line(size = 0.75) + | |
572 | facet_grid(GERMLINE ~ .) + | |
573 | scale_color_manual(values = POLYCOLORS) + | |
574 | ylim(0,1) + | |
575 | xlab("Mutation Count (Sequence)") + | |
576 | ylab("Mutation Frequency (Position)") + | |
577 | theme_bw() + | |
578 | theme(legend.position=c(0.5,0.9), legend.justification=c(0.5,1), | |
579 | legend.background=element_rect(fill = "transparent")) + | |
580 | guides(color = guide_legend("Passed y-intercept test", | |
581 | ncol = 2, reverse = TRUE)) | |
582 | } | |
583 | # MAKE THE SECOND PLOT | |
584 | p2_data = mutate_(filter_(pos_db, ~POSITION %in% pass_y), | |
585 | POSITION = ~to_from[as.character(POSITION)]) | |
586 | if (nrow(p2_data)) { | |
587 | p2 = ggplot(p2_data, | |
588 | aes_(~factor(MUT_COUNT), fill=~NT)) + | |
589 | geom_bar(width=0.9) + | |
590 | guides(fill = guide_legend("Nucleotide", ncol = 4)) + | |
591 | facet_grid(POSITION ~ .) + | |
592 | xlab("Mutation Count (Sequence)") + ylab("Sequence Count") + | |
593 | scale_fill_manual(values = DNA_COLORS, breaks=names(DNA_COLORS), | |
594 | drop=FALSE) + | |
595 | theme_bw() + | |
596 | theme(legend.position=c(1,1), legend.justification=c(1,1), | |
597 | legend.background=element_rect(fill = "transparent")) | |
598 | } else { | |
599 | p2_data = mutate_(filter_(pos_db, | |
600 | ~POSITION %in% names(which.max(table(pos_db$POSITION)))), | |
601 | POSITION = ~"No positions pass y-intercept test.") | |
602 | p2 = ggplot(p2_data, aes_(~factor(MUT_COUNT))) + | |
603 | geom_bar(width=0.9) + | |
604 | facet_grid(POSITION ~ .) + | |
605 | xlab("Mutation Count (Sequence)") + ylab("Sequence Count") + | |
606 | theme_bw() + | |
607 | theme(legend.position=c(1,1), legend.justification=c(1,1), | |
608 | legend.background=element_rect(fill = "transparent")) | |
609 | } | |
610 | # MAKE THE THIRD PLOT | |
611 | p3 = ggplot(db_subset, aes_(~JUNCTION_LENGTH, fill=~factor(J_GENE))) + | |
612 | geom_bar(width=0.9) + | |
613 | guides(fill = guide_legend("J Gene", ncol = 2)) + | |
614 | xlab("Junction Length") + ylab("Unmutated Sequence Count") + | |
615 | theme_bw() + | |
616 | theme(legend.position=c(1,1), legend.justification=c(1,1), | |
617 | legend.background=element_rect(fill = "transparent")) | |
618 | ||
619 | p2_height = length(unique(p2_data$POSITION)) | |
620 | if (p2_height>1) { p2_height = 0.5 * p2_height} | |
621 | heights = c(1, p2_height, 1) | |
622 | multiplot(p1, p2, p3, cols = ncol, heights=heights) | |
623 | } | |
624 | ||
625 | #' Infer a subject-specific genotype | |
768 | p2_data = mutate_(filter_(pos_db, | |
769 | ~POSITION %in% names(which.max(table(pos_db$POSITION)))), | |
770 | POSITION = ~"No positions pass y-intercept test.") | |
771 | p2 = ggplot(p2_data, aes_(~factor(MUT_COUNT))) + | |
772 | geom_bar(width=0.9) + | |
773 | facet_grid(POSITION ~ .) + | |
774 | xlab("Mutation Count (Sequence)") + ylab("Sequence Count") + | |
775 | theme_bw() + | |
776 | theme(legend.position=c(1,1), legend.justification=c(1,1), | |
777 | legend.background=element_rect(fill = "transparent")) | |
778 | } | |
779 | # MAKE THE THIRD PLOT | |
780 | p3 = ggplot(db_subset, aes_(~JUNCTION_LENGTH, fill=~factor(J_GENE))) + | |
781 | geom_bar(width=0.9) + | |
782 | guides(fill = guide_legend("J Gene", ncol = 2)) + | |
783 | xlab("Junction Length") + ylab("Unmutated Sequence Count") + | |
784 | theme_bw() + | |
785 | theme(legend.position=c(1,1), legend.justification=c(1,1), | |
786 | legend.background=element_rect(fill = "transparent")) | |
787 | ||
788 | p2_height = length(unique(p2_data$POSITION)) | |
789 | if (p2_height>1) { p2_height = 0.5 * p2_height} | |
790 | heights = c(1, p2_height, 1) | |
791 | multiplot(p1, p2, p3, cols = ncol, heights=heights) | |
792 | } | |
793 | ||
794 | #' Infer a subject-specific genotype using a frequency method | |
626 | 795 | #' |
627 | #' \code{inferGenotype} infers an subject's genotype by finding the minimum | |
628 | #' number set of alleles that can explain the majority of each gene's calls. The | |
629 | #' most common allele of each gene is included in the genotype first, and the | |
630 | #' next most common allele is added until the desired fraction of alleles can be | |
631 | #' explained. In this way, mistaken allele calls (resulting from sequences which | |
796 | #' \code{inferGenotype} infers a subject's genotype using a frequency method. | |
797 | #' The genotype is inferred by finding the minimum set of alleles that | |
798 | #' can explain the majority of each gene's calls. The most common allele of | |
799 | #' each gene is included in the genotype first, and the next most common allele | |
800 | #' is added until the desired fraction of alleles can be explained. In this | |
801 | #' way, mistaken allele calls (resulting from sequences which | |
632 | 802 | #' by chance have been mutated to look like another allele) can be removed. |
633 | 803 | #' |
634 | #' @param clip_db a \code{data.frame} containing V allele | |
804 | #' @details | |
805 | #' Allele calls representing cases where multiple alleles have been | |
806 | #' assigned to a single sample sequence are rare among unmutated | |
807 | #' sequences but may result if nucleotides for certain positions are | |
808 | #' not available. Calls containing multiple alleles are treated as | |
809 | #' belonging to all groups. If \code{novel} is provided, all | |
810 | #' sequences that are assigned to the same starting allele as any | |
811 | #' novel germline allele will have the novel germline allele appended | |
812 | #' to their assignment prior to searching for unmutated sequences. | |
813 | #' | |
814 | #' @param data a \code{data.frame} containing V allele | |
635 | 815 | #' calls from a single subject. If |
636 | 816 | #' \code{find_unmutated} is \code{TRUE}, then |
637 | 817 | #' the sample IMGT-gapped V(D)J sequence should |
638 | #' @param v_call column in \code{clip_db} with V allele calls. | |
639 | #' Default is \code{"V_CALL"} | |
818 | #' @param germline_db named vector of sequences containing the | |
819 | #' germline sequences named in | |
820 | #' \code{allele_calls}. Only required if | |
821 | #' \code{find_unmutated} is \code{TRUE}. | |
822 | #' @param novel an optional \code{data.frame} of the type | |
823 | #' novel returned by | |
824 | #' \link{findNovelAlleles} containing | |
825 | #' germline sequences that will be utilized if | |
826 | #' \code{find_unmutated} is \code{TRUE}. See | |
827 | #' Details. | |
828 | #' @param v_call column in \code{data} with V allele calls. | |
829 | #' Default is \code{"V_CALL"}. | |
640 | 830 | #' be provided in a column \code{"SEQUENCE_IMGT"} |
641 | 831 | #' @param fraction_to_explain the portion of each gene that must be |
642 | 832 | #' explained by the alleles that will be included |
643 | #' in the genotype | |
833 | #' in the genotype. | |
644 | 834 | #' @param gene_cutoff either a number of sequences or a fraction of |
645 | 835 | #' the length of \code{allele_calls} denoting the |
646 | 836 | #' minimum number of times a gene must be |
647 | 837 | #' observed in \code{allele_calls} to be included |
648 | #' in the genotype | |
838 | #' in the genotype. | |
649 | 839 | #' @param find_unmutated if \code{TRUE}, use \code{germline_db} to |
650 | 840 | #' find which samples are unmutated. Not needed |
651 | 841 | #' if \code{allele_calls} only represent |
652 | 842 | #' unmutated samples. |
653 | #' @param germline_db named vector of sequences containing the | |
654 | #' germline sequences named in | |
655 | #' \code{allele_calls}. Only required if | |
656 | #' \code{find_unmutated} is \code{TRUE}. | |
657 | #' @param novel_df an optional \code{data.frame} of the type | |
658 | #' novel returned by | |
659 | #' \link{findNovelAlleles} containing | |
660 | #' germline sequences that will be utilized if | |
661 | #' \code{find_unmutated} is \code{TRUE}. See | |
662 | #' details. | |
663 | #' @details Allele calls representing cases where multiple alleles have been | |
664 | #' assigned to a single sample sequence are rare among unmutated | |
665 | #' sequences but may result if nucleotides for certain positions are | |
666 | #' not available. Calls containing multiple alleles are treated as | |
667 | #' belonging to all groups. If \code{novel_df} is provided, all | |
668 | #' sequences that are assigned to the same starting allele as any | |
669 | #' novel germline allele will have the novel germline allele appended | |
670 | #' to their assignent prior to searching for unmutated sequences. | |
671 | #' | |
672 | #' @return A table of alleles denoting the genotype of the subject | |
673 | #' | |
674 | #' @note This method works best with data derived from blood, where a large | |
675 | #' portion of sequences are expected to be unmutated. Ideally, there | |
676 | #' should be hundreds of allele calls per gene in the input. | |
677 | #' | |
678 | #' @examples | |
679 | #' # Infer the IGHV genotype, using only unmutated sequences, including any | |
680 | #' # novel alleles | |
681 | #' data(sample_db) | |
682 | #' data(germline_ighv) | |
683 | #' data(novel_df) | |
684 | #' inferGenotype(sample_db, find_unmutated = TRUE, germline_db = germline_ighv, | |
685 | #' novel_df = novel_df) | |
843 | #' | |
844 | #' @return | |
845 | #' A \code{data.frame} of alleles denoting the genotype of the subject containing | |
846 | #' the following columns: | |
847 | #' | |
848 | #' \itemize{ | |
849 | #' \item \code{GENE}: The gene name without allele. | |
850 | #' \item \code{ALLELES}: Comma separated list of alleles for the given \code{GENE}. | |
851 | #' \item \code{COUNTS}: Comma separated list of observed sequences for each | |
852 | #' corresponding allele in the \code{ALLELES} list. | |
853 | #' \item \code{TOTAL}: The total count of observed sequences for the given \code{GENE}. | |
854 | #' \item \code{NOTE}: Any comments on the inference. | |
855 | #' } | |
856 | #' | |
857 | #' @note | |
858 | #' This method works best with data derived from blood, where a large | |
859 | #' portion of sequences are expected to be unmutated. Ideally, there | |
860 | #' should be hundreds of allele calls per gene in the input. | |
686 | 861 | #' |
687 | 862 | #' @seealso \link{plotGenotype} for a colorful visualization and |
688 | 863 | #' \link{genotypeFasta} to convert the genotype to nucleotide sequences. |
864 | #' See \link{inferGenotypeBayesian} to infer a subject-specific genotype | |
865 | #' using a Bayesian approach. | |
866 | #' | |
867 | #' @examples | |
868 | #' # Infer IGHV genotype, using only unmutated sequences, including novel alleles | |
869 | #' inferGenotype(SampleDb, germline_db=GermlineIGHV, novel=SampleNovel, | |
870 | #' find_unmutated=TRUE) | |
689 | 871 | #' |
690 | 872 | #' @export |
691 | inferGenotype <- function(clip_db, v_call="V_CALL", fraction_to_explain = 0.875, | |
692 | gene_cutoff = 1e-4, find_unmutated = TRUE, | |
693 | germline_db = NA, novel_df = NA){ | |
694 | ||
695 | . = NULL | |
696 | allele_calls = getAllele(clip_db[[v_call]], first=FALSE, strip_d=FALSE) | |
697 | # Find the unmutated subset, if requested | |
698 | if(find_unmutated){ | |
699 | if(is.na(germline_db[1])){ | |
700 | stop("germline_db needed if find_unmutated is TRUE") | |
701 | } | |
702 | if(!is.null(nrow(novel_df))){ | |
703 | novel_df = filter_(novel_df, ~!is.na(POLYMORPHISM_CALL)) %>% | |
704 | select_(~GERMLINE_CALL, ~POLYMORPHISM_CALL, ~NOVEL_IMGT) | |
705 | if(nrow(novel_df) > 0){ | |
706 | # Extract novel alleles if any and add them to germline_db | |
707 | novel_gl = novel_df$NOVEL_IMGT | |
708 | names(novel_gl) = novel_df$POLYMORPHISM_CALL | |
709 | germline_db = c(germline_db, novel_gl) | |
710 | # Add the novel allele calls to allele calls of the same starting allele | |
711 | for(r in 1:nrow(novel_df)){ | |
712 | ind = grep(novel_df$GERMLINE_CALL[r], allele_calls, fixed=TRUE) | |
713 | allele_calls[ind] = allele_calls[ind] %>% | |
714 | sapply(paste, novel_df$POLYMORPHISM_CALL[r], sep=",") | |
873 | inferGenotype <- function(data, germline_db=NA, novel=NA, v_call="V_CALL", | |
874 | fraction_to_explain=0.875, gene_cutoff=1e-4, | |
875 | find_unmutated=TRUE) { | |
876 | ||
877 | . = NULL | |
878 | allele_calls = getAllele(data[[v_call]], first=FALSE, strip_d=FALSE) | |
879 | # Find the unmutated subset, if requested | |
880 | if(find_unmutated){ | |
881 | if(is.na(germline_db[1])){ | |
882 | stop("germline_db needed if find_unmutated is TRUE") | |
715 | 883 | } |
716 | } | |
717 | } | |
718 | # Find unmutated sequences | |
719 | allele_calls = findUnmutatedCalls(allele_calls, | |
720 | as.character(clip_db$SEQUENCE_IMGT), | |
721 | germline_db) | |
722 | if(length(allele_calls) == 0){ | |
723 | stop("No unmutated sequences found! Set 'find_unmutated' to 'FALSE'.") | |
724 | } | |
725 | } | |
726 | ||
727 | # Find which rows' calls contain which genes | |
728 | cutoff = ifelse(gene_cutoff < 1, length(allele_calls)*gene_cutoff, gene_cutoff) | |
729 | gene_regex = allele_calls %>% strsplit(",") %>% unlist() %>% | |
730 | getGene(strip_d=FALSE) %>% unique() %>% paste("\\*", sep="") | |
731 | gene_groups = sapply(gene_regex, grep, allele_calls, simplify=FALSE) | |
732 | names(gene_groups) = gsub("\\*", "", gene_regex, fixed=TRUE) | |
733 | gene_groups = gene_groups[sapply(gene_groups, length) >= cutoff] | |
734 | gene_groups = gene_groups[sortAlleles(names(gene_groups))] | |
735 | ||
736 | # Make a table to store the resulting genotype | |
737 | GENE = names(gene_groups) | |
738 | ALLELES = COUNTS = NOTE = rep("", length(GENE)) | |
739 | TOTAL = sapply(gene_groups, length) | |
740 | genotype = cbind(GENE, ALLELES, COUNTS, TOTAL, NOTE) | |
741 | ||
742 | # For each gene, find which alleles to include | |
743 | for (g in GENE){ | |
744 | # Keep only the part of the allele calls that uses the gene being analyzed | |
745 | ac = allele_calls[gene_groups[[g]]] %>% | |
746 | strsplit(",") %>% | |
747 | lapply(function(x) x[grep(paste(g, "\\*", sep=""), x)]) %>% | |
748 | sapply(paste, collapse=",") | |
749 | target = ceiling(fraction_to_explain*length(ac)) # how many we need to explain | |
750 | t_ac = table(ac) # table of allele calls | |
751 | potentials = unique(unlist(strsplit(names(t_ac),","))) # potential alleles | |
752 | # One allele? Easy! | |
753 | if (length(potentials) == 1 | length(t_ac) == 1){ | |
754 | genotype[genotype[,"GENE"]==g,"ALLELES"] = | |
755 | gsub("[^d\\*]*[d\\*]","",potentials )[1] | |
756 | genotype[genotype[,"GENE"]==g,"COUNTS"] = t_ac | |
757 | } else { | |
758 | # More alleles? Let's find the fewest that can explain the needed fraction | |
759 | # Make a table of which alleles can explain which calls | |
760 | regexpotentials = paste(gsub("\\*","\\\\*", potentials),"$",sep="") | |
761 | regexpotentials = | |
762 | paste(regexpotentials,gsub("\\$",",",regexpotentials),sep="|") | |
763 | tmat = | |
764 | sapply(regexpotentials, function(x) grepl(x, names(t_ac),fixed=FALSE)) | |
765 | seqs_expl = as.data.frame(apply(tmat, 2, function(x) x*t_ac)) | |
766 | colnames(seqs_expl) = potentials | |
767 | ||
768 | # Cycle through the table, including alleles to explain more sequences, | |
769 | # until we explain enough sequences | |
770 | included = counts = character(0) | |
771 | tot_expl = 0 | |
772 | while(tot_expl < target){ | |
773 | allele_tot = apply(seqs_expl, 2, sum) | |
774 | included = c(included, names(which.max(allele_tot))) | |
775 | counts = c(counts, max(allele_tot)) | |
776 | tot_expl = max(allele_tot) + tot_expl | |
777 | seqs_expl = seqs_expl[which(seqs_expl[,which.max(allele_tot)]==0),] | |
778 | } | |
779 | genotype[genotype[,"GENE"]==g,"ALLELES"] = | |
780 | paste(gsub("[^d\\*]*[d\\*]","",included ),collapse=",") | |
781 | genotype[genotype[,"GENE"]==g,"COUNTS"] = | |
782 | paste(counts,collapse=",") | |
783 | } | |
784 | ||
785 | } | |
786 | geno = as.data.frame(genotype, stringsAsFactors = FALSE) | |
787 | ||
788 | # Check for indistinguishable calls | |
789 | if(find_unmutated == TRUE){ | |
790 | seqs = genotypeFasta(geno, germline_db) | |
791 | dist_mat = seqs %>% | |
792 | sapply(function(x) sapply((getMutatedPositions(seqs, x)), length)) %>% | |
793 | as.matrix | |
794 | rownames(dist_mat) = colnames(dist_mat) | |
795 | for (i in 1:nrow(dist_mat)){ dist_mat[i,i] = NA } | |
796 | same = which(dist_mat == 0, arr.ind=TRUE) | |
797 | if (nrow(same) > 0 ) { | |
798 | for (r in 1:nrow(same)) { | |
799 | inds = as.vector(same[r,]) | |
800 | geno[getGene(rownames(dist_mat)[inds][1]),]$NOTE = | |
801 | paste(rownames(dist_mat)[inds], collapse=" and ") %>% | |
802 | paste("Cannot distinguish", .) | |
803 | } | |
804 | } | |
805 | } | |
806 | rownames(geno) = NULL | |
807 | return(geno) | |
808 | } | |
884 | if(!is.null(nrow(novel))){ | |
885 | novel = filter_(novel, ~!is.na(POLYMORPHISM_CALL)) %>% | |
886 | select_(~GERMLINE_CALL, ~POLYMORPHISM_CALL, ~NOVEL_IMGT) | |
887 | if(nrow(novel) > 0){ | |
888 | # Extract novel alleles if any and add them to germline_db | |
889 | novel_gl = novel$NOVEL_IMGT | |
890 | names(novel_gl) = novel$POLYMORPHISM_CALL | |
891 | germline_db = c(germline_db, novel_gl) | |
892 | # Add the novel allele calls to allele calls of the same starting allele | |
893 | for(r in 1:nrow(novel)){ | |
894 | ind = grep(novel$GERMLINE_CALL[r], allele_calls, fixed=TRUE) | |
895 | allele_calls[ind] = allele_calls[ind] %>% | |
896 | sapply(paste, novel$POLYMORPHISM_CALL[r], sep=",") | |
897 | } | |
898 | } | |
899 | } | |
900 | # Find unmutated sequences | |
901 | allele_calls = findUnmutatedCalls(allele_calls, | |
902 | as.character(data$SEQUENCE_IMGT), | |
903 | germline_db) | |
904 | if(length(allele_calls) == 0){ | |
905 | stop("No unmutated sequences found! Set 'find_unmutated' to 'FALSE'.") | |
906 | } | |
907 | } | |
908 | ||
909 | # Find which rows' calls contain which genes | |
910 | cutoff = ifelse(gene_cutoff < 1, length(allele_calls)*gene_cutoff, gene_cutoff) | |
911 | gene_regex = allele_calls %>% strsplit(",") %>% unlist() %>% | |
912 | getGene(strip_d=FALSE) %>% unique() %>% paste("\\*", sep="") | |
913 | gene_groups = sapply(gene_regex, grep, allele_calls, simplify=FALSE) | |
914 | names(gene_groups) = gsub("\\*", "", gene_regex, fixed=TRUE) | |
915 | gene_groups = gene_groups[sapply(gene_groups, length) >= cutoff] | |
916 | gene_groups = gene_groups[sortAlleles(names(gene_groups))] | |
917 | ||
918 | # Make a table to store the resulting genotype | |
919 | GENE = names(gene_groups) | |
920 | ALLELES = COUNTS = NOTE = rep("", length(GENE)) | |
921 | TOTAL = sapply(gene_groups, length) | |
922 | genotype = cbind(GENE, ALLELES, COUNTS, TOTAL, NOTE) | |
923 | ||
924 | # For each gene, find which alleles to include | |
925 | for (g in GENE) { | |
926 | # Keep only the part of the allele calls that uses the gene being analyzed | |
927 | ac = allele_calls[gene_groups[[g]]] %>% | |
928 | strsplit(",") %>% | |
929 | lapply(function(x) x[grep(paste(g, "\\*", sep=""), x)]) %>% | |
930 | sapply(paste, collapse=",") | |
931 | target = ceiling(fraction_to_explain*length(ac)) # how many we need to explain | |
932 | t_ac = table(ac) # table of allele calls | |
933 | potentials = unique(unlist(strsplit(names(t_ac),","))) # potential alleles | |
934 | # One allele? Easy! | |
935 | if (length(potentials) == 1 | length(t_ac) == 1) { | |
936 | genotype[genotype[,"GENE"]==g,"ALLELES"] = gsub("[^d\\*]*[d\\*]","",potentials )[1] | |
937 | genotype[genotype[,"GENE"]==g,"COUNTS"] = t_ac | |
938 | } else { | |
939 | # More alleles? Let's find the fewest that can explain the needed fraction | |
940 | # Make a table of which alleles can explain which calls | |
941 | regexpotentials = paste(gsub("\\*","\\\\*", potentials),"$",sep="") | |
942 | regexpotentials = | |
943 | paste(regexpotentials,gsub("\\$",",",regexpotentials),sep="|") | |
944 | tmat = | |
945 | sapply(regexpotentials, function(x) grepl(x, names(t_ac),fixed=FALSE)) | |
946 | seqs_expl = as.data.frame(apply(tmat, 2, function(x) x*t_ac)) | |
947 | colnames(seqs_expl) = potentials | |
948 | ||
949 | # Cycle through the table, including alleles to explain more sequences, | |
950 | # until we explain enough sequences | |
951 | included = counts = character(0) | |
952 | tot_expl = 0 | |
953 | while(tot_expl < target){ | |
954 | allele_tot = apply(seqs_expl, 2, sum) | |
955 | included = c(included, names(which.max(allele_tot))) | |
956 | counts = c(counts, max(allele_tot)) | |
957 | tot_expl = max(allele_tot) + tot_expl | |
958 | seqs_expl = seqs_expl[which(seqs_expl[,which.max(allele_tot)]==0),] | |
959 | } | |
960 | genotype[genotype[,"GENE"]==g,"ALLELES"] = | |
961 | paste(gsub("[^d\\*]*[d\\*]","",included ),collapse=",") | |
962 | genotype[genotype[,"GENE"]==g,"COUNTS"] = | |
963 | paste(counts,collapse=",") | |
964 | } | |
965 | } | |
966 | ||
967 | geno = as.data.frame(genotype, stringsAsFactors = FALSE) | |
968 | ||
969 | # Check for indistinguishable calls | |
970 | if (find_unmutated == TRUE) { | |
971 | seqs = genotypeFasta(geno, germline_db) | |
972 | dist_mat = seqs %>% | |
973 | sapply(function(x) sapply((getMutatedPositions(seqs, x)), length)) %>% | |
974 | as.matrix | |
975 | rownames(dist_mat) = colnames(dist_mat) | |
976 | for (i in 1:nrow(dist_mat)){ dist_mat[i,i] = NA } | |
977 | same = which(dist_mat == 0, arr.ind=TRUE) | |
978 | if (nrow(same) > 0 ) { | |
979 | for (r in 1:nrow(same)) { | |
980 | inds = as.vector(same[r,]) | |
981 | geno[getGene(rownames(dist_mat)[inds][1]),]$NOTE = | |
982 | paste(rownames(dist_mat)[inds], collapse=" and ") %>% | |
983 | paste("Cannot distinguish", .) | |
984 | } | |
985 | } | |
986 | } | |
987 | rownames(geno) = NULL | |
988 | ||
989 | return(geno) | |
990 | } | |
991 | ||
809 | 992 | |
810 | 993 | #' Show a colorful representation of a genotype |
811 | 994 | #' |
812 | 995 | #' \code{plotGenotype} plots a genotype table. |
813 | 996 | #' |
814 | #' @param genotype a table of alleles denoting a genotype, as returned by | |
815 | #' \link{inferGenotype} | |
997 | #' @param genotype a \code{data.frame} of alleles denoting a genotype, | |
998 | #' as returned by \link{inferGenotype}. | |
816 | 999 | #' @param facet_by a column name in \code{genotype} to facet the plot by. |
817 | 1000 | #' If \code{NULL}, then do not facet the plot. |
818 | 1001 | #' @param gene_sort a string defining the method to use when sorting alleles. |
819 | 1002 | #' If \code{"name"} then sort in lexicographic order. If |
820 | 1003 | #' \code{"position"} then sort by position in the locus, as |
821 | 1004 | #' determined by the final two numbers in the gene name. |
822 | #' @param text_size the point size of the plotted text | |
1005 | #' @param text_size the point size of the plotted text. | |
823 | 1006 | #' @param silent if \code{TRUE} do not draw the plot and just return the ggplot |
824 | 1007 | #' object; if \code{FALSE} draw the plot. |
825 | 1008 | #' @param ... additional arguments to pass to ggplot2::theme. |
829 | 1012 | #' @seealso \link{inferGenotype} |
830 | 1013 | #' |
831 | 1014 | #' @examples |
832 | #' # Load example data | |
833 | #' data(novel_df) | |
834 | #' data(genotype) | |
835 | #' | |
836 | 1015 | #' # Plot genotype |
837 | #' plotGenotype(genotype) | |
1016 | #' plotGenotype(SampleGenotype) | |
838 | 1017 | #' |
839 | 1018 | #' # Facet by subject |
840 | #' genotypea = genotypeb = genotype | |
841 | #' genotypea$SUBJECT = "A" | |
842 | #' genotypeb$SUBJECT = "B" | |
843 | #' geno_sub = rbind(genotypea, genotypeb) | |
1019 | #' genotype_a <- genotype_b <- SampleGenotype | |
1020 | #' genotype_a$SUBJECT <- "A" | |
1021 | #' genotype_b$SUBJECT <- "B" | |
1022 | #' geno_sub <- rbind(genotype_a, genotype_b) | |
844 | 1023 | #' plotGenotype(geno_sub, facet_by="SUBJECT", gene_sort="pos") |
845 | 1024 | #' |
846 | 1025 | #' @export |
plotGenotype <- function(genotype, facet_by=NULL, gene_sort=c("name", "position"),
                         text_size=12, silent=FALSE, ...) {
    # Plot a genotype table as one bar per gene, with each bar divided evenly
    # among the alleles listed for that gene.
    #
    # genotype  : table with (at least) GENE and a comma-delimited ALLELES column.
    # facet_by  : optional column name used to facet the plot; NULL for no facets.
    # gene_sort : ordering method passed on to sortAlleles().
    # text_size : point size of the plotted text.
    # silent    : if TRUE the plot is built but not drawn.
    # ...       : extra arguments forwarded to ggplot2::theme().
    #
    # Returns the ggplot object invisibly.

    # Check arguments
    gene_sort <- match.arg(gene_sort)

    # Split genes' alleles into their own rows. Repeating each row index once
    # per allele avoids growing the table inside a loop and stays correct for
    # an empty genotype or a gene without alleles, where 1:n would count down
    # from 1 to 0.
    alleles <- strsplit(as.character(genotype$ALLELES), ",")
    row_idx <- rep(seq_len(nrow(genotype)), lengths(alleles))
    geno2 <- genotype[row_idx, , drop=FALSE]
    # as.character() keeps the column present when there are no alleles at all
    geno2$ALLELES <- as.character(unlist(alleles))
    rownames(geno2) <- NULL

    # Set the gene order (reversed because the axes are flipped below)
    geno2$GENE <- factor(geno2$GENE,
                         levels=rev(sortAlleles(unique(geno2$GENE), method=gene_sort)))

    # Create the base plot
    p <- ggplot(geno2, aes_(x=~GENE, fill=~ALLELES)) +
        theme_bw() +
        theme(axis.ticks=element_blank(),
              axis.text.x=element_blank(),
              panel.grid.major=element_blank(),
              panel.grid.minor=element_blank(),
              text=element_text(size=text_size),
              strip.background=element_blank(),
              strip.text=element_text(face="bold")) +
        geom_bar(position="fill") +
        coord_flip() + xlab("Gene") + ylab("") +
        scale_fill_hue(name="Allele", h=c(0, 270), h.start=10)

    # Facet by the requested column, if any
    if (!is.null(facet_by)) {
        p <- p + facet_grid(paste0(".~", facet_by))
    }

    # Add additional theme elements
    p <- p + do.call(theme, list(...))

    # Plot
    if (!silent) {
        plot(p)
    }

    invisible(p)
}
895 | 1074 | |
896 | 1075 | #' Return the nucleotide sequences of a genotype |
898 | 1077 | #' \code{genotypeFasta} converts a genotype table into a vector of nucleotide |
899 | 1078 | #' sequences. |
900 | 1079 | #' |
901 | #' @param genotype a table of alleles denoting a genotype, as returned by | |
902 | #' \link{inferGenotype} | |
1080 | #' @param genotype a \code{data.frame} of alleles denoting a genotype, | |
1081 | #' as returned by \link{inferGenotype}. | |
903 | 1082 | #' @param germline_db a vector of named nucleotide germline sequences |
904 | #' matching the alleles detailed in \code{genotype} | |
905 | #' @param novel_df an optional \code{data.frame} containing putative | |
1083 | #' matching the alleles detailed in \code{genotype}. | |
1084 | #' @param novel an optional \code{data.frame} containing putative | |
906 | 1085 | #' novel alleles of the type returned by |
907 | #' \link{findNovelAlleles} | |
1086 | #' \link{findNovelAlleles}. | |
908 | 1087 | #' |
909 | 1088 | #' @return A named vector of strings containing the germline nucleotide |
910 | #' sequences of the alleles in the provided genotype | |
1089 | #' sequences of the alleles in the provided genotype. | |
911 | 1090 | #' |
912 | 1091 | #' @seealso \link{inferGenotype} |
913 | 1092 | #' |
914 | 1093 | #' @examples |
915 | #' # Load example data | |
916 | #' data(germline_ighv) | |
917 | #' data(novel_df) | |
918 | #' data(genotype) | |
919 | #' | |
920 | 1094 | #' # Find the sequences that correspond to the genotype |
921 | #' genotype_seqs = genotypeFasta(genotype, germline_ighv, novel_df) | |
922 | #' | |
1095 | #' genotype_db <- genotypeFasta(SampleGenotype, GermlineIGHV, SampleNovel) | |
923 | 1096 | #' |
924 | 1097 | #' @export |
genotypeFasta <- function(genotype, germline_db, novel=NA){
    # Look up the germline sequence of every allele listed in a genotype table.
    #
    # genotype    : table with a GENE column and a comma-delimited ALLELES column.
    # germline_db : named vector of germline sequences.
    # novel       : optional table of novel alleles; rows with a non-missing
    #               POLYMORPHISM_CALL are appended to germline_db, with
    #               NOVEL_IMGT as the sequence and POLYMORPHISM_CALL as its name.
    #
    # Returns the matching subset of germline_db; stops if any allele is absent.

    # Anything with rows (i.e. a table) is treated as a set of novel alleles
    has_novel <- !is.null(nrow(novel))
    if (has_novel) {
        novel <- select_(filter_(novel, ~!is.na(POLYMORPHISM_CALL)),
                         ~GERMLINE_CALL, ~POLYMORPHISM_CALL, ~NOVEL_IMGT)
        if (nrow(novel) > 0) {
            germline_db <- c(germline_db,
                             stats::setNames(novel$NOVEL_IMGT,
                                             novel$POLYMORPHISM_CALL))
        }
    }

    # Gene names without a trailing duplication marker
    genes <- gsub("D$|d$", "", genotype$GENE)

    # Original germline names, indexed by their "D"-stripped form
    db_names <- names(germline_db)
    lookup <- stats::setNames(db_names, gsub("D", "", db_names))

    # Build "GENE*ALLELE" for every allele of every gene
    calls <- unlist(mapply(paste, genes, strsplit(genotype$ALLELES, ","),
                           sep="*"))
    seqs <- germline_db[as.vector(lookup[calls])]

    missing <- which(is.na(seqs))
    if (length(missing) > 0) {
        stop("The following genotype alleles were not found in germline_db: ",
             paste(calls[missing], collapse = ", "))
    }
    seqs
}
950 | 1123 | |
951 | 1124 | #' Correct allele calls based on a personalized genotype |
954 | 1127 | #' correct preliminary allele assignments of a set of sequences derived |
955 | 1128 | #' from a single subject. |
956 | 1129 | #' |
957 | #' @details In order to save time, initial gene assignments are preserved and | |
1130 | #' @details | |
1131 | #' In order to save time, initial gene assignments are preserved and | |
958 | 1132 | #' the allele calls are chosen from among those provided in \code{genotype_db}, |
959 | 1133 | #' based on a simple alignment to the sample sequence. |
960 | 1134 | #' |
961 | #' @param clip_db a \code{data.frame} containing V allele calls from a | |
962 | #' single subject and the sample | |
963 | #' IMGT-gapped V(D)J sequences under | |
964 | #' \code{"SEQUENCE_IMGT"} | |
1135 | #' @param data a \code{data.frame} containing V allele calls from a | |
1136 | #' single subject and the sample IMGT-gapped V(D)J sequences under | |
1137 | #' \code{"SEQUENCE_IMGT"}. | |
965 | 1138 | #' @param genotype_db a vector of named nucleotide germline sequences |
966 | 1139 | #' matching the calls detailed in \code{allele_calls} |
967 | 1140 | #' and personalized to the subject |
968 | #' @param v_call name of the column in \code{clip_db} with V allele | |
969 | #' calls. Default is \code{"V_CALL"} | |
1141 | #' @param v_call name of the column in \code{data} with V allele | |
1142 | #' calls. Default is \code{"V_CALL"}. | |
970 | 1143 | #' @param method the method to be used when realigning sequences to |
971 | #' the genotype_db sequences. Currently only "hammming" | |
1144 | #' the genotype_db sequences. Currently, only \code{"hamming"} | |
972 | 1145 | #' (for Hamming distance) is implemented. |
973 | 1146 | #' @param path directory containing the tool used in the |
974 | 1147 | #' realignment method, if needed. Hamming distance does |
975 | 1148 | #' not require a path to a tool. |
976 | #' @param keep_gene logical indicating if gene assignments should be | |
977 | #' maintained when possible. Increases speed by | |
978 | #' minimizing required number of alignments. Currently | |
979 | #' only "TRUE" is implemented. | |
980 | #' | |
981 | #' @return a single-column \code{data.frame} corresponding to \code{clip.db} | |
982 | #' and containing the best allele call from among the sequences | |
983 | #' listed in \code{genotype_db} | |
1149 | #' @param keep_gene a string indicating if the gene (\code{"gene"}), | |
1150 | #' family (\code{"family"}) or complete repertoire | |
1151 | #' (\code{"repertoire"}) assignments should be performed. | |
1152 | #' Use of \code{"gene"} increases speed by minimizing required number of | |
1153 | #' alignments, as gene level assignments will be maintained when possible. | |
1154 | #' | |
1155 | #' @return A modified input \code{data.frame} containing the best allele call from | |
1156 | #' among the sequences listed in \code{genotype_db} in the | |
1157 | #' \code{V_CALL_GENOTYPED} column. | |
984 | 1158 | #' |
985 | 1159 | #' @examples |
986 | #' # Load example data | |
987 | #' data(germline_ighv) | |
988 | #' data(sample_db) | |
989 | #' data(genotype) | |
990 | #' data(novel_df) | |
991 | #' | |
992 | 1160 | #' # Extract the database sequences that correspond to the genotype |
993 | #' genotype_seqs = genotypeFasta(genotype, germline_ighv, novel_df) | |
1161 | #' genotype_db <- genotypeFasta(SampleGenotype, GermlineIGHV, novel=SampleNovel) | |
994 | 1162 | #' |
995 | 1163 | #' # Use the personalized genotype to determine corrected allele assignments |
996 | #' V_CALL_GENOTYPED = reassignAlleles(sample_db, genotype_seqs) | |
997 | #' sample_db = cbind(sample_db, V_CALL_GENOTYPED) | |
1164 | #' output_db <- reassignAlleles(SampleDb, genotype_db) | |
998 | 1165 | #' |
999 | 1166 | #' @export |
reassignAlleles <- function(data, genotype_db, v_call="V_CALL",
                            method="hamming", path=NA,
                            keep_gene=c("gene", "family", "repertoire")){
    # Re-call the V allele of every sequence using only the alleles present
    # in genotype_db.
    #
    # data        : table with SEQUENCE_IMGT and the column named by v_call.
    # genotype_db : named vector of germline sequences to choose from.
    # v_call      : name of the column holding the existing V calls.
    # method      : distance method; only "hamming" is implemented.
    # path        : accepted for interface compatibility; not used here.
    # keep_gene   : level at which the existing call restricts the candidates
    #               ("gene", "family", or the whole "repertoire").
    #
    # Returns data with an added V_CALL_GENOTYPED column. A warning is issued
    # when the new calls are identical to the existing ones.

    # Check arguments
    keep_gene <- match.arg(keep_gene)

    # For each sample sequence, return the comma-delimited names of the
    # candidate germlines at the smallest distance. Results are built one row
    # at a time so that ties and single-row inputs are handled uniformly.
    closest_alleles <- function(sample_seqs, candidates) {
        if (method != "hamming") {
            stop("Only Hamming distance is currently supported as a method.")
        }
        dists <- lapply(candidates, function(x)
            lengths(getMutatedPositions(sample_seqs, x, match_instead=FALSE)))
        dist_mat <- matrix(unlist(dists), ncol=length(candidates))
        vapply(seq_len(nrow(dist_mat)), function(i) {
            best <- which(dist_mat[i, ] == min(dist_mat[i, ]))
            paste(names(candidates)[best], collapse=",")
        }, character(1))
    }

    # Extract data subset and prepare output vector
    v_sequences <- as.character(data$SEQUENCE_IMGT)
    v_calls <- getAllele(data[[v_call]], first=FALSE, strip_d=FALSE)
    v_call_genotyped <- rep("", length(v_calls))

    # Grouping key for the sample calls (v) and the genotype alleles (geno)
    if (keep_gene == "gene") {
        v <- getGene(v_calls, first=TRUE, strip_d=FALSE)
        geno <- getGene(names(genotype_db), strip_d=TRUE)
    } else if (keep_gene == "family") {
        v <- getFamily(v_calls, first=TRUE, strip_d=FALSE)
        geno <- getFamily(names(genotype_db), strip_d=TRUE)
    } else {
        # "repertoire": a single dummy group holding every sequence and allele
        v <- rep(v_call, length(v_calls))
        geno <- rep(v_call, length(genotype_db))
    }
    names(geno) <- names(genotype_db)

    # Groups with exactly one genotype allele are assigned directly
    hetero <- unique(geno[which(duplicated(geno))])
    homo <- geno[!(geno %in% hetero)]
    homo_alleles <- names(homo)
    names(homo_alleles) <- homo
    homo_calls_i <- which(v %in% homo)
    v_call_genotyped[homo_calls_i] <- homo_alleles[v[homo_calls_i]]

    # Groups with several genotype alleles: realign to each allele of the group
    for (het in hetero) {
        ind <- which(v %in% het)
        if (length(ind) > 0) {
            het_alleles <- names(geno)[which(geno == het)]
            v_call_genotyped[ind] <- closest_alleles(v_sequences[ind],
                                                     genotype_db[het_alleles])
        }
    }

    # Calls whose group is not in the genotype: realign to every genotype
    # allele. The test is "> 0" so that a lone unmatched sequence is also
    # reassigned instead of being left with an empty call.
    hetero_calls_i <- which(v %in% hetero)
    not_called <- setdiff(seq_along(v), c(homo_calls_i, hetero_calls_i))
    if (length(not_called) > 0) {
        v_call_genotyped[not_called] <- closest_alleles(v_sequences[not_called],
                                                        genotype_db)
    }

    # isTRUE() guards against an NA from all() when a call is missing
    if (isTRUE(all(v_call_genotyped == data[[v_call]]))) {
        msg <- "No allele assignment corrections made."
        if (all(v %in% homo) && length(hetero) > 0) {
            # Suggest only the options broader than the current one
            keep_opt <- eval(formals(reassignAlleles)$keep_gene)
            broader <- keep_opt[seq_along(keep_opt) > match(keep_gene, keep_opt)]
            if (length(broader) > 0) {
                msg <- paste(msg, "Consider setting keep_gene to one of:",
                             paste(broader, collapse = ", "))
            }
        }
        warning(msg)
    }

    data$V_CALL_GENOTYPED <- v_call_genotyped

    return(data)
}
1063 | 1271 | |
1064 | 1272 | |
1085 | 1293 | #' |
1086 | 1294 | #' @examples |
1087 | 1295 | #' # Create strings to act as a sample sequences and a reference sequence |
1088 | #' seqs = c("----GATA","GAGAGAGA","TANA") | |
1089 | #' ref = "GATAGATA" | |
1296 | #' seqs <- c("----GATA", "GAGAGAGA", "TANA") | |
1297 | #' ref <- "GATAGATA" | |
1090 | 1298 | #' |
1091 | 1299 | #' # Find the differences between the two |
1092 | 1300 | #' getMutatedPositions(seqs, ref) |
1094 | 1302 | #' @export |
getMutatedPositions <- function(samples, germlines, ignored_regex="[\\.N-]",
                                match_instead=FALSE) {
    # Compare each sample sequence to its germline, character by character,
    # and return a list with one integer vector of positions per pair.
    # Positions where either sequence matches ignored_regex are left out.
    # With match_instead=TRUE the matching positions are returned instead of
    # the mismatching ones.

    # A lone germline is reused for every sample
    if (length(germlines) == 1) {
        germlines <- rep(germlines, length(samples))
    }
    if (length(samples) != length(germlines)) {
        stop("Number of input sequences does not match number of germlines.")
    }

    # Each pair is compared only over the length of its shorter member
    shared_len <- mapply(min, lapply(germlines, nchar), lapply(samples, nchar))
    clip <- function(x) {
        toupper(mapply(substr, x, 1, shared_len, SIMPLIFY=FALSE))
    }
    germ_seq <- clip(germlines)
    samp_seq <- clip(samples)

    # Positions of mutations (or matches)
    compare <- if (match_instead) "==" else "!="
    flagged <- mapply(compare, strsplit(samp_seq, ""), strsplit(germ_seq, ""),
                      SIMPLIFY=FALSE)
    positions <- lapply(flagged, which)

    # Positions to disregard, taken from both members of each pair
    skipped <- mapply(c,
                      gregexpr(ignored_regex, germ_seq),
                      gregexpr(ignored_regex, samp_seq),
                      SIMPLIFY=FALSE)

    mapply(function(pos, skip) pos[!pos %in% skip], positions, skipped,
           SIMPLIFY=FALSE)
}
1126 | 1334 | |
1127 | 1335 | |
1143 | 1351 | #' each element of \code{samples} |
1144 | 1352 | #' |
1145 | 1353 | #' @examples |
1146 | #' # Load germline database | |
1147 | #' data(germline_ighv) | |
1148 | #' | |
1149 | #' # Use createGermlines to insert a mutation into a germline sequence | |
1150 | #' #sample_seqs = c(germline_ighv[2], | |
1151 | #' # createGermlines(germline_ighv[1], 103, "G"), | |
1152 | #' # createGermlines(germline_ighv[1], 107, "C")) | |
1354 | #' # Insert a mutation into a germline sequence | |
1355 | #' s2 <- s3 <- GermlineIGHV[1] | |
1356 | #' stringi::stri_sub(s2, 103, 103) <- "G" | |
1357 | #' stringi::stri_sub(s3, 107, 107) <- "C" | |
1358 | #' | |
1359 | #' sample_seqs <- c(GermlineIGHV[2], s2, s3) | |
1153 | 1360 | #' |
1154 | 1361 | #' # Pretend that one sample sequence has received an ambiguous allele call |
1155 | #' #sample_alleles = c(paste(names(germline_ighv[1:2]), collapse=","), | |
1156 | #' # names(germline_ighv[2]), | |
1157 | #' # names(germline_ighv[1])) | |
1362 | #' sample_alleles <- c(paste(names(GermlineIGHV[1:2]), collapse=","), | |
1363 | #' names(GermlineIGHV[2]), | |
1364 | #' names(GermlineIGHV[1])) | |
1158 | 1365 | #' |
1159 | 1366 | #' # Compare each sequence to its assigned germline(s) to determine the distance |
1160 | #' #getMutCount(sample_seqs, sample_alleles, germline_ighv) | |
1367 | #' getMutCount(sample_seqs, sample_alleles, GermlineIGHV) | |
1161 | 1368 | #' |
1162 | 1369 | #' @export |
getMutCount <- function(samples, allele_calls, germline_db){
    # Count the mutated positions of each sample relative to the germline(s)
    # named in its (possibly comma-delimited) allele call.
    # Returns a list parallel to samples: a single count for a one-allele
    # call, or a list of counts (one per allele) for a multi-allele call.

    # Germline sequence(s) named by each call
    germs_per_seq <- lapply(strsplit(allele_calls, ","),
                            function(nm) germline_db[nm])
    n_calls <- lengths(germs_per_seq)

    positions <- list()
    counts <- list()

    # Single-allele calls are handled in one vectorised comparison
    single <- which(n_calls == 1)
    if (length(single) > 0) {
        positions[single] <- getMutatedPositions(samples[single],
                                                 unlist(germs_per_seq[single]))
        counts[single] <- lapply(positions[single], length)
    }

    # Multi-allele calls: the germlines are passed in the sample slot so the
    # one sample sequence is reused against each of them
    multi <- which(n_calls > 1)
    if (length(multi) > 0) {
        positions[multi] <- mapply(getMutatedPositions,
                                   germs_per_seq[multi], samples[multi],
                                   SIMPLIFY=FALSE)
        counts[multi] <- lapply(positions[multi],
                                function(p) lapply(p, length))
    }

    counts
}
1191 | 1398 | |
1192 | 1399 | #' Determine which calls represent an unmutated allele |
1197 | 1404 | #' sequence, only the subset that would represent a perfect match is returned. |
1198 | 1405 | #' |
1199 | 1406 | #' @param allele_calls a vector of strings representing Ig allele calls, |
1200 | #' where multiple calls are separated by a comma | |
1407 | #' where multiple calls are separated by a comma. | |
1201 | 1408 | #' @param germline_db a vector of named nucleotide germline sequences |
1202 | 1409 | #' @param sample_seqs V(D)J-rearranged sample sequences matching the order |
1203 | #' of the given \code{allele_calls} | |
1410 | #' of the given \code{allele_calls}. | |
1204 | 1411 | #' |
1205 | 1412 | #' @return A vector of strings containing the members of \code{allele_calls} |
1206 | #' that represent unmutated sequences | |
1413 | #' that represent unmutated sequences. | |
1207 | 1414 | #' |
1208 | 1415 | #' @examples |
1209 | #' # Load data | |
1210 | #' data(germline_ighv) | |
1211 | #' data(sample_db) | |
1212 | #' | |
1213 | 1416 | #' # Find which of the sample alleles are unmutated |
1214 | #' calls <- findUnmutatedCalls(sample_db$V_CALL, sample_db$SEQUENCE_IMGT, | |
1215 | #' germline_db=germline_ighv) | |
1417 | #' calls <- findUnmutatedCalls(SampleDb$V_CALL, SampleDb$SEQUENCE_IMGT, | |
1418 | #' germline_db=GermlineIGHV) | |
1216 | 1419 | #' |
1217 | 1420 | #' @export |
findUnmutatedCalls <- function(allele_calls, sample_seqs, germline_db){
    # Return, for every sequence that is identical to at least one of its
    # called germlines, the call(s) it matches perfectly.
    #
    # allele_calls : vector of allele calls; multiple calls are comma-delimited.
    # sample_seqs  : sample sequences in the same order as allele_calls.
    # germline_db  : named vector of germline sequences.
    #
    # Returns a character vector with one element per unmutated sequence;
    # several perfectly matching alleles are joined with commas.

    allele_calls <- getAllele(allele_calls, first = FALSE)
    sample_seqs <- as.character(sample_seqs)

    # Keep only sequences that have a call and whose every called allele is in
    # germline_db. Names are compared exactly, so a call such as "X*02_T163C"
    # is not discarded merely because "X*02" is missing from the database.
    known <- names(germline_db)
    all_known <- vapply(strsplit(allele_calls, ","),
                        function(x) all(x %in% known), logical(1))
    in_db <- which(allele_calls != "" & all_known)
    allele_calls <- allele_calls[in_db]
    sample_seqs <- sample_seqs[in_db]

    mut_counts <- getMutCount(sample_seqs, allele_calls, germline_db)

    # For each sequence, the index of every called allele with zero mutations.
    # Kept as a list so equal-length results are never collapsed to a matrix.
    split_names <- strsplit(allele_calls, ",")
    which_no_muts <- lapply(mut_counts, function(x) which(unlist(x) == 0))

    # Sequences with at least one perfect match
    unmut_i <- which(lengths(which_no_muts) > 0)

    unmut_alleles <- vapply(unmut_i, function(i) {
        paste(split_names[[i]][which_no_muts[[i]]], collapse=",")
    }, character(1))

    return(unmut_alleles)
}
1471 | ||
1472 | #' Find mutation counts for frequent sequences | |
1270 | 1473 | #' |
1271 | 1474 | #' \code{getPopularMutationCount} determines which sequences occur frequently |
1272 | 1475 | #' for each V gene and returns the mutation count of those sequences. |
1273 | 1476 | #' |
1274 | #' @param sample_db A Change-O db data frame. See | |
1477 | #' @param data a \code{data.frame} in the Change-O format. See | |
1275 | 1478 | #' \link{findNovelAlleles} for a list of required |
1276 | 1479 | #' columns. |
1277 | 1480 | #' @param germline_db A named list of IMGT-gapped germline sequences. |
1281 | 1484 | #' to avoid exclusion. |
1282 | 1485 | #' @param seq_p_of_max For each gene, fraction of the most common V sequence's |
1283 | 1486 | #' count that a sequence must meet to avoid exclusion. |
1284 | #' @param full_return If true, will return all \code{sample_db} columns and | |
1487 | #' @param full_return If \code{TRUE}, will return all \code{data} columns and | |
1285 | 1488 | #' will include sequences with mutation count < 1. |
1286 | 1489 | #' |
1287 | 1490 | #' @return A data frame of genes that have a frequent sequence mutation count |
1291 | 1494 | #' of a set of sequences are mutated. |
1292 | 1495 | #' |
1293 | 1496 | #' @examples |
1294 | #' data(sample_db, germline_ighv) | |
1295 | #' getPopularMutationCount(sample_db, germline_ighv) | |
1497 | #' getPopularMutationCount(SampleDb, GermlineIGHV) | |
1296 | 1498 | #' |
1297 | 1499 | #' @export |
1298 | getPopularMutationCount <- function(sample_db, germline_db, gene_min = 1e-03, | |
1500 | getPopularMutationCount <- function(data, germline_db, gene_min = 1e-03, | |
1299 | 1501 | seq_min = 50, seq_p_of_max = 1/8, |
1300 | 1502 | full_return = FALSE){ |
1301 | modified_db = sample_db %>% | |
1302 | mutate_(V_GENE = ~getGene(V_CALL)) %>% | |
1303 | group_by_(~1:n()) %>% | |
1304 | mutate_(V_SEQUENCE_IMGT = ~substring(SEQUENCE_IMGT, 1, 312)) %>% | |
1305 | # Count occurence of each unique IMGT-gapped V sequence | |
1306 | group_by_(~V_GENE, ~V_SEQUENCE_IMGT) %>% | |
1307 | mutate_(V_SEQUENCE_IMGT_N = ~n()) %>% | |
1308 | # Count occurence of each gene and determine count of most common sequence | |
1309 | mutate_(V_GENE_N = ~n()) %>% | |
1310 | mutate_(V_SEQUENCE_IMGT_N_MAX = ~max(V_SEQUENCE_IMGT_N)) %>% | |
1311 | # Remove rare V genes, rare sequences, and sequences not making up a | |
1312 | # sufficient proportion of sequences as compared to the most common | |
1313 | ungroup %>% | |
1314 | distinct_(~V_SEQUENCE_IMGT, .keep_all = TRUE) %>% | |
1315 | filter_(~V_GENE_N >= (nrow(sample_db)*gene_min)) %>% | |
1316 | filter_(~V_SEQUENCE_IMGT_N >= seq_min) %>% | |
1317 | mutate_(V_SEQUENCE_IMGT_P_MAX = ~V_SEQUENCE_IMGT_N/V_SEQUENCE_IMGT_N_MAX) %>% | |
1318 | filter_(~V_SEQUENCE_IMGT_P_MAX >= seq_p_of_max) | |
1319 | # Determine the mutation counts of the V sequences and append them to the db | |
1320 | MUTATION_COUNT = getMutCount(modified_db$V_SEQUENCE_IMGT, | |
1321 | modified_db$V_CALL, | |
1322 | germline_db) %>% | |
1323 | sapply(function(x) min(unlist(x))) | |
1324 | if (length(MUTATION_COUNT)==0){ | |
1325 | MUTATION_COUNT = integer(0) | |
1326 | } | |
1327 | merged_db = bind_cols(modified_db, data.frame(MUTATION_COUNT)) | |
1328 | # Strip down the data frame before returning it | |
1329 | if (!full_return) { | |
1330 | merged_db = merged_db %>% | |
1331 | filter_(~MUTATION_COUNT > 0) %>% | |
1332 | select_(~V_GENE, ~MUTATION_COUNT) | |
1333 | } | |
1334 | return(merged_db) | |
1503 | modified_db = data %>% | |
1504 | mutate_(V_GENE = ~getGene(V_CALL)) %>% | |
1505 | group_by_(~V_GENE) %>% | |
1506 | mutate_(V_GENE_N = ~n()) %>% | |
1507 | group_by_(~1:n()) %>% | |
1508 | mutate_(V_SEQUENCE_IMGT = ~substring(SEQUENCE_IMGT, 1, 312)) %>% | |
1509 | # Count occurence of each unique IMGT-gapped V sequence | |
1510 | group_by_(~V_GENE, ~V_SEQUENCE_IMGT) %>% | |
1511 | mutate_(V_SEQUENCE_IMGT_N = ~n()) %>% | |
1512 | # Determine count of most common sequence | |
1513 | group_by_(~V_GENE) %>% | |
1514 | mutate_(V_SEQUENCE_IMGT_N_MAX = ~max(V_SEQUENCE_IMGT_N)) %>% | |
1515 | # Remove rare V genes, rare sequences, and sequences not making up a | |
1516 | # sufficient proportion of sequences as compared to the most common | |
1517 | ungroup %>% | |
1518 | distinct_(~V_SEQUENCE_IMGT, .keep_all = TRUE) %>% | |
1519 | filter_(~V_GENE_N >= (nrow(data)*gene_min)) %>% | |
1520 | filter_(~V_SEQUENCE_IMGT_N >= seq_min) %>% | |
1521 | mutate_(V_SEQUENCE_IMGT_P_MAX = ~V_SEQUENCE_IMGT_N/V_SEQUENCE_IMGT_N_MAX) %>% | |
1522 | filter_(~V_SEQUENCE_IMGT_P_MAX >= seq_p_of_max) | |
1523 | # Determine the mutation counts of the V sequences and append them to the db | |
1524 | MUTATION_COUNT = getMutCount(modified_db$V_SEQUENCE_IMGT, | |
1525 | modified_db$V_CALL, | |
1526 | germline_db) %>% | |
1527 | sapply(function(x) min(unlist(x))) | |
1528 | if (length(MUTATION_COUNT)==0){ | |
1529 | MUTATION_COUNT = integer(0) | |
1530 | } | |
1531 | merged_db = bind_cols(modified_db, data.frame(MUTATION_COUNT)) | |
1532 | # Strip down the data frame before returning it | |
1533 | if (!full_return) { | |
1534 | merged_db = merged_db %>% | |
1535 | filter_(~MUTATION_COUNT > 0) %>% | |
1536 | select_(~V_GENE, ~MUTATION_COUNT) | |
1537 | } | |
1538 | return(merged_db) | |
1335 | 1539 | } |
1336 | 1540 | |
1337 | 1541 | #' Insert polymorphisms into a nucleotide sequence |
1339 | 1543 | #' \code{insertPolymorphisms} replaces nucleotides in the desired locations of a |
1340 | 1544 | #' provided sequence. |
1341 | 1545 | #' |
1342 | #' | |
1343 | #' @param sequence the starting nucletide sequence | |
1344 | #' @param positions a vector of positions which to be changed | |
1345 | #' @param nucleotides a vector of nucletides to which to change the | |
1346 | #' positions | |
1347 | #' @return a sequence with the desired nucleotides in provided locations | |
1546 | #' @param sequence starting nucleotide sequence. | |
1547 | #' @param positions numeric vector of positions to be changed. | |
1548 | #' @param nucleotides character vector of nucleotides to which to change the | |
1549 | #' positions. | |
1550 | #' | |
1551 | #' @return A sequence with the desired nucleotides in the provided locations. | |
1348 | 1552 | #' |
1349 | 1553 | #' @examples |
1350 | #' insertPolymorphisms("hugged", c(1,6,2), c("t","r","i")) | |
1554 | #' insertPolymorphisms("HUGGED", c(1, 6, 2), c("T", "R", "I")) | |
1351 | 1555 | #' |
1352 | 1556 | #' @export |
1353 | insertPolymorphisms <- function(sequence, positions, nucleotides){ | |
1354 | ||
1355 | if(length(positions) != length(nucleotides)){ | |
1356 | stop("Number of nucleotides and number of positions do not match.") | |
1357 | } | |
1358 | names(positions) = nucleotides | |
1359 | for (i in 1:length(positions)){ | |
1360 | substr(sequence, positions[i], positions[i]) = names(positions[i]) | |
1361 | } | |
1362 | ||
1363 | return(sequence) | |
1557 | insertPolymorphisms <- function(sequence, positions, nucleotides) { | |
1558 | ||
1559 | if(length(positions) != length(nucleotides)){ | |
1560 | stop("Number of nucleotides and number of positions do not match.") | |
1561 | } | |
1562 | names(positions) = nucleotides | |
1563 | for (i in 1:length(positions)){ | |
1564 | substr(sequence, positions[i], positions[i]) = names(positions[i]) | |
1565 | } | |
1566 | ||
1567 | return(sequence) | |
1364 | 1568 | } |
1365 | 1569 | |
1366 | 1570 | # Formatting and Cleanup -------------------------------------------------- |
1370 | 1574 | #' \code{readIgFasta} reads a fasta-formatted file of immunoglobulin (Ig) |
1371 | 1575 | #' sequences and returns a named vector of those sequences. |
1372 | 1576 | #' |
1373 | #' @param fasta_file fasta-formatted file of immunoglobuling sequences | |
1577 | #' @param fasta_file fasta-formatted file of immunoglobulin sequences. | |
1374 | 1578 | #' @param strip_down_name if \code{TRUE}, will extract only the allele name |
1375 | #' from the strings fasta file's sequence names | |
1579 | #' from the strings fasta file's sequence names. | |
1376 | 1580 | #' @param force_caps if \code{TRUE}, will force nucleotides to |
1377 | #' uppercase | |
1378 | #' @return a named vector of strings respresenting Ig alleles | |
1581 | #' uppercase. | |
1582 | #' | |
1583 | #' @return Named vector of strings representing Ig alleles. | |
1379 | 1584 | #' |
1380 | 1585 | #' @seealso \link{writeFasta} to do the inverse. |
1381 | 1586 | #' |
1382 | 1587 | #' @export |
1383 | readIgFasta <- function(fasta_file, | |
1384 | strip_down_name = TRUE, | |
1385 | force_caps = TRUE){ | |
1386 | all_char = readChar(fasta_file, file.info(fasta_file)$size) | |
1387 | split_by_sequence = strsplit(all_char, "[ \t\r\n\v\f]?>") | |
1388 | add_name_break = sapply(split_by_sequence, function(x) sub("[\r\n]",">",x)) | |
1389 | cleaned_up = sapply(add_name_break, function(x) gsub("[ \t\r\n\v\f]", "", x)) | |
1390 | broken_names = sapply(cleaned_up, strsplit, ">") | |
1391 | seqs = sapply(broken_names, "[", 2) | |
1392 | seq_names = sapply(broken_names, "[", 1) | |
1393 | if(force_caps){ seqs = toupper(seqs) } | |
1394 | if(strip_down_name){ seq_names = getAllele(seq_names, strip_d=FALSE) } | |
1395 | names(seqs) = seq_names | |
1396 | return(seqs[which(!is.na(seqs))]) | |
1588 | readIgFasta <- function(fasta_file, strip_down_name=TRUE, force_caps=TRUE) { | |
1589 | all_char = readChar(fasta_file, file.info(fasta_file)$size) | |
1590 | split_by_sequence = strsplit(all_char, "[ \t\r\n\v\f]?>") | |
1591 | add_name_break = sapply(split_by_sequence, function(x) sub("[\r\n]",">",x)) | |
1592 | cleaned_up = sapply(add_name_break, function(x) gsub("[ \t\r\n\v\f]", "", x)) | |
1593 | broken_names = sapply(cleaned_up, strsplit, ">") | |
1594 | ||
1595 | seqs = sapply(broken_names, "[", 2) | |
1596 | seq_names = sapply(broken_names, "[", 1) | |
1597 | if(force_caps) { seqs = toupper(seqs) } | |
1598 | if(strip_down_name){ seq_names = getAllele(seq_names, strip_d=FALSE) } | |
1599 | names(seqs) = seq_names | |
1600 | ||
1601 | return(seqs[which(!is.na(seqs))]) | |
1397 | 1602 | } |
1398 | 1603 | |
1399 | 1604 | #' Write to a fasta file |
1402 | 1607 | #' format. |
1403 | 1608 | #' |
1404 | 1609 | #' @param named_sequences a named vector of strings representing sequences |
1405 | #' @param file the name of the output file | |
1610 | #' @param file the name of the output file. | |
1406 | 1611 | #' @param width the number of characters to be printed per line. |
1407 | #' If not between 1 and 255, width with be infinite. | |
1612 | #' If not between 1 and 255, width will be infinite. | |
1408 | 1613 | #' @param append \code{logical} indicating if the output should be |
1409 | 1614 | #' appended to \code{file} instead of overwriting it |
1410 | 1615 | #' |
1411 | #' @return a named vector of strings respresenting Ig alleles | |
1616 | #' @return A named vector of strings representing Ig alleles. | |
1412 | 1617 | #' |
1413 | 1618 | #' @seealso \link{readIgFasta} to do the inverse. |
1414 | 1619 | #' |
1415 | 1620 | #' @export |
1416 | 1621 | writeFasta <- function(named_sequences, file, width=60, append=FALSE){ |
1417 | . = NULL | |
1418 | seq_names = names(named_sequences) %>% | |
1419 | paste(">", ., "\n", sep="") | |
1420 | seqs = as.character(named_sequences) | |
1421 | if(is.numeric(width) & width > 0 & width < 256){ | |
1422 | width_regex = paste("(.{", width, ",", width, "})", sep="") | |
1423 | seqs = gsub(width_regex, "\\1\n", seqs) | |
1424 | } | |
1425 | seqs = seqs %>% | |
1426 | paste("\n", sep="") %>% | |
1427 | gsub("\n\n", "\n", .) | |
1428 | paste(seq_names, seqs, sep="", collapse="") %>% | |
1429 | cat(file=file, append=append) | |
1622 | . = NULL | |
1623 | seq_names = names(named_sequences) %>% | |
1624 | paste(">", ., "\n", sep="") | |
1625 | seqs = as.character(named_sequences) | |
1626 | if(is.numeric(width) & width > 0 & width < 256){ | |
1627 | width_regex = paste("(.{", width, ",", width, "})", sep="") | |
1628 | seqs = gsub(width_regex, "\\1\n", seqs) | |
1629 | } | |
1630 | seqs = seqs %>% | |
1631 | paste("\n", sep="") %>% | |
1632 | gsub("\n\n", "\n", .) | |
1633 | paste(seq_names, seqs, sep="", collapse="") %>% | |
1634 | cat(file=file, append=append) | |
1430 | 1635 | } |
1431 | 1636 | |
1432 | 1637 | #' Update IGHV allele names |
1433 | 1638 | #' |
1434 | 1639 | #' \code{updateAlleleNames} takes a set of IGHV allele calls and replaces any |
1435 | 1640 | #' outdated names (e.g. IGHV1-f) with the new IMGT names. |
1436 | #' @details The updated allele names are based on IMGT release 201408-4. | |
1437 | #' @note IGMT has removed IGHV2-5*10 and IGHV2-5*07 as it has determined they | |
1438 | #' are actually alleles *02 and *04, respectively. | |
1439 | #' | |
1440 | #' @param allele_calls a vector of strings respresenting IGHV allele names | |
1441 | #' | |
1442 | #' @return vector of strings respresenting updated IGHV allele names | |
1443 | #' | |
1444 | #' @references Xochelli et al. (2014) Immunoglobulin heavy variable (IGHV) genes | |
1445 | #' and alleles: new entities, new names and implications for research and | |
1446 | #' prognostication in chronic lymphocytic leukaemia. Immunogenetics. 67(1):61-6 | |
1641 | #' | |
1642 | #' @param allele_calls a vector of strings representing IGHV allele names. | |
1643 | #' | |
1644 | #' @return Vector of strings representing updated IGHV allele names. | |
1645 | #' | |
1646 | #' @note | |
1647 | #' IMGT has removed \code{IGHV2-5*10} and \code{IGHV2-5*07} as it has determined they | |
1648 | #' are actually alleles \code{02} and \code{04}, respectively. The updated allele | |
1649 | #' names are based on IMGT release 201408-4. | |
1650 | #' | |
1651 | #' @references | |
1652 | #' \enumerate{ | |
1653 | #' \item Xochelli et al. (2014) Immunoglobulin heavy variable (IGHV) genes | |
1654 | #' and alleles: new entities, new names and implications for research and | |
1655 | #' prognostication in chronic lymphocytic leukaemia. Immunogenetics. 67(1):61-6 | |
1656 | #' } | |
1447 | 1657 | #' |
1448 | 1658 | #' @seealso Like \code{updateAlleleNames}, \link{sortAlleles} can help |
1449 | 1659 | #' format a list of allele names. |
1450 | 1660 | #' |
1451 | 1661 | #' @examples |
1452 | 1662 | #' # Create a vector that uses old gene/allele names. |
1453 | #' alleles = c("IGHV1-c*01", "IGHV1-f*02", "IGHV2-5*07") | |
1663 | #' alleles <- c("IGHV1-c*01", "IGHV1-f*02", "IGHV2-5*07") | |
1454 | 1664 | #' |
1455 | 1665 | #' # Update the alleles to the new names |
1456 | 1666 | #' updateAlleleNames(alleles) |
1457 | 1667 | #' |
1458 | 1668 | #' @export |
1459 | updateAlleleNames <- function(allele_calls){ | |
1460 | . = NULL | |
1461 | temporary_names = c("IGHV1-c*", | |
1462 | "IGHV1-f*", | |
1463 | "IGHV3-d*", | |
1464 | "IGHV3-h*", | |
1465 | "IGHV4-b*", | |
1466 | "IGHV5-a*", | |
1467 | "IGHV2-5*10", | |
1468 | "IGHV2-5*07") | |
1469 | definitive_names = c("IGHV1-38-4*", | |
1470 | "IGHV1-69-2*", | |
1471 | "IGHV3-38-3*", | |
1472 | "IGHV3-69-1*", | |
1473 | "IGHV4-38-2*", | |
1474 | "IGHV5-10-1*", | |
1475 | "IGHV2-5*02", | |
1476 | "IGHV2-5*04") | |
1477 | for (i in 1:length(temporary_names)){ | |
1478 | allele_calls = allele_calls %>% | |
1479 | gsub(temporary_names[i], definitive_names[i], ., fixed = TRUE) | |
1480 | } | |
1481 | return(allele_calls) | |
1669 | updateAlleleNames <- function(allele_calls) { | |
1670 | . = NULL | |
1671 | temporary_names = c("IGHV1-c*", | |
1672 | "IGHV1-f*", | |
1673 | "IGHV3-d*", | |
1674 | "IGHV3-h*", | |
1675 | "IGHV4-b*", | |
1676 | "IGHV5-a*", | |
1677 | "IGHV2-5*10", | |
1678 | "IGHV2-5*07") | |
1679 | definitive_names = c("IGHV1-38-4*", | |
1680 | "IGHV1-69-2*", | |
1681 | "IGHV3-38-3*", | |
1682 | "IGHV3-69-1*", | |
1683 | "IGHV4-38-2*", | |
1684 | "IGHV5-10-1*", | |
1685 | "IGHV2-5*02", | |
1686 | "IGHV2-5*04") | |
1687 | for (i in 1:length(temporary_names)){ | |
1688 | allele_calls = allele_calls %>% | |
1689 | gsub(temporary_names[i], definitive_names[i], ., fixed = TRUE) | |
1690 | } | |
1691 | return(allele_calls) | |
1482 | 1692 | } |
1483 | 1693 | |
1484 | 1694 | #' Sort allele names |
1486 | 1696 | #' \code{sortAlleles} returns a sorted vector of strings representing Ig allele |
1487 | 1697 | #' names. Names are first sorted by gene family, then by gene, then by allele. |
1488 | 1698 | #' Duplicated genes have their alleles sorted as if they were part of their |
1489 | #' non-duplicated counterparts (e.g. IGHV1-69D*01 comes after IGHV1-69*01 but | |
1490 | #' before IGHV1-69*02), and non-localized genes (e.g. IGHV1-NL1*01) come last | |
1491 | #' within their gene family. | |
1492 | #' | |
1493 | #' @param allele_calls a vector of strings respresenting Ig allele names | |
1699 | #' non-duplicated counterparts (e.g. \code{IGHV1-69D*01} comes after \code{IGHV1-69*01} | |
1700 | #' but before \code{IGHV1-69*02}), and non-localized genes (e.g. \code{IGHV1-NL1*01}) | |
1701 | #' come last within their gene family. | |
1702 | #' | |
1703 | #' @param allele_calls a vector of strings representing Ig allele names. | |
1494 | 1704 | #' @param method a string defining the method to use when sorting alleles. |
1495 | 1705 | #' If \code{"name"} then sort in lexicographic order. If |
1496 | 1706 | #' \code{"position"} then sort by position in the locus, as |
1497 | 1707 | #' determined by the final two numbers in the gene name. |
1498 | #' @return A sorted vector of strings respresenting Ig allele names | |
1708 | #' @return A sorted vector of strings representing Ig allele names. | |
1499 | 1709 | #' |
1500 | 1710 | #' @seealso Like \code{sortAlleles}, \link{updateAlleleNames} can help |
1501 | 1711 | #' format a list of allele names. |
1502 | 1712 | #' |
1503 | 1713 | #' @examples |
1504 | 1714 | #' # Create a list of allele names |
1505 | #' alleles = c("IGHV1-69D*01","IGHV1-69*01","IGHV1-2*01","IGHV1-69-2*01", | |
1506 | #' "IGHV2-5*01","IGHV1-NL1*01", "IGHV1-2*01,IGHV1-2*05", | |
1507 | #' "IGHV1-2", "IGHV1-2*02", "IGHV1-69*02") | |
1715 | #' alleles <- c("IGHV1-69D*01","IGHV1-69*01","IGHV1-2*01","IGHV1-69-2*01", | |
1716 | #' "IGHV2-5*01","IGHV1-NL1*01", "IGHV1-2*01,IGHV1-2*05", | |
1717 | #' "IGHV1-2", "IGHV1-2*02", "IGHV1-69*02") | |
1508 | 1718 | #' |
1509 | 1719 | #' # Sort the alleles by name |
1510 | 1720 | #' sortAlleles(alleles) |
1514 | 1724 | #' |
1515 | 1725 | #' @export |
1516 | 1726 | sortAlleles <- function(allele_calls, method=c("name", "position")) { |
1517 | # Check arguments | |
1518 | method <- match.arg(method) | |
1519 | ||
1520 | # Standardize format of submitted alleles, first | |
1521 | SUBMITTED_CALLS = getAllele(allele_calls, first = FALSE, strip_d= FALSE) %>% | |
1522 | sort() | |
1523 | allele_df = data.frame(SUBMITTED_CALLS,stringsAsFactors = FALSE) %>% | |
1524 | # Determine the family | |
1525 | mutate_(FAMILY = ~getFamily(SUBMITTED_CALLS)) %>% | |
1526 | # Determine the gene (exclude family); convert letters to numbers for sort | |
1527 | mutate_(GENE = ~getGene(SUBMITTED_CALLS)) %>% | |
1528 | mutate_(GENE1 = ~gsub("[^-]+[-S]([^-\\*D]+).*","\\1",SUBMITTED_CALLS)) %>% | |
1529 | mutate_(GENE1 = ~as.numeric(gsub("[^0-9]+", "99", GENE1))) %>% | |
1530 | # If there is a second gene number, determine that, too | |
1531 | mutate_(GENE2 = ~gsub("[^-]+[-S][^-]+-?","",GENE)) %>% | |
1532 | mutate_(GENE2 = ~as.numeric(gsub("[^0-9]+", "99", GENE2))) %>% | |
1533 | mutate_(ALLELE = ~getAllele(SUBMITTED_CALLS)) %>% | |
1534 | mutate_(ALLELE = ~(sub("[^\\*]+\\*|[^\\*]+$","", | |
1535 | ALLELE))) %>% | |
1536 | mutate_(ALLELE = ~as.numeric(sub("_.+$","", | |
1537 | ALLELE))) | |
1538 | # Convert missing values to 0, sort data frame | |
1539 | allele_df[is.na(allele_df)] = 0 | |
1540 | if (method == "name") { | |
1541 | sorted_df = arrange_(allele_df, ~FAMILY, ~GENE1, ~GENE2, ~ALLELE) | |
1542 | } else if (method == "position") { | |
1543 | sorted_df = arrange_(allele_df, ~desc(GENE1), ~desc(GENE2), ~FAMILY, ~ALLELE) | |
1544 | } | |
1545 | ||
1546 | return(sorted_df$SUBMITTED_CALLS) | |
1727 | # Check arguments | |
1728 | method <- match.arg(method) | |
1729 | ||
1730 | # Standardize format of submitted alleles, first | |
1731 | SUBMITTED_CALLS = getAllele(allele_calls, first = FALSE, strip_d= FALSE) %>% | |
1732 | sort() | |
1733 | allele_df = data.frame(SUBMITTED_CALLS,stringsAsFactors = FALSE) %>% | |
1734 | # Determine the family | |
1735 | mutate_(FAMILY = ~getFamily(SUBMITTED_CALLS)) %>% | |
1736 | # Determine the gene (exclude family); convert letters to numbers for sort | |
1737 | mutate_(GENE = ~getGene(SUBMITTED_CALLS)) %>% | |
1738 | mutate_(GENE1 = ~gsub("[^-]+[-S]([^-\\*D]+).*","\\1",SUBMITTED_CALLS)) %>% | |
1739 | mutate_(GENE1 = ~as.numeric(gsub("[^0-9]+", "99", GENE1))) %>% | |
1740 | # If there is a second gene number, determine that, too | |
1741 | mutate_(GENE2 = ~gsub("[^-]+[-S][^-]+-?","",GENE)) %>% | |
1742 | mutate_(GENE2 = ~as.numeric(gsub("[^0-9]+", "99", GENE2))) %>% | |
1743 | mutate_(ALLELE = ~getAllele(SUBMITTED_CALLS)) %>% | |
1744 | mutate_(ALLELE = ~(sub("[^\\*]+\\*|[^\\*]+$","", | |
1745 | ALLELE))) %>% | |
1746 | mutate_(ALLELE = ~as.numeric(sub("_.+$","", | |
1747 | ALLELE))) | |
1748 | # Convert missing values to 0, sort data frame | |
1749 | allele_df[is.na(allele_df)] = 0 | |
1750 | if (method == "name") { | |
1751 | sorted_df = arrange_(allele_df, ~FAMILY, ~GENE1, ~GENE2, ~ALLELE) | |
1752 | } else if (method == "position") { | |
1753 | sorted_df = arrange_(allele_df, ~desc(GENE1), ~desc(GENE2), ~FAMILY, ~ALLELE) | |
1754 | } | |
1755 | ||
1756 | return(sorted_df$SUBMITTED_CALLS) | |
1547 | 1757 | } |
1548 | 1758 | |
1549 | 1759 | #' Clean up nucleotide sequences |
1550 | 1760 | #' |
1551 | #' \code{cleanSeqs} capitalizes nucleotides, replaces "." with "-", and then | |
1552 | #' replaces all characters besides ACGT- with "N". | |
1553 | #' | |
1554 | #' @param seqs a vector of nucleotide sequences | |
1555 | #' @return A vector of nucleotide sequences | |
1761 | #' \code{cleanSeqs} capitalizes nucleotides and replaces all characters | |
1762 | #' besides \code{c("A", "C", "G", "T", "-", ".")} with \code{"N"}. | |
1763 | #' | |
1764 | #' @param seqs a vector of nucleotide sequences. | |
1765 | #' | |
1766 | #' @return A modified vector of nucleotide sequences. | |
1556 | 1767 | #' |
1557 | 1768 | #' @seealso \link{sortAlleles} and \link{updateAlleleNames} can |
1558 | 1769 | #' help format a list of allele names. |
1559 | 1770 | #' |
1560 | 1771 | #' @examples |
1561 | #' # Create messy nucleotide sequences | |
1562 | #' seqs = c("AGAT.taa-GAG...ATA", | |
1563 | #' "GATACAGTXXXXXAGNNNPPPACA") | |
1564 | #' # Clean them up | |
1772 | #' # Clean messy nucleotide sequences | |
1773 | #' seqs <- c("AGAT.taa-GAG...ATA", "GATACAGTXXZZAGNNPPACA") | |
1565 | 1774 | #' cleanSeqs(seqs) |
1566 | 1775 | #' |
1567 | 1776 | #' @export |
1568 | cleanSeqs <- function(seqs){ | |
1569 | . = NULL | |
1570 | seqs %>% | |
1571 | toupper %>% | |
1572 | gsub(".", "-", . , fixed = TRUE) %>% | |
1573 | gsub("[^ACGT-]", "N", .) %>% | |
1574 | return | |
1777 | cleanSeqs <- function(seqs) { | |
1778 | # . = NULL | |
1779 | # seqs %>% | |
1780 | # toupper %>% | |
1781 | # gsub(".", "-", . , fixed = TRUE) %>% | |
1782 | # gsub("[^ACGT-]", "N", .) %>% | |
1783 | # return | |
1784 | ||
1785 | return (gsub("[^ACGT\\.\\-]", "N", toupper(seqs))) | |
1575 | 1786 | } |
1576 | 1787 | |
1577 | 1788 | |
1583 | 1794 | # position to be analyzed and determines if each sample is mutated at that |
1584 | 1795 | # position |
1585 | 1796 | # |
1586 | # @param clip_db A Change-O db data frame. See | |
1797 | # @param data a Change-O db data.frame. See | |
1587 | 1798 | # \link{findNovelAlleles} for a list of required |
1588 | 1799 | # columns. |
1589 | # @param germline The germline to which all the sequences should be | |
1800 | # @param germline the germline to which all the sequences should be | |
1590 | 1801 | # compared |
1591 | # @param pos_range The range of positions within the sequence for which | |
1802 | # @param pos_range the range of positions within the sequence for which | |
1592 | 1803 | # the rows should be duplicated and checked for mutation |
1593 | 1804 | # |
1594 | 1805 | # @return A data frame with rows duplicated for all the positions to be |
1595 | 1806 | # analyzed and a column indicating whether the position is mutated in |
1596 | 1807 | # comparison to the germline |
1597 | 1808 | # |
1598 | positionMutations <- function(clip_db, germline, pos_range){ | |
1599 | . = NULL | |
1600 | pos_db = pos_range %>% | |
1601 | length() %>% | |
1602 | rep("clip_db", .) %>% | |
1603 | paste(collapse=",") %>% | |
1604 | paste("bind_rows(",., ")") %>% | |
1605 | parse(text=.) %>% | |
1606 | eval() | |
1607 | pos_db$POSITION = c(sapply(pos_range, rep, nrow(clip_db))) | |
1608 | # Find which positions are mutated | |
1609 | pos_db = pos_db %>% | |
1610 | mutate_(NT = ~substring(SEQUENCE_IMGT, POSITION, POSITION)) %>% | |
1611 | mutate_(GERM_NT = ~substring(germline, POSITION, POSITION)) %>% | |
1612 | mutate_(MUTATED = ~(NT != GERM_NT & NT != "N" & NT != "-" & NT != "")) %>% | |
1613 | mutate_(OBSERVED = ~(NT != "-" & NT != "")) | |
1614 | return(pos_db) | |
1809 | positionMutations <- function(data, germline, pos_range){ | |
1810 | . = NULL | |
1811 | pos_db = pos_range %>% | |
1812 | length() %>% | |
1813 | rep("data", .) %>% | |
1814 | paste(collapse=",") %>% | |
1815 | paste("bind_rows(",., ")") %>% | |
1816 | parse(text=.) %>% | |
1817 | eval() | |
1818 | pos_db$POSITION = c(sapply(pos_range, rep, nrow(data))) | |
1819 | # Find which positions are mutated | |
1820 | pos_db = pos_db %>% | |
1821 | mutate_(NT = ~substring(SEQUENCE_IMGT, POSITION, POSITION)) %>% | |
1822 | mutate_(GERM_NT = ~substring(germline, POSITION, POSITION)) %>% | |
1823 | mutate_(MUTATED = ~(NT != GERM_NT & NT != "N" & NT != "-" & NT != "")) %>% | |
1824 | mutate_(OBSERVED = ~(NT != "-" & NT != "")) | |
1825 | return(pos_db) | |
1615 | 1826 | } |
1616 | 1827 | |
1617 | 1828 | # Find sequences carrying certain levels of mutation |
1620 | 1831 | # sequences and returns the subset of sequences that meet the given mutation |
1621 | 1832 | # count limits |
1622 | 1833 | # |
1623 | # @param clip_db A Change-O db data frame. See | |
1834 | # @param data a Change-O db data frame. See | |
1624 | 1835 | # \link{findNovelAlleles} for a list of required |
1625 | 1836 | # columns. |
1626 | # @param germline The germline to which all the sequences should be | |
1837 | # @param germline the germline to which all the sequences should be | |
1627 | 1838 | # compared |
1628 | # @param pos_range The range of positions within the sequences that should | |
1839 | # @param pos_range the range of positions within the sequences that should | |
1629 | 1840 | # be analyzed for mutations |
1630 | # @param pos_range The range of mutation counts that sequences can have | |
1841 | # @param mut_range the range of mutation counts that sequences can have | |
1631 | 1842 | # and still be included |
1632 | 1843 | # |
1633 | # @return A data frame containing only the subset carrying the desired levels | |
1844 | # @return | |
1845 | # A data.frame containing only the subset carrying the desired levels | |
1634 | 1846 | # of mutation |
1635 | 1847 | # |
1636 | mutationRangeSubset <- function(clip_db, germline, mut_range, pos_range){ | |
1637 | . = NULL | |
1638 | pads = paste(rep("-", min(pos_range)-1), collapse="") | |
1639 | clip_db$MUT_COUNT = clip_db$SEQUENCE_IMGT %>% | |
1640 | substring(min(pos_range), max(pos_range)) %>% | |
1641 | paste(pads, ., sep="") %>% | |
1642 | getMutatedPositions(germline) %>% | |
1643 | sapply(length) | |
1644 | clip_db = clip_db %>% | |
1645 | filter_(~MUT_COUNT %in% mut_range) | |
1646 | return(clip_db) | |
1848 | mutationRangeSubset <- function(data, germline, mut_range, pos_range){ | |
1849 | . = NULL | |
1850 | pads = paste(rep("-", min(pos_range)-1), collapse="") | |
1851 | data$MUT_COUNT = data$SEQUENCE_IMGT %>% | |
1852 | substring(min(pos_range), max(pos_range)) %>% | |
1853 | paste(pads, ., sep="") %>% | |
1854 | getMutatedPositions(germline) %>% | |
1855 | sapply(length) | |
1856 | data = data %>% | |
1857 | filter_(~MUT_COUNT %in% mut_range) | |
1858 | return(data) | |
1647 | 1859 | } |
1648 | 1860 | |
1649 | 1861 | # Find lower range of y-intercept confidence interval |
1650 | 1862 | # |
1651 | 1863 | # \code{findLowerY} finds the lower range of y-intercept confidence interval |
1652 | 1864 | # |
1865 | # @details If mut_min is 1, a y-intercept will be searched for at 0. If | |
1866 | # mut_min is above 1, then the "y-intercept" will be found at x = mut_min - 1. | |
1867 | # | |
1653 | 1868 | # @param x A vector of x values |
1654 | 1869 | # @param y A vector of y values |
1655 | 1870 | # @param mut_min The value where the lowest mutation count should be |
1657 | 1872 | # @param alpha The alpha cutoff to be used in constructing the |
1658 | 1873 | # confidence interval |
1659 | 1874 | # |
1660 | # @details If mut_min is 1, a y-intercept will be searched for at 0. If | |
1661 | # mut_min is above 1, then the "y-intercept" will be found at x = mut_min - 1. | |
1662 | # | |
1663 | 1875 | # @return The lower bound of the confidence interval for the intercept of |
1664 | 1876 | # the fitted linear model |
1665 | 1877 | # |
1666 | 1878 | findLowerY = function(x, y, mut_min, alpha){ |
1667 | y = y+1-mut_min | |
1668 | lowerY = suppressWarnings(confint(lm(x ~ y),level=1-2*alpha)[[1]]) | |
1669 | return(lowerY) | |
1879 | y = y + 1 - mut_min | |
1880 | lowerY = suppressWarnings(confint(lm(x ~ y), level=1 - 2*alpha)[[1]]) | |
1881 | return(lowerY) | |
1670 | 1882 | } |
1671 | 1883 | |
1672 | 1884 | # Enhanced substring extraction |
1680 | 1892 | # @return a substring |
1681 | 1893 | # |
1682 | 1894 | superSubstring = function(string, positions){ |
1683 | if(length(string) != 1){ stop("Please submit only one string.") } | |
1684 | chars = sapply(positions, function(x) substring(string, x, x)) | |
1685 | return(paste(chars, collapse="")) | |
1895 | if(length(string) != 1){ stop("Please submit only one string.") } | |
1896 | chars = sapply(positions, function(x) substring(string, x, x)) | |
1897 | return(paste(chars, collapse="")) | |
1686 | 1898 | } |
1687 | 1899 | |
1688 | 1900 | |
1701 | 1913 | # describing the heights of the rows in the layout. Will |
1702 | 1914 | # be passed to grid.layout. Default is all plots have |
1703 | 1915 | # the same height. |
1704 | multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL, heights=NULL) { | |
1705 | ||
1706 | # Make a list from the ... arguments and plotlist | |
1707 | plots <- c(list(...), plotlist) | |
1708 | ||
1709 | numPlots = length(plots) | |
1710 | if (is.null(heights)) { heights = rep(1,numPlots) } | |
1711 | ||
1712 | # If layout is NULL, then use 'cols' to determine layout | |
1713 | if (is.null(layout)) { | |
1714 | # Make the panel | |
1715 | # ncol: Number of columns of plots | |
1716 | # nrow: Number of rows needed, calculated from # of cols | |
1717 | layout <- matrix(seq(1, cols * ceiling(numPlots/cols)), | |
1718 | ncol = cols, nrow = ceiling(numPlots/cols)) | |
1719 | } | |
1720 | ||
1721 | if (numPlots==1) { | |
1722 | print(plots[[1]]) | |
1723 | ||
1724 | } else { | |
1725 | # Set up the page | |
1726 | grid.newpage() | |
1727 | pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout), | |
1728 | heights=heights))) | |
1729 | ||
1730 | # Make each plot, in the correct location | |
1731 | for (i in 1:numPlots) { | |
1732 | # Get the i,j matrix positions of the regions that contain this subplot | |
1733 | matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE)) | |
1734 | ||
1735 | print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row, | |
1736 | layout.pos.col = matchidx$col)) | |
1737 | } | |
1738 | } | |
1739 | } | |
1740 | ||
1916 | multiplot <- function(..., plotlist=NULL, cols=1, layout=NULL, heights=NULL) { | |
1917 | # Make a list from the ... arguments and plotlist | |
1918 | plots <- c(list(...), plotlist) | |
1919 | numPlots <- length(plots) | |
1920 | ncol <- cols | |
1921 | nrow <- ceiling(numPlots/cols) | |
1922 | if (is.null(heights)) { heights = rep(1,nrow) } | |
1923 | if (is.null(layout)) { | |
1924 | # Make the panel | |
1925 | # ncol: Number of columns of plots | |
1926 | # nrow: Number of rows needed, calculated from # of cols | |
1927 | layout <- matrix(seq(1, cols * nrow), | |
1928 | ncol = cols, nrow = nrow) | |
1929 | } | |
1930 | grob <- gridExtra::arrangeGrob(grobs=plots, | |
1931 | nrow=nrow, ncol=ncol, layout_matrix = layout, | |
1932 | heights=heights) | |
1933 | p <- ggplot() + | |
1934 | layer(data = data.frame(x = NA), | |
1935 | stat = StatIdentity, | |
1936 | position = PositionIdentity, | |
1937 | # geom = GeomDrawGrob, | |
1938 | geom = GeomCustomAnn, | |
1939 | inherit.aes = FALSE, | |
1940 | params = list(grob = grob, | |
1941 | xmin = 0, | |
1942 | xmax = 1, | |
1943 | ymin = 0, | |
1944 | ymax = 1)) + | |
1945 | scale_x_continuous(expand=c(0,0)) + | |
1946 | scale_y_continuous(expand=c(0,0)) | |
1947 | p | |
1948 | } |
2 | 2 | # @author Daniel Gadala-Maria |
3 | 3 | # @copyright Copyright 2016 Kleinstein Lab, Yale University. All rights reserved |
4 | 4 | # @license Creative Commons Attribution-NonCommercial-ShareAlike 4.0 Unported |
5 | # @version 0.3.0 | |
6 | # @date 2017.05.29 | |
7 | 5 | |
8 | 6 | |
9 | 7 | #' tigger |
10 | 8 | #' |
11 | 9 | #' Here we provide a \strong{T}ool for \strong{I}mmuno\strong{g}lobulin |
12 | #' \strong{G}enotype \strong{E}lucidation via | |
13 | #' \strong{R}ep-Seq (TIgGER). TIgGER inferrs the set of Ig alleles carried by an | |
10 | #' \strong{G}enotype \strong{E}lucidation via \strong{R}ep-Seq (TIgGER). | |
11 | #' TIgGER infers the set of Ig alleles carried by an | |
14 | 12 | #' individual (including any novel alleles) and then uses this set of alleles to |
15 | 13 | #' correct the initial assignments given to sample sequences by existing tools. |
16 | 14 | #' |
17 | #' @details Immunoglobulin Repertoire-Sequencing (Rep-Seq) data is currently the | |
15 | #' @details | |
16 | #' Immunoglobulin repertoire sequencing (AIRR-Seq, Rep-Seq) data is currently the | |
18 | 17 | #' subject of much study. A key step in analyzing these data involves assigning |
19 | 18 | #' the closest known V(D)J germline alleles to the (often somatically mutated) |
20 | 19 | #' sample sequences using a tool such as IMGT/HighV-QUEST. However, if the |
21 | 20 | #' sample utilizes alleles not in the germline database used for alignment, this |
22 | 21 | #' step will fail. Additionally, this alignment has an associated error rate of |
23 | #' ~5 percent, notably among sequences carrying a large number of somatic | |
22 | #' ~5%, notably among sequences carrying a large number of somatic | |
24 | 23 | #' mutations. The purpose of TIgGER is to address these issues. |
25 | 24 | #' |
26 | #' @section Core tigger functions: | |
25 | #' @section Allele detection and genotyping: | |
27 | 26 | #' \itemize{ |
28 | #' \item \link{findNovelAlleles}: Detect novel alleles | |
29 | #' \item \link{plotNovel}: Plot evidence of novel alleles | |
30 | #' \item \link{inferGenotype}: Infer an Ig genotype | |
31 | #' \item \link{plotGenotype}: A colorful genotype visualization | |
32 | #' \item \link{genotypeFasta}: Convert a genotype to sequences | |
33 | #' \item \link{reassignAlleles}: Correct allele calls | |
27 | #' \item \link{findNovelAlleles}: Detect novel alleles. | |
28 | #' \item \link{plotNovel}: Plot evidence of novel alleles. | |
29 | #' \item \link{inferGenotype}: Infer an Ig genotype using a frequency approach. | |
30 | #' \item \link{inferGenotypeBayesian}: Infer an Ig genotype using a Bayesian approach. | |
31 | #' \item \link{plotGenotype}: A colorful genotype visualization. | |
32 | #' \item \link{genotypeFasta}: Convert a genotype to sequences. | |
33 | #' \item \link{reassignAlleles}: Correct allele calls. | |
34 | #' \item \link{generateEvidence}: Generate evidence for the genotype and | |
35 | #' allele detection inference. | |
34 | 36 | #' } |
35 | 37 | #' |
36 | #' @section Mutation-related functions: | |
38 | #' @section Mutation handling: | |
37 | 39 | #' \itemize{ |
38 | #' \item \link{getMutatedPositions}: Find mutation locations | |
39 | #' \item \link{getMutCount}: Find distance from germline | |
40 | #' \item \link{findUnmutatedCalls}: Subset unmutated sequences | |
40 | #' \item \link{getMutatedPositions}: Find mutation locations. | |
41 | #' \item \link{getMutCount}: Find distance from germline. | |
42 | #' \item \link{findUnmutatedCalls}: Subset unmutated sequences. | |
41 | 43 | #' \item \link{getPopularMutationCount}: Find most common sequence's |
42 | #' mutation count | |
43 | #' \item \link{insertPolymorphisms}: Insert SNPs into a sequence | |
44 | #' mutation count. | |
45 | #' \item \link{insertPolymorphisms}: Insert SNPs into a sequence. | |
44 | 46 | #' } |
45 | 47 | #' |
46 | #' @section Input and formatting: | |
48 | #' @section Input, output and formatting: | |
47 | 49 | #' \itemize{ |
48 | #' \item \link{readIgFasta}: Read a fasta file of Ig sequences | |
49 | #' \item \link{updateAlleleNames}: Correct outdated allele names | |
50 | #' \item \link{sortAlleles}: Sort allele names intelligently | |
51 | #' \item \link{cleanSeqs}: Standardize sequence format | |
50 | #' \item \link{readIgFasta}: Read a fasta file of Ig sequences. | |
51 | #' \item \link{updateAlleleNames}: Correct outdated allele names. | |
52 | #' \item \link{sortAlleles}: Sort allele names intelligently. | |
53 | #' \item \link{cleanSeqs}: Standardize sequence format. | |
52 | 54 | #' } |
53 | 55 | #' |
54 | 56 | #' @name tigger |
55 | 57 | #' @docType package |
56 | #' @references Gadala-Maria \emph{et al}. (2015) Automated analysis of | |
57 | #' high-throughput B cell sequencing data reveals a high frequency of novel | |
58 | #' immunoglobulin V gene segment alleles. \emph{PNAS}. 112(8):E862-70. | |
58 | #' @references | |
59 | #' \enumerate{ | |
60 | #' \item Gadala-Maria, et al. (2015) Automated analysis of high-throughput B cell | |
61 | #' sequencing data reveals a high frequency of novel immunoglobulin V gene | |
62 | #' segment alleles. PNAS. 112(8):E862-70. | |
63 | #' } | |
59 | 64 | #' |
60 | 65 | #' @import ggplot2 |
61 | #' @importFrom alakazam getAllele getGene getFamily DNA_COLORS | |
66 | #' @importFrom alakazam getAllele getGene getFamily translateDNA DNA_COLORS | |
62 | 67 | #' @importFrom doParallel registerDoParallel |
63 | 68 | #' @importFrom dplyr do n desc %>% |
64 | 69 | #' glimpse distinct distinct_ |
65 | 70 | #' as_data_frame data_frame data_frame_ |
66 | #' bind_cols bind_rows combine | |
71 | #' bind_cols bind_rows combine inner_join | |
67 | 72 | #' filter filter_ select select_ arrange arrange_ |
68 | 73 | #' group_by group_by_ ungroup |
69 | 74 | #' mutate mutate_ transmute transmute_ |
71 | 76 | #' slice slice_ |
72 | 77 | #' @importFrom foreach foreach %dopar% registerDoSEQ |
73 | 78 | #' @importFrom graphics plot |
74 | #' @importFrom grid grid.layout grid.newpage pushViewport viewport | |
79 | #' @importFrom gridExtra arrangeGrob | |
80 | #' @importFrom gtools ddirichlet | |
75 | 81 | #' @importFrom iterators icount |
76 | 82 | #' @importFrom lazyeval interp |
77 | 83 | #' @importFrom parallel clusterEvalQ clusterExport makeCluster stopCluster |
84 | #' @importFrom rlang .data | |
85 | #' @importFrom shazam calcObservedMutations | |
78 | 86 | #' @importFrom stats na.omit setNames ecdf sd cor cov median mad |
79 | 87 | #' confint lm |
80 | #' @importFrom tidyr gather gather_ spread spread_ | |
88 | #' @importFrom stringi stri_length | |
89 | #' @importFrom tidyr gather gather_ spread spread_ unnest | |
81 | 90 | NULL |
1 | 1 | |
2 | 2 | High-throughput sequencing of B cell immunoglobulin receptors is providing unprecedented insight into adaptive immunity. A key step in analyzing these data involves assignment of the germline V, D and J gene segment alleles that comprise each immunoglobulin sequence by matching them against a database of known V(D)J alleles. However, this process will fail for sequences that utilize previously undetected alleles, whose frequency in the population is unclear. |
3 | 3 | |
4 | **TIgGER is a computational method that significantly improves V(D)J allele assignments by first determining the complete set of gene segments carried by an individual (including novel alleles) from V(D)J-rearrange sequences. TIgGER can then infer a subject's genotype from these sequences, and use this genotype to correct the initial V(D)J allele assignments.** | |
4 | TIgGER is a computational method that significantly improves V(D)J allele assignments by first determining the complete set of gene segments carried by an individual (including novel alleles) from V(D)J-rearranged sequences. TIgGER can then infer a subject's genotype from these sequences, and use this genotype to correct the initial V(D)J allele assignments. | |
5 | 5 | |
6 | 6 | The application of TIgGER continues to identify a surprisingly high frequency of novel alleles in humans, highlighting the critical need for this approach. (TIgGER, however, can and has been used with data from other species.) |
7 | 7 |
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
0 | 0 | ## ---- eval=TRUE, message=FALSE, warning=FALSE---------------------------- |
1 | # Load packages required for this example | |
1 | 2 | library(tigger) |
2 | 3 | library(dplyr) |
3 | # Load example sequence data and example germline database | |
4 | data(sample_db, germline_ighv) | |
5 | 4 | |
6 | 5 | ## ---- eval=TRUE, warning=FALSE------------------------------------------- |
7 | 6 | # Detect novel alleles |
8 | novel_df <- findNovelAlleles(sample_db, germline_ighv, nproc=1) | |
7 | novel <- findNovelAlleles(SampleDb, GermlineIGHV, nproc=1) | |
9 | 8 | |
10 | 9 | ## ---- eval=TRUE, warning=FALSE------------------------------------------- |
11 | 10 | # Extract and view the rows that contain successful novel allele calls |
12 | novel <- selectNovel(novel_df) | |
13 | novel[1:3] | |
11 | novel_rows <- selectNovel(novel) | |
12 | novel_rows[1:3] | |
14 | 13 | |
15 | 14 | ## ---- eval=TRUE, warning=FALSE, fig.width=6, fig.height=8---------------- |
16 | 15 | # Plot evidence of the first (and only) novel allele from the example data |
17 | plotNovel(sample_db, novel[1, ]) | |
16 | plotNovel(SampleDb, novel[1, ]) | |
18 | 17 | |
19 | 18 | ## ---- eval=TRUE, warning=FALSE, fig.width=4, fig.height=3---------------- |
20 | 19 | # Infer the individual's genotype, using only unmutated sequences and checking |
21 | 20 | # for the use of the novel alleles inferred in the earlier step. |
22 | geno <- inferGenotype(sample_db, find_unmutated = TRUE, | |
23 | germline_db = germline_ighv, novel_df = novel_df) | |
21 | geno <- inferGenotype(SampleDb, germline_db=GermlineIGHV, novel=novel, | |
22 | find_unmutated=TRUE) | |
24 | 23 | # Save the genotype sequences to a vector |
25 | genotype_seqs <- genotypeFasta(geno, germline_ighv, novel_df) | |
24 | genotype_db <- genotypeFasta(geno, GermlineIGHV, novel) | |
26 | 25 | # Visualize the genotype and sequence counts |
27 | 26 | print(geno) |
28 | 27 | # Make a colorful visualization. Bars indicate presence, not proportion. |
29 | 28 | plotGenotype(geno, text_size = 10) |
30 | 29 | |
30 | ## ---- eval=TRUE, warning=FALSE, fig.width=4, fig.height=3---------------- | |
31 | # Infer the individual's genotype, using the bayesian method | |
32 | geno_bayesian <- inferGenotypeBayesian(SampleDb, germline_db=GermlineIGHV, | |
33 | novel=novel, find_unmutated=TRUE) | |
34 | # Visualize the genotype and sequence counts | |
35 | print(geno_bayesian) | |
36 | # Make a colorful visualization. Bars indicate presence, not proportion. | |
37 | plotGenotype(geno_bayesian, text_size=10) | |
31 | 38 | |
32 | 39 | ## ---- eval=TRUE, warning=FALSE------------------------------------------- |
33 | 40 | # Use the personalized genotype to determine corrected allele assignments |
34 | V_CALL_GENOTYPED <- reassignAlleles(sample_db, genotype_seqs) | |
35 | # Append the corrected calls to the original data.frame | |
36 | sample_db <- bind_cols(sample_db, V_CALL_GENOTYPED) | |
41 | # Updated genotype will be placed in the V_CALL_GENOTYPED column | |
42 | sample_db <- reassignAlleles(SampleDb, genotype_db) | |
37 | 43 | |
38 | 44 | ## ---- eval=TRUE, warning=FALSE------------------------------------------- |
39 | 45 | # Find the set of alleles in the original calls that were not in the genotype |
40 | 46 | not_in_genotype <- sample_db$V_CALL %>% |
41 | strsplit(",") %>% | |
42 | unlist() %>% | |
43 | unique() %>% | |
44 | setdiff(names(genotype_seqs)) | |
47 | strsplit(",") %>% | |
48 | unlist() %>% | |
49 | unique() %>% | |
50 | setdiff(names(genotype_db)) | |
45 | 51 | |
46 | 52 | # Determine the fraction of calls that were ambiguous before/after correction |
47 | 53 | # and the fraction that contained original calls to non-genotype alleles. Note |
48 | 54 | # that by design, only genotype alleles are allowed in "after" calls. |
49 | data.frame(Ambiguous = c(mean(grepl(",",sample_db$V_CALL)), | |
50 | mean(grepl(",",sample_db$V_CALL_GENOTYPED))), | |
51 | NotInGenotype = c(mean(sample_db$V_CALL %in% not_in_genotype), | |
52 | mean(sample_db$V_CALL_GENOTYPED %in% not_in_genotype)), | |
53 | row.names = c("Before", "After")) %>% | |
55 | data.frame(Ambiguous=c(mean(grepl(",", sample_db$V_CALL)), | |
56 | mean(grepl(",", sample_db$V_CALL_GENOTYPED))), | |
57 | NotInGenotype=c(mean(sample_db$V_CALL %in% not_in_genotype), | |
58 | mean(sample_db$V_CALL_GENOTYPED %in% not_in_genotype)), | |
59 | row.names=c("Before", "After")) %>% | |
54 | 60 | t() %>% round(3) |
55 | 61 | |
56 |
29 | 29 | |
30 | 30 | ## Introduction |
31 | 31 | |
32 | Immunoglobulin Repertoire-Sequencing (Rep-Seq) data is currently the subject of | |
33 | much study. A key step in analyzing these data involves assigning the closest | |
34 | known V(D)J germline alleles to the (often somatically mutated) sample sequences | |
35 | using a tool such as IMGT/HighV-QUEST ([[1]][1]). However, if the sample utilizes | |
36 | alleles not in the germline database used for alignment, this step will fail. | |
37 | Additionally, this alignment has an associated error rate of ~5% ([[2]][2]), | |
38 | notably among sequences carrying a large number of somatic mutations. | |
32 | Adaptive immune receptor repertoire sequencing (AIRR-Seq, Rep-Seq) data is | |
33 | currently the subject of much study. A key step in analyzing these data involves | |
34 | assigning the closest known V(D)J germline alleles to the (often somatically mutated) | |
35 | sample sequences using a tool such as IMGT/HighV-QUEST ([[1]][1]). However, | |
36 | if the sample utilizes alleles not in the germline database used for alignment, | |
37 | this step will fail. Additionally, this alignment has an associated error rate | |
38 | of ~5% ([[2]][2]), notably among sequences carrying a large number of somatic | |
39 | mutations. | |
39 | 40 | |
40 | 41 | Here we provide a **T**ool for **I**mmuno**g**lobulin **G**enotype |
41 | 42 | **E**lucidation via **R**ep-Seq (TIgGER). TIgGER addresses these issues by |
42 | inferring the set of Ig alleles carried by an individual (including any novel | |
43 | alleles) and then using this set of alleles to correct the initial assignments | |
44 | given to sample sequences by existing tools. | |
45 | ||
46 | Additional information is available in: | |
43 | inferring the set of Immunoglobulin (Ig) alleles carried by an individual | |
44 | (including any novel alleles) and then using this set of alleles to correct | |
45 | the initial assignments given to sample sequences by existing tools. | |
46 | ||
47 | This vignette covers the following tasks: | |
48 | ||
49 | 1. Inferring the presence of novel IGHV alleles not in the germline database. | |
50 | 2. Inferring the personalized IGHV genotype of a sample. | |
51 | 3. Correcting the IGHV allele calls of a sample based on the IGHV genotype. | |
52 | ||
53 | Additional information about the methods used by TIgGER is available in: | |
47 | 54 | |
48 | 55 | [Gadala-Maria D, Yaari G, Uduman M, Kleinstein SH (2015) Automated analysis of |
49 | 56 | high-throughput B cell sequencing data reveals a high frequency of novel |
50 | 57 | immunoglobulin V gene segment alleles. *PNAS* |
51 | 58 | 112(8):E862-70](http://www.pnas.org/content/early/2015/02/05/1417683112). |
52 | 59 | |
53 | ||
54 | 60 | ## Input |
55 | 61 | |
56 | 62 | TIgGER requires two main inputs: |
57 | 63 | |
58 | 1. Pre-processed Rep-Seq data | |
64 | 1. Pre-processed Ig sequence data | |
59 | 65 | 2. Database germline sequences |
60 | 66 | |
61 | Rep-seq data is input as a data frame where each row represents a unique | |
62 | observation and and columns represent data about that observation. The required | |
63 | names of the required columns are provided below along with a description of | |
64 | each. | |
67 | AIRR-seq data is input as a data frame following the Change-O standard where | |
68 | each row represents a unique observation and columns represent data about | |
69 | that observation. The names of the required columns are provided below | |
70 | along with a description of each. | |
65 | 71 | |
66 | 72 | Column Name | Description |
67 | 73 | ----------------------|--------------------------------------------------------- |
70 | 76 | `J_CALL` | (Comma separated) name(s) of the nearest J allele(s) |
71 | 77 | `JUNCTION_LENGTH` | Length of the junction region of the V(D)J sample |
72 | 78 | |
73 | An example dataset is provided with the `tigger` package. It contains unique | |
74 | functional sequences assigned to IGHV1 family genes isolated from individual | |
75 | PGP1 (referenced in Gadala-Maria *et al.* 2015). | |
79 | An example dataset is provided with the `tigger` package as `SampleDb`. It | |
80 | contains unique functional sequences assigned to IGHV1 family genes isolated | |
81 | from individual PGP1 (referenced in Gadala-Maria *et al.* 2015). | |
76 | 82 | |
77 | 83 | The database of germline sequences should be provided in FASTA format with |
78 | 84 | sequences gapped according to the IMGT numbering scheme ([[3]][3]). IGHV alleles in |
79 | the IMGT database (build 201408-4) are provided with this package. You may read | |
80 | in your own fasta file using `readIgFasta`. | |
85 | the IMGT database (build 201408-4) are provided with this package as `GermlineIGHV`. | |
86 | You may read in your own fasta file using `readIgFasta`. | |
81 | 87 | |
82 | 88 | ```{r, eval=TRUE, message=FALSE, warning=FALSE} |
89 | # Load packages required for this example | |
83 | 90 | library(tigger) |
84 | 91 | library(dplyr) |
85 | # Load example sequence data and example germline database | |
86 | data(sample_db, germline_ighv) | |
87 | ``` | |
88 | ||
89 | ## Running TIgGER | |
90 | ||
91 | The functions provided by this package can be used to perform any combination of | |
92 | the following: | |
93 | ||
94 | 1. Infer the presence of novel IGHV alleles not in the germline database | |
95 | 2. Infer the individual's IGHV genotype | |
96 | 3. Correct the IGHV allele calls of the samples based on the IGHV genotype | |
97 | ||
98 | ### Novel Alleles | |
92 | ``` | |
93 | ||
94 | ## Novel allele detection | |
99 | 95 | |
100 | 96 | Potential novel alleles can be detected by TIgGER. Some of these may be included |
101 | 97 | in the genotype later (see below). `findNovelAlleles` will return a `data.frame` |
108 | 104 | |
109 | 105 | ```{r, eval=TRUE, warning=FALSE} |
110 | 106 | # Detect novel alleles |
111 | novel_df <- findNovelAlleles(sample_db, germline_ighv, nproc=1) | |
107 | novel <- findNovelAlleles(SampleDb, GermlineIGHV, nproc=1) | |
112 | 108 | ``` |
113 | 109 | |
114 | 110 | ```{r, eval=TRUE, warning=FALSE} |
115 | 111 | # Extract and view the rows that contain successful novel allele calls |
116 | novel <- selectNovel(novel_df) | |
117 | novel[1:3] | |
112 | novel_rows <- selectNovel(novel) | |
113 | novel_rows[1:3] | |
118 | 114 | ``` |
119 | 115 | |
120 | 116 | The TIgGER procedure for identifying novel alleles (see citation above) involves |
147 | 143 | |
148 | 144 | ```{r, eval=TRUE, warning=FALSE, fig.width=6, fig.height=8} |
149 | 145 | # Plot evidence of the first (and only) novel allele from the example data |
150 | plotNovel(sample_db, novel[1, ]) | |
151 | ``` | |
152 | ||
153 | ### Genotype | |
154 | An individual's genotype can be inferred using the function `inferGenotype`. | |
155 | This function will remove from the genotype rare/erroneous allele calls which | |
156 | may result from mutations in allele-differentiating regions. This is done by | |
157 | determining the fewest alleles that account for nearly all (default is 7/8) of | |
158 | the allele calls made. The user may opt to only use sequences which perfectly | |
159 | match germline alleles, and may opt to include potential novel alleles. | |
160 | (The genotype output is designed to be human readable, though `plotGenotype` | |
161 | can be used to make a colorful visualization.) For each allele, the | |
162 | number of sequences which match the germline are listed in the same order as | |
163 | the alleles are listed. The total number of sequences that match any allele of | |
164 | that gene is also given. To output these alleles as a names vector of nucleotide | |
165 | sequences, the user may use the function `genotypeFasta`. To save this vector to | |
166 | a fasta file, `writeFasta` may be used. | |
146 | plotNovel(SampleDb, novel[1, ]) | |
147 | ``` | |
148 | ||
149 | ## Inferring genotypes | |
150 | ||
151 | An individual's genotype can be inferred using the functions `inferGenotype` or | |
152 | `inferGenotypeBayesian`. Using either of these functions removes from the | |
153 | genotype rare/erroneous allele calls which may result from mutations in | |
154 | allele-differentiating regions. `inferGenotype` uses a frequency method to | |
155 | decide which alleles belong to the subject's genotype, whereas | |
156 | `inferGenotypeBayesian` infers a subject's genotype by applying a Bayesian | |
157 | framework and provides a confidence estimate associated with | |
158 | the genotype calls. | |
159 | ||
160 | ||
161 | ### Frequency genotyping approach | |
162 | ||
163 | `inferGenotype` identifies the fewest alleles that account for | |
164 | nearly all (default is 7/8) of the allele calls made. The user may opt to only | |
165 | use sequences which perfectly match germline alleles, and may opt to include | |
166 | potential novel alleles. (The genotype output is designed to be human readable, | |
167 | though `plotGenotype` can be used to make a colorful visualization.) For each | |
168 | allele, the number of sequences which match the germline are listed in the same | |
169 | order as the alleles are listed. The total number of sequences that match any | |
170 | allele of that gene is also given. To output these alleles as a named vector of | |
171 | nucleotide sequences, the user may use the function `genotypeFasta`. To save | |
172 | this vector to a fasta file, `writeFasta` may be used. | |
167 | 173 | |
168 | 174 | ```{r, eval=TRUE, warning=FALSE, fig.width=4, fig.height=3} |
169 | 175 | # Infer the individual's genotype, using only unmutated sequences and checking |
170 | 176 | # for the use of the novel alleles inferred in the earlier step. |
171 | geno <- inferGenotype(sample_db, find_unmutated = TRUE, | |
172 | germline_db = germline_ighv, novel_df = novel_df) | |
177 | geno <- inferGenotype(SampleDb, germline_db=GermlineIGHV, novel=novel, | |
178 | find_unmutated=TRUE) | |
173 | 179 | # Save the genotype sequences to a vector |
174 | genotype_seqs <- genotypeFasta(geno, germline_ighv, novel_df) | |
180 | genotype_db <- genotypeFasta(geno, GermlineIGHV, novel) | |
175 | 181 | # Visualize the genotype and sequence counts |
176 | 182 | print(geno) |
177 | 183 | # Make a colorful visualization. Bars indicate presence, not proportion. |
178 | 184 | plotGenotype(geno, text_size = 10) |
179 | ||
180 | ``` | |
181 | ||
182 | ### Corrected Allele Calls | |
185 | ``` | |
186 | ||
187 | ### Bayesian genotyping approach | |
188 | ||
189 | The method `inferGenotypeBayesian` analyzes the posterior probabilities of | |
190 | possible allele distributions, considering up to four distinct alleles per | |
191 | V gene, corresponding to a gene duplication with both loci being heterozygous | |
192 | (i.e., homozygous, heterozygous with one copy of each allele, etc.). The | |
193 | posterior probabilities for these four possible models are compared and a Bayes | |
194 | factor is calculated for the two most probable models. This Bayes factor | |
195 | reflects the confidence in the genotyping call of the method. The bayesian | |
196 | method doesn't use the strict cutoff criterion `fraction_to_explain` that | |
197 | `inferGenotype` uses wherein only the minimum set of alleles explaining | |
198 | 88% (7/8) of apparently-unmutated sequences are included in the genotype. | |
199 | ||
200 | ||
201 | ```{r, eval=TRUE, warning=FALSE, fig.width=4, fig.height=3} | |
202 | # Infer the individual's genotype, using the bayesian method | |
203 | geno_bayesian <- inferGenotypeBayesian(SampleDb, germline_db=GermlineIGHV, | |
204 | novel=novel, find_unmutated=TRUE) | |
205 | # Visualize the genotype and sequence counts | |
206 | print(geno_bayesian) | |
207 | # Make a colorful visualization. Bars indicate presence, not proportion. | |
208 | plotGenotype(geno_bayesian, text_size=10) | |
209 | ``` | |
210 | ||
211 | ## Correcting allele calls | |
183 | 212 | |
184 | 213 | Finally, the original V allele calls may be limited to only those within the |
185 | 214 | inferred genotype. This can be done by using the function `reassignAlleles`. |
186 | By corrected the calls in this manner, the user can greatly reduce the numer of | |
215 | By correcting the calls in this manner, the user can greatly reduce the number of | |
187 | 216 | ambiguous allele calls (where a single sample sequence is assigned to multiple |
188 | 217 | V alleles, thus preventing the mutations analysis of allele-differentiating |
189 | 218 | positions). Additionally, assignments to erroneous not-in-genotype alleles |
191 | 220 | |
192 | 221 | ```{r, eval=TRUE, warning=FALSE} |
193 | 222 | # Use the personalized genotype to determine corrected allele assignments |
194 | V_CALL_GENOTYPED <- reassignAlleles(sample_db, genotype_seqs) | |
195 | # Append the corrected calls to the original data.frame | |
196 | sample_db <- bind_cols(sample_db, V_CALL_GENOTYPED) | |
223 | # Updated genotype will be placed in the V_CALL_GENOTYPED column | |
224 | sample_db <- reassignAlleles(SampleDb, genotype_db) | |
197 | 225 | ``` |
198 | 226 | |
199 | 227 | From here, one may proceed with further downstream analyses, but with the |
200 | 228 | advantage of having much-improved allele calls. Besides having discovered |
201 | alleles not in the IGMT database, the calls made by IMGT have been tailored to | |
229 | alleles not in the IMGT database, the calls made by IMGT have been tailored to | |
202 | 230 | the subject's genotype, greatly reducing the number of problematic calls, as |
203 | 231 | can be seen below. |
204 | 232 | |
205 | 233 | ```{r, eval=TRUE, warning=FALSE} |
206 | 234 | # Find the set of alleles in the original calls that were not in the genotype |
207 | 235 | not_in_genotype <- sample_db$V_CALL %>% |
208 | strsplit(",") %>% | |
209 | unlist() %>% | |
210 | unique() %>% | |
211 | setdiff(names(genotype_seqs)) | |
236 | strsplit(",") %>% | |
237 | unlist() %>% | |
238 | unique() %>% | |
239 | setdiff(names(genotype_db)) | |
212 | 240 | |
213 | 241 | # Determine the fraction of calls that were ambiguous before/after correction |
214 | 242 | # and the fraction that contained original calls to non-genotype alleles. Note |
215 | 243 | # that by design, only genotype alleles are allowed in "after" calls. |
216 | data.frame(Ambiguous = c(mean(grepl(",",sample_db$V_CALL)), | |
217 | mean(grepl(",",sample_db$V_CALL_GENOTYPED))), | |
218 | NotInGenotype = c(mean(sample_db$V_CALL %in% not_in_genotype), | |
219 | mean(sample_db$V_CALL_GENOTYPED %in% not_in_genotype)), | |
220 | row.names = c("Before", "After")) %>% | |
244 | data.frame(Ambiguous=c(mean(grepl(",", sample_db$V_CALL)), | |
245 | mean(grepl(",", sample_db$V_CALL_GENOTYPED))), | |
246 | NotInGenotype=c(mean(sample_db$V_CALL %in% not_in_genotype), | |
247 | mean(sample_db$V_CALL_GENOTYPED %in% not_in_genotype)), | |
248 | row.names=c("Before", "After")) %>% | |
221 | 249 | t() %>% round(3) |
222 | ||
223 | ``` | |
224 | ||
250 | ``` | |
225 | 251 | |
226 | 252 | ## References |
227 | 253 | |
232 | 258 | [1]: http://www.imgt.org/IMGTindex/IMGTHighV-QUEST.html "Alamyar et al. (2010)" |
233 | 259 | [2]: http://www.ncbi.nlm.nih.gov/pubmed/20147303 "Munshaw and Kepler (2010)" |
234 | 260 | [3]: http://www.ncbi.nlm.nih.gov/pubmed/12477501 "Lefranc et al. (2003)" |
261 |
Binary diff not shown
0 | library(markr) | |
1 | library(tigger) | |
2 | ||
3 | # Directories | |
4 | pkg_path <- "." | |
5 | doc_path <- "./docs" | |
6 | ||
7 | # Build | |
8 | build_mkdocs(pkg_path, doc_path=doc_path, yaml=F)⏎ |
0 | % Generated by roxygen2: do not edit by hand | |
1 | % Please edit documentation in R/data.R | |
2 | \docType{data} | |
3 | \name{GermlineIGHV} | |
4 | \alias{GermlineIGHV} | |
5 | \title{Human IGHV germlines} | |
6 | \format{Values correspond to IMGT-gapped nucleotide sequences (with | |
7 | nucleotides capitalized and gaps represented by ".") while names correspond | |
8 | to stripped-down IMGT allele names (e.g. "IGHV1-18*01").} | |
9 | \description{ | |
10 | A \code{character} vector of all 344 human IGHV germline gene segment alleles | |
11 | in IMGT/GENE-DB release 201408-4. | |
12 | } | |
13 | \references{ | |
14 | \enumerate{ | |
15 | \item Xochelli, et al. (2014) Immunoglobulin heavy variable (IGHV) genes and | |
16 | alleles: new entities, new names and implications for research and | |
17 | prognostication in chronic lymphocytic leukaemia. Immunogenetics. 67(1):61-6. | |
18 | } | |
19 | } | |
20 | \keyword{data} |
0 | % Generated by roxygen2: do not edit by hand | |
1 | % Please edit documentation in R/data.R | |
2 | \docType{data} | |
3 | \name{SampleDb} | |
4 | \alias{SampleDb} | |
5 | \title{Example human immune repertoire data} | |
6 | \format{A \code{data.frame} where rows correspond to unique V(D)J sequences and | |
7 | columns include: | |
8 | \itemize{ | |
9 | \item \code{"SEQUENCE_IMGT"}: IMGT-gapped V(D)J nucleotide sequence. | |
10 | \item \code{"V_CALL"}: IMGT/HighV-QUEST V segment allele calls. | |
11 | \item \code{"D_CALL"}: IMGT/HighV-QUEST D segment allele calls. | |
12 | \item \code{"J_CALL"}: IMGT/HighV-QUEST J segment allele calls. | |
13 | \item \code{"JUNCTION_LENGTH"}: Junction region length. | |
14 | }} | |
15 | \description{ | |
16 | A \code{data.frame} of example V(D)J immunoglobulin sequences derived from a | |
17 | single individual (PGP1), sequenced on the Roche 454 platform, and assigned by | |
18 | IMGT/HighV-QUEST to IGHV1 family alleles. | |
19 | } | |
20 | \references{ | |
21 | \enumerate{ | |
22 | \item Gadala-Maria, et al. (2015) Automated analysis of high-throughput B cell | |
23 | sequencing data reveals a high frequency of novel immunoglobulin V gene | |
24 | segment alleles. PNAS. 112(8):E862-70. | |
25 | } | |
26 | } | |
27 | \keyword{data} |
0 | % Generated by roxygen2: do not edit by hand | |
1 | % Please edit documentation in R/data.R | |
2 | \docType{data} | |
3 | \name{SampleGenotype} | |
4 | \alias{SampleGenotype} | |
5 | \title{Example genotype inference results} | |
6 | \format{A \code{data.frame} where rows correspond to genes carried by an | |
7 | individual and columns list the alleles of those genes and their counts.} | |
8 | \description{ | |
9 | A \code{data.frame} of genotype inference results from \link{inferGenotype} | |
10 | after novel allele detection via \link{findNovelAlleles}. | |
11 | Source data was a collection of V(D)J immunoglobulin sequences derived from a single | |
12 | individual (PGP1), sequenced on the Roche 454 platform, and assigned by | |
13 | IMGT/HighV-QUEST to IGHV1 family alleles. | |
14 | } | |
15 | \references{ | |
16 | \enumerate{ | |
17 | \item Gadala-Maria, et al. (2015) Automated analysis of high-throughput B cell | |
18 | sequencing data reveals a high frequency of novel immunoglobulin V gene | |
19 | segment alleles. PNAS. 112(8):E862-70. | |
20 | } | |
21 | } | |
22 | \seealso{ | |
23 | See \link{inferGenotype} for detailed column descriptions. | |
24 | } | |
25 | \keyword{data} |
0 | % Generated by roxygen2: do not edit by hand | |
1 | % Please edit documentation in R/data.R | |
2 | \docType{data} | |
3 | \name{SampleNovel} | |
4 | \alias{SampleNovel} | |
5 | \title{Example novel allele detection results} | |
6 | \format{A \code{data.frame} where rows correspond to alleles checked for | |
7 | polymorphisms and columns give results as well as parameters used to run | |
8 | the test.} | |
9 | \description{ | |
10 | A \code{data.frame} of novel allele detection results from \link{findNovelAlleles}. | |
11 | Source data was a collection of V(D)J immunoglobulin sequences derived from a single | |
12 | individual (PGP1), sequenced on the Roche 454 platform, and assigned by | |
13 | IMGT/HighV-QUEST to IGHV1 family alleles. | |
14 | } | |
15 | \references{ | |
16 | \enumerate{ | |
17 | \item Gadala-Maria, et al. (2015) Automated analysis of high-throughput B cell | |
18 | sequencing data reveals a high frequency of novel immunoglobulin V gene | |
19 | segment alleles. PNAS. 112(8):E862-70. | |
20 | } | |
21 | } | |
22 | \seealso{ | |
23 | See \link{findNovelAlleles} for detailed column descriptions. | |
24 | } | |
25 | \keyword{data} |
6 | 6 | cleanSeqs(seqs) |
7 | 7 | } |
8 | 8 | \arguments{ |
9 | \item{seqs}{a vector of nucleotide sequences} | |
9 | \item{seqs}{a vector of nucleotide sequences.} | |
10 | 10 | } |
11 | 11 | \value{ |
12 | A vector of nucleotide sequences | |
12 | A modified vector of nucleotide sequences. | |
13 | 13 | } |
14 | 14 | \description{ |
15 | \code{cleanSeqs} capitalizes nucleotides, replaces "." with "-", and then | |
16 | replaces all characters besides ACGT- with "N". | |
15 | \code{cleanSeqs} capitalizes nucleotides and replaces all characters | |
16 | besides \code{c("A", "C", "G", "T", "-", ".")} with \code{"N"}. | |
17 | 17 | } |
18 | 18 | \examples{ |
19 | # Create messy nucleotide sequences | |
20 | seqs = c("AGAT.taa-GAG...ATA", | |
21 | "GATACAGTXXXXXAGNNNPPPACA") | |
22 | # Clean them up | |
19 | # Clean messy nucleotide sequences | |
20 | seqs <- c("AGAT.taa-GAG...ATA", "GATACAGTXXZZAGNNPPACA") | |
23 | 21 | cleanSeqs(seqs) |
24 | 22 | |
25 | 23 | } |
3 | 3 | \alias{findNovelAlleles} |
4 | 4 | \title{Find novel alleles from repertoire sequencing data} |
5 | 5 | \usage{ |
6 | findNovelAlleles(clip_db, germline_db, v_call = "V_CALL", | |
6 | findNovelAlleles(data, germline_db, v_call = "V_CALL", | |
7 | 7 | germline_min = 200, min_seqs = 50, auto_mutrange = TRUE, |
8 | 8 | mut_range = 1:10, pos_range = 1:312, y_intercept = 0.125, |
9 | 9 | alpha = 0.05, j_max = 0.15, min_frac = 0.75, nproc = 1) |
10 | 10 | } |
11 | 11 | \arguments{ |
12 | \item{clip_db}{a \code{data.frame} in Change-O format. See details.} | |
12 | \item{data}{a \code{data.frame} in Change-O format. See details.} | |
13 | 13 | |
14 | 14 | \item{germline_db}{a vector of named nucleotide germline sequences |
15 | matching the V calls in \code{clip_db}} | |
15 | matching the V calls in \code{data}.} | |
16 | 16 | |
17 | \item{v_call}{name of the column in clip_db with V allele calls. | |
17 | \item{v_call}{name of the column in \code{data} with V allele calls. | |
18 | 18 | Default is V_CALL.} |
19 | 19 | |
20 | 20 | \item{germline_min}{the minimum number of sequences that must have a |
36 | 36 | \item{pos_range}{the range of IMGT-numbered positions that should be |
37 | 37 | considered by the algorithm} |
38 | 38 | |
39 | \item{y_intercept}{the y-intercept above which positions should be | |
39 | \item{y_intercept}{the y-intercept threshold above which positions should be | |
40 | 40 | considered potentially polymorphic} |
41 | 41 | |
42 | \item{alpha}{the alpha cutoff to be used when constructing the | |
43 | confidence interval for the y-intercept} | |
42 | \item{alpha}{the alpha value used for determining whether the | |
43 | fit y-intercept is greater than the \code{y_intercept} | |
44 | threshold} | |
44 | 45 | |
45 | 46 | \item{j_max}{the maximum fraction of sequences perfectly aligning |
46 | 47 | to a potential novel allele that are allowed to |
54 | 55 | \item{nproc}{the number of processors to use} |
55 | 56 | } |
56 | 57 | \value{ |
57 | a \code{data.frame} with a row for each known allele analyzed. | |
58 | A \code{data.frame} with a row for each known allele analyzed. | |
58 | 59 | Besides metadata on the parameters used in the search, each row will have |
59 | 60 | either a note as to where the polymorphism-finding algorithm exited or a |
60 | nucleotide sequence for the predicted novel allele. | |
61 | nucleotide sequence for the predicted novel allele, along with columns providing | |
62 | additional evidence. | |
63 | ||
64 | The output contains the following columns: | |
65 | \itemize{ | |
66 | \item \code{GERMLINE_CALL}: The input (uncorrected) V call. | |
67 | \item \code{NOTE}: Comments regarding the inference. | |
68 | \item \code{POLYMORPHISM_CALL}: The novel allele call. | |
69 | \item \code{NT_SUBSTITUTIONS}: Mutations identified in the novel allele, relative | |
70 | to the reference germline (\code{GERMLINE_CALL}) | |
71 | \item \code{NOVEL_IMGT}: The novel allele sequence. | |
72 | \item \code{NOVEL_IMGT_COUNT}: The number of times the sequence \code{NOVEL_IMGT} | |
73 | is found in the input data. Considers the subsequence of \code{NOVEL_IMGT} | |
74 | in the \code{pos_range}. | |
75 | \item \code{NOVEL_IMGT_UNIQUE_J}: Number of distinct J calls associated to \code{NOVEL_IMGT} | |
76 | in the input data. Considers the subsequence of \code{NOVEL_IMGT} in the \code{pos_range}. | |
77 | \item \code{NOVEL_IMGT_UNIQUE_CDR3}: Number of distinct CDR3 sequences associated | |
78 | with \code{NOVEL_IMGT} in the input data. Considers the subsequence of \code{NOVEL_IMGT} | |
79 | in the \code{pos_range}. | |
80 | \item \code{PERFECT_MATCH_COUNT}: Final number of sequences retained to call the new | |
81 | allele. These are unique sequences that have V segments that perfectly match | |
82 | the predicted germline in the \code{pos_range}. | |
83 | \item \code{PERFECT_MATCH_FREQ}: \code{PERFECT_MATCH_COUNT / GERMLINE_CALL_COUNT} | |
84 | \item \code{GERMLINE_CALL_COUNT}: The number of sequences with the \code{GERMLINE_CALL} | |
85 | in the input data that were initially considered for the analysis. | |
86 | \item \code{GERMLINE_CALL_FREQ}: The fraction of sequences with the \code{GERMLINE_CALL} | |
87 | in the input data initially considered for the analysis. | |
88 | \item \code{GERMLINE_IMGT}: Germline sequence for \code{GERMLINE_CALL}. | |
89 | \item \code{GERMLINE_IMGT_COUNT}: The number of times the \code{GERMLINE_IMGT} | |
90 | sequence is found in the input data. | |
91 | \item \code{MUT_MIN}: Minimum mutation considered by the algorithm. | |
92 | \item \code{MUT_MAX}: Maximum mutation considered by the algorithm. | |
93 | \item \code{MUT_PASS_COUNT}: Number of sequences in the mutation range. | |
94 | \item \code{POS_MIN}: First position of the sequence considered by the algorithm (IMGT numbering). | |
95 | \item \code{POS_MAX}: Last position of the sequence considered by the algorithm (IMGT numbering). | |
96 | \item \code{Y_INTERCEPT}: The y-intercept above which positions were considered | |
97 | potentially polymorphic. | |
98 | \item \code{Y_INTERCEPT_PASS}: Number of positions that pass the \code{Y_INTERCEPT} threshold. | |
99 | \item \code{SNP_PASS}: Number of sequences that pass the \code{Y_INTERCEPT} threshold and are | |
100 | within the desired nucleotide range (\code{min_seqs}). | |
101 | \item \code{UNMUTATED_COUNT}: Number of unmutated sequences. | |
102 | \item \code{UNMUTATED_FREQ}: Number of unmutated sequences over \code{GERMLINE_IMGT_COUNT}. | |
103 | \item \code{UNMUTATED_SNP_J_GENE_LENGTH_COUNT}: Number of distinct combinations | |
104 | of SNP, J gene, and junction length. | |
105 | \item \code{SNP_MIN_SEQS_J_MAX_PASS}: Number of SNPs that pass both the \code{min_seqs} | |
106 | and \code{j_max} thresholds. | |
107 | \item \code{ALPHA}: Significance threshold to be used when constructing the | |
108 | confidence interval for the y-intercept. | |
109 | \item \code{MIN_SEQS}: Input \code{min_seqs}. The minimum number of total sequences | |
110 | (within the desired mutational range and nucleotide range) required | |
111 | for the samples to be considered. | |
112 | \item \code{J_MAX}: Input \code{j_max}. The maximum fraction of sequences perfectly | |
113 | aligning to a potential novel allele that are allowed to utilize a particular | |
114 | combination of junction length and J gene. | |
115 | \item \code{MIN_FRAC}: Input \code{min_frac}. The minimum fraction of sequences that must | |
116 | have usable nucleotides in a given position for that position to be considered. | |
117 | } | |
118 | ||
119 | The following comments can appear in the \code{NOTE} column: | |
120 | ||
121 | \itemize{ | |
122 | \item \emph{Novel allele found}: A novel allele was detected. | |
123 | \item \emph{Plurality sequence too rare}: No sequence is frequent enough to pass | |
124 | the J test (\code{j_max}). | |
125 | \item \emph{A J-junction combination is too prevalent}: Not enough J diversity (\code{j_max}). | |
126 | \item \emph{No positions pass y-intercept test}: No positions above \code{y_intercept}. | |
127 | \item \emph{Insufficient sequences in desired mutational range}: | |
128 | \code{mut_range} and \code{pos_range}. | |
129 | \item \emph{Not enough sequences}: Not enough sequences in the desired mutational | |
130 | range and nucleotide range (\code{min_seqs}). | |
131 | \item \emph{No unmutated versions of novel allele found}: All observed variants of the | |
132 | allele are mutated. | |
133 | } | |
61 | 134 | } |
62 | 135 | \description{ |
63 | 136 | \code{findNovelAlleles} analyzes mutation patterns in sequences thought to |
65 | 138 | might be polymorphic. |
66 | 139 | } |
67 | 140 | \details{ |
68 | A \code{data.frame} in Change-O format contains the following | |
69 | columns: | |
70 | \itemize{ | |
71 | \item \code{"SEQUENCE_IMGT"} containing the IMGT-gapped nucleotide sequence | |
72 | \item \code{"V_CALL"} containing the IMGT/V-QUEST V allele call(s) | |
73 | \item \code{"J_CALL"} containing the IMGT/V-QUEST J allele call(s) | |
74 | \item \code{"JUNCTION_LENGTH"} containing the junction length | |
75 | } | |
76 | 141 | The TIgGER allele-finding algorithm, briefly, works as follows: |
77 | 142 | Mutations are determined through comparison to the provided germline. |
78 | 143 | Mutation frequency at each *position* is determined as a function of |
83 | 148 | allele utilize a wide range of combinations of J gene and junction length. |
84 | 149 | } |
85 | 150 | \examples{ |
86 | # Load example data and germlines | |
87 | data(sample_db) | |
88 | data(germline_ighv) | |
89 | ||
151 | \donttest{ | |
90 | 152 | # Find novel alleles and return relevant data |
91 | \dontrun{novel_df = findNovelAlleles(sample_db, germline_ighv)} | |
153 | novel <- findNovelAlleles(SampleDb, GermlineIGHV) | |
154 | } | |
92 | 155 | |
93 | 156 | } |
94 | 157 | \seealso{ |
95 | 158 | \link{plotNovel} to visualize the data supporting any |
96 | 159 | novel alleles hypothesized to be present in the data and |
97 | 160 | \link{inferGenotype} to determine if the novel alleles are frequent |
98 | enought to be included in the subject's genotype | |
161 | enough to be included in the subject's genotype. | |
99 | 162 | } |
7 | 7 | } |
8 | 8 | \arguments{ |
9 | 9 | \item{allele_calls}{a vector of strings representing Ig allele calls, |
10 | where multiple calls are separated by a comma} | |
10 | where multiple calls are separated by a comma.} | |
11 | 11 | |
12 | 12 | \item{sample_seqs}{V(D)J-rearranged sample sequences matching the order |
13 | of the given \code{allele_calls}} | |
13 | of the given \code{allele_calls}.} | |
14 | 14 | |
15 | 15 | \item{germline_db}{a vector of named nucleotide germline sequences} |
16 | 16 | } |
17 | 17 | \value{ |
18 | 18 | A vector of strings containing the members of \code{allele_calls} |
19 | that represent unmutated sequences | |
19 | that represent unmutated sequences. | |
20 | 20 | } |
21 | 21 | \description{ |
22 | 22 | \code{findUnmutatedCalls} determines which allele calls would represent a |
25 | 25 | sequence, only the subset that would represent a perfect match is returned. |
26 | 26 | } |
27 | 27 | \examples{ |
28 | # Load data | |
29 | data(germline_ighv) | |
30 | data(sample_db) | |
31 | ||
32 | 28 | # Find which of the sample alleles are unmutated |
33 | calls <- findUnmutatedCalls(sample_db$V_CALL, sample_db$SEQUENCE_IMGT, | |
34 | germline_db=germline_ighv) | |
29 | calls <- findUnmutatedCalls(SampleDb$V_CALL, SampleDb$SEQUENCE_IMGT, | |
30 | germline_db=GermlineIGHV) | |
35 | 31 | |
36 | 32 | } |
0 | % Generated by roxygen2: do not edit by hand | |
1 | % Please edit documentation in R/evidence.R | |
2 | \name{generateEvidence} | |
3 | \alias{generateEvidence} | |
4 | \title{Generate evidence} | |
5 | \usage{ | |
6 | generateEvidence(data, novel, genotype, genotype_db, germline_db, | |
7 | fields = NULL) | |
8 | } | |
9 | \arguments{ | |
10 | \item{data}{a \code{data.frame} containing sequence data that has been | |
11 | passed through \link{reassignAlleles} to correct the allele | |
12 | assignments.} | |
13 | ||
14 | \item{novel}{the \code{data.frame} returned by \link{findNovelAlleles}.} | |
15 | ||
16 | \item{genotype}{the \code{data.frame} of alleles generated with \link{inferGenotype} | |
17 | denoting the genotype of the subject.} | |
18 | ||
19 | \item{genotype_db}{a vector of named nucleotide germline sequences in the genotype. | |
20 | Returned by \link{genotypeFasta}.} | |
21 | ||
22 | \item{germline_db}{the original uncorrected germline database used by | |
23 | \link{findNovelAlleles} to identify novel alleles.} | |
24 | ||
25 | \item{fields}{character vector of column names used to split the data to | |
26 | identify novel alleles, if any. If \code{NULL} then the data is | |
27 | not divided by grouping variables.} | |
28 | } | |
29 | \value{ | |
30 | Returns the \code{genotype} input \code{data.frame} with the following additional columns | |
31 | providing supporting evidence for each inferred allele: | |
32 | ||
33 | \itemize{ | |
34 | \item \code{FIELD_ID}: Data subset identifier, defined with the input parameter \code{fields}. | |
35 | \item A variable number of columns, specified with the input parameter \code{fields}. | |
36 | \item \code{POLYMORPHISM_CALL}: The novel allele call. | |
37 | \item \code{NOVEL_IMGT}: The novel allele sequence. | |
38 | \item \code{CLOSEST_REFERENCE}: The closest reference gene and allele in | |
39 | the \code{germline_db} database. | |
40 | \item \code{CLOSEST_REFERENCE_IMGT}: Sequence of the closest reference gene and | |
41 | allele in the \code{germline_db} database. | |
42 | \item \code{GERMLINE_CALL}: The input (uncorrected) V call. | |
43 | \item \code{GERMLINE_IMGT}: Germline sequence for \code{GERMLINE_CALL}. | |
44 | \item \code{NT_DIFF}: Number of nucleotides that differ between the new allele and | |
45 | the closest reference (\code{CLOSEST_REFERENCE}) in the \code{germline_db} database. | |
46 | \item \code{NT_SUBSTITUTIONS}: A comma separated list of specific nucleotide | |
47 | differences (e.g. \code{112G>A}) in the novel allele. | |
48 | \item \code{AA_DIFF}: Number of amino acids that differ between the new allele and the closest | |
49 | reference (\code{CLOSEST_REFERENCE}) in the \code{germline_db} database. | |
50 | \item \code{AA_SUBSTITUTIONS}: A comma separated list with specific amino acid | |
51 | differences (e.g. \code{96A>N}) in the novel allele. | |
52 | \item \code{SEQUENCES}: Number of sequences unambiguously assigned to this allele. | |
53 | \item \code{UNMUTATED_SEQUENCES}: Number of records with the unmutated novel allele sequence. | |
54 | \item \code{UNMUTATED_FREQUENCY}: Proportion of records with the unmutated novel allele | |
55 | sequence (\code{UNMUTATED_SEQUENCES / SEQUENCES}). | |
56 | \item \code{ALLELIC_PERCENTAGE}: Percentage at which the (unmutated) allele is observed | |
57 | in the sequence dataset compared to other (unmutated) alleles. | |
58 | \item \code{UNIQUE_JS}: Number of unique J sequences found associated with the | |
59 | novel allele. The sequences are those that have been unambiguously assigned | |
60 | to the novel allele (\code{POLYMORPHISM_CALL}). | |
61 | \item \code{UNIQUE_CDR3S}: Number of unique CDR3s associated with the inferred allele. | |
62 | The sequences are those that have been unambiguously assigned to the | |
63 | novel allele (\code{POLYMORPHISM_CALL}). | |
64 | \item \code{MUT_MIN}: Minimum mutation considered by the algorithm. | |
65 | \item \code{MUT_MAX}: Maximum mutation considered by the algorithm. | |
66 | \item \code{POS_MIN}: First position of the sequence considered by the algorithm (IMGT numbering). | |
67 | \item \code{POS_MAX}: Last position of the sequence considered by the algorithm (IMGT numbering). | |
68 | \item \code{Y_INTERCEPT}: The y-intercept above which positions were considered | |
69 | potentially polymorphic. | |
70 | \item \code{ALPHA}: Significance threshold to be used when constructing the | |
71 | confidence interval for the y-intercept. | |
72 | \item \code{MIN_SEQS}: Input \code{min_seqs}. The minimum number of total sequences | |
73 | (within the desired mutational range and nucleotide range) required | |
74 | for the samples to be considered. | |
75 | \item \code{J_MAX}: Input \code{j_max}. The maximum fraction of sequences perfectly | |
76 | aligning to a potential novel allele that are allowed to utilize a particular | |
77 | combination of junction length and J gene. | |
78 | \item \code{MIN_FRAC}: Input \code{min_frac}. The minimum fraction of sequences that must | |
79 | have usable nucleotides in a given position for that position to be considered. | |
80 | \item \code{NOTE}: Comments regarding the novel allele inference. | |
81 | } | |
82 | } | |
83 | \description{ | |
84 | \code{generateEvidence} builds a table of evidence metrics for the final novel V | |
85 | allele detection and genotyping inferences. | |
86 | } | |
87 | \examples{ | |
88 | \donttest{ | |
89 | # Generate input data | |
90 | novel <- findNovelAlleles(SampleDb, GermlineIGHV) | |
91 | genotype <- inferGenotype(SampleDb, find_unmutated=TRUE, germline_db=GermlineIGHV, | |
92 | novel=novel) | |
93 | genotype_db <- genotypeFasta(genotype, GermlineIGHV, novel) | |
94 | data_db <- reassignAlleles(SampleDb, genotype_db) | |
95 | ||
96 | # Assemble evidence table | |
97 | evidence <- generateEvidence(data_db, novel, genotype, genotype_db, GermlineIGHV) | |
98 | } | |
99 | ||
100 | } | |
101 | \seealso{ | |
102 | See \link{findNovelAlleles}, \link{inferGenotype} and \link{genotypeFasta} | |
103 | for generating the required input. | |
104 | } |
0 | % Generated by roxygen2: do not edit by hand | |
1 | % Please edit documentation in R/data.R | |
2 | \docType{data} | |
3 | \name{genotype} | |
4 | \alias{genotype} | |
5 | \title{Example of an Inferred Genotype} | |
6 | \format{A \code{data.frame} where rows correspond to genes carried by an | |
7 | individual and columns lists the alleles of those genes and their counts.} | |
8 | \description{ | |
9 | Example VDJ-rearranged immunoglobulin Rep-Seq sequences derived from a single | |
10 | individual (PGP1), sequenced on the Roche 454 platform, and thought by | |
11 | IMGT/V-QUEST to utilize IGHV1 family alleles, as processed by | |
12 | \link{findNovelAlleles} and \link{inferGenotype} | |
13 | } | |
14 | \references{ | |
15 | Gadala-Maria \emph{et al}. (2015) Automated analysis of | |
16 | high-throughput B cell sequencing data reveals a high frequency of novel | |
17 | immunoglobulin V gene segment alleles. \emph{PNAS}. 112(8):E862-70. | |
18 | } | |
19 | \keyword{data} |
3 | 3 | \alias{genotypeFasta} |
4 | 4 | \title{Return the nucleotide sequences of a genotype} |
5 | 5 | \usage{ |
6 | genotypeFasta(genotype, germline_db, novel_df = NA) | |
6 | genotypeFasta(genotype, germline_db, novel = NA) | |
7 | 7 | } |
8 | 8 | \arguments{ |
9 | \item{genotype}{a table of alleles denoting a genotype, as returned by | |
10 | \link{inferGenotype}} | |
9 | \item{genotype}{a \code{data.frame} of alleles denoting a genotype, | |
10 | as returned by \link{inferGenotype}.} | |
11 | 11 | |
12 | 12 | \item{germline_db}{a vector of named nucleotide germline sequences |
13 | matching the alleles detailed in \code{genotype}} | |
13 | matching the alleles detailed in \code{genotype}.} | |
14 | 14 | |
15 | \item{novel_df}{an optional \code{data.frame} containing putative | |
15 | \item{novel}{an optional \code{data.frame} containing putative | |
16 | 16 | novel alleles of the type returned by |
17 | \link{findNovelAlleles}} | |
17 | \link{findNovelAlleles}.} | |
18 | 18 | } |
19 | 19 | \value{ |
20 | 20 | A named vector of strings containing the germline nucleotide |
21 | sequences of the alleles in the provided genotype | |
21 | sequences of the alleles in the provided genotype. | |
22 | 22 | } |
23 | 23 | \description{ |
24 | 24 | \code{genotypeFasta} converts a genotype table into a vector of nucleotide |
25 | 25 | sequences. |
26 | 26 | } |
27 | 27 | \examples{ |
28 | # Load example data | |
29 | data(germline_ighv) | |
30 | data(novel_df) | |
31 | data(genotype) | |
32 | ||
33 | 28 | # Find the sequences that correspond to the genotype |
34 | genotype_seqs = genotypeFasta(genotype, germline_ighv, novel_df) | |
35 | ||
29 | genotype_db <- genotypeFasta(SampleGenotype, GermlineIGHV, SampleNovel) | |
36 | 30 | |
37 | 31 | } |
38 | 32 | \seealso{ |
0 | % Generated by roxygen2: do not edit by hand | |
1 | % Please edit documentation in R/data.R | |
2 | \docType{data} | |
3 | \name{germline_ighv} | |
4 | \alias{germline_ighv} | |
5 | \title{Human IGHV germlines} | |
6 | \format{Values correspond to IMGT-gaped nuceltoide sequences (with | |
7 | nucleotides capitalized and gaps represented by ".") while names correspond | |
8 | to stripped-down IMGT allele names (e.g. "IGHV1-18*01").} | |
9 | \description{ | |
10 | A \code{character} vector of all 344 human IGHV germline gene segment alleles | |
11 | in IMGT Gene-db release 201408-4. | |
12 | } | |
13 | \references{ | |
14 | Xochelli \emph{et al}. (2014) Immunoglobulin heavy variable | |
15 | (IGHV) genes and alleles: new entities, new names and implications for | |
16 | research and prognostication in chronic lymphocytic leukaemia. | |
17 | \emph{Immunogenetics}. 67(1):61-6. | |
18 | } | |
19 | \keyword{data} |
26 | 26 | contained within the call |
27 | 27 | } |
28 | 28 | \examples{ |
29 | # Load germline database | |
30 | data(germline_ighv) | |
29 | # Insert a mutation into a germline sequence | |
30 | s2 <- s3 <- GermlineIGHV[1] | |
31 | stringi::stri_sub(s2, 103, 103) <- "G" | |
32 | stringi::stri_sub(s3, 107, 107) <- "C" | |
31 | 33 | |
32 | # Use createGermlines to insert a mutation into a germline sequence | |
33 | #sample_seqs = c(germline_ighv[2], | |
34 | # createGermlines(germline_ighv[1], 103, "G"), | |
35 | # createGermlines(germline_ighv[1], 107, "C")) | |
34 | sample_seqs <- c(GermlineIGHV[2], s2, s3) | |
36 | 35 | |
37 | 36 | # Pretend that one sample sequence has received an ambiguous allele call |
38 | #sample_alleles = c(paste(names(germline_ighv[1:2]), collapse=","), | |
39 | # names(germline_ighv[2]), | |
40 | # names(germline_ighv[1])) | |
37 | sample_alleles <- c(paste(names(GermlineIGHV[1:2]), collapse=","), | |
38 | names(GermlineIGHV[2]), | |
39 | names(GermlineIGHV[1])) | |
41 | 40 | |
42 | 41 | # Compare each sequence to its assigned germline(s) to determine the distance |
43 | #getMutCount(sample_seqs, sample_alleles, germline_ighv) | |
42 | getMutCount(sample_seqs, sample_alleles, GermlineIGHV) | |
44 | 43 | |
45 | 44 | } |
32 | 32 | } |
33 | 33 | \examples{ |
34 | 34 | # Create strings to act as a sample sequences and a reference sequence |
35 | seqs = c("----GATA","GAGAGAGA","TANA") | |
36 | ref = "GATAGATA" | |
35 | seqs <- c("----GATA", "GAGAGAGA", "TANA") | |
36 | ref <- "GATAGATA" | |
37 | 37 | |
38 | 38 | # Find the differences between the two |
39 | 39 | getMutatedPositions(seqs, ref) |
1 | 1 | % Please edit documentation in R/functions.R |
2 | 2 | \name{getPopularMutationCount} |
3 | 3 | \alias{getPopularMutationCount} |
4 | \title{Find Frequent Sequences' Mutation Counts} | |
4 | \title{Find mutation counts for frequent sequences} | |
5 | 5 | \usage{ |
6 | getPopularMutationCount(sample_db, germline_db, gene_min = 0.001, | |
6 | getPopularMutationCount(data, germline_db, gene_min = 0.001, | |
7 | 7 | seq_min = 50, seq_p_of_max = 1/8, full_return = FALSE) |
8 | 8 | } |
9 | 9 | \arguments{ |
10 | \item{sample_db}{A Change-O db data frame. See | |
10 | \item{data}{a \code{data.frame} in the Change-O format. See | |
11 | 11 | \link{findNovelAlleles} for a list of required |
12 | 12 | columns.} |
13 | 13 | |
22 | 22 | \item{seq_p_of_max}{For each gene, fraction of the most common V sequence's |
23 | 23 | count that a sequence must meet to avoid exclusion.} |
24 | 24 | |
25 | \item{full_return}{If true, will return all \code{sample_db} columns and | |
25 | \item{full_return}{If \code{TRUE}, will return all \code{data} columns and | |
26 | 26 | will include sequences with mutation count < 1.} |
27 | 27 | } |
28 | 28 | \value{ |
34 | 34 | for each V gene and returns the mutation count of those sequences. |
35 | 35 | } |
36 | 36 | \examples{ |
37 | data(sample_db, germline_ighv) | |
38 | getPopularMutationCount(sample_db, germline_ighv) | |
37 | getPopularMutationCount(SampleDb, GermlineIGHV) | |
39 | 38 | |
40 | 39 | } |
41 | 40 | \seealso{ |
1 | 1 | % Please edit documentation in R/functions.R |
2 | 2 | \name{inferGenotype} |
3 | 3 | \alias{inferGenotype} |
4 | \title{Infer a subject-specific genotype} | |
4 | \title{Infer a subject-specific genotype using a frequency method} | |
5 | 5 | \usage{ |
6 | inferGenotype(clip_db, v_call = "V_CALL", fraction_to_explain = 0.875, | |
7 | gene_cutoff = 1e-04, find_unmutated = TRUE, germline_db = NA, | |
8 | novel_df = NA) | |
6 | inferGenotype(data, germline_db = NA, novel = NA, v_call = "V_CALL", | |
7 | fraction_to_explain = 0.875, gene_cutoff = 1e-04, | |
8 | find_unmutated = TRUE) | |
9 | 9 | } |
10 | 10 | \arguments{ |
11 | \item{clip_db}{a \code{data.frame} containing V allele | |
11 | \item{data}{a \code{data.frame} containing V allele | |
12 | 12 | calls from a single subject. If |
13 | 13 | \code{find_unmutated} is \code{TRUE}, then |
14 | 14 | the sample IMGT-gapped V(D)J sequence should be provided in a column \code{"SEQUENCE_IMGT"}.} |
15 | ||
16 | \item{v_call}{column in \code{clip_db} with V allele calls. | |
17 | Default is \code{"V_CALL"} | |
18 | be provided in a column \code{"SEQUENCE_IMGT"}} | |
19 | ||
20 | \item{fraction_to_explain}{the portion of each gene that must be | |
21 | explained by the alleles that will be included | |
22 | in the genotype} | |
23 | ||
24 | \item{gene_cutoff}{either a number of sequences or a fraction of | |
25 | the length of \code{allele_calls} denoting the | |
26 | minimum number of times a gene must be | |
27 | observed in \code{allele_calls} to be included | |
28 | in the genotype} | |
29 | ||
30 | \item{find_unmutated}{if \code{TRUE}, use \code{germline_db} to | |
31 | find which samples are unmutated. Not needed | |
32 | if \code{allele_calls} only represent | |
33 | unmutated samples.} | |
34 | 15 | |
35 | 16 | \item{germline_db}{named vector of sequences containing the |
36 | 17 | germline sequences named in |
37 | 18 | \code{allele_calls}. Only required if |
38 | 19 | \code{find_unmutated} is \code{TRUE}.} |
39 | 20 | |
40 | \item{novel_df}{an optional \code{data.frame} of the type | |
21 | \item{novel}{an optional \code{data.frame} of the type | |
41 | 22 | novel returned by |
42 | 23 | \link{findNovelAlleles} containing |
43 | 24 | germline sequences that will be utilized if |
44 | 25 | \code{find_unmutated} is \code{TRUE}. See |
45 | details.} | |
26 | Details.} | |
27 | ||
28 | \item{v_call}{column in \code{data} with V allele calls. | |
29 | Default is \code{"V_CALL"}. | |
30 | be provided in a column \code{"SEQUENCE_IMGT"}} | |
31 | ||
32 | \item{fraction_to_explain}{the portion of each gene that must be | |
33 | explained by the alleles that will be included | |
34 | in the genotype.} | |
35 | ||
36 | \item{gene_cutoff}{either a number of sequences or a fraction of | |
37 | the length of \code{allele_calls} denoting the | |
38 | minimum number of times a gene must be | |
39 | observed in \code{allele_calls} to be included | |
40 | in the genotype.} | |
41 | ||
42 | \item{find_unmutated}{if \code{TRUE}, use \code{germline_db} to | |
43 | find which samples are unmutated. Not needed | |
44 | if \code{allele_calls} only represent | |
45 | unmutated samples.} | |
46 | 46 | } |
47 | 47 | \value{ |
48 | A table of alleles denoting the genotype of the subject | |
48 | A \code{data.frame} of alleles denoting the genotype of the subject containing | |
49 | the following columns: | |
50 | ||
51 | \itemize{ | |
52 | \item \code{GENE}: The gene name without allele. | |
53 | \item \code{ALLELES}: Comma separated list of alleles for the given \code{GENE}. | |
54 | \item \code{COUNTS}: Comma separated list of observed sequences for each | |
55 | corresponding allele in the \code{ALLELES} list. | |
56 | \item \code{TOTAL}: The total count of observed sequences for the given \code{GENE}. | |
57 | \item \code{NOTE}: Any comments on the inference. | |
58 | } | |
49 | 59 | } |
50 | 60 | \description{ |
51 | \code{inferGenotype} infers an subject's genotype by finding the minimum | |
52 | number set of alleles that can explain the majority of each gene's calls. The | |
53 | most common allele of each gene is included in the genotype first, and the | |
54 | next most common allele is added until the desired fraction of alleles can be | |
55 | explained. In this way, mistaken allele calls (resulting from sequences which | |
61 | \code{inferGenotype} infers a subject's genotype using a frequency method. | |
62 | The genotype is inferred by finding the minimum set of alleles that | |
63 | can explain the majority of each gene's calls. The most common allele of | |
64 | each gene is included in the genotype first, and the next most common allele | |
65 | is added until the desired fraction of alleles can be explained. In this | |
66 | way, mistaken allele calls (resulting from sequences which | |
56 | 67 | by chance have been mutated to look like another allele) can be removed. |
57 | 68 | } |
58 | 69 | \details{ |
59 | 70 | Allele calls representing cases where multiple alleles have been |
60 | assigned to a single sample sequence are rare among unmutated | |
61 | sequences but may result if nucleotides for certain positions are | |
62 | not available. Calls containing multiple alleles are treated as | |
63 | belonging to all groups. If \code{novel_df} is provided, all | |
64 | sequences that are assigned to the same starting allele as any | |
65 | novel germline allele will have the novel germline allele appended | |
66 | to their assignent prior to searching for unmutated sequences. | |
71 | assigned to a single sample sequence are rare among unmutated | |
72 | sequences but may result if nucleotides for certain positions are | |
73 | not available. Calls containing multiple alleles are treated as | |
74 | belonging to all groups. If \code{novel} is provided, all | |
75 | sequences that are assigned to the same starting allele as any | |
76 | novel germline allele will have the novel germline allele appended | |
77 | to their assignment prior to searching for unmutated sequences. | |
67 | 78 | } |
68 | 79 | \note{ |
69 | 80 | This method works best with data derived from blood, where a large |
70 | portion of sequences are expected to be unmutated. Ideally, there | |
71 | should be hundreds of allele calls per gene in the input. | |
81 | portion of sequences are expected to be unmutated. Ideally, there | |
82 | should be hundreds of allele calls per gene in the input. | |
72 | 83 | } |
73 | 84 | \examples{ |
74 | # Infer the IGHV genotype, using only unmutated sequences, including any | |
75 | # novel alleles | |
76 | data(sample_db) | |
77 | data(germline_ighv) | |
78 | data(novel_df) | |
79 | inferGenotype(sample_db, find_unmutated = TRUE, germline_db = germline_ighv, | |
80 | novel_df = novel_df) | |
85 | # Infer IGHV genotype, using only unmutated sequences, including novel alleles | |
86 | inferGenotype(SampleDb, germline_db=GermlineIGHV, novel=SampleNovel, | |
87 | find_unmutated=TRUE) | |
81 | 88 | |
82 | 89 | } |
83 | 90 | \seealso{ |
84 | 91 | \link{plotGenotype} for a colorful visualization and |
85 | 92 | \link{genotypeFasta} to convert the genotype to nucleotide sequences. |
93 | See \link{inferGenotypeBayesian} to infer a subject-specific genotype | |
94 | using a Bayesian approach. | |
86 | 95 | } |
0 | % Generated by roxygen2: do not edit by hand | |
1 | % Please edit documentation in R/bayesian.R | |
2 | \name{inferGenotypeBayesian} | |
3 | \alias{inferGenotypeBayesian} | |
4 | \title{Infer a subject-specific genotype using a Bayesian approach} | |
5 | \usage{ | |
6 | inferGenotypeBayesian(data, germline_db = NA, novel = NA, | |
7 | v_call = "V_CALL", find_unmutated = TRUE, priors = c(0.6, 0.4, 0.4, | |
8 | 0.35, 0.25, 0.25, 0.25, 0.25, 0.25)) | |
9 | } | |
10 | \arguments{ | |
11 | \item{data}{a \code{data.frame} containing V allele | |
12 | calls from a single subject. If \code{find_unmutated} | |
13 | is \code{TRUE}, then the sample IMGT-gapped V(D)J sequence | |
14 | should be provided in a column \code{"SEQUENCE_IMGT"}} | |
15 | ||
16 | \item{germline_db}{named vector of sequences containing the | |
17 | germline sequences named in \code{allele_calls}. | |
18 | Only required if \code{find_unmutated} is \code{TRUE}.} | |
19 | ||
20 | \item{novel}{an optional \code{data.frame} of the type | |
21 | novel returned by \link{findNovelAlleles} containing | |
22 | germline sequences that will be utilized if | |
23 | \code{find_unmutated} is \code{TRUE}. See Details.} | |
24 | ||
25 | \item{v_call}{column in \code{data} with V allele calls. | |
26 | Default is \code{"V_CALL"}.} | |
27 | ||
28 | \item{find_unmutated}{if \code{TRUE}, use \code{germline_db} to | |
29 | find which samples are unmutated. Not needed | |
30 | if \code{allele_calls} only represent | |
31 | unmutated samples.} | |
32 | ||
33 | \item{priors}{a numeric vector of priors for the multinomial distribution. | |
34 | The \code{priors} vector must be nine values that define | |
35 | the priors for the heterozygous (two allele), | |
36 | trizygous (three allele), and quadrozygous (four allele) | |
37 | distributions. The first two values of \code{priors} define | |
38 | the prior for the heterozygous case, the next three values are for | |
39 | the trizygous case, and the final four values are for the | |
40 | quadrozygous case. Each set of priors should sum to one. | |
41 | Note, each distribution prior is actually defined internally | |
42 | by a set of four numbers, with the unspecified final values | |
43 | assigned to \code{0}; e.g., the heterozygous case is | |
44 | \code{c(priors[1], priors[2], 0, 0)}. The prior for the | |
45 | homozygous distribution is fixed at \code{c(1, 0, 0, 0)}.} | |
46 | } | |
47 | \value{ | |
48 | A \code{data.frame} of alleles denoting the genotype of the subject with the log10 | |
49 | of the likelihood of each model and the log10 of the Bayes factor. The output | |
50 | contains the following columns: | |
51 | ||
52 | \itemize{ | |
53 | \item \code{GENE}: The gene name without allele. | |
54 | \item \code{ALLELES}: Comma separated list of alleles for the given \code{GENE}. | |
55 | \item \code{COUNTS}: Comma separated list of observed sequences for each | |
56 | corresponding allele in the \code{ALLELES} list. | |
57 | \item \code{TOTAL}: The total count of observed sequences for the given \code{GENE}. | |
58 | \item \code{NOTE}: Any comments on the inference. | |
59 | \item \code{KH}: log10 likelihood that the \code{GENE} is homozygous. | |
60 | \item \code{KD}: log10 likelihood that the \code{GENE} is heterozygous. | |
61 | \item \code{KT}: log10 likelihood that the \code{GENE} is trizygous. | |
62 | \item \code{KQ}: log10 likelihood that the \code{GENE} is quadrozygous. | |
63 | \item \code{K_DIFF}: log10 ratio of the highest to second-highest zygosity likelihoods. | |
64 | } | |
65 | } | |
66 | \description{ | |
67 | \code{inferGenotypeBayesian} infers a subject's genotype by applying a Bayesian framework | |
68 | with a Dirichlet prior for the multinomial distribution. Up to four distinct alleles are | |
69 | allowed in an individual’s genotype. Four likelihood distributions were generated by | |
70 | empirically fitting three high coverage genotypes from three individuals | |
71 | (Laserson and Vigneault et al, 2014). A posterior probability is calculated for the | |
72 | four most common alleles. The certainty of the highest probability model was | |
73 | calculated using a Bayes factor (the most likely model divided by second-most likely model). | |
74 | The larger the Bayes factor (K), the greater the certainty in the model. | |
75 | } | |
76 | \details{ | |
77 | Allele calls representing cases where multiple alleles have been | |
78 | assigned to a single sample sequence are rare among unmutated | |
79 | sequences but may result if nucleotides for certain positions are | |
80 | not available. Calls containing multiple alleles are treated as | |
81 | belonging to all groups. If \code{novel} is provided, all | |
82 | sequences that are assigned to the same starting allele as any | |
83 | novel germline allele will have the novel germline allele appended | |
84 | to their assignment prior to searching for unmutated sequences. | |
85 | } | |
86 | \note{ | |
87 | This method works best with data derived from blood, where a large | |
88 | portion of sequences are expected to be unmutated. Ideally, there | |
89 | should be hundreds of allele calls per gene in the input. | |
90 | } | |
91 | \examples{ | |
92 | # Infer IGHV genotype, using only unmutated sequences, including novel alleles | |
93 | inferGenotypeBayesian(SampleDb, germline_db=GermlineIGHV, novel=SampleNovel, | |
94 | find_unmutated=TRUE) | |
95 | ||
96 | } | |
97 | \references{ | |
98 | \enumerate{ | |
99 | \item Laserson U and Vigneault F, et al. High-resolution antibody dynamics of | |
100 | vaccine-induced immune responses. PNAS. 2014 111(13):4928-33. | |
101 | } | |
102 | } | |
103 | \seealso{ | |
104 | \link{plotGenotype} for a colorful visualization and | |
105 | \link{genotypeFasta} to convert the genotype to nucleotide sequences. | |
106 | See \link{inferGenotype} to infer a subject-specific genotype using | |
107 | a frequency method. | |
108 | } |
6 | 6 | insertPolymorphisms(sequence, positions, nucleotides) |
7 | 7 | } |
8 | 8 | \arguments{ |
9 | \item{sequence}{the starting nucletide sequence} | |
9 | \item{sequence}{starting nucleotide sequence.} | |
10 | 10 | |
11 | \item{positions}{a vector of positions which to be changed} | |
11 | \item{positions}{numeric vector of positions to be changed.} | |
12 | 12 | |
13 | \item{nucleotides}{a vector of nucletides to which to change the | |
14 | positions} | |
13 | \item{nucleotides}{character vector of nucleotides to which to change the | |
14 | positions.} | |
15 | 15 | } |
16 | 16 | \value{ |
17 | a sequence with the desired nucleotides in provided locations | |
17 | A sequence with the desired nucleotides in the provided locations. | |
18 | 18 | } |
19 | 19 | \description{ |
20 | 20 | \code{insertPolymorphisms} replaces nucleotides in the desired locations of a |
21 | 21 | provided sequence. |
22 | 22 | } |
23 | 23 | \examples{ |
24 | insertPolymorphisms("hugged", c(1,6,2), c("t","r","i")) | |
24 | insertPolymorphisms("HUGGED", c(1, 6, 2), c("T", "R", "I")) | |
25 | 25 | |
26 | 26 | } |
0 | % Generated by roxygen2: do not edit by hand | |
1 | % Please edit documentation in R/data.R | |
2 | \docType{data} | |
3 | \name{novel_df} | |
4 | \alias{novel_df} | |
5 | \title{Example of Analyzed Rep-Seq data} | |
6 | \format{A \code{data.frame} where rows correspond to alleles checked for | |
7 | polymorphisms and columns give results as well as paramaters used to run | |
8 | the test.} | |
9 | \description{ | |
10 | Example VDJ-rearranged immunoglobulin Rep-Seq sequences derived from a single | |
11 | individual (PGP1), sequenced on the Roche 454 platform, and thought by | |
12 | IMGT/V-QUEST to utilize IGHV1 family alleles, as processed by | |
13 | \link{findNovelAlleles}. | |
14 | } | |
15 | \references{ | |
16 | Gadala-Maria \emph{et al}. (2015) Automated analysis of | |
17 | high-throughput B cell sequencing data reveals a high frequency of novel | |
18 | immunoglobulin V gene segment alleles. \emph{PNAS}. 112(8):E862-70. | |
19 | } | |
20 | \keyword{data} |
3 | 3 | \alias{plotGenotype} |
4 | 4 | \title{Show a colorful representation of a genotype} |
5 | 5 | \usage{ |
6 | plotGenotype(genotype, facet_by = NULL, gene_sort = c("name", "position"), | |
7 | text_size = 12, silent = FALSE, ...) | |
6 | plotGenotype(genotype, facet_by = NULL, gene_sort = c("name", | |
7 | "position"), text_size = 12, silent = FALSE, ...) | |
8 | 8 | } |
9 | 9 | \arguments{ |
10 | \item{genotype}{a table of alleles denoting a genotype, as returned by | |
11 | \link{inferGenotype}} | |
10 | \item{genotype}{a \code{data.frame} of alleles denoting a genotype, | |
11 | as returned by \link{inferGenotype}.} | |
12 | 12 | |
13 | 13 | \item{facet_by}{a column name in \code{genotype} to facet the plot by. |
14 | 14 | If \code{NULL}, then do not facet the plot.} |
18 | 18 | \code{"position"} then sort by position in the locus, as |
19 | 19 | determined by the final two numbers in the gene name.} |
20 | 20 | |
21 | \item{text_size}{the point size of the plotted text} | |
21 | \item{text_size}{the point size of the plotted text.} | |
22 | 22 | |
23 | 23 | \item{silent}{if \code{TRUE} do not draw the plot and just return the ggplot |
24 | 24 | object; if \code{FALSE} draw the plot.} |
32 | 32 | \code{plotGenotype} plots a genotype table. |
33 | 33 | } |
34 | 34 | \examples{ |
35 | # Load example data | |
36 | data(novel_df) | |
37 | data(genotype) | |
38 | ||
39 | 35 | # Plot genotype |
40 | plotGenotype(genotype) | |
36 | plotGenotype(SampleGenotype) | |
41 | 37 | |
42 | 38 | # Facet by subject |
43 | genotypea = genotypeb = genotype | |
44 | genotypea$SUBJECT = "A" | |
45 | genotypeb$SUBJECT = "B" | |
46 | geno_sub = rbind(genotypea, genotypeb) | |
39 | genotype_a <- genotype_b <- SampleGenotype | |
40 | genotype_a$SUBJECT <- "A" | |
41 | genotype_b$SUBJECT <- "B" | |
42 | geno_sub <- rbind(genotype_a, genotype_b) | |
47 | 43 | plotGenotype(geno_sub, facet_by="SUBJECT", gene_sort="pos") |
48 | 44 | |
49 | 45 | } |
3 | 3 | \alias{plotNovel} |
4 | 4 | \title{Visualize evidence of novel V alleles} |
5 | 5 | \usage{ |
6 | plotNovel(clip_db, novel_df_row, ncol = 1, v_call = "V_CALL") | |
6 | plotNovel(data, novel_row, v_call = "V_CALL", ncol = 1) | |
7 | 7 | } |
8 | 8 | \arguments{ |
9 | \item{clip_db}{a \code{data.frame} in Change-O format. See | |
9 | \item{data}{a \code{data.frame} in Change-O format. See | |
10 | 10 | \link{findNovelAlleles} for details.} |
11 | 11 | |
12 | \item{novel_df_row}{a single row from a data frame as output by | |
12 | \item{novel_row}{a single row from a data frame as output by | |
13 | 13 | \link{findNovelAlleles} that contains a |
14 | 14 | polymorphism-containing germline allele} |
15 | 15 | |
16 | \item{v_call}{name of the column in \code{data} with V allele | |
17 | calls. Default is "V_CALL".} | |
18 | ||
16 | 19 | \item{ncol}{number of columns to use when laying out the plots} |
17 | ||
18 | \item{v_call}{name of the column in \code{clip_db} with V allele | |
19 | calls. Default is "V_CALL"} | |
20 | 20 | } |
21 | 21 | \description{ |
22 | 22 | \code{plotNovel} can be used to visualize the evidence of any novel V |
23 | alleles found using \link{findNovelAlleles}. | |
23 | alleles found using \link{findNovelAlleles}. It can also be used to | |
24 | visualize the results for alleles that did | |
25 | } | |
26 | \details{ | |
27 | The first panel in the plot shows, for all sequences which align to a particular | |
28 | germline allele, the mutation frequency at each position along the aligned | |
29 | sequence as a function of the sequence-wide mutation. Sequences that pass | |
30 | the novel allele test are colored red, while sequences that don't pass | |
31 | the test are colored yellow. The second panel shows the nucleotide usage at the | |
32 | positions as a function of sequence-wide mutation count. | |
33 | ||
34 | To avoid cases where a clonal expansion might lead to a false positive, tigger examines | |
35 | the combinations of J gene and junction length among sequences which perfectly | |
36 | match the proposed germline allele. | |
24 | 37 | } |
25 | 38 | \examples{ |
26 | # Load example data and germlines | |
27 | data(sample_db) | |
28 | data(germline_ighv) | |
29 | ||
30 | # Find novel alleles and return relevant data | |
31 | \dontrun{novel_df = findNovelAlleles(sample_db, germline_ighv)} | |
32 | data(novel_df) | |
33 | 39 | # Plot the evidence for the first (and only) novel allele in the example data |
34 | novel = selectNovel(novel_df) | |
35 | plotNovel(sample_db, novel[1,]) | |
40 | novel <- selectNovel(SampleNovel) | |
41 | plotNovel(SampleDb, novel[1, ]) | |
36 | 42 | |
37 | 43 | } |
6 | 6 | readIgFasta(fasta_file, strip_down_name = TRUE, force_caps = TRUE) |
7 | 7 | } |
8 | 8 | \arguments{ |
9 | \item{fasta_file}{fasta-formatted file of immunoglobuling sequences} | |
9 | \item{fasta_file}{fasta-formatted file of immunoglobulin sequences.} | |
10 | 10 | |
11 | 11 | \item{strip_down_name}{if \code{TRUE}, will extract only the allele name |
12 | from the strings fasta file's sequence names} | |
12 | from the strings fasta file's sequence names.} | |
13 | 13 | |
14 | 14 | \item{force_caps}{if \code{TRUE}, will force nucleotides to |
15 | uppercase} | |
15 | uppercase.} | |
16 | 16 | } |
17 | 17 | \value{ |
18 | a named vector of strings respresenting Ig alleles | |
18 | Named vector of strings representing Ig alleles. | |
19 | 19 | } |
20 | 20 | \description{ |
21 | 21 | \code{readIgFasta} reads a fasta-formatted file of immunoglobulin (Ig) |
3 | 3 | \alias{reassignAlleles} |
4 | 4 | \title{Correct allele calls based on a personalized genotype} |
5 | 5 | \usage{ |
6 | reassignAlleles(clip_db, genotype_db, v_call = "V_CALL", method = "hamming", | |
7 | path = NA, keep_gene = TRUE) | |
6 | reassignAlleles(data, genotype_db, v_call = "V_CALL", | |
7 | method = "hamming", path = NA, keep_gene = c("gene", "family", | |
8 | "repertoire")) | |
8 | 9 | } |
9 | 10 | \arguments{ |
10 | \item{clip_db}{a \code{data.frame} containing V allele calls from a | |
11 | single subject and the sample | |
12 | IMGT-gapped V(D)J sequences under | |
13 | \code{"SEQUENCE_IMGT"}} | |
11 | \item{data}{a \code{data.frame} containing V allele calls from a | |
12 | single subject and the sample IMGT-gapped V(D)J sequences under | |
13 | \code{"SEQUENCE_IMGT"}.} | |
14 | 14 | |
15 | 15 | \item{genotype_db}{a vector of named nucleotide germline sequences |
16 | 16 | matching the calls detailed in \code{allele_calls} |
17 | 17 | and personalized to the subject} |
18 | 18 | |
19 | \item{v_call}{name of the column in \code{clip_db} with V allele | |
20 | calls. Default is \code{"V_CALL"}} | |
19 | \item{v_call}{name of the column in \code{data} with V allele | |
20 | calls. Default is \code{"V_CALL"}.} | |
21 | 21 | |
22 | 22 | \item{method}{the method to be used when realigning sequences to |
23 | the genotype_db sequences. Currently only "hammming" | |
23 | the genotype_db sequences. Currently, only \code{"hamming"} | |
24 | 24 | (for Hamming distance) is implemented.} |
25 | 25 | |
26 | 26 | \item{path}{directory containing the tool used in the |
27 | 27 | realignment method, if needed. Hamming distance does |
28 | 28 | not require a path to a tool.} |
29 | 29 | |
30 | \item{keep_gene}{logical indicating if gene assignments should be | |
31 | maintained when possible. Increases speed by | |
32 | minimizing required number of alignments. Currently | |
33 | only "TRUE" is implemented.} | |
30 | \item{keep_gene}{a string indicating if the gene (\code{"gene"}), | |
31 | family (\code{"family"}) or complete repertoire | |
32 | (\code{"repertoire"}) assignments should be performed. | |
33 | Use of \code{"gene"} increases speed by minimizing required number of | |
34 | alignments, as gene level assignments will be maintained when possible.} | |
34 | 35 | } |
35 | 36 | \value{ |
36 | a single-column \code{data.frame} corresponding to \code{clip.db} | |
37 | and containing the best allele call from among the sequences | |
38 | listed in \code{genotype_db} | |
37 | A modified input \code{data.frame} containing the best allele call from | |
38 | among the sequences listed in \code{genotype_db} in the | |
39 | \code{V_CALL_GENOTYPED} column. | |
39 | 40 | } |
40 | 41 | \description{ |
41 | 42 | \code{reassignAlleles} uses a subject-specific genotype to correct |
48 | 49 | based on a simple alignment to the sample sequence. |
49 | 50 | } |
50 | 51 | \examples{ |
51 | # Load example data | |
52 | data(germline_ighv) | |
53 | data(sample_db) | |
54 | data(genotype) | |
55 | data(novel_df) | |
56 | ||
57 | 52 | # Extract the database sequences that correspond to the genotype |
58 | genotype_seqs = genotypeFasta(genotype, germline_ighv, novel_df) | |
53 | genotype_db <- genotypeFasta(SampleGenotype, GermlineIGHV, novel=SampleNovel) | |
59 | 54 | |
60 | 55 | # Use the personalized genotype to determine corrected allele assignments |
61 | V_CALL_GENOTYPED = reassignAlleles(sample_db, genotype_seqs) | |
62 | sample_db = cbind(sample_db, V_CALL_GENOTYPED) | |
56 | output_db <- reassignAlleles(SampleDb, genotype_db) | |
63 | 57 | |
64 | 58 | } |
0 | % Generated by roxygen2: do not edit by hand | |
1 | % Please edit documentation in R/data.R | |
2 | \docType{data} | |
3 | \name{sample_db} | |
4 | \alias{sample_db} | |
5 | \title{Example human Rep-Seq data} | |
6 | \format{A \code{data.frame} where rows correspond to unique VDJ sequences and | |
7 | columns include: | |
8 | \itemize{ | |
9 | \item IMGT-gapped nucleotide sequence (\code{"SEQUENCE_IMGT"}) | |
10 | \item IMGT/V-QUEST allele calls (\code{"V_CALL"}, \code{"D_CALL"}, and | |
11 | \code{"J_CALL"}) | |
12 | \item Junction length (\code{"JUNCTION_LENGTH"}) | |
13 | }} | |
14 | \description{ | |
15 | Example VDJ-rearranged immunoglobulin Rep-Seq sequences derived from a single | |
16 | individual (PGP1), sequenced on the Roche 454 platform, and thought by | |
17 | IMGT/V-QUEST to utilize IGHV1 family alleles. | |
18 | } | |
19 | \references{ | |
20 | Gadala-Maria \emph{et al}. (2015) Automated analysis of | |
21 | high-throughput B cell sequencing data reveals a high frequency of novel | |
22 | immunoglobulin V gene segment alleles. \emph{PNAS}. 112(8):E862-70. | |
23 | } | |
24 | \keyword{data} |
3 | 3 | \alias{selectNovel} |
4 | 4 | \title{Select rows containing novel alleles} |
5 | 5 | \usage{ |
6 | selectNovel(novel_df, keep_alleles = FALSE) | |
6 | selectNovel(novel, keep_alleles = FALSE) | |
7 | 7 | } |
8 | 8 | \arguments{ |
9 | \item{novel_df}{A \code{data.frame} of the type returned by | |
10 | \link{findNovelAlleles}} | |
9 | \item{novel}{a \code{data.frame} of the type returned by | |
10 | \link{findNovelAlleles}.} | |
11 | 11 | |
12 | \item{keep_alleles}{A \code{logical} indicating if different alleles | |
12 | \item{keep_alleles}{a \code{logical} indicating if different alleles | |
13 | 13 | leading to the same novel sequence should be kept. |
14 | See details.} | |
14 | See Details.} | |
15 | 15 | } |
16 | 16 | \value{ |
17 | 17 | A \code{data.frame} containing only unique, novel alleles (if any) |
22 | 22 | selects only the rows containing unique, novel alleles. |
23 | 23 | } |
24 | 24 | \details{ |
25 | If, for instance, subject has in his genome IGHV1-2*02 and a novel | |
26 | allele equally close to IGHV1-2*02 and IGHV1-2*05, the novel allele may be | |
25 | If, for instance, subject has in his genome \code{IGHV1-2*02} and a novel | |
26 | allele equally close to \code{IGHV1-2*02} and \code{IGHV1-2*05}, the novel allele may be | |
27 | 27 | detected by analyzing sequences that best align to either of these alleles. |
28 | 28 | If \code{keep_alleles} is \code{TRUE}, both polymorphic allele calls will |
29 | 29 | be retained. In the case that multiple mutation ranges are checked for the |
30 | 30 | same allele, only one mutation range will be kept in the output. |
31 | 31 | } |
32 | 32 | \examples{ |
33 | data(novel_df) | |
34 | novel = selectNovel(novel_df) | |
33 | novel <- selectNovel(SampleNovel) | |
35 | 34 | |
36 | 35 | } |
6 | 6 | sortAlleles(allele_calls, method = c("name", "position")) |
7 | 7 | } |
8 | 8 | \arguments{ |
9 | \item{allele_calls}{a vector of strings respresenting Ig allele names} | |
9 | \item{allele_calls}{a vector of strings representing Ig allele names.} | |
10 | 10 | |
11 | 11 | \item{method}{a string defining the method to use when sorting alleles. |
12 | 12 | If \code{"name"} then sort in lexicographic order. If |
14 | 14 | determined by the final two numbers in the gene name.} |
15 | 15 | } |
16 | 16 | \value{ |
17 | A sorted vector of strings respresenting Ig allele names | |
17 | A sorted vector of strings representing Ig allele names. | |
18 | 18 | } |
19 | 19 | \description{ |
20 | 20 | \code{sortAlleles} returns a sorted vector of strings representing Ig allele |
21 | 21 | names. Names are first sorted by gene family, then by gene, then by allele. |
22 | 22 | Duplicated genes have their alleles sorted as if they were part of their |
23 | non-duplicated counterparts (e.g. IGHV1-69D*01 comes after IGHV1-69*01 but | |
24 | before IGHV1-69*02), and non-localized genes (e.g. IGHV1-NL1*01) come last | |
25 | within their gene family. | |
23 | non-duplicated counterparts (e.g. \code{IGHV1-69D*01} comes after \code{IGHV1-69*01} | |
24 | but before \code{IGHV1-69*02}), and non-localized genes (e.g. \code{IGHV1-NL1*01}) | |
25 | come last within their gene family. | |
26 | 26 | } |
27 | 27 | \examples{ |
28 | 28 | # Create a list of allele names |
29 | alleles = c("IGHV1-69D*01","IGHV1-69*01","IGHV1-2*01","IGHV1-69-2*01", | |
30 | "IGHV2-5*01","IGHV1-NL1*01", "IGHV1-2*01,IGHV1-2*05", | |
31 | "IGHV1-2", "IGHV1-2*02", "IGHV1-69*02") | |
29 | alleles <- c("IGHV1-69D*01","IGHV1-69*01","IGHV1-2*01","IGHV1-69-2*01", | |
30 | "IGHV2-5*01","IGHV1-NL1*01", "IGHV1-2*01,IGHV1-2*05", | |
31 | "IGHV1-2", "IGHV1-2*02", "IGHV1-69*02") | |
32 | 32 | |
33 | 33 | # Sort the alleles by name |
34 | 34 | sortAlleles(alleles) |
6 | 6 | \title{tigger} |
7 | 7 | \description{ |
8 | 8 | Here we provide a \strong{T}ool for \strong{I}mmuno\strong{g}lobulin |
9 | \strong{G}enotype \strong{E}lucidation via | |
10 | \strong{R}ep-Seq (TIgGER). TIgGER inferrs the set of Ig alleles carried by an | |
9 | \strong{G}enotype \strong{E}lucidation via \strong{R}ep-Seq (TIgGER). | |
10 | TIgGER infers the set of Ig alleles carried by an | |
11 | 11 | individual (including any novel alleles) and then uses this set of alleles to |
12 | 12 | correct the initial assignments given to sample sequences by existing tools. |
13 | 13 | } |
14 | 14 | \details{ |
15 | Immunoglobulin Repertoire-Sequencing (Rep-Seq) data is currently the | |
15 | Immunoglobulin repertoire sequencing (AIRR-Seq, Rep-Seq) data is currently the | |
16 | 16 | subject of much study. A key step in analyzing these data involves assigning |
17 | 17 | the closest known V(D)J germline alleles to the (often somatically mutated) |
18 | 18 | sample sequences using a tool such as IMGT/HighV-QUEST. However, if the |
19 | 19 | sample utilizes alleles not in the germline database used for alignment, this |
20 | 20 | step will fail. Additionally, this alignment has an associated error rate of |
21 | ~5 percent, notably among sequences carrying a large number of somatic | |
21 | ~5%, notably among sequences carrying a large number of somatic | |
22 | 22 | mutations. The purpose of TIgGER is to address these issues. |
23 | 23 | } |
24 | \section{Core tigger functions}{ | |
24 | \section{Allele detection and genotyping}{ | |
25 | 25 | |
26 | 26 | \itemize{ |
27 | \item \link{findNovelAlleles}: Detect novel alleles | |
28 | \item \link{plotNovel}: Plot evidence of novel alleles | |
29 | \item \link{inferGenotype}: Infer an Ig genotype | |
30 | \item \link{plotGenotype}: A colorful genotype visualization | |
31 | \item \link{genotypeFasta}: Convert a genotype to sequences | |
32 | \item \link{reassignAlleles}: Correct allele calls | |
27 | \item \link{findNovelAlleles}: Detect novel alleles. | |
28 | \item \link{plotNovel}: Plot evidence of novel alleles. | |
29 | \item \link{inferGenotype}: Infer an Ig genotype using a frequency approach. | |
30 | \item \link{inferGenotypeBayesian}: Infer an Ig genotype using a Bayesian approach. | |
31 | \item \link{plotGenotype}: A colorful genotype visualization. | |
32 | \item \link{genotypeFasta}: Convert a genotype to sequences. | |
33 | \item \link{reassignAlleles}: Correct allele calls. | |
34 | \item \link{generateEvidence}: Generate evidence for the genotype and | |
35 | allele detection inference. | |
33 | 36 | } |
34 | 37 | } |
35 | 38 | |
36 | \section{Mutation-related functions}{ | |
39 | \section{Mutation handling}{ | |
37 | 40 | |
38 | 41 | \itemize{ |
39 | \item \link{getMutatedPositions}: Find mutation locations | |
40 | \item \link{getMutCount}: Find distance from germline | |
41 | \item \link{findUnmutatedCalls}: Subset unmutated sequences | |
42 | \item \link{getMutatedPositions}: Find mutation locations. | |
43 | \item \link{getMutCount}: Find distance from germline. | |
44 | \item \link{findUnmutatedCalls}: Subset unmutated sequences. | |
42 | 45 | \item \link{getPopularMutationCount}: Find most common sequence's |
43 | mutation count | |
44 | \item \link{insertPolymorphisms}: Insert SNPs into a sequence | |
46 | mutation count. | |
47 | \item \link{insertPolymorphisms}: Insert SNPs into a sequence. | |
45 | 48 | } |
46 | 49 | } |
47 | 50 | |
48 | \section{Input and formatting}{ | |
51 | \section{Input, output and formatting}{ | |
49 | 52 | |
50 | 53 | \itemize{ |
51 | \item \link{readIgFasta}: Read a fasta file of Ig sequences | |
52 | \item \link{updateAlleleNames}: Correct outdated allele names | |
53 | \item \link{sortAlleles}: Sort allele names intelligently | |
54 | \item \link{cleanSeqs}: Standardize sequence format | |
54 | \item \link{readIgFasta}: Read a fasta file of Ig sequences. | |
55 | \item \link{updateAlleleNames}: Correct outdated allele names. | |
56 | \item \link{sortAlleles}: Sort allele names intelligently. | |
57 | \item \link{cleanSeqs}: Standardize sequence format. | |
55 | 58 | } |
56 | 59 | } |
57 | 60 | |
58 | 61 | \references{ |
59 | Gadala-Maria \emph{et al}. (2015) Automated analysis of | |
60 | high-throughput B cell sequencing data reveals a high frequency of novel | |
61 | immunoglobulin V gene segment alleles. \emph{PNAS}. 112(8):E862-70. | |
62 | \enumerate{ | |
63 | \item Gadala-Maria, et al. (2015) Automated analysis of high-throughput B cell | |
64 | sequencing data reveals a high frequency of novel immunoglobulin V gene | |
65 | segment alleles. PNAS. 112(8):E862-70. | |
62 | 66 | } |
67 | } |
6 | 6 | updateAlleleNames(allele_calls) |
7 | 7 | } |
8 | 8 | \arguments{ |
9 | \item{allele_calls}{a vector of strings respresenting IGHV allele names} | |
9 | \item{allele_calls}{a vector of strings representing IGHV allele names.} | |
10 | 10 | } |
11 | 11 | \value{ |
12 | vector of strings respresenting updated IGHV allele names | |
12 | Vector of strings representing updated IGHV allele names. | |
13 | 13 | } |
14 | 14 | \description{ |
15 | 15 | \code{updateAlleleNames} takes a set of IGHV allele calls and replaces any |
16 | 16 | outdated names (e.g. IGHV1-f) with the new IMGT names. |
17 | 17 | } |
18 | \details{ | |
19 | The updated allele names are based on IMGT release 201408-4. | |
20 | } | |
21 | 18 | \note{ |
22 | IGMT has removed IGHV2-5*10 and IGHV2-5*07 as it has determined they | |
23 | are actually alleles *02 and *04, respectively. | |
19 | IMGT has removed \code{IGHV2-5*10} and \code{IGHV2-5*07} as it has determined they | |
20 | are actually alleles \code{02} and \code{04}, respectively. The updated allele | |
21 | names are based on IMGT release 201408-4. | |
24 | 22 | } |
25 | 23 | \examples{ |
26 | 24 | # Create a vector that uses old gene/allele names. |
27 | alleles = c("IGHV1-c*01", "IGHV1-f*02", "IGHV2-5*07") | |
25 | alleles <- c("IGHV1-c*01", "IGHV1-f*02", "IGHV2-5*07") | |
28 | 26 | |
29 | 27 | # Update the alleles to the new names |
30 | 28 | updateAlleleNames(alleles) |
31 | 29 | |
32 | 30 | } |
33 | 31 | \references{ |
34 | Xochelli et al. (2014) Immunoglobulin heavy variable (IGHV) genes | |
35 | and alleles: new entities, new names and implications for research and | |
36 | prognostication in chronic lymphocytic leukaemia. Immunogenetics. 67(1):61-6 | |
32 | \enumerate{ | |
33 | \item Xochelli et al. (2014) Immunoglobulin heavy variable (IGHV) genes | |
34 | and alleles: new entities, new names and implications for research and | |
35 | prognostication in chronic lymphocytic leukaemia. Immunogenetics. 67(1):61-6 | |
36 | } | |
37 | 37 | } |
38 | 38 | \seealso{ |
39 | 39 | Like \code{updateAlleleNames}, \link{sortAlleles} can help |
8 | 8 | \arguments{ |
9 | 9 | \item{named_sequences}{a vector of named string representing sequences} |
10 | 10 | |
11 | \item{file}{the name of the output file} | |
11 | \item{file}{the name of the output file.} | |
12 | 12 | |
13 | 13 | \item{width}{the number of characters to be printed per line. |
14 | If not between 1 and 255, width with be infinite.} | |
14 | If not between 1 and 255, width will be infinite.} | |
15 | 15 | |
16 | 16 | \item{append}{\code{logical} indicating if the output should be |
17 | 17 | appended to \code{file} instead of overwriting it} |
18 | 18 | } |
19 | 19 | \value{ |
20 | a named vector of strings respresenting Ig alleles | |
20 | A named vector of strings representing Ig alleles. | |
21 | 21 | } |
22 | 22 | \description{ |
23 | 23 | \code{writeFasta} writes a named vector of sequences to a file in fasta |
29 | 29 | |
30 | 30 | ## Introduction |
31 | 31 | |
32 | Immunoglobulin Repertoire-Sequencing (Rep-Seq) data is currently the subject of | |
33 | much study. A key step in analyzing these data involves assigning the closest | |
34 | known V(D)J germline alleles to the (often somatically mutated) sample sequences | |
35 | using a tool such as IMGT/HighV-QUEST ([[1]][1]). However, if the sample utilizes | |
36 | alleles not in the germline database used for alignment, this step will fail. | |
37 | Additionally, this alignment has an associated error rate of ~5% ([[2]][2]), | |
38 | notably among sequences carrying a large number of somatic mutations. | |
32 | Adaptive immune receptor repertoire sequencing (AIRR-Seq, Rep-Seq) data is | |
33 | currently the subject of much study. A key step in analyzing these data involves | |
34 | assigning the closest known V(D)J germline alleles to the (often somatically mutated) | |
35 | sample sequences using a tool such as IMGT/HighV-QUEST ([[1]][1]). However, | |
36 | if the sample utilizes alleles not in the germline database used for alignment, | |
37 | this step will fail. Additionally, this alignment has an associated error rate | |
38 | of ~5% ([[2]][2]), notably among sequences carrying a large number of somatic | |
39 | mutations. | |
39 | 40 | |
40 | 41 | Here we provide a **T**ool for **I**mmuno**g**lobulin **G**enotype |
41 | 42 | **E**lucidation via **R**ep-Seq (TIgGER). TIgGER addresses these issues by |
42 | inferring the set of Ig alleles carried by an individual (including any novel | |
43 | alleles) and then using this set of alleles to correct the initial assignments | |
44 | given to sample sequences by existing tools. | |
45 | ||
46 | Additional information is available in: | |
43 | inferring the set of Immunoglobulin (Ig) alleles carried by an individual | |
44 | (including any novel alleles) and then using this set of alleles to correct | |
45 | the initial assignments given to sample sequences by existing tools. | |
46 | ||
47 | This vignette covers the following tasks: | |
48 | ||
49 | 1. Inferring the presence of novel IGHV alleles not in the germline database. | |
50 | 2. Inferring the personalized IGHV genotype of a sample. | |
51 | 3. Correcting the IGHV allele calls of a sample based on the IGHV genotype. | |
52 | ||
53 | Additional information about the methods used by TIgGER is available in: | |
47 | 54 | |
48 | 55 | [Gadala-Maria D, Yaari G, Uduman M, Kleinstein SH (2015) Automated analysis of |
49 | 56 | high-throughput B cell sequencing data reveals a high frequency of novel |
50 | 57 | immunoglobulin V gene segment alleles. *PNAS* |
51 | 58 | 112(8):E862-70](http://www.pnas.org/content/early/2015/02/05/1417683112). |
52 | 59 | |
53 | ||
54 | 60 | ## Input |
55 | 61 | |
56 | 62 | TIgGER requires two main inputs: |
57 | 63 | |
58 | 1. Pre-processed Rep-Seq data | |
64 | 1. Pre-processed Ig sequence data | |
59 | 65 | 2. Database germline sequences |
60 | 66 | |
61 | Rep-seq data is input as a data frame where each row represents a unique | |
62 | observation and and columns represent data about that observation. The required | |
63 | names of the required columns are provided below along with a description of | |
64 | each. | |
67 | AIRR-seq data is input as a data frame following the Change-O standard where | |
68 | each row represents a unique observation and columns represent data about | |
69 | that observation. The names of the required columns are provided below | |
70 | along with a description of each. | |
65 | 71 | |
66 | 72 | Column Name | Description |
67 | 73 | ----------------------|--------------------------------------------------------- |
70 | 76 | `J_CALL` | (Comma separated) name(s) of the nearest J allele(s) |
71 | 77 | `JUNCTION_LENGTH` | Length of the junction region of the V(D)J sample |
72 | 78 | |
73 | An example dataset is provided with the `tigger` package. It contains unique | |
74 | functional sequences assigned to IGHV1 family genes isolated from individual | |
75 | PGP1 (referenced in Gadala-Maria *et al.* 2015). | |
79 | An example dataset is provided with the `tigger` package as `SampleDb`. It | |
80 | contains unique functional sequences assigned to IGHV1 family genes isolated | |
81 | from individual PGP1 (referenced in Gadala-Maria *et al.* 2015). | |
76 | 82 | |
77 | 83 | The database of germline sequences should be provided in FASTA format with |
78 | 84 | sequences gapped according to the IMGT numbering scheme ([[3]][3]). IGHV alleles in |
79 | the IMGT database (build 201408-4) are provided with this package. You may read | |
80 | in your own fasta file using `readIgFasta`. | |
85 | the IMGT database (build 201408-4) are provided with this package as `GermlineIGHV`. | |
86 | You may read in your own fasta file using `readIgFasta`. | |
81 | 87 | |
82 | 88 | ```{r, eval=TRUE, message=FALSE, warning=FALSE} |
89 | # Load packages required for this example | |
83 | 90 | library(tigger) |
84 | 91 | library(dplyr) |
85 | # Load example sequence data and example germline database | |
86 | data(sample_db, germline_ighv) | |
87 | ``` | |
88 | ||
89 | ## Running TIgGER | |
90 | ||
91 | The functions provided by this package can be used to perform any combination of | |
92 | the following: | |
93 | ||
94 | 1. Infer the presence of novel IGHV alleles not in the germline database | |
95 | 2. Infer the individual's IGHV genotype | |
96 | 3. Correct the IGHV allele calls of the samples based on the IGHV genotype | |
97 | ||
98 | ### Novel Alleles | |
92 | ``` | |
93 | ||
94 | ## Novel allele detection | |
99 | 95 | |
100 | 96 | Potential novel alleles can be detected by TIgGER. Some of these may be included |
101 | 97 | in the genotype later (see below). `findNovelAlleles` will return a `data.frame` |
108 | 104 | |
109 | 105 | ```{r, eval=TRUE, warning=FALSE} |
110 | 106 | # Detect novel alleles |
111 | novel_df <- findNovelAlleles(sample_db, germline_ighv, nproc=1) | |
107 | novel <- findNovelAlleles(SampleDb, GermlineIGHV, nproc=1) | |
112 | 108 | ``` |
113 | 109 | |
114 | 110 | ```{r, eval=TRUE, warning=FALSE} |
115 | 111 | # Extract and view the rows that contain successful novel allele calls |
116 | novel <- selectNovel(novel_df) | |
117 | novel[1:3] | |
112 | novel_rows <- selectNovel(novel) | |
113 | novel_rows[1:3] | |
118 | 114 | ``` |
119 | 115 | |
120 | 116 | The TIgGER procedure for identifying novel alleles (see citation above) involves |
147 | 143 | |
148 | 144 | ```{r, eval=TRUE, warning=FALSE, fig.width=6, fig.height=8} |
149 | 145 | # Plot evidence of the first (and only) novel allele from the example data |
150 | plotNovel(sample_db, novel[1, ]) | |
151 | ``` | |
152 | ||
153 | ### Genotype | |
154 | An individual's genotype can be inferred using the function `inferGenotype`. | |
155 | This function will remove from the genotype rare/erroneous allele calls which | |
156 | may result from mutations in allele-differentiating regions. This is done by | |
157 | determining the fewest alleles that account for nearly all (default is 7/8) of | |
158 | the allele calls made. The user may opt to only use sequences which perfectly | |
159 | match germline alleles, and may opt to include potential novel alleles. | |
160 | (The genotype output is designed to be human readable, though `plotGenotype` | |
161 | can be used to make a colorful visualization.) For each allele, the | |
162 | number of sequences which match the germline are listed in the same order as | |
163 | the alleles are listed. The total number of sequences that match any allele of | |
164 | that gene is also given. To output these alleles as a names vector of nucleotide | |
165 | sequences, the user may use the function `genotypeFasta`. To save this vector to | |
166 | a fasta file, `writeFasta` may be used. | |
146 | plotNovel(SampleDb, novel[1, ]) | |
147 | ``` | |
148 | ||
149 | ## Inferring genotypes | |
150 | ||
151 | An individual's genotype can be inferred using the functions `inferGenotype` or | |
152 | `inferGenotypeBayesian`. Using one of these functions allows the removal from the | |
153 | genotype of rare/erroneous allele calls which may result from mutations in | |
154 | allele-differentiating regions. `inferGenotype` uses a frequency method to | |
155 | decide which alleles belong to the subject's genotype, whereas | |
156 | `inferGenotypeBayesian` infers a subject's genotype by applying a Bayesian | |
157 | framework and provides a confidence estimate associated with | |
158 | the genotype calls. | |
159 | ||
160 | ||
161 | ### Frequency genotyping approach | |
162 | ||
163 | `inferGenotype` identifies the fewest alleles that account for | |
164 | nearly all (default is 7/8) of the allele calls made. The user may opt to only | |
165 | use sequences which perfectly match germline alleles, and may opt to include | |
166 | potential novel alleles. (The genotype output is designed to be human readable, | |
167 | though `plotGenotype` can be used to make a colorful visualization.) For each | |
168 | allele, the number of sequences which match the germline are listed in the same | |
169 | order as the alleles are listed. The total number of sequences that match any | |
170 | allele of that gene is also given. To output these alleles as a named vector of | |
171 | nucleotide sequences, the user may use the function `genotypeFasta`. To save | |
172 | this vector to a fasta file, `writeFasta` may be used. | |
167 | 173 | |
168 | 174 | ```{r, eval=TRUE, warning=FALSE, fig.width=4, fig.height=3} |
169 | 175 | # Infer the individual's genotype, using only unmutated sequences and checking |
170 | 176 | # for the use of the novel alleles inferred in the earlier step. |
171 | geno <- inferGenotype(sample_db, find_unmutated = TRUE, | |
172 | germline_db = germline_ighv, novel_df = novel_df) | |
177 | geno <- inferGenotype(SampleDb, germline_db=GermlineIGHV, novel=novel, | |
178 | find_unmutated=TRUE) | |
173 | 179 | # Save the genotype sequences to a vector |
174 | genotype_seqs <- genotypeFasta(geno, germline_ighv, novel_df) | |
180 | genotype_db <- genotypeFasta(geno, GermlineIGHV, novel) | |
175 | 181 | # Visualize the genotype and sequence counts |
176 | 182 | print(geno) |
177 | 183 | # Make a colorful visualization. Bars indicate presence, not proportion. |
178 | 184 | plotGenotype(geno, text_size = 10) |
179 | ||
180 | ``` | |
181 | ||
182 | ### Corrected Allele Calls | |
185 | ``` | |
186 | ||
187 | ### Bayesian genotyping approach | |
188 | ||
189 | The method `inferGenotypeBayesian` analyzes the posterior probabilities of | |
190 | possible allele distributions, considering up to four distinct alleles per | |
191 | V gene, corresponding to a gene duplication with both loci being heterozygous | |
192 | (i.e., homozygous, heterozygous with one copy of each allele, etc.). The | |
193 | posterior probabilities for these four possible models are compared and a Bayes | |
194 | factor is calculated for the two most probable models. This Bayes factor | |
195 | reflects the confidence in the genotyping call of the method. The Bayesian | |
196 | method doesn't use the strict cutoff criterion `fraction_to_explain` that | |
197 | `inferGenotype` uses wherein only the minimum set of alleles explaining | |
198 | 88% (7/8) of apparently-unmutated sequences are included in the genotype. | |
199 | ||
200 | ||
201 | ```{r, eval=TRUE, warning=FALSE, fig.width=4, fig.height=3} | |
202 | # Infer the individual's genotype, using the bayesian method | |
203 | geno_bayesian <- inferGenotypeBayesian(SampleDb, germline_db=GermlineIGHV, | |
204 | novel=novel, find_unmutated=TRUE) | |
205 | # Visualize the genotype and sequence counts | |
206 | print(geno_bayesian) | |
207 | # Make a colorful visualization. Bars indicate presence, not proportion. | |
208 | plotGenotype(geno_bayesian, text_size=10) | |
209 | ``` | |
210 | ||
211 | ## Correcting allele calls | |
183 | 212 | |
184 | 213 | Finally, the original V allele calls may be limited to only those within the |
185 | 214 | inferred genotype. This can be done by using the function `reassignAlleles`. |
186 | By corrected the calls in this manner, the user can greatly reduce the numer of | |
215 | By correcting the calls in this manner, the user can greatly reduce the number of | |
187 | 216 | ambiguous allele calls (where a single sample sequence is assigned to multiple |
188 | 217 | V alleles, thus preventing the mutation analysis of allele-differentiating |
189 | 218 | positions). Additionally, assignments to erroneous not-in-genotype alleles |
191 | 220 | |
192 | 221 | ```{r, eval=TRUE, warning=FALSE} |
193 | 222 | # Use the personalized genotype to determine corrected allele assignments |
194 | V_CALL_GENOTYPED <- reassignAlleles(sample_db, genotype_seqs) | |
195 | # Append the corrected calls to the original data.frame | |
196 | sample_db <- bind_cols(sample_db, V_CALL_GENOTYPED) | |
223 | # Updated genotype will be placed in the V_CALL_GENOTYPED column | |
224 | sample_db <- reassignAlleles(SampleDb, genotype_db) | |
197 | 225 | ``` |
198 | 226 | |
199 | 227 | From here, one may proceed with further downstream analyses, but with the |
200 | 228 | advantage of having much-improved allele calls. Besides having discovered |
201 | alleles not in the IGMT database, the calls made by IMGT have been tailored to | |
229 | alleles not in the IMGT database, the calls made by IMGT have been tailored to | |
202 | 230 | the subject's genotype, greatly reducing the number of problematic calls, as |
203 | 231 | can be seen below. |
204 | 232 | |
205 | 233 | ```{r, eval=TRUE, warning=FALSE} |
206 | 234 | # Find the set of alleles in the original calls that were not in the genotype |
207 | 235 | not_in_genotype <- sample_db$V_CALL %>% |
208 | strsplit(",") %>% | |
209 | unlist() %>% | |
210 | unique() %>% | |
211 | setdiff(names(genotype_seqs)) | |
236 | strsplit(",") %>% | |
237 | unlist() %>% | |
238 | unique() %>% | |
239 | setdiff(names(genotype_db)) | |
212 | 240 | |
213 | 241 | # Determine the fraction of calls that were ambiguous before/after correction |
214 | 242 | # and the fraction that contained original calls to non-genotype alleles. Note |
215 | 243 | # that by design, only genotype alleles are allowed in "after" calls. |
216 | data.frame(Ambiguous = c(mean(grepl(",",sample_db$V_CALL)), | |
217 | mean(grepl(",",sample_db$V_CALL_GENOTYPED))), | |
218 | NotInGenotype = c(mean(sample_db$V_CALL %in% not_in_genotype), | |
219 | mean(sample_db$V_CALL_GENOTYPED %in% not_in_genotype)), | |
220 | row.names = c("Before", "After")) %>% | |
244 | data.frame(Ambiguous=c(mean(grepl(",", sample_db$V_CALL)), | |
245 | mean(grepl(",", sample_db$V_CALL_GENOTYPED))), | |
246 | NotInGenotype=c(mean(sample_db$V_CALL %in% not_in_genotype), | |
247 | mean(sample_db$V_CALL_GENOTYPED %in% not_in_genotype)), | |
248 | row.names=c("Before", "After")) %>% | |
221 | 249 | t() %>% round(3) |
222 | ||
223 | ``` | |
224 | ||
250 | ``` | |
225 | 251 | |
226 | 252 | ## References |
227 | 253 | |
232 | 258 | [1]: http://www.imgt.org/IMGTindex/IMGTHighV-QUEST.html "Alamyar et al. (2010)" |
233 | 259 | [2]: http://www.ncbi.nlm.nih.gov/pubmed/20147303 "Munshaw and Kepler (2010)" |
234 | 260 | [3]: http://www.ncbi.nlm.nih.gov/pubmed/12477501 "Lefranc et al. (2003)" |
261 |