diff --git a/DESCRIPTION b/DESCRIPTION index f3f69cb..9e82e39 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,27 +1,42 @@ Package: gbm -Version: 2.1.3 -Date: 2017-03-21 +Version: 2.1.4 Title: Generalized Boosted Regression Models -Author: Greg Ridgeway with contributions from - others -Maintainer: ORPHANED -Depends: R (>= 2.9.0), survival, lattice, splines, parallel -Suggests: RUnit -Description: An implementation of extensions to Freund and - Schapire's AdaBoost algorithm and Friedman's gradient boosting - machine. Includes regression methods for least squares, - absolute loss, t-distribution loss, quantile regression, - logistic, multinomial logistic, Poisson, Cox proportional - hazards partial likelihood, AdaBoost exponential loss, - Huberized hinge loss, and Learning to Rank measures - (LambdaMart). +Authors@R: c( + person("Brandon", "Greenwell", + email = "greenwell.brandon@gmail.com", + role = c("aut", "cre"), + comment = c(ORCID = "0000-0002-8120-0084")), + person("Bradley", "Boehmke", + email = "bradleyboehmke@gmail.com", + role = "aut", + comment = c(ORCID = "0000-0002-3611-8516")), + person("Jay", "Cunningham", + email = "james@notbadafterall.com", + role = "aut"), + person("GBM", "Developers", + role = "aut", + comment = "https://github.com/gbm-developers") + ) +Depends: R (>= 2.9.0) +Imports: gridExtra, lattice, parallel, survival +Suggests: knitr, pdp, RUnit, splines, viridis +Description: An implementation of extensions to Freund and Schapire's AdaBoost + algorithm and Friedman's gradient boosting machine. Includes regression + methods for least squares, absolute loss, t-distribution loss, quantile + regression, logistic, multinomial logistic, Poisson, Cox proportional hazards + partial likelihood, AdaBoost exponential loss, Huberized hinge loss, and + Learning to Rank measures (LambdaMart). Originally developed by Greg Ridgeway. License: GPL (>= 2) | file LICENSE -URL: http://code.google.com/p/gradientboostedmodels/ -Packaged: 2017-03-21 06:44:01 UTC; ripley +URL: https://github.com/gbm-developers/gbm +BugReports: https://github.com/gbm-developers/gbm/issues +RoxygenNote: 6.1.0 +VignetteBuilder: knitr NeedsCompilation: yes +Packaged: 2018-09-16 06:19:54 UTC; ripley +Author: Brandon Greenwell [aut, cre] (), + Bradley Boehmke [aut] (), + Jay Cunningham [aut], + GBM Developers [aut] (https://github.com/gbm-developers) +Maintainer: Brandon Greenwell Repository: CRAN -Date/Publication: 2017-03-21 06:48:03 UTC -X-CRAN-Original-Maintainer: Harry Southworth - -X-CRAN-Comment: Orphaned on 2017-03-21 as long-standing errors were not - corrected. NMU by CRAN team. +Date/Publication: 2018-09-16 08:20:11 diff --git a/LICENSE b/LICENSE index 4bc04f8..e78fd2a 100644 --- a/LICENSE +++ b/LICENSE @@ -13,4 +13,4 @@ General Public License for more details. 
Copies of the relevant licenses can be found at: -http://www.r-project.org/Licenses/ +https://www.r-project.org/Licenses/ diff --git a/MD5 b/MD5 index e460461..4b64c85 100644 --- a/MD5 +++ b/MD5 @@ -1,33 +1,33 @@ 108bdba2eb6f2ba6ce890f47224ef68f *CHANGES -cf8eedb04e0b7de4ba83cabfe278a328 *DESCRIPTION -c2cae135a9c0d4ae15e14e89166ba841 *LICENSE -dcb19d54815086722ad172c662cb7d03 *NAMESPACE -6a1293bc6f87d439da835b1b837f9c52 *R/basehaz.gbm.R -cc5e4cd5f5d5e23382bae904e9ada152 *R/calibrate.plot.R -5615ac799ce14603a692a2c29be9648f *R/checks.R -cf5a5bce0653ae59317ddac8bfe8d389 *R/gbm.R -428c0d3515d5fcbbdd992e10f5d22793 *R/gbm.fit.R -1de9823ae906c064f61a39bd1e0241d3 *R/gbm.loss.R -ab8e510ccde4446a7c93ff384ba3217c *R/gbm.more.R -5a79d41470d1f8ae3b8c278bc5e12389 *R/gbm.perf.R -0fdb6a883897939376827795e4ee5230 *R/gbmCluster.R -f4651f14ae6acdfa96319bb257f9d0e1 *R/gbmCrossVal.R -7201fac67c6152443cf2a2c3b5989116 *R/gbmDoFold.R -f5cc3af1a8eb7ddbf962038e88d27953 *R/getCVgroup.R -efd18f017f7a73397141bf4239c922ce *R/getStratify.R -696197960954e0845b8998911987cab2 *R/guessDist.R -be47e36ef092244404831df5227f6d65 *R/interact.gbm.R -f8c4c5e164b772b3bfc152b8e5659e2e *R/ir.measures.R -bbfe015167ca3c75ecd155f6b090f661 *R/permutation.test.gbm.R -51c2749906af39dc17eb1af54b4d861d *R/plot.gbm.R -b9c2bb5000212628b390b81dfdd895c0 *R/predict.gbm.R -7e3daea77a7b6ffa18e9f81cf0e0b152 *R/pretty.gbm.tree.R -13ac361d8e3f54893f7de0b66351eee4 *R/print.gbm.R -36d2345c029a4e8384703c92d46f9b2e *R/reconstructGBMdata.R -792e2a5c7cdfeeee3e29c4e418f8af35 *R/relative.influence.R -e8cf40a7c7efcd820e908a43252cfc2b *R/shrink.gbm.R -eefc2a06d746e77ac2ba101d240640b8 *R/shrink.gbm.pred.R -5b47e86c97e9b464bd64e7ea647c65ae *R/test.gbm.R +82faeac45c35b19ca76f8278d98e1d20 *DESCRIPTION +67f2f9cc8297be2f12dfe86e05277383 *LICENSE +00dda5f78be66b96a668b74b523fcac1 *NAMESPACE +ab6e6d294c6c724e76c5f069c1694fd2 *NEWS.md +061c315ef880f845918ff59cce721239 *R/basehaz.gbm.R +aef3622e1f5a19f9c74616130321851f *R/calibrate.plot.R +af7dcaeddbc7e6eb31b66290a98c0a1c *R/gbm-internals.R +2f21a77c0c4d5274533173b223f7f05e *R/gbm-package.R +cc641d322c124bfab6d7e2351cf4e6d2 *R/gbm.R +1a60700a939bb694799c92073d13b3a5 *R/gbm.fit.R +2f6a79af8a23dd4be5283881a82e5f5c *R/gbm.more.R +cdcc395f477e8a83fde52d313d5d9760 *R/gbm.object.R +b999e62a4727556bb73d893db39e9a83 *R/gbm.perf.R +f17f3d39a4d6820e78130748ce8032ff *R/gbmCrossVal.R +40231a31962f0df1ab182edcffe51b9f *R/interact.gbm.R +fc877c59338b8343545050803c29ec95 *R/ir.measures.R +1e1e9648a40d27a07c63e9c4103ba4d0 *R/plot.gbm.R +23d6e774a0facb281c6f179703b9533d *R/predict.gbm.R +48438bd417c4a7b3c0495c901c5d5060 *R/pretty.gbm.tree.R +b068e5396186cc21060477aac914abe7 *R/print.gbm.R +af4fd23ba860c912a1a237fb3b5631d1 *R/reconstructGBMdata.R +1a8dd026617a7bdc35d9e7ed8232c399 *R/relative.influence.R +81f913b053b7d402f4a808aeb3670e2f *R/shrink.gbm.R +d001fbd3c7de86463f4d0f1dff63a70b *R/shrink.gbm.pred.R +21f1a9fdd69be98ad81bbca7e18ec8a7 *R/test.gbm.R +3fc23fb8a1c816ac430c4e836a08078a *R/utils.R +08ab323918a24917e4d4638ca01c841a *R/zzz.R +55ae3c9b2954cd0ac1c317b5698d77c3 *README.md +4dc9151409b8112474ac3f1da044f7f7 *build/vignette.rds 4e38ebb4d3578e523b7d94fc9ece3d65 *demo/00Index e3bd8606063f15ded6ab3261c13d22af *demo/OOB-reps.R 354344b4f6e8a232508ef872ced5efa3 *demo/bernoulli.R @@ -37,37 +37,33 @@ af763746809ed98e48e065f77942cb05 *demo/pairwise.R dbff7ebcc6a18e27c1b423fd5db70ae3 *demo/printExamples.R 79316127956b8f5291f5021f1e7c89ef *demo/robustReg.R -5e674508b7fde23e247a6e1a6c6b6ec6 *inst/doc/gbm.Sweave 
-e73636a53327b5e049e5764b0620d03e *inst/doc/gbm.pdf -b63bc1c2450ad4bca8db60e03b932a53 *inst/doc/gbm.tex -64dbd4ec219c6e855b87bc4ddeba111e *inst/doc/index.html -dc706f07b81a76bf9aab2edf4641e86f *inst/doc/oobperf2.eps -7ba661d197d25537a69fc34d737b4d29 *inst/doc/oobperf2.pdf -9d73da9632fed38c327ffbd1b072347b *inst/doc/shrinkage-v-iterations.eps -3fda19791155842b0e48565781441aa2 *inst/doc/shrinkage-v-iterations.pdf +c044e4fcd21ef75478830ede774cfba7 *inst/doc/gbm.Rnw +d9afae55c8fff7ec22fc678aa3224efb *inst/doc/gbm.pdf 4d55dd49b93485a78ecb50caafd19b4e *inst/doc/shrinkageplot.R -90fd593dd07098b5600fb650e86733ff *inst/doc/srcltx.sty -ce7a173a73fb952a1bf40cb65e3b99f2 *man/basehaz.gbm.Rd -7fca3316fe15ef40546c3db911d67086 *man/calibrate.plot.Rd -99fab30dc167a5c90a1d1424e71a25f4 *man/gbm-internal.Rd -dbbaa87e0b50024671667d8d38008e64 *man/gbm-package.Rd -eac981fe86aac2cf2b76f2bcee97289f *man/gbm.Rd -089cf48c905c3429ed63f69a0cd982b5 *man/gbm.object.Rd -3ed5b048c81d016868ca2799e4504419 *man/gbm.perf.Rd -7359f0a3b1f2d27cf29e497745c6ba59 *man/gbm.roc.area.Rd -9e8eb660baefa82bc008cbf7e12babf8 *man/gbmCrossVal.Rd -8fca4f44be806cb17eb37affe8334618 *man/interact.gbm.Rd -a8728abc1dc77b599c2aa7d1df6f982e *man/plot.gbm.Rd -5896d84873dd1ed5d22005b5b37b17b6 *man/predict.gbm.Rd -1656ffd7646d41236545e0399a70afdd *man/pretty.gbm.tree.Rd -894215a9e1e715f39a6cb79a6fe81baf *man/print.gbm.Rd -0da8961be170c9a72df248d6f0fe985e *man/quantile.rug.Rd -9fbb2bddffae7a639d4f702817eeecb3 *man/reconstructGBMdata.Rd -e1dea92edf78383b17139d45c318294c *man/relative.influence.Rd -b58470798d31cfceceeec40252ce833f *man/shrink.gbm.Rd -ef52c476e46b4c64eee269064ea58b64 *man/shrink.gbm.pred.Rd -b73e9273873b1178e9a116187105c022 *man/summary.gbm.Rd -3e0b677bccf30388ec0fc96f77f5fb62 *man/validate.Rd +e89d6b6a7a2f19974d5c7916c9e2ae66 *man/basehaz.gbm.Rd +c606780ccf3028850a848dfc2b3f4739 *man/calibrate.plot.Rd +bf74b54c920807d509d5ff19e45e95d4 *man/gbm-internals.Rd +5f96c05f991a485fbfe7a23b87b3d649 *man/gbm-package.Rd +db08fe6fff6da69ebfbaad46ff0d902f *man/gbm.Rd +94befbc345d33d0ed250a227a1268603 *man/gbm.fit.Rd +a65152118be58b4d8bf48ad8c93614c7 *man/gbm.more.Rd +728fa0d75f96519d0156aa2891362b9b *man/gbm.object.Rd +d007fd2b010c4b6ccbd4c0ec2aba9ea0 *man/gbm.perf.Rd +c43f6a77ca7bec407e85b642d6dfa2be *man/gbm.roc.area.Rd +2cd76f2ffbdc511bb0ac0a9dc1fb393b *man/gbmCrossVal.Rd +7d42ecd6cfbbb3e83f94685f0ef7add4 *man/grid.arrange.Rd +c1789d7d5b7fc9be7665be55c1893d35 *man/interact.gbm.Rd +0a3f9f38c375609ef6380dceb1d4128c *man/plot.gbm.Rd +2a0d1ae9483de0ffb214d25623821f68 *man/predict.gbm.Rd +e368dcac4b75c8273529151e0087c5d4 *man/pretty.gbm.tree.Rd +21c028bad14805f40e0a7a0dc7e49e64 *man/print.gbm.Rd +f9563a4ec1265edfec56ecbdb8148e38 *man/quantile.rug.Rd +27aa52e20ea8281697e8357a36d58b85 *man/reconstructGBMdata.Rd +f17f451739be17e89ec1b227b6602c86 *man/relative.influence.Rd +6f99e3dde82cbc922d9f1fc7f22bdcd9 *man/shrink.gbm.Rd +d75c1d9e1ff0c6a83bb37df2591ae4d9 *man/shrink.gbm.pred.Rd +dd2dfa92c91ff3ae020d9dbdd23657fb *man/summary.gbm.Rd +8201654f42537ca205d0d5b138848df8 *man/test.gbm.Rd 0d32ce72a7b02fc57d602c60b9ba8305 *src/adaboost.cpp 2f5d22dc3043e69628763cbe303e6b5f *src/adaboost.h 6d2bd44a11975c8f023640eb7a9036c3 *src/bac/gaussian.cpp @@ -82,17 +78,17 @@ 91d88e455827695f63bf23df5dfb3108 *src/distribution.h 6d2bd44a11975c8f023640eb7a9036c3 *src/gaussian.cpp 6c2bf2616a3b4491aaaf501346246d3f *src/gaussian.h +889bfcdd44dc35824be51ba8ae2bd517 *src/gbm-init.c 1d8d4e59887769602b1d3c8dc3d5f94f *src/gbm.cpp 0f49e8549558916322ec80e29b591a73 *src/gbm.h 
c0c572eb464dae70700ffe8fdc3f6b9f *src/gbm_engine.cpp b3f1f49fa614ac6cfd52b28191bfdb70 *src/gbm_engine.h -f1da15864dab021cdac1617ffba4ff0f *src/gbmentry.cpp +1d924856d046e942a312d373cfce230f *src/gbmentry.cpp 1fba83f37e9f092d8b005e0c8f32a97b *src/huberized.cpp 141e5b762944c14a0b6294e15046296f *src/huberized.h -cd2cedbf213ddbc773ea20fe354a93ae *src/init.c 10dcf061e2807ca52f811ec6650f33ad *src/laplace.cpp 53b4d97c482517fbbc97162da1adf891 *src/laplace.h -e7958b4630de29d3848d057d2aebc6e2 *src/locationm.cpp +d25bcfb8da3565604f902270b25eb470 *src/locationm.cpp 932f3d98f158ebf6ae11ed47e873a7f3 *src/locationm.h 39094967ceaabf7c744bc93d0b86d22f *src/matrix.h 7242e54abea29c46990c4aabba7a65b6 *src/multinomial.cpp @@ -121,3 +117,8 @@ 9ab15eb81fc9a18ee7d14a76f7aefd2a *src/tdist.h 276e36bf158250eb458a1cdabcf975b5 *src/tree.cpp 6b2f1cd60e5d67638e110e1ac9552b27 *src/tree.h +c044e4fcd21ef75478830ede774cfba7 *vignettes/gbm.Rnw +b5633beb372053eac8730e76d8999ce9 *vignettes/gbm.bib +7ba661d197d25537a69fc34d737b4d29 *vignettes/oobperf2.pdf +3fda19791155842b0e48565781441aa2 *vignettes/shrinkage-v-iterations.pdf +90fd593dd07098b5600fb650e86733ff *vignettes/srcltx.sty diff --git a/NAMESPACE b/NAMESPACE index e3c3345..8d1dc86 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,30 +1,93 @@ -# Export all names that don't start with "." -exportPattern("^[^\\.]") +# Generated by roxygen2: do not edit by hand -useDynLib(gbm) - -importFrom(survival, Surv) - -# ns from splines is used in one of the examples -importFrom(splines, ns, splineDesign) - -# xyplot is used, which means several functions internal -# to lattice will also be used. Import the lot. +S3method(plot,gbm) +S3method(predict,gbm) +S3method(print,gbm) +S3method(summary,gbm) +export(basehaz.gbm) +export(calibrate.plot) +export(checkID) +export(checkMissing) +export(checkOffset) +export(checkWeights) +export(gbm) +export(gbm.conc) +export(gbm.fit) +export(gbm.loss) +export(gbm.more) +export(gbm.perf) +export(gbm.roc.area) +export(gbmCluster) +export(gbmCrossVal) +export(gbmCrossValErr) +export(gbmCrossValModelBuild) +export(gbmCrossValPredictions) +export(gbmDoFold) +export(getCVgroup) +export(getStratify) +export(getVarNames) +export(grid.arrange) +export(guessDist) +export(interact.gbm) +export(ir.measure.auc) +export(ir.measure.conc) +export(ir.measure.map) +export(ir.measure.mrr) +export(ir.measure.ndcg) +export(perf.pairwise) +export(permutation.test.gbm) +export(plot.gbm) +export(predict.gbm) +export(pretty.gbm.tree) +export(quantile.rug) +export(reconstructGBMdata) +export(relative.influence) +export(show.gbm) +export(shrink.gbm) +export(shrink.gbm.pred) +export(summary.gbm) +export(test.gbm) +export(test.relative.influence) +export(validate.gbm) import(lattice) - -import(parallel) - -importFrom("grDevices", "rainbow") -importFrom("graphics", "abline", "axis", "barplot", "lines", "mtext", - "par", "plot", "polygon", "rug", "segments", "title") -importFrom("stats", "approx", "binomial", "delete.response", - "gaussian", "glm", "loess", "model.extract", "model.frame", - "model.offset", "model.response", "model.weights", - "na.pass", "poisson", "predict", "quantile", "rbinom", - "reformulate", "rexp", "rnorm", "runif", "sd", "supsmu", - "terms", "var", "weighted.mean") - -S3method(plot, gbm) -S3method(predict, gbm) -S3method(print, gbm) -S3method(summary, gbm) +importFrom(grDevices,rainbow) +importFrom(graphics,abline) +importFrom(graphics,axis) +importFrom(graphics,barplot) +importFrom(graphics,lines) +importFrom(graphics,mtext) +importFrom(graphics,par) 
+importFrom(graphics,plot) +importFrom(graphics,polygon) +importFrom(graphics,rug) +importFrom(graphics,segments) +importFrom(graphics,title) +importFrom(gridExtra,grid.arrange) +importFrom(stats,approx) +importFrom(stats,binomial) +importFrom(stats,delete.response) +importFrom(stats,gaussian) +importFrom(stats,glm) +importFrom(stats,loess) +importFrom(stats,model.extract) +importFrom(stats,model.frame) +importFrom(stats,model.offset) +importFrom(stats,model.response) +importFrom(stats,model.weights) +importFrom(stats,na.pass) +importFrom(stats,poisson) +importFrom(stats,predict) +importFrom(stats,quantile) +importFrom(stats,rbinom) +importFrom(stats,reformulate) +importFrom(stats,reorder) +importFrom(stats,rexp) +importFrom(stats,rnorm) +importFrom(stats,runif) +importFrom(stats,sd) +importFrom(stats,supsmu) +importFrom(stats,terms) +importFrom(stats,var) +importFrom(stats,weighted.mean) +importFrom(survival,Surv) +useDynLib(gbm, .registration = TRUE) diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 0000000..96b0012 --- /dev/null +++ b/NEWS.md @@ -0,0 +1,11 @@ +# NEWS for gbm package + +### Changes for version 2.1.4 +* Switched from `CHANGES` to `NEWS` file. +* Updated links and maintainer field in `DESCRIPTION` file. +* Fixed bug caused by factors with unused levels [(#5)](https://github.com/gbm-developers/gbm/issues/5). +* Fixed bug with axis labels in the `plot` method for `"gbm"` objects. [(#17)](https://github.com/gbm-developers/gbm/issues/17). +* The `plot` method for `"gbm"` objects is now more consistent and always returns a `"trellis"` object [(#19)](https://github.com/gbm-developers/gbm/issues/19). Consequently, setting graphical parameters via `par` will no longer have an effect on the output from `plot.gbm`. +* The `plot` method for `"gbm"` objects gained five new arguments: `level.plot`, `contour`, `number`, `overlap`, and `col.regions`; see `?plot.gbm` for details. +* The default color palette for false color level plots in `plot.gbm` has changed to the Matplotlib 'viridis' color map. +* Fixed a number of references and URLs. diff --git a/R/basehaz.gbm.R b/R/basehaz.gbm.R index 5aaa5e0..5c07eaf 100644 --- a/R/basehaz.gbm.R +++ b/R/basehaz.gbm.R @@ -1,43 +1,78 @@ -# compute Breslow estimator of the baseline hazard function -basehaz.gbm <- function(t,delta,f.x, - t.eval=NULL, - smooth=FALSE, - cumulative=TRUE) -{ - t.unique <- sort(unique(t[delta==1])) - alpha <- length(t.unique) - for(i in 1:length(t.unique)) - { - alpha[i] <- sum(t[delta==1]==t.unique[i])/ - sum(exp(f.x[t>=t.unique[i]])) - } +# rd2rox <- function(path = file.choose()) { +# info <- Rd2roxygen::parse_file(path) +# cat(Rd2roxygen::create_roxygen(info), sep = "\n") +# } - if(!smooth && !cumulative) - { - if(!is.null(t.eval)) + +#' Baseline hazard function +#' +#' Computes the Breslow estimator of the baseline hazard function for a +#' proportional hazard regression model. +#' +#' The proportional hazard model assumes h(t|x)=lambda(t)*exp(f(x)). +#' \code{\link{gbm}} can estimate the f(x) component via partial likelihood. +#' After estimating f(x), \code{basehaz.gbm} can compute the a nonparametric +#' estimate of lambda(t). +#' +#' @param t The survival times. +#' @param delta The censoring indicator. +#' @param f.x The predicted values of the regression model on the log hazard +#' scale. +#' @param t.eval Values at which the baseline hazard will be evaluated. 
+#' @param smooth If \code{TRUE} \code{basehaz.gbm} will smooth the estimated +#' baseline hazard using Friedman's super smoother \code{\link{supsmu}}. +#' @param cumulative If \code{TRUE} the cumulative survival function will be +#' computed. +#' @return A vector of length equal to the length of t (or of length +#' \code{t.eval} if \code{t.eval} is not \code{NULL}) containing the baseline +#' hazard evaluated at t (or at \code{t.eval} if \code{t.eval} is not +#' \code{NULL}). If \code{cumulative} is set to \code{TRUE} then the returned +#' vector evaluates the cumulative hazard function at those values. +#' @author Greg Ridgeway \email{gregridgeway@@gmail.com} +#' @seealso \code{\link[survival]{survfit}}, \code{\link{gbm}} +#' @references +#' N. Breslow (1972). "Discussion of `Regression Models and +#' Life-Tables' by D.R. Cox," Journal of the Royal Statistical Society, Series +#' B, 34(2):216-217. +#' +#' N. Breslow (1974). "Covariance analysis of censored survival data," +#' Biometrics 30:89-99. +#' @keywords methods survival +#' @export +basehaz.gbm <- function(t,delta, f.x, t.eval = NULL, smooth = FALSE, + cumulative = TRUE) { + + t.unique <- sort(unique(t[delta==1])) + alpha <- length(t.unique) + for(i in 1:length(t.unique)) { + alpha[i] <- sum(t[delta==1]==t.unique[i])/ + sum(exp(f.x[t>=t.unique[i]])) + } + + if(!smooth && !cumulative) { + if(!is.null(t.eval)) { + stop("Cannot evaluate unsmoothed baseline hazard at t.eval.") + } + } else { + if(smooth && !cumulative) { + lambda.smooth <- supsmu(t.unique,alpha) + } else { + if(smooth && cumulative) { - stop("Cannot evaluate unsmoothed baseline hazard at t.eval.") + lambda.smooth <- supsmu(t.unique, cumsum(alpha)) + } else { # (!smooth && cumulative) - THE DEFAULT + lambda.smooth <- list(x = t.unique, y = cumsum(alpha)) } - } else - if(smooth && !cumulative) - { - lambda.smooth <- supsmu(t.unique,alpha) - } else - if(smooth && cumulative) - { - lambda.smooth <- supsmu(t.unique,cumsum(alpha)) - } else # (!smooth && cumulative) - THE DEFAULT - { - lambda.smooth <- list(x=t.unique,y=cumsum(alpha)) - } + } + } - if(!is.null(t.eval)) - { - obj <- approx(lambda.smooth$x,lambda.smooth$y,xout=t.eval)$y - } else - { - obj <- approx(lambda.smooth$x,lambda.smooth$y,xout=t)$y - } - - return(obj) + + obj <- if(!is.null(t.eval)) { + approx(lambda.smooth$x, lambda.smooth$y, xout = t.eval)$y + } else { + approx(lambda.smooth$x, lambda.smooth$y, xout = t)$y + } + + return(obj) + } diff --git a/R/calibrate.plot.R b/R/calibrate.plot.R index a015a69..25d1a9f 100644 --- a/R/calibrate.plot.R +++ b/R/calibrate.plot.R @@ -1,90 +1,191 @@ -quantile.rug <- function(x,prob=(0:10)/10,...) -{ - quants <- quantile(x[!is.na(x)],prob=prob) - if(length(unique(quants)) < length(prob)) - { - quants <- jitter(quants) - } - rug(quants,...) +#' Quantile rug plot +#' +#' Marks the quantiles on the axes of the current plot. +#' +#' @param x A numeric vector. +#' +#' @param prob The quantiles of x to mark on the x-axis. +#' +#' @param ... Additional optional arguments to be passed onto +#' \code{\link[graphics]{rug}} +#' +#' @return No return values. +#' +#' @author Greg Ridgeway \email{gregridgeway@@gmail.com}. +#' +#' @seealso \code{\link[graphics]{plot}}, \code{\link[stats]{quantile}}, +#' \code{\link[base]{jitter}}, \code{\link[graphics]{rug}}. +#' +#' @keywords aplot +#' +#' @export quantile.rug +#' +#' @examples +#' x <- rnorm(100) +#' y <- rnorm(100) +#' plot(x, y) +#' quantile.rug(x) +quantile.rug <- function(x, prob = 0:10/10, ...) 
{ + quants <- quantile(x[!is.na(x)], prob = prob) + if(length(unique(quants)) < length(prob)) { + quants <- jitter(quants) + } + rug(quants, ...) } -calibrate.plot <- function(y,p, - distribution="bernoulli", - replace=TRUE, - line.par=list(col="black"), - shade.col="lightyellow", - shade.density=NULL, - rug.par=list(side=1), - xlab="Predicted value", - ylab="Observed average", - xlim=NULL,ylim=NULL, - knots=NULL,df=6, - ...) -{ - data <- data.frame(y=y,p=p) - if(is.null(knots) && is.null(df)) - stop("Either knots or df must be specified") - if((df != round(df)) || (df<1)) - stop("df must be a positive integer") - - if(distribution=="bernoulli") - { - family1 = binomial - } else if(distribution=="poisson") - { - family1 = poisson - } else - { - family1 = gaussian - } - gam1 <- glm(y~ns(p,df=df,knots=knots),data=data,family=family1) - - x <- seq(min(p),max(p),length=200) - yy <- predict(gam1,newdata=data.frame(p=x),se.fit=TRUE,type="response") - - x <- x[!is.na(yy$fit)] - yy$se.fit <- yy$se.fit[!is.na(yy$fit)] - yy$fit <- yy$fit[!is.na(yy$fit)] - - if(!is.na(shade.col)) - { - se.lower <- yy$fit-2*yy$se.fit - se.upper <- yy$fit+2*yy$se.fit - if(distribution=="bernoulli") - { - se.lower[se.lower < 0] <- 0 - se.upper[se.upper > 1] <- 1 - } - if(distribution=="poisson") - { - se.lower[se.lower < 0] <- 0 - } - if(is.null(xlim)) xlim <- range(se.lower,se.upper,x) - if(is.null(ylim)) ylim <- range(se.lower,se.upper,x) - } - else - { - if(is.null(xlim)) xlim <- range(yy$fit,x) - if(is.null(ylim)) ylim <- range(yy$fit,x) - } - if(replace) - { - plot(0,0, - type="n", - xlab=xlab,ylab=ylab, - xlim=xlim,ylim=ylim, - ...) - } - if(!is.na(shade.col)) - { - polygon(c(x,rev(x),x[1]), - c(se.lower,rev(se.upper),se.lower[1]), - col=shade.col, - border=NA, - density=shade.density) - } - lines(x,yy$fit,col=line.par$col) - quantile.rug(p,side=rug.par$side) - abline(0,1,col="red") +#' Calibration plot +#' +#' An experimental diagnostic tool that plots the fitted values versus the +#' actual average values. Currently only available when +#' \code{distribution = "bernoulli"}. +#' +#' Uses natural splines to estimate E(y|p). Well-calibrated predictions imply +#' that E(y|p) = p. The plot also includes a pointwise 95% confidence band. +#' +#' @param y The outcome 0-1 variable. +#' +#' @param p The predictions estimating E(y|x). +#' +#' @param distribution The loss function used in creating \code{p}. +#' \code{bernoulli} and \code{poisson} are currently the only special options. +#' All others default to squared error assuming \code{gaussian}. +#' +#' @param replace Determines whether this plot will replace or overlay the +#' current plot. \code{replace=FALSE} is useful for comparing the calibration +#' of several methods. +#' +#' @param line.par Graphics parameters for the line. +#' +#' @param shade.col Color for shading the 2 SE region. \code{shade.col=NA} +#' implies no 2 SE region. +#' +#' @param shade.density The \code{density} parameter for \code{\link{polygon}}. +#' +#' @param rug.par Graphics parameters passed to \code{\link{rug}}. +#' +#' @param xlab x-axis label corresponding to the predicted values. +#' +#' @param ylab y-axis label corresponding to the observed average. +#' +#' @param xlim,ylim x- and y-axis limits. If not specified te function will +#' select limits. +#' +#' @param knots,df These parameters are passed directly to +#' \code{\link[splines]{ns}} for constructing a natural spline smoother for the +#' calibration curve. +#' +#' @param ... 
Additional optional arguments to be passed onto +#' \code{\link[graphics]{plot}} +#' +#' @return No return values. +#' +#' @author Greg Ridgeway \email{gregridgeway@@gmail.com} +#' +#' @references +#' J.F. Yates (1982). "External correspondence: decomposition of +#' the mean probability score," Organisational Behaviour and Human Performance +#' 30:132-156. +#' +#' D.J. Spiegelhalter (1986). "Probabilistic Prediction in Patient Management +#' and Clinical Trials," Statistics in Medicine 5:421-433. +#' @keywords hplot +#' +#' @export +#' +#' @examples +#' # Don't want R CMD check to think there is a dependency on rpart +#' # so comment out the example +#' #library(rpart) +#' #data(kyphosis) +#' #y <- as.numeric(kyphosis$Kyphosis)-1 +#' #x <- kyphosis$Age +#' #glm1 <- glm(y~poly(x,2),family=binomial) +#' #p <- predict(glm1,type="response") +#' #calibrate.plot(y, p, xlim=c(0,0.6), ylim=c(0,0.6)) +calibrate.plot <- function(y, p, distribution = "bernoulli", replace = TRUE, + line.par = list(col = "black"), + shade.col = "lightyellow", + shade.density = NULL, rug.par = list(side = 1), + xlab = "Predicted value", ylab = "Observed average", + xlim = NULL, ylim = NULL, knots = NULL, df = 6, ...) + { + + # Sanity check + if (!requireNamespace("splines", quietly = TRUE)) { + stop("The splines package is needed for this function to work. Please ", + "install it.", call. = FALSE) + } + + data <- data.frame(y = y, p = p) + + # Check spline parameters + if(is.null(knots) && is.null(df)) { + stop("Either knots or df must be specified") + } + if((df != round(df)) || (df < 1)) { + stop("df must be a positive integer") + } + + # Check distribution + if(distribution == "bernoulli") { + family1 <- binomial + } else if(distribution == "poisson") { + family1 <- poisson + } else { + family1 <- gaussian + } + + # Fit a GLM using natural cubic splines + gam1 <- glm(y ~ splines::ns(p, df = df, knots = knots), data = data, + family = family1) + + # Plotting data + x <- seq(min(p), max(p), length = 200) + yy <- predict(gam1, newdata = data.frame(p = x), se.fit = TRUE, + type = "response") + x <- x[!is.na(yy$fit)] + yy$se.fit <- yy$se.fit[!is.na(yy$fit)] + yy$fit <- yy$fit[!is.na(yy$fit)] + + # Plotting parameters + if(!is.na(shade.col)) { + se.lower <- yy$fit - 2 * yy$se.fit + se.upper <- yy$fit + 2 * yy$se.fit + if(distribution == "bernoulli") { + se.lower[se.lower < 0] <- 0 + se.upper[se.upper > 1] <- 1 + } + if(distribution == "poisson") { + se.lower[se.lower < 0] <- 0 + } + if(is.null(xlim)) { + xlim <- range(se.lower, se.upper, x) + } + if(is.null(ylim)) { + ylim <- range(se.lower, se.upper, x) + } + } + else { + if(is.null(xlim)) { + xlim <- range(yy$fit,x) + } + if(is.null(ylim)) { + ylim <- range(yy$fit,x) + } + } + + # Construct plot + if(replace) { + plot(0, 0, type = "n", xlab = xlab, ylab = ylab, xlim = xlim, ylim = ylim, + ...) + } + if(!is.na(shade.col)) { + polygon(c(x, rev(x), x[1L]), c(se.lower, rev(se.upper), se.lower[1L]), + col = shade.col, border = NA, density = shade.density) + } + lines(x, yy$fit, col = line.par$col) + quantile.rug(p, side = rug.par$side) + abline(0, 1, col = "red") + } diff --git a/R/checks.R b/R/checks.R deleted file mode 100644 index ed2596c..0000000 --- a/R/checks.R +++ /dev/null @@ -1,45 +0,0 @@ -checkMissing <- function(x, y){ - nms <- getVarNames(x) - #### Check for NaNs in x and NAs in response - j <- apply(x, 2, function(z) any(is.nan(z))) - if(any(j)) { - stop("Use NA for missing values. 
NaN found in predictor variables:", - paste(nms[j],collapse=",")) - } - if(any(is.na(y))) stop("Missing values are not allowed in the response") - invisible(NULL) - } - -checkID <- function(id){ - # Check for disallowed interaction.depth - if(id < 1) { - stop("interaction.depth must be at least 1.") - } - else if(id > 49) { - stop("interaction.depth must be less than 50. You should also ask yourself why you want such large interaction terms. A value between 1 and 5 should be sufficient for most applications.") - } - invisible(id) -} - -checkWeights <- function(w, n){ - # Logical checks on weights - if(length(w)==0) { w <- rep(1, n) } - else if(any(w < 0)) stop("negative weights not allowed") - w -} - -checkOffset <- function(o, y){ - # Check offset - if(is.null(o) | all(o==0)) { o <- NA } - else if(length(o) != length(y)) { - stop("The length of offset does not equal the length of y.") - } - o -} - -getVarNames <- function(x){ - if(is.matrix(x)) { var.names <- colnames(x) } - else if(is.data.frame(x)) { var.names <- names(x) } - else { var.names <- paste("X",1:ncol(x),sep="") } - var.names - } diff --git a/R/gbm-internals.R b/R/gbm-internals.R new file mode 100644 index 0000000..7441d6f --- /dev/null +++ b/R/gbm-internals.R @@ -0,0 +1,155 @@ +#' gbm internal functions +#' +#' Helper functions for preprocessing data prior to building a \code{"gbm"} +#' object. +#' +#' @param y The response variable. +#' @param d,distribution The distribution, either specified by the user or +#' implied. +#' @param class.stratify.cv Whether or not to stratify, if provided by the user. +#' @param i.train Computed internally by \code{gbm}. +#' @param group The group, if using \code{distibution = "pairwise"}. +#' @param strat Whether or not to stratify. +#' @param cv.folds The number of cross-validation folds. +#' @param x The design matrix. +#' @param id The interaction depth. +#' @param w The weights. +#' @param n The number of cores to use in the cluster. +#' @param o The offset. +#' +#' @details +#' These are functions used internally by \code{gbm} and not intended for direct +#' use by the user. +#' +#' @aliases guessDist getStratify getCVgroup checkMissing checkID checkWeights +#' checkOffset getVarNames gbmCluster +#' +#' @rdname gbm-internals +#' @export +guessDist <- function(y){ + # If distribution is not given, try to guess it + if (length(unique(y)) == 2){ d <- "bernoulli" } + else if (class(y) == "Surv" ){ d <- "coxph" } + else if (is.factor(y)){ d <- "multinomial" } + else{ d <- "gaussian" } + cat(paste("Distribution not specified, assuming", d, "...\n")) + list(name=d) +} + + +#' @rdname gbm-internals +#' @export +getCVgroup <- function(distribution, class.stratify.cv, y, i.train, cv.folds, + group) { + # Construct cross-validation groups depending on the type of model to be fit + if (distribution$name %in% c( "bernoulli", "multinomial" ) & class.stratify.cv ){ + nc <- table(y[i.train]) # Number in each class + uc <- names(nc) + if (min(nc) < cv.folds){ + stop( paste("The smallest class has only", min(nc), "objects in the training set. 
Can't do", cv.folds, "fold cross-validation.")) + } + cv.group <- vector(length = length(i.train)) + for (i in 1:length(uc)){ + cv.group[y[i.train] == uc[i]] <- sample(rep(1:cv.folds , length = nc[i])) + } + } # Close if + else if (distribution$name == "pairwise") { + # Split into CV folds at group boundaries + s <- sample(rep(1:cv.folds, length=nlevels(group))) + cv.group <- s[as.integer(group[i.train])] + } + else { + cv.group <- sample(rep(1:cv.folds, length=length(i.train))) + } + cv.group + } + + +#' @rdname gbm-internals +#' @export +getStratify <- function(strat, d){ + if (is.null(strat)){ + if (d$name == "multinomial" ){ strat <- TRUE } + else { strat <- FALSE } + } + else { + if (!is.element(d$name, c( "bernoulli", "multinomial"))){ + warning("You can only use class.stratify.cv when distribution is bernoulli or multinomial. Ignored.") + strat <- FALSE + } + } # Close else + strat +} + + +#' @rdname gbm-internals +#' @export +checkMissing <- function(x, y){ + nms <- getVarNames(x) + #### Check for NaNs in x and NAs in response + j <- apply(x, 2, function(z) any(is.nan(z))) + if(any(j)) { + stop("Use NA for missing values. NaN found in predictor variables:", + paste(nms[j],collapse=",")) + } + if(any(is.na(y))) stop("Missing values are not allowed in the response") + invisible(NULL) +} + + +#' @rdname gbm-internals +#' @export +checkWeights <- function(w, n){ + # Logical checks on weights + if(length(w)==0) { w <- rep(1, n) } + else if(any(w < 0)) stop("negative weights not allowed") + w +} + + +#' @rdname gbm-internals +#' @export +checkID <- function(id){ + # Check for disallowed interaction.depth + if(id < 1) { + stop("interaction.depth must be at least 1.") + } + else if(id > 49) { + stop("interaction.depth must be less than 50. You should also ask yourself why you want such large interaction terms. A value between 1 and 5 should be sufficient for most applications.") + } + invisible(id) +} + + +#' @rdname gbm-internals +#' @export +checkOffset <- function(o, y){ + # Check offset + if(is.null(o) | all(o==0)) { o <- NA } + else if(length(o) != length(y)) { + stop("The length of offset does not equal the length of y.") + } + o +} + + +#' @rdname gbm-internals +#' @export +getVarNames <- function(x){ + if(is.matrix(x)) { var.names <- colnames(x) } + else if(is.data.frame(x)) { var.names <- names(x) } + else { var.names <- paste("X",1:ncol(x),sep="") } + var.names +} + + +#' @rdname gbm-internals +#' @export +gbmCluster <- function(n){ + # If number of cores (n) not given, try to work it out from the number + # that appear to be available and the number of CV folds. + if (is.null(n)){ + n <- parallel::detectCores() + } + parallel::makeCluster(n) +} diff --git a/R/gbm-package.R b/R/gbm-package.R new file mode 100644 index 0000000..23c2894 --- /dev/null +++ b/R/gbm-package.R @@ -0,0 +1,54 @@ +#' Generalized Boosted Regression Models (GBMs) +#' +#' This package implements extensions to Freund and Schapire's AdaBoost +#' algorithm and J. Friedman's gradient boosting machine. Includes regression +#' methods for least squares, absolute loss, logistic, Poisson, Cox +#' proportional hazards partial likelihood, multinomial, t-distribution, +#' AdaBoost exponential loss, Learning to Rank, and Huberized hinge loss. 
+#' +#' Further information is available in vignette: +#' \code{browseVignettes(package = "gbm")} +#' +#' @import lattice +#' +#' @importFrom grDevices rainbow +#' @importFrom graphics abline axis barplot lines mtext par plot polygon rug +#' @importFrom graphics segments title +#' @importFrom stats approx binomial delete.response gaussian glm loess +#' @importFrom stats model.extract model.frame model.offset model.response +#' @importFrom stats model.weights na.pass poisson predict quantile rbinom +#' @importFrom stats reformulate reorder rexp rnorm runif sd supsmu terms var +#' @importFrom stats weighted.mean +#' @importFrom survival Surv +#' +#' @useDynLib gbm, .registration = TRUE +#' +#' @name gbm-package +#' +#' @docType package +#' +#' @author Greg Ridgeway \email{gregridgeway@@gmail.com} with contributions by +#' Daniel Edwards, Brian Kriegler, Stefan Schroedl and Harry Southworth. +#' +#' @references +#' Y. Freund and R.E. Schapire (1997) \dQuote{A decision-theoretic +#' generalization of on-line learning and an application to boosting,} +#' \emph{Journal of Computer and System Sciences,} 55(1):119-139. +#' +#' G. Ridgeway (1999). \dQuote{The state of boosting,} \emph{Computing Science +#' and Statistics} 31:172-181. +#' +#' J.H. Friedman, T. Hastie, R. Tibshirani (2000). \dQuote{Additive Logistic +#' Regression: a Statistical View of Boosting,} \emph{Annals of Statistics} +#' 28(2):337-374. +#' +#' J.H. Friedman (2001). \dQuote{Greedy Function Approximation: A Gradient +#' Boosting Machine,} \emph{Annals of Statistics} 29(5):1189-1232. +#' +#' J.H. Friedman (2002). \dQuote{Stochastic Gradient Boosting,} +#' \emph{Computational Statistics and Data Analysis} 38(4):367-378. +#' +#' The \url{http://statweb.stanford.edu/~jhf/R-MART} website. +#' +#' @keywords package +NULL \ No newline at end of file diff --git a/R/gbm.R b/R/gbm.R index fc38b93..1835de1 100644 --- a/R/gbm.R +++ b/R/gbm.R @@ -1,170 +1,482 @@ -.onAttach <- function(lib, pkg) -{ - vers <- library(help=gbm)$info[[1]] - vers <- vers[grep("Version:",vers)] - vers <- rev(strsplit(vers," ")[[1]])[1] - packageStartupMessage(paste("Loaded gbm",vers)) +#' Generalized Boosted Regression Modeling (GBM) +#' +#' Fits generalized boosted regression models. For technical details, see the +#' vignette: \code{utils::browseVignettes("gbm")}. +#' +#' \code{gbm.fit} provides the link between R and the C++ gbm engine. +#' \code{gbm} is a front-end to \code{gbm.fit} that uses the familiar R +#' modeling formulas. However, \code{\link[stats]{model.frame}} is very slow if +#' there are many predictor variables. For power-users with many variables use +#' \code{gbm.fit}. For general practice \code{gbm} is preferable. +#' +#' @param formula A symbolic description of the model to be fit. The formula +#' may include an offset term (e.g. y~offset(n)+x). If +#' \code{keep.data = FALSE} in the initial call to \code{gbm} then it is the +#' user's responsibility to resupply the offset to \code{\link{gbm.more}}. +#' +#' @param distribution Either a character string specifying the name of the +#' distribution to use or a list with a component \code{name} specifying the +#' distribution and any additional parameters needed. If not specified, +#' \code{gbm} will try to guess: if the response has only 2 unique values, +#' bernoulli is assumed; otherwise, if the response is a factor, multinomial is +#' assumed; otherwise, if the response has class \code{"Surv"}, coxph is +#' assumed; otherwise, gaussian is assumed. 
+#' +#' Currently available options are \code{"gaussian"} (squared error), +#' \code{"laplace"} (absolute loss), \code{"tdist"} (t-distribution loss), +#' \code{"bernoulli"} (logistic regression for 0-1 outcomes), +#' \code{"huberized"} (huberized hinge loss for 0-1 outcomes), classes), +#' \code{"adaboost"} (the AdaBoost exponential loss for 0-1 outcomes), +#' \code{"poisson"} (count outcomes), \code{"coxph"} (right censored +#' observations), \code{"quantile"}, or \code{"pairwise"} (ranking measure +#' using the LambdaMart algorithm). +#' +#' If quantile regression is specified, \code{distribution} must be a list of +#' the form \code{list(name = "quantile", alpha = 0.25)} where \code{alpha} is +#' the quantile to estimate. The current version's quantile regression method +#' does not handle non-constant weights and will stop. +#' +#' If \code{"tdist"} is specified, the default degrees of freedom is 4 and +#' this can be controlled by specifying +#' \code{distribution = list(name = "tdist", df = DF)} where \code{DF} is your +#' chosen degrees of freedom. +#' +#' If "pairwise" regression is specified, \code{distribution} must be a list of +#' the form \code{list(name="pairwise",group=...,metric=...,max.rank=...)} +#' (\code{metric} and \code{max.rank} are optional, see below). \code{group} is +#' a character vector with the column names of \code{data} that jointly +#' indicate the group an instance belongs to (typically a query in Information +#' Retrieval applications). For training, only pairs of instances from the same +#' group and with different target labels can be considered. \code{metric} is +#' the IR measure to use, one of +#' \describe{ +#' \item{list("conc")}{Fraction of concordant pairs; for binary labels, this +#' is equivalent to the Area under the ROC Curve} +#' \item{:}{Fraction of concordant pairs; for binary labels, this is +#' equivalent to the Area under the ROC Curve} +#' \item{list("mrr")}{Mean reciprocal rank of the highest-ranked positive +#' instance} +#' \item{:}{Mean reciprocal rank of the highest-ranked positive instance} +#' \item{list("map")}{Mean average precision, a generalization of \code{mrr} +#' to multiple positive instances}\item{:}{Mean average precision, a +#' generalization of \code{mrr} to multiple positive instances} +#' \item{list("ndcg:")}{Normalized discounted cumulative gain. The score is +#' the weighted sum (DCG) of the user-supplied target values, weighted +#' by log(rank+1), and normalized to the maximum achievable value. This +#' is the default if the user did not specify a metric.} +#' } +#' +#' \code{ndcg} and \code{conc} allow arbitrary target values, while binary +#' targets {0,1} are expected for \code{map} and \code{mrr}. For \code{ndcg} +#' and \code{mrr}, a cut-off can be chosen using a positive integer parameter +#' \code{max.rank}. If left unspecified, all ranks are taken into account. +#' +#' Note that splitting of instances into training and validation sets follows +#' group boundaries and therefore only approximates the specified +#' \code{train.fraction} ratio (the same applies to cross-validation folds). +#' Internally, queries are randomly shuffled before training, to avoid bias. +#' +#' Weights can be used in conjunction with pairwise metrics, however it is +#' assumed that they are constant for instances from the same group. +#' +#' For details and background on the algorithm, see e.g. Burges (2010). +#' +#' @param data an optional data frame containing the variables in the model. 
By +#' default the variables are taken from \code{environment(formula)}, typically +#' the environment from which \code{gbm} is called. If \code{keep.data=TRUE} in +#' the initial call to \code{gbm} then \code{gbm} stores a copy with the +#' object. If \code{keep.data=FALSE} then subsequent calls to +#' \code{\link{gbm.more}} must resupply the same dataset. It becomes the user's +#' responsibility to resupply the same data at this point. +#' +#' @param weights an optional vector of weights to be used in the fitting +#' process. Must be positive but do not need to be normalized. If +#' \code{keep.data=FALSE} in the initial call to \code{gbm} then it is the +#' user's responsibility to resupply the weights to \code{\link{gbm.more}}. +#' +#' @param var.monotone an optional vector, the same length as the number of +#' predictors, indicating which variables have a monotone increasing (+1), +#' decreasing (-1), or arbitrary (0) relationship with the outcome. +#' +#' @param n.trees Integer specifying the total number of trees to fit. This is +#' equivalent to the number of iterations and the number of basis functions in +#' the additive expansion. Default is 100. +#' +#' @param interaction.depth Integer specifying the maximum depth of each tree +#' (i.e., the highest level of variable interactions allowed). A value of 1 +#' implies an additive model, a value of 2 implies a model with up to 2-way +#' interactions, etc. Default is 1. +#' +#' @param n.minobsinnode Integer specifying the minimum number of observations +#' in the terminal nodes of the trees. Note that this is the actual number of +#' observations, not the total weight. +#' +#' @param shrinkage a shrinkage parameter applied to each tree in the +#' expansion. Also known as the learning rate or step-size reduction; 0.001 to +#' 0.1 usually work, but a smaller learning rate typically requires more trees. +#' Default is 0.1. +#' +#' @param bag.fraction the fraction of the training set observations randomly +#' selected to propose the next tree in the expansion. This introduces +#' randomnesses into the model fit. If \code{bag.fraction} < 1 then running the +#' same model twice will result in similar but different fits. \code{gbm} uses +#' the R random number generator so \code{set.seed} can ensure that the model +#' can be reconstructed. Preferably, the user can save the returned +#' \code{\link{gbm.object}} using \code{\link{save}}. Default is 0.5. +#' +#' @param train.fraction The first \code{train.fraction * nrows(data)} +#' observations are used to fit the \code{gbm} and the remainder are used for +#' computing out-of-sample estimates of the loss function. +#' +#' @param cv.folds Number of cross-validation folds to perform. If +#' \code{cv.folds}>1 then \code{gbm}, in addition to the usual fit, will +#' perform a cross-validation, calculate an estimate of generalization error +#' returned in \code{cv.error}. +#' +#' @param keep.data a logical variable indicating whether to keep the data and +#' an index of the data stored with the object. Keeping the data and index +#' makes subsequent calls to \code{\link{gbm.more}} faster at the cost of +#' storing an extra copy of the dataset. +#' +#' @param verbose Logical indicating whether or not to print out progress and +#' performance indicators (\code{TRUE}). If this option is left unspecified for +#' \code{gbm.more}, then it uses \code{verbose} from \code{object}. Default is +#' \code{FALSE}. 
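# A minimal sketch of the `var.monotone` argument described above: one entry
# per predictor, +1 for a monotone increasing relationship with the outcome,
# -1 for decreasing, and 0 for no constraint. The data and tuning values here
# are hypothetical and kept small.
d <- data.frame(x1 = runif(200), x2 = runif(200), x3 = runif(200))
d$y <- d$x1 - d$x2 + rnorm(200, sd = 0.1)
fit <- gbm(y ~ x1 + x2 + x3, data = d, distribution = "gaussian",
           var.monotone = c(1, -1, 0),  # x1 increasing, x2 decreasing, x3 free
           n.trees = 100, interaction.depth = 2, shrinkage = 0.1,
           bag.fraction = 0.5, n.minobsinnode = 10)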
+#' +#' @param class.stratify.cv Logical indicating whether or not the +#' cross-validation should be stratified by class. Defaults to \code{TRUE} for +#' \code{distribution = "multinomial"} and is only implemented for +#' \code{"multinomial"} and \code{"bernoulli"}. The purpose of stratifying the +#' cross-validation is to help avoiding situations in which training sets do +#' not contain all classes. +#' +#' @param n.cores The number of CPU cores to use. The cross-validation loop +#' will attempt to send different CV folds off to different cores. If +#' \code{n.cores} is not specified by the user, it is guessed using the +#' \code{detectCores} function in the \code{parallel} package. Note that the +#' documentation for \code{detectCores} makes clear that it is not failsafe and +#' could return a spurious number of available cores. +#' +#' @return A \code{\link{gbm.object}} object. +#' +#' @details +#' This package implements the generalized boosted modeling framework. Boosting +#' is the process of iteratively adding basis functions in a greedy fashion so +#' that each additional basis function further reduces the selected loss +#' function. This implementation closely follows Friedman's Gradient Boosting +#' Machine (Friedman, 2001). +#' +#' In addition to many of the features documented in the Gradient Boosting +#' Machine, \code{gbm} offers additional features including the out-of-bag +#' estimator for the optimal number of iterations, the ability to store and +#' manipulate the resulting \code{gbm} object, and a variety of other loss +#' functions that had not previously had associated boosting algorithms, +#' including the Cox partial likelihood for censored data, the poisson +#' likelihood for count outcomes, and a gradient boosting implementation to +#' minimize the AdaBoost exponential loss function. +#' +#' @author Greg Ridgeway \email{gregridgeway@@gmail.com} +#' +#' Quantile regression code developed by Brian Kriegler +#' \email{bk@@stat.ucla.edu} +#' +#' t-distribution, and multinomial code developed by Harry Southworth and +#' Daniel Edwards +#' +#' Pairwise code developed by Stefan Schroedl \email{schroedl@@a9.com} +#' +#' @seealso \code{\link{gbm.object}}, \code{\link{gbm.perf}}, +#' \code{\link{plot.gbm}}, \code{\link{predict.gbm}}, \code{\link{summary.gbm}}, +#' and \code{\link{pretty.gbm.tree}}. +#' +#' @references +#' Y. Freund and R.E. Schapire (1997) \dQuote{A decision-theoretic +#' generalization of on-line learning and an application to boosting,} +#' \emph{Journal of Computer and System Sciences,} 55(1):119-139. +#' +#' G. Ridgeway (1999). \dQuote{The state of boosting,} \emph{Computing Science +#' and Statistics} 31:172-181. +#' +#' J.H. Friedman, T. Hastie, R. Tibshirani (2000). \dQuote{Additive Logistic +#' Regression: a Statistical View of Boosting,} \emph{Annals of Statistics} +#' 28(2):337-374. +#' +#' J.H. Friedman (2001). \dQuote{Greedy Function Approximation: A Gradient +#' Boosting Machine,} \emph{Annals of Statistics} 29(5):1189-1232. +#' +#' J.H. Friedman (2002). \dQuote{Stochastic Gradient Boosting,} +#' \emph{Computational Statistics and Data Analysis} 38(4):367-378. +#' +#' B. Kriegler (2007). Cost-Sensitive Stochastic Gradient Boosting Within a +#' Quantitative Regression Framework. Ph.D. Dissertation. University of +#' California at Los Angeles, Los Angeles, CA, USA. Advisor(s) Richard A. Berk. +#' url{https://dl.acm.org/citation.cfm?id=1354603}. +#' +#' C. Burges (2010). 
\dQuote{From RankNet to LambdaRank to LambdaMART: An +#' Overview,} Microsoft Research Technical Report MSR-TR-2010-82. +#' +#' @export +#' +#' @examples +#' # +#' # A least squares regression example +#' # +#' +#' # Simulate data +#' set.seed(101) # for reproducibility +#' N <- 1000 +#' X1 <- runif(N) +#' X2 <- 2 * runif(N) +#' X3 <- ordered(sample(letters[1:4], N, replace = TRUE), levels = letters[4:1]) +#' X4 <- factor(sample(letters[1:6], N, replace = TRUE)) +#' X5 <- factor(sample(letters[1:3], N, replace = TRUE)) +#' X6 <- 3 * runif(N) +#' mu <- c(-1, 0, 1, 2)[as.numeric(X3)] +#' SNR <- 10 # signal-to-noise ratio +#' Y <- X1 ^ 1.5 + 2 * (X2 ^ 0.5) + mu +#' sigma <- sqrt(var(Y) / SNR) +#' Y <- Y + rnorm(N, 0, sigma) +#' X1[sample(1:N,size=500)] <- NA # introduce some missing values +#' X4[sample(1:N,size=300)] <- NA # introduce some missing values +#' data <- data.frame(Y, X1, X2, X3, X4, X5, X6) +#' +#' # Fit a GBM +#' set.seed(102) # for reproducibility +#' gbm1 <- gbm(Y ~ ., data = data, var.monotone = c(0, 0, 0, 0, 0, 0), +#' distribution = "gaussian", n.trees = 100, shrinkage = 0.1, +#' interaction.depth = 3, bag.fraction = 0.5, train.fraction = 0.5, +#' n.minobsinnode = 10, cv.folds = 5, keep.data = TRUE, +#' verbose = FALSE, n.cores = 1) +#' +#' # Check performance using the out-of-bag (OOB) error; the OOB error typically +#' # underestimates the optimal number of iterations +#' best.iter <- gbm.perf(gbm1, method = "OOB") +#' print(best.iter) +#' +#' # Check performance using the 50% heldout test set +#' best.iter <- gbm.perf(gbm1, method = "test") +#' print(best.iter) +#' +#' # Check performance using 5-fold cross-validation +#' best.iter <- gbm.perf(gbm1, method = "cv") +#' print(best.iter) +#' +#' # Plot relative influence of each variable +#' par(mfrow = c(1, 2)) +#' summary(gbm1, n.trees = 1) # using first tree +#' summary(gbm1, n.trees = best.iter) # using estimated best number of trees +#' +#' # Compactly print the first and last trees for curiosity +#' print(pretty.gbm.tree(gbm1, i.tree = 1)) +#' print(pretty.gbm.tree(gbm1, i.tree = gbm1$n.trees)) +#' +#' # Simulate new data +#' set.seed(103) # for reproducibility +#' N <- 1000 +#' X1 <- runif(N) +#' X2 <- 2 * runif(N) +#' X3 <- ordered(sample(letters[1:4], N, replace = TRUE)) +#' X4 <- factor(sample(letters[1:6], N, replace = TRUE)) +#' X5 <- factor(sample(letters[1:3], N, replace = TRUE)) +#' X6 <- 3 * runif(N) +#' mu <- c(-1, 0, 1, 2)[as.numeric(X3)] +#' Y <- X1 ^ 1.5 + 2 * (X2 ^ 0.5) + mu + rnorm(N, 0, sigma) +#' data2 <- data.frame(Y, X1, X2, X3, X4, X5, X6) +#' +#' # Predict on the new data using the "best" number of trees; by default, +#' # predictions will be on the link scale +#' Yhat <- predict(gbm1, newdata = data2, n.trees = best.iter, type = "link") +#' +#' # least squares error +#' print(sum((data2$Y - Yhat)^2)) +#' +#' # Construct univariate partial dependence plots +#' p1 <- plot(gbm1, i.var = 1, n.trees = best.iter) +#' p2 <- plot(gbm1, i.var = 2, n.trees = best.iter) +#' p3 <- plot(gbm1, i.var = "X3", n.trees = best.iter) # can use index or name +#' grid.arrange(p1, p2, p3, ncol = 3) +#' +#' # Construct bivariate partial dependence plots +#' plot(gbm1, i.var = 1:2, n.trees = best.iter) +#' plot(gbm1, i.var = c("X2", "X3"), n.trees = best.iter) +#' plot(gbm1, i.var = 3:4, n.trees = best.iter) +#' +#' # Construct trivariate partial dependence plots +#' plot(gbm1, i.var = c(1, 2, 6), n.trees = best.iter, +#' continuous.resolution = 20) +#' plot(gbm1, i.var = 1:3, n.trees = best.iter) +#' plot(gbm1, i.var 
= 2:4, n.trees = best.iter) +#' plot(gbm1, i.var = 3:5, n.trees = best.iter) +#' +#' # Add more (i.e., 100) boosting iterations to the ensemble +#' gbm2 <- gbm.more(gbm1, n.new.trees = 100, verbose = FALSE) +gbm <- function(formula = formula(data), distribution = "bernoulli", + data = list(), weights, var.monotone = NULL, n.trees = 100, + interaction.depth = 1, n.minobsinnode = 10, shrinkage = 0.1, + bag.fraction = 0.5, train.fraction = 1.0, cv.folds = 0, + keep.data = TRUE, verbose = FALSE, class.stratify.cv = NULL, + n.cores = NULL) { + + # Match the call to gbm + mcall <- match.call() + + # Verbose output? + lVerbose <- if (!is.logical(verbose)) { + FALSE + } else { + verbose + } + + # Construct model frame, terms object, weights, and offset + mf <- match.call(expand.dots = FALSE) + m <- match(c("formula", "data", "weights", "offset"), names(mf), 0) + mf <- mf[c(1, m)] + mf$drop.unused.levels <- TRUE + mf$na.action <- na.pass + mf[[1]] <- as.name("model.frame") + m <- mf + mf <- eval(mf, parent.frame()) + Terms <- attr(mf, "terms") + w <- model.weights(mf) + offset <- model.offset(mf) + + # Determine and check response distribution + distribution <- if (missing(distribution)) { + y <- data[, all.vars(formula)[1L], drop = TRUE] + guessDist(y) + } else if (is.character(distribution)) { + list(name = distribution) + } + if (!is.element(distribution$name, getAvailableDistributions())) { + stop("Distribution ", distribution$name, " is not supported.") + } + + # Extract and check response values + y <- model.response(mf) + + # Construct data frame of predictor values + var.names <- attributes(Terms)$term.labels + x <- model.frame(terms(reformulate(var.names)), data = data, + na.action = na.pass) + + # Extract response name as a character string + response.name <- as.character(formula[[2L]]) + + # Stratify cross-validation by class (only for bernoulli and multinomial) + class.stratify.cv <- getStratify(class.stratify.cv, d = distribution) + + # Groups (for pairwise distribution only) + group <- NULL + num.groups <- 0 + + # Determine number of training instances + if (distribution$name != "pairwise"){ + + # Number of training instances + nTrain <- floor(train.fraction * nrow(x)) + + } else { + + # Sampling is by group, so we need to calculate them here + distribution.group <- distribution[["group"]] + if (is.null(distribution.group)) { + stop(paste("For pairwise regression, `distribution` must be a list of", + "the form `list(name = \"pairwise\", group = c(\"date\",", + "\"session\", \"category\", \"keywords\"))`.")) + } + + # Check if group names are valid + i <- match(distribution.group, colnames(data)) + if (any(is.na(i))) { + stop("Group column does not occur in data: ", + distribution.group[is.na(i)], ".") + } + + # Construct group index + group <- factor( + do.call(paste, c(data[, distribution.group, drop = FALSE], sep = ":")) + ) + + # Check that weights are constant across groups + if ((!missing(weights)) && (!is.null(weights))) { + w.min <- tapply(w, INDEX = group, FUN = min) + w.max <- tapply(w, INDEX = group, FUN = max) + if (any(w.min != w.max)) { + stop("For `distribution = \"pairwise\"`, all instances for the same ", + "group must have the same weight.") + } + w <- w * length(w.min) / sum(w.min) # normalize across groups + } + + # Shuffle groups to remove bias when split into train/test sets and/or CV + # folds + perm.levels <- levels(group)[sample(1:nlevels(group))] + group <- factor(group, levels = perm.levels) + + # The C function expects instances to be sorted by group and 
descending by + # target + ord.group <- order(group, -y) + group <- group[ord.group] + y <- y[ord.group] + x <- x[ord.group, , drop = FALSE] + w <- w[ord.group] + + # Split into train and validation sets at group boundary + num.groups.train <- max(1, round(train.fraction * nlevels(group))) + + # Include all groups up to the num.groups.train + nTrain <- max(which(group==levels(group)[num.groups.train])) + Misc <- group + + } + + # Set up for k-fold cross-validation + cv.error <- NULL + if(cv.folds > 1) { + cv.results <- gbmCrossVal(cv.folds = cv.folds, nTrain = nTrain, + n.cores = n.cores, + class.stratify.cv = class.stratify.cv, + data = data, x = x, y = y, offset = offset, + distribution = distribution, w = w, + var.monotone = var.monotone, n.trees = n.trees, + interaction.depth = interaction.depth, + n.minobsinnode = n.minobsinnode, + shrinkage = shrinkage, + bag.fraction = bag.fraction, + var.names = var.names, + response.name = response.name, group = group) + cv.error <- cv.results$error + p <- cv.results$predictions + } + + # Fit a GBM + gbm.obj <- gbm.fit(x = x, y = y, offset = offset, distribution = distribution, + w = w, var.monotone = var.monotone, n.trees = n.trees, + interaction.depth = interaction.depth, + n.minobsinnode = n.minobsinnode, shrinkage = shrinkage, + bag.fraction = bag.fraction, nTrain = nTrain, + keep.data = keep.data, verbose = lVerbose, + var.names = var.names, response.name = response.name, + group = group) + + # Attach further components + gbm.obj$train.fraction <- train.fraction + gbm.obj$Terms <- Terms + gbm.obj$cv.error <- cv.error + gbm.obj$cv.folds <- cv.folds + gbm.obj$call <- mcall + gbm.obj$m <- m + if (cv.folds > 0) { + gbm.obj$cv.fitted <- p + } + if (distribution$name == "pairwise") { + # Data has been reordered according to queries. We need to permute the + # fitted values so that they correspond to the original order. 
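# Why `order(ord.group)` restores the original order: for a permutation `ord`,
# indexing by `order(ord)` is its inverse, so `z[ord][order(ord)]` recovers `z`.
# A quick self-contained check with a hypothetical vector:
#   z <- c(10, 20, 30, 40); ord <- c(3, 1, 4, 2)
#   identical(z[ord][order(ord)], z)  # TRUE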
+ gbm.obj$ord.group <- ord.group + gbm.obj$fit <- gbm.obj$fit[order(ord.group)] + } + + # Return "gbm" object + gbm.obj + } - -gbm <- function(formula = formula(data), - distribution = "bernoulli", - data = list(), - weights, - var.monotone = NULL, - n.trees = 100, - interaction.depth = 1, - n.minobsinnode = 10, - shrinkage = 0.001, - bag.fraction = 0.5, - train.fraction = 1.0, - cv.folds=0, - keep.data = TRUE, - verbose = 'CV', - class.stratify.cv=NULL, - n.cores=NULL){ - theCall <- match.call() - - - lVerbose <- if (!is.logical(verbose)) { FALSE } - else { verbose } - - mf <- match.call(expand.dots = FALSE) - m <- match(c("formula", "data", "weights", "offset"), names(mf), 0) - mf <- mf[c(1, m)] - mf$drop.unused.levels <- TRUE - mf$na.action <- na.pass - mf[[1]] <- as.name("model.frame") - m <- mf - mf <- eval(mf, parent.frame()) - Terms <- attr(mf, "terms") - - y <- model.response(mf) - - if (missing(distribution)){ distribution <- guessDist(y) } - else if (is.character(distribution)){ distribution <- list(name=distribution) } - - w <- model.weights(mf) - offset <- model.offset(mf) - - var.names <- attributes(Terms)$term.labels - x <- model.frame(terms(reformulate(var.names)), - data, - na.action=na.pass) - - # get the character name of the response variable - response.name <- as.character(formula[[2]]) - - lVerbose <- if (!is.logical(verbose)) { FALSE } - else { verbose } - - class.stratify.cv <- getStratify(class.stratify.cv, distribution) - - # groups (for pairwise distribution only) - group <- NULL - num.groups <- 0 - - # determine number of training instances - if (distribution$name != "pairwise"){ - nTrain <- floor(train.fraction * nrow(x)) - } - else { - # distribution$name == "pairwise": - # Sampling is by group, so we need to calculate them here - distribution.group <- distribution[["group"]] - if (is.null(distribution.group)) - { - stop("For pairwise regression, the distribution parameter must be a list with a parameter 'group' for the a list of the column names indicating groups, for example list(name=\"pairwise\",group=c(\"date\",\"session\",\"category\",\"keywords\")).") - } - - # Check if group names are valid - i <- match(distribution.group, colnames(data)) - if (any(is.na(i))) - { - stop("Group column does not occur in data: ", distribution.group[is.na(i)]) - } - - # Construct group index - group <- factor(do.call(paste, c(data[,distribution.group, drop=FALSE], sep=":"))) - - # Check that weights are constant across groups - if ((!missing(weights)) && (!is.null(weights))) - { - w.min <- tapply(w, INDEX=group, FUN=min) - w.max <- tapply(w, INDEX=group, FUN=max) - - if (any(w.min != w.max)) - { - stop("For distribution 'pairwise', all instances for the same group must have the same weight") - } - - # Normalize across groups - w <- w * length(w.min) / sum(w.min) - } - - # Shuffle groups, to remove bias when splitting into train/test set and/or CV folds - perm.levels <- levels(group)[sample(1:nlevels(group))] - group <- factor(group, levels=perm.levels) - - # The C function expects instances to be sorted by group and descending by target - ord.group <- order(group, -y) - group <- group[ord.group] - y <- y[ord.group] - x <- x[ord.group,,drop=FALSE] - w <- w[ord.group] - - # Split into train and validation set, at group boundary - num.groups.train <- max(1, round(train.fraction * nlevels(group))) - - # include all groups up to the num.groups.train - nTrain <- max(which(group==levels(group)[num.groups.train])) - Misc <- group - } # close if(distribution$name=="coxph") ... 
- - cv.error <- NULL - if(cv.folds>1) { - cv.results <- gbmCrossVal(cv.folds, nTrain, n.cores, - class.stratify.cv, data, - x, y, offset, distribution, w, var.monotone, - n.trees, interaction.depth, n.minobsinnode, - shrinkage, bag.fraction, - var.names, response.name, group) - cv.error <- cv.results$error - p <- cv.results$predictions - } # Close if(cv.folds > 1 - - gbm.obj <- gbm.fit(x,y, - offset = offset, - distribution = distribution, - w = w, - var.monotone = var.monotone, - n.trees = n.trees, - interaction.depth = interaction.depth, - n.minobsinnode = n.minobsinnode, - shrinkage = shrinkage, - bag.fraction = bag.fraction, - nTrain = nTrain, - keep.data = keep.data, - verbose = lVerbose, - var.names = var.names, - response.name = response.name, - group = group) - - gbm.obj$train.fraction <- train.fraction - gbm.obj$Terms <- Terms - gbm.obj$cv.error <- cv.error - gbm.obj$cv.folds <- cv.folds - gbm.obj$call <- theCall - gbm.obj$m <- m - if (cv.folds > 0){ gbm.obj$cv.fitted <- p } - - if (distribution$name == "pairwise") - { - # Data has been reordered according to queries. - # We need to permute the fitted values to correspond - # to the original order. - gbm.obj$ord.group <- ord.group - gbm.obj$fit <- gbm.obj$fit[order(ord.group)] - } - - return(gbm.obj) -} diff --git a/R/gbm.fit.R b/R/gbm.fit.R index e08cfcd..1e8a7b4 100644 --- a/R/gbm.fit.R +++ b/R/gbm.fit.R @@ -1,404 +1,585 @@ -gbm.fit <- function(x,y, - offset = NULL, - misc = NULL, - distribution = "bernoulli", - w = NULL, - var.monotone = NULL, - n.trees = 100, - interaction.depth = 1, - n.minobsinnode = 10, - shrinkage = 0.001, - bag.fraction = 0.5, - nTrain = NULL, - train.fraction = NULL, - keep.data = TRUE, - verbose = TRUE, - var.names = NULL, - response.name = "y", - group = NULL) -{ - - if(is.character(distribution)) { distribution <- list(name=distribution) } - - cRows <- nrow(x) - cCols <- ncol(x) - - if(nrow(x) != ifelse(class(y)=="Surv", nrow(y), length(y))) { - stop("The number of rows in x does not equal the length of y.") - } - - # the preferred way to specify the number of training instances is via parameter 'nTrain'. - # parameter 'train.fraction' is only maintained for backward compatibility. 
- - if(!is.null(nTrain) && !is.null(train.fraction)) { - stop("Parameters 'nTrain' and 'train.fraction' cannot both be specified") - } - else if(!is.null(train.fraction)) { - warning("Parameter 'train.fraction' of gbm.fit is deprecated, please specify 'nTrain' instead") - nTrain <- floor(train.fraction*cRows) - } - else if(is.null(nTrain)) { - # both undefined, use all training data - nTrain <- cRows - } - - if (is.null(train.fraction)){ - train.fraction <- nTrain / cRows - } - - if(is.null(var.names)) { - var.names <- getVarNames(x) - } - -# if(is.null(response.name)) { response.name <- "y" } - - # check dataset size - if(nTrain * bag.fraction <= 2*n.minobsinnode+1) { - stop("The dataset size is too small or subsampling rate is too large: nTrain*bag.fraction <= n.minobsinnode") - } - - if (distribution$name != "pairwise") { - w <- w*length(w)/sum(w) # normalize to N - } - - # Do sanity checks - ch <- checkMissing(x, y) - interaction.depth <- checkID(interaction.depth) - w <- checkWeights(w, length(y)) - offset <- checkOffset(offset, y) - - Misc <- NA - - # setup variable types - var.type <- rep(0,cCols) - var.levels <- vector("list",cCols) - for(i in 1:length(var.type)) - { - if(all(is.na(x[,i]))) - { - stop("variable ",i,": ",var.names[i]," has only missing values.") +#' Generalized Boosted Regression Modeling (GBM) +#' +#' Workhorse function providing the link between R and the C++ gbm engine. +#' \code{gbm} is a front-end to \code{gbm.fit} that uses the familiar R +#' modeling formulas. However, \code{\link[stats]{model.frame}} is very slow if +#' there are many predictor variables. For power-users with many variables use +#' \code{gbm.fit}. For general practice \code{gbm} is preferable. +#' +#' @param x A data frame or matrix containing the predictor variables. The +#' number of rows in \code{x} must be the same as the length of \code{y}. +#' +#' @param y A vector of outcomes. The number of rows in \code{x} must be the +#' same as the length of \code{y}. +#' +#' @param offset A vector of offset values. +#' +#' @param misc An R object that is simply passed on to the gbm engine. It can be +#' used for additional data for the specific distribution. Currently it is only +#' used for passing the censoring indicator for the Cox proportional hazards +#' model. +#' +#' @param distribution Either a character string specifying the name of the +#' distribution to use or a list with a component \code{name} specifying the +#' distribution and any additional parameters needed. If not specified, +#' \code{gbm} will try to guess: if the response has only 2 unique values, +#' bernoulli is assumed; otherwise, if the response is a factor, multinomial is +#' assumed; otherwise, if the response has class \code{"Surv"}, coxph is +#' assumed; otherwise, gaussian is assumed. +#' +#' Currently available options are \code{"gaussian"} (squared error), +#' \code{"laplace"} (absolute loss), \code{"tdist"} (t-distribution loss), +#' \code{"bernoulli"} (logistic regression for 0-1 outcomes), +#' \code{"huberized"} (huberized hinge loss for 0-1 outcomes), classes), +#' \code{"adaboost"} (the AdaBoost exponential loss for 0-1 outcomes), +#' \code{"poisson"} (count outcomes), \code{"coxph"} (right censored +#' observations), \code{"quantile"}, or \code{"pairwise"} (ranking measure +#' using the LambdaMart algorithm). +#' +#' If quantile regression is specified, \code{distribution} must be a list of +#' the form \code{list(name = "quantile", alpha = 0.25)} where \code{alpha} is +#' the quantile to estimate. 
The current version's quantile regression method
+#' does not handle non-constant weights and will stop.
+#'
+#' If \code{"tdist"} is specified, the default degrees of freedom is 4 and
+#' this can be controlled by specifying
+#' \code{distribution = list(name = "tdist", df = DF)} where \code{DF} is your
+#' chosen degrees of freedom.
+#'
+#' If "pairwise" regression is specified, \code{distribution} must be a list of
+#' the form \code{list(name="pairwise",group=...,metric=...,max.rank=...)}
+#' (\code{metric} and \code{max.rank} are optional, see below). \code{group} is
+#' a character vector with the column names of \code{data} that jointly
+#' indicate the group an instance belongs to (typically a query in Information
+#' Retrieval applications). For training, only pairs of instances from the same
+#' group and with different target labels can be considered. \code{metric} is
+#' the IR measure to use, one of
+#' \describe{
+#'   \item{\code{conc}}{Fraction of concordant pairs; for binary labels, this
+#'   is equivalent to the Area under the ROC Curve}
+#'   \item{\code{mrr}}{Mean reciprocal rank of the highest-ranked positive
+#'   instance}
+#'   \item{\code{map}}{Mean average precision, a generalization of \code{mrr}
+#'   to multiple positive instances}
+#'   \item{\code{ndcg}}{Normalized discounted cumulative gain. The score is
+#'   the weighted sum (DCG) of the user-supplied target values, weighted
+#'   by log(rank+1), and normalized to the maximum achievable value. This
+#'   is the default if the user did not specify a metric.}
+#' }
+#'
+#' \code{ndcg} and \code{conc} allow arbitrary target values, while binary
+#' targets {0,1} are expected for \code{map} and \code{mrr}. For \code{ndcg}
+#' and \code{mrr}, a cut-off can be chosen using a positive integer parameter
+#' \code{max.rank}. If left unspecified, all ranks are taken into account.
+#'
+#' Note that splitting of instances into training and validation sets follows
+#' group boundaries and therefore only approximates the specified
+#' \code{train.fraction} ratio (the same applies to cross-validation folds).
+#' Internally, queries are randomly shuffled before training, to avoid bias.
+#'
+#' Weights can be used in conjunction with pairwise metrics; however, it is
+#' assumed that they are constant for instances from the same group.
+#'
+#' For details and background on the algorithm, see e.g. Burges (2010).
+#'
+#' @param w A vector of weights of the same length as \code{y}.
+#'
+#' @param var.monotone An optional vector, the same length as the number of
+#' predictors, indicating which variables have a monotone increasing (+1),
+#' decreasing (-1), or arbitrary (0) relationship with the outcome.
+#'
+#' @param n.trees The total number of trees to fit. This is equivalent to the
+#' number of iterations and the number of basis functions in the additive
+#' expansion.
+#'
+#' @param interaction.depth The maximum depth of variable interactions. A value
+#' of 1 implies an additive model, a value of 2 implies a model with up to 2-way
+#' interactions, etc. Default is \code{1}.
+#'
+#' @param n.minobsinnode Integer specifying the minimum number of observations
+#' in the trees' terminal nodes. 
Note that this is the actual number of +#' observations not the total weight. +#' +#' @param shrinkage The shrinkage parameter applied to each tree in the +#' expansion. Also known as the learning rate or step-size reduction; 0.001 to +#' 0.1 usually work, but a smaller learning rate typically requires more trees. +#' Default is \code{0.1}. +#' +#' @param bag.fraction The fraction of the training set observations randomly +#' selected to propose the next tree in the expansion. This introduces +#' randomnesses into the model fit. If \code{bag.fraction} < 1 then running the +#' same model twice will result in similar but different fits. \code{gbm} uses +#' the R random number generator so \code{set.seed} can ensure that the model +#' can be reconstructed. Preferably, the user can save the returned +#' \code{\link{gbm.object}} using \code{\link{save}}. Default is \code{0.5}. +#' +#' @param nTrain An integer representing the number of cases on which to train. +#' This is the preferred way of specification for \code{gbm.fit}; The option +#' \code{train.fraction} in \code{gbm.fit} is deprecated and only maintained +#' for backward compatibility. These two parameters are mutually exclusive. If +#' both are unspecified, all data is used for training. +#' +#' @param train.fraction The first \code{train.fraction * nrows(data)} +#' observations are used to fit the \code{gbm} and the remainder are used for +#' computing out-of-sample estimates of the loss function. +#' +#' @param keep.data Logical indicating whether or not to keep the data and an +#' index of the data stored with the object. Keeping the data and index makes +#' subsequent calls to \code{\link{gbm.more}} faster at the cost of storing an +#' extra copy of the dataset. +#' +#' @param verbose Logical indicating whether or not to print out progress and +#' performance indicators (\code{TRUE}). If this option is left unspecified for +#' \code{gbm.more}, then it uses \code{verbose} from \code{object}. Default is +#' \code{FALSE}. +#' +#' @param var.names Vector of strings of length equal to the number of columns +#' of \code{x} containing the names of the predictor variables. +#' +#' @param response.name Character string label for the response variable. +#' +#' @param group The \code{group} to use when \code{distribution = "pairwise"}. +#' +#' @return A \code{\link{gbm.object}} object. +#' +#' @details +#' This package implements the generalized boosted modeling framework. Boosting +#' is the process of iteratively adding basis functions in a greedy fashion so +#' that each additional basis function further reduces the selected loss +#' function. This implementation closely follows Friedman's Gradient Boosting +#' Machine (Friedman, 2001). +#' +#' In addition to many of the features documented in the Gradient Boosting +#' Machine, \code{gbm} offers additional features including the out-of-bag +#' estimator for the optimal number of iterations, the ability to store and +#' manipulate the resulting \code{gbm} object, and a variety of other loss +#' functions that had not previously had associated boosting algorithms, +#' including the Cox partial likelihood for censored data, the poisson +#' likelihood for count outcomes, and a gradient boosting implementation to +#' minimize the AdaBoost exponential loss function. 
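As a compact reference for the distribution descriptions above, a hedged sketch of the list forms the argument takes; the numeric values are arbitrary examples:

# Illustrative distribution specifications (values are arbitrary examples)
dist_gaussian <- "gaussian"                                  # plain character string
dist_quantile <- list(name = "quantile", alpha = 0.25)       # estimate the 25th percentile
dist_tdist    <- list(name = "tdist", df = 4)                # df defaults to 4 anyway
dist_pairwise <- list(name = "pairwise", group = "query",
                      metric = "ndcg", max.rank = 5)         # metric/max.rank optional
# e.g. gbm(y ~ ., data = df, distribution = dist_quantile, ...)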
+#' +#' @author Greg Ridgeway \email{gregridgeway@@gmail.com} +#' +#' Quantile regression code developed by Brian Kriegler +#' \email{bk@@stat.ucla.edu} +#' +#' t-distribution, and multinomial code developed by Harry Southworth and +#' Daniel Edwards +#' +#' Pairwise code developed by Stefan Schroedl \email{schroedl@@a9.com} +#' +#' @seealso \code{\link{gbm.object}}, \code{\link{gbm.perf}}, +#' \code{\link{plot.gbm}}, \code{\link{predict.gbm}}, \code{\link{summary.gbm}}, +#' and \code{\link{pretty.gbm.tree}}. +#' +#' @references +#' Y. Freund and R.E. Schapire (1997) \dQuote{A decision-theoretic +#' generalization of on-line learning and an application to boosting,} +#' \emph{Journal of Computer and System Sciences,} 55(1):119-139. +#' +#' G. Ridgeway (1999). \dQuote{The state of boosting,} \emph{Computing Science +#' and Statistics} 31:172-181. +#' +#' J.H. Friedman, T. Hastie, R. Tibshirani (2000). \dQuote{Additive Logistic +#' Regression: a Statistical View of Boosting,} \emph{Annals of Statistics} +#' 28(2):337-374. +#' +#' J.H. Friedman (2001). \dQuote{Greedy Function Approximation: A Gradient +#' Boosting Machine,} \emph{Annals of Statistics} 29(5):1189-1232. +#' +#' J.H. Friedman (2002). \dQuote{Stochastic Gradient Boosting,} +#' \emph{Computational Statistics and Data Analysis} 38(4):367-378. +#' +#' B. Kriegler (2007). Cost-Sensitive Stochastic Gradient Boosting Within a +#' Quantitative Regression Framework. Ph.D. Dissertation. University of +#' California at Los Angeles, Los Angeles, CA, USA. Advisor(s) Richard A. Berk. +#' url{https://dl.acm.org/citation.cfm?id=1354603}. +#' +#' C. Burges (2010). \dQuote{From RankNet to LambdaRank to LambdaMART: An +#' Overview,} Microsoft Research Technical Report MSR-TR-2010-82. +#' +#' @export +gbm.fit <- function(x, y, offset = NULL, misc = NULL, distribution = "bernoulli", + w = NULL, var.monotone = NULL, n.trees = 100, + interaction.depth = 1, n.minobsinnode = 10, + shrinkage = 0.001, bag.fraction = 0.5, nTrain = NULL, + train.fraction = NULL, keep.data = TRUE, verbose = TRUE, + var.names = NULL, response.name = "y", group = NULL) { + + # Reformat distribution into a named list + if(is.character(distribution)) { + distribution <- list(name = distribution) + } + + # Dimensions of predictor data + cRows <- nrow(x) + cCols <- ncol(x) + + if(nrow(x) != ifelse(class(y) == "Surv", nrow(y), length(y))) { + stop("The number of rows in x does not equal the length of y.") + } + + # The preferred way to specify the number of training instances is via the + # parameter `nTrain`. The parameter `train.fraction` is only maintained for + # back compatibility. 
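A minimal sketch of calling gbm.fit() directly, as the documentation above recommends for data sets with many predictors; the data and settings are illustrative, and nTrain is used in place of the deprecated train.fraction:

set.seed(42)
X <- data.frame(x1 = runif(500), x2 = runif(500), x3 = runif(500))
y <- X$x1 + 2 * X$x2 + rnorm(500, sd = 0.1)
fit <- gbm.fit(x = X, y = y, distribution = "gaussian",
               n.trees = 100, shrinkage = 0.1, interaction.depth = 2,
               nTrain = 400,        # first 400 rows train, remaining 100 validate
               verbose = FALSE)
gbm.perf(fit, method = "test")      # out-of-sample estimate of the best iteration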
+ if(!is.null(nTrain) && !is.null(train.fraction)) { + stop("Parameters `nTrain` and `train.fraction` cannot both be specified.") + } else if(!is.null(train.fraction)) { + warning("Parameter `train.fraction` is deprecated, please specify ", + "`nTrain` instead.") + nTrain <- floor(train.fraction*cRows) + } else if(is.null(nTrain)) { + nTrain <- cRows # both undefined, use all training data + } + if (is.null(train.fraction)){ + train.fraction <- nTrain / cRows + } + + # Extract var.names if NULL + if(is.null(var.names)) { + var.names <- getVarNames(x) + } + + # Check size of data + if(nTrain * bag.fraction <= 2 * n.minobsinnode + 1) { + stop("The data set is too small or the subsampling rate is too large: ", + "`nTrain * bag.fraction <= n.minobsinnode`") + } + + if (distribution$name != "pairwise") { + w <- w * length(w) / sum(w) # normalize to N + } + + # Sanity checks + ch <- checkMissing(x, y) + interaction.depth <- checkID(interaction.depth) + w <- checkWeights(w, length(y)) + offset <- checkOffset(offset, y) + + Misc <- NA + + # setup variable types + var.type <- rep(0,cCols) + var.levels <- vector("list",cCols) + for(i in 1:length(var.type)) + { + if(all(is.na(x[,i]))) + { + stop("variable ",i,": ",var.names[i]," has only missing values.") + } + if(is.ordered(x[,i])) + { + var.levels[[i]] <- levels(factor(x[,i])) + x[,i] <- as.numeric(factor(x[,i]))-1 + var.type[i] <- 0 + } + else if(is.factor(x[,i])) + { + if(length(levels(x[,i]))>1024) + stop("gbm does not currently handle categorical variables with more than 1024 levels. Variable ",i,": ",var.names[i]," has ",length(levels(x[,i]))," levels.") + var.levels[[i]] <- levels(factor(x[,i])) + x[,i] <- as.numeric(factor(x[,i]))-1 + var.type[i] <- max(x[,i],na.rm=TRUE)+1 + } + else if(is.numeric(x[,i])) + { + var.levels[[i]] <- quantile(x[,i],prob=(0:10)/10,na.rm=TRUE) + } + else + { + stop("variable ",i,": ",var.names[i]," is not of type numeric, ordered, or factor.") + } + + # check for some variation in each variable + if(length(unique(var.levels[[i]])) == 1) + { + warning("variable ",i,": ",var.names[i]," has no variation.") + } + } + + nClass <- 1 + + if(!("name" %in% names(distribution))) { + stop("The distribution is missing a `name` component; for example, ", + "distribution = list(name = \"gaussian\").") + } + supported.distributions <- getAvailableDistributions() + distribution.call.name <- distribution$name + + # Check for potential problems with the distribution + if(!is.element(distribution$name,supported.distributions)) { + stop("Distribution ",distribution$name," is not supported") + } + if((distribution$name == "bernoulli") && !all(is.element(y,0:1))) { + stop("Bernoulli requires the response to be in {0,1}") + if (is.factor(y)) { + y <- as.integer(y) - 1 + } + } + if((distribution$name == "huberized") && !all(is.element(y,0:1))) { + stop("Huberized square hinged loss requires the response to be in {0,1}") + if (is.factor(y)) { + y <- as.integer(y) - 1 + } + } + if((distribution$name == "poisson") && any(y<0)) { + stop("Poisson requires the response to be positive") + } + if((distribution$name == "poisson") && any(y != trunc(y))) { + stop("Poisson requires the response to be a positive integer") + } + if((distribution$name == "adaboost") && !all(is.element(y,0:1))) { + stop("This version of AdaBoost requires the response to be in {0,1}") + if (is.factor(y)) { + y <- as.integer(y) - 1 + } + } + if(distribution$name == "quantile") { + if(length(unique(w)) > 1) { + stop("This version of gbm for the quantile regression lacks a 
weighted quantile. For now the weights must be constant.") + } + if(is.null(distribution$alpha)) { + stop("For quantile regression, the distribution parameter must be a list with a parameter 'alpha' indicating the quantile, for example list(name=\"quantile\",alpha=0.95).") + } else { + if((distribution$alpha < 0) || (distribution$alpha > 1)) { + stop("alpha must be between 0 and 1.") } - if(is.ordered(x[,i])) - { - var.levels[[i]] <- levels(x[,i]) - x[,i] <- as.numeric(x[,i])-1 - var.type[i] <- 0 + } + Misc <- c(alpha=distribution$alpha) + } + if(distribution$name == "coxph") { + if(class(y)!="Surv") { + stop("Outcome must be a survival object Surv(time,failure)") + } + if(attr(y,"type")!="right") { + stop("gbm() currently only handles right censored observations") + } + Misc <- y[,2] + y <- y[,1] + + # reverse sort the failure times to compute risk sets on the fly + i.train <- order(-y[1:nTrain]) + n.test <- cRows - nTrain + if(n.test > 0) { + i.test <- order(-y[(nTrain+1):cRows]) + nTrain + } + else { + i.test <- NULL + } + i.timeorder <- c(i.train,i.test) + + y <- y[i.timeorder] + Misc <- Misc[i.timeorder] + x <- x[i.timeorder,,drop=FALSE] + w <- w[i.timeorder] + if(!is.na(offset)) offset <- offset[i.timeorder] + } + if(distribution$name == "tdist") { + if (is.null(distribution$df) || !is.numeric(distribution$df)){ + Misc <- 4 + } + else { + Misc <- distribution$df[1] + } + } + if (distribution$name == "multinomial") { + ## Ensure that the training set contains all classes + classes <- attr(factor(y), "levels") + nClass <- length(classes) + + if (nClass > nTrain) { + stop(paste("Number of classes (", nClass, ") must be less than the", + " size of the training set (", nTrain, ").", sep = "")) + } + + new.idx <- as.vector(sapply(classes, function(a,x){ min((1:length(x))[x==a]) }, y)) + + all.idx <- 1:length(y) + new.idx <- c(new.idx, all.idx[!(all.idx %in% new.idx)]) + + y <- y[new.idx] + x <- x[new.idx, ] + w <- w[new.idx] + if (!is.null(offset)) { + offset <- offset[new.idx] + } + + ## Get the factors + y <- as.numeric(as.vector(outer(y, classes, "=="))) + + ## Fill out the weight and offset + w <- rep(w, nClass) + if (!is.null(offset)) { + offset <- rep(offset, nClass) + } + } # close if (dist... == "multinomial" + + if(distribution$name == "pairwise") { + distribution.metric <- distribution[["metric"]] + if (!is.null(distribution.metric)) { + distribution.metric <- tolower(distribution.metric) + supported.metrics <- c("conc", "ndcg", "map", "mrr") + if (!is.element(distribution.metric, supported.metrics)) { + stop("Metric '", distribution.metric, "' is not supported, use either 'conc', 'ndcg', 'map', or 'mrr'") } - else if(is.factor(x[,i])) - { - if(length(levels(x[,i]))>1024) - stop("gbm does not currently handle categorical variables with more than 1024 levels. 
Variable ",i,": ",var.names[i]," has ",length(levels(x[,i]))," levels.") - var.levels[[i]] <- levels(x[,i]) - x[,i] <- as.numeric(x[,i])-1 - var.type[i] <- max(x[,i],na.rm=TRUE)+1 - } - else if(is.numeric(x[,i])) - { - var.levels[[i]] <- quantile(x[,i],prob=(0:10)/10,na.rm=TRUE) - } - else - { - stop("variable ",i,": ",var.names[i]," is not of type numeric, ordered, or factor.") - } - - # check for some variation in each variable - if(length(unique(var.levels[[i]])) == 1) - { - warning("variable ",i,": ",var.names[i]," has no variation.") - } - } - - nClass <- 1 - - if(!("name" %in% names(distribution))) { - stop("The distribution is missing a 'name' component, for example list(name=\"gaussian\")") - } - supported.distributions <- - c("bernoulli","gaussian","poisson","adaboost","laplace","coxph","quantile", - "tdist", "multinomial", "huberized", "pairwise") - - distribution.call.name <- distribution$name - - # check potential problems with the distributions - if(!is.element(distribution$name,supported.distributions)) - { - stop("Distribution ",distribution$name," is not supported") - } - if((distribution$name == "bernoulli") && !all(is.element(y,0:1))) - { - stop("Bernoulli requires the response to be in {0,1}") - } - if((distribution$name == "huberized") && !all(is.element(y,0:1))) - { - stop("Huberized square hinged loss requires the response to be in {0,1}") - } - if((distribution$name == "poisson") && any(y<0)) - { - stop("Poisson requires the response to be positive") - } - if((distribution$name == "poisson") && any(y != trunc(y))) - { - stop("Poisson requires the response to be a positive integer") - } - if((distribution$name == "adaboost") && !all(is.element(y,0:1))) - { - stop("This version of AdaBoost requires the response to be in {0,1}") - } - if(distribution$name == "quantile") - { - if(length(unique(w)) > 1) - { - stop("This version of gbm for the quantile regression lacks a weighted quantile. 
For now the weights must be constant.") - } - if(is.null(distribution$alpha)) - { - stop("For quantile regression, the distribution parameter must be a list with a parameter 'alpha' indicating the quantile, for example list(name=\"quantile\",alpha=0.95).") - } else - if((distribution$alpha<0) || (distribution$alpha>1)) - { - stop("alpha must be between 0 and 1.") - } - Misc <- c(alpha=distribution$alpha) - } - if(distribution$name == "coxph") - { - if(class(y)!="Surv") - { - stop("Outcome must be a survival object Surv(time,failure)") - } - if(attr(y,"type")!="right") - { - stop("gbm() currently only handles right censored observations") - } - Misc <- y[,2] - y <- y[,1] - - # reverse sort the failure times to compute risk sets on the fly - i.train <- order(-y[1:nTrain]) - n.test <- cRows - nTrain - if(n.test > 0) - { - i.test <- order(-y[(nTrain+1):cRows]) + nTrain - } - else - { - i.test <- NULL - } - i.timeorder <- c(i.train,i.test) - - y <- y[i.timeorder] - Misc <- Misc[i.timeorder] - x <- x[i.timeorder,,drop=FALSE] - w <- w[i.timeorder] - if(!is.na(offset)) offset <- offset[i.timeorder] - } - if(distribution$name == "tdist") - { - if (is.null(distribution$df) || !is.numeric(distribution$df)){ - Misc <- 4 + metric <- distribution.metric + } else { + warning("No metric specified, using 'ndcg'") + metric <- "ndcg" # default + distribution[["metric"]] <- metric + } + + if (any(y<0)) { + stop("targets for 'pairwise' should be non-negative") + } + + if (is.element(metric, c("mrr", "map")) && (!all(is.element(y, 0:1)))) { + stop("Metrics 'map' and 'mrr' require the response to be in {0,1}") + } + + # Cut-off rank for metrics + # Default of 0 means no cutoff + + max.rank <- 0 + if (!is.null(distribution[["max.rank"]]) && distribution[["max.rank"]] > 0) { + if (is.element(metric, c("ndcg", "mrr"))) { + max.rank <- distribution[["max.rank"]] } else { - Misc <- distribution$df[1] + stop("Parameter 'max.rank' cannot be specified for metric '", distribution.metric, "', only supported for 'ndcg' and 'mrr'") } - } - if (distribution$name == "multinomial") - { - ## Ensure that the training set contains all classes - classes <- attr(factor(y), "levels") - nClass <- length(classes) + } + + # We pass the cut-off rank to the C function as the last element in the Misc vector + Misc <- c(group, max.rank) + + distribution.call.name <- sprintf("pairwise_%s", metric) + } # close if (dist... == "pairwise" + + # create index upfront... 
subtract one for 0 based order + x.order <- apply(x[1:nTrain,,drop=FALSE],2,order,na.last=FALSE)-1 + + x <- as.vector(data.matrix(x)) + predF <- rep(0,length(y)) + train.error <- rep(0,n.trees) + valid.error <- rep(0,n.trees) + oobag.improve <- rep(0,n.trees) + + if(is.null(var.monotone)) { + var.monotone <- rep(0,cCols) + } else if(length(var.monotone)!=cCols) { + stop("Length of var.monotone != number of predictors") + } else if(!all(is.element(var.monotone,-1:1))) { + stop("var.monotone must be -1, 0, or 1") + } + fError <- FALSE + + gbm.obj <- .Call("gbm_fit", + Y=as.double(y), + Offset=as.double(offset), + X=as.double(x), + X.order=as.integer(x.order), + weights=as.double(w), + Misc=as.double(Misc), + cRows=as.integer(cRows), + cCols=as.integer(cCols), + var.type=as.integer(var.type), + var.monotone=as.integer(var.monotone), + distribution=as.character(distribution.call.name), + n.trees=as.integer(n.trees), + interaction.depth=as.integer(interaction.depth), + n.minobsinnode=as.integer(n.minobsinnode), + n.classes = as.integer(nClass), + shrinkage=as.double(shrinkage), + bag.fraction=as.double(bag.fraction), + nTrain=as.integer(nTrain), + fit.old=as.double(NA), + n.cat.splits.old=as.integer(0), + n.trees.old=as.integer(0), + verbose=as.integer(verbose), + PACKAGE = "gbm") + + names(gbm.obj) <- c("initF","fit","train.error","valid.error", + "oobag.improve","trees","c.splits") + + gbm.obj$bag.fraction <- bag.fraction + gbm.obj$distribution <- distribution + gbm.obj$interaction.depth <- interaction.depth + gbm.obj$n.minobsinnode <- n.minobsinnode + gbm.obj$num.classes <- nClass + gbm.obj$n.trees <- length(gbm.obj$trees) / nClass + gbm.obj$nTrain <- nTrain + gbm.obj$train.fraction <- train.fraction + gbm.obj$response.name <- response.name + gbm.obj$shrinkage <- shrinkage + gbm.obj$var.levels <- var.levels + gbm.obj$var.monotone <- var.monotone + gbm.obj$var.names <- var.names + gbm.obj$var.type <- var.type + gbm.obj$verbose <- verbose + gbm.obj$Terms <- NULL + + if(distribution$name == "coxph") { + gbm.obj$fit[i.timeorder] <- gbm.obj$fit + } + ## If K-Classification is used then split the fit and tree components + if (distribution$name == "multinomial") { + gbm.obj$fit <- matrix(gbm.obj$fit, ncol = nClass) + dimnames(gbm.obj$fit)[[2]] <- classes + gbm.obj$classes <- classes + + ## Also get the class estimators + exp.f <- exp(gbm.obj$fit) + denom <- matrix(rep(rowSums(exp.f), nClass), ncol = nClass) + gbm.obj$estimator <- exp.f/denom + } + + if(keep.data) { + if(distribution$name == "coxph") { + # Put the observations back in order + gbm.obj$data <- list( + y = y, + x = x, + x.order = x.order, + offset = offset, + Misc = Misc, + w = w, + i.timeorder = i.timeorder + ) + } + else if ( distribution$name == "multinomial" ) { + # Restore original order of the data + new.idx <- order(new.idx) + gbm.obj$data <- list( + y = as.vector(matrix(y, ncol = length(classes), byrow = FALSE)[new.idx, ]), + x = as.vector(matrix(x, ncol = length(var.names), byrow = FALSE)[new.idx, ]), + x.order = x.order, + offset = offset[new.idx], + Misc = Misc, + w = w[new.idx] + ) + } else { + gbm.obj$data <- list( + y = y, + x = x, + x.order = x.order, + offset = offset, + Misc = Misc, + w = w + ) + } + } + else { + gbm.obj$data <- NULL + } + + # Reuturn object of class "gbm" + class(gbm.obj) <- "gbm" + gbm.obj + +} - if (nClass > nTrain){ - stop(paste("Number of classes (", nClass, - ") must be less than the size of the training set (", nTrain, ")", - sep = "")) - } - - # f <- function(a,x){ - # min((1:length(x))[x==a]) 
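A small sketch, on toy data, of the marshalling performed above before the .Call: nominal factors are recoded to 0-based integers, a 0-based column-wise presort index of the training rows is built, the design matrix is flattened column-major, and a multinomial response is expanded to stacked class indicators. All names and values here are illustrative.

x_toy <- data.frame(size = factor(c("s", "m", "l")), age = c(42, 21, 35))
as.numeric(factor(x_toy$size)) - 1                    # 2 1 0 (levels l, m, s)
x_num <- data.frame(size = as.numeric(factor(x_toy$size)) - 1, age = x_toy$age)
apply(x_num, 2, order, na.last = FALSE) - 1           # presort index, NAs first
as.vector(data.matrix(x_num))                         # flattened column-major
# For distribution = "multinomial" the response is expanded as well:
y_toy <- c("a", "b", "c", "a")
as.numeric(as.vector(outer(y_toy, levels(factor(y_toy)), "==")))  # stacked 0/1 indicators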
- # } - - new.idx <- as.vector(sapply(classes, function(a,x){ min((1:length(x))[x==a]) }, y)) - - all.idx <- 1:length(y) - new.idx <- c(new.idx, all.idx[!(all.idx %in% new.idx)]) - - y <- y[new.idx] - x <- x[new.idx, ] - w <- w[new.idx] - if (!is.null(offset)){ - offset <- offset[new.idx] - } - - ## Get the factors - y <- as.numeric(as.vector(outer(y, classes, "=="))) - - ## Fill out the weight and offset - w <- rep(w, nClass) - if (!is.null(offset)){ - offset <- rep(offset, nClass) - } - } # close if (dist... == "multinomial" - - if(distribution$name == "pairwise") - { - distribution.metric <- distribution[["metric"]] - if (!is.null(distribution.metric)) - { - distribution.metric <- tolower(distribution.metric) - supported.metrics <- c("conc", "ndcg", "map", "mrr") - if (!is.element(distribution.metric, supported.metrics)) - { - stop("Metric '", distribution.metric, "' is not supported, use either 'conc', 'ndcg', 'map', or 'mrr'") - } - metric <- distribution.metric - } - else - { - warning("No metric specified, using 'ndcg'") - metric <- "ndcg" # default - distribution[["metric"]] <- metric - } - - if (any(y<0)) - { - stop("targets for 'pairwise' should be non-negative") - } - - if (is.element(metric, c("mrr", "map")) && (!all(is.element(y, 0:1)))) - { - stop("Metrics 'map' and 'mrr' require the response to be in {0,1}") - } - - # Cut-off rank for metrics - # Default of 0 means no cutoff - - max.rank <- 0 - if (!is.null(distribution[["max.rank"]]) && distribution[["max.rank"]] > 0) - { - if (is.element(metric, c("ndcg", "mrr"))) - { - max.rank <- distribution[["max.rank"]] - } - else - { - stop("Parameter 'max.rank' cannot be specified for metric '", distribution.metric, "', only supported for 'ndcg' and 'mrr'") - } - } - - # We pass the cut-off rank to the C function as the last element in the Misc vector - Misc <- c(group, max.rank) - - distribution.call.name <- sprintf("pairwise_%s", metric) - } # close if (dist... == "pairwise" - - # create index upfront... 
subtract one for 0 based order - x.order <- apply(x[1:nTrain,,drop=FALSE],2,order,na.last=FALSE)-1 - - x <- as.vector(data.matrix(x)) - predF <- rep(0,length(y)) - train.error <- rep(0,n.trees) - valid.error <- rep(0,n.trees) - oobag.improve <- rep(0,n.trees) - - if(is.null(var.monotone)) var.monotone <- rep(0,cCols) - else if(length(var.monotone)!=cCols) - { - stop("Length of var.monotone != number of predictors") - } - else if(!all(is.element(var.monotone,-1:1))) - { - stop("var.monotone must be -1, 0, or 1") - } - fError <- FALSE - - gbm.obj <- .Call("gbm", - Y=as.double(y), - Offset=as.double(offset), - X=as.double(x), - X.order=as.integer(x.order), - weights=as.double(w), - Misc=as.double(Misc), - cRows=as.integer(cRows), - cCols=as.integer(cCols), - var.type=as.integer(var.type), - var.monotone=as.integer(var.monotone), - distribution=as.character(distribution.call.name), - n.trees=as.integer(n.trees), - interaction.depth=as.integer(interaction.depth), - n.minobsinnode=as.integer(n.minobsinnode), - n.classes = as.integer(nClass), - shrinkage=as.double(shrinkage), - bag.fraction=as.double(bag.fraction), - nTrain=as.integer(nTrain), - fit.old=as.double(NA), - n.cat.splits.old=as.integer(0), - n.trees.old=as.integer(0), - verbose=as.integer(verbose), - PACKAGE = "gbm") - - names(gbm.obj) <- c("initF","fit","train.error","valid.error", - "oobag.improve","trees","c.splits") - - gbm.obj$bag.fraction <- bag.fraction - gbm.obj$distribution <- distribution - gbm.obj$interaction.depth <- interaction.depth - gbm.obj$n.minobsinnode <- n.minobsinnode - gbm.obj$num.classes <- nClass - gbm.obj$n.trees <- length(gbm.obj$trees) / nClass - gbm.obj$nTrain <- nTrain - gbm.obj$train.fraction <- train.fraction - gbm.obj$response.name <- response.name - gbm.obj$shrinkage <- shrinkage - gbm.obj$var.levels <- var.levels - gbm.obj$var.monotone <- var.monotone - gbm.obj$var.names <- var.names - gbm.obj$var.type <- var.type - gbm.obj$verbose <- verbose - gbm.obj$Terms <- NULL - - if(distribution$name == "coxph") - { - gbm.obj$fit[i.timeorder] <- gbm.obj$fit - } - ## If K-Classification is used then split the fit and tree components - if (distribution$name == "multinomial"){ - gbm.obj$fit <- matrix(gbm.obj$fit, ncol = nClass) - dimnames(gbm.obj$fit)[[2]] <- classes - gbm.obj$classes <- classes - - ## Also get the class estimators - exp.f <- exp(gbm.obj$fit) - denom <- matrix(rep(rowSums(exp.f), nClass), ncol = nClass) - gbm.obj$estimator <- exp.f/denom - } - - if(keep.data) - { - if(distribution$name == "coxph") - { - # put the observations back in order - gbm.obj$data <- list(y=y,x=x,x.order=x.order,offset=offset,Misc=Misc,w=w, - i.timeorder=i.timeorder) - } - else if ( distribution$name == "multinomial" ){ - # Restore original order of the data - new.idx <- order( new.idx ) - gbm.obj$data <- list( y=as.vector(matrix(y, ncol=length(classes),byrow=FALSE)[new.idx,]), - x=as.vector(matrix(x, ncol=length(var.names), byrow=FALSE)[new.idx,]), - x.order=x.order, - offset=offset[new.idx], - Misc=Misc, w=w[new.idx] ) - } - else - { - gbm.obj$data <- list(y=y,x=x,x.order=x.order,offset=offset,Misc=Misc,w=w) - } - } - else - { - gbm.obj$data <- NULL - } - - class(gbm.obj) <- "gbm" - return(gbm.obj) -} diff --git a/R/gbm.loss.R b/R/gbm.loss.R deleted file mode 100644 index 7e6e2b5..0000000 --- a/R/gbm.loss.R +++ /dev/null @@ -1,35 +0,0 @@ -gbm.loss <- function(y, f, w, offset, dist, baseline, group=NULL, max.rank=NULL) -{ - if (!is.na(offset)) - { - f <- offset+f - } - - if (dist$name != "pairwise") - { - switch(dist$name, 
- gaussian = weighted.mean((y - f)^2,w) - baseline, - bernoulli = -2*weighted.mean(y*f - log(1+exp(f)),w) - baseline, - laplace = weighted.mean(abs(y-f),w) - baseline, - adaboost = weighted.mean(exp(-(2*y-1)*f),w) - baseline, - poisson = -2*weighted.mean(y*f-exp(f),w) - baseline, - stop(paste("Distribution",dist$name,"is not yet supported for method=permutation.test.gbm"))) - } - else # dist$name == "pairwise" - { - if (is.null(dist$metric)) - { - stop("No metric specified for distribution 'pairwise'") - } - if (!is.element(dist$metric, c("conc", "ndcg", "map", "mrr"))) - { - stop("Invalid metric '", dist$metric, "' specified for distribution 'pairwise'") - } - if (is.null(group)) - { - stop("For distribution 'pairwise', parameter 'group' has to be supplied") - } - # Loss = 1 - utility - (1 - perf.pairwise(y, f, group, dist$metric, w, max.rank)) - baseline - } -} diff --git a/R/gbm.more.R b/R/gbm.more.R index d21cc24..5c82dd0 100644 --- a/R/gbm.more.R +++ b/R/gbm.more.R @@ -1,10 +1,136 @@ +#' Generalized Boosted Regression Modeling (GBM) +#' +#' Adds additional trees to a \code{\link{gbm.object}} object. +#' +#' @param object A \code{\link{gbm.object}} object created from an initial call +#' to \code{\link{gbm}}. +#' +#' @param n.new.trees Integer specifying the number of additional trees to add +#' to \code{object}. Default is 100. +#' +#' @param data An optional data frame containing the variables in the model. By +#' default the variables are taken from \code{environment(formula)}, typically +#' the environment from which \code{gbm} is called. If \code{keep.data=TRUE} in +#' the initial call to \code{gbm} then \code{gbm} stores a copy with the +#' object. If \code{keep.data=FALSE} then subsequent calls to +#' \code{\link{gbm.more}} must resupply the same dataset. It becomes the user's +#' responsibility to resupply the same data at this point. +#' +#' @param weights An optional vector of weights to be used in the fitting +#' process. Must be positive but do not need to be normalized. If +#' \code{keep.data=FALSE} in the initial call to \code{gbm} then it is the +#' user's responsibility to resupply the weights to \code{\link{gbm.more}}. +#' +#' @param offset A vector of offset values. +#' +#' @param verbose Logical indicating whether or not to print out progress and +#' performance indicators (\code{TRUE}). If this option is left unspecified for +#' \code{gbm.more}, then it uses \code{verbose} from \code{object}. Default is +#' \code{FALSE}. +#' +#' @return A \code{\link{gbm.object}} object. 
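A minimal sketch of the keep.data = FALSE workflow described above, in which the identical data must be handed back to gbm.more(); the data frame and settings are illustrative assumptions:

set.seed(123)
df <- data.frame(y = rnorm(200), x1 = runif(200), x2 = runif(200))
fit1 <- gbm(y ~ x1 + x2, data = df, distribution = "gaussian",
            n.trees = 50, keep.data = FALSE)
# No copy of the data was stored with the object, so the same data frame
# (and any weights/offset) has to be resupplied here
fit2 <- gbm.more(fit1, n.new.trees = 50, data = df)
fit2$n.trees  # 100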
+#' +#' @export +#' +#' @examples +#' # +#' # A least squares regression example +#' # +#' +#' # Simulate data +#' set.seed(101) # for reproducibility +#' N <- 1000 +#' X1 <- runif(N) +#' X2 <- 2 * runif(N) +#' X3 <- ordered(sample(letters[1:4], N, replace = TRUE), levels = letters[4:1]) +#' X4 <- factor(sample(letters[1:6], N, replace = TRUE)) +#' X5 <- factor(sample(letters[1:3], N, replace = TRUE)) +#' X6 <- 3 * runif(N) +#' mu <- c(-1, 0, 1, 2)[as.numeric(X3)] +#' SNR <- 10 # signal-to-noise ratio +#' Y <- X1 ^ 1.5 + 2 * (X2 ^ 0.5) + mu +#' sigma <- sqrt(var(Y) / SNR) +#' Y <- Y + rnorm(N, 0, sigma) +#' X1[sample(1:N,size=500)] <- NA # introduce some missing values +#' X4[sample(1:N,size=300)] <- NA # introduce some missing values +#' data <- data.frame(Y, X1, X2, X3, X4, X5, X6) +#' +#' # Fit a GBM +#' set.seed(102) # for reproducibility +#' gbm1 <- gbm(Y ~ ., data = data, var.monotone = c(0, 0, 0, 0, 0, 0), +#' distribution = "gaussian", n.trees = 100, shrinkage = 0.1, +#' interaction.depth = 3, bag.fraction = 0.5, train.fraction = 0.5, +#' n.minobsinnode = 10, cv.folds = 5, keep.data = TRUE, +#' verbose = FALSE, n.cores = 1) +#' +#' # Check performance using the out-of-bag (OOB) error; the OOB error typically +#' # underestimates the optimal number of iterations +#' best.iter <- gbm.perf(gbm1, method = "OOB") +#' print(best.iter) +#' +#' # Check performance using the 50% heldout test set +#' best.iter <- gbm.perf(gbm1, method = "test") +#' print(best.iter) +#' +#' # Check performance using 5-fold cross-validation +#' best.iter <- gbm.perf(gbm1, method = "cv") +#' print(best.iter) +#' +#' # Plot relative influence of each variable +#' par(mfrow = c(1, 2)) +#' summary(gbm1, n.trees = 1) # using first tree +#' summary(gbm1, n.trees = best.iter) # using estimated best number of trees +#' +#' # Compactly print the first and last trees for curiosity +#' print(pretty.gbm.tree(gbm1, i.tree = 1)) +#' print(pretty.gbm.tree(gbm1, i.tree = gbm1$n.trees)) +#' +#' # Simulate new data +#' set.seed(103) # for reproducibility +#' N <- 1000 +#' X1 <- runif(N) +#' X2 <- 2 * runif(N) +#' X3 <- ordered(sample(letters[1:4], N, replace = TRUE)) +#' X4 <- factor(sample(letters[1:6], N, replace = TRUE)) +#' X5 <- factor(sample(letters[1:3], N, replace = TRUE)) +#' X6 <- 3 * runif(N) +#' mu <- c(-1, 0, 1, 2)[as.numeric(X3)] +#' Y <- X1 ^ 1.5 + 2 * (X2 ^ 0.5) + mu + rnorm(N, 0, sigma) +#' data2 <- data.frame(Y, X1, X2, X3, X4, X5, X6) +#' +#' # Predict on the new data using the "best" number of trees; by default, +#' # predictions will be on the link scale +#' Yhat <- predict(gbm1, newdata = data2, n.trees = best.iter, type = "link") +#' +#' # least squares error +#' print(sum((data2$Y - Yhat)^2)) +#' +#' # Construct univariate partial dependence plots +#' p1 <- plot(gbm1, i.var = 1, n.trees = best.iter) +#' p2 <- plot(gbm1, i.var = 2, n.trees = best.iter) +#' p3 <- plot(gbm1, i.var = "X3", n.trees = best.iter) # can use index or name +#' grid.arrange(p1, p2, p3, ncol = 3) +#' +#' # Construct bivariate partial dependence plots +#' plot(gbm1, i.var = 1:2, n.trees = best.iter) +#' plot(gbm1, i.var = c("X2", "X3"), n.trees = best.iter) +#' plot(gbm1, i.var = 3:4, n.trees = best.iter) +#' +#' # Construct trivariate partial dependence plots +#' plot(gbm1, i.var = c(1, 2, 6), n.trees = best.iter, +#' continuous.resolution = 20) +#' plot(gbm1, i.var = 1:3, n.trees = best.iter) +#' plot(gbm1, i.var = 2:4, n.trees = best.iter) +#' plot(gbm1, i.var = 3:5, n.trees = best.iter) +#' +#' # Add more (i.e., 100) boosting 
iterations to the ensemble +#' gbm2 <- gbm.more(gbm1, n.new.trees = 100, verbose = FALSE) gbm.more <- function(object, n.new.trees = 100, data = NULL, weights = NULL, offset = NULL, - verbose = NULL) -{ + verbose = NULL) { theCall <- match.call() nTrain <- object$nTrain @@ -181,7 +307,7 @@ } x <- as.vector(x) - gbm.obj <- .Call("gbm", + gbm.obj <- .Call("gbm_fit", Y = as.double(y), Offset = as.double(offset), X = as.double(x), diff --git a/R/gbm.object.R b/R/gbm.object.R new file mode 100644 index 0000000..1778a73 --- /dev/null +++ b/R/gbm.object.R @@ -0,0 +1,39 @@ +#' Generalized Boosted Regression Model Object +#' +#' These are objects representing fitted \code{gbm}s. +#' +#' @return \item{initF}{the "intercept" term, the initial predicted value to +#' which trees make adjustments} \item{fit}{a vector containing the fitted +#' values on the scale of regression function (e.g. log-odds scale for +#' bernoulli, log scale for poisson)} \item{train.error}{a vector of length +#' equal to the number of fitted trees containing the value of the loss +#' function for each boosting iteration evaluated on the training data} +#' \item{valid.error}{a vector of length equal to the number of fitted trees +#' containing the value of the loss function for each boosting iteration +#' evaluated on the validation data} \item{cv.error}{if \code{cv.folds}<2 this +#' component is NULL. Otherwise, this component is a vector of length equal to +#' the number of fitted trees containing a cross-validated estimate of the loss +#' function for each boosting iteration} \item{oobag.improve}{a vector of +#' length equal to the number of fitted trees containing an out-of-bag estimate +#' of the marginal reduction in the expected value of the loss function. The +#' out-of-bag estimate uses only the training data and is useful for estimating +#' the optimal number of boosting iterations. See \code{\link{gbm.perf}}} +#' \item{trees}{a list containing the tree structures. The components are best +#' viewed using \code{\link{pretty.gbm.tree}}} \item{c.splits}{a list of all +#' the categorical splits in the collection of trees. If the \code{trees[[i]]} +#' component of a \code{gbm} object describes a categorical split then the +#' splitting value will refer to a component of \code{c.splits}. That component +#' of \code{c.splits} will be a vector of length equal to the number of levels +#' in the categorical split variable. -1 indicates left, +1 indicates right, +#' and 0 indicates that the level was not present in the training data} +#' \item{cv.fitted}{If cross-validation was performed, the cross-validation +#' predicted values on the scale of the linear predictor. That is, the fitted +#' values from the ith CV-fold, for the model having been trained on the data +#' in all other folds.} +#' @section Structure: The following components must be included in a +#' legitimate \code{gbm} object. 
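To connect the component list above to practice, a short sketch of inspecting a fitted object; it assumes a model such as gbm1 from the example earlier in this file:

gbm1$initF                          # the "intercept": initial predicted value
length(gbm1$trees)                  # one tree per iteration (times the number of classes)
head(gbm1$train.error)              # training loss by iteration
head(gbm1$valid.error)              # validation loss by iteration (train.fraction < 1)
head(gbm1$cv.error)                 # cross-validated loss by iteration (cv.folds > 1)
pretty.gbm.tree(gbm1, i.tree = 1)   # readable view of a single tree; categorical
                                    # splits index into gbm1$c.splits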
+#' @author Greg Ridgeway \email{gregridgeway@@gmail.com} +#' @seealso \code{\link{gbm}} +#' @keywords methods +#' @name gbm.object +NULL diff --git a/R/gbm.perf.R b/R/gbm.perf.R index a1b117e..c377f8c 100644 --- a/R/gbm.perf.R +++ b/R/gbm.perf.R @@ -1,187 +1,95 @@ -gbm.perf <- function(object, - plot.it=TRUE, - oobag.curve=FALSE, - overlay=TRUE, - method) -{ - smoother <- NULL - - if ( missing( method ) ){ - if ( object$train.fraction < 1 ){ - method <- "test" - } - else if ( !is.null( object$cv.error ) ){ - method <- "cv" - } - else { method <- "OOB" } - cat( paste( "Using", method, "method...\n" ) ) - } - - if((method == "OOB") || oobag.curve) - { - if(object$bag.fraction==1) - stop("Cannot compute OOB estimate or the OOB curve when bag.fraction=1") - if(all(!is.finite(object$oobag.improve))) - stop("Cannot compute OOB estimate or the OOB curve. No finite OOB estimates of improvement") - x <- 1:object$n.trees - smoother <- loess(object$oobag.improve~x, - enp.target=min(max(4,length(x)/10),50)) - smoother$y <- smoother$fitted - smoother$x <- x - - best.iter.oob <- x[which.min(-cumsum(smoother$y))] - best.iter <- best.iter.oob - } - - if(method == "OOB") - { - warning("OOB generally underestimates the optimal number of iterations although predictive performance is reasonably competitive. Using cv.folds>0 when calling gbm usually results in improved predictive performance.") - } - - if(method == "test") - { - best.iter.test <- which.min(object$valid.error) - best.iter <- best.iter.test - } - - if(method == "cv") - { - if(is.null(object$cv.error)) - stop("In order to use method=\"cv\" gbm must be called with cv.folds>1.") - if(length(object$cv.error) < object$n.trees) - warning("cross-validation error is not computed for any additional iterations run using gbm.more().") - best.iter.cv <- which.min(object$cv.error) - best.iter <- best.iter.cv - } - - if(!is.element(method,c("OOB","test","cv"))) - stop("method must be cv, test, or OOB") - - if(plot.it) - { - par(mar=c(5,4,4,4)+.1) - if (object$distribution$name !="pairwise") - { - ylab <- switch(substring(object$distribution$name,1,2), - ga="Squared error loss", - be="Bernoulli deviance", - po="Poisson deviance", - ad="AdaBoost exponential bound", - co="Cox partial deviance", - la="Absolute loss", - qu="Quantile loss", - mu="Multinomial deviance", - td="t-distribution deviance" - ) - } - else # object$distribution$name =="pairwise" - { - ylab <- switch(object$distribution$metric, - conc ="Fraction of concordant pairs", - ndcg="Normalized discounted cumulative gain", - map ="Mean average precision", - mrr ="Mean reciprocal rank" - ) - } - - if(object$train.fraction==1) - { # HS Next line changed to scale axis to include other error - # ylim <- range(object$train.error) - if ( method=="cv" ){ ylim <- range(object$train.error, object$cv.error) } - else if ( method == "test" ){ ylim <- range( object$train.error, object$valid.error) } - else { ylim <- range(object$train.error) } - } - else - { - ylim <- range(object$train.error,object$valid.error) - } - - plot(object$train.error, - ylim=ylim, +#' GBM performance +#' +#' Estimates the optimal number of boosting iterations for a \code{gbm} object +#' and optionally plots various performance measures +#' +#' @param object A \code{\link{gbm.object}} created from an initial call to +#' \code{\link{gbm}}. +#' +#' @param plot.it An indicator of whether or not to plot the performance +#' measures. Setting \code{plot.it = TRUE} creates two plots. 
The first plot +#' plots \code{object$train.error} (in black) and \code{object$valid.error} +#' (in red) versus the iteration number. The scale of the error measurement, +#' shown on the left vertical axis, depends on the \code{distribution} +#' argument used in the initial call to \code{\link{gbm}}. +#' +#' @param oobag.curve Indicates whether to plot the out-of-bag performance +#' measures in a second plot. +#' +#' @param overlay If TRUE and oobag.curve=TRUE then a right y-axis is added to +#' the training and test error plot and the estimated cumulative improvement +#' in the loss function is plotted versus the iteration number. +#' +#' @param method Indicate the method used to estimate the optimal number of +#' boosting iterations. \code{method = "OOB"} computes the out-of-bag estimate +#' and \code{method = "test"} uses the test (or validation) dataset to compute +#' an out-of-sample estimate. \code{method = "cv"} extracts the optimal number +#' of iterations using cross-validation if \code{gbm} was called with +#' \code{cv.folds} > 1. +#' +#' @return \code{gbm.perf} Returns the estimated optimal number of iterations. +#' The method of computation depends on the \code{method} argument. +#' +#' @author Greg Ridgeway \email{gregridgeway@@gmail.com} +#' +#' @seealso \code{\link{gbm}}, \code{\link{gbm.object}} +#' +#' @keywords nonlinear survival nonparametric tree +#' +#' @export +gbm.perf <- function(object, plot.it = TRUE, oobag.curve = FALSE, + overlay = TRUE, method) { + + # Determine method, if missing + if (missing(method)) { + method <- guess_error_method(object) + } + + # Determine "optimal" number of iterations + best.iter <- best_iter(object, method = method) + + # Determine an appropriate y-axis label + ylab <- get_ylab(object) + + # Determine an appropriate range for the y-axis + ylim <- get_ylim(object, method = method) + + # Plot results + plot(object$train.error, ylim = ylim, type = "l", xlab = "Iteration", + ylab = ylab) + + if(object$train.fraction!=1) { + lines(object$valid.error,col="red") + } + if(method=="cv") { + lines(object$cv.error,col="green") + } + if(!is.na(best.iter)) { + abline(v=best.iter,col="blue",lwd=2,lty=2) + } + if(oobag.curve) { + if(overlay) { + smoother <- attr(best.iter, "smoother") + par(new = TRUE) + plot(smoother$x, + cumsum(smoother$y), + col="blue", type="l", - xlab="Iteration",ylab=ylab) - - if(object$train.fraction!=1) - { - lines(object$valid.error,col="red") - } - if(method=="cv") - { - lines(object$cv.error,col="green") - } - if(!is.na(best.iter)) abline(v=best.iter,col="blue",lwd=2,lty=2) - if(oobag.curve) - { - if(overlay) - { - par(new=TRUE) - plot(smoother$x, - cumsum(smoother$y), - col="blue", - type="l", - xlab="",ylab="", - axes=FALSE) - axis(4,srt=0) - at <- mean(range(smoother$y)) - mtext(paste("OOB improvement in",ylab),side=4,srt=270,line=2) - abline(h=0,col="blue",lwd=2) - } - - plot(object$oobag.improve,type="l", - xlab="Iteration", - ylab=paste("OOB change in",ylab)) - lines(smoother,col="red",lwd=2) - abline(h=0,col="blue",lwd=1) - - abline(v=best.iter,col="blue",lwd=1) - } - } - - return(best.iter) + xlab="",ylab="", + axes=FALSE) + axis(4,srt=0) + at <- mean(range(smoother$y)) + mtext(paste("OOB improvement in",ylab),side=4,srt=270,line=2) + abline(h=0,col="blue",lwd=2) + } + + plot(object$oobag.improve,type="l", + xlab="Iteration", + ylab=paste("OOB change in",ylab)) + lines(smoother,col="red",lwd=2) + abline(h=0,col="blue",lwd=1) + + abline(v=best.iter,col="blue",lwd=1) + } + return(best.iter) } - - -perf.pairwise <- 
function(y, f, group, metric="ndcg", w=NULL, max.rank=0) -{ - func.name <- switch(metric, - conc = "ir.measure.conc", - mrr = "ir.measure.mrr", - map = "ir.measure.map", - ndcg = "ir.measure.ndcg", - stop(paste("Metric",metric,"is not supported")) - ) - - # Optimization: for binary targets, - # AUC is equivalent but faster than CONC - if (metric == "conc" && all(is.element(y, 0:1))) - { - func.name <- "ir.measure.auc" - } - - # Max rank = 0 means no cut off - if (max.rank <= 0) - { - max.rank <- length(y)+1 - } - - # Random tie breaking in case of duplicate scores. - # (Without tie breaking, we would overestimate if instances are - # sorted descending on target) - f <- f + 1E-10 * runif(length(f), min=-0.5, max=0.5) - - measure.by.group <- as.matrix(by(list(y, f), INDICES=group, FUN=get(func.name), max.rank=max.rank)) - - # Exclude groups with single result or only negative or positive instances - idx <- which((!is.null(measure.by.group)) & measure.by.group >= 0) - - if (is.null(w)) - { - return (mean(measure.by.group[idx])) - } - else - { - # Assumption: weights are constant per group - w.by.group <- tapply(w, group, mean) - return (weighted.mean(measure.by.group[idx], w=w.by.group[idx])) - } -} diff --git a/R/gbmCluster.R b/R/gbmCluster.R deleted file mode 100644 index 6cf435d..0000000 --- a/R/gbmCluster.R +++ /dev/null @@ -1,8 +0,0 @@ -gbmCluster <- function(n){ - # If number of cores (n) not given, try to work it out from the number - # that appear to be available and the number of CV folds. - if (is.null(n)){ - n <- detectCores() - } - makeCluster(n) -} diff --git a/R/gbmCrossVal.R b/R/gbmCrossVal.R index b331838..aae373a 100644 --- a/R/gbmCrossVal.R +++ b/R/gbmCrossVal.R @@ -1,7 +1,58 @@ -##' Perform gbm cross-validation -##' -##' This function has far too many arguments, but there isn't the -##' abstraction in gbm to lose them. +#' Cross-validate a gbm +#' +#' Functions for cross-validating gbm. These functions are used internally and +#' are not intended for end-user direct usage. +#' +#' These functions are not intended for end-user direct usage, but are used +#' internally by \code{gbm}. +#' +#' @aliases gbmCrossVal gbmCrossValModelBuild gbmDoFold gbmCrossValErr +#' gbmCrossValPredictions +#' @param cv.folds The number of cross-validation folds. +#' @param nTrain The number of training samples. +#' @param n.cores The number of cores to use. +#' @param class.stratify.cv Whether or not stratified cross-validation samples +#' are used. +#' @param data The data. +#' @param x The model matrix. +#' @param y The response variable. +#' @param offset The offset. +#' @param distribution The type of loss function. See \code{\link{gbm}}. +#' @param w Observation weights. +#' @param var.monotone See \code{\link{gbm}}. +#' @param n.trees The number of trees to fit. +#' @param interaction.depth The degree of allowed interactions. See +#' \code{\link{gbm}}. +#' @param n.minobsinnode See \code{\link{gbm}}. +#' @param shrinkage See \code{\link{gbm}}. +#' @param bag.fraction See \code{\link{gbm}}. +#' @param var.names See \code{\link{gbm}}. +#' @param response.name See \code{\link{gbm}}. +#' @param group Used when \code{distribution = "pairwise"}. See +#' \code{\link{gbm}}. +#' @param i.train Items in the training set. +#' @param cv.models A list containing the models for each fold. +#' @param cv.group A vector indicating the cross-validation fold for each +#' member of the training set. +#' @param best.iter.cv The iteration with lowest cross-validation error. 
+#' @param X Index (cross-validation fold) on which to subset. +#' @param s Random seed. +#' @return A list containing the cross-validation error and predictions. +#' @author Greg Ridgeway \email{gregridgeway@@gmail.com} +#' @seealso \code{\link{gbm}} +#' @references J.H. Friedman (2001). "Greedy Function Approximation: A Gradient +#' Boosting Machine," Annals of Statistics 29(5):1189-1232. +#' +#' L. Breiman (2001). +#' \url{https://www.stat.berkeley.edu/users/breiman/randomforest2001.pdf}. +#' @keywords models + +# Perform gbm cross-validation +# +# This function has far too many arguments, but there isn't the +# abstraction in gbm to lose them. +#' @rdname gbmCrossVal +#' @export gbmCrossVal <- function(cv.folds, nTrain, n.cores, class.stratify.cv, data, x, y, offset, distribution, w, var.monotone, @@ -19,18 +70,22 @@ n.minobsinnode, shrinkage, bag.fraction, var.names, response.name, group) + ## get the errors cv.error <- gbmCrossValErr(cv.models, cv.folds, cv.group, nTrain, n.trees) best.iter.cv <- which.min(cv.error) + ## get the predictions predictions <- gbmCrossValPredictions(cv.models, cv.folds, cv.group, best.iter.cv, distribution, - data[i.train,], y) - list(error=cv.error, - predictions=predictions) -} - -##' Get the gbm cross-validation error + data[i.train, ], y) + list(error = cv.error, predictions = predictions) +} + + +# Get the gbm cross-validation error +#' @rdname gbmCrossVal +#' @export gbmCrossValErr <- function(cv.models, cv.folds, cv.group, nTrain, n.trees) { in.group <- tabulate(cv.group, nbins=cv.folds) cv.error <- vapply(1:cv.folds, @@ -39,70 +94,125 @@ model$valid.error * in.group[[index]] }, double(n.trees)) ## this is now a (n.trees, cv.folds) matrix - + ## and now a n.trees vector rowSums(cv.error) / nTrain } -##' Get the predictions for GBM cross validation -##' -##' This function is not as nice as it could be (leakage of y) + +#' @rdname gbmCrossVal +#' @export gbmCrossValPredictions <- function(cv.models, cv.folds, cv.group, best.iter.cv, distribution, data, y) { - ## test cv.group and data match + + # Get the predictions for GBM cross validation. 
This function is not as nice + # as it could be (i.e., leakage of y) + + # Test that cv.group and data match if (nrow(data) != length(cv.group)) { - stop("mismatch between data and cv.group") - } - ## this is a little complicated due to multinomial distribution + stop("Mismatch between `data` and `cv.group`.") + } + + # This is a little complicated due to multinomial distribution num.cols <- if (distribution$name == "multinomial") { nlevels(factor(y)) } else { 1 } - result <- matrix(nrow=nrow(data), ncol=num.cols) - ## there's no real reason to do this as other than a for loop - data.names <- names(data) + + # Initialize results matrix + res <- matrix(nrow = nrow(data), ncol = num.cols) + + # There's no real reason to do this as other than a for loop + data.names <- names(data) # column names for (ind in 1:cv.folds) { - ## these are the particular elements + + # These are the particular elements flag <- cv.group == ind model <- cv.models[[ind]] - ## the %in% here is to handle coxph + + # The %in% here is to handle coxph my.data <- data[flag, !(data.names %in% model$response.name)] - predictions <- predict(model, newdata=my.data, n.trees=best.iter.cv) - predictions <- matrix(predictions, ncol=num.cols) - result[flag,] <- predictions - } - + predictions <- predict(model, newdata = my.data, n.trees = best.iter.cv) # FIXME + predictions <- matrix(predictions, ncol = num.cols) + res[flag, ] <- predictions + + } + + # Handle multinomial case if (distribution$name != "multinomial") { - result <- as.numeric(result) - } - - result -} - - -##' Perform gbm cross-validation -##' -##' This function has far too many arguments. -gbmCrossValModelBuild <- function(cv.folds, cv.group, n.cores, i.train, - x, y, offset, distribution, - w, var.monotone, n.trees, - interaction.depth, n.minobsinnode, - shrinkage, bag.fraction, - var.names, response.name, - group) { - ## set up the cluster and add a finalizer + res <- as.numeric(res) + } + + # Return the result + res + +} + + +# Perform gbm cross-validation +# +# This function has far too many arguments. +#' @rdname gbmCrossVal +#' @export +gbmCrossValModelBuild <- function(cv.folds, cv.group, n.cores, i.train, x, y, + offset, distribution, w, var.monotone, + n.trees, interaction.depth, n.minobsinnode, + shrinkage, bag.fraction, var.names, + response.name, group) { + + # Set up cluster and add finalizer cluster <- gbmCluster(n.cores) - on.exit(stopCluster(cluster)) - - ## get ourselves some random seeds + on.exit(parallel::stopCluster(cluster)) + + # Set random seeds seeds <- as.integer(runif(cv.folds, -(2^31 - 1), 2^31)) - - ## now do the cross-validation model builds - parLapply(cl=cluster, X=1:cv.folds, - gbmDoFold, i.train, x, y, offset, distribution, - w, var.monotone, n.trees, - interaction.depth, n.minobsinnode, shrinkage, - bag.fraction, - cv.group, var.names, response.name, group, seeds) -} + + # Perform cross-validation model builds + parallel::parLapply(cl = cluster, X = 1:cv.folds, fun = gbmDoFold, i.train, x, + y, offset, distribution, w, var.monotone, n.trees, + interaction.depth, n.minobsinnode, shrinkage, + bag.fraction, cv.group, var.names, response.name, group, + seeds) + +} + + +#' @rdname gbmCrossVal +#' @export +gbmDoFold <- function(X, i.train, x, y, offset, distribution, w, var.monotone, + n.trees, interaction.depth, n.minobsinnode, shrinkage, + bag.fraction, cv.group, var.names, response.name, group, + s) { + + # Do specified cross-validation fold - a self-contained function for passing + # to individual cores. 
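A toy sketch of how gbmCrossValErr() above pools the per-fold validation curves into a single cross-validation error curve; the numbers are made up:

n.trees  <- 3
cv.group <- c(1, 1, 2, 2, 2)                   # fold assignment of 5 training rows
in.group <- tabulate(cv.group, nbins = 2)      # fold sizes: 2 and 3
valid.error <- list(c(1.0, 0.8, 0.7),          # fold 1 error per iteration
                    c(1.2, 0.9, 0.8))          # fold 2 error per iteration
err <- vapply(1:2, function(k) valid.error[[k]] * in.group[k], double(n.trees))
rowSums(err) / length(cv.group)                # size-weighted CV error per iteration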
+ + # Load required packages for core + library(gbm, quietly=TRUE) + + # Print CV information + cat("CV:", X, "\n") + + # Setup + set.seed(s[[X]]) + i <- order(cv.group == X) + x <- x[i.train,,drop=TRUE][i,,drop=FALSE] + y <- y[i.train][i] + offset <- offset[i.train][i] + nTrain <- length(which(cv.group != X)) + group <- group[i.train][i] + + # Fit a GBM + res <- gbm.fit(x = x, y = y, offset = offset, distribution = distribution, + w = w, var.monotone = var.monotone, n.trees = n.trees, + interaction.depth = interaction.depth, + n.minobsinnode = n.minobsinnode, + shrinkage = shrinkage, bag.fraction = bag.fraction, + nTrain = nTrain, keep.data = FALSE, verbose = FALSE, + response.name = response.name, group = group) + + # Return the result + res + +} diff --git a/R/gbmDoFold.R b/R/gbmDoFold.R deleted file mode 100644 index ec69fe3..0000000 --- a/R/gbmDoFold.R +++ /dev/null @@ -1,31 +0,0 @@ -gbmDoFold <- - # Do specified cross-validation fold - a self-contained function for - # passing to individual cores. -function(X, - i.train, x, y, offset, distribution, w, var.monotone, n.trees, - interaction.depth, n.minobsinnode, shrinkage, bag.fraction, - cv.group, var.names, response.name, group, s){ - library(gbm, quietly=TRUE) - cat("CV:", X, "\n") - - set.seed(s[[X]]) - - i <- order(cv.group == X) - x <- x[i.train,,drop=TRUE][i,,drop=FALSE] - y <- y[i.train][i] - offset <- offset[i.train][i] - nTrain <- length(which(cv.group != X)) - group <- group[i.train][i] - - res <- gbm.fit(x, y, - offset=offset, distribution=distribution, - w=w, var.monotone=var.monotone, n.trees=n.trees, - interaction.depth=interaction.depth, - n.minobsinnode=n.minobsinnode, - shrinkage=shrinkage, - bag.fraction=bag.fraction, - nTrain=nTrain, keep.data=FALSE, - verbose=FALSE, response.name=response.name, - group=group) - res -} diff --git a/R/getCVgroup.R b/R/getCVgroup.R deleted file mode 100644 index 10fd2f2..0000000 --- a/R/getCVgroup.R +++ /dev/null @@ -1,25 +0,0 @@ -getCVgroup <- - # Construct cross-validation groups depending on the type of model to be fit -function(distribution, class.stratify.cv, y, i.train, cv.folds, group){ - - if (distribution$name %in% c( "bernoulli", "multinomial" ) & class.stratify.cv ){ - nc <- table(y[i.train]) # Number in each class - uc <- names(nc) - if (min(nc) < cv.folds){ - stop( paste("The smallest class has only", min(nc), "objects in the training set. Can't do", cv.folds, "fold cross-validation.")) - } - cv.group <- vector(length = length(i.train)) - for (i in 1:length(uc)){ - cv.group[y[i.train] == uc[i]] <- sample(rep(1:cv.folds , length = nc[i])) - } - } # Close if - else if (distribution$name == "pairwise") { - # Split into CV folds at group boundaries - s <- sample(rep(1:cv.folds, length=nlevels(group))) - cv.group <- s[as.integer(group[i.train])] - } - else { - cv.group <- sample(rep(1:cv.folds, length=length(i.train))) - } - cv.group -} diff --git a/R/getStratify.R b/R/getStratify.R deleted file mode 100644 index e0c3b4f..0000000 --- a/R/getStratify.R +++ /dev/null @@ -1,13 +0,0 @@ -getStratify <- function(strat, d){ - if (is.null(strat)){ - if (d$name == "multinomial" ){ strat <- TRUE } - else { strat <- FALSE } - } - else { - if (!is.element(d$name, c( "bernoulli", "multinomial"))){ - warning("You can only use class.stratify.cv when distribution is bernoulli or multinomial. 
Ignored.") - strat <- FALSE - } - } # Close else - strat -} diff --git a/R/guessDist.R b/R/guessDist.R deleted file mode 100644 index 65c7d4a..0000000 --- a/R/guessDist.R +++ /dev/null @@ -1,9 +0,0 @@ -guessDist <- function(y){ - # If distribution is not given, try to guess it - if (length(unique(y)) == 2){ d <- "bernoulli" } - else if (class(y) == "Surv" ){ d <- "coxph" } - else if (is.factor(y)){ d <- "multinomial" } - else{ d <- "gaussian" } - cat(paste("Distribution not specified, assuming", d, "...\n")) - list(name=d) -} diff --git a/R/interact.gbm.R b/R/interact.gbm.R index 5a50e19..ee88a0e 100644 --- a/R/interact.gbm.R +++ b/R/interact.gbm.R @@ -1,4 +1,44 @@ -# Compute Friedman's H statistic for interaction effects +#' Estimate the strength of interaction effects +#' +#' Computes Friedman's H-statistic to assess the strength of variable +#' interactions. +#' +#' @param x A \code{\link{gbm.object}} fitted using a call to \code{\link{gbm}}. +#' +#' @param data The dataset used to construct \code{x}. If the original dataset +#' is large, a random subsample may be used to accelerate the computation in +#' \code{interact.gbm}. +#' +#' @param i.var A vector of indices or the names of the variables for compute +#' the interaction effect. If using indices, the variables are indexed in the +#' same order that they appear in the initial \code{gbm} formula. +#' +#' @param n.trees The number of trees used to generate the plot. Only the first +#' \code{n.trees} trees will be used. +#' +#' @return Returns the value of \eqn{H}. +#' +#' @details +#' \code{interact.gbm} computes Friedman's H-statistic to assess the relative +#' strength of interaction effects in non-linear models. H is on the scale of +#' [0-1] with higher values indicating larger interaction effects. To connect +#' to a more familiar measure, if \eqn{x_1} and \eqn{x_2} are uncorrelated +#' covariates with mean 0 and variance 1 and the model is of the form +#' \deqn{y=\beta_0+\beta_1x_1+\beta_2x_2+\beta_3x_3} then +#' \deqn{H=\frac{\beta_3}{\sqrt{\beta_1^2+\beta_2^2+\beta_3^2}}} +#' +#' Note that if the main effects are weak, the estimated H will be unstable. +#' For example, if (in the case of a two-way interaction) neither main effect +#' is in the selected model (relative influence is zero), the result will be +#' 0/0. Also, with weak main effects, rounding errors can result in values of H +#' > 1 which are not possible. +#' +#' @author Greg Ridgeway \email{gregridgeway@@gmail.com} +#' @seealso \code{\link{gbm}}, \code{\link{gbm.object}} +#' @references J.H. Friedman and B.E. Popescu (2005). \dQuote{Predictive +#' Learning via Rule Ensembles.} Section 8.1 +#' @keywords methods +#' @export interact.gbm <- function(x, data, i.var = 1, n.trees = x$n.trees){ ############################################################### # Do sanity checks on the call diff --git a/R/ir.measures.R b/R/ir.measures.R index 4fd5a9b..07c4329 100644 --- a/R/ir.measures.R +++ b/R/ir.measures.R @@ -10,10 +10,55 @@ # inaccurate for individual groups, but should have # a small effect on the overall measure. +#' Compute Information Retrieval measures. +#' +#' Functions to compute Information Retrieval measures for pairwise loss for a +#' single group. The function returns the respective metric, or a negative +#' value if it is undefined for the given group. +#' +#' @param obs Observed value. +#' @param pred Predicted value. +#' @param metric What type of performance measure to compute. +#' @param y,y.f,f,w,group,max.rank Used internally. +#' @param x ?. 
+#' @return The requested performance measure. +#' +#' @details +#' For simplicity, we have no special handling for ties; instead, we break ties +#' randomly. This is slightly inaccurate for individual groups, but should have +#' only a small effect on the overall measure. +#' +#' \code{gbm.conc} computes the concordance index: Fraction of all pairs (i,j) +#' with i Define data, use random, +#' ##-- or do help(data=index) for the standard data sets. + # Area under ROC curve = ratio of correctly ranking pairs -gbm.roc.area <- function(obs, pred) -{ +#' @rdname gbm.roc.area +#' @export +gbm.roc.area <- function(obs, pred) { n1 <- sum(obs) n <- length(obs) if (n==n1) { return(1) } @@ -23,18 +68,24 @@ return ((mean(rank(pred)[obs > 0]) - (n1 + 1)/2)/(n - n1)) } + # Concordance Index: # Fraction of all pairs (i,j) with i= 0) + + if (is.null(w)) + { + return (mean(measure.by.group[idx])) + } + else + { + # Assumption: weights are constant per group + w.by.group <- tapply(w, group, mean) + return (weighted.mean(measure.by.group[idx], w=w.by.group[idx])) + } +} diff --git a/R/permutation.test.gbm.R b/R/permutation.test.gbm.R deleted file mode 100644 index b8438cc..0000000 --- a/R/permutation.test.gbm.R +++ /dev/null @@ -1,50 +0,0 @@ -permutation.test.gbm <- function(object, - n.trees) -{ - # get variables used in the model - i.vars <- sort(unique(unlist(lapply(object$trees[1:n.trees], - function(x){unique(x[[1]])})))) - i.vars <- i.vars[i.vars!=-1] + 1 - rel.inf <- rep(0,length(object$var.names)) - - if(!is.null(object$data)) - { - y <- object$data$y - os <- object$data$offset - Misc <- object$data$Misc - w <- object$data$w - x <- matrix(object$data$x, ncol=length(object$var.names)) - object$Terms <- NULL # this makes predict.gbm take x as it is - - if (object$distribution$name == "pairwise") - { - # group and cutoff are only relevant for distribution "pairwise" - # in this case, the last element specifies the max rank - # max rank = 0 means no cut off - group <- Misc[1:length(y)] - max.rank <- Misc[length(y)+1] - } - } - else - { - stop("Model was fit with keep.data=FALSE. permutation.test.gbm has not been implemented for that case.") - } - - # the index shuffler - j <- sample(1:nrow(x)) - for(i in 1:length(i.vars)) - { - x[ ,i.vars[i]] <- x[j,i.vars[i]] - - new.pred <- predict.gbm(object,newdata=x,n.trees=n.trees) - rel.inf[i.vars[i]] <- gbm.loss(y,new.pred,w,os, - object$distribution, - object$train.error[n.trees], - group, - max.rank) - - x[j,i.vars[i]] <- x[ ,i.vars[i]] - } - - return(rel.inf=rel.inf) -} diff --git a/R/plot.gbm.R b/R/plot.gbm.R index 4daaedb..948822b 100644 --- a/R/plot.gbm.R +++ b/R/plot.gbm.R @@ -1,397 +1,356 @@ -plot.gbm <- function(x, - i.var=1, - n.trees=x$n.trees, - continuous.resolution=100, - return.grid=FALSE, - type="link", - ...) -{ - if (!is.element(type, c("link", "response"))){ - stop( "type must be either 'link' or 'response'") - } - - if(all(is.character(i.var))) - { - i <- match(i.var,x$var.names) - if(any(is.na(i))) - { - stop("Plot variables not used in gbm model fit: ",i.var[is.na(i)]) - } else - { - i.var <- i - } - } - - if((min(i.var)<1) || (max(i.var)>length(x$var.names))) - { - warning("i.var must be between 1 and ",length(x$var.names)) - } - if(n.trees > x$n.trees) - { - warning(paste("n.trees exceeds the number of trees in the model, ",x$n.trees, - ". 
Plotting using ",x$n.trees," trees.",sep="")) - n.trees <- x$n.trees - } - - if(length(i.var) > 3) - { - warning("gbm.int.plot creates up to 3-way interaction plots.\nplot.gbm will only return the plotting data structure.") - return.grid = TRUE - } - - # generate grid to evaluate gbm model - grid.levels <- vector("list",length(i.var)) - for(i in 1:length(i.var)) - { - # continuous - if(is.numeric(x$var.levels[[i.var[i]]])) - { - grid.levels[[i]] <- seq(min(x$var.levels[[i.var[i]]]), - max(x$var.levels[[i.var[i]]]), - length=continuous.resolution) - } - # categorical or ordered - else - { - grid.levels[[i]] <- as.numeric(factor(x$var.levels[[i.var[i]]], - levels=x$var.levels[[i.var[i]]]))-1 - } - } - - X <- expand.grid(grid.levels) - names(X) <- paste("X",1:length(i.var),sep="") - - # Next if block for compatibility with objects created with 1.6 - if (is.null(x$num.classes)){ - x$num.classes <- 1 - } - - # evaluate at each data point - y <- .Call("gbm_plot", - X = as.double(data.matrix(X)), - cRows = as.integer(nrow(X)), - cCols = as.integer(ncol(X)), - n.class = as.integer(x$num.classes), - i.var = as.integer(i.var-1), - n.trees = as.integer(n.trees) , - initF = as.double(x$initF), - trees = x$trees, - c.splits = x$c.splits, - var.type = as.integer(x$var.type), - PACKAGE = "gbm") - - if (x$distribution$name=="multinomial") - { - ## Put result into matrix form - X$y <- matrix(y, ncol = x$num.classes) - colnames(X$y) <- x$classes - - ## Use class probabilities - if (type=="response"){ - X$y <- exp(X$y) - X$y <- X$y / matrix(rowSums(X$y), ncol=ncol(X$y), nrow=nrow(X$y)) - } - } - else if(is.element(x$distribution$name, c("bernoulli", "pairwise")) && type=="response") { - X$y <- 1/(1+exp(-y)) - } - else if ((x$distribution$name=="poisson") && (type=="response")){ - X$y <- exp(y) - } - else if (type=="response"){ - warning("type 'response' only implemented for 'bernoulli', 'poisson', 'multinomial', and 'pairwise'. Ignoring" ) - } - else { X$y <- y } - - # transform categorical variables back to factors - f.factor <- rep(FALSE,length(i.var)) - for(i in 1:length(i.var)) - { - if(!is.numeric(x$var.levels[[i.var[i]]])) - { - X[,i] <- factor(x$var.levels[[i.var[i]]][X[,i]+1], - levels=x$var.levels[[i.var[i]]]) - f.factor[i] <- TRUE - } - } - - if(return.grid) - { - names(X)[1:length(i.var)] <- x$var.names[i.var] - return(X) - } - - # create the plots - if(length(i.var)==1) - { - if(!f.factor) - { - j <- order(X$X1) - - if (x$distribution$name == "multinomial") { - if ( type == "response" ){ - ylabel <- "Predicted class probability" - } - else { ylabel <- paste("f(",x$var.names[i.var],")",sep="") } - plot(range(X$X1), range(X$y), type = "n", xlab = x$var.names[i.var], - ylab = ylabel) - - for (ii in 1:x$num.classes){ - lines(X$X1,X$y[,ii], - xlab=x$var.names[i.var], - ylab=paste("f(",x$var.names[i.var],")",sep=""), - col = ii, ...) - } - } - else if (is.element(x$distribution$name, c("bernoulli", "pairwise"))) { - if ( type == "response" ){ - ylabel <- "Predicted probability" - } - else { - ylabel <- paste("f(",x$var.names[i.var],")",sep="") - } - plot( X$X1, X$y , type = "l", xlab = x$var.names[i.var], ylab=ylabel ) - } - else if ( x$distribution$name == "poisson" ){ - if (type == "response" ){ - ylabel <- "Predicted count" - } - else{ - ylabel <- paste("f(",x$var.names[i.var],")",sep="") - } - plot( X$X1, X$y , type = "l", xlab = x$var.names[i.var], ylab=ylabel ) - } - else { - plot(X$X1,X$y, - type="l", - xlab=x$var.names[i.var], - ylab=paste("f(",x$var.names[i.var],")",sep=""),...) 
- } - } - else - { - if (x$distribution$name == "multinomial") { - nX <- length(X$X1) - dim.y <- dim(X$y) - if (type == "response" ){ - ylabel <- "Predicted probability" - } - else{ ylabel <- paste("f(",x$var.names[i.var],")",sep="") } - - plot(c(0,nX), range(X$y), axes = FALSE, type = "n", - xlab = x$var.names[i.var], ylab = ylabel) - axis(side = 1, labels = FALSE, at = 0:nX) - axis(side = 2) - - mtext(as.character(X$X1), side = 1, at = 1:nX - 0.5) - - segments(x1 = rep(1:nX - 0.75, each = dim.y[2]), y1 = as.vector(t(X$y)), - x2 = rep(1:nX - 0.25, each = dim.y[2]), col = 1:dim.y[2]) - } - else if (is.element(x$distribution$name, c("bernoulli", "pairwise")) && type == "response" ){ - ylabel <- "Predicted probability" - plot( X$X1, X$y, type = "l", xlab=x$var.names[i.var], ylab=ylabel ) - } - else if ( x$distribution$name == "poisson" & type == "response" ){ - ylabel <- "Predicted count" - plot( X$X1, X$y, type = "l", xlab=x$var.names[i.var], ylab=ylabel ) - } - else { - plot(X$X1,X$y, - type="l", - xlab=x$var.names[i.var], - ylab=paste("f(",x$var.names[i.var],")",sep=""),...) - } - } - } - else if(length(i.var)==2) - { - if(!f.factor[1] && !f.factor[2]) - { - if (x$distribution$name == "multinomial") - { - for (ii in 1:x$num.classes){ - X$temp <- X$y[, ii] - print(levelplot(temp~X1*X2,data=X, - xlab=x$var.names[i.var[1]], - ylab=x$var.names[i.var[2]],...)) - title(paste("Class:", dimnames(X$y)[[2]][ii])) - } - X$temp <- NULL - } - else { - print(levelplot(y~X1*X2,data=X, - xlab=x$var.names[i.var[1]], - ylab=x$var.names[i.var[2]],...)) - } - } - else if(f.factor[1] && !f.factor[2]) - { - if (x$distribution$name == "multinomial") - { - for (ii in 1:x$num.classes){ - X$temp <- X$y[, ii] - print( xyplot(temp~X2|X1,data=X, - xlab=x$var.names[i.var[2]], - ylab=paste("f(",x$var.names[i.var[1]],",",x$var.names[i.var[2]],")",sep=""), - type="l", - panel = panel.xyplot, - ...) ) - title(paste("Class:", dimnames(X$y)[[2]][ii])) - } - X$temp <- NULL - } - else { - print(xyplot(y~X2|X1,data=X, - xlab=x$var.names[i.var[2]], - ylab=paste("f(",x$var.names[i.var[1]],",",x$var.names[i.var[2]],")",sep=""), - type="l", - panel = panel.xyplot, - ...)) - } - } - else if(!f.factor[1] && f.factor[2]) - { - if (x$distribution$name == "multinomial") - { - for (ii in 1:x$num.classes){ - X$temp <- X$y[, ii] - print( xyplot(temp~X1|X2,data=X, - xlab=x$var.names[i.var[1]], - ylab=paste("f(",x$var.names[i.var[1]],",",x$var.names[i.var[2]],")",sep=""), - type="l", - panel = panel.xyplot, - ...) ) - title(paste("Class:", dimnames(X$y)[[2]][ii])) - } - X$temp <- NULL - } - else { - print(xyplot(y~X1|X2,data=X, - xlab=x$var.names[i.var[1]], - ylab=paste("f(",x$var.names[i.var[1]],",",x$var.names[i.var[2]],")",sep=""), - type="l", - panel = panel.xyplot, - ...)) - } - } - else - { - if (x$distribution$name == "multinomial") - { - for (ii in 1:x$num.classes){ - X$temp <- X$y[, ii] - print( stripplot(X1~temp|X2,data=X, - xlab=x$var.names[i.var[2]], - ylab=paste("f(",x$var.names[i.var[1]],",",x$var.names[i.var[2]],")",sep=""), - ...) 
) - title(paste("Class:", dimnames(X$y)[[2]][ii])) - } - X$temp <- NULL - } - else { - print(stripplot(X1~y|X2,data=X, - xlab=x$var.names[i.var[2]], - ylab=paste("f(",x$var.names[i.var[1]],",",x$var.names[i.var[2]],")",sep=""), - ...)) - } - } - } - else if(length(i.var)==3) - { - i <- order(f.factor) - X.new <- X[,i] - X.new$y <- X$y - names(X.new) <- names(X) - - # 0 factor, 3 continuous - if(sum(f.factor)==0) - { - X.new$X3 <- equal.count(X.new$X3) - if (x$distribution$name == "multinomial") - { - for (ii in 1:x$num.classes){ - X.new$temp <- X.new$y[, ii] - print( levelplot(temp~X1*X2|X3,data=X.new, - xlab=x$var.names[i.var[i[1]]], - ylab=x$var.names[i.var[i[2]]],...) ) - title(paste("Class:", dimnames(X.new$y)[[2]][ii])) - } - X.new$temp <- NULL - } - else { - print(levelplot(y~X1*X2|X3,data=X.new, - xlab=x$var.names[i.var[i[1]]], - ylab=x$var.names[i.var[i[2]]],...)) - } - } - # 1 factor, 2 continuous - else if(sum(f.factor)==1) - { - if (x$distribution$name == "multinomial") - { - for (ii in 1:x$num.classes){ - X.new$temp <- X.new$y[, ii] - print( levelplot(temp~X1*X2|X3,data=X.new, - xlab=x$var.names[i.var[i[1]]], - ylab=x$var.names[i.var[i[2]]],...)) - title(paste("Class:", dimnames(X.new$y)[[2]][ii]) ) - } - X.new$temp <- NULL - } - else { - print(levelplot(y~X1*X2|X3,data=X.new, - xlab=x$var.names[i.var[i[1]]], - ylab=x$var.names[i.var[i[2]]],...)) - } - } - # 2 factors, 1 continuous - else if(sum(f.factor)==2) - { - if (x$distribution$name == "multinomial") - { - for (ii in 1:x$num.classes){ - X.new$temp <- X.new$y[, ii] - print( xyplot(temp~X1|X2*X3,data=X.new, - type="l", - xlab=x$var.names[i.var[i[1]]], - ylab=paste("f(",paste(x$var.names[i.var[1:3]],collapse=","),")",sep=""), - panel = panel.xyplot, - ...) ) - title(paste("Class:", dimnames(X.new$y)[[2]][ii]) ) - } - X.new$temp <- NULL - } - else { - print(xyplot(y~X1|X2*X3,data=X.new, - type="l", - xlab=x$var.names[i.var[i[1]]], - ylab=paste("f(",paste(x$var.names[i.var[1:3]],collapse=","),")",sep=""), - panel = panel.xyplot, - ...)) - } - } - # 3 factors, 0 continuous - else if(sum(f.factor)==3) - { - if (x$distribution$name == "multinomial") - { - for (ii in 1:x$num.classes){ - X.new$temp <- X.new$y[, ii] - print( stripplot(X1~temp|X2*X3,data=X.new, - xlab=x$var.names[i.var[i[1]]], - ylab=paste("f(",paste(x$var.names[i.var[1:3]],collapse=","),")",sep=""), - ...) ) - title(paste("Class:", dimnames(X.new$y)[[2]][ii]) ) - } - X.new$temp <- NULL - } - else { - print(stripplot(X1~y|X2*X3,data=X.new, - xlab=x$var.names[i.var[i[1]]], - ylab=paste("f(",paste(x$var.names[i.var[1:3]],collapse=","),")",sep=""), - ...)) - } - } - } +#' Marginal plots of fitted gbm objects +#' +#' Plots the marginal effect of the selected variables by "integrating" out the +#' other variables. +#' +#' \code{plot.gbm} produces low dimensional projections of the +#' \code{\link{gbm.object}} by integrating out the variables not included in +#' the \code{i.var} argument. The function selects a grid of points and uses +#' the weighted tree traversal method described in Friedman (2001) to do the +#' integration. Based on the variable types included in the projection, +#' \code{plot.gbm} selects an appropriate display choosing amongst line plots, +#' contour plots, and \code{\link[lattice]{lattice}} plots. If the default +#' graphics are not sufficient the user may set \code{return.grid=TRUE}, store +#' the result of the function, and develop another graphic display more +#' appropriate to the particular example. 
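# Hedged usage sketch of the grid interface described above (assumes a fitted
# gbm model `fit` with predictors `x1` and `x2`; all names are illustrative):
grid <- plot(fit, i.var = c("x1", "x2"), n.trees = fit$n.trees,
             return.grid = TRUE)
head(grid)  # evaluation grid; column `y` holds the averaged predictions
lattice::levelplot(y ~ x1 * x2, data = grid)  # a custom display built from it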
+#' +#' @param x A \code{\link{gbm.object}} that was fit using a call to +#' \code{\link{gbm}}. +#' +#' @param i.var Vector of indices or the names of the variables to plot. If +#' using indices, the variables are indexed in the same order that they appear +#' in the initial \code{gbm} formula. If \code{length(i.var)} is between 1 and +#' 3 then \code{plot.gbm} produces the plots. Otherwise, \code{plot.gbm} +#' returns only the grid of evaluation points and their average predictions +#' +#' @param n.trees Integer specifying the number of trees to use to generate the +#' plot. Default is to use \code{x$n.trees} (i.e., the entire ensemble). +#' +#' @param continuous.resolution Integer specifying the number of equally space +#' points at which to evaluate continuous predictors. +#' +#' @param return.grid Logical indicating whether or not to produce graphics +#' \code{FALSE} or only return the grid of evaluation points and their average +#' predictions \code{TRUE}. This is useful for customizing the graphics for +#' special variable types, or for higher dimensional graphs. +#' +#' @param type Character string specifying the type of prediction to plot on the +#' vertical axis. See \code{\link{predict.gbm}} for details. +#' +#' @param level.plot Logical indicating whether or not to use a false color +#' level plot (\code{TRUE}) or a 3-D surface (\code{FALSE}). Default is +#' \code{TRUE}. +#' +#' @param contour Logical indicating whether or not to add contour lines to the +#' level plot. Only used when \code{level.plot = TRUE}. Default is \code{FALSE}. +#' +#' @param number Integer specifying the number of conditional intervals to use +#' for the continuous panel variables. See \code{\link[graphics]{co.intervals}} +#' and \code{\link[lattice]{equal.count}} for further details. +#' +#' @param overlap The fraction of overlap of the conditioning variables. See +#' \code{\link[graphics]{co.intervals}} and \code{\link[lattice]{equal.count}} +#' for further details. +#' +#' @param col.regions Color vector to be used if \code{level.plot} is +#' \code{TRUE}. Defaults to the wonderful Matplotlib 'viridis' color map +#' provided by the \code{viridis} package. See \code{\link[viridis]{viridis}} +#' for details. +#' +#' @param ... Additional optional arguments to be passed onto +#' \code{\link[graphics]{plot}}. +#' +#' @return If \code{return.grid = TRUE}, a grid of evaluation points and their +#' average predictions. Otherwise, a plot is returned. +#' +#' @note More flexible plotting is available using the +#' \code{\link[pdp]{partial}} and \code{\link[pdp]{plotPartial}} functions. +#' +#' @seealso \code{\link[pdp]{partial}}, \code{\link[pdp]{plotPartial}}, +#' \code{\link{gbm}}, and \code{\link{gbm.object}}. +#' +#' @references J. H. Friedman (2001). "Greedy Function Approximation: A Gradient +#' Boosting Machine," Annals of Statistics 29(4). +#' +#' @references B. M. Greenwell (2017). "pdp: An R Package for Constructing +#' Partial Dependence Plots," The R Journal 9(1), 421--436. +#' \url{https://journal.r-project.org/archive/2017/RJ-2017-016/index.html}. +#' +#' @export plot.gbm +#' @export +plot.gbm <- function(x, i.var = 1, n.trees = x$n.trees, + continuous.resolution = 100, return.grid = FALSE, + type = c("link", "response"), level.plot = TRUE, + contour = FALSE, number = 4, overlap = 0.1, + col.regions = viridis::viridis, ...) 
{ + + # Match type argument + type <- match.arg(type) + + # Sanity checks + if(all(is.character(i.var))) { + i <- match(i.var, x$var.names) + if(any(is.na(i))) { + stop("Requested variables not found in ", deparse(substitute(x)), ": ", + i.var[is.na(i)]) + } else { + i.var <- i + } + } + if((min(i.var) < 1) || (max(i.var) > length(x$var.names))) { + warning("i.var must be between 1 and ", length(x$var.names)) + } + if(n.trees > x$n.trees) { + warning(paste("n.trees exceeds the number of tree(s) in the model: ", + x$n.trees, ". Using ", x$n.trees, + " tree(s) instead.", sep = "")) + n.trees <- x$n.trees + } + + if(length(i.var) > 3) { + warning("plot.gbm() will only create up to (and including) 3-way ", + "interaction plots.\nBeyond that, plot.gbm() will only return ", + "the plotting data structure.") + return.grid <- TRUE + } + + # Generate grid of predictor values on which to compute the partial + # dependence values + grid.levels <- vector("list", length(i.var)) + for(i in 1:length(i.var)) { + if(is.numeric(x$var.levels[[i.var[i]]])) { # continuous + grid.levels[[i]] <- seq(from = min(x$var.levels[[i.var[i]]]), + to = max(x$var.levels[[i.var[i]]]), + length = continuous.resolution) + } else { # categorical + grid.levels[[i]] <- + as.numeric(factor(x$var.levels[[i.var[i]]], + levels = x$var.levels[[i.var[i]]])) - 1 + } + } + X <- expand.grid(grid.levels) + names(X) <- paste("X", 1:length(i.var), sep = "") + + # For compatibility with gbm version 1.6 + if (is.null(x$num.classes)) { + x$num.classes <- 1 + } + + # Compute partial dependence values + y <- .Call("gbm_plot", X = as.double(data.matrix(X)), + cRows = as.integer(nrow(X)), cCols = as.integer(ncol(X)), + n.class = as.integer(x$num.classes), + i.var = as.integer(i.var - 1), n.trees = as.integer(n.trees), + initF = as.double(x$initF), trees = x$trees, + c.splits = x$c.splits, var.type = as.integer(x$var.type), + PACKAGE = "gbm") + + if (x$distribution$name == "multinomial") { # reshape into matrix + X$y <- matrix(y, ncol = x$num.classes) + colnames(X$y) <- x$classes + + # Convert to class probabilities (if requested) + if (type == "response") { + X$y <- exp(X$y) + X$y <- X$y / matrix(rowSums(X$y), ncol = ncol(X$y), nrow = nrow(X$y)) + } + } else if(is.element(x$distribution$name, c("bernoulli", "pairwise")) && + type == "response") { + X$y <- 1 / (1 + exp(-y)) + } else if ((x$distribution$name == "poisson") && (type == "response")) { + X$y <- exp(y) + } else if (type == "response"){ + warning("`type = \"response\"` only implemented for \"bernoulli\", ", + "\"poisson\", \"multinomial\", and \"pairwise\" distributions. ", + "Ignoring." ) + } else { + X$y <- y + } + + # Transform categorical variables back to factors + f.factor <- rep(FALSE, length(i.var)) + for(i in 1:length(i.var)) { + if(!is.numeric(x$var.levels[[i.var[i]]])) { + X[,i] <- factor(x$var.levels[[i.var[i]]][X[, i] + 1], + levels = x$var.levels[[i.var[i]]]) + f.factor[i] <- TRUE + } + } + + # Return original variable names + names(X)[1:length(i.var)] <- x$var.names[i.var] + + # Return grid only (if requested) + if(return.grid) { + return(X) + } + + # Determine number of predictors + nx <- length(i.var) + + # Determine which type of plot to draw based on the number of predictors + if (nx == 1L) { + + # Single predictor + plotOnePredictorPDP(X, ...) + + } else if (nx == 2) { + + # Two predictors + plotTwoPredictorPDP(X, level.plot = level.plot, contour = contour, + col.regions = col.regions, ...) 
+ + } else { + + # Three predictors (paneled version of plotTwoPredictorPDP) + plotThreePredictorPDP(X, nx = nx, level.plot = level.plot, + contour = contour, col.regions = col.regions, + number = number, overlap = overlap, ...) + + } + } + + +#' @keywords internal +plotOnePredictorPDP <- function(X, ...) { + + # Use the first column to determine which type of plot to construct + if (is.numeric(X[[1L]])) { + + # Draw a line plot + lattice::xyplot(stats::as.formula(paste("y ~", names(X)[1L])), + data = X, type = "l", ...) + + } else { + + # Draw a Cleveland dot plot + lattice::dotplot(stats::as.formula(paste("y ~", names(X)[1L])), + data = X, xlab = names(X)[1L], ...) + + } +} + + +#' @keywords internal +plotTwoPredictorPDP <- function(X, level.plot, contour, col.regions, ...) { + + # Use the first two columns to determine which type of plot to construct + if (is.factor(X[[1L]]) && is.factor(X[[2L]])) { + + # Draw a Cleveland dot plot + lattice::dotplot(stats::as.formula( + paste("y ~", paste(names(X)[1L:2L], collapse = "|")) + ), data = X, xlab = names(X)[1L], ...) + + } else if (is.factor(X[[1L]]) || is.factor(X[[2L]])) { + + # Lattice plot formula + form <- if (is.factor(X[[1L]])) { + stats::as.formula(paste("y ~", paste(names(X)[2L:1L], collapse = "|"))) + } else { + stats::as.formula(paste("y ~", paste(names(X)[1L:2L], collapse = "|"))) + } + + # Draw a paneled line plot + lattice::xyplot(form, data = X, type = "l", ...) + + } else { + + # Lattice plot formula + form <- stats::as.formula( + paste("y ~", paste(names(X)[1L:2L], collapse = "*")) + ) + + # Draw a three-dimensional surface + if (level.plot) { + + # Draw a false color level plot + lattice::levelplot(form, data = X, col.regions = col.regions, + contour = contour, ...) + + } else { + + # Draw a wireframe plot + lattice::wireframe(form, data = X, ...) + + } + + } +} + + +#' @keywords internal +plotThreePredictorPDP <- function(X, nx, level.plot, contour, col.regions, + number, overlap, ...) { + + # Factor, numeric, numeric + if (is.factor(X[[1L]]) && !is.factor(X[[2L]]) && !is.factor(X[[3L]])) { + X[, 1L:3L] <- X[, c(2L, 3L, 1L)] + } + + # Numeric, factor, numeric + if (!is.factor(X[[1L]]) && is.factor(X[[2L]]) && !is.factor(X[[3L]])) { + X[, 1L:3L] <- X[, c(1L, 3L, 2L)] + } + + # Factor, factor, numeric + if (is.factor(X[[1L]]) && is.factor(X[[2L]]) && !is.factor(X[[3L]])) { + X[, 1L:3L] <- X[, c(3L, 1L, 2L)] + } + + # Factor, numeric, factor + if (is.factor(X[[1L]]) && !is.factor(X[[2L]]) && is.factor(X[[3L]])) { + X[, 1L:3L] <- X[, c(2L, 1L, 3L)] + } + + # Convert third predictor to a factor using the equal count algorithm + if (is.numeric(X[[3L]])) { + X[[3L]] <- equal.count(X[[3L]], number = number, overlap = overlap) + } + + if (is.factor(X[[1L]]) && is.factor(X[[2L]])) { + + # Lattice plot formula + form <- stats::as.formula( + paste("y ~", names(X)[1L], "|", paste(names(X)[2L:nx], collapse = "*")) + ) + + # Produce a paneled dotplot + lattice::dotplot(form, data = X, xlab = names(X)[1L], ...) + + } else if (is.numeric(X[[1L]]) && is.factor(X[[2L]])) { + + # Lattice plot formula + form <- stats::as.formula( + paste("y ~", names(X)[1L], "|", paste(names(X)[2L:nx], collapse = "*")) + ) + + # Produce a paneled lineplot + lattice::xyplot(form, data = X, type = "l", ...) 
+ + } else { + + # Lattice plot formula + form <- stats::as.formula( + paste("y ~", paste(names(X)[1L:2L], collapse = "*"), "|", + paste(names(X)[3L:nx], collapse = "*")) + ) + + # Draw a three-dimensional surface + if (level.plot) { + + # Draw a false color level plot + lattice::levelplot(form, data = X, col.regions = col.regions, + contour = contour, ...) + + } else { + + # Draw a wireframe plot + lattice::wireframe(form, data = X, ...) + + } + + } + +} \ No newline at end of file diff --git a/R/predict.gbm.R b/R/predict.gbm.R index 769b2f6..9e66ddb 100644 --- a/R/predict.gbm.R +++ b/R/predict.gbm.R @@ -1,3 +1,53 @@ +#' Predict method for GBM Model Fits +#' +#' Predicted values based on a generalized boosted model object +#' +#' \code{predict.gbm} produces predicted values for each observation in +#' \code{newdata} using the the first \code{n.trees} iterations of the boosting +#' sequence. If \code{n.trees} is a vector than the result is a matrix with +#' each column representing the predictions from gbm models with +#' \code{n.trees[1]} iterations, \code{n.trees[2]} iterations, and so on. +#' +#' The predictions from \code{gbm} do not include the offset term. The user may +#' add the value of the offset to the predicted value if desired. +#' +#' If \code{object} was fit using \code{\link{gbm.fit}} there will be no +#' \code{Terms} component. Therefore, the user has greater responsibility to +#' make sure that \code{newdata} is of the same format (order and number of +#' variables) as the one originally used to fit the model. +#' +#' @param object Object of class inheriting from (\code{\link{gbm.object}}) +#' +#' @param newdata Data frame of observations for which to make predictions +#' +#' @param n.trees Number of trees used in the prediction. \code{n.trees} may be +#' a vector in which case predictions are returned for each iteration specified +#' +#' @param type The scale on which gbm makes the predictions +#' +#' @param single.tree If \code{single.tree=TRUE} then \code{predict.gbm} +#' returns only the predictions from tree(s) \code{n.trees} +#' +#' @param \dots further arguments passed to or from other methods +#' +#' @return Returns a vector of predictions. By default the predictions are on +#' the scale of f(x). For example, for the Bernoulli loss the returned value is +#' on the log odds scale, poisson loss on the log scale, and coxph is on the +#' log hazard scale. +#' +#' If \code{type="response"} then \code{gbm} converts back to the same scale as +#' the outcome. Currently the only effect this will have is returning +#' probabilities for bernoulli and expected counts for poisson. For the other +#' distributions "response" and "link" return the same. +#' +#' @author Greg Ridgeway \email{gregridgeway@@gmail.com} +#' +#' @seealso \code{\link{gbm}}, \code{\link{gbm.object}} +#' +#' @keywords models regression +#' +#' @export predict.gbm +#' @export predict.gbm <- function(object,newdata,n.trees, type="link", single.tree = FALSE, @@ -47,7 +97,7 @@ if (!identical(object$var.levels[[i]], new.compare)) { x[,i] <- factor(x[,i], union(object$var.levels[[i]], levels(x[,i]))) } - x[,i] <- as.numeric(x[,i])-1 + x[,i] <- as.numeric(factor(x[,i]))-1 } } diff --git a/R/pretty.gbm.tree.R b/R/pretty.gbm.tree.R index 7ac1a73..187c6bf 100644 --- a/R/pretty.gbm.tree.R +++ b/R/pretty.gbm.tree.R @@ -1,3 +1,31 @@ +#' Print gbm tree components +#' +#' \code{gbm} stores the collection of trees used to construct the model in a +#' compact matrix structure. 
This function extracts the information from a +#' single tree and displays it in a slightly more readable form. This function +#' is mostly for debugging purposes and to satisfy some users' curiosity. +#' +#' +#' @param object a \code{\link{gbm.object}} initially fit using +#' \code{\link{gbm}} +#' @param i.tree the index of the tree component to extract from \code{object} +#' and display +#' @return \code{pretty.gbm.tree} returns a data frame. Each row corresponds to +#' a node in the tree. Columns indicate \item{SplitVar}{index of which variable +#' is used to split. -1 indicates a terminal node.} \item{SplitCodePred}{if the +#' split variable is continuous then this component is the split point. If the +#' split variable is categorical then this component contains the index of +#' \code{object$c.split} that describes the categorical split. If the node is a +#' terminal node then this is the prediction.} \item{LeftNode}{the index of the +#' row corresponding to the left node.} \item{RightNode}{the index of the row +#' corresponding to the right node.} \item{ErrorReduction}{the reduction in the +#' loss function as a result of splitting this node.} \item{Weight}{the total +#' weight of observations in the node. If weights are all equal to 1 then this +#' is the number of observations in the node.} +#' @author Greg Ridgeway \email{gregridgeway@@gmail.com} +#' @seealso \code{\link{gbm}}, \code{\link{gbm.object}} +#' @keywords print +#' @export pretty.gbm.tree pretty.gbm.tree <- function(object,i.tree=1) { if((i.tree<1) || (i.tree>length(object$trees))) diff --git a/R/print.gbm.R b/R/print.gbm.R index 299bdf3..c5ea494 100644 --- a/R/print.gbm.R +++ b/R/print.gbm.R @@ -1,5 +1,52 @@ -# print, show and summary functions for gbm +#' Print model summary +#' +#' Display basic information about a \code{gbm} object. +#' +#' Prints some information about the model object. In particular, this method +#' prints the call to \code{gbm()}, the type of loss function that was used, +#' and the total number of iterations. +#' +#' If cross-validation was performed, the 'best' number of trees as estimated +#' by cross-validation error is displayed. If a test set was used, the 'best' +#' number of trees as estimated by the test set error is displayed. +#' +#' The number of available predictors, and the number of those having non-zero +#' influence on predictions is given (which might be interesting in data mining +#' applications). +#' +#' If multinomial, bernoulli or adaboost was used, the confusion matrix and +#' prediction accuracy are printed (objects being allocated to the class with +#' highest probability for multinomial and bernoulli). These classifications +#' are performed on the entire training data using the model with the 'best' +#' number of trees as described above, or the maximum number of trees if the +#' 'best' cannot be computed. +#' +#' If the 'distribution' was specified as gaussian, laplace, quantile or +#' t-distribution, a summary of the residuals is displayed. The residuals are +#' for the training data with the model at the 'best' number of trees, as +#' described above, or the maximum number of trees if the 'best' cannot be +#' computed. +#' +#' @aliases print.gbm show.gbm +#' @param x an object of class \code{gbm}. +#' @param \dots arguments passed to \code{print.default}. 
+#' @author Harry Southworth, Daniel Edwards +#' @seealso \code{\link{gbm}} +#' @keywords models nonlinear survival nonparametric +#' @examples +#' +#' data(iris) +#' iris.mod <- gbm(Species ~ ., distribution="multinomial", data=iris, +#' n.trees=2000, shrinkage=0.01, cv.folds=5, +#' verbose=FALSE, n.cores=1) +#' iris.mod +#' #data(lung) +#' #lung.mod <- gbm(Surv(time, status) ~ ., distribution="coxph", data=lung, +#' # n.trees=2000, shrinkage=0.01, cv.folds=5,verbose =FALSE) +#' #lung.mod +#' @rdname print.gbm +#' @export print.gbm <- function(x, ... ) { if (!is.null(x$call)){ print(x$call) } @@ -39,8 +86,60 @@ invisible() } + +#' @rdname print.gbm +#' +#' @export show.gbm <- print.gbm + +#' Summary of a gbm object +#' +#' Computes the relative influence of each variable in the gbm object. +#' +#' For \code{distribution="gaussian"} this returns exactly the reduction of +#' squared error attributable to each variable. For other loss functions this +#' returns the reduction attributable to each variable in sum of squared error +#' in predicting the gradient on each iteration. It describes the relative +#' influence of each variable in reducing the loss function. See the references +#' below for exact details on the computation. +#' +#' @param object a \code{gbm} object created from an initial call to +#' \code{\link{gbm}}. +#' @param cBars the number of bars to plot. If \code{order=TRUE} the only the +#' variables with the \code{cBars} largest relative influence will appear in +#' the barplot. If \code{order=FALSE} then the first \code{cBars} variables +#' will appear in the plot. In either case, the function will return the +#' relative influence of all of the variables. +#' @param n.trees the number of trees used to generate the plot. Only the first +#' \code{n.trees} trees will be used. +#' @param plotit an indicator as to whether the plot is generated. +#' @param order an indicator as to whether the plotted and/or returned relative +#' influences are sorted. +#' @param method The function used to compute the relative influence. +#' \code{\link{relative.influence}} is the default and is the same as that +#' described in Friedman (2001). The other current (and experimental) choice is +#' \code{\link{permutation.test.gbm}}. This method randomly permutes each +#' predictor variable at a time and computes the associated reduction in +#' predictive performance. This is similar to the variable importance measures +#' Breiman uses for random forests, but \code{gbm} currently computes using the +#' entire training dataset (not the out-of-bag observations). +#' @param normalize if \code{FALSE} then \code{summary.gbm} returns the +#' unnormalized influence. +#' @param ... other arguments passed to the plot function. +#' @return Returns a data frame where the first component is the variable name +#' and the second is the computed relative influence, normalized to sum to 100. +#' @author Greg Ridgeway \email{gregridgeway@@gmail.com} +#' @seealso \code{\link{gbm}} +#' @references J.H. Friedman (2001). "Greedy Function Approximation: A Gradient +#' Boosting Machine," Annals of Statistics 29(5):1189-1232. +#' +#' L. Breiman +#' (2001).\url{https://www.stat.berkeley.edu/users/breiman/randomforest2001.pdf}. 
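# Hedged usage sketch for the relative-influence summary documented above
# (assumes a fitted gbm model `fit`; purely illustrative):
ri <- summary(fit, n.trees = fit$n.trees, plotit = FALSE)
head(ri)  # variable names and relative influences, normalized to sum to 100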
+#' @keywords hplot +#' +#' @export summary.gbm +#' @export summary.gbm <- function(object, cBars=length(object$var.names), n.trees=object$n.trees, diff --git a/R/reconstructGBMdata.R b/R/reconstructGBMdata.R index 5333558..c347e49 100644 --- a/R/reconstructGBMdata.R +++ b/R/reconstructGBMdata.R @@ -1,3 +1,16 @@ +#' Reconstruct a GBM's Source Data +#' +#' Helper function to reconstitute the data for plots and summaries. This +#' function is not intended for the user to call directly. +#' +#' +#' @param x a \code{\link{gbm.object}} initially fit using \code{\link{gbm}} +#' @return Returns a data used to fit the gbm in a format that can subsequently +#' be used for plots and summaries +#' @author Harry Southworth +#' @seealso \code{\link{gbm}}, \code{\link{gbm.object}} +#' @keywords manip +#' @export reconstructGBMdata <- function(x) { if(class(x) != "gbm") diff --git a/R/relative.influence.R b/R/relative.influence.R index 4a49f4d..6d00681 100644 --- a/R/relative.influence.R +++ b/R/relative.influence.R @@ -1,3 +1,56 @@ +#' Methods for estimating relative influence +#' +#' Helper functions for computing the relative influence of each variable in +#' the gbm object. +#' +#' @details +#' This is not intended for end-user use. These functions offer the different +#' methods for computing the relative influence in \code{\link{summary.gbm}}. +#' \code{gbm.loss} is a helper function for \code{permutation.test.gbm}. +#' +#' @aliases relative.influence permutation.test.gbm gbm.loss +#' +#' @param object a \code{gbm} object created from an initial call to +#' \code{\link{gbm}}. +#' +#' @param n.trees the number of trees to use for computations. If not provided, +#' the the function will guess: if a test set was used in fitting, the number +#' of trees resulting in lowest test set error will be used; otherwise, if +#' cross-validation was performed, the number of trees resulting in lowest +#' cross-validation error will be used; otherwise, all trees will be used. +#' +#' @param scale. whether or not the result should be scaled. Defaults to +#' \code{FALSE}. +#' +#' @param sort. whether or not the results should be (reverse) sorted. +#' Defaults to \code{FALSE}. +#' +#' @param y,f,w,offset,dist,baseline For \code{gbm.loss}: These components are +#' the outcome, predicted value, observation weight, offset, distribution, and +#' comparison loss function, respectively. +#' +#' @param group,max.rank Used internally when \code{distribution = +#' \'pairwise\'}. +#' +#' @return By default, returns an unprocessed vector of estimated relative +#' influences. If the \code{scale.} and \code{sort.} arguments are used, +#' returns a processed version of the same. +#' +#' @author Greg Ridgeway \email{gregridgeway@@gmail.com} +#' +#' @seealso \code{\link{summary.gbm}} +#' +#' @references J.H. Friedman (2001). "Greedy Function Approximation: A Gradient +#' Boosting Machine," Annals of Statistics 29(5):1189-1232. +#' +#' L. Breiman (2001). +#' \url{https://www.stat.berkeley.edu/users/breiman/randomforest2001.pdf}. +#' +#' @keywords hplot +#' +#' @rdname relative.influence +#' +#' @export relative.influence <- function(object, n.trees, scale. 
= FALSE, @@ -47,3 +100,96 @@ return(rel.inf=rel.inf) } + + +#' @rdname relative.influence +#' @export +permutation.test.gbm <- function(object, + n.trees) +{ + # get variables used in the model + i.vars <- sort(unique(unlist(lapply(object$trees[1:n.trees], + function(x){unique(x[[1]])})))) + i.vars <- i.vars[i.vars!=-1] + 1 + rel.inf <- rep(0,length(object$var.names)) + + if(!is.null(object$data)) + { + y <- object$data$y + os <- object$data$offset + Misc <- object$data$Misc + w <- object$data$w + x <- matrix(object$data$x, ncol=length(object$var.names)) + object$Terms <- NULL # this makes predict.gbm take x as it is + + if (object$distribution$name == "pairwise") + { + # group and cutoff are only relevant for distribution "pairwise" + # in this case, the last element specifies the max rank + # max rank = 0 means no cut off + group <- Misc[1:length(y)] + max.rank <- Misc[length(y)+1] + } + } + else + { + stop("Model was fit with keep.data=FALSE. permutation.test.gbm has not been implemented for that case.") + } + + # the index shuffler + j <- sample(1:nrow(x)) + for(i in 1:length(i.vars)) + { + x[ ,i.vars[i]] <- x[j,i.vars[i]] + + new.pred <- predict.gbm(object,newdata=x,n.trees=n.trees) + rel.inf[i.vars[i]] <- gbm.loss(y,new.pred,w,os, + object$distribution, + object$train.error[n.trees], + group, + max.rank) + + x[j,i.vars[i]] <- x[ ,i.vars[i]] + } + + return(rel.inf=rel.inf) +} + + +#' @rdname relative.influence +#' @export +gbm.loss <- function(y, f, w, offset, dist, baseline, group=NULL, max.rank=NULL) +{ + if (!is.na(offset)) + { + f <- offset+f + } + + if (dist$name != "pairwise") + { + switch(dist$name, + gaussian = weighted.mean((y - f)^2,w) - baseline, + bernoulli = -2*weighted.mean(y*f - log(1+exp(f)),w) - baseline, + laplace = weighted.mean(abs(y-f),w) - baseline, + adaboost = weighted.mean(exp(-(2*y-1)*f),w) - baseline, + poisson = -2*weighted.mean(y*f-exp(f),w) - baseline, + stop(paste("Distribution",dist$name,"is not yet supported for method=permutation.test.gbm"))) + } + else # dist$name == "pairwise" + { + if (is.null(dist$metric)) + { + stop("No metric specified for distribution 'pairwise'") + } + if (!is.element(dist$metric, c("conc", "ndcg", "map", "mrr"))) + { + stop("Invalid metric '", dist$metric, "' specified for distribution 'pairwise'") + } + if (is.null(group)) + { + stop("For distribution 'pairwise', parameter 'group' has to be supplied") + } + # Loss = 1 - utility + (1 - perf.pairwise(y, f, group, dist$metric, w, max.rank)) - baseline + } +} diff --git a/R/shrink.gbm.R b/R/shrink.gbm.R index 27df32d..daa60fe 100644 --- a/R/shrink.gbm.R +++ b/R/shrink.gbm.R @@ -1,5 +1,43 @@ # evaluates the objective function and gradient with respect to beta # beta = log(lambda/(1-lambda)) + +#' L1 shrinkage of the predictor variables in a GBM +#' +#' Performs recursive shrinkage in each of the trees in a GBM fit using +#' different shrinkage parameters for each variable. +#' +#' This function is currently experimental. Used in conjunction with a gradient +#' ascent search for inclusion of variables. +#' +#' @param object A \code{\link{gbm.object}}. +#' +#' @param n.trees Integer specifying the number of trees to use. +#' +#' @param lambda Vector of length equal to the number of variables containing +#' the shrinkage parameter for each variable. +#' +#' @param \dots Additional optional arguments. (Currently ignored.) 
+#' +#' @return \item{predF}{Predicted values from the shrunken tree} +#' \item{objective}{The value of the loss function associated with the +#' predicted values} \item{gradient}{A vector with length equal to the number +#' of variables containing the derivative of the objective function with +#' respect to beta, the logit transform of the shrinkage parameter for each +#' variable} +#' +#' @note Warning: This function is experimental. +#' +#' @author Greg Ridgeway \email{gregridgeway@@gmail.com} +#' +#' @seealso \code{\link{shrink.gbm.pred}}, \code{\link{gbm}} +#' +#' @references Hastie, T. J., and Pregibon, D. +#' \url{https://web.stanford.edu/~hastie/Papers/shrink_tree.pdf}. AT&T Bell +#' Laboratories Technical Report (March 1990). +#' +#' @keywords methods +#' +#' @export shrink.gbm <- function(object,n.trees, lambda=rep(10,length(object$var.names)), ...) diff --git a/R/shrink.gbm.pred.R b/R/shrink.gbm.pred.R index 3e48002..48658f6 100644 --- a/R/shrink.gbm.pred.R +++ b/R/shrink.gbm.pred.R @@ -1,3 +1,20 @@ +#' Predictions from a shrunked GBM +#' +#' Makes predictions from a shrunken GBM model. +#' +#' @param object a \code{\link{gbm.object}} +#' @param newdata dataset for predictions +#' @param n.trees the number of trees to use +#' @param lambda a vector with length equal to the number of variables +#' containing the shrinkage parameter for each variable +#' @param \dots other parameters (ignored) +#' @return A vector with length equal to the number of observations in newdata +#' containing the predictions +#' @section Warning: This function is experimental +#' @author Greg Ridgeway \email{gregridgeway@@gmail.com} +#' @seealso \code{\link{shrink.gbm}}, \code{\link{gbm}} +#' @keywords methods +#' @export shrink.gbm.pred <- function(object,newdata,n.trees, lambda=rep(1,length(object$var.names)), ...) diff --git a/R/test.gbm.R b/R/test.gbm.R index 1888f7c..c50247a 100644 --- a/R/test.gbm.R +++ b/R/test.gbm.R @@ -1,3 +1,28 @@ +#' Test the \code{gbm} package. +#' +#' Run tests on \code{gbm} functions to perform logical checks and +#' reproducibility. +#' +#' The function uses functionality in the \code{RUnit} package. A fairly small +#' validation suite is executed that checks to see that relative influence +#' identifies sensible variables from simulated data, and that predictions from +#' GBMs with Gaussian, Cox or binomial distributions are sensible, +#' +#' @aliases validate.gbm test.gbm test.relative.influence +#' @return An object of class \code{RUnitTestData}. See the help for +#' \code{RUnit} for details. +#' @note The test suite is not comprehensive. +#' @author Harry Southworth +#' @seealso \code{\link{gbm}} +#' @keywords models +#' @examples +#' +#' # Uncomment the following lines to run - commented out to make CRAN happy +#' #library(RUnit) +#' #val <- validate.texmex() +#' #printHTMLProtocol(val, "texmexReport.html") + +#' @export test.gbm <- function(){ # Based on example in R package # Gaussian example @@ -103,7 +128,7 @@ data <- data.frame(tt=tt,delta=delta,X1=X1,X2=X2,X3=X3) # fit initial model - gbm1 <- gbm(Surv(tt,delta)~X1+X2+X3, # formula + gbm1 <- gbm(Surv(tt,delta)~X1+X2+X3, # formula data=data, # dataset weights=w, var.monotone=c(0,0,0), # -1: monotone decrease, +1: monotone increase, 0: no monotone restrictions @@ -138,7 +163,7 @@ # predict on the new data using "best" number of trees # f.predict will be on the canonical scale (logit,log,etc.) 
- f.predict <- predict(gbm1,data2,best.iter) + f.predict <- predict(gbm1, newdata = data2, n.trees = best.iter) #plot(data2$f,f.predict) # Use observed sd @@ -198,7 +223,7 @@ # predict on the new data using "best" number of trees # f.predict will be on the canonical scale (logit,log,etc.) - f.1.predict <- predict.gbm(gbm1,data2, n.trees=best.iter.test) + f.1.predict <- predict(gbm1,data2, n.trees=best.iter.test) # compute quantity prior to transformation f.new = sin(3*X1) - 4*X2 + mu @@ -213,6 +238,8 @@ ########################### test.relative.influence() ########################## ########################### ########################## + +#' @export test.relative.influence <- function(){ # Test that relative.influence really does pick out the true predictors set.seed(1234) @@ -234,28 +261,26 @@ ################################ validate.gbm() ################################ ################################ ################################ + +#' @export validate.gbm <- function () { - if(!requireNamespace("RUnit", quietly = TRUE)) - stop("You need to install the RUnit package to validate gbm") - - wh <- (1:length(search()))[search() == "package:gbm"] - tests <- objects(wh)[substring(objects(wh), 1, 5) == "test."] - - # Create temporary directory to put tests into - if (.Platform$OS.type == "windows"){ sep <- "\\" } - else { sep <- "/" } - - dir <- file.path(tempdir(), "gbm.tests", fsep = sep) - - dir.create(dir) - - for (i in 1:length(tests)) { - str <- paste(dir, sep, tests[i], ".R", sep = "") - dump(tests[i], file = str) - } - res <- RUnit::defineTestSuite("gbm", dirs = dir, testFuncRegexp = "^test.+", testFileRegexp = "*.R") - cat("Running gbm test suite.\nThis will take some time...\n\n") - res <- RUnit::runTestSuite(res) - res + wh <- (1:length(search()))[search() == "package:gbm"] + tests <- objects(wh)[substring(objects(wh), 1, 5) == "test."] + + # Create temporary directory to put tests into + sep <- if (.Platform$OS.type == "windows") "\\" else "/" + + dir <- file.path(tempdir(), "gbm.tests", fsep = sep) + + dir.create(dir) + + for (i in 1:length(tests)) { + str <- paste(dir, sep, tests[i], ".R", sep = "") + dump(tests[i], file = str) + } + res <- RUnit::defineTestSuite("gbm", dirs = dir, testFuncRegexp = "^test.+", + testFileRegexp = "*.R") + cat("Running gbm test suite.\nThis will take some time...\n\n") + RUnit::runTestSuite(res) } diff --git a/R/utils.R b/R/utils.R new file mode 100644 index 0000000..7678ab7 --- /dev/null +++ b/R/utils.R @@ -0,0 +1,158 @@ +#' Arrange multiple grobs on a page +#' +#' See \code{\link[gridExtra]{grid.arrange}} for more details. 
+#' +#' @name grid.arrange +#' @rdname grid.arrange +#' @keywords internal +#' @export +#' @importFrom gridExtra grid.arrange +#' @usage grid.arrange(..., newpage = TRUE) +NULL + + +#' @keywords internal +getAvailableDistributions <- function() { + c("adaboost", "bernoulli", "coxph", "gaussian", "huberized", "laplace", + "multinomial", "pairwise", "poisson", "quantile", "tdist") +} + + +#' @keywords internal +guess_error_method <- function(object) { + if (has_train_test_split(object)) { + "test" + } else if (has_cross_validation(object)) { + "cv" + } else { + "OOB" + } +} + + +#' @keywords internal +has_train_test_split <- function(object) { + object$train.fraction < 1 +} + + +#' @keywords internal +has_cross_validation <- function(object) { + !is.null(object$cv.error) +} + + +#' @keywords internal +best_iter <- function(object, method) { + check_if_gbm_fit(object) + if (method == "OOB") { + best_iter_out_of_bag(object) + } else if (method == "test") { + best_iter_test(object) + } else if (method == "cv") { + best_iter_cv(object) + } else { + stop("method must be one of \"cv\", \"test\", or \"OOB\"") + } +} + + +#' @keywords internal +best_iter_test <- function(object) { + check_if_gbm_fit(object) + best_iter_test <- which.min(object$valid.error) + return(best_iter_test) +} + + +#' @keywords internal +best_iter_cv <- function(object) { + check_if_gbm_fit(object) + if(!has_cross_validation(object)) { + stop('In order to use method="cv" gbm must be called with cv_folds>1.') + } + best_iter_cv <- which.min(object$cv.error) + return(best_iter_cv) +} + + +#' @keywords internal +best_iter_out_of_bag <- function(object) { + check_if_gbm_fit(object) + if(object$bag.fraction == 1) { + stop("Cannot compute OOB estimate or the OOB curve when bag_fraction=1.") + } + if(all(!is.finite(object$oobag.improve))) { + stop("Cannot compute OOB estimate or the OOB curve. No finite OOB ", + "estimates of improvement.") + } + message("OOB generally underestimates the optimal number of iterations ", + "although predictive performance is reasonably competitive. 
Using ", + "cv_folds>1 when calling gbm usually results in improved predictive ", + "performance.") + smoother <- generate_smoother_oobag(object) + best_iter_oob <- smoother$x[which.min(-cumsum(smoother$y))] + attr(best_iter_oob, "smoother") <- smoother + return(best_iter_oob) +} + + +#' @keywords internal +generate_smoother_oobag <- function(object) { + check_if_gbm_fit(object) + x <- seq_len(object$n.trees) + smoother <- loess(object$oobag.improve ~ x, + enp.target = min(max(4, length(x) / 10), 50)) + smoother$y <- smoother$fitted + smoother$x <- x + return(smoother) +} + + +#' @keywords internal +check_if_gbm_fit <- function(object) { + if (!inherits(object, "gbm")) { + stop(deparse(substitute(object)), " is not a valid \"gbm\" object.") + } +} + + +#' @keywords internal +get_ylab <- function(object) { + check_if_gbm_fit(object) + if (object$distribution$name != "pairwise") { + switch(substring(object$distribution$name, 1, 2), + ga = "Squared error loss", + be = "Bernoulli deviance", + po = "Poisson deviance", + ad = "AdaBoost exponential bound", + co = "Cox partial deviance", + la = "Absolute loss", + qu = "Quantile loss", + mu = "Multinomial deviance", + td = "t-distribution deviance") + } else { + switch(object$distribution$metric, + conc = "Fraction of concordant pairs", + ndcg = "Normalized discounted cumulative gain", + map = "Mean average precision", + mrr = "Mean reciprocal rank") + } +} + + +#' @keywords internal +get_ylim <- function(object, method) { + check_if_gbm_fit(object) + if(object$train.fraction == 1) { + if ( method=="cv" ) { + range(object$train.error, object$cv.error) + } else if ( method == "test" ) { + range( object$train.error, object$valid.error) + } else { + range(object$train.error) + } + } else { + range(object$train.error, object$valid.error) + } +} diff --git a/R/zzz.R b/R/zzz.R new file mode 100644 index 0000000..5832396 --- /dev/null +++ b/R/zzz.R @@ -0,0 +1,5 @@ +#' @keywords internal +.onAttach <- function(lib, pkg) { + vers <- utils::packageVersion("gbm") + packageStartupMessage(paste("Loaded gbm", vers)) +} diff --git a/README.md b/README.md new file mode 100644 index 0000000..dad7ec2 --- /dev/null +++ b/README.md @@ -0,0 +1,50 @@ +gbm +=== + +[![CRAN\_Status\_Badge](http://www.r-pkg.org/badges/version/gbm)](https://cran.r-project.org/package=gbm) +[![Build +Status](https://travis-ci.org/gbm-developers/gbm.svg?branch=master)](https://travis-ci.org/gbm-developers/gbm) +[![Downloads](http://cranlogs.r-pkg.org/badges/gbm)](http://cranlogs.r-pkg.org/badges/gbm) +[![Total +Downloads](http://cranlogs.r-pkg.org/badges/grand-total/gbm)](http://cranlogs.r-pkg.org/badges/grand-total/gbm) + +Overview +-------- + +The gbm package (which stands for **g**eneralized **b**oosted +**m**odels) implements extensions to Freund and Schapire’s AdaBoost +algorithm and [Friedman’s gradient boosting +machine](http://projecteuclid.org/euclid.aos/1013203451). It includes +regression methods for least squares, absolute loss, t-distribution +loss, quantile regression, logistic, multinomial logistic, Poisson, Cox +proportional hazards partial likelihood, AdaBoost exponential loss, +Huberized hinge loss, and Learning to Rank measures (i.e., +[LambdaMart](https://www.microsoft.com/en-us/research/publication/from-ranknet-to-lambdarank-to-lambdamart-an-overview/)). 
+ 
+Installation 
+------------ 
+ 
+``` r 
+# The easiest way to get gbm is to install it from CRAN: 
+install.packages("gbm") 
+ 
+# Or the development version from GitHub: 
+# install.packages("devtools") 
+devtools::install_github("gbm-developers/gbm") 
+``` 
+ 
+Lifecycle 
+--------- 
+ 
+[![lifecycle](https://img.shields.io/badge/lifecycle-retired-orange.svg)](https://www.tidyverse.org/lifecycle/#retired) 
+ 
+The gbm package is retired and no longer under active development. We 
+will only make the necessary changes to ensure that gbm remains on CRAN. 
+For the most part, no new features will be added, and only the most 
+critical of bugs will be fixed. 
+ 
+This is a maintained version of `gbm`, backward compatible with CRAN versions 
+of `gbm` 2.1.x. It exists mainly for the purpose of reproducible 
+research and data analyses performed with the 2.1.x versions of `gbm`. 
+For newer development, and a more consistent API, try out the 
+[gbm3](https://github.com/gbm-developers/gbm3) package! 
diff --git a/inst/doc/gbm.Rnw b/inst/doc/gbm.Rnw 
new file mode 100644 
index 0000000..26cfda0 
--- /dev/null 
+++ b/inst/doc/gbm.Rnw 
@@ -0,0 +1,373 @@ 
+\documentclass{article} 
+ 
+\bibliographystyle{plain} 
+ 
+\newcommand{\EV}{\mathrm{E}} 
+\newcommand{\Var}{\mathrm{Var}} 
+\newcommand{\aRule}{\begin{center} \rule{5in}{1mm} \end{center}} 
+ 
+\title{Generalized Boosted Models:\\A guide to the gbm package} \author{Greg Ridgeway} 
+ 
+%\VignetteEngine{knitr::knitr} 
+%\VignetteIndexEntry{Generalized Boosted Models: A guide to the gbm package} 
+ 
+\newcommand{\mathgbf}[1]{{\mbox{\boldmath$#1$\unboldmath}}} 
+ 
+\begin{document} 
+ 
+\maketitle 
+ 
+Boosting takes on various forms with different programs using different loss functions, different base models, and different optimization schemes. The gbm package takes the approach described in \cite{Friedman:2001} and \cite{Friedman:2002}. Some of the terminology differs, mostly due to an effort to cast boosting terms into more standard statistical terminology (e.g. deviance). In addition, the gbm package implements boosting for models commonly used in statistics but not commonly associated with boosting. The Cox proportional hazard model, for example, is an incredibly useful model and the boosting framework applies quite readily with only slight modification \cite{Ridgeway:1999}. Also some algorithms implemented in the gbm package differ from the standard implementation. The AdaBoost algorithm \cite{FreundSchapire:1997} has a particular loss function and a particular optimization algorithm associated with it. The gbm implementation of AdaBoost adopts AdaBoost's exponential loss function (its bound on misclassification rate) but uses Friedman's gradient descent algorithm rather than the original one proposed. So the main purpose of this document is to spell out in detail what the gbm package implements. 
+ 
+\section{Gradient boosting} 
+ 
+This section essentially presents the derivation of boosting described in \cite{Friedman:2001}. The gbm package also adopts the stochastic gradient boosting strategy, a small but important tweak on the basic algorithm, described in \cite{Friedman:2002}. 
+ 
+\subsection{Friedman's gradient boosting machine} \label{sec:GradientBoostingMachine} 
+ 
+\begin{figure} 
+\aRule Initialize $\hat f(\mathbf{x})$ to be a constant, $\hat f(\mathbf{x}) = \arg \min_{\rho} \sum_{i=1}^N \Psi(y_i,\rho)$. 
\\ +For $t$ in $1,\ldots,T$ do +\begin{enumerate} +\item Compute the negative gradient as the working response + \begin{equation} + z_i = -\frac{\partial}{\partial f(\mathbf{x}_i)} \Psi(y_i,f(\mathbf{x}_i)) \mbox{\Huge $|$}_{f(\mathbf{x}_i)=\hat f(\mathbf{x}_i)} + \end{equation} +\item Fit a regression model, $g(\mathbf{x})$, predicting $z_i$ from the covariates $\mathbf{x}_i$. \item Choose a gradient descent step size as + \begin{equation} + \rho = \arg \min_{\rho} \sum_{i=1}^N \Psi(y_i,\hat f(\mathbf{x}_i)+\rho g(\mathbf{x}_i)) + \end{equation} +\item Update the estimate of $f(\mathbf{x})$ as + \begin{equation} + \hat f(\mathbf{x}) \leftarrow \hat f(\mathbf{x}) + \rho g(\mathbf{x}) + \end{equation} +\end{enumerate} \aRule \caption{Friedman's Gradient Boost algorithm} \label{fig:GradientBoost} \end{figure} + +Friedman (2001) and the companion paper Friedman (2002) extended the work of Friedman, Hastie, and Tibshirani (2000) and laid the ground work for a new generation of boosting algorithms. Using the connection between boosting and optimization, this new work proposes the Gradient Boosting Machine. + +In any function estimation problem we wish to find a regression function, $\hat f(\mathbf{x})$, that minimizes the expectation of some loss function, $\Psi(y,f)$, as shown in (\ref{NonparametricRegression1}). + +\begin{eqnarray} +\hspace{0.5in} +\hat f(\mathbf{x}) &=& \arg \min_{f(\mathbf{x})} \EV_{y,\mathbf{x}} \Psi(y,f(\mathbf{x})) \nonumber \\ \label{NonparametricRegression1} +&=& \arg \min_{f(\mathbf{x})} \EV_x \left[ \EV_{y|\mathbf{x}} \Psi(y,f(\mathbf{x})) \Big| \mathbf{x} \right] +\end{eqnarray} + +We will focus on finding estimates of $f(\mathbf{x})$ such that \begin{equation} +\label{NonparametricRegression2} +\hspace{0.5in} +\hat f(\mathbf{x}) = \arg \min_{f(\mathbf{x})} \EV_{y|\mathbf{x}} \left[ \Psi(y,f(\mathbf{x}))|\mathbf{x} \right] +\end{equation} +Parametric regression models assume that $f(\mathbf{x})$ is a function with a finite number of parameters, $\beta$, and estimates them by selecting those values that minimize a loss function (e.g. squared error loss) over a training sample of $N$ observations on $(y,\mathbf{x})$ pairs as in (\ref{eq:Friedman1}). +\begin{equation} +\label{eq:Friedman1} +\hspace{0.5in} +\hat\beta = \arg \min_{\beta} \sum_{i=1}^N \Psi(y_i,f(\mathbf{x}_i;\beta)) +\end{equation} +When we wish to estimate $f(\mathbf{x})$ non-parametrically the task becomes more difficult. Again we can proceed similarly to \cite{FHT:2000} and modify our current estimate of $f(\mathbf{x})$ by adding a new function $f(\mathbf{x})$ in a greedy fashion. Letting $f_i = f(\mathbf{x}_i)$, we see that we want to decrease the $N$ dimensional function +\begin{eqnarray} +\label{EQ:Friedman2} +\hspace{0.5in} +J(\mathbf{f}) &=& \sum_{i=1}^N \Psi(y_i,f(\mathbf{x}_i)) \nonumber \\ + &=& \sum_{i=1}^N \Psi(y_i,F_i). +\end{eqnarray} +The negative gradient of $J(\mathbf{f})$ indicates the direction of the locally greatest decrease in $J(\mathbf{f})$. Gradient descent would then have us modify $\mathbf{f}$ as +\begin{equation} +\label{eq:Friedman3} +\hspace{0.5in} +\hat \mathbf{f} \leftarrow \hat \mathbf{f} - \rho \nabla J(\mathbf{f}) +\end{equation} +where $\rho$ is the size of the step along the direction of greatest descent. Clearly, this step alone is far from our desired goal. First, it only fits $f$ at values of $\mathbf{x}$ for which we have observations. 
Second, it does not take into account that observations with similar $\mathbf{x}$ are likely to have similar values of $f(\mathbf{x})$. Both these problems would have disastrous effects on generalization error. However, Friedman suggests selecting a class of functions that use the covariate information to approximate the gradient, usually a regression tree. This line of reasoning produces his Gradient Boosting algorithm shown in Figure~\ref{fig:GradientBoost}. At each iteration the algorithm determines the direction, the gradient, in which it needs to improve the fit to the data and selects a particular model from the allowable class of functions that is in most agreement with the direction. In the case of squared-error loss, $\Psi(y_i,f(\mathbf{x}_i)) = \sum_{i=1}^N (y_i-f(\mathbf{x}_i))^2$, this algorithm corresponds exactly to residual fitting. + +There are various ways to extend and improve upon the basic framework suggested in Figure~\ref{fig:GradientBoost}. For example, Friedman (2001) substituted several choices in for $\Psi$ to develop new boosting algorithms for robust regression with least absolute deviation and Huber loss functions. Friedman (2002) showed that a simple subsampling trick can greatly improve predictive performance while simultaneously reduce computation time. Section~\ref{GBMModifications} discusses some of these modifications. + +\section{Improving boosting methods using control of the learning rate, sub-sampling, and a decomposition for interpretation} \label{GBMModifications} + +This section explores the variations of the previous algorithms that have the potential to improve their predictive performance and interpretability. In particular, by controlling the optimization speed or learning rate, introducing low-variance regression methods, and applying ideas from robust regression we can produce non-parametric regression procedures with many desirable properties. As a by-product some of these modifications lead directly into implementations for learning from massive datasets. All these methods take advantage of the general form of boosting +\begin{equation} +\hat f(\mathbf{x}) \leftarrow \hat f(\mathbf{x}) + \EV(z(y,\hat f(\mathbf{x}))|\mathbf{x}). +\end{equation} So far we have taken advantage of this form only by substituting in our favorite regression procedure for $\EV_w(z|\mathbf{x})$. I will discuss some modifications to estimating $\EV_w(z|\mathbf{x})$ that have the potential to improve our algorithm. + +\subsection{Decreasing the learning rate} As several authors have phrased slightly differently, ``...boosting, whatever flavor, seldom seems to overfit, no matter how many terms are included in the additive expansion''. This is not true as the discussion to \cite{FHT:2000} points out. + +In the update step of any boosting algorithm we can introduce a learning rate to dampen the proposed move. +\begin{equation} +\label{eq:shrinkage} +\hat f(\mathbf{x}) \leftarrow \hat f(\mathbf{x}) + \lambda \EV(z(y,\hat f(\mathbf{x}))|\mathbf{x}). +\end{equation} +By multiplying the gradient step by $\lambda$ as in equation~\ref{eq:shrinkage} we have control on the rate at which the boosting algorithm descends the error surface (or ascends the likelihood surface). When $\lambda=1$ we return to performing full gradient steps. Friedman (2001) relates the learning rate to regularization through shrinkage. + +The optimal number of iterations, $T$, and the learning rate, $\lambda$, depend on each other. 
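Because the two depend on each other, a quick empirical check can help. The sketch below is illustrative only (simulated data, arbitrary settings chosen for this example), and the exact iteration counts will differ from run to run:

``` r
library(gbm)
set.seed(102)

# Illustrative data; any regression data set would do
N <- 2000
x <- matrix(runif(N * 3), ncol = 3)
y <- x[, 1] + 2 * x[, 2]^2 + rnorm(N, sd = 0.2)
dat <- data.frame(y, x)

# Same model at two learning rates; the slower learner is given more trees
fit_fast <- gbm(y ~ ., data = dat, distribution = "gaussian",
                shrinkage = 0.1,  n.trees = 500,  cv.folds = 5)
fit_slow <- gbm(y ~ ., data = dat, distribution = "gaussian",
                shrinkage = 0.01, n.trees = 5000, cv.folds = 5)

# Cross-validated estimates of the optimal T; the slower learner typically
# needs roughly (though not exactly) ten times as many iterations
gbm.perf(fit_fast, method = "cv", plot.it = FALSE)
gbm.perf(fit_slow, method = "cv", plot.it = FALSE)
```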
In practice I set $\lambda$ to be as small as possible and then select $T$ by cross-validation. Performance is best when $\lambda$ is as small as possible, with decreasing marginal utility for smaller and smaller $\lambda$. Slower learning rates do not necessarily scale the number of optimal iterations. That is, if the optimal $T$ is 100 iterations when $\lambda=1.0$, this does {\it not} necessarily imply that the optimal $T$ is 1000 iterations when $\lambda=0.1$. 
+ 
+\subsection{Variance reduction using subsampling} 
+ 
+Friedman (2002) proposed the stochastic gradient boosting algorithm that simply samples uniformly without replacement from the dataset before estimating the next gradient step. He found that this additional step greatly improved performance. We estimate the regression $\EV(z(y,\hat f(\mathbf{x}))|\mathbf{x})$ using a random subsample of the dataset. 
+ 
+\subsection{ANOVA decomposition} 
+ 
+Certain function approximation methods are decomposable in terms of a ``functional ANOVA decomposition''. That is, a function is decomposable as 
+\begin{equation} 
+\label{ANOVAdecomp} 
+f(\mathbf{x}) = \sum_j f_j(x_j) + \sum_{jk} f_{jk}(x_j,x_k) + \sum_{jk\ell} f_{jk\ell}(x_j,x_k,x_\ell) + \cdots. 
+\end{equation} This applies to boosted trees. Regression stumps (one-split decision trees) depend on only one variable and fall into the first term of \ref{ANOVAdecomp}. Trees with two splits fall into the second term of \ref{ANOVAdecomp} and so on. By restricting the depth of the trees produced on each boosting iteration we can control the order of approximation. Often additive components are sufficient to approximate a multivariate function well; generalized additive models, the na\"{\i}ve Bayes classifier, and boosted stumps are examples. When the approximation is restricted to a first order we can also produce plots of $x_j$ versus $f_j(x_j)$ to demonstrate how changes in $x_j$ might affect changes in the response variable. 
+ 
+\subsection{Relative influence} Friedman (2001) also develops an extension of a variable's ``relative influence'' for boosted estimates. For tree-based methods the approximate relative influence of a variable $x_j$ is 
+\begin{equation} 
+\label{RelInfluence} 
+\hspace{0.5in} 
+\hat J_j^2 = \hspace{-0.1in}\sum_{\mathrm{splits~on~}x_j}\hspace{-0.2in}I_t^2 
+\end{equation} where $I_t^2$ is the empirical improvement by splitting on $x_j$ at that point. Friedman's extension to boosted models is to average the relative influence of variable $x_j$ across all the trees generated by the boosting algorithm. 
+ 
+\begin{figure} 
+\aRule 
+Select 
+\begin{itemize} 
+\item a loss function (\texttt{distribution}) 
+\item the number of iterations, $T$ (\texttt{n.trees}) 
+\item the depth of each tree, $K$ (\texttt{interaction.depth}) 
+\item the shrinkage (or learning rate) parameter, $\lambda$ (\texttt{shrinkage}) 
+\item the subsampling rate, $p$ (\texttt{bag.fraction}) 
+\end{itemize} 
+Initialize $\hat f(\mathbf{x})$ to be a constant, $\hat f(\mathbf{x}) = \arg \min_{\rho} \sum_{i=1}^N \Psi(y_i,\rho)$ \\ 
+For $t$ in $1,\ldots,T$ do 
+\begin{enumerate} 
+\item Compute the negative gradient as the working response 
+ \begin{equation} 
+ z_i = -\frac{\partial}{\partial f(\mathbf{x}_i)} \Psi(y_i,f(\mathbf{x}_i)) \mbox{\Huge $|$}_{f(\mathbf{x}_i)=\hat f(\mathbf{x}_i)} 
+ \end{equation} 
+\item Randomly select $p\times N$ cases from the dataset 
+\item Fit a regression tree with $K$ terminal nodes, $g(\mathbf{x})=\EV(z|\mathbf{x})$. 
This tree is fit using only those randomly selected observations +\item Compute the optimal terminal node predictions, $\rho_1,\ldots,\rho_K$, as + \begin{equation} + \rho_k = \arg \min_{\rho} \sum_{\mathbf{x}_i\in S_k} \Psi(y_i,\hat f(\mathbf{x}_i)+\rho) + \end{equation} +where $S_k$ is the set of $\mathbf{x}$s that define terminal node $k$. Again this step uses only the randomly selected observations. +\item Update $\hat f(\mathbf{x})$ as + \begin{equation} + \hat f(\mathbf{x}) \leftarrow \hat f(\mathbf{x}) + \lambda\rho_{k(\mathbf{x})} + \end{equation} +where $k(\mathbf{x})$ indicates the index of the terminal node into which an observation with features $\mathbf{x}$ would fall. +\end{enumerate} +\aRule +\caption{Boosting as implemented in \texttt{gbm()}} +\label{fig:gbm} +\end{figure} + +\section{Common user options} + +This section discusses the options to gbm that most users will need to change or tune. + +\subsection{Loss function} + +The first and foremost choice is \texttt{distribution}. This should be easily dictated by the application. For most classification problems either \texttt{bernoulli} or \texttt{adaboost} will be appropriate, the former being recommended. For continuous outcomes the choices are \texttt{gaussian} (for minimizing squared error), \texttt{laplace} (for minimizing absolute error), and quantile regression (for estimating percentiles of the conditional distribution of the outcome). Censored survival outcomes should require \texttt{coxph}. Count outcomes may use \texttt{poisson} although one might also consider \texttt{gaussian} or \texttt{laplace} depending on the analytical goals. + +\subsection{The relationship between shrinkage and number of iterations} The issues that most new users of gbm struggle with are the choice of \texttt{n.trees} and \texttt{shrinkage}. It is important to know that smaller values of \texttt{shrinkage} (almost) always give improved predictive performance. That is, setting \texttt{shrinkage=0.001} will almost certainly result in a model with better out-of-sample predictive performance than setting \texttt{shrinkage=0.01}. However, there are computational costs, both storage and CPU time, associated with setting \texttt{shrinkage} to be low. The model with \texttt{shrinkage=0.001} will likely require ten times as many iterations as the model with \texttt{shrinkage=0.01}, increasing storage and computation time by a factor of 10. Figure~\ref{fig:shrinkViters} shows the relationship between predictive performance, the number of iterations, and the shrinkage parameter. Note that the increase in the optimal number of iterations between two choices for shrinkage is roughly equal to the ratio of the shrinkage parameters. It is generally the case that for small shrinkage parameters, 0.001 for example, there is a fairly long plateau in which predictive performance is at its best. My rule of thumb is to set \texttt{shrinkage} as small as possible while still being able to fit the model in a reasonable amount of time and storage. I usually aim for 3,000 to 10,000 iterations with shrinkage rates between 0.01 and 0.001. + +\begin{figure}[ht] \begin{center} \includegraphics[width=5in]{shrinkage-v-iterations} \end{center} \caption{Out-of-sample predictive performance by number of iterations and shrinkage. 
Smaller values of the shrinkage parameter offer improved predictive performance, but with decreasing marginal improvement.} \label{fig:shrinkViters} \end{figure} 
+ 
+\subsection{Estimating the optimal number of iterations} gbm offers three methods for estimating the optimal number of iterations after the gbm model has been fit: an independent test set (\texttt{test}), out-of-bag estimation (\texttt{OOB}), and $v$-fold cross validation (\texttt{cv}). The function \texttt{gbm.perf} computes the iteration estimate. 
+ 
+Like Friedman's MART software, the independent test set method uses a single holdout test set to select the optimal number of iterations. If \texttt{train.fraction} is set to be less than 1, then only the \textit{first} \texttt{train.fraction}$\times$\texttt{nrow(data)} observations will be used to fit the model. Note that if the data are sorted in a systematic way (such as cases for which $y=1$ come first), then the data should be shuffled before running gbm. Those observations not used in the model fit can be used to get an unbiased estimate of the optimal number of iterations. The downside of this method is that a considerable number of observations are used to estimate the single regularization parameter (number of iterations), leaving a reduced dataset for estimating the entire multivariate model structure. Use \texttt{gbm.perf(...,method="test")} to obtain an estimate of the optimal number of iterations using the held out test set. 
+ 
+If \texttt{bag.fraction} is set to be greater than 0 (0.5 is recommended), gbm computes an out-of-bag estimate of the improvement in predictive performance. It evaluates the reduction in deviance on those observations not used in selecting the next regression tree. The out-of-bag estimator underestimates the reduction in deviance. As a result, it almost always is too conservative in its selection of the optimal number of iterations. The motivation behind this method was to avoid having to set aside a large independent dataset, which reduces the information available for learning the model structure. Use \texttt{gbm.perf(...,method="OOB")} to obtain the OOB estimate. 
+ 
+Lastly, gbm offers $v$-fold cross validation for estimating the optimal number of iterations. If \texttt{cv.folds=5} when fitting the gbm model, then gbm will do 5-fold cross validation. gbm will fit five gbm models in order to compute the cross validation error estimate and then will fit a sixth and final gbm model with \texttt{n.trees} iterations using all of the data. The returned model object will have a component labeled \texttt{cv.error}. Note that \texttt{gbm.more} will do additional gbm iterations but will not add to the \texttt{cv.error} component. Use \texttt{gbm.perf(...,method="cv")} to obtain the cross validation estimate. 
+ 
+\begin{figure}[ht] 
+\begin{center} 
+\includegraphics[width=5in]{oobperf2} 
+\end{center} 
+\caption{Out-of-sample predictive performance of four methods of selecting the optimal number of iterations. The vertical axis plots performance relative to the best. The boxplots indicate relative performance across thirteen real datasets from the UCI repository. See \texttt{demo(OOB-reps)}.} 
+\label{fig:oobperf} 
+\end{figure} 
+ 
+Figure~\ref{fig:oobperf} compares the three methods for estimating the optimal number of iterations across 13 datasets. The boxplots show the methods' performance relative to the best method on that dataset. For most datasets the methods perform similarly; however, 5-fold cross validation is consistently the best of them. 
OOB, using a 33\% test set, and using a 20\% test set all have datasets for which the perform considerably worse than the best method. My recommendation is to use 5- or 10-fold cross validation if you can afford the computing time. Otherwise you may choose among the other options, knowing that OOB is conservative. + +\section{Available distributions} + +This section gives some of the mathematical detail for each of the distribution options that gbm offers. The gbm engine written in C++ has access to a C++ class for each of these distributions. Each class contains methods for computing the associated deviance, initial value, the gradient, and the constants to predict in each terminal node. + +In the equations shown below, for non-zero offset terms, replace $f(\mathbf{x}_i)$ with $o_i + f(\mathbf{x}_i)$. + +\subsection{Gaussian} + +\begin{tabular}{ll} +Deviance & $\displaystyle \frac{1}{\sum w_i} \sum w_i(y_i-f(\mathbf{x}_i))^2$ \\ +Initial value & $\displaystyle f(\mathbf{x})=\frac{\sum w_i(y_i-o_i)}{\sum w_i}$ \\ +Gradient & $z_i=y_i - f(\mathbf{x}_i)$ \\ +Terminal node estimates & $\displaystyle \frac{\sum w_i(y_i-f(\mathbf{x}_i))}{\sum w_i}$ +\end{tabular} + +\subsection{AdaBoost} + +\begin{tabular}{ll} Deviance & $\displaystyle \frac{1}{\sum w_i} \sum w_i\exp(-(2y_i-1)f(\mathbf{x}_i))$ \\ Initial value & $\displaystyle \frac{1}{2}\log\frac{\sum y_iw_ie^{-o_i}}{\sum (1-y_i)w_ie^{o_i}}$ \\ Gradient & $\displaystyle z_i= -(2y_i-1)\exp(-(2y_i-1)f(\mathbf{x}_i))$ \\ Terminal node estimates & $\displaystyle \frac{\sum (2y_i-1)w_i\exp(-(2y_i-1)f(\mathbf{x}_i))} + {\sum w_i\exp(-(2y_i-1)f(\mathbf{x}_i))}$ +\end{tabular} + +\subsection{Bernoulli} + +\begin{tabular}{ll} Deviance & $\displaystyle -2\frac{1}{\sum w_i} \sum w_i(y_if(\mathbf{x}_i)-\log(1+\exp(f(\mathbf{x}_i))))$ \\ Initial value & $\displaystyle \log\frac{\sum w_iy_i}{\sum w_i(1-y_i)}$ \\ Gradient & $\displaystyle z_i=y_i-\frac{1}{1+\exp(-f(\mathbf{x}_i))}$ \\ Terminal node estimates & $\displaystyle \frac{\sum w_i(y_i-p_i)}{\sum w_ip_i(1-p_i)}$ \\ + & where $\displaystyle p_i = \frac{1}{1+\exp(-f(\mathbf{x}_i))}$ \\ +\end{tabular} + +Notes: \begin{itemize} \item For non-zero offset terms, the computation of the initial value requires Newton-Raphson. Initialize $f_0=0$ and iterate $\displaystyle f_0 \leftarrow f_0 + \frac{\sum w_i(y_i-p_i)}{\sum w_ip_i(1-p_i)}$ where $\displaystyle p_i = \frac{1}{1+\exp(-(o_i+f_0))}$. \end{itemize} + +\subsection{Laplace} + +\begin{tabular}{ll} Deviance & $\frac{1}{\sum w_i} \sum w_i|y_i-f(\mathbf{x}_i)|$ \\ Initial value & $\mbox{median}_w(y)$ \\ Gradient & $z_i=\mbox{sign}(y_i-f(\mathbf{x}_i))$ \\ Terminal node estimates & $\mbox{median}_w(z)$ \end{tabular} + +Notes: \begin{itemize} \item $\mbox{median}_w(y)$ denotes the weighted median, defined as the solution to the equation $\frac{\sum w_iI(y_i\leq m)}{\sum w_i}=\frac{1}{2}$ \item \texttt{gbm()} currently does not implement the weighted median and issues a warning when the user uses weighted data with \texttt{distribution="laplace"}. \end{itemize} + + +\subsection{Quantile regression} + +Contributed by Brian Kriegler (see \cite{Kriegler:2010}). + +\begin{tabular}{ll} Deviance & $\frac{1}{\sum w_i} + \left(\alpha\sum_{y_i>f(\mathbf{x}_i)} w_i(y_i-f(\mathbf{x}_i))\right. 
+$ \\ + & \hspace{0.5in}$\left.(1-\alpha)\sum_{y_i\leq f(\mathbf{x}_i)} w_i(f(\mathbf{x}_i)-y_i)\right)$ \\ +Initial value & $\mathrm{quantile}^{(\alpha)}_w(y)$ \\ Gradient & $z_i=\alpha I(y_i>f(\mathbf{x}_i))-(1-\alpha)I(y_i\leq f(\mathbf{x}_i))$ \\ Terminal node estimates & $\mathrm{quantile}^{(\alpha)}_w(z)$ \end{tabular} + +Notes: \begin{itemize} \item $\mathrm{quantile}^{(\alpha)}_w(y)$ denotes the weighted quantile, defined as the solution to the equation $\frac{\sum w_iI(y_i\leq q)}{\sum w_i}=\alpha$ \item \texttt{gbm()} currently does not implement the weighted median and issues a warning when the user uses weighted data with \texttt{distribution=list(name="quantile")}. \end{itemize} + + +\subsection{Cox Proportional Hazard} + +\begin{tabular}{ll} Deviance & $-2\sum w_i(\delta_i(f(\mathbf{x}_i)-\log(R_i/w_i)))$\\ Gradient & $\displaystyle z_i=\delta_i - \sum_j \delta_j + \frac{w_jI(t_i\geq t_j)e^{f(\mathbf{x}_i)}} + {\sum_k w_kI(t_k\geq t_j)e^{f(\mathbf{x}_k)}}$ \\ +Initial value & 0 \\ Terminal node estimates & Newton-Raphson algorithm \end{tabular} + +\begin{enumerate} + \item Initialize the terminal node predictions to 0, $\mathgbf{\rho}=0$ + \item Let $\displaystyle + p_i^{(k)}=\frac{\sum_j I(k(j)=k)I(t_j\geq t_i)e^{f(\mathbf{x}_i)+\rho_k}} + {\sum_j I(t_j\geq t_i)e^{f(\mathbf{x}_i)+\rho_k}}$ + \item Let $g_k=\sum w_i\delta_i\left(I(k(i)=k)-p_i^{(k)}\right)$ + \item Let $\mathbf{H}$ be a $k\times k$ matrix with diagonal elements + \begin{enumerate} + \item Set diagonal elements $H_{mm}=\sum w_i\delta_i p_i^{(m)}\left(1-p_i^{(m)}\right)$ + \item Set off diagonal elements $H_{mn}=-\sum w_i\delta_i p_i^{(m)}p_i^{(n)}$ + \end{enumerate} + \item Newton-Raphson update $\mathgbf{\rho} \leftarrow \mathgbf{\rho} - \mathbf{H}^{-1}\mathbf{g}$ + \item Return to step 2 until convergence +\end{enumerate} + +Notes: +\begin{itemize} +\item $t_i$ is the survival time and $\delta_i$ is the death indicator. +\item $R_i$ denotes the hazard for the risk set, $R_i=\sum_{j=1}^N w_jI(t_j\geq t_i)e^{f(\mathbf{x}_i)}$ +\item $k(i)$ indexes the terminal node of observation $i$ +\item For speed, \texttt{gbm()} does only one step of the Newton-Raphson algorithm rather than iterating to convergence. No appreciable loss of accuracy since the next boosting iteration will simply correct for the prior iterations inadequacy. +\item \texttt{gbm()} initially sorts the data by survival time. Doing this reduces the computation of the risk set from $O(n^2)$ to $O(n)$ at the cost of a single up front sort on survival time. After the model is fit, the data are then put back in their original order. +\end{itemize} + +\subsection{Poisson} +\begin{tabular}{ll} +Deviance & -2$\frac{1}{\sum w_i} \sum w_i(y_if(\mathbf{x}_i)-\exp(f(\mathbf{x}_i)))$ \\ +Initial value & $\displaystyle f(\mathbf{x})= \log\left(\frac{\sum w_iy_i}{\sum w_ie^{o_i}}\right)$ \\ +Gradient & $z_i=y_i - \exp(f(\mathbf{x}_i))$ \\ +Terminal node estimates & $\displaystyle \log\frac{\sum w_iy_i}{\sum w_i\exp(f(\mathbf{x}_i))}$ +\end{tabular} + +The Poisson class includes special safeguards so that the most extreme predicted values are $e^{-19}$ and $e^{+19}$. This behavior is consistent with \texttt{glm()}. + +\subsection{Pairwise} + +This distribution implements ranking measures following the +\emph{LambdaMart} algorithm \cite{Burges:2010}. Instances belong to +\emph{groups}; all pairs of items with different labels, belonging to +the same group, are used for training. 
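A hedged sketch of how such grouped data might be passed to \texttt{gbm()} is shown below; the column names (`query`, `relevance`, `feat1`, `feat2`), the simulated labels, and the tuning values are made-up placeholders for illustration, not anything mandated by the package:

``` r
library(gbm)
set.seed(103)

# Made-up learning-to-rank data: one row per (query, document) pair
n <- 1000
train <- data.frame(
  query     = factor(sample(1:50, n, replace = TRUE)),  # group identifier
  relevance = sample(0:1, n, replace = TRUE),           # binary label
  feat1     = runif(n),
  feat2     = runif(n)
)

# Grouped ranking fit; metric can be "conc", "mrr", "map", or "ndcg"
fit <- gbm(relevance ~ feat1 + feat2, data = train,
           distribution = list(name     = "pairwise",
                               group    = "query",   # column(s) defining groups
                               metric   = "ndcg",
                               max.rank = 5),        # optional cut-off rank
           n.trees = 500, shrinkage = 0.05)
```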
In \emph{Information Retrieval} +applications, groups correspond to user queries, +and items to (feature vectors of) documents in the associated match +set to be ranked. + +For consistency with typical usage, our goal is to \emph{maximize} one +of the \emph{utility} functions listed below. Consider a group with +instances $x_1, \dots, x_n$, ordered such that $f(x_1) \geq f(x_2) +\geq \dots f(x_n)$; i.e., the \emph{rank} of $x_i$ is $i$, where +smaller ranks are preferable. Let $P$ be the set of all ordered pairs +such that $y_i > y_j$. + +\begin{enumerate} +\item[{\bf Concordance:}] Fraction of concordant (i.e, correctly ordered) + pairs. For the special case of binary labels, this is equivalent to + the Area under the ROC Curve. +$$\left\{ \begin{array}{l l}\frac{\|\{(i,j)\in P | + f(x_i)>f(x_j)\}\|}{\|P\|} + & P \neq \emptyset\\ + 0 & \mbox{otherwise.} + \end{array}\right. +$$ +\item[{\bf MRR:}] Mean reciprocal rank of the highest-ranked positive + instance (it is assumed $y_i\in\{0,1\}$): +$$\left\{ \begin{array}{l l}\frac{1}{\min\{1 \leq i \leq n |y_i=1\}} + & \exists i: \, 1 \leq i \leq n, y_i=1\\ + 0 & \mbox{otherwise.}\end{array}\right.$$ +\item[{\bf MAP:}] Mean average precision, a generalization of + MRR to multiple positive instances: +$$\left\{ \begin{array}{l l} \frac{\sum_{1\leq i\leq n | y_i=1} \|\{1\leq j\leq i + |y_j=1\}\|\,/\,i}{\|\{1\leq i\leq n | y_i=1\}\|} & \exists i: \, + 1 \leq i \leq n, y_i=1\\ + 0 & \mbox{otherwise.}\end{array}\right.$$ +\item[{\bf nDCG:}] Normalized discounted cumulative gain: +$$\frac{\sum_{1\leq i\leq n} \log_2(i+1) \, y_i}{\sum_{1\leq i\leq n} + \log_2(i+1) \, y'_i},$$ where $y'_1, \dots, y'_n$ is a reordering of $y_1, + \dots,y_n$ with $y'_1 \geq y'_2 \geq \dots \geq y'_n$. +\end{enumerate} + +The generalization to multiple (possibly weighted) groups is +straightforward. Sometimes a cut-off rank $k$ is given for \emph{MRR} +and \emph{nDCG}, in which case we replace the outer index $n$ by +$\min(n,k)$. + +The initial value for $f(x_i)$ is always zero. We derive the gradient of +a cost function whose gradient locally approximates the gradient of +the IR measure for a fixed ranking: + +\begin{eqnarray*} +\Phi & = & \sum_{(i,j) \in P} \Phi_{ij}\\ + & = & \sum_{(i,j) \in P} |\Delta Z_{ij}| \log \left( 1 + e^{-(f(x_i) - + f(x_j))}\right), +\end{eqnarray*} +where $|\Delta Z_{ij}|$ is the absolute utility difference when +swapping the ranks of $i$ and $j$, while leaving all other instances +the same. Define +\begin{eqnarray*} + \lambda_{ij} & = & \frac{\partial\Phi_{ij}}{\partial f(x_i)}\\ + & = & - |\Delta Z_{ij}| \frac{1}{1 + e^{f(x_i) - f(x_j)}}\\ +& = & - |\Delta Z_{ij}| \, \rho_{ij}, +\end{eqnarray*} +with +$$ \rho_{ij} = - \frac{\lambda_{ij }}{|\Delta Z_{ij}|} = \frac{1}{1 + e^{f(x_i) - f(x_j)}}$$ + + For the gradient of $\Phi$ with respect to $f(x_i)$, define +\begin{eqnarray*} +\lambda_i & = & \frac{\partial \Phi}{\partial f(x_i)}\\ +& = & \sum_{j|(i,j) \in P} \lambda_{ij} - \sum_{j|(j,i) \in P} \lambda_{ji}\\ +& = & - \sum_{j|(i,j) \in P} |\Delta Z_{ij}| \, \rho_{ij}\\ +& & \mbox{} + \sum_{j|(j,i) \in P} |\Delta Z_{ji}| \, \rho_{ji}. +\end{eqnarray*} + + The second derivative is +\begin{eqnarray*} + \gamma_i & \stackrel{def}{=} & \frac{\partial^2\Phi}{\partial f(x_i)^2}\\ + & = & \sum_{j|(i,j) \in P} |\Delta Z_{ij}| \, \rho_{ij} \, (1-\rho_{ij})\\ +& & \mbox{} + \sum_{j|(j,i) \in P} |\Delta Z_{ji}| \, \rho_{ji} \, (1-\rho_{ji}). +\end{eqnarray*} + +Now consider again all groups with associated weights. 
For a given terminal node, let $i$ +range over all contained instances. Then its estimate is +$$-\frac{\sum_i v_i\lambda_{i}}{\sum_i v_i \gamma_i},$$ where +$v_i=w(\mbox{\em group}(i))/\|\{(j,k)\in\mbox{\em group}(i)\}\|.$ + +In each iteration, instances are reranked according to the preliminary +scores $f(x_i)$ to determine the $|\Delta Z_{ij}|$. Note that in order +to avoid ranking bias, we break ties by adding a small amount of +random noise. + + + +\bibliography{gbm} + +\end{document} diff --git a/inst/doc/gbm.Sweave b/inst/doc/gbm.Sweave deleted file mode 100644 index ef5c014..0000000 --- a/inst/doc/gbm.Sweave +++ /dev/null @@ -1,391 +0,0 @@ -% setwd("c:/dev/gbm/inst/doc") % Sweave("gbm.rnw"); system("texify gbm.tex"); system("c:\\MiKTeX\\texmf\\miktex\\bin\\yap.exe gbm.dvi",wait=FALSE) - -\documentclass{article} -\bibliographystyle{plain} -\usepackage[active]{srcltx} -\newcommand{\EV}{\mathrm{E}} -\newcommand{\Var}{\mathrm{Var}} -\newcommand{\aRule}{\begin{center} \rule{5in}{1mm} \end{center}} - -\title{Generalized Boosted Models:\\A guide to the gbm package} \author{Greg Ridgeway} - -%\VignetteIndexEntry{Generalized Boosted Models: A guide to the gbm package} - -\newcommand{\mathgbf}[1]{{\mbox{\boldmath$#1$\unboldmath}}} - -\begin{document} - -\maketitle - -Boosting takes on various forms with different programs using different loss functions, different base models, and different optimization schemes. The gbm package takes the approach described in \cite{Friedman:2001} and \cite{Friedman:2002}. Some of the terminology differs, mostly due to an effort to cast boosting terms into more standard statistical terminology (e.g. deviance). In addition, the gbm package implements boosting for models commonly used in statistics but not commonly associated with boosting. The Cox proportional hazard model, for example, is an incredibly useful model and the boosting framework applies quite readily with only slight modification \cite{Ridgeway:1999}. Also some algorithms implemented in the gbm package differ from the standard implementation. The AdaBoost algorithm \cite{FreundSchapire:1997} has a particular loss function and a particular optimization algorithm associated with it. The gbm implementation of AdaBoost adopts AdaBoost's exponential loss function (its bound on misclassification rate) but uses Friedman's gradient descent algorithm rather than the original one proposed. So the main purposes of this document is to spell out in detail what the gbm package implements. - -\section{Gradient boosting} - -This section essentially presents the derivation of boosting described in \cite{Friedman:2001}. The gbm package also adopts the stochastic gradient boosting strategy, a small but important tweak on the basic algorithm, described in \cite{Friedman:2002}. - -\subsection{Friedman's gradient boosting machine} \label{sec:GradientBoostingMachine} - -\begin{figure} -\aRule Initialize $\hat f(\mathbf{x})$ to be a constant, $\hat f(\mathbf{x}) = \arg \min_{\rho} \sum_{i=1}^N \Psi(y_i,\rho)$. \\ -For $t$ in $1,\ldots,T$ do -\begin{enumerate} -\item Compute the negative gradient as the working response - \begin{equation} - z_i = -\frac{\partial}{\partial f(\mathbf{x}_i)} \Psi(y_i,f(\mathbf{x}_i)) \mbox{\Huge $|$}_{f(\mathbf{x}_i)=\hat f(\mathbf{x}_i)} - \end{equation} -\item Fit a regression model, $g(\mathbf{x})$, predicting $z_i$ from the covariates $\mathbf{x}_i$. 
\item Choose a gradient descent step size as - \begin{equation} - \rho = \arg \min_{\rho} \sum_{i=1}^N \Psi(y_i,\hat f(\mathbf{x}_i)+\rho g(\mathbf{x}_i)) - \end{equation} -\item Update the estimate of $f(\mathbf{x})$ as - \begin{equation} - \hat f(\mathbf{x}) \leftarrow \hat f(\mathbf{x}) + \rho g(\mathbf{x}) - \end{equation} -\end{enumerate} \aRule \caption{Friedman's Gradient Boost algorithm} \label{fig:GradientBoost} \end{figure} - -Friedman (2001) and the companion paper Friedman (2002) extended the work of Friedman, Hastie, and Tibshirani (2000) and laid the ground work for a new generation of boosting algorithms. Using the connection between boosting and optimization, this new work proposes the Gradient Boosting Machine. - -In any function estimation problem we wish to find a regression function, $\hat f(\mathbf{x})$, that minimizes the expectation of some loss function, $\Psi(y,f)$, as shown in (\ref{NonparametricRegression1}). - -\begin{eqnarray} -\hspace{0.5in} -\hat f(\mathbf{x}) &=& \arg \min_{f(\mathbf{x})} \EV_{y,\mathbf{x}} \Psi(y,f(\mathbf{x})) \nonumber \\ \label{NonparametricRegression1} -&=& \arg \min_{f(\mathbf{x})} \EV_x \left[ \EV_{y|\mathbf{x}} \Psi(y,f(\mathbf{x})) \Big| \mathbf{x} \right] -\end{eqnarray} - -We will focus on finding estimates of $f(\mathbf{x})$ such that \begin{equation} -\label{NonparametricRegression2} -\hspace{0.5in} -\hat f(\mathbf{x}) = \arg \min_{f(\mathbf{x})} \EV_{y|\mathbf{x}} \left[ \Psi(y,f(\mathbf{x}))|\mathbf{x} \right] -\end{equation} -Parametric regression models assume that $f(\mathbf{x})$ is a function with a finite number of parameters, $\beta$, and estimates them by selecting those values that minimize a loss function (e.g. squared error loss) over a training sample of $N$ observations on $(y,\mathbf{x})$ pairs as in (\ref{eq:Friedman1}). -\begin{equation} -\label{eq:Friedman1} -\hspace{0.5in} -\hat\beta = \arg \min_{\beta} \sum_{i=1}^N \Psi(y_i,f(\mathbf{x}_i;\beta)) -\end{equation} -When we wish to estimate $f(\mathbf{x})$ non-parametrically the task becomes more difficult. Again we can proceed similarly to \cite{FHT:2000} and modify our current estimate of $f(\mathbf{x})$ by adding a new function $f(\mathbf{x})$ in a greedy fashion. Letting $f_i = f(\mathbf{x}_i)$, we see that we want to decrease the $N$ dimensional function -\begin{eqnarray} -\label{EQ:Friedman2} -\hspace{0.5in} -J(\mathbf{f}) &=& \sum_{i=1}^N \Psi(y_i,f(\mathbf{x}_i)) \nonumber \\ - &=& \sum_{i=1}^N \Psi(y_i,F_i). -\end{eqnarray} -The negative gradient of $J(\mathbf{f})$ indicates the direction of the locally greatest decrease in $J(\mathbf{f})$. Gradient descent would then have us modify $\mathbf{f}$ as -\begin{equation} -\label{eq:Friedman3} -\hspace{0.5in} -\hat \mathbf{f} \leftarrow \hat \mathbf{f} - \rho \nabla J(\mathbf{f}) -\end{equation} -where $\rho$ is the size of the step along the direction of greatest descent. Clearly, this step alone is far from our desired goal. First, it only fits $f$ at values of $\mathbf{x}$ for which we have observations. Second, it does not take into account that observations with similar $\mathbf{x}$ are likely to have similar values of $f(\mathbf{x})$. Both these problems would have disastrous effects on generalization error. However, Friedman suggests selecting a class of functions that use the covariate information to approximate the gradient, usually a regression tree. This line of reasoning produces his Gradient Boosting algorithm shown in Figure~\ref{fig:GradientBoost}. 
At each iteration the algorithm determines the direction, the gradient, in which it needs to improve the fit to the data and selects a particular model from the allowable class of functions that is in most agreement with the direction. In the case of squared-error loss, $\Psi(y_i,f(\mathbf{x}_i)) = \sum_{i=1}^N (y_i-f(\mathbf{x}_i))^2$, this algorithm corresponds exactly to residual fitting. - -There are various ways to extend and improve upon the basic framework suggested in Figure~\ref{fig:GradientBoost}. For example, Friedman (2001) substituted several choices in for $\Psi$ to develop new boosting algorithms for robust regression with least absolute deviation and Huber loss functions. Friedman (2002) showed that a simple subsampling trick can greatly improve predictive performance while simultaneously reduce computation time. Section~\ref{GBMModifications} discusses some of these modifications. - -\section{Improving boosting methods using control of the learning rate, sub-sampling, and a decomposition for interpretation} \label{GBMModifications} - -This section explores the variations of the previous algorithms that have the potential to improve their predictive performance and interpretability. In particular, by controlling the optimization speed or learning rate, introducing low-variance regression methods, and applying ideas from robust regression we can produce non-parametric regression procedures with many desirable properties. As a by-product some of these modifications lead directly into implementations for learning from massive datasets. All these methods take advantage of the general form of boosting -\begin{equation} -\hat f(\mathbf{x}) \leftarrow \hat f(\mathbf{x}) + \EV(z(y,\hat f(\mathbf{x}))|\mathbf{x}). -\end{equation} So far we have taken advantage of this form only by substituting in our favorite regression procedure for $\EV_w(z|\mathbf{x})$. I will discuss some modifications to estimating $\EV_w(z|\mathbf{x})$ that have the potential to improve our algorithm. - -\subsection{Decreasing the learning rate} As several authors have phrased slightly differently, ``...boosting, whatever flavor, seldom seems to overfit, no matter how many terms are included in the additive expansion''. This is not true as the discussion to \cite{FHT:2000} points out. - -In the update step of any boosting algorithm we can introduce a learning rate to dampen the proposed move. -\begin{equation} -\label{eq:shrinkage} -\hat f(\mathbf{x}) \leftarrow \hat f(\mathbf{x}) + \lambda \EV(z(y,\hat f(\mathbf{x}))|\mathbf{x}). -\end{equation} -By multiplying the gradient step by $\lambda$ as in equation~\ref{eq:shrinkage} we have control on the rate at which the boosting algorithm descends the error surface (or ascends the likelihood surface). When $\lambda=1$ we return to performing full gradient steps. Friedman (2001) relates the learning rate to regularization through shrinkage. - -The optimal number of iterations, $T$, and the learning rate, $\lambda$, depend on each other. In practice I set $\lambda$ to be as small as possible and then select $T$ by cross-validation. Performance is best when $\lambda$ is as small as possible performance with decreasing marginal utility for smaller and smaller $\lambda$. Slower learning rates do not necessarily scale the number of optimal iterations. That is, if when $\lambda=1.0$ and the optimal $T$ is 100 iterations, does {\it not} necessarily imply that when $\lambda=0.1$ the optimal $T$ is 1000 iterations. 
- -\subsection{Variance reduction using subsampling} - -Friedman (2002) proposed the stochastic gradient boosting algorithm that simply samples uniformly without replacement from the dataset before estimating the next gradient step. He found that this additional step greatly improved performance. We estimate the regression $\EV(z(y,\hat f(\mathbf{x}))|\mathbf{x})$ using a random subsample of the dataset. - -\subsection{ANOVA decomposition} - -Certain function approximation methods are decomposable in terms of a ``functional ANOVA decomposition''. That is a function is decomposable as -\begin{equation} -\label{ANOVAdecomp} -f(\mathbf{x}) = \sum_j f_j(x_j) + \sum_{jk} f_{jk}(x_j,x_k) + \sum_{jk\ell} f_{jk\ell}(x_j,x_k,x_\ell) + \cdots. -\end{equation} This applies to boosted trees. Regression stumps (one split decision trees) depend on only one variable and fall into the first term of \ref{ANOVAdecomp}. Trees with two splits fall into the second term of \ref{ANOVAdecomp} and so on. By restricting the depth of the trees produced on each boosting iteration we can control the order of approximation. Often additive components are sufficient to approximate a multivariate function well, generalized additive models, the na\"{\i}ve Bayes classifier, and boosted stumps for example. When the approximation is restricted to a first order we can also produce plots of $x_j$ versus $f_j(x_j)$ to demonstrate how changes in $x_j$ might affect changes in the response variable. - -\subsection{Relative influence} Friedman (2001) also develops an extension of a variable's ``relative influence'' for boosted estimates. For tree based methods the approximate relative influence of a variable $x_j$ is -\begin{equation} -\label{RelInfluence} -\hspace{0.5in} -\hat J_j^2 = \hspace{-0.1in}\sum_{\mathrm{splits~on~}x_j}\hspace{-0.2in}I_t^2 -\end{equation} where $I_t^2$ is the empirical improvement by splitting on $x_j$ at that point. Friedman's extension to boosted models is to average the relative influence of variable $x_j$ across all the trees generated by the boosting algorithm. - -\begin{figure} -\aRule -Select -\begin{itemize} -\item a loss function (\texttt{distribution}) -\item the number of iterations, $T$ (\texttt{n.trees}) -\item the depth of each tree, $K$ (\texttt{interaction.depth}) -\item the shrinkage (or learning rate) parameter, $\lambda$ (\texttt{shrinkage}) -\item the subsampling rate, $p$ (\texttt{bag.fraction}) -\end{itemize} -Initialize $\hat f(\mathbf{x})$ to be a constant, $\hat f(\mathbf{x}) = \arg \min_{\rho} \sum_{i=1}^N \Psi(y_i,\rho)$ \\ -For $t$ in $1,\ldots,T$ do -\begin{enumerate} -\item Compute the negative gradient as the working response - \begin{equation} - z_i = -\frac{\partial}{\partial f(\mathbf{x}_i)} \Psi(y_i,f(\mathbf{x}_i)) \mbox{\Huge $|$}_{f(\mathbf{x}_i)=\hat f(\mathbf{x}_i)} - \end{equation} -\item Randomly select $p\times N$ cases from the dataset -\item Fit a regression tree with $K$ terminal nodes, $g(\mathbf{x})=\EV(z|\mathbf{x})$. This tree is fit using only those randomly selected observations -\item Compute the optimal terminal node predictions, $\rho_1,\ldots,\rho_K$, as - \begin{equation} - \rho_k = \arg \min_{\rho} \sum_{\mathbf{x}_i\in S_k} \Psi(y_i,\hat f(\mathbf{x}_i)+\rho) - \end{equation} -where $S_k$ is the set of $\mathbf{x}$s that define terminal node $k$. Again this step uses only the randomly selected observations. 
-\item Update $\hat f(\mathbf{x})$ as - \begin{equation} - \hat f(\mathbf{x}) \leftarrow \hat f(\mathbf{x}) + \lambda\rho_{k(\mathbf{x})} - \end{equation} -where $k(\mathbf{x})$ indicates the index of the terminal node into which an observation with features $\mathbf{x}$ would fall. -\end{enumerate} -\aRule -\caption{Boosting as implemented in \texttt{gbm()}} -\label{fig:gbm} -\end{figure} - -\section{Common user options} - -This section discusses the options to gbm that most users will need to change or tune. - -\subsection{Loss function} - -The first and foremost choice is \texttt{distribution}. This should be easily dictated by the application. For most classification problems either \texttt{bernoulli} or \texttt{adaboost} will be appropriate, the former being recommended. For continuous outcomes the choices are \texttt{gaussian} (for minimizing squared error), \texttt{laplace} (for minimizing absolute error), and quantile regression (for estimating percentiles of the conditional distribution of the outcome). Censored survival outcomes should require \texttt{coxph}. Count outcomes may use \texttt{poisson} although one might also consider \texttt{gaussian} or \texttt{laplace} depending on the analytical goals. - -\subsection{The relationship between shrinkage and number of iterations} The issues that most new users of gbm struggle with are the choice of \texttt{n.trees} and \texttt{shrinkage}. It is important to know that smaller values of \texttt{shrinkage} (almost) always give improved predictive performance. That is, setting \texttt{shrinkage=0.001} will almost certainly result in a model with better out-of-sample predictive performance than setting \texttt{shrinkage=0.01}. However, there are computational costs, both storage and CPU time, associated with setting \texttt{shrinkage} to be low. The model with \texttt{shrinkage=0.001} will likely require ten times as many iterations as the model with \texttt{shrinkage=0.01}, increasing storage and computation time by a factor of 10. Figure~\ref{fig:shrinkViters} shows the relationship between predictive performance, the number of iterations, and the shrinkage parameter. Note that the increase in the optimal number of iterations between two choices for shrinkage is roughly equal to the ratio of the shrinkage parameters. It is generally the case that for small shrinkage parameters, 0.001 for example, there is a fairly long plateau in which predictive performance is at its best. My rule of thumb is to set \texttt{shrinkage} as small as possible while still being able to fit the model in a reasonable amount of time and storage. I usually aim for 3,000 to 10,000 iterations with shrinkage rates between 0.01 and 0.001. - -\begin{figure}[ht] \begin{center} \includegraphics[width=5in]{shrinkage-v-iterations} \end{center} \caption{Out-of-sample predictive performance by number of iterations and shrinkage. Smaller values of the shrinkage parameter offer improved predictive performance, but with decreasing marginal improvement.} \label{fig:shrinkViters} \end{figure} - -\subsection{Estimating the optimal number of iterations} gbm offers three methods for estimating the optimal number of iterations after the gbm model has been fit, an independent test set (\texttt{test}), out-of-bag estimation (\texttt{OOB}), and $v$-fold cross validation (\texttt{cv}). The function \texttt{gbm.perf} computes the iteration estimate. 
- -Like Friedman's MART software, the independent test set method uses a single holdout test set to select the optimal number of iterations. If \texttt{train.fraction} is set to be less than 1, then only the \textit{first} \texttt{train.fraction}$\times$\texttt{nrow(data)} will be used to fit the model. Note that if the data are sorted in a systematic way (such as cases for which $y=1$ come first), then the data should be shuffled before running gbm. Those observations not used in the model fit can be used to get an unbiased estimate of the optimal number of iterations. The downside of this method is that a considerable number of observations are used to estimate the single regularization parameter (number of iterations) leaving a reduced dataset for estimating the entire multivariate model structure. Use \texttt{gbm.perf(...,method="test")} to obtain an estimate of the optimal number of iterations using the held out test set. - -If \texttt{bag.fraction} is set to be greater than 0 (0.5 is recommended), gbm computes an out-of-bag estimate of the improvement in predictive performance. It evaluates the reduction in deviance on those observations not used in selecting the next regression tree. The out-of-bag estimator underestimates the reduction in deviance. As a result, it almost always is too conservative in its selection for the optimal number of iterations. The motivation behind this method was to avoid having to set aside a large independent dataset, which reduces the information available for learning the model structure. Use \texttt{gbm.perf(...,method="OOB")} to obtain the OOB estimate. - -Lastly, gbm offers $v$-fold cross validation for estimating the optimal number of iterations. If when fitting the gbm model, \texttt{cv.folds=5} then gbm will do 5-fold cross validation. gbm will fit five gbm models in order to compute the cross validation error estimate and then will fit a sixth and final gbm model with \texttt{n.trees}iterations using all of the data. The returned model object will have a component labeled \texttt{cv.error}. Note that \texttt{gbm.more} will do additional gbm iterations but will not add to the \texttt{cv.error} component. Use \texttt{gbm.perf(...,method="cv")} to obtain the cross validation estimate. - -\begin{figure}[ht] -\begin{center} -\includegraphics[width=5in]{oobperf2} -\end{center} -\caption{Out-of-sample predictive performance of four methods of selecting the optimal number of iterations. The vertical axis plots performance relative the best. The boxplots indicate relative performance across thirteen real datasets from the UCI repository. See \texttt{demo(OOB-reps)}.} -\label{fig:oobperf} -\end{figure} - -Figure~\ref{fig:oobperf} compares the three methods for estimating the optimal number of iterations across 13 datasets. The boxplots show the methods performance relative to the best method on that dataset. For most datasets the method perform similarly, however, 5-fold cross validation is consistently the best of them. OOB, using a 33\% test set, and using a 20\% test set all have datasets for which the perform considerably worse than the best method. My recommendation is to use 5- or 10-fold cross validation if you can afford the computing time. Otherwise you may choose among the other options, knowing that OOB is conservative. - -\section{Available distributions} - -This section gives some of the mathematical detail for each of the distribution options that gbm offers. 
The gbm engine written in C++ has access to a C++ class for each of these distributions. Each class contains methods for computing the associated deviance, initial value, the gradient, and the constants to predict in each terminal node. - -In the equations shown below, for non-zero offset terms, replace $f(\mathbf{x}_i)$ with $o_i + f(\mathbf{x}_i)$. - -\subsection{Gaussian} - -\begin{tabular}{ll} -Deviance & $\displaystyle \frac{1}{\sum w_i} \sum w_i(y_i-f(\mathbf{x}_i))^2$ \\ -Initial value & $\displaystyle f(\mathbf{x})=\frac{\sum w_i(y_i-o_i)}{\sum w_i}$ \\ -Gradient & $z_i=y_i - f(\mathbf{x}_i)$ \\ -Terminal node estimates & $\displaystyle \frac{\sum w_i(y_i-f(\mathbf{x}_i))}{\sum w_i}$ -\end{tabular} - -\subsection{AdaBoost} - -\begin{tabular}{ll} Deviance & $\displaystyle \frac{1}{\sum w_i} \sum w_i\exp(-(2y_i-1)f(\mathbf{x}_i))$ \\ Initial value & $\displaystyle \frac{1}{2}\log\frac{\sum y_iw_ie^{-o_i}}{\sum (1-y_i)w_ie^{o_i}}$ \\ Gradient & $\displaystyle z_i= -(2y_i-1)\exp(-(2y_i-1)f(\mathbf{x}_i))$ \\ Terminal node estimates & $\displaystyle \frac{\sum (2y_i-1)w_i\exp(-(2y_i-1)f(\mathbf{x}_i))} - {\sum w_i\exp(-(2y_i-1)f(\mathbf{x}_i))}$ -\end{tabular} - -\subsection{Bernoulli} - -\begin{tabular}{ll} Deviance & $\displaystyle -2\frac{1}{\sum w_i} \sum w_i(y_if(\mathbf{x}_i)-\log(1+\exp(f(\mathbf{x}_i))))$ \\ Initial value & $\displaystyle \log\frac{\sum w_iy_i}{\sum w_i(1-y_i)}$ \\ Gradient & $\displaystyle z_i=y_i-\frac{1}{1+\exp(-f(\mathbf{x}_i))}$ \\ Terminal node estimates & $\displaystyle \frac{\sum w_i(y_i-p_i)}{\sum w_ip_i(1-p_i)}$ \\ - & where $\displaystyle p_i = \frac{1}{1+\exp(-f(\mathbf{x}_i))}$ \\ -\end{tabular} - -Notes: \begin{itemize} \item For non-zero offset terms, the computation of the initial value requires Newton-Raphson. Initialize $f_0=0$ and iterate $\displaystyle f_0 \leftarrow f_0 + \frac{\sum w_i(y_i-p_i)}{\sum w_ip_i(1-p_i)}$ where $\displaystyle p_i = \frac{1}{1+\exp(-(o_i+f_0))}$. \end{itemize} - -\subsection{Laplace} - -\begin{tabular}{ll} Deviance & $\frac{1}{\sum w_i} \sum w_i|y_i-f(\mathbf{x}_i)|$ \\ Initial value & $\mbox{median}_w(y)$ \\ Gradient & $z_i=\mbox{sign}(y_i-f(\mathbf{x}_i))$ \\ Terminal node estimates & $\mbox{median}_w(z)$ \end{tabular} - -Notes: \begin{itemize} \item $\mbox{median}_w(y)$ denotes the weighted median, defined as the solution to the equation $\frac{\sum w_iI(y_i\leq m)}{\sum w_i}=\frac{1}{2}$ \item \texttt{gbm()} currently does not implement the weighted median and issues a warning when the user uses weighted data with \texttt{distribution="laplace"}. \end{itemize} - - -\subsection{Quantile regression} - -Contributed by Brian Kriegler (see \cite{Kriegler:2010}). - -\begin{tabular}{ll} Deviance & $\frac{1}{\sum w_i} - \left(\alpha\sum_{y_i>f(\mathbf{x}_i)} w_i(y_i-f(\mathbf{x}_i))\right. +$ \\ - & \hspace{0.5in}$\left.(1-\alpha)\sum_{y_i\leq f(\mathbf{x}_i)} w_i(f(\mathbf{x}_i)-y_i)\right)$ \\ -Initial value & $\mathrm{quantile}^{(\alpha)}_w(y)$ \\ Gradient & $z_i=\alpha I(y_i>f(\mathbf{x}_i))-(1-\alpha)I(y_i\leq f(\mathbf{x}_i))$ \\ Terminal node estimates & $\mathrm{quantile}^{(\alpha)}_w(z)$ \end{tabular} - -Notes: \begin{itemize} \item $\mathrm{quantile}^{(\alpha)}_w(y)$ denotes the weighted quantile, defined as the solution to the equation $\frac{\sum w_iI(y_i\leq q)}{\sum w_i}=\alpha$ \item \texttt{gbm()} currently does not implement the weighted median and issues a warning when the user uses weighted data with \texttt{distribution=list(name="quantile")}. 
\end{itemize} - - -\subsection{Cox Proportional Hazard} - -\begin{tabular}{ll} Deviance & $-2\sum w_i(\delta_i(f(\mathbf{x}_i)-\log(R_i/w_i)))$\\ Gradient & $\displaystyle z_i=\delta_i - \sum_j \delta_j - \frac{w_jI(t_i\geq t_j)e^{f(\mathbf{x}_i)}} - {\sum_k w_kI(t_k\geq t_j)e^{f(\mathbf{x}_k)}}$ \\ -Initial value & 0 \\ Terminal node estimates & Newton-Raphson algorithm \end{tabular} - -\begin{enumerate} - \item Initialize the terminal node predictions to 0, $\mathgbf{\rho}=0$ - \item Let $\displaystyle - p_i^{(k)}=\frac{\sum_j I(k(j)=k)I(t_j\geq t_i)e^{f(\mathbf{x}_i)+\rho_k}} - {\sum_j I(t_j\geq t_i)e^{f(\mathbf{x}_i)+\rho_k}}$ - \item Let $g_k=\sum w_i\delta_i\left(I(k(i)=k)-p_i^{(k)}\right)$ - \item Let $\mathbf{H}$ be a $k\times k$ matrix with diagonal elements - \begin{enumerate} - \item Set diagonal elements $H_{mm}=\sum w_i\delta_i p_i^{(m)}\left(1-p_i^{(m)}\right)$ - \item Set off diagonal elements $H_{mn}=-\sum w_i\delta_i p_i^{(m)}p_i^{(n)}$ - \end{enumerate} - \item Newton-Raphson update $\mathgbf{\rho} \leftarrow \mathgbf{\rho} - \mathbf{H}^{-1}\mathbf{g}$ - \item Return to step 2 until convergence -\end{enumerate} - -Notes: -\begin{itemize} -\item $t_i$ is the survival time and $\delta_i$ is the death indicator. -\item $R_i$ denotes the hazard for the risk set, $R_i=\sum_{j=1}^N w_jI(t_j\geq t_i)e^{f(\mathbf{x}_i)}$ -\item $k(i)$ indexes the terminal node of observation $i$ -\item For speed, \texttt{gbm()} does only one step of the Newton-Raphson algorithm rather than iterating to convergence. No appreciable loss of accuracy since the next boosting iteration will simply correct for the prior iterations inadequacy. -\item \texttt{gbm()} initially sorts the data by survival time. Doing this reduces the computation of the risk set from $O(n^2)$ to $O(n)$ at the cost of a single up front sort on survival time. After the model is fit, the data are then put back in their original order. -\end{itemize} - -\subsection{Poisson} -\begin{tabular}{ll} -Deviance & -2$\frac{1}{\sum w_i} \sum w_i(y_if(\mathbf{x}_i)-\exp(f(\mathbf{x}_i)))$ \\ -Initial value & $\displaystyle f(\mathbf{x})= \log\left(\frac{\sum w_iy_i}{\sum w_ie^{o_i}}\right)$ \\ -Gradient & $z_i=y_i - \exp(f(\mathbf{x}_i))$ \\ -Terminal node estimates & $\displaystyle \log\frac{\sum w_iy_i}{\sum w_i\exp(f(\mathbf{x}_i))}$ -\end{tabular} - -The Poisson class includes special safeguards so that the most extreme predicted values are $e^{-19}$ and $e^{+19}$. This behavior is consistent with \texttt{glm()}. - -\subsection{Pairwise} - -This distribution implements ranking measures following the -\emph{LambdaMart} algorithm \cite{Burges:2010}. Instances belong to -\emph{groups}; all pairs of items with different labels, belonging to -the same group, are used for training. In \emph{Information Retrieval} -applications, groups correspond to user queries, -and items to (feature vectors of) documents in the associated match -set to be ranked. - -For consistency with typical usage, our goal is to \emph{maximize} one -of the \emph{utility} functions listed below. Consider a group with -instances $x_1, \dots, x_n$, ordered such that $f(x_1) \geq f(x_2) -\geq \dots f(x_n)$; i.e., the \emph{rank} of $x_i$ is $i$, where -smaller ranks are preferable. Let $P$ be the set of all ordered pairs -such that $y_i > y_j$. - -\begin{enumerate} -\item[{\bf Concordance:}] Fraction of concordant (i.e, correctly ordered) - pairs. For the special case of binary labels, this is equivalent to - the Area under the ROC Curve. 
-$$\left\{ \begin{array}{l l}\frac{\|\{(i,j)\in P | - f(x_i)>f(x_j)\}\|}{\|P\|} - & P \neq \emptyset\\ - 0 & \mbox{otherwise.} - \end{array}\right. -$$ -\item[{\bf MRR:}] Mean reciprocal rank of the highest-ranked positive - instance (it is assumed $y_i\in\{0,1\}$): -$$\left\{ \begin{array}{l l}\frac{1}{\min\{1 \leq i \leq n |y_i=1\}} - & \exists i: \, 1 \leq i \leq n, y_i=1\\ - 0 & \mbox{otherwise.}\end{array}\right.$$ -\item[{\bf MAP:}] Mean average precision, a generalization of - MRR to multiple positive instances: -$$\left\{ \begin{array}{l l} \frac{\sum_{1\leq i\leq n | y_i=1} \|\{1\leq j\leq i - |y_j=1\}\|\,/\,i}{\|\{1\leq i\leq n | y_i=1\}\|} & \exists i: \, - 1 \leq i \leq n, y_i=1\\ - 0 & \mbox{otherwise.}\end{array}\right.$$ -\item[{\bf nDCG:}] Normalized discounted cumulative gain: -$$\frac{\sum_{1\leq i\leq n} \log_2(i+1) \, y_i}{\sum_{1\leq i\leq n} - \log_2(i+1) \, y'_i},$$ where $y'_1, \dots, y'_n$ is a reordering of $y_1, - \dots,y_n$ with $y'_1 \geq y'_2 \geq \dots \geq y'_n$. -\end{enumerate} - -The generalization to multiple (possibly weighted) groups is -straightforward. Sometimes a cut-off rank $k$ is given for \emph{MRR} -and \emph{nDCG}, in which case we replace the outer index $n$ by -$\min(n,k)$. - -The initial value for $f(x_i)$ is always zero. We derive the gradient of -a cost function whose gradient locally approximates the gradient of -the IR measure for a fixed ranking: - -\begin{eqnarray*} -\Phi & = & \sum_{(i,j) \in P} \Phi_{ij}\\ - & = & \sum_{(i,j) \in P} |\Delta Z_{ij}| \log \left( 1 + e^{-(f(x_i) - - f(x_j))}\right), -\end{eqnarray*} -where $|\Delta Z_{ij}|$ is the absolute utility difference when -swapping the ranks of $i$ and $j$, while leaving all other instances -the same. Define -\begin{eqnarray*} - \lambda_{ij} & = & \frac{\partial\Phi_{ij}}{\partial f(x_i)}\\ - & = & - |\Delta Z_{ij}| \frac{1}{1 + e^{f(x_i) - f(x_j)}}\\ -& = & - |\Delta Z_{ij}| \, \rho_{ij}, -\end{eqnarray*} -with -$$ \rho_{ij} = - \frac{\lambda_{ij }}{|\Delta Z_{ij}|} = \frac{1}{1 + e^{f(x_i) - f(x_j)}}$$ - - For the gradient of $\Phi$ with respect to $f(x_i)$, define -\begin{eqnarray*} -\lambda_i & = & \frac{\partial \Phi}{\partial f(x_i)}\\ -& = & \sum_{j|(i,j) \in P} \lambda_{ij} - \sum_{j|(j,i) \in P} \lambda_{ji}\\ -& = & - \sum_{j|(i,j) \in P} |\Delta Z_{ij}| \, \rho_{ij}\\ -& & \mbox{} + \sum_{j|(j,i) \in P} |\Delta Z_{ji}| \, \rho_{ji}. -\end{eqnarray*} - - The second derivative is -\begin{eqnarray*} - \gamma_i & \stackrel{def}{=} & \frac{\partial^2\Phi}{\partial f(x_i)^2}\\ - & = & \sum_{j|(i,j) \in P} |\Delta Z_{ij}| \, \rho_{ij} \, (1-\rho_{ij})\\ -& & \mbox{} + \sum_{j|(j,i) \in P} |\Delta Z_{ji}| \, \rho_{ji} \, (1-\rho_{ji}). -\end{eqnarray*} - -Now consider again all groups with associated weights. For a given terminal node, let $i$ -range over all contained instances. Then its estimate is -$$-\frac{\sum_i v_i\lambda_{i}}{\sum_i v_i \gamma_i},$$ where -$v_i=w(\mbox{\em group}(i))/\|\{(j,k)\in\mbox{\em group}(i)\}\|.$ - -In each iteration, instances are reranked according to the preliminary -scores $f(x_i)$ to determine the $|\Delta Z_{ij}|$. Note that in order -to avoid ranking bias, we break ties by adding a small amount of -random noise. - - - -\begin{thebibliography}{77} % start the bibliography - -\small % put the bibliography in a small font - -\bibitem{FreundSchapire:1997} Y. Freund and R.E. Schapire (1997). 
``A decision-theoretic generalization of on-line learning and an application to boosting,'' \textit{Journal of Computer and System Sciences}, 55(1):119-139. - -\bibitem{Friedman:2001} J.H. Friedman (2001). ``Greedy Function Approximation: A Gradient Boosting Machine,'' \textit{Annals of Statistics} 29(5):1189-1232. - -\bibitem{Friedman:2002} J.H. Friedman (2002). ``Stochastic Gradient Boosting,'' \textit{Computational Statistics and Data Analysis} 38(4):367-378. - -\bibitem{FHT:2000} J.H. Friedman, T. Hastie, R. Tibshirani (2000). ``Additive Logistic Regression: a Statistical View of Boosting,'' \textit{Annals of Statistics} 28(2):337-374. - -\bibitem{Kriegler:2010} B. Kriegler and R. Berk (2010). ``Small Area Estimation of the Homeless in Los Angeles, An Application of Cost-Sensitive Stochastic Gradient Boosting,'' \textit{Annals of Applied Statistics} 4(3):1234-1255. - -\bibitem{Ridgeway:1999} G. Ridgeway (1999). ``The state of boosting,'' \textit{Computing Science and Statistics} 31:172-181. - -\bibitem{Burges:2010} C. Burges (2010). ``From RankNet to LambdaRank to LambdaMART: An Overview'', \textit{Microsoft Research Technical Report MSR-TR-2010-82} - -\end{thebibliography} % end the bibliography - -\end{document} diff --git a/inst/doc/gbm.pdf b/inst/doc/gbm.pdf index 40411b5..2292b5a 100644 Binary files a/inst/doc/gbm.pdf and b/inst/doc/gbm.pdf differ diff --git a/inst/doc/gbm.tex b/inst/doc/gbm.tex deleted file mode 100644 index fe3a9a8..0000000 --- a/inst/doc/gbm.tex +++ /dev/null @@ -1,391 +0,0 @@ -% setwd("c:/dev/gbm/inst/doc") % Sweave("gbm.rnw"); system("texify gbm.tex"); system("c:\\MiKTeX\\texmf\\miktex\\bin\\yap.exe gbm.dvi",wait=FALSE) - -\documentclass{article} -\bibliographystyle{plain} -\usepackage[active]{srcltx} -\newcommand{\EV}{\mathrm{E}} -\newcommand{\Var}{\mathrm{Var}} -\newcommand{\aRule}{\begin{center} \rule{5in}{1mm} \end{center}} - -\title{Generalized Boosted Models:\\A guide to the gbm package} \author{Greg Ridgeway} - -%\VignetteIndexEntry{Generalized Boosted Models: A guide to the gbm package} - -\newcommand{\mathgbf}[1]{{\mbox{\boldmath$#1$\unboldmath}}} - -\usepackage{Sweave} -\begin{document} - -\maketitle - -Boosting takes on various forms with different programs using different loss functions, different base models, and different optimization schemes. The gbm package takes the approach described in \cite{Friedman:2001} and \cite{Friedman:2002}. Some of the terminology differs, mostly due to an effort to cast boosting terms into more standard statistical terminology (e.g. deviance). In addition, the gbm package implements boosting for models commonly used in statistics but not commonly associated with boosting. The Cox proportional hazard model, for example, is an incredibly useful model and the boosting framework applies quite readily with only slight modification \cite{Ridgeway:1999}. Also some algorithms implemented in the gbm package differ from the standard implementation. The AdaBoost algorithm \cite{FreundSchapire:1997} has a particular loss function and a particular optimization algorithm associated with it. The gbm implementation of AdaBoost adopts AdaBoost's exponential loss function (its bound on misclassification rate) but uses Friedman's gradient descent algorithm rather than the original one proposed. So the main purposes of this document is to spell out in detail what the gbm package implements. - -\section{Gradient boosting} - -This section essentially presents the derivation of boosting described in \cite{Friedman:2001}. 
The gbm package also adopts the stochastic gradient boosting strategy, a small but important tweak on the basic algorithm, described in \cite{Friedman:2002}. - -\subsection{Friedman's gradient boosting machine} \label{sec:GradientBoostingMachine} - -\begin{figure} -\aRule Initialize $\hat f(\mathbf{x})$ to be a constant, $\hat f(\mathbf{x}) = \arg \min_{\rho} \sum_{i=1}^N \Psi(y_i,\rho)$. \\ -For $t$ in $1,\ldots,T$ do -\begin{enumerate} -\item Compute the negative gradient as the working response - \begin{equation} - z_i = -\frac{\partial}{\partial f(\mathbf{x}_i)} \Psi(y_i,f(\mathbf{x}_i)) \mbox{\Huge $|$}_{f(\mathbf{x}_i)=\hat f(\mathbf{x}_i)} - \end{equation} -\item Fit a regression model, $g(\mathbf{x})$, predicting $z_i$ from the covariates $\mathbf{x}_i$. \item Choose a gradient descent step size as - \begin{equation} - \rho = \arg \min_{\rho} \sum_{i=1}^N \Psi(y_i,\hat f(\mathbf{x}_i)+\rho g(\mathbf{x}_i)) - \end{equation} -\item Update the estimate of $f(\mathbf{x})$ as - \begin{equation} - \hat f(\mathbf{x}) \leftarrow \hat f(\mathbf{x}) + \rho g(\mathbf{x}) - \end{equation} -\end{enumerate} \aRule \caption{Friedman's Gradient Boost algorithm} \label{fig:GradientBoost} \end{figure} - -Friedman (2001) and the companion paper Friedman (2002) extended the work of Friedman, Hastie, and Tibshirani (2000) and laid the ground work for a new generation of boosting algorithms. Using the connection between boosting and optimization, this new work proposes the Gradient Boosting Machine. - -In any function estimation problem we wish to find a regression function, $\hat f(\mathbf{x})$, that minimizes the expectation of some loss function, $\Psi(y,f)$, as shown in (\ref{NonparametricRegression1}). - -\begin{eqnarray} -\hspace{0.5in} -\hat f(\mathbf{x}) &=& \arg \min_{f(\mathbf{x})} \EV_{y,\mathbf{x}} \Psi(y,f(\mathbf{x})) \nonumber \\ \label{NonparametricRegression1} -&=& \arg \min_{f(\mathbf{x})} \EV_x \left[ \EV_{y|\mathbf{x}} \Psi(y,f(\mathbf{x})) \Big| \mathbf{x} \right] -\end{eqnarray} - -We will focus on finding estimates of $f(\mathbf{x})$ such that \begin{equation} -\label{NonparametricRegression2} -\hspace{0.5in} -\hat f(\mathbf{x}) = \arg \min_{f(\mathbf{x})} \EV_{y|\mathbf{x}} \left[ \Psi(y,f(\mathbf{x}))|\mathbf{x} \right] -\end{equation} -Parametric regression models assume that $f(\mathbf{x})$ is a function with a finite number of parameters, $\beta$, and estimates them by selecting those values that minimize a loss function (e.g. squared error loss) over a training sample of $N$ observations on $(y,\mathbf{x})$ pairs as in (\ref{eq:Friedman1}). -\begin{equation} -\label{eq:Friedman1} -\hspace{0.5in} -\hat\beta = \arg \min_{\beta} \sum_{i=1}^N \Psi(y_i,f(\mathbf{x}_i;\beta)) -\end{equation} -When we wish to estimate $f(\mathbf{x})$ non-parametrically the task becomes more difficult. Again we can proceed similarly to \cite{FHT:2000} and modify our current estimate of $f(\mathbf{x})$ by adding a new function $f(\mathbf{x})$ in a greedy fashion. Letting $f_i = f(\mathbf{x}_i)$, we see that we want to decrease the $N$ dimensional function -\begin{eqnarray} -\label{EQ:Friedman2} -\hspace{0.5in} -J(\mathbf{f}) &=& \sum_{i=1}^N \Psi(y_i,f(\mathbf{x}_i)) \nonumber \\ - &=& \sum_{i=1}^N \Psi(y_i,F_i). -\end{eqnarray} -The negative gradient of $J(\mathbf{f})$ indicates the direction of the locally greatest decrease in $J(\mathbf{f})$. 
Gradient descent would then have us modify $\mathbf{f}$ as -\begin{equation} -\label{eq:Friedman3} -\hspace{0.5in} -\hat \mathbf{f} \leftarrow \hat \mathbf{f} - \rho \nabla J(\mathbf{f}) -\end{equation} -where $\rho$ is the size of the step along the direction of greatest descent. Clearly, this step alone is far from our desired goal. First, it only fits $f$ at values of $\mathbf{x}$ for which we have observations. Second, it does not take into account that observations with similar $\mathbf{x}$ are likely to have similar values of $f(\mathbf{x})$. Both these problems would have disastrous effects on generalization error. However, Friedman suggests selecting a class of functions that use the covariate information to approximate the gradient, usually a regression tree. This line of reasoning produces his Gradient Boosting algorithm shown in Figure~\ref{fig:GradientBoost}. At each iteration the algorithm determines the direction, the gradient, in which it needs to improve the fit to the data and selects a particular model from the allowable class of functions that is in most agreement with the direction. In the case of squared-error loss, $\sum_{i=1}^N \Psi(y_i,f(\mathbf{x}_i)) = \sum_{i=1}^N (y_i-f(\mathbf{x}_i))^2$, this algorithm corresponds exactly to residual fitting. - -There are various ways to extend and improve upon the basic framework suggested in Figure~\ref{fig:GradientBoost}. For example, Friedman (2001) substituted several choices for $\Psi$ to develop new boosting algorithms for robust regression with least absolute deviation and Huber loss functions. Friedman (2002) showed that a simple subsampling trick can greatly improve predictive performance while simultaneously reducing computation time. Section~\ref{GBMModifications} discusses some of these modifications. - -\section{Improving boosting methods using control of the learning rate, sub-sampling, and a decomposition for interpretation} \label{GBMModifications} - -This section explores the variations of the previous algorithms that have the potential to improve their predictive performance and interpretability. In particular, by controlling the optimization speed or learning rate, introducing low-variance regression methods, and applying ideas from robust regression we can produce non-parametric regression procedures with many desirable properties. As a by-product some of these modifications lead directly into implementations for learning from massive datasets. All these methods take advantage of the general form of boosting -\begin{equation} -\hat f(\mathbf{x}) \leftarrow \hat f(\mathbf{x}) + \EV(z(y,\hat f(\mathbf{x}))|\mathbf{x}). -\end{equation} So far we have taken advantage of this form only by substituting in our favorite regression procedure for $\EV_w(z|\mathbf{x})$. I will discuss some modifications to estimating $\EV_w(z|\mathbf{x})$ that have the potential to improve our algorithm. - -\subsection{Decreasing the learning rate} As several authors have phrased slightly differently, ``...boosting, whatever flavor, seldom seems to overfit, no matter how many terms are included in the additive expansion''. This is not true, as the discussion of \cite{FHT:2000} points out. - -In the update step of any boosting algorithm we can introduce a learning rate to dampen the proposed move. -\begin{equation} -\label{eq:shrinkage} -\hat f(\mathbf{x}) \leftarrow \hat f(\mathbf{x}) + \lambda \EV(z(y,\hat f(\mathbf{x}))|\mathbf{x}).
-\end{equation} -By multiplying the gradient step by $\lambda$ as in equation~\ref{eq:shrinkage} we have control over the rate at which the boosting algorithm descends the error surface (or ascends the likelihood surface). When $\lambda=1$ we return to performing full gradient steps. Friedman (2001) relates the learning rate to regularization through shrinkage. - -The optimal number of iterations, $T$, and the learning rate, $\lambda$, depend on each other. In practice I set $\lambda$ to be as small as possible and then select $T$ by cross-validation. Performance is best when $\lambda$ is as small as possible, with decreasing marginal utility for smaller and smaller $\lambda$. Slower learning rates do not necessarily scale the optimal number of iterations proportionally. That is, if the optimal $T$ is 100 iterations when $\lambda=1.0$, this does {\it not} necessarily imply that the optimal $T$ is 1000 iterations when $\lambda=0.1$. - -\subsection{Variance reduction using subsampling} - -Friedman (2002) proposed the stochastic gradient boosting algorithm, which simply samples uniformly without replacement from the dataset before estimating the next gradient step. He found that this additional step greatly improved performance. We estimate the regression $\EV(z(y,\hat f(\mathbf{x}))|\mathbf{x})$ using a random subsample of the dataset. - -\subsection{ANOVA decomposition} - -Certain function approximation methods are decomposable in terms of a ``functional ANOVA decomposition''. That is, a function is decomposable as -\begin{equation} -\label{ANOVAdecomp} -f(\mathbf{x}) = \sum_j f_j(x_j) + \sum_{jk} f_{jk}(x_j,x_k) + \sum_{jk\ell} f_{jk\ell}(x_j,x_k,x_\ell) + \cdots. -\end{equation} This applies to boosted trees. Regression stumps (one-split decision trees) depend on only one variable and fall into the first term of \ref{ANOVAdecomp}. Trees with two splits fall into the second term of \ref{ANOVAdecomp} and so on. By restricting the depth of the trees produced on each boosting iteration we can control the order of approximation. Often additive components are sufficient to approximate a multivariate function well; generalized additive models, the na\"{\i}ve Bayes classifier, and boosted stumps are examples. When the approximation is restricted to first order we can also produce plots of $x_j$ versus $f_j(x_j)$ to demonstrate how changes in $x_j$ might affect changes in the response variable. - -\subsection{Relative influence} Friedman (2001) also develops an extension of a variable's ``relative influence'' for boosted estimates. For tree-based methods the approximate relative influence of a variable $x_j$ is -\begin{equation} -\label{RelInfluence} -\hspace{0.5in} -\hat J_j^2 = \hspace{-0.1in}\sum_{\mathrm{splits~on~}x_j}\hspace{-0.2in}I_t^2 -\end{equation} where $I_t^2$ is the empirical improvement by splitting on $x_j$ at that point. Friedman's extension to boosted models is to average the relative influence of variable $x_j$ across all the trees generated by the boosting algorithm.
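To make these modifications concrete, the following is a minimal R sketch of stochastic gradient boosting for squared-error loss with an explicit learning rate and subsampling step. It illustrates the ideas of this section only and is not the \texttt{gbm} implementation; the base learner is an \texttt{rpart} stump, and the simulated data and all object names are hypothetical.

\begin{verbatim}
## Illustrative sketch only, not the gbm engine; data and names are made up.
library(rpart)

set.seed(1)
N   <- 1000
dat <- data.frame(x = runif(N))
dat$y <- sin(4 * dat$x) + rnorm(N, sd = 0.3)

lambda <- 0.05   # learning rate (shrinkage)
p      <- 0.5    # subsampling rate
n.iter <- 200    # number of boosting iterations

f <- rep(mean(dat$y), N)            # initialize to the optimal constant
for (t in seq_len(n.iter)) {
  dat$z <- dat$y - f                # negative gradient = residuals
  idx   <- sample(N, floor(p * N))  # sample uniformly without replacement
  stump <- rpart(z ~ x, data = dat[idx, ],
                 control = rpart.control(maxdepth = 1))
  f <- f + lambda * predict(stump, newdata = dat)  # damped gradient step
}
\end{verbatim}

Setting \texttt{lambda} to 1 recovers full gradient steps, and restricting the base learner to stumps keeps the fit within the additive (first-order) term of the ANOVA decomposition above.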
- -\begin{figure} -\aRule -Select -\begin{itemize} -\item a loss function (\texttt{distribution}) -\item the number of iterations, $T$ (\texttt{n.trees}) -\item the depth of each tree, $K$ (\texttt{interaction.depth}) -\item the shrinkage (or learning rate) parameter, $\lambda$ (\texttt{shrinkage}) -\item the subsampling rate, $p$ (\texttt{bag.fraction}) -\end{itemize} -Initialize $\hat f(\mathbf{x})$ to be a constant, $\hat f(\mathbf{x}) = \arg \min_{\rho} \sum_{i=1}^N \Psi(y_i,\rho)$ \\ -For $t$ in $1,\ldots,T$ do -\begin{enumerate} -\item Compute the negative gradient as the working response - \begin{equation} - z_i = -\frac{\partial}{\partial f(\mathbf{x}_i)} \Psi(y_i,f(\mathbf{x}_i)) \mbox{\Huge $|$}_{f(\mathbf{x}_i)=\hat f(\mathbf{x}_i)} - \end{equation} -\item Randomly select $p\times N$ cases from the dataset -\item Fit a regression tree with $K$ terminal nodes, $g(\mathbf{x})=\EV(z|\mathbf{x})$. This tree is fit using only those randomly selected observations. -\item Compute the optimal terminal node predictions, $\rho_1,\ldots,\rho_K$, as - \begin{equation} - \rho_k = \arg \min_{\rho} \sum_{\mathbf{x}_i\in S_k} \Psi(y_i,\hat f(\mathbf{x}_i)+\rho) - \end{equation} -where $S_k$ is the set of $\mathbf{x}$s that define terminal node $k$. Again this step uses only the randomly selected observations. -\item Update $\hat f(\mathbf{x})$ as - \begin{equation} - \hat f(\mathbf{x}) \leftarrow \hat f(\mathbf{x}) + \lambda\rho_{k(\mathbf{x})} - \end{equation} -where $k(\mathbf{x})$ indicates the index of the terminal node into which an observation with features $\mathbf{x}$ would fall. -\end{enumerate} -\aRule -\caption{Boosting as implemented in \texttt{gbm()}} -\label{fig:gbm} -\end{figure} - -\section{Common user options} - -This section discusses the options to gbm that most users will need to change or tune. - -\subsection{Loss function} - -The first and foremost choice is \texttt{distribution}. This should be easily dictated by the application. For most classification problems either \texttt{bernoulli} or \texttt{adaboost} will be appropriate, the former being recommended. For continuous outcomes the choices are \texttt{gaussian} (for minimizing squared error), \texttt{laplace} (for minimizing absolute error), and quantile regression (for estimating percentiles of the conditional distribution of the outcome). Censored survival outcomes should use \texttt{coxph}. Count outcomes may use \texttt{poisson}, although one might also consider \texttt{gaussian} or \texttt{laplace} depending on the analytical goals. - -\subsection{The relationship between shrinkage and number of iterations} The issues that most new users of gbm struggle with are the choice of \texttt{n.trees} and \texttt{shrinkage}. It is important to know that smaller values of \texttt{shrinkage} (almost) always give improved predictive performance. That is, setting \texttt{shrinkage=0.001} will almost certainly result in a model with better out-of-sample predictive performance than setting \texttt{shrinkage=0.01}. However, there are computational costs, both storage and CPU time, associated with setting \texttt{shrinkage} to be low. The model with \texttt{shrinkage=0.001} will likely require ten times as many iterations as the model with \texttt{shrinkage=0.01}, increasing storage and computation time by a factor of 10. Figure~\ref{fig:shrinkViters} shows the relationship between predictive performance, the number of iterations, and the shrinkage parameter.
Note that the increase in the optimal number of iterations between two choices for shrinkage is roughly equal to the ratio of the shrinkage parameters. It is generally the case that for small shrinkage parameters, 0.001 for example, there is a fairly long plateau in which predictive performance is at its best. My rule of thumb is to set \texttt{shrinkage} as small as possible while still being able to fit the model in a reasonable amount of time and storage. I usually aim for 3,000 to 10,000 iterations with shrinkage rates between 0.01 and 0.001. - -\begin{figure}[ht] \begin{center} \includegraphics[width=5in]{shrinkage-v-iterations} \end{center} \caption{Out-of-sample predictive performance by number of iterations and shrinkage. Smaller values of the shrinkage parameter offer improved predictive performance, but with decreasing marginal improvement.} \label{fig:shrinkViters} \end{figure} - -\subsection{Estimating the optimal number of iterations} gbm offers three methods for estimating the optimal number of iterations after the gbm model has been fit: an independent test set (\texttt{test}), out-of-bag estimation (\texttt{OOB}), and $v$-fold cross validation (\texttt{cv}). The function \texttt{gbm.perf} computes the iteration estimate. - -Like Friedman's MART software, the independent test set method uses a single holdout test set to select the optimal number of iterations. If \texttt{train.fraction} is set to be less than 1, then only the \textit{first} \texttt{train.fraction}$\times$\texttt{nrow(data)} observations will be used to fit the model. Note that if the data are sorted in a systematic way (such as cases for which $y=1$ come first), then the data should be shuffled before running gbm. Those observations not used in the model fit can be used to get an unbiased estimate of the optimal number of iterations. The downside of this method is that a considerable number of observations are used to estimate the single regularization parameter (number of iterations), leaving a reduced dataset for estimating the entire multivariate model structure. Use \texttt{gbm.perf(...,method="test")} to obtain an estimate of the optimal number of iterations using the held-out test set. - -If \texttt{bag.fraction} is set to be greater than 0 (0.5 is recommended), gbm computes an out-of-bag estimate of the improvement in predictive performance. It evaluates the reduction in deviance on those observations not used in selecting the next regression tree. The out-of-bag estimator underestimates the reduction in deviance. As a result, it is almost always too conservative in its selection for the optimal number of iterations. The motivation behind this method was to avoid having to set aside a large independent dataset, which reduces the information available for learning the model structure. Use \texttt{gbm.perf(...,method="OOB")} to obtain the OOB estimate. - -Lastly, gbm offers $v$-fold cross validation for estimating the optimal number of iterations. If \texttt{cv.folds=5} is specified when fitting the gbm model, then gbm will do 5-fold cross validation. gbm will fit five gbm models in order to compute the cross validation error estimate and then will fit a sixth and final gbm model with \texttt{n.trees} iterations using all of the data. The returned model object will have a component labeled \texttt{cv.error}. Note that \texttt{gbm.more} will do additional gbm iterations but will not add to the \texttt{cv.error} component. Use \texttt{gbm.perf(...,method="cv")} to obtain the cross validation estimate.
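The following call shows how these options fit together; the data frames \texttt{train} and \texttt{test} and the model formula are placeholders, so this is an illustrative sketch rather than a worked example.

\begin{verbatim}
## Illustrative sketch; 'train', 'test', and the formula are placeholders.
library(gbm)

fit <- gbm(y ~ ., data = train,
           distribution      = "bernoulli",
           n.trees           = 3000,
           interaction.depth = 3,
           shrinkage         = 0.01,
           bag.fraction      = 0.5,
           cv.folds          = 5)

## Estimate the optimal number of iterations by 5-fold cross validation;
## method = "OOB" and method = "test" are the alternatives described above.
best.iter <- gbm.perf(fit, method = "cv")

## Predictions on new data use only the first best.iter trees.
p.hat <- predict(fit, newdata = test, n.trees = best.iter, type = "response")
\end{verbatim}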
- -\begin{figure}[ht] -\begin{center} -\includegraphics[width=5in]{oobperf2} -\end{center} -\caption{Out-of-sample predictive performance of four methods of selecting the optimal number of iterations. The vertical axis plots performance relative to the best. The boxplots indicate relative performance across thirteen real datasets from the UCI repository. See \texttt{demo(OOB-reps)}.} -\label{fig:oobperf} -\end{figure} - -Figure~\ref{fig:oobperf} compares the three methods for estimating the optimal number of iterations across 13 datasets. The boxplots show the methods' performance relative to the best method on that dataset. For most datasets the methods perform similarly; however, 5-fold cross validation is consistently the best of them. OOB, using a 33\% test set, and using a 20\% test set all have datasets for which they perform considerably worse than the best method. My recommendation is to use 5- or 10-fold cross validation if you can afford the computing time. Otherwise you may choose among the other options, knowing that OOB is conservative. - -\section{Available distributions} - -This section gives some of the mathematical detail for each of the distribution options that gbm offers. The gbm engine written in C++ has access to a C++ class for each of these distributions. Each class contains methods for computing the associated deviance, initial value, the gradient, and the constants to predict in each terminal node. - -In the equations shown below, for non-zero offset terms, replace $f(\mathbf{x}_i)$ with $o_i + f(\mathbf{x}_i)$. - -\subsection{Gaussian} - -\begin{tabular}{ll} -Deviance & $\displaystyle \frac{1}{\sum w_i} \sum w_i(y_i-f(\mathbf{x}_i))^2$ \\ -Initial value & $\displaystyle f(\mathbf{x})=\frac{\sum w_i(y_i-o_i)}{\sum w_i}$ \\ -Gradient & $z_i=y_i - f(\mathbf{x}_i)$ \\ -Terminal node estimates & $\displaystyle \frac{\sum w_i(y_i-f(\mathbf{x}_i))}{\sum w_i}$ -\end{tabular} - -\subsection{AdaBoost} - -\begin{tabular}{ll} Deviance & $\displaystyle \frac{1}{\sum w_i} \sum w_i\exp(-(2y_i-1)f(\mathbf{x}_i))$ \\ Initial value & $\displaystyle \frac{1}{2}\log\frac{\sum y_iw_ie^{-o_i}}{\sum (1-y_i)w_ie^{o_i}}$ \\ Gradient & $\displaystyle z_i= -(2y_i-1)\exp(-(2y_i-1)f(\mathbf{x}_i))$ \\ Terminal node estimates & $\displaystyle \frac{\sum (2y_i-1)w_i\exp(-(2y_i-1)f(\mathbf{x}_i))} - {\sum w_i\exp(-(2y_i-1)f(\mathbf{x}_i))}$ -\end{tabular} - -\subsection{Bernoulli} - -\begin{tabular}{ll} Deviance & $\displaystyle -2\frac{1}{\sum w_i} \sum w_i(y_if(\mathbf{x}_i)-\log(1+\exp(f(\mathbf{x}_i))))$ \\ Initial value & $\displaystyle \log\frac{\sum w_iy_i}{\sum w_i(1-y_i)}$ \\ Gradient & $\displaystyle z_i=y_i-\frac{1}{1+\exp(-f(\mathbf{x}_i))}$ \\ Terminal node estimates & $\displaystyle \frac{\sum w_i(y_i-p_i)}{\sum w_ip_i(1-p_i)}$ \\ - & where $\displaystyle p_i = \frac{1}{1+\exp(-f(\mathbf{x}_i))}$ \\ -\end{tabular} - -Notes: \begin{itemize} \item For non-zero offset terms, the computation of the initial value requires Newton-Raphson. Initialize $f_0=0$ and iterate $\displaystyle f_0 \leftarrow f_0 + \frac{\sum w_i(y_i-p_i)}{\sum w_ip_i(1-p_i)}$ where $\displaystyle p_i = \frac{1}{1+\exp(-(o_i+f_0))}$.
\end{itemize} - -\subsection{Laplace} - -\begin{tabular}{ll} Deviance & $\frac{1}{\sum w_i} \sum w_i|y_i-f(\mathbf{x}_i)|$ \\ Initial value & $\mbox{median}_w(y)$ \\ Gradient & $z_i=\mbox{sign}(y_i-f(\mathbf{x}_i))$ \\ Terminal node estimates & $\mbox{median}_w(z)$ \end{tabular} - -Notes: \begin{itemize} \item $\mbox{median}_w(y)$ denotes the weighted median, defined as the solution to the equation $\frac{\sum w_iI(y_i\leq m)}{\sum w_i}=\frac{1}{2}$. \item \texttt{gbm()} currently does not implement the weighted median and issues a warning when the user uses weighted data with \texttt{distribution="laplace"}. \end{itemize} - - -\subsection{Quantile regression} - -Contributed by Brian Kriegler (see \cite{Kriegler:2010}). - -\begin{tabular}{ll} Deviance & $\frac{1}{\sum w_i} - \left(\alpha\sum_{y_i>f(\mathbf{x}_i)} w_i(y_i-f(\mathbf{x}_i))\right. +$ \\ - & \hspace{0.5in}$\left.(1-\alpha)\sum_{y_i\leq f(\mathbf{x}_i)} w_i(f(\mathbf{x}_i)-y_i)\right)$ \\ -Initial value & $\mathrm{quantile}^{(\alpha)}_w(y)$ \\ Gradient & $z_i=\alpha I(y_i>f(\mathbf{x}_i))-(1-\alpha)I(y_i\leq f(\mathbf{x}_i))$ \\ Terminal node estimates & $\mathrm{quantile}^{(\alpha)}_w(z)$ \end{tabular} - -Notes: \begin{itemize} \item $\mathrm{quantile}^{(\alpha)}_w(y)$ denotes the weighted quantile, defined as the solution to the equation $\frac{\sum w_iI(y_i\leq q)}{\sum w_i}=\alpha$. \item \texttt{gbm()} currently does not implement the weighted quantile and issues a warning when the user uses weighted data with \texttt{distribution=list(name="quantile")}. \end{itemize} - - -\subsection{Cox Proportional Hazard} - -\begin{tabular}{ll} Deviance & $-2\sum w_i(\delta_i(f(\mathbf{x}_i)-\log(R_i/w_i)))$\\ Gradient & $\displaystyle z_i=\delta_i - \sum_j \delta_j - \frac{w_jI(t_i\geq t_j)e^{f(\mathbf{x}_i)}} - {\sum_k w_kI(t_k\geq t_j)e^{f(\mathbf{x}_k)}}$ \\ -Initial value & 0 \\ Terminal node estimates & Newton-Raphson algorithm \end{tabular} - -\begin{enumerate} - \item Initialize the terminal node predictions to 0, $\mathgbf{\rho}=0$ - \item Let $\displaystyle - p_i^{(k)}=\frac{\sum_j I(k(j)=k)I(t_j\geq t_i)e^{f(\mathbf{x}_i)+\rho_k}} - {\sum_j I(t_j\geq t_i)e^{f(\mathbf{x}_i)+\rho_k}}$ - \item Let $g_k=\sum w_i\delta_i\left(I(k(i)=k)-p_i^{(k)}\right)$ - \item Let $\mathbf{H}$ be a $k\times k$ matrix with diagonal elements - \begin{enumerate} - \item Set diagonal elements $H_{mm}=\sum w_i\delta_i p_i^{(m)}\left(1-p_i^{(m)}\right)$ - \item Set off-diagonal elements $H_{mn}=-\sum w_i\delta_i p_i^{(m)}p_i^{(n)}$ - \end{enumerate} - \item Newton-Raphson update $\mathgbf{\rho} \leftarrow \mathgbf{\rho} - \mathbf{H}^{-1}\mathbf{g}$ - \item Return to step 2 until convergence -\end{enumerate} - -Notes: -\begin{itemize} -\item $t_i$ is the survival time and $\delta_i$ is the death indicator. -\item $R_i$ denotes the hazard for the risk set, $R_i=\sum_{j=1}^N w_jI(t_j\geq t_i)e^{f(\mathbf{x}_i)}$ -\item $k(i)$ indexes the terminal node of observation $i$ -\item For speed, \texttt{gbm()} does only one step of the Newton-Raphson algorithm rather than iterating to convergence. There is no appreciable loss of accuracy, since the next boosting iteration will simply correct for the prior iteration's inadequacy. -\item \texttt{gbm()} initially sorts the data by survival time. Doing this reduces the computation of the risk set from $O(n^2)$ to $O(n)$ at the cost of a single up-front sort on survival time. After the model is fit, the data are then put back in their original order.
-\end{itemize} - -\subsection{Poisson} -\begin{tabular}{ll} -Deviance & -2$\frac{1}{\sum w_i} \sum w_i(y_if(\mathbf{x}_i)-\exp(f(\mathbf{x}_i)))$ \\ -Initial value & $\displaystyle f(\mathbf{x})= \log\left(\frac{\sum w_iy_i}{\sum w_ie^{o_i}}\right)$ \\ -Gradient & $z_i=y_i - \exp(f(\mathbf{x}_i))$ \\ -Terminal node estimates & $\displaystyle \log\frac{\sum w_iy_i}{\sum w_i\exp(f(\mathbf{x}_i))}$ -\end{tabular} - -The Poisson class includes special safeguards so that the most extreme predicted values are $e^{-19}$ and $e^{+19}$. This behavior is consistent with \texttt{glm()}. - - -\subsection{Pairwise} - -This distribution implements ranking measures following the -\emph{LambdaMart} algorithm \cite{Burges:2010}. Instances belong to -\emph{groups}; all pairs of items with different labels, belonging to -the same group, are used for training. In \emph{Information Retrieval} -applications, groups correspond to user queries, -and items to (feature vectors of) documents in the associated match -set to be ranked. - -For consistency with typical usage, our goal is to \emph{maximize} one -of the \emph{utility} functions listed below. Consider a group with -instances $x_1, \dots, x_n$, ordered such that $f(x_1) \geq f(x_2) -\geq \dots f(x_n)$; i.e., the \emph{rank} of $x_i$ is $i$, where -smaller ranks are preferable. Let $P$ be the set of all ordered pairs -such that $y_i > y_j$. - -\begin{enumerate} -\item[{\bf Concordance:}] Fraction of concordant (i.e, correctly ordered) - pairs. For the special case of binary labels, this is equivalent to - the Area under the ROC Curve. -$$\left\{ \begin{array}{l l}\frac{\|\{(i,j)\in P | - f(x_i)>f(x_j)\}\|}{\|P\|} - & P \neq \emptyset\\ - 0 & \mbox{otherwise.} - \end{array}\right. -$$ -\item[{\bf MRR:}] Mean reciprocal rank of the highest-ranked positive - instance (it is assumed $y_i\in\{0,1\}$): -$$\left\{ \begin{array}{l l}\frac{1}{\min\{1 \leq i \leq n |y_i=1\}} - & \exists i: \, 1 \leq i \leq n, y_i=1\\ - 0 & \mbox{otherwise.}\end{array}\right.$$ -\item[{\bf MAP:}] Mean average precision, a generalization of - MRR to multiple positive instances: -$$\left\{ \begin{array}{l l} \frac{\sum_{1\leq i\leq n | y_i=1} \|\{1\leq j\leq i - |y_j=1\}\|\,/\,i}{\|\{1\leq i\leq n | y_i=1\}\|} & \exists i: \, - 1 \leq i \leq n, y_i=1\\ - 0 & \mbox{otherwise.}\end{array}\right.$$ -\item[{\bf nDCG:}] Normalized discounted cumulative gain: -$$\frac{\sum_{1\leq i\leq n} \log_2(i+1) \, y_i}{\sum_{1\leq i\leq n} - \log_2(i+1) \, y'_i},$$ where $y'_1, \dots, y'_n$ is a reordering of $y_1, - \dots,y_n$ with $y'_1 \geq y'_2 \geq \dots \geq y'_n$. -\end{enumerate} - -The generalization to multiple (possibly weighted) groups is -straightforward. Sometimes a cut-off rank $k$ is given for \emph{MRR} -and \emph{nDCG}, in which case we replace the outer index $n$ by -$\min(n,k)$. - -The initial value for $f(x_i)$ is always zero. We derive the gradient of -a cost function whose gradient locally approximates the gradient of -the IR measure for a fixed ranking: - -\begin{eqnarray*} -\Phi & = & \sum_{(i,j) \in P} \Phi_{ij}\\ - & = & \sum_{(i,j) \in P} |\Delta Z_{ij}| \log \left( 1 + e^{-(f(x_i) - - f(x_j))}\right), -\end{eqnarray*} -where $|\Delta Z_{ij}|$ is the absolute utility difference when -swapping the ranks of $i$ and $j$, while leaving all other instances -the same. 
Define -\begin{eqnarray*} - \lambda_{ij} & = & \frac{\partial\Phi_{ij}}{\partial f(x_i)}\\ - & = & - |\Delta Z_{ij}| \frac{1}{1 + e^{f(x_i) - f(x_j)}}\\ -& = & - |\Delta Z_{ij}| \, \rho_{ij}, -\end{eqnarray*} -with -$$ \rho_{ij} = - \frac{\lambda_{ij }}{|\Delta Z_{ij}|} = \frac{1}{1 + e^{f(x_i) - f(x_j)}}$$ - - For the gradient of $\Phi$ with respect to $f(x_i)$, define -\begin{eqnarray*} -\lambda_i & = & \frac{\partial \Phi}{\partial f(x_i)}\\ -& = & \sum_{j|(i,j) \in P} \lambda_{ij} - \sum_{j|(j,i) \in P} \lambda_{ji}\\ -& = & - \sum_{j|(i,j) \in P} |\Delta Z_{ij}| \, \rho_{ij}\\ -& & \mbox{} + \sum_{j|(j,i) \in P} |\Delta Z_{ji}| \, \rho_{ji}. -\end{eqnarray*} - - The second derivative is -\begin{eqnarray*} - \gamma_i & \stackrel{def}{=} & \frac{\partial^2\Phi}{\partial f(x_i)^2}\\ - & = & \sum_{j|(i,j) \in P} |\Delta Z_{ij}| \, \rho_{ij} \, (1-\rho_{ij})\\ -& & \mbox{} + \sum_{j|(j,i) \in P} |\Delta Z_{ji}| \, \rho_{ji} \, (1-\rho_{ji}). -\end{eqnarray*} - -Now consider again all groups with associated weights. For a given terminal node, let $i$ -range over all contained instances. Then its estimate is -$$-\frac{\sum_i v_i\lambda_{i}}{\sum_i v_i \gamma_i},$$ where -$v_i=w(\mbox{\em group}(i))/\|\{(j,k)\in\mbox{\em group}(i)\}\|.$ - -In each iteration, instances are reranked according to the preliminary -scores $f(x_i)$ to determine the $|\Delta Z_{ij}|$. Note that in order -to avoid ranking bias, we break ties by adding a small amount of -random noise. - -\begin{thebibliography}{77} % start the bibliography - -\small % put the bibliography in a small font - -\bibitem{FreundSchapire:1997} Y. Freund and R.E. Schapire (1997). ``A decision-theoretic generalization of on-line learning and an application to boosting,'' \textit{Journal of Computer and System Sciences}, 55(1):119-139. - -\bibitem{Friedman:2001} J.H. Friedman (2001). ``Greedy Function Approximation: A Gradient Boosting Machine,'' \textit{Annals of Statistics} 29(5):1189-1232. - -\bibitem{Friedman:2002} J.H. Friedman (2002). ``Stochastic Gradient Boosting,'' \textit{Computational Statistics and Data Analysis} 38(4):367-378. - -\bibitem{FHT:2000} J.H. Friedman, T. Hastie, R. Tibshirani (2000). ``Additive Logistic Regression: a Statistical View of Boosting,'' \textit{Annals of Statistics} 28(2):337-374. - -\bibitem{Kriegler:2010} B. Kriegler and R. Berk (2010). ``Small Area Estimation of the Homeless in Los Angeles, An Application of Cost-Sensitive Stochastic Gradient Boosting,'' \textit{Annals of Applied Statistics} 4(3):1234-1255. - -\bibitem{Ridgeway:1999} G. Ridgeway (1999). ``The state of boosting,'' \textit{Computing Science and Statistics} 31:172-181. - -\bibitem{Burges:2010} C. Burges (2010). ``From RankNet to LambdaRank to LambdaMART: An Overview'', \textit{Microsoft Research Technical Report MSR-TR-2010-82} - -\end{thebibliography} % end the bibliography - -\end{document} diff --git a/inst/doc/index.html b/inst/doc/index.html deleted file mode 100644 index 300c5fb..0000000 --- a/inst/doc/index.html +++ /dev/null @@ -1,21 +0,0 @@ - -R: Vignettes - - - - -

diff --git a/inst/doc/oobperf2.eps b/inst/doc/oobperf2.eps deleted file mode 100644 index aff744d..0000000 --- a/inst/doc/oobperf2.eps +++ /dev/null @@ -1,487 +0,0 @@
225 I 1827 226 I 1826 228 I 1826 232 I 1826 235 I 1827 237 I 1828 240 I 1830 241 I 1832 244 I 1835 245 I 1837 245 I 1840 247 I 1843 245 I 1845 245 I 1848 244 I 1849 241 I 1852 240 I 1853 237 I 1853 235 I 1854 232 I 1853 228 I 1853 226 I 1852 225 I 1849 222 I 1848 221 I 1845 219 I 1843 218 I 1840 218 I : 1.289 1.289 +S K -; N 1840 588 M 1837 590 I 1835 590 I 1832 591 I 1830 594 I 1828 595 I 1827 598 I 1826 600 I 1826 603 I 1826 605 I 1827 608 I 1828 610 I 1830 613 I 1832 614 I 1835 616 I 1837 617 I 1840 617 I 1843 617 I 1845 616 I 1848 614 I 1849 613 I 1852 610 I 1853 608 I 1853 605 I 1854 603 I 1853 600 I 1853 598 I 1852 595 I 1849 594 I 1848 591 I 1845 590 I 1843 590 I 1840 588 I : 1.289 1.289 +S K -; N 2216 92 M 2216 185 I 2719 185 I 2719 92 I 2216 92 I C -0.68 0.848 0.902 1 scol O 0 0 0 1 scol 5 Lw N 2216 92 M 2216 185 I 2719 185 I 2719 92 I C -: 1.289 1.289 +S K -; 4 Lw N 2216 92 M 2216 185 I 2719 185 I 2719 92 I C -: 1.289 1.289 +S K -; N 2216 101 M 2719 101 I : 1.289 1.289 +S K -; N 2469 232 M 2469 213 I : 1.289 1.289 +S K -; N 2469 195 M 2469 185 I : 1.289 1.289 +S K -; N 2469 77 M 2469 92 I : 1.289 1.289 +S K -; N 2343 232 M 2597 232 I : 1.289 1.289 +S K -; N 2343 77 M 2597 77 I : 1.289 1.289 +S K -; N 2469 327 M 2467 327 I 2464 327 I 2461 328 I 2460 330 I 2458 332 I 2456 334 I 2456 337 I 2455 339 I 2456 343 I 2456 346 I 2458 348 I 2460 350 I 2461 352 I 2464 354 I 2467 354 I 2469 354 I 2473 354 I 2476 354 I 2477 352 I 2479 350 I 2481 348 I 2482 346 I 2483 343 I 2483 339 I 2483 337 I 2482 334 I 2481 332 I 2479 330 I 2477 328 I 2476 327 I 2473 327 I 2469 327 I : 1.289 1.289 +S K -; N 2469 579 M 2467 579 I 2464 581 I 2461 582 I 2460 583 I 2458 586 I 2456 588 I 2456 591 I 2455 594 I 2456 596 I 2456 599 I 2458 601 I 2460 604 I 2461 605 I 2464 607 I 2467 608 I 2469 608 I 2473 608 I 2476 607 I 2477 605 I 2479 604 I 2481 601 I 2482 599 I 2483 596 I 2483 594 I 2483 591 I 2482 588 I 2481 586 I 2479 583 I 2477 582 I 2476 581 I 2473 579 I 2469 579 I : 1.289 1.289 +S K -; N 2846 77 M 2846 77 I 3348 77 I 3348 77 I 2846 77 I C -0.68 0.848 0.902 1 scol O 0 0 0 1 scol 5 Lw N 2846 77 M 2846 77 I 3348 77 I 3348 77 I C -: 1.289 1.289 +S K -; 4 Lw N 2846 77 M 3348 77 I 2846 77 I 3348 77 I : 1.289 1.289 +S K -; N 2973 77 M 3222 77 I : 1.289 1.289 +S K -; N 2973 77 M 3222 77 I : 1.289 1.289 +S K -; N 3094 72 M 3092 72 I 3089 74 I 3087 75 I 3085 76 I 3083 79 I 3082 81 I 3082 84 I 3080 86 I 3082 89 I 3082 92 I 3083 94 I 3085 97 I 3087 98 I 3089 99 I 3092 101 I 3094 101 I 3098 101 I 3101 99 I 3102 98 I 3105 97 I 3106 94 I 3107 92 I 3109 89 I 3109 86 I 3109 84 I 3107 81 I 3106 79 I 3105 76 I 3102 75 I 3101 74 I 3098 72 I 3094 72 I : 1.289 1.289 +S K -; N 3094 170 M 3092 172 I 3089 172 I 3087 173 I 3085 176 I 3083 177 I 3082 179 I 3082 182 I 3080 185 I 3082 188 I 3082 191 I 3083 192 I 3085 195 I 3087 196 I 3089 197 I 3092 199 I 3094 199 I 3098 199 I 3101 197 I 3102 196 I 3105 195 I 3106 192 I 3107 191 I 3109 188 I 3109 185 I 3109 182 I 3107 179 I 3106 177 I 3105 176 I 3102 173 I 3101 172 I 3098 172 I 3094 170 I : 1.289 1.289 +S K -; N 1215 1401 M 1215 1443 I : 1.289 1.289 +S K -; N 1840 1401 M 1840 1443 I : 1.289 1.289 +S K -; N 2469 1401 M 2469 1443 I : 1.289 1.289 +S K -; N 3094 1401 M 3094 1443 I : 1.289 1.289 +S K -; -%%IncludeResource: font Helvetica -/Helvetica FontHasEuro not -{ -/Euro.Helvetica - [556 0 24 -19 541 703 ] - -AddEuroGlyph -/Euro /Helvetica /Helvetica-Copy BuildNewFont -} if -F /F1 0 /256 T /Helvetica mF -/F1S4A F1 [74.844 0 0 -74.844 0 0 ] mFS -F1S4A Ji -1132 1567 M (OOB)[58 58 0]xS -: 
1298 1497 30 94 rc F0S63 Ji -1298 1567 M ( )S -; 1681 1567 M (Test 33%)[47 41 37 21 21 41 41 0]xS -: 1997 1497 27 94 rc F0S63 Ji -1997 1567 M ( )S -; 2312 1567 M (Test 20%)[46 42 37 21 20 41 42 0]xS -: 2628 1497 27 94 rc F0S63 Ji -2628 1567 M ( )S -; 2940 1567 M (5)S -2981 1567 M (-)S -3005 1567 M (fold CV)[21 41 17 41 21 54 0]xS -: 3250 1497 39 94 rc F0S63 Ji -3250 1567 M ( )S -; N 794 1392 M 752 1392 I : 1.289 1.289 +S K -; N 794 1063 M 752 1063 I : 1.289 1.289 +S K -; N 794 734 M 752 734 I : 1.289 1.289 +S K -; N 794 405 M 752 405 I : 1.289 1.289 +S K -; N 794 77 M 752 77 I : 1.289 1.289 +S K -; /F1S00IFFFFFFB5 F1 [0 -74.844 -74.844 0 0 0 ] mFS -F1S00IFFFFFFB5 Ji -712 1443 M (0.2)[-42 -20 0]yS -: 641 1315 95 26 rc /F0S00IFFFFFF9C F0 [0 -99.363 -99.363 0 0 0 ] mFS -F0S00IFFFFFF9C Ji -712 1340 M ( )S -; 712 1114 M (0.4)[-42 -20 0]yS -: 641 986 95 25 rc F0S00IFFFFFF9C Ji -712 1010 M ( )S -; 712 786 M (0.6)[-41 -21 0]yS -: 641 658 95 26 rc F0S00IFFFFFF9C Ji -712 683 M ( )S -; 712 457 M (0.8)[-41 -21 0]yS -: 641 329 95 26 rc F0S00IFFFFFF9C Ji -712 354 M ( )S -; 712 128 M (1.0)[-42 -20 0]yS -: 641 0 95 26 rc F0S00IFFFFFF9C Ji -712 25 M ( )S -; F1S4A Ji -1416 1734 M (Method for selecting the number of iterations)[62 41 21 41 41 43 20 21 41 25 20 39 41 17 41 38 20 17 41 41 22 21 41 42 20 43 41 63 41 42 24 21 -41 22 21 16 21 41 26 41 21 17 41 41 0]xS -: 2895 1665 47 93 rc F0S63 Ji -2895 1734 M ( )S -; F1S00IFFFFFFB5 Ji -543 1210 M (Performance over 13 datasets)[-50 -41 -25 -20 -42 -24 -63 -42 -41 -37 -42 -20 -42 -38 -42 -24 -21 -42 -42 -20 -42 -41 -22 -41 -38 -41 -21 0]yS -: 472 162 95 48 rc F0S00IFFFFFF9C Ji -543 209 M ( )S -; 5 Lw : 789 0 2573 1406 rc N 794 4 M 794 1400 I 3356 1400 I 3356 4 I C -: 1.289 1.289 +S K -; ; LH -%%PageTrailer - -%%Trailer -%%DocumentNeededResources: -%%+ font Times-Roman -%%+ font Helvetica -%%DocumentSuppliedResources: -%%+ procset Pscript_WinNT_ErrorHandler 5.0 0 -%%+ procset Pscript_FatalError 5.0 0 -%%+ procset Pscript_Win_Basic 5.0 0 -%%+ procset Pscript_Win_Utils_L2 5.0 0 -%%+ procset Pscript_Text 5.0 0 -%%+ procset Pscript_Encoding256 5.0 0 -%%+ procset Pscript_Win_Euro_L2 5.0 0 -%%+ procset Pscript_Win_GdiObject 5.0 0 -%%+ procset Pscript_Win_GdiObject_L2 5.0 0 -Pscript_WinNT_Incr dup /terminate get exec -%%EOF diff --git a/inst/doc/oobperf2.pdf b/inst/doc/oobperf2.pdf deleted file mode 100644 index 67571bb..0000000 Binary files a/inst/doc/oobperf2.pdf and /dev/null differ diff --git a/inst/doc/shrinkage-v-iterations.eps b/inst/doc/shrinkage-v-iterations.eps deleted file mode 100644 index 34825b5..0000000 --- a/inst/doc/shrinkage-v-iterations.eps +++ /dev/null @@ -1,1672 +0,0 @@ -%!PS-Adobe-3.0 -%%DocumentNeededResources: font Helvetica -%%+ font Helvetica-Bold -%%+ font Helvetica-Oblique -%%+ font Helvetica-BoldOblique -%%+ font Symbol -%%DocumentMedia: a4 595 841 0 () () -%%Title: R Graphics Output -%%Creator: R Software -%%Pages: (atend) -%%Orientation: Portrait -%%BoundingBox: 18 205 577 637 -%%EndComments -%%BeginProlog -/bp { gs gs } def -% begin .ps.prolog -/gs { gsave } def -/gr { grestore } def -/ep { showpage gr gr } def -/m { moveto } def -/l { rlineto } def -/np { newpath } def -/cp { closepath } def -/f { fill } def -/o { stroke } def -/c { newpath 0 360 arc } def -/r { 4 2 roll moveto 1 copy 3 -1 roll exch 0 exch rlineto 0 rlineto -1 mul 0 exch rlineto closepath } def -/p1 { stroke } def -/p2 { gsave bg setrgbcolor fill grestore newpath } def -/p3 { gsave bg setrgbcolor fill grestore stroke } def -/t { 6 -2 roll moveto gsave rotate - ps mul neg 
0 2 1 roll rmoveto - 1 index stringwidth pop - mul neg 0 rmoveto show grestore } def -/cl { grestore gsave newpath 3 index 3 index moveto 1 index - 4 -1 roll lineto exch 1 index lineto lineto - closepath clip newpath } def -/rgb { setrgbcolor } def -/s { scalefont setfont } def -% end .ps.prolog -%%IncludeResource: font Helvetica -/Helvetica findfont -dup length dict begin - {1 index /FID ne {def} {pop pop} ifelse} forall - /Encoding ISOLatin1Encoding def - currentdict - end -/Font1 exch definefont pop -%%IncludeResource: font Helvetica-Bold -/Helvetica-Bold findfont -dup length dict begin - {1 index /FID ne {def} {pop pop} ifelse} forall - /Encoding ISOLatin1Encoding def - currentdict - end -/Font2 exch definefont pop -%%IncludeResource: font Helvetica-Oblique -/Helvetica-Oblique findfont -dup length dict begin - {1 index /FID ne {def} {pop pop} ifelse} forall - /Encoding ISOLatin1Encoding def - currentdict - end -/Font3 exch definefont pop -%%IncludeResource: font Helvetica-BoldOblique -/Helvetica-BoldOblique findfont -dup length dict begin - {1 index /FID ne {def} {pop pop} ifelse} forall - /Encoding ISOLatin1Encoding def - currentdict - end -/Font4 exch definefont pop -%%IncludeResource: font Symbol -/Symbol findfont -dup length dict begin - {1 index /FID ne {def} {pop pop} ifelse} forall - currentdict - end -/Font5 exch definefont pop -%%EndProlog -%%Page: 1 1 -bp -18.00 204.94 577.28 636.94 cl -0 0 0 rgb -0.75 setlinewidth -[] 0 setdash -1 setlinecap -1 setlinejoin -10.00 setmiterlimit -np -94.45 278.38 m -435.18 0 l -o -np -94.45 278.38 m -0 -7.20 l -o -np -181.48 278.38 m -0 -7.20 l -o -np -268.52 278.38 m -0 -7.20 l -o -np -355.56 278.38 m -0 -7.20 l -o -np -442.59 278.38 m -0 -7.20 l -o -np -529.63 278.38 m -0 -7.20 l -o -/ps 12 def /Font1 findfont 12 s -94.45 252.46 (0) .5 0 0 t -181.48 252.46 (2000) .5 0 0 t -268.52 252.46 (4000) .5 0 0 t -355.56 252.46 (6000) .5 0 0 t -442.59 252.46 (8000) .5 0 0 t -529.63 252.46 (10000) .5 0 0 t -np -77.04 289.48 m -0 277.33 l -o -np -77.04 289.48 m --7.20 0 l -o -np -77.04 358.81 m --7.20 0 l -o -np -77.04 428.14 m --7.20 0 l -o -np -77.04 497.48 m --7.20 0 l -o -np -77.04 566.81 m --7.20 0 l -o -59.76 289.48 (0.190) .5 0 90 t -59.76 358.81 (0.195) .5 0 90 t -59.76 428.14 (0.200) .5 0 90 t -59.76 497.48 (0.205) .5 0 90 t -59.76 566.81 (0.210) .5 0 90 t -np -77.04 278.38 m -470.00 0 l -0 299.52 l --470.00 0 l -0 -299.52 l -o -18.00 204.94 577.28 636.94 cl -/ps 12 def /Font1 findfont 12 s -0 0 0 rgb -312.04 223.66 (Iterations) .5 0 0 t -30.96 428.14 (Squared error) .5 0 90 t -77.04 278.38 547.04 577.90 cl -0 0 0 rgb -0.75 setlinewidth -[] 0 setdash -1 setlinecap -1 setlinejoin -10.00 setmiterlimit -np -96.30 636.94 m -0.80 -256.68 l -0.87 -37.13 l -0.35 -0.45 l -0.52 11.92 l -0.87 2.33 l -0.87 -2.31 l -0.87 9.87 l -0.87 15.91 l -0.87 2.35 l -0.87 14.48 l -0.88 3.97 l -0.87 10.90 l -0.91 13.99 l -0.87 9.66 l -0.87 14.32 l -0.87 12.06 l -0.87 0.63 l -0.87 7.84 l -0.87 0.86 l -0.87 16.91 l -0.87 13.95 l -0.87 6.15 l -0.87 19.28 l -0.87 13.99 l -0.87 7.21 l -0.87 3.98 l -0.87 6.88 l -0.87 6.38 l -0.88 14.52 l -0.87 15.71 l -0.87 12.61 l -0.87 1.80 l -0.87 4.97 l -0.87 5.38 l -0.87 8.96 l -0.87 5.94 l -0.08 0.86 l -o -18.00 204.94 577.28 636.94 cl -0 0 0 rgb -0.38 setlinewidth -[] 0 setdash -1 setlinecap -1 setlinejoin -10.00 setmiterlimit -np -98.32 278.38 m -0 0 l -o -np -98.32 278.38 m -0 8.99 l -o -77.04 278.38 547.04 577.90 cl -/ps 12 def /Font1 findfont 12 s -0 0 0 rgb -98.32 331.64 (0.1) 1 0 0 t -1 0 0 rgb -0.75 setlinewidth -[] 0 setdash -1 
setlinecap -1 setlinejoin -10.00 setmiterlimit -np -98.18 636.94 m -0.66 -170.73 l -0.87 -91.55 l -0.87 -32.71 l -0.87 -11.27 l -0.83 -4.42 l -0.04 1.99 l -0.87 2.36 l -0.87 5.74 l -0.88 5.32 l -0.87 10.10 l -0.91 -0.18 l -0.87 0.01 l -0.87 7.30 l -0.87 5.28 l -0.87 3.18 l -0.87 8.16 l -0.87 3.52 l -0.87 3.60 l -0.87 6.17 l -0.87 5.93 l -0.87 5.86 l -0.87 9.58 l -0.87 4.88 l -0.87 1.50 l -0.87 5.88 l -0.87 1.56 l -0.88 1.58 l -0.87 6.89 l -0.87 8.30 l -0.87 3.06 l -0.87 3.80 l -0.87 7.77 l -0.87 6.79 l -0.87 0.12 l -0.87 5.03 l -0.87 3.56 l -0.91 5.44 l -0.87 0.11 l -0.87 2.74 l -0.87 3.12 l -0.87 2.71 l -0.87 4.84 l -0.87 9.32 l -0.87 2.49 l -0.88 2.34 l -0.87 8.43 l -0.87 3.91 l -0.87 3.49 l -0.87 8.17 l -0.87 6.87 l -0.87 2.14 l -0.87 -1.13 l -0.87 1.81 l -0.87 5.45 l -0.87 3.55 l -0.87 4.15 l -0.87 -0.88 l -0.87 3.64 l -0.87 6.16 l -0.87 6.55 l -0.87 3.76 l -0.87 -1.51 l -0.92 3.77 l -0.87 4.83 l -0.87 4.47 l -0.87 -0.78 l -0.87 3.07 l -0.87 2.23 l -0.87 3.07 l -0.87 4.25 l -0.87 0.91 l -0.87 2.07 l -0.87 3.87 l -0.87 -0.27 l -0.87 4.13 l -0.87 2.34 l -0.87 6.20 l -0.87 -0.02 l -0.87 2.70 l -0.87 4.35 l -0.87 1.13 l -0.87 1.01 l -0.87 6.80 l -0.87 1.41 l -0.87 4.57 l -0.34 2.26 l -o -18.00 204.94 577.28 636.94 cl -1 0 0 rgb -0.38 setlinewidth -[] 0 setdash -1 setlinecap -1 setlinejoin -10.00 setmiterlimit -np -102.28 278.38 m -0 0 l -o -np -102.28 278.38 m -0 8.99 l -o -77.04 278.38 547.04 577.90 cl -/ps 12 def /Font1 findfont 12 s -0 0 0 rgb -102.28 315.22 (0.05) 1 0 0 t -0 0.8039 0 rgb -0.75 setlinewidth -[] 0 setdash -1 setlinecap -1 setlinejoin -10.00 setmiterlimit -np -112.73 636.94 m -0.08 -6.15 l -0.87 -56.32 l -0.87 -45.80 l -0.87 -38.10 l -0.87 -32.80 l -0.87 -27.05 l -0.87 -22.21 l -0.87 -20.65 l -0.87 -16.95 l -0.88 -11.58 l -0.87 -9.86 l -0.87 -9.21 l -0.87 -6.46 l -0.87 -5.19 l -0.87 -3.64 l -0.87 -3.58 l -0.87 -1.01 l -0.87 -2.63 l -0.87 -2.59 l -0.91 -1.32 l -0.87 -1.63 l -0.87 0.29 l -0.87 -0.46 l -0.87 -0.33 l -0.87 0.60 l -0.87 -0.35 l -0.87 -0.30 l -0.88 0.56 l -0.87 0.21 l -0.87 0.44 l -0.87 -1.21 l -0.69 -0.58 l -0.18 0.27 l -0.87 0.81 l -0.87 1.14 l -0.87 0.71 l -0.87 0.66 l -0.87 0.85 l -0.87 1.16 l -0.87 1.98 l -0.87 0.37 l -0.87 0 l -0.87 0.88 l -0.87 0.51 l -0.87 -0.38 l -0.87 0.17 l -0.92 0.88 l -0.87 1.91 l -0.87 0.90 l -0.87 1.37 l -0.87 2.06 l -0.87 1.83 l -0.87 0.91 l -0.87 0.82 l -0.87 0.86 l -0.87 0.51 l -0.87 -0.11 l -0.87 0.64 l -0.87 1.33 l -0.87 1.11 l -0.87 0.91 l -0.87 0.58 l -0.87 0.11 l -0.87 1.19 l -0.87 2.27 l -0.87 1.39 l -0.87 -0.25 l -0.87 0.04 l -0.87 0.37 l -0.87 1.15 l -0.87 0.02 l -0.87 0.51 l -0.92 0.29 l -0.87 1.00 l -0.87 1.59 l -0.87 -0.20 l -0.87 0.27 l -0.87 0.64 l -0.87 -0.20 l -0.87 1.00 l -0.87 0.78 l -0.87 0.81 l -0.87 1.66 l -0.87 0.58 l -0.87 0.25 l -0.87 0.42 l -0.87 1.09 l -0.87 1.17 l -0.87 1.33 l -0.87 -0.17 l -0.87 0.97 l -0.88 0.62 l -0.87 1.02 l -0.87 0.29 l -0.87 0.87 l -0.87 0.76 l -0.87 1.14 l -0.87 1.21 l -0.87 0.65 l -198.28 363.84 lineto -0.87 0.66 l -0.87 0.85 l -0.87 0.97 l -0.87 0.44 l -0.87 1.68 l -0.87 0.14 l -0.87 1.08 l -0.87 1.38 l -0.87 1.74 l -0.88 0.28 l -0.87 1.61 l -0.87 0.37 l -0.87 0.22 l -0.87 0.99 l -0.87 1.60 l -0.87 0.66 l -0.87 1.76 l -0.87 -0.19 l -0.87 0.51 l -0.87 0.94 l -0.87 0.38 l -0.87 -0.91 l -0.87 0.12 l -0.87 1.15 l -0.87 1.08 l -0.91 0.21 l -0.87 0.27 l -0.88 1.68 l -0.87 0.51 l -0.87 0.39 l -0.87 1.43 l -0.87 0.10 l -0.87 -0.25 l -0.87 0.91 l -0.87 0.85 l -0.87 0.22 l -0.87 0.42 l -0.87 0.45 l -0.87 0.93 l -0.87 1.95 l -0.87 0.77 l -0.87 0.36 l -0.87 1.83 l -0.87 0.75 l -0.87 1.03 
l -0.87 1.37 l -0.87 0.72 l -0.87 0.91 l -0.87 0.23 l -0.87 -0.03 l -0.87 1.04 l -0.92 0.62 l -0.87 2.08 l -0.87 0.94 l -0.87 1.17 l -0.87 0.61 l -0.87 0.70 l -0.87 0.75 l -0.87 0.41 l -0.87 1.05 l -0.87 0.98 l -0.87 1.62 l -0.87 1.02 l -0.87 0.90 l -0.87 0.61 l -0.87 1.07 l -0.87 0.31 l -0.87 1.95 l -0.87 0.57 l -0.87 0.43 l -0.87 0.26 l -0.87 1.33 l -0.87 1.35 l -0.88 0.39 l -0.87 0.46 l -0.87 1.00 l -0.87 0.97 l -0.91 1.56 l -0.87 0.76 l -0.87 0.31 l -0.87 0.64 l -0.87 1.48 l -0.87 0.36 l -0.87 1.65 l -0.87 0.04 l -0.87 0.60 l -0.87 0.97 l -0.87 -0.25 l -0.87 1.14 l -0.87 1.21 l -0.87 0.97 l -0.88 0.71 l -0.87 1.29 l -0.87 2.65 l -0.87 0.27 l -0.87 0.83 l -0.87 0.48 l -0.87 1.17 l -0.87 1.93 l -285.45 447.56 lineto -0.87 1.61 l -0.87 1.50 l -0.87 0.99 l -0.87 0.53 l -0.91 1.19 l -0.87 0.24 l -0.87 0.58 l -0.87 1.15 l -0.88 0.63 l -0.87 0.98 l -0.87 0.41 l -0.87 0.41 l -0.87 0.54 l -0.87 0.71 l -0.87 0.36 l -0.87 1.06 l -0.87 1.10 l -0.87 0.91 l -0.87 0.15 l -0.87 1.07 l -0.87 0.36 l -0.87 0.21 l -0.87 -0.30 l -0.87 0.62 l -0.87 1.48 l -0.87 1.05 l -0.87 1.33 l -0.87 0.22 l -0.87 0.56 l -0.87 0.59 l -0.92 1.34 l -0.87 1.07 l -0.87 0.45 l -0.87 0.94 l -0.87 1.14 l -0.87 1.16 l -0.87 0.24 l -0.87 0.68 l -0.87 1.09 l -0.87 0.54 l -0.87 0.18 l -0.87 0.45 l -0.87 0.34 l -0.87 0.82 l -0.87 1.84 l -0.87 0.53 l -0.87 0.76 l -0.87 1.45 l -0.87 0.78 l -0.87 -0.06 l -0.87 -0.05 l -0.87 0.72 l -0.87 0.31 l -0.87 -0.60 l -0.88 0.38 l -0.87 0.46 l -0.91 1.02 l -0.87 0.88 l -0.87 0.81 l -0.87 0.38 l -0.87 0.96 l -0.87 1.18 l -0.87 0.54 l -0.87 0.37 l -0.87 0.55 l -0.87 0.44 l -0.87 0.68 l -0.87 0.28 l -0.87 0.87 l -0.87 0.37 l -0.87 2.00 l -0.87 2.04 l -0.88 0.18 l -0.87 1.12 l -0.87 0.16 l -0.87 0.73 l -0.87 0.50 l -0.87 0.32 l -0.87 0.53 l -0.87 0.79 l -0.87 0.81 l -0.87 0.58 l -0.87 1.01 l -0.91 0.83 l -0.87 0.47 l -0.87 0.19 l -0.87 1.51 l -0.87 0.76 l -0.87 0.82 l -0.87 -0.46 l -0.88 0.31 l -0.87 0.80 l -0.87 1.23 l -0.87 0.96 l -0.87 0.80 l -0.87 1.16 l -0.87 0.92 l -0.87 0.23 l -0.87 0.79 l -372.66 519.24 lineto -0.87 0.41 l -0.87 0.50 l -0.87 0.44 l -0.87 0.47 l -0.87 1.13 l -0.87 -0.09 l -0.87 0.65 l -0.87 1.10 l -0.87 0.35 l -0.92 0.59 l -0.87 0.81 l -0.87 0.66 l -0.87 1.19 l -0.87 0.17 l -0.87 0.55 l -0.87 1.10 l -0.87 0.75 l -0.87 -0.18 l -0.87 0.06 l -0.87 0.26 l -0.87 0.50 l -0.87 0.42 l -0.87 1.57 l -0.87 -0.33 l -0.87 0.58 l -0.87 0 l -0.87 0.12 l -0.87 0.19 l -0.87 1.21 l -0.87 0.97 l -0.87 -0.84 l -0.87 0.74 l -0.87 0.72 l -0.87 0.74 l -0.87 -0.11 l -0.92 0.28 l -0.87 -0.04 l -0.87 0.08 l -0.87 -0.49 l -0.87 -0.63 l -0.87 0.57 l -0.87 1.38 l -0.87 0.20 l -0.87 0.99 l -0.87 -0.07 l -0.87 0.52 l -0.87 0.85 l -0.87 0.78 l -0.87 0.01 l -0.87 1.07 l -0.87 1.58 l -0.87 0.84 l -0.87 0.79 l -0.88 1.16 l -0.87 1.08 l -0.87 0.38 l -0.87 0.24 l -0.87 -0.48 l -0.87 1.08 l -0.87 0.71 l -0.87 0.92 l -0.91 0.40 l -0.87 1.26 l -0.87 0.13 l -0.87 0.65 l -0.87 0.34 l -0.87 -0.78 l -0.87 0.49 l -0.87 0.94 l -0.87 0.88 l -0.87 1.49 l -0.88 1.43 l -0.87 0.77 l -0.87 0.39 l -0.87 0.26 l -0.87 0.77 l -0.87 1.01 l -0.87 1.04 l -0.87 0.13 l -0.87 0.77 l -0.87 0.92 l -0.87 1.01 l -0.87 0.85 l -0.87 0.40 l -0.87 1.08 l -0.87 0.02 l -0.87 0.70 l -0.87 0.79 l -0.91 1.16 l -0.88 1.49 l -0.87 0.46 l -0.87 0.96 l -0.87 0.51 l -0.87 0.42 l -0.87 0.53 l -0.87 0.83 l -0.87 0.11 l -0.87 1.15 l -0.87 0.17 l -459.87 576.51 lineto -0.87 1.18 l -0.87 0.49 l -0.87 0.58 l -0.87 0.42 l -0.87 0.78 l -0.87 0.61 l -0.87 0.78 l -0.87 1.21 l -0.87 0.97 l -0.87 1.38 l -0.87 1.18 l -0.87 0.85 l -0.87 1.89 l -0.87 0.01 l -0.92 0 l 
-0.87 0.64 l -0.87 1.02 l -0.87 0.35 l -0.87 -1.00 l -0.87 1.41 l -0.87 1.70 l -0.87 -0.01 l -0.87 0.55 l -0.87 0.91 l -0.87 0.34 l -0.87 0.71 l -0.87 0.43 l -0.87 0.82 l -0.87 0.59 l -0.87 -0.05 l -0.87 0.11 l -0.87 1.12 l -0.87 0.70 l -0.87 0.45 l -0.88 0.65 l -0.87 0.81 l -0.87 0 l -0.87 0 l -0.87 0.20 l -0.87 1.37 l -0.91 0.80 l -0.87 0.44 l -0.87 0.61 l -0.87 -0.29 l -0.87 0.67 l -0.87 0.45 l -0.87 0.76 l -0.87 0.44 l -0.87 -0.19 l -0.87 0.81 l -0.87 1.87 l -0.87 0.69 l -0.88 0.20 l -0.87 1.37 l -0.87 0.65 l -0.87 0.85 l -0.87 0.57 l -0.87 -0.65 l -0.87 0.66 l -0.87 0.69 l -0.87 1.65 l -0.87 1.24 l -0.87 -0.53 l -0.87 -0.67 l -0.87 -0.17 l -0.87 -0.31 l -0.91 1.21 l -0.87 0.78 l -0.87 0.44 l -0.87 0.46 l -0.88 0.91 l -0.87 0.45 l -0.87 0.56 l -0.87 -0.07 l -0.87 0.52 l -0.87 1.12 l -0.87 1.18 l -0.87 1.15 l -0.87 1.25 l -0.87 0.33 l -o -18.00 204.94 577.28 636.94 cl -0 0.8039 0 rgb -0.38 setlinewidth -[] 0 setdash -1 setlinecap -1 setlinejoin -10.00 setmiterlimit -np -139.66 278.38 m -0 0 l -o -np -139.66 278.38 m -0 8.99 l -o -77.04 278.38 547.04 577.90 cl -/ps 12 def /Font1 findfont 12 s -0 0 0 rgb -139.66 300.04 (0.01) 1 0 0 t -0 0 1 rgb -0.75 setlinewidth -[] 0 setdash -1 setlinecap -1 setlinejoin -10.00 setmiterlimit -np -130.97 636.94 m -0.16 -5.96 l -0.87 -29.77 l -0.87 -27.35 l -0.87 -23.76 l -0.87 -22.19 l -0.87 -18.93 l -0.88 -19.94 l -0.87 -17.25 l -0.87 -15.60 l -0.87 -14.01 l -0.87 -13.88 l -0.87 -10.34 l -0.87 -9.97 l -0.87 -9.50 l -0.87 -8.65 l -0.87 -6.63 l -0.87 -7.77 l -0.87 -7.32 l -0.87 -6.40 l -0.87 -4.39 l -0.87 -5.45 l -0.87 -4.60 l -0.87 -4.51 l -0.87 -4.87 l -0.92 -4.37 l -0.87 -3.10 l -0.87 -2.55 l -0.87 -2.58 l -0.87 -1.95 l -0.87 -1.30 l -0.87 -1.30 l -0.87 -1.61 l -0.87 -1.12 l -0.87 -1.40 l -0.87 -0.32 l -0.87 -0.93 l -0.87 -0.70 l -0.87 -1.47 l -0.87 -0.31 l -0.87 -0.14 l -0.87 -1.08 l -0.87 -0.21 l -0.87 -0.21 l -0.87 -0.90 l -0.87 -0.70 l -0.48 -0.22 l -0.39 0.32 l -0.87 0.37 l -0.87 0.05 l -0.87 0.46 l -0.87 -0.33 l -0.92 0.09 l -0.87 0.43 l -0.87 0.19 l -0.87 0.30 l -0.87 0.14 l -0.87 0.01 l -0.87 0.44 l -0.87 0.23 l -0.87 0 l -0.87 0.53 l -0.87 0 l -0.87 -0.06 l -0.87 0.59 l -0.87 -0.07 l -0.87 0.46 l -0.87 0.37 l -0.87 0.59 l -0.87 0.67 l -0.87 0.26 l -0.88 0.96 l -0.87 -0.17 l -0.87 0.37 l -0.87 0.63 l -0.87 0.63 l -0.87 -0.22 l -0.87 0.38 l -0.87 0.16 l -0.91 0.58 l -0.87 0.02 l -0.87 0.32 l -0.87 0.30 l -0.87 0.54 l -0.87 -0.02 l -0.87 0.95 l -0.87 0.32 l -0.87 0.27 l -0.87 0.63 l -0.88 0.11 l -0.87 0.08 l -0.87 0.47 l -0.87 0.44 l -0.87 0.43 l -0.87 0.41 l -0.87 0.71 l -0.87 0.43 l -0.87 0.22 l -0.87 0.77 l -0.87 0.21 l -216.56 326.63 lineto -0.87 0.44 l -0.87 0.56 l -0.87 0.34 l -0.87 0.73 l -0.91 0.42 l -0.87 0.53 l -0.88 0.31 l -0.87 0.69 l -0.87 -0.31 l -0.87 -0.15 l -0.87 0.84 l -0.87 0.28 l -0.87 0.03 l -0.87 0.59 l -0.87 0.09 l -0.87 0.42 l -0.87 0.32 l -0.87 0.30 l -0.87 0.26 l -0.87 0.61 l -0.87 -0.07 l -0.87 0.38 l -0.87 0.39 l -0.87 0.18 l -0.87 0.26 l -0.87 0.17 l -0.87 0.45 l -0.87 0.26 l -0.87 1.10 l -0.87 0.59 l -0.92 0.35 l -0.87 0.58 l -0.87 0.82 l -0.87 0.31 l -0.87 0.04 l -0.87 0.88 l -0.87 0.16 l -0.87 0.25 l -0.87 1.26 l -0.87 0.33 l -0.87 0.34 l -0.87 -0.03 l -0.87 -0.05 l -0.87 0.15 l -0.87 0.59 l -0.87 0.50 l -0.87 0.65 l -0.87 0.12 l -0.87 0.73 l -0.87 0.56 l -0.87 0.42 l -0.87 0.61 l -0.88 0.13 l -0.87 0.68 l -0.87 0.75 l -0.87 0.28 l -0.91 -0.07 l -0.87 0.56 l -0.87 0.43 l -0.87 0.49 l -0.87 0.64 l -0.87 0.46 l -0.87 0.58 l -0.87 0.17 l -0.87 0.88 l -0.87 0.30 l -0.87 0.85 l -0.87 0.47 l -0.87 0.09 l -0.87 0.31 l 
-0.88 0.99 l -0.87 0.49 l -0.87 0.37 l -0.87 -0.04 l -0.87 0.68 l -0.87 0.75 l -0.87 0.49 l -0.87 -0.01 l -0.87 0.47 l -0.87 0.47 l -0.87 -0.10 l -0.87 0.34 l -0.87 0.04 l -0.91 0.58 l -0.87 0.03 l -0.87 0.28 l -0.87 0.85 l -0.88 0.36 l -0.87 0.68 l -0.87 0.37 l -0.87 0.38 l -0.87 0.64 l -0.87 0.22 l -0.87 0.31 l -0.87 0.82 l -0.87 0.35 l -0.87 0.43 l -0.87 0.42 l -0.87 0.30 l -303.77 367.68 lineto -0.87 0.46 l -0.87 -0.27 l -0.87 0.40 l -0.87 0.47 l -0.87 0.65 l -0.87 0.07 l -0.87 0.22 l -0.87 0.45 l -0.87 0.37 l -0.92 1.28 l -0.87 0.77 l -0.87 0.69 l -0.87 0.22 l -0.87 0.73 l -0.87 0.78 l -0.87 0.22 l -0.87 0.56 l -0.87 0.81 l -0.87 -0.01 l -0.87 0.72 l -0.87 0.24 l -0.87 0.32 l -0.87 0.21 l -0.87 0.31 l -0.87 0.19 l -0.87 0.46 l -0.87 0.61 l -0.87 0.24 l -0.87 0.22 l -0.87 0.31 l -0.87 -0.14 l -0.87 0.52 l -0.87 0.21 l -0.88 0.47 l -0.87 0.38 l -0.91 0.58 l -0.87 0.41 l -0.87 -0.05 l -0.87 -0.34 l -0.87 0.71 l -0.87 0.76 l -0.87 0.24 l -0.87 0.42 l -0.87 0.55 l -0.87 0.46 l -0.87 0.23 l -0.87 0.45 l -0.87 0.84 l -0.87 0.46 l -0.87 0.53 l -0.87 0.27 l -0.88 0.02 l -0.87 0.68 l -0.87 0.87 l -0.87 0.72 l -0.87 0.34 l -0.87 0.17 l -0.87 0.77 l -0.87 -0.02 l -0.87 0.74 l -0.87 0.35 l -0.87 0.34 l -0.91 0.75 l -0.87 0.20 l -0.87 0.36 l -0.87 0.62 l -0.87 0.38 l -0.87 0.10 l -0.87 0.15 l -0.88 0.10 l -0.87 -0.03 l -0.87 0.68 l -0.87 0.59 l -0.87 0.60 l -0.87 0.25 l -0.87 0.04 l -0.87 0.12 l -0.87 0.61 l -0.87 0.80 l -0.87 0.30 l -0.87 0.79 l -0.87 0.69 l -0.87 0.31 l -0.87 0.15 l -0.87 0.16 l -0.87 0.51 l -0.87 0.65 l -0.87 0.45 l -0.92 0.01 l -0.87 0.55 l -0.87 0.62 l -0.87 -0.02 l -0.87 0.36 l -0.87 0.73 l -0.87 0.54 l -0.87 0.45 l -0.87 0.08 l -0.87 0.34 l -0.87 0.82 l -390.98 409.25 lineto -0.87 0.30 l -0.87 0.16 l -0.87 0.35 l -0.87 0.40 l -0.87 0.41 l -0.87 0.48 l -0.87 0.19 l -0.87 0.39 l -0.87 0.36 l -0.87 0.03 l -0.87 0.22 l -0.87 0.59 l -0.87 0.43 l -0.87 0.58 l -0.92 0.91 l -0.87 0.47 l -0.87 0.46 l -0.87 0.38 l -0.87 0.10 l -0.87 0.56 l -0.87 0.51 l -0.87 0.09 l -0.87 0.65 l -0.87 0.31 l -0.87 0.39 l -0.87 0.42 l -0.87 0.10 l -0.87 0.91 l -0.87 -0.17 l -0.87 -0.02 l -0.87 0.36 l -0.87 0.06 l -0.88 0.90 l -0.87 0.64 l -0.87 0.46 l -0.87 0.47 l -0.87 0.61 l -0.87 0.44 l -0.87 -0.02 l -0.87 0.64 l -0.91 1.08 l -0.87 0.38 l -0.87 0.96 l -0.87 0 l -0.87 0.33 l -0.87 0.59 l -0.87 0.62 l -0.87 0.33 l -0.87 0.33 l -0.87 0.30 l -0.88 0.38 l -0.87 0.15 l -0.87 0.42 l -0.87 0.80 l -0.87 0.50 l -0.87 0.56 l -0.87 -0.05 l -0.87 0.52 l -0.87 0.52 l -0.87 0.19 l -0.87 0.05 l -0.87 0.21 l -0.87 0.46 l -0.87 0.31 l -0.87 0.30 l -0.87 0.44 l -0.87 0.29 l -0.91 0.42 l -0.88 0.27 l -0.87 0.27 l -0.87 0.31 l -0.87 0.19 l -0.87 0.03 l -0.87 1.10 l -0.87 0.75 l -0.87 0.19 l -0.87 0.37 l -0.87 0.36 l -0.87 0 l -0.87 0.37 l -0.87 0.34 l -0.87 -0.04 l -0.87 0.51 l -0.87 0.30 l -0.87 0.50 l -0.87 0.15 l -0.87 0.79 l -0.87 0.12 l -0.87 0.22 l -0.87 0.60 l -0.87 0.53 l -0.87 0.60 l -0.87 0.59 l -0.92 0.89 l -0.87 1.02 l -0.87 0.35 l -0.87 0.01 l -0.87 0.42 l -0.87 0.52 l -478.19 448.90 lineto -0.87 0.38 l -0.87 -0.21 l -0.87 0.74 l -0.87 0.35 l -0.87 0.37 l -0.87 0.55 l -0.87 0.64 l -0.87 0.14 l -0.87 0.36 l -0.87 0.77 l -0.87 0.33 l -0.87 0.26 l -0.87 1.09 l -0.88 0.15 l -0.87 0.03 l -0.87 0.50 l -0.87 0.25 l -0.87 0.34 l -0.87 -0.12 l -0.91 -0.04 l -0.87 0.25 l -0.87 0.56 l -0.87 0.16 l -0.87 0.29 l -0.87 0.45 l -0.87 0.04 l -0.87 0.16 l -0.87 0.87 l -0.87 0.71 l -0.87 0.87 l -0.87 0.83 l -0.88 0.63 l -0.87 0.21 l -0.87 0.23 l -0.87 0.20 l -0.87 0.08 l -0.87 0.46 l -0.87 -0.02 l -0.87 0.60 l -0.87 0.32 l 
-0.87 0.83 l -0.87 -0.29 l -0.87 0.79 l -0.87 0.34 l -0.87 0.58 l -0.91 -0.32 l -0.87 -0.11 l -0.87 -0.11 l -0.87 0.96 l -0.88 0.21 l -0.87 0.59 l -0.87 0.37 l -0.87 0.76 l -0.87 0.19 l -0.87 0.53 l -0.87 0.51 l -0.87 0.40 l -0.87 0.33 l -0.87 0.49 l -o -18.00 204.94 577.28 636.94 cl -0 0 1 rgb -0.38 setlinewidth -[] 0 setdash -1 setlinecap -1 setlinejoin -10.00 setmiterlimit -np -169.95 278.38 m -0 0 l -o -np -169.95 278.38 m -0 8.99 l -o -77.04 278.38 547.04 577.90 cl -/ps 12 def /Font1 findfont 12 s -0 0 0 rgb -169.95 298.39 (0.005) 1 0 0 t -0 1 1 rgb -0.75 setlinewidth -[] 0 setdash -1 setlinecap -1 setlinejoin -10.00 setmiterlimit -np -277.47 636.94 m -0.14 -1.03 l -0.88 -6.00 l -0.87 -5.98 l -0.87 -6.15 l -0.87 -5.86 l -0.87 -5.45 l -0.87 -6.00 l -0.87 -5.63 l -0.87 -5.28 l -0.87 -5.66 l -0.87 -5.09 l -0.87 -5.20 l -0.87 -4.82 l -0.87 -4.75 l -0.91 -5.14 l -0.87 -5.00 l -0.87 -4.85 l -0.87 -4.44 l -0.88 -4.82 l -0.87 -4.31 l -0.87 -4.45 l -0.87 -4.48 l -0.87 -4.15 l -0.87 -4.00 l -0.87 -4.48 l -0.87 -3.96 l -0.87 -3.99 l -0.87 -3.78 l -0.87 -3.72 l -0.87 -3.65 l -0.87 -3.75 l -0.87 -3.48 l -0.87 -3.59 l -0.87 -3.03 l -0.87 -3.49 l -0.87 -3.03 l -0.87 -3.15 l -0.87 -3.24 l -0.87 -3.13 l -0.87 -3.11 l -0.92 -3.59 l -0.87 -3.04 l -0.87 -2.87 l -0.87 -2.65 l -0.87 -2.75 l -0.87 -2.57 l -0.87 -2.39 l -0.87 -2.70 l -0.87 -2.83 l -0.87 -2.41 l -0.87 -2.33 l -0.87 -2.53 l -0.87 -2.52 l -0.87 -2.19 l -0.87 -2.49 l -0.87 -2.01 l -0.87 -2.09 l -0.87 -2.13 l -0.87 -2.24 l -0.87 -2.03 l -0.87 -2.16 l -0.87 -1.93 l -0.87 -1.88 l -0.87 -1.85 l -0.88 -1.75 l -0.87 -1.94 l -0.91 -1.67 l -0.87 -1.68 l -0.87 -1.89 l -0.87 -1.69 l -0.87 -1.72 l -0.87 -1.71 l -0.87 -1.75 l -0.87 -1.57 l -0.87 -1.86 l -0.87 -1.42 l -0.87 -1.66 l -0.87 -1.56 l -0.87 -1.41 l -0.87 -1.42 l -0.87 -1.41 l -0.87 -1.45 l -0.88 -1.57 l -0.87 -1.34 l -0.87 -1.42 l -0.87 -1.11 l -0.87 -1.08 l -0.87 -1.16 l -0.87 -1.15 l -0.87 -1.00 l -0.87 -1.20 l -0.87 -1.28 l -0.87 -1.39 l -0.91 -1.23 l -0.87 -1.02 l -0.87 -1.04 l -0.87 -0.94 l -0.87 -0.99 l -0.87 -0.91 l -363.95 353.37 lineto -0.88 -1.01 l -0.87 -0.95 l -0.87 -1.14 l -0.87 -0.89 l -0.87 -1.03 l -0.87 -0.95 l -0.87 -0.75 l -0.87 -0.73 l -0.87 -0.68 l -0.87 -0.78 l -0.87 -0.87 l -0.87 -0.89 l -0.87 -0.89 l -0.87 -0.66 l -0.87 -0.82 l -0.87 -0.77 l -0.87 -0.74 l -0.87 -0.55 l -0.87 -0.76 l -0.92 -0.61 l -0.87 -0.61 l -0.87 -0.55 l -0.87 -0.36 l -0.87 -0.59 l -0.87 -0.62 l -0.87 -0.64 l -0.87 -0.81 l -0.87 -0.67 l -0.87 -0.75 l -0.87 -0.71 l -0.87 -0.49 l -0.87 -0.52 l -0.87 -0.37 l -0.87 -0.51 l -0.87 -0.53 l -0.87 -0.51 l -0.87 -0.42 l -0.87 -0.35 l -0.87 -0.37 l -0.87 -0.60 l -0.87 -0.49 l -0.87 -0.37 l -0.87 -0.50 l -0.87 -0.42 l -0.87 -0.34 l -0.92 -0.41 l -0.87 -0.33 l -0.87 -0.50 l -0.87 -0.57 l -0.87 -0.29 l -0.87 -0.35 l -0.87 -0.42 l -0.87 -0.23 l -0.87 -0.47 l -0.87 -0.25 l -0.87 -0.39 l -0.87 -0.32 l -0.87 -0.30 l -0.87 -0.39 l -0.87 -0.32 l -0.87 -0.34 l -0.87 -0.28 l -0.87 -0.37 l -0.88 -0.30 l -0.87 -0.20 l -0.87 -0.27 l -0.87 -0.27 l -0.87 -0.18 l -0.87 -0.36 l -0.87 -0.24 l -0.87 -0.35 l -0.91 -0.26 l -0.87 -0.24 l -0.87 -0.35 l -0.87 -0.36 l -0.87 -0.19 l -0.87 -0.23 l -0.87 -0.21 l -0.87 -0.19 l -0.87 -0.18 l -0.87 -0.18 l -0.88 -0.22 l -0.87 -0.28 l -0.87 -0.19 l -0.87 -0.11 l -0.87 -0.05 l -0.87 -0.10 l -0.87 -0.10 l -0.87 -0.14 l -0.87 0.01 l -0.87 -0.12 l -0.87 -0.16 l -0.87 -0.10 l -0.87 -0.05 l -0.87 -0.09 l -0.87 -0.10 l -0.87 -0.06 l -0.87 -0.08 l -0.91 -0.08 l -451.17 310.65 lineto -0.87 -0.14 l -0.87 -0.12 l -0.87 -0.09 l -0.87 -0.06 l -0.87 -0.19 l 
-0.87 -0.14 l -0.87 -0.01 l -0.87 -0.07 l -0.87 -0.08 l -0.87 -0.12 l -0.87 -0.21 l -0.87 -0.03 l -0.87 0.02 l -0.87 0.06 l -0.87 0.01 l -0.87 -0.09 l -0.87 -0.03 l -0.87 -0.16 l -0.87 -0.09 l -0.87 -0.12 l -0.87 0.06 l -0.87 -0.09 l -0.87 -0.04 l -0.87 -0.06 l -0.92 -0.13 l -0.87 -0.01 l -0.87 0.03 l -0.87 0.02 l -0.87 -0.07 l -0.87 -0.04 l -0.87 -0.01 l -0.87 0.02 l -0.87 0.01 l -0.87 -0.04 l -0.87 -0.06 l -0.87 -0.09 l -0.87 0.15 l -0.87 -0.06 l -0.87 -0.08 l -0.87 0.07 l -0.87 -0.06 l -0.87 0 l -0.87 -0.07 l -0.22 -0.07 l -0.65 0.03 l -0.88 0.17 l -0.87 0.03 l -0.87 0.06 l -0.87 -0.06 l -0.87 -0.05 l -0.87 0.12 l -0.91 -0.14 l -0.87 -0.04 l -0.87 -0.02 l -0.87 0.06 l -0.87 0.08 l -0.87 0.07 l -0.87 0.02 l -0.87 -0.03 l -0.87 0.02 l -0.87 0.02 l -0.87 -0.01 l -0.87 0.05 l -0.88 0.05 l -0.87 0.07 l -0.87 -0.06 l -0.87 -0.01 l -0.87 0 l -0.87 -0.13 l -0.87 0.03 l -0.87 0.01 l -0.87 -0.12 l -0.87 0.10 l -0.87 -0.02 l -0.87 0.08 l -0.87 0.05 l -0.87 0.07 l -0.91 -0.07 l -0.87 -0.02 l -0.87 -0.06 l -0.87 -0.05 l -0.88 0.07 l -0.87 -0.04 l -0.87 0.04 l -0.87 0.14 l -0.87 -0.02 l -0.87 -0.05 l -0.87 -0.02 l -0.87 0.13 l -0.87 0.11 l -0.87 -0.07 l -o -18.00 204.94 577.28 636.94 cl -0 1 1 rgb -0.38 setlinewidth -[] 0 setdash -1 setlinecap -1 setlinejoin -10.00 setmiterlimit -np -488.85 278.38 m -0 0 l -o -np -488.85 278.38 m -0 8.99 l -o -77.04 278.38 547.04 577.90 cl -/ps 12 def /Font1 findfont 12 s -0 0 0 rgb -488.85 297.33 (0.001) 1 0 0 t -0.75 setlinewidth -[] 0 setdash -1 setlinecap -1 setlinejoin -10.00 setmiterlimit -np -77.04 308.37 m -470.00 0 l -o -ep -%%Trailer -%%Pages: 1 -%%EOF diff --git a/inst/doc/shrinkage-v-iterations.pdf b/inst/doc/shrinkage-v-iterations.pdf deleted file mode 100644 index 12a3ed4..0000000 Binary files a/inst/doc/shrinkage-v-iterations.pdf and /dev/null differ diff --git a/inst/doc/srcltx.sty b/inst/doc/srcltx.sty deleted file mode 100644 index a38d206..0000000 --- a/inst/doc/srcltx.sty +++ /dev/null @@ -1,172 +0,0 @@ -%% -%% This is file `srcltx.sty', -%% generated with the docstrip utility. -%% -%% The original source files were: -%% -%% srcltx.dtx (with options: `package,latex') -%% -%% This package is in the public domain. It comes with no guarantees -%% and no reserved rights. You can use or modify this package at your -%% own risk. 
-%% Originally written by: Aleksander Simonic -%% Current maintainer: Stefan Ulrich -%% -\NeedsTeXFormat{LaTeX2e} -\ProvidesPackage{srcltx}[2006/11/12 v1.6 Source specials for inverse search in DVI files] -\newif\ifSRCOK \SRCOKtrue -\newif\ifsrc@debug@ -\newif\ifsrc@dviwin@ -\newif\ifsrc@winedt@\src@winedt@true -\newif\ifsrc@everypar@\src@everypar@true -\newif\ifsrc@everymath@\src@everymath@true -\RequirePackage{ifthen} -\DeclareOption{active}{\SRCOKtrue} -\DeclareOption{inactive}{\SRCOKfalse} -\DeclareOption{nowinedt}{\src@winedt@false} -\DeclareOption{debug}{\src@debug@true} -\DeclareOption{nopar}{\global\src@everypar@false} -\DeclareOption{nomath}{\global\src@everymath@false} -\newcommand*\src@maybe@space{} -\let\src@maybe@space\space -\DeclareOption{dviwin}{\let\src@maybe@space\relax} -\ExecuteOptions{active} -\ProcessOptions -\newcount\src@lastline -\global\src@lastline=-1 -\newcommand*\src@debug{} -\def\src@debug#1{\ifsrc@debug@\typeout{DBG: |#1|}\fi} -\newcommand*\MainFile{} -\def\MainFile{\jobname.tex} -\newcommand*\CurrentInput{} -\gdef\CurrentInput{\MainFile} -\newcommand*\WinEdt{} -\def\WinEdt#1{\ifsrc@winedt@\typeout{:#1}\fi} -\newcommand\src@AfterFi{} -\def\src@AfterFi#1\fi{\fi#1} -\AtBeginDocument{% - \@ifpackageloaded{soul}{% - \let\src@SOUL@\SOUL@ - \def\SOUL@#1{% - \ifSRCOK - \SRCOKfalse\src@SOUL@{#1}\SRCOKtrue - \else - \src@AfterFi\src@SOUL@{#1}% - \fi - }% - }{}% -} -\newcommand*\srcIncludeHook[1]{\protected@xdef\CurrentInput{#1.tex}} -\newcommand*\srcInputHook[1]{% - \src@getfilename@with@ext{#1}% -} -\newcommand*\src@spec{} -\def\src@spec{% - \ifSRCOK - \ifnum\inputlineno>\src@lastline - \global\src@lastline=\inputlineno - \src@debug{% - src:\the\inputlineno\src@maybe@space\CurrentInput}% - \special{src:\the\inputlineno\src@maybe@space\CurrentInput}% - \fi - \fi -} -\newcommand\src@before@file@hook{} -\newcommand\src@after@file@hook{} -\def\src@before@file@hook{% - \WinEdt{<+ \CurrentInput}% - \global\src@lastline=0 - \ifSRCOK\special{src:1\src@maybe@space\CurrentInput}\fi -} -\def\src@after@file@hook#1{% - \WinEdt{<-}% - \global\src@lastline=\inputlineno - \global\advance\src@lastline by -1% - \gdef\CurrentInput{#1}% - \src@spec -} -\newcommand*\src@fname{}% -\newcommand*\src@tempa{}% -\newcommand*\src@extensions@path{}% -\newcommand*\src@getfilename@with@ext{}% -\def\src@extensions@path#1.#2\end{% - \ifthenelse{\equal{#2}{}}{% - \protected@edef\src@extensions@last{#1}% - \let\src@tempa\relax - }{% - \def\src@tempa{\src@extensions@path#2\end}% - }% - \src@tempa -} -\def\src@getfilename@with@ext#1{% - \expandafter\src@extensions@path#1.\end - \ifthenelse{\equal{\src@extensions@last}{tex}}{% - \protected@xdef\CurrentInput{#1}% - }{% - \protected@xdef\CurrentInput{#1.tex}% - }% - \PackageInfo{srcltx}{Expanded filename `#1' to `\CurrentInput'}% -} -\newcommand*\src@include{} -\newcommand*\src@@include{} -\let\src@include\include -\def\include#1{% - \src@spec - \clearpage - \expandafter\src@@include\expandafter{\CurrentInput}{#1}% -}% -\def\src@@include#1#2{% - \srcIncludeHook{#2}% - \src@before@file@hook - \src@include{#2}% - \src@after@file@hook{#1}% -} -\newcommand*\src@input{} -\newcommand*\src@@input{} -\newcommand*\src@@@input{} -\let\src@input\input -\def\input{\src@spec\@ifnextchar\bgroup\src@@input\@@input}% -\def\src@@input#1{% - \expandafter\src@@@input\expandafter{\CurrentInput}{#1}% -} -\def\src@@@input#1#2{% - \srcInputHook{#2}% - \src@before@file@hook - \src@input{#2}% - \src@after@file@hook{#1}% -} -\newcommand\Input{} -\let\Input\input -\ifsrc@everypar@ - 
\newcommand*\src@old@everypar{} - \let\src@old@everypar\everypar - \newtoks\src@new@everypar - \let\everypar\src@new@everypar - \everypar\expandafter{\the\src@old@everypar} - \src@old@everypar{\the\src@new@everypar\src@spec} -\fi -\ifsrc@everymath@ - \def\@tempa#1\the\everymath#2\delimiter{{#1\src@spec\the\everymath#2}} - \frozen@everymath=\expandafter\@tempa\the\frozen@everymath\delimiter -\fi -\newcommand*\src@bibliography{} -\newcommand*\src@@bibliography{} -\let\src@bibliography\bibliography -\def\bibliography#1{% - \expandafter\src@@bibliography\expandafter{\CurrentInput}{#1}% -} -\def\src@@bibliography#1#2{% - \protected@xdef\CurrentInput{\jobname.bbl}% - \src@before@file@hook - \src@bibliography{#2}% - \src@after@file@hook{#1}% -} -\newcommand*\src@old@output{} -\let\src@old@output\output -\newtoks\src@new@output -\let\output\src@new@output -\output\expandafter{\the\src@old@output} -\src@old@output{\SRCOKfalse\the\src@new@output} -\endinput -%% -%% End of file `srcltx.sty'. diff --git a/man/basehaz.gbm.Rd b/man/basehaz.gbm.Rd index d094e3c..dba7b7f 100644 --- a/man/basehaz.gbm.Rd +++ b/man/basehaz.gbm.Rd @@ -1,38 +1,58 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/basehaz.gbm.R \name{basehaz.gbm} \alias{basehaz.gbm} -\title{ Baseline hazard function } -\description{ - Computes the Breslow estimator of the baseline hazard function for a proportional hazard regression model -} +\title{Baseline hazard function} \usage{ -basehaz.gbm(t, delta, f.x, - t.eval = NULL, - smooth = FALSE, - cumulative = TRUE) +basehaz.gbm(t, delta, f.x, t.eval = NULL, smooth = FALSE, + cumulative = TRUE) } \arguments{ - \item{t}{ the survival times } - \item{delta}{ the censoring indicator } - \item{f.x}{ the predicted values of the regression model on the log hazard scale } - \item{t.eval}{ values at which the baseline hazard will be evaluated } - \item{smooth}{ if \code{TRUE} \code{basehaz.gbm} will smooth the estimated baseline hazard using Friedman's super smoother \code{\link{supsmu}}} - \item{cumulative}{ if \code{TRUE} the cumulative survival function will be computed } +\item{t}{The survival times.} + +\item{delta}{The censoring indicator.} + +\item{f.x}{The predicted values of the regression model on the log hazard +scale.} + +\item{t.eval}{Values at which the baseline hazard will be evaluated.} + +\item{smooth}{If \code{TRUE} \code{basehaz.gbm} will smooth the estimated +baseline hazard using Friedman's super smoother \code{\link{supsmu}}.} + +\item{cumulative}{If \code{TRUE} the cumulative survival function will be +computed.} +} +\value{ +A vector of length equal to the length of t (or of length + \code{t.eval} if \code{t.eval} is not \code{NULL}) containing the baseline + hazard evaluated at t (or at \code{t.eval} if \code{t.eval} is not + \code{NULL}). If \code{cumulative} is set to \code{TRUE} then the returned + vector evaluates the cumulative hazard function at those values. +} +\description{ +Computes the Breslow estimator of the baseline hazard function for a +proportional hazard regression model. } \details{ - The proportional hazard model assumes h(t|x)=lambda(t)*exp(f(x)). \code{\link{gbm}} can estimate the f(x) component via partial likelihood. After estimating f(x), \code{basehaz.gbm} can compute the a nonparametric estimate of lambda(t). +The proportional hazard model assumes h(t|x)=lambda(t)*exp(f(x)). +\code{\link{gbm}} can estimate the f(x) component via partial likelihood. 
+After estimating f(x), \code{basehaz.gbm} can compute the a nonparametric +estimate of lambda(t). } -\value{ - a vector of length equal to the length of t (or of length \code{t.eval} if \code{t.eval} is not \code{NULL}) containing the baseline hazard evaluated at t (or at \code{t.eval} if \code{t.eval} is not \code{NULL}). If \code{cumulative} is set to \code{TRUE} then the returned vector evaluates the cumulative hazard function at those values. +\references{ +N. Breslow (1972). "Discussion of `Regression Models and +Life-Tables' by D.R. Cox," Journal of the Royal Statistical Society, Series +B, 34(2):216-217. + +N. Breslow (1974). "Covariance analysis of censored survival data," +Biometrics 30:89-99. } -\references{N. Breslow (1972). "Disussion of `Regression Models and Life-Tables' by D.R. Cox," Journal of the Royal Statistical Society, Series B, 34(2):216-217. - -N. Breslow (1974). "Covariance analysis of censored survival data," Biometrics 30:89-99. -} -\author{ Greg Ridgeway \email{gregridgeway@gmail.com}} - - - -\seealso{ \code{\link[survival]{survfit}}, \code{\link{gbm}} } - -\keyword{ methods } -\keyword{ survival } +\seealso{ +\code{\link[survival]{survfit}}, \code{\link{gbm}} +} +\author{ +Greg Ridgeway \email{gregridgeway@gmail.com} +} +\keyword{methods} +\keyword{survival} diff --git a/man/calibrate.plot.Rd b/man/calibrate.plot.Rd index 319893f..282350e 100644 --- a/man/calibrate.plot.Rd +++ b/man/calibrate.plot.Rd @@ -1,64 +1,63 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/calibrate.plot.R \name{calibrate.plot} \alias{calibrate.plot} \title{Calibration plot} -\description{ -An experimental diagnostic tool that plots the fitted values versus the actual average values. -Currently developed for only \code{distribution="bernoulli"}. -} \usage{ -calibrate.plot(y,p, - distribution="bernoulli", - replace=TRUE, - line.par=list(col="black"), - shade.col="lightyellow", - shade.density=NULL, - rug.par=list(side=1), - xlab="Predicted value", - ylab="Observed average", - xlim=NULL,ylim=NULL, - knots=NULL,df=6, - ...) +calibrate.plot(y, p, distribution = "bernoulli", replace = TRUE, + line.par = list(col = "black"), shade.col = "lightyellow", + shade.density = NULL, rug.par = list(side = 1), + xlab = "Predicted value", ylab = "Observed average", xlim = NULL, + ylim = NULL, knots = NULL, df = 6, ...) } \arguments{ - \item{y}{ the outcome 0-1 variable } - \item{p}{ the predictions estimating E(y|x) } - \item{distribution}{the loss function used in creating \code{p}. - \code{bernoulli} and \code{poisson} are currently the - only special options. All others default to squared error - assuming \code{gaussian}} - \item{replace}{ determines whether this plot will replace or overlay the current plot. - \code{replace=FALSE} is useful for comparing the calibration of several - methods} - \item{line.par}{ graphics parameters for the line } - \item{shade.col}{ color for shading the 2 SE region. \code{shade.col=NA} implies no 2 SE - region} - \item{shade.density}{ the \code{density} parameter for \code{\link{polygon}}} - \item{rug.par}{graphics parameters passed to \code{\link{rug}}} - \item{xlab}{x-axis label corresponding to the predicted values} - \item{ylab}{y-axis label corresponding to the observed average} - \item{xlim,ylim}{x and y-axis limits. 
If not specified te function will select - limits} - \item{knots,df}{these parameters are passed directly to - \code{\link[splines]{ns}} for constructing a natural spline - smoother for the calibration curve} - \item{...}{ other graphics parameters passed on to the plot function } +\item{y}{The outcome 0-1 variable.} + +\item{p}{The predictions estimating E(y|x).} + +\item{distribution}{The loss function used in creating \code{p}. +\code{bernoulli} and \code{poisson} are currently the only special options. +All others default to squared error assuming \code{gaussian}.} + +\item{replace}{Determines whether this plot will replace or overlay the +current plot. \code{replace=FALSE} is useful for comparing the calibration +of several methods.} + +\item{line.par}{Graphics parameters for the line.} + +\item{shade.col}{Color for shading the 2 SE region. \code{shade.col=NA} +implies no 2 SE region.} + +\item{shade.density}{The \code{density} parameter for \code{\link{polygon}}.} + +\item{rug.par}{Graphics parameters passed to \code{\link{rug}}.} + +\item{xlab}{x-axis label corresponding to the predicted values.} + +\item{ylab}{y-axis label corresponding to the observed average.} + +\item{xlim, ylim}{x- and y-axis limits. If not specified te function will +select limits.} + +\item{knots, df}{These parameters are passed directly to +\code{\link[splines]{ns}} for constructing a natural spline smoother for the +calibration curve.} + +\item{...}{Additional optional arguments to be passed onto +\code{\link[graphics]{plot}}} +} +\value{ +No return values. +} +\description{ +An experimental diagnostic tool that plots the fitted values versus the +actual average values. Currently only available when +\code{distribution = "bernoulli"}. } \details{ -Uses natural splines to estimate E(y|p). Well-calibrated predictions -imply that E(y|p) = p. The plot also includes a pointwise 95% confidence -band. +Uses natural splines to estimate E(y|p). Well-calibrated predictions imply +that E(y|p) = p. The plot also includes a pointwise 95% confidence band. } -\value{ -\code{calibrate.plot} returns no values. -} -\references{ -J.F. Yates (1982). "External correspondence: decomposition of the mean -probability score," Organisational Behaviour and Human Performance 30:132-156. - -D.J. Spiegelhalter (1986). "Probabilistic Prediction in Patient Management -and Clinical Trials," Statistics in Medicine 5:421-433. -} -\author{Greg Ridgeway \email{gregridgeway@gmail.com}} \examples{ # Don't want R CMD check to think there is a dependency on rpart # so comment out the example @@ -70,4 +69,15 @@ #p <- predict(glm1,type="response") #calibrate.plot(y, p, xlim=c(0,0.6), ylim=c(0,0.6)) } -\keyword{ hplot } +\references{ +J.F. Yates (1982). "External correspondence: decomposition of +the mean probability score," Organisational Behaviour and Human Performance +30:132-156. + +D.J. Spiegelhalter (1986). "Probabilistic Prediction in Patient Management +and Clinical Trials," Statistics in Medicine 5:421-433. 
+} +\author{ +Greg Ridgeway \email{gregridgeway@gmail.com} +} +\keyword{hplot} diff --git a/man/gbm-internal.Rd b/man/gbm-internal.Rd deleted file mode 100644 index 379e036..0000000 --- a/man/gbm-internal.Rd +++ /dev/null @@ -1,48 +0,0 @@ -\name{gbm-internal} -\alias{guessDist} -\alias{getStratify} -\alias{getCVgroup} -\alias{checkMissing} -\alias{checkID} -\alias{checkWeights} -\alias{checkOffset} -\alias{getVarNames} -\alias{gbmCluster} -\title{gbm internal functions} -\description{Helper functions for preprocessing data prior to - building the model} -\usage{ -guessDist(y) -getCVgroup(distribution, class.stratify.cv, y, i.train, cv.folds, group) -getStratify(strat, d) -checkMissing(x, y) -checkWeights(w, n) -checkID(id) -checkOffset(o, y) -getVarNames(x) -gbmCluster(n) -} -\arguments{ - \item{y}{The response variable} - \item{d, distribution}{The distribution, either specified by the user or - implied} - \item{class.stratify.cv}{Whether or not to stratify, if provided by - the user} - \item{i.train}{Computed internally by \code{gbm}} - \item{group}{The group, if using \code{distibution='pairwise'}} - \item{strat}{Whether or not to stratify} - \item{cv.folds}{The number of cross-validation folds} - \item{x}{The design matrix} - \item{id}{The interaction depth} - \item{w}{The weights} - \item{n}{The number of cores to use in the cluster.} - \item{o}{The offset} -% \item{verbose}{Whether or not to print output to screen} -% \item{X, var.monotone, n.trees, n.minobsinnode, shrinkage, bag.fraction, -% var.names, response.name, cv.group}{Arguments passed -% through to gbm.fit} -} % Close \arguments -\details{ - These are functions used internally by \code{gbm} and not intended for - direct use by the user. -} diff --git a/man/gbm-internals.Rd b/man/gbm-internals.Rd new file mode 100644 index 0000000..d3c562c --- /dev/null +++ b/man/gbm-internals.Rd @@ -0,0 +1,66 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gbm-internals.R +\name{guessDist} +\alias{guessDist} +\alias{getStratify} +\alias{getCVgroup} +\alias{checkMissing} +\alias{checkID} +\alias{checkWeights} +\alias{checkOffset} +\alias{getVarNames} +\alias{gbmCluster} +\title{gbm internal functions} +\usage{ +guessDist(y) + +getCVgroup(distribution, class.stratify.cv, y, i.train, cv.folds, group) + +getStratify(strat, d) + +checkMissing(x, y) + +checkWeights(w, n) + +checkID(id) + +checkOffset(o, y) + +getVarNames(x) + +gbmCluster(n) +} +\arguments{ +\item{y}{The response variable.} + +\item{class.stratify.cv}{Whether or not to stratify, if provided by the user.} + +\item{i.train}{Computed internally by \code{gbm}.} + +\item{cv.folds}{The number of cross-validation folds.} + +\item{group}{The group, if using \code{distibution = "pairwise"}.} + +\item{strat}{Whether or not to stratify.} + +\item{d, distribution}{The distribution, either specified by the user or +implied.} + +\item{x}{The design matrix.} + +\item{w}{The weights.} + +\item{n}{The number of cores to use in the cluster.} + +\item{id}{The interaction depth.} + +\item{o}{The offset.} +} +\description{ +Helper functions for preprocessing data prior to building a \code{"gbm"} +object. +} +\details{ +These are functions used internally by \code{gbm} and not intended for direct +use by the user. 
+} diff --git a/man/gbm-package.Rd b/man/gbm-package.Rd index 4761685..52dc51b 100644 --- a/man/gbm-package.Rd +++ b/man/gbm-package.Rd @@ -1,64 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gbm-package.R +\docType{package} \name{gbm-package} \alias{gbm-package} -\docType{package} -\title{Generalized Boosted Regression Models} -\description{This package implements extensions to Freund and -Schapire's AdaBoost algorithm and J. Friedman's gradient -boosting machine. Includes regression methods for least -squares, absolute loss, logistic, Poisson, Cox proportional -hazards partial likelihood, multinomial, t-distribution, -AdaBoost exponential loss, Learning to Rank, and -Huberized hinge loss.} +\title{Generalized Boosted Regression Models (GBMs)} +\description{ +This package implements extensions to Freund and Schapire's AdaBoost +algorithm and J. Friedman's gradient boosting machine. Includes regression +methods for least squares, absolute loss, logistic, Poisson, Cox +proportional hazards partial likelihood, multinomial, t-distribution, +AdaBoost exponential loss, Learning to Rank, and Huberized hinge loss. +} \details{ -\tabular{ll}{ -Package: \tab gbm\cr -Version: \tab 2.1\cr -Date: \tab 2013-05-10\cr -Depends: \tab R (>= 2.9.0), survival, lattice, mgcv\cr -License: \tab GPL (version 2 or newer)\cr -URL: \tab http://code.google.com/p/gradientboostedmodels/\cr -} -Index: -\preformatted{basehaz.gbm Baseline hazard function -calibrate.plot Calibration plot -gbm Generalized Boosted Regression Modeling -gbm.object Generalized Boosted Regression Model Object -gbm.perf GBM performance -plot.gbm Marginal plots of fitted gbm objects -predict.gbm Predict method for GBM Model Fits -pretty.gbm.tree Print gbm tree components -quantile.rug Quantile rug plot -relative.influence Methods for estimating relative influence -shrink.gbm L1 shrinkage of the predictor variables in a GBM -shrink.gbm.pred Predictions from a shrunked GBM -summary.gbm Summary of a gbm object -} - -Further information is available in the following vignettes: -\tabular{ll}{ -\code{gbm} \tab Generalized Boosted Models: A guide to the gbm package (source, pdf)\cr} -} % Close \details -\author{ - Greg Ridgeway \email{gregridgeway@gmail.com} with contributions by - Daniel Edwards, Brian Kriegler, Stefan Schroedl and Harry Southworth. +Further information is available in vignette: +\code{browseVignettes(package = "gbm")} } \references{ -Y. Freund and R.E. Schapire (1997) \dQuote{A decision-theoretic generalization of -on-line learning and an application to boosting,} \emph{Journal of Computer and -System Sciences,} 55(1):119-139. +Y. Freund and R.E. Schapire (1997) \dQuote{A decision-theoretic +generalization of on-line learning and an application to boosting,} +\emph{Journal of Computer and System Sciences,} 55(1):119-139. -G. Ridgeway (1999). \dQuote{The state of boosting,} \emph{Computing Science and -Statistics} 31:172-181. +G. Ridgeway (1999). \dQuote{The state of boosting,} \emph{Computing Science +and Statistics} 31:172-181. -J.H. Friedman, T. Hastie, R. Tibshirani (2000). \dQuote{Additive Logistic Regression: -a Statistical View of Boosting,} \emph{Annals of Statistics} 28(2):337-374. +J.H. Friedman, T. Hastie, R. Tibshirani (2000). \dQuote{Additive Logistic +Regression: a Statistical View of Boosting,} \emph{Annals of Statistics} +28(2):337-374. -J.H. Friedman (2001). \dQuote{Greedy Function Approximation: A Gradient Boosting -Machine,} \emph{Annals of Statistics} 29(5):1189-1232. +J.H. 
Friedman (2001). \dQuote{Greedy Function Approximation: A Gradient +Boosting Machine,} \emph{Annals of Statistics} 29(5):1189-1232. -J.H. Friedman (2002). \dQuote{Stochastic Gradient Boosting,} \emph{Computational Statistics -and Data Analysis} 38(4):367-378. +J.H. Friedman (2002). \dQuote{Stochastic Gradient Boosting,} +\emph{Computational Statistics and Data Analysis} 38(4):367-378. -The \href{http://www-stat.stanford.edu/~jhf/R-MART.html}{MART} website. -} % Close \references +The \url{http://statweb.stanford.edu/~jhf/R-MART} website. +} +\author{ +Greg Ridgeway \email{gregridgeway@gmail.com} with contributions by +Daniel Edwards, Brian Kriegler, Stefan Schroedl and Harry Southworth. +} \keyword{package} diff --git a/man/gbm.Rd b/man/gbm.Rd index 7fce1ac..f52616a 100644 --- a/man/gbm.Rd +++ b/man/gbm.Rd @@ -1,293 +1,328 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gbm.R \name{gbm} \alias{gbm} -\alias{gbm.more} -\alias{gbm.fit} -\title{Generalized Boosted Regression Modeling} -\description{Fits generalized boosted regression models.} +\title{Generalized Boosted Regression Modeling (GBM)} \usage{ -gbm(formula = formula(data), - distribution = "bernoulli", - data = list(), - weights, - var.monotone = NULL, - n.trees = 100, - interaction.depth = 1, - n.minobsinnode = 10, - shrinkage = 0.001, - bag.fraction = 0.5, - train.fraction = 1.0, - cv.folds=0, - keep.data = TRUE, - verbose = "CV", - class.stratify.cv=NULL, - n.cores = NULL) - -gbm.fit(x, y, - offset = NULL, - misc = NULL, - distribution = "bernoulli", - w = NULL, - var.monotone = NULL, - n.trees = 100, - interaction.depth = 1, - n.minobsinnode = 10, - shrinkage = 0.001, - bag.fraction = 0.5, - nTrain = NULL, - train.fraction = NULL, - keep.data = TRUE, - verbose = TRUE, - var.names = NULL, - response.name = "y", - group = NULL) - -gbm.more(object, - n.new.trees = 100, - data = NULL, - weights = NULL, - offset = NULL, - verbose = NULL) -} -\arguments{\item{formula}{a symbolic description of the model to be fit. The formula may include an offset term (e.g. y~offset(n)+x). If \code{keep.data=FALSE} in the initial call to \code{gbm} then it is the user's responsibility to resupply the offset to \code{\link{gbm.more}}.} -\item{distribution}{either a character string specifying the name of the distribution to use or a list with a component \code{name} specifying the distribution and any additional parameters needed. If not specified, \code{gbm} will try to guess: if the response has only 2 unique values, bernoulli is assumed; otherwise, if the response is a factor, multinomial is assumed; otherwise, if the response has class "Surv", coxph is assumed; otherwise, gaussian is assumed. - -Currently available options are "gaussian" (squared error), "laplace" (absolute loss), "tdist" (t-distribution loss), "bernoulli" (logistic regression for 0-1 outcomes), -"huberized" (huberized hinge loss for 0-1 outcomes), -"multinomial" (classification when there are more than 2 classes), "adaboost" (the AdaBoost exponential loss for 0-1 outcomes), "poisson" (count outcomes), "coxph" (right censored observations), "quantile", or "pairwise" (ranking measure using the LambdaMart algorithm). - -If quantile regression is specified, \code{distribution} must be a list of the form \code{list(name="quantile",alpha=0.25)} where \code{alpha} is the quantile to estimate. The current version's quantile regression method does not handle non-constant weights and will stop. 
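To make the list-form \code{distribution} specifications described in this argument concrete, the following is a minimal illustrative R sketch; it is not part of the patch, and the simulated data and the specific parameter values are assumptions chosen only for demonstration.

    # Illustrative sketch only -- not part of the patch. The simulated data and
    # the parameter values below are assumptions chosen for demonstration.
    library(gbm)
    set.seed(101)
    n <- 1000
    x <- runif(n)
    y <- rnorm(n, mean = 2 * x, sd = 0.5 + x)  # heteroscedastic response
    d <- data.frame(x = x, y = y)

    # Quantile regression via the documented list form of `distribution`:
    # estimate the conditional 25th percentile of y given x.
    fit_q <- gbm(y ~ x, data = d,
                 distribution = list(name = "quantile", alpha = 0.25),
                 n.trees = 500, interaction.depth = 1, shrinkage = 0.1)

    # t-distribution loss with user-chosen degrees of freedom (the default is 4).
    fit_t <- gbm(y ~ x, data = d,
                 distribution = list(name = "tdist", df = 6),
                 n.trees = 500, interaction.depth = 1, shrinkage = 0.1)

    # Predictions from the quantile model (predict.gbm requires n.trees).
    p_q <- predict(fit_q, newdata = d, n.trees = 500)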
- -If "tdist" is specified, the default degrees of freedom is 4 and this can be controlled by specifying \code{distribution=list(name="tdist", df=DF)} where \code{DF} is your chosen degrees of freedom. - -If "pairwise" regression is specified, \code{distribution} must be a list -of the form -\code{list(name="pairwise",group=...,metric=...,max.rank=...)} -(\code{metric} and \code{max.rank} are optional, see -below). \code{group} is a character vector with the column names of -\code{data} that jointly indicate the group an instance belongs to -(typically a query in Information Retrieval applications). -For training, only pairs of -instances from the same group and with different target labels can be -considered. \code{metric} is the IR measure to use, one of -\describe{ -\item{\code{conc}:}{Fraction of concordant pairs; for binary labels, - this is equivalent to the Area under the ROC Curve} -\item{\code{mrr}:}{Mean reciprocal rank of the highest-ranked positive instance} -\item{\code{map}:}{Mean average precision, a generalization of - \code{mrr} to multiple positive instances} -\item{\code{ndcg:}}{Normalized discounted cumulative gain. The score is - the weighted sum (DCG) of the user-supplied target values, weighted by - log(rank+1), and normalized to the maximum achievable value. This - is the default if the user did not specify a metric.} +gbm(formula = formula(data), distribution = "bernoulli", + data = list(), weights, var.monotone = NULL, n.trees = 100, + interaction.depth = 1, n.minobsinnode = 10, shrinkage = 0.1, + bag.fraction = 0.5, train.fraction = 1, cv.folds = 0, + keep.data = TRUE, verbose = FALSE, class.stratify.cv = NULL, + n.cores = NULL) +} +\arguments{ +\item{formula}{A symbolic description of the model to be fit. The formula +may include an offset term (e.g. y~offset(n)+x). If +\code{keep.data = FALSE} in the initial call to \code{gbm} then it is the +user's responsibility to resupply the offset to \code{\link{gbm.more}}.} + +\item{distribution}{Either a character string specifying the name of the +distribution to use or a list with a component \code{name} specifying the +distribution and any additional parameters needed. If not specified, +\code{gbm} will try to guess: if the response has only 2 unique values, +bernoulli is assumed; otherwise, if the response is a factor, multinomial is +assumed; otherwise, if the response has class \code{"Surv"}, coxph is +assumed; otherwise, gaussian is assumed. + +Currently available options are \code{"gaussian"} (squared error), +\code{"laplace"} (absolute loss), \code{"tdist"} (t-distribution loss), +\code{"bernoulli"} (logistic regression for 0-1 outcomes), +\code{"huberized"} (huberized hinge loss for 0-1 outcomes), classes), +\code{"adaboost"} (the AdaBoost exponential loss for 0-1 outcomes), +\code{"poisson"} (count outcomes), \code{"coxph"} (right censored +observations), \code{"quantile"}, or \code{"pairwise"} (ranking measure +using the LambdaMart algorithm). + +If quantile regression is specified, \code{distribution} must be a list of +the form \code{list(name = "quantile", alpha = 0.25)} where \code{alpha} is +the quantile to estimate. The current version's quantile regression method +does not handle non-constant weights and will stop. + +If \code{"tdist"} is specified, the default degrees of freedom is 4 and +this can be controlled by specifying +\code{distribution = list(name = "tdist", df = DF)} where \code{DF} is your +chosen degrees of freedom. 
+ +If "pairwise" regression is specified, \code{distribution} must be a list of +the form \code{list(name="pairwise",group=...,metric=...,max.rank=...)} +(\code{metric} and \code{max.rank} are optional, see below). \code{group} is +a character vector with the column names of \code{data} that jointly +indicate the group an instance belongs to (typically a query in Information +Retrieval applications). For training, only pairs of instances from the same +group and with different target labels can be considered. \code{metric} is +the IR measure to use, one of +\describe{ + \item{list("conc")}{Fraction of concordant pairs; for binary labels, this + is equivalent to the Area under the ROC Curve} + \item{:}{Fraction of concordant pairs; for binary labels, this is + equivalent to the Area under the ROC Curve} + \item{list("mrr")}{Mean reciprocal rank of the highest-ranked positive + instance} + \item{:}{Mean reciprocal rank of the highest-ranked positive instance} + \item{list("map")}{Mean average precision, a generalization of \code{mrr} + to multiple positive instances}\item{:}{Mean average precision, a + generalization of \code{mrr} to multiple positive instances} + \item{list("ndcg:")}{Normalized discounted cumulative gain. The score is + the weighted sum (DCG) of the user-supplied target values, weighted + by log(rank+1), and normalized to the maximum achievable value. This + is the default if the user did not specify a metric.} } \code{ndcg} and \code{conc} allow arbitrary target values, while binary -targets \{0,1\} are expected for \code{map} and \code{mrr}. For -\code{ndcg} and \code{mrr}, a cut-off can be chosen using a positive -integer parameter \code{max.rank}. If left unspecified, all ranks are -taken into account. - -Note that splitting of instances into training and validation sets -follows group boundaries and therefore only approximates the specified -\code{train.fraction} ratio (the same applies to cross-validation -folds). Internally, queries are randomly shuffled before training, to -avoid bias. +targets {0,1} are expected for \code{map} and \code{mrr}. For \code{ndcg} +and \code{mrr}, a cut-off can be chosen using a positive integer parameter +\code{max.rank}. If left unspecified, all ranks are taken into account. + +Note that splitting of instances into training and validation sets follows +group boundaries and therefore only approximates the specified +\code{train.fraction} ratio (the same applies to cross-validation folds). +Internally, queries are randomly shuffled before training, to avoid bias. Weights can be used in conjunction with pairwise metrics, however it is assumed that they are constant for instances from the same group. -For details and background on the algorithm, see e.g. Burges (2010). -} - -\item{data}{an optional data frame containing the variables in the model. By default the variables are taken from \code{environment(formula)}, typically the environment from which \code{gbm} is called. If \code{keep.data=TRUE} in the initial call to \code{gbm} then \code{gbm} stores a copy with the object. If \code{keep.data=FALSE} then subsequent calls to \code{\link{gbm.more}} must resupply the same dataset. It becomes the user's responsibility to resupply the same data at this point.} -\item{weights}{an optional vector of weights to be used in the fitting process. Must be positive but do not need to be normalized. 
If \code{keep.data=FALSE} in the initial call to \code{gbm} then it is the user's responsibility to resupply the weights to \code{\link{gbm.more}}.} -\item{var.monotone}{an optional vector, the same length as the number of predictors, indicating which variables have a monotone increasing (+1), decreasing (-1), or arbitrary (0) relationship with the outcome.} -\item{n.trees}{the total number of trees to fit. This is equivalent to the number of iterations and the number of basis functions in the additive expansion.} -\item{cv.folds}{Number of cross-validation folds to perform. If \code{cv.folds}>1 then \code{gbm}, in addition to the usual fit, will perform a cross-validation, calculate an estimate of generalization error returned in \code{cv.error}.} -\item{interaction.depth}{The maximum depth of variable interactions. 1 implies an additive model, 2 implies a model with up to 2-way interactions, etc.} -\item{n.minobsinnode}{minimum number of observations in the trees terminal nodes. Note that this is the actual number of observations not the total weight.} -\item{shrinkage}{a shrinkage parameter applied to each tree in the expansion. Also known as the learning rate or step-size reduction.} -\item{bag.fraction}{the fraction of the training set observations randomly selected to propose the next tree in the expansion. This introduces randomnesses into the model fit. If \code{bag.fraction}<1 then running the same model twice will result in similar but different fits. \code{gbm} uses the R random number generator so \code{set.seed} can ensure that the model can be reconstructed. Preferably, the user can save the returned \code{\link{gbm.object}} using \code{\link{save}}.} +For details and background on the algorithm, see e.g. Burges (2010).} + +\item{data}{an optional data frame containing the variables in the model. By +default the variables are taken from \code{environment(formula)}, typically +the environment from which \code{gbm} is called. If \code{keep.data=TRUE} in +the initial call to \code{gbm} then \code{gbm} stores a copy with the +object. If \code{keep.data=FALSE} then subsequent calls to +\code{\link{gbm.more}} must resupply the same dataset. It becomes the user's +responsibility to resupply the same data at this point.} + +\item{weights}{an optional vector of weights to be used in the fitting +process. Must be positive but do not need to be normalized. If +\code{keep.data=FALSE} in the initial call to \code{gbm} then it is the +user's responsibility to resupply the weights to \code{\link{gbm.more}}.} + +\item{var.monotone}{an optional vector, the same length as the number of +predictors, indicating which variables have a monotone increasing (+1), +decreasing (-1), or arbitrary (0) relationship with the outcome.} + +\item{n.trees}{Integer specifying the total number of trees to fit. This is +equivalent to the number of iterations and the number of basis functions in +the additive expansion. Default is 100.} + +\item{interaction.depth}{Integer specifying the maximum depth of each tree +(i.e., the highest level of variable interactions allowed). A value of 1 +implies an additive model, a value of 2 implies a model with up to 2-way +interactions, etc. Default is 1.} + +\item{n.minobsinnode}{Integer specifying the minimum number of observations +in the terminal nodes of the trees. Note that this is the actual number of +observations, not the total weight.} + +\item{shrinkage}{a shrinkage parameter applied to each tree in the +expansion. 
Also known as the learning rate or step-size reduction; 0.001 to +0.1 usually work, but a smaller learning rate typically requires more trees. +Default is 0.1.} + +\item{bag.fraction}{the fraction of the training set observations randomly +selected to propose the next tree in the expansion. This introduces +randomnesses into the model fit. If \code{bag.fraction} < 1 then running the +same model twice will result in similar but different fits. \code{gbm} uses +the R random number generator so \code{set.seed} can ensure that the model +can be reconstructed. Preferably, the user can save the returned +\code{\link{gbm.object}} using \code{\link{save}}. Default is 0.5.} + \item{train.fraction}{The first \code{train.fraction * nrows(data)} - observations are used to fit the \code{gbm} and the remainder are used - for computing out-of-sample estimates of the loss function.} -\item{nTrain}{An integer representing the number of cases on which to - train. This is the preferred way of specification for \code{gbm.fit}; - The option \code{train.fraction} in \code{gbm.fit} is deprecated and - only maintained for backward compatibility. These two parameters are - mutually exclusive. If both are unspecified, all data is used for training.} -\item{keep.data}{a logical variable indicating whether to keep the data and an index of the data stored with the object. Keeping the data and index makes subsequent calls to \code{\link{gbm.more}} faster at the cost of storing an extra copy of the dataset.} -\item{object}{a \code{gbm} object created from an initial call to \code{\link{gbm}}.} -\item{n.new.trees}{the number of additional trees to add to \code{object}.} -\item{verbose}{If TRUE, gbm will print out progress and performance indicators. If this option is left unspecified for gbm.more then it uses \code{verbose} from \code{object}.} -\item{class.stratify.cv}{whether or not the cross-validation should be stratified by class. Defaults to \code{TRUE} for \code{distribution="multinomial"} and is only implementated for \code{multinomial} and \code{bernoulli}. The purpose of stratifying the cross-validation is to help avoiding situations in which training sets do not contain all classes.} -\item{x, y}{For \code{gbm.fit}: \code{x} is a data frame or data matrix containing the predictor variables and \code{y} is the vector of outcomes. The number of rows in \code{x} must be the same as the length of \code{y}.} -\item{offset}{a vector of values for the offset} -\item{misc}{For \code{gbm.fit}: \code{misc} is an R object that is simply passed on to the gbm engine. It can be used for additional data for the specific distribution. Currently it is only used for passing the censoring indicator for the Cox proportional hazards model.} -\item{w}{For \code{gbm.fit}: \code{w} is a vector of weights of the same length as the \code{y}.} -\item{var.names}{For \code{gbm.fit}: A vector of strings of length equal to the number of columns of \code{x} containing the names of the predictor variables.} -\item{response.name}{For \code{gbm.fit}: A character string label for the response variable.} -\item{group}{\code{group} used when \code{distribution = 'pairwise'.}} +observations are used to fit the \code{gbm} and the remainder are used for +computing out-of-sample estimates of the loss function.} + +\item{cv.folds}{Number of cross-validation folds to perform. 
If +\code{cv.folds}>1 then \code{gbm}, in addition to the usual fit, will +perform a cross-validation, calculate an estimate of generalization error +returned in \code{cv.error}.} + +\item{keep.data}{a logical variable indicating whether to keep the data and +an index of the data stored with the object. Keeping the data and index +makes subsequent calls to \code{\link{gbm.more}} faster at the cost of +storing an extra copy of the dataset.} + +\item{verbose}{Logical indicating whether or not to print out progress and +performance indicators (\code{TRUE}). If this option is left unspecified for +\code{gbm.more}, then it uses \code{verbose} from \code{object}. Default is +\code{FALSE}.} + +\item{class.stratify.cv}{Logical indicating whether or not the +cross-validation should be stratified by class. Defaults to \code{TRUE} for +\code{distribution = "multinomial"} and is only implemented for +\code{"multinomial"} and \code{"bernoulli"}. The purpose of stratifying the +cross-validation is to help avoiding situations in which training sets do +not contain all classes.} + \item{n.cores}{The number of CPU cores to use. The cross-validation loop - will attempt to send different CV folds off to different cores. If - \code{n.cores} is not specified by the user, it is guessed using the - \code{detectCores} function in the \code{parallel} package. Note that - the documentation for \code{detectCores} makes clear that it is not - failsave and could return a spurious number of available cores.} -} - -\details{See the \href{../doc/gbm.pdf}{gbm vignette} for technical details. - -This package implements the generalized boosted modeling framework. Boosting is the process of iteratively adding basis functions in a greedy fashion so that each additional basis function further reduces the selected loss function. This implementation closely follows Friedman's Gradient Boosting Machine (Friedman, 2001). - -In addition to many of the features documented in the Gradient Boosting Machine, \code{gbm} offers additional features including the out-of-bag estimator for the optimal number of iterations, the ability to store and manipulate the resulting \code{gbm} object, and a variety of other loss functions that had not previously had associated boosting algorithms, including the Cox partial likelihood for censored data, the poisson likelihood for count outcomes, and a gradient boosting implementation to minimize the AdaBoost exponential loss function. - -\code{gbm.fit} provides the link between R and the C++ gbm engine. \code{gbm} is a front-end to \code{gbm.fit} that uses the familiar R modeling formulas. However, \code{\link[stats]{model.frame}} is very slow if there are many predictor variables. For power-users with many variables use \code{gbm.fit}. For general practice \code{gbm} is preferable.} - -\value{ \code{gbm}, \code{gbm.fit}, and \code{gbm.more} return a \code{\link{gbm.object}}. } - -\references{ -Y. Freund and R.E. Schapire (1997) \dQuote{A decision-theoretic generalization of on-line learning and an application to boosting,} \emph{Journal of Computer and System Sciences,} 55(1):119-139. - -G. Ridgeway (1999). \dQuote{The state of boosting,} \emph{Computing Science and Statistics} 31:172-181. - -J.H. Friedman, T. Hastie, R. Tibshirani (2000). \dQuote{Additive Logistic Regression: a Statistical View of Boosting,} \emph{Annals of Statistics} 28(2):337-374. - -J.H. Friedman (2001). \dQuote{Greedy Function Approximation: A Gradient Boosting Machine,} \emph{Annals of Statistics} 29(5):1189-1232. - -J.H. 
Friedman (2002). \dQuote{Stochastic Gradient Boosting,} \emph{Computational Statistics and Data Analysis} 38(4):367-378. - -B. Kriegler (2007). \href{http://statistics.ucla.edu/theses/uclastat-dissertation-2007:2}{Cost-Sensitive Stochastic Gradient Boosting Within a Quantitative Regression Framework}. PhD dissertation, UCLA Statistics. - -C. Burges (2010). \dQuote{From RankNet to LambdaRank to LambdaMART: An Overview,} Microsoft Research Technical Report MSR-TR-2010-82. - -\href{http://sites.google.com/site/gregridgeway}{Greg Ridgeway's site}. - -The \href{http://www-stat.stanford.edu/~jhf/R-MART.html}{MART} website. } - -\author{Greg Ridgeway \email{gregridgeway@gmail.com} - -Quantile regression code developed by Brian Kriegler \email{bk@stat.ucla.edu} - -t-distribution, and multinomial code developed by Harry Southworth and Daniel Edwards - -Pairwise code developed by Stefan Schroedl \email{schroedl@a9.com}} - -\seealso{ \code{\link{gbm.object}}, \code{\link{gbm.perf}}, \code{\link{plot.gbm}}, - \code{\link{predict.gbm}}, \code{\link{summary.gbm}}, \code{\link{pretty.gbm.tree}}. } - -\examples{ # A least squares regression example # create some data - +will attempt to send different CV folds off to different cores. If +\code{n.cores} is not specified by the user, it is guessed using the +\code{detectCores} function in the \code{parallel} package. Note that the +documentation for \code{detectCores} makes clear that it is not failsafe and +could return a spurious number of available cores.} +} +\value{ +A \code{\link{gbm.object}} object. +} +\description{ +Fits generalized boosted regression models. For technical details, see the +vignette: \code{utils::browseVignettes("gbm")}. +} +\details{ +\code{gbm.fit} provides the link between R and the C++ gbm engine. +\code{gbm} is a front-end to \code{gbm.fit} that uses the familiar R +modeling formulas. However, \code{\link[stats]{model.frame}} is very slow if +there are many predictor variables. For power-users with many variables use +\code{gbm.fit}. For general practice \code{gbm} is preferable. + + +This package implements the generalized boosted modeling framework. Boosting +is the process of iteratively adding basis functions in a greedy fashion so +that each additional basis function further reduces the selected loss +function. This implementation closely follows Friedman's Gradient Boosting +Machine (Friedman, 2001). + +In addition to many of the features documented in the Gradient Boosting +Machine, \code{gbm} offers additional features including the out-of-bag +estimator for the optimal number of iterations, the ability to store and +manipulate the resulting \code{gbm} object, and a variety of other loss +functions that had not previously had associated boosting algorithms, +including the Cox partial likelihood for censored data, the poisson +likelihood for count outcomes, and a gradient boosting implementation to +minimize the AdaBoost exponential loss function. 
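To make the preceding details paragraph concrete, a small sketch of one of the loss functions it mentions, the Poisson likelihood for count outcomes (simulated data; variable names are hypothetical):

library(gbm)
set.seed(4)
dat_pois <- data.frame(x1 = runif(300), x2 = runif(300))
dat_pois$y <- rpois(300, lambda = exp(0.5 + dat_pois$x1 - dat_pois$x2))
fit_pois <- gbm(y ~ x1 + x2, data = dat_pois, distribution = "poisson",
                n.trees = 300, shrinkage = 0.05)
# Predictions are on the log (link) scale unless type = "response" is requested
head(predict(fit_pois, newdata = dat_pois, n.trees = 300, type = "response"))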
+} +\examples{ +# +# A least squares regression example +# + +# Simulate data +set.seed(101) # for reproducibility N <- 1000 X1 <- runif(N) -X2 <- 2*runif(N) -X3 <- ordered(sample(letters[1:4],N,replace=TRUE),levels=letters[4:1]) -X4 <- factor(sample(letters[1:6],N,replace=TRUE)) -X5 <- factor(sample(letters[1:3],N,replace=TRUE)) -X6 <- 3*runif(N) -mu <- c(-1,0,1,2)[as.numeric(X3)] - -SNR <- 10 # signal-to-noise ratio -Y <- X1**1.5 + 2 * (X2**.5) + mu -sigma <- sqrt(var(Y)/SNR) -Y <- Y + rnorm(N,0,sigma) - -# introduce some missing values -X1[sample(1:N,size=500)] <- NA -X4[sample(1:N,size=300)] <- NA - -data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6) - -# fit initial model -gbm1 <- -gbm(Y~X1+X2+X3+X4+X5+X6, # formula - data=data, # dataset - var.monotone=c(0,0,0,0,0,0), # -1: monotone decrease, - # +1: monotone increase, - # 0: no monotone restrictions - distribution="gaussian", # see the help for other choices - n.trees=1000, # number of trees - shrinkage=0.05, # shrinkage or learning rate, - # 0.001 to 0.1 usually work - interaction.depth=3, # 1: additive model, 2: two-way interactions, etc. - bag.fraction = 0.5, # subsampling fraction, 0.5 is probably best - train.fraction = 0.5, # fraction of data for training, - # first train.fraction*N used for training - n.minobsinnode = 10, # minimum total weight needed in each node - cv.folds = 3, # do 3-fold cross-validation - keep.data=TRUE, # keep a copy of the dataset with the object - verbose=FALSE, # don't print out progress - n.cores=1) # use only a single core (detecting #cores is - # error-prone, so avoided here) - -# check performance using an out-of-bag estimator -# OOB underestimates the optimal number of iterations -best.iter <- gbm.perf(gbm1,method="OOB") +X2 <- 2 * runif(N) +X3 <- ordered(sample(letters[1:4], N, replace = TRUE), levels = letters[4:1]) +X4 <- factor(sample(letters[1:6], N, replace = TRUE)) +X5 <- factor(sample(letters[1:3], N, replace = TRUE)) +X6 <- 3 * runif(N) +mu <- c(-1, 0, 1, 2)[as.numeric(X3)] +SNR <- 10 # signal-to-noise ratio +Y <- X1 ^ 1.5 + 2 * (X2 ^ 0.5) + mu +sigma <- sqrt(var(Y) / SNR) +Y <- Y + rnorm(N, 0, sigma) +X1[sample(1:N,size=500)] <- NA # introduce some missing values +X4[sample(1:N,size=300)] <- NA # introduce some missing values +data <- data.frame(Y, X1, X2, X3, X4, X5, X6) + +# Fit a GBM +set.seed(102) # for reproducibility +gbm1 <- gbm(Y ~ ., data = data, var.monotone = c(0, 0, 0, 0, 0, 0), + distribution = "gaussian", n.trees = 100, shrinkage = 0.1, + interaction.depth = 3, bag.fraction = 0.5, train.fraction = 0.5, + n.minobsinnode = 10, cv.folds = 5, keep.data = TRUE, + verbose = FALSE, n.cores = 1) + +# Check performance using the out-of-bag (OOB) error; the OOB error typically +# underestimates the optimal number of iterations +best.iter <- gbm.perf(gbm1, method = "OOB") print(best.iter) -# check performance using a 50\% heldout test set -best.iter <- gbm.perf(gbm1,method="test") +# Check performance using the 50\% heldout test set +best.iter <- gbm.perf(gbm1, method = "test") print(best.iter) -# check performance using 5-fold cross-validation -best.iter <- gbm.perf(gbm1,method="cv") +# Check performance using 5-fold cross-validation +best.iter <- gbm.perf(gbm1, method = "cv") print(best.iter) -# plot the performance # plot variable influence -summary(gbm1,n.trees=1) # based on the first tree -summary(gbm1,n.trees=best.iter) # based on the estimated best number of trees - -# compactly print the first and last trees for curiosity -print(pretty.gbm.tree(gbm1,1)) 
-print(pretty.gbm.tree(gbm1,gbm1$n.trees)) - -# make some new data +# Plot relative influence of each variable +par(mfrow = c(1, 2)) +summary(gbm1, n.trees = 1) # using first tree +summary(gbm1, n.trees = best.iter) # using estimated best number of trees + +# Compactly print the first and last trees for curiosity +print(pretty.gbm.tree(gbm1, i.tree = 1)) +print(pretty.gbm.tree(gbm1, i.tree = gbm1$n.trees)) + +# Simulate new data +set.seed(103) # for reproducibility N <- 1000 X1 <- runif(N) -X2 <- 2*runif(N) -X3 <- ordered(sample(letters[1:4],N,replace=TRUE)) -X4 <- factor(sample(letters[1:6],N,replace=TRUE)) -X5 <- factor(sample(letters[1:3],N,replace=TRUE)) -X6 <- 3*runif(N) -mu <- c(-1,0,1,2)[as.numeric(X3)] - -Y <- X1**1.5 + 2 * (X2**.5) + mu + rnorm(N,0,sigma) - -data2 <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6) - -# predict on the new data using "best" number of trees -# f.predict generally will be on the canonical scale (logit,log,etc.) -f.predict <- predict(gbm1,data2,best.iter) +X2 <- 2 * runif(N) +X3 <- ordered(sample(letters[1:4], N, replace = TRUE)) +X4 <- factor(sample(letters[1:6], N, replace = TRUE)) +X5 <- factor(sample(letters[1:3], N, replace = TRUE)) +X6 <- 3 * runif(N) +mu <- c(-1, 0, 1, 2)[as.numeric(X3)] +Y <- X1 ^ 1.5 + 2 * (X2 ^ 0.5) + mu + rnorm(N, 0, sigma) +data2 <- data.frame(Y, X1, X2, X3, X4, X5, X6) + +# Predict on the new data using the "best" number of trees; by default, +# predictions will be on the link scale +Yhat <- predict(gbm1, newdata = data2, n.trees = best.iter, type = "link") # least squares error -print(sum((data2$Y-f.predict)^2)) - -# create marginal plots -# plot variable X1,X2,X3 after "best" iterations -par(mfrow=c(1,3)) -plot(gbm1,1,best.iter) -plot(gbm1,2,best.iter) -plot(gbm1,3,best.iter) -par(mfrow=c(1,1)) -# contour plot of variables 1 and 2 after "best" iterations -plot(gbm1,1:2,best.iter) -# lattice plot of variables 2 and 3 -plot(gbm1,2:3,best.iter) -# lattice plot of variables 3 and 4 -plot(gbm1,3:4,best.iter) - -# 3-way plots -plot(gbm1,c(1,2,6),best.iter,cont=20) -plot(gbm1,1:3,best.iter) -plot(gbm1,2:4,best.iter) -plot(gbm1,3:5,best.iter) - -# do another 100 iterations -gbm2 <- gbm.more(gbm1,100, - verbose=FALSE) # stop printing detailed progress -} -\keyword{models} -\keyword{nonlinear} -\keyword{survival} -\keyword{nonparametric} -\keyword{tree} +print(sum((data2$Y - Yhat)^2)) + +# Construct univariate partial dependence plots +p1 <- plot(gbm1, i.var = 1, n.trees = best.iter) +p2 <- plot(gbm1, i.var = 2, n.trees = best.iter) +p3 <- plot(gbm1, i.var = "X3", n.trees = best.iter) # can use index or name +grid.arrange(p1, p2, p3, ncol = 3) + +# Construct bivariate partial dependence plots +plot(gbm1, i.var = 1:2, n.trees = best.iter) +plot(gbm1, i.var = c("X2", "X3"), n.trees = best.iter) +plot(gbm1, i.var = 3:4, n.trees = best.iter) + +# Construct trivariate partial dependence plots +plot(gbm1, i.var = c(1, 2, 6), n.trees = best.iter, + continuous.resolution = 20) +plot(gbm1, i.var = 1:3, n.trees = best.iter) +plot(gbm1, i.var = 2:4, n.trees = best.iter) +plot(gbm1, i.var = 3:5, n.trees = best.iter) + +# Add more (i.e., 100) boosting iterations to the ensemble +gbm2 <- gbm.more(gbm1, n.new.trees = 100, verbose = FALSE) +} +\references{ +Y. Freund and R.E. Schapire (1997) \dQuote{A decision-theoretic +generalization of on-line learning and an application to boosting,} +\emph{Journal of Computer and System Sciences,} 55(1):119-139. + +G. Ridgeway (1999). 
\dQuote{The state of boosting,} \emph{Computing Science +and Statistics} 31:172-181. + +J.H. Friedman, T. Hastie, R. Tibshirani (2000). \dQuote{Additive Logistic +Regression: a Statistical View of Boosting,} \emph{Annals of Statistics} +28(2):337-374. + +J.H. Friedman (2001). \dQuote{Greedy Function Approximation: A Gradient +Boosting Machine,} \emph{Annals of Statistics} 29(5):1189-1232. + +J.H. Friedman (2002). \dQuote{Stochastic Gradient Boosting,} +\emph{Computational Statistics and Data Analysis} 38(4):367-378. + +B. Kriegler (2007). Cost-Sensitive Stochastic Gradient Boosting Within a +Quantitative Regression Framework. Ph.D. Dissertation. University of +California at Los Angeles, Los Angeles, CA, USA. Advisor(s) Richard A. Berk. +url{https://dl.acm.org/citation.cfm?id=1354603}. + +C. Burges (2010). \dQuote{From RankNet to LambdaRank to LambdaMART: An +Overview,} Microsoft Research Technical Report MSR-TR-2010-82. +} +\seealso{ +\code{\link{gbm.object}}, \code{\link{gbm.perf}}, +\code{\link{plot.gbm}}, \code{\link{predict.gbm}}, \code{\link{summary.gbm}}, +and \code{\link{pretty.gbm.tree}}. +} +\author{ +Greg Ridgeway \email{gregridgeway@gmail.com} + +Quantile regression code developed by Brian Kriegler +\email{bk@stat.ucla.edu} + +t-distribution, and multinomial code developed by Harry Southworth and +Daniel Edwards + +Pairwise code developed by Stefan Schroedl \email{schroedl@a9.com} +} diff --git a/man/gbm.fit.Rd b/man/gbm.fit.Rd new file mode 100644 index 0000000..7288d67 --- /dev/null +++ b/man/gbm.fit.Rd @@ -0,0 +1,220 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gbm.fit.R +\name{gbm.fit} +\alias{gbm.fit} +\title{Generalized Boosted Regression Modeling (GBM)} +\usage{ +gbm.fit(x, y, offset = NULL, misc = NULL, distribution = "bernoulli", + w = NULL, var.monotone = NULL, n.trees = 100, + interaction.depth = 1, n.minobsinnode = 10, shrinkage = 0.001, + bag.fraction = 0.5, nTrain = NULL, train.fraction = NULL, + keep.data = TRUE, verbose = TRUE, var.names = NULL, + response.name = "y", group = NULL) +} +\arguments{ +\item{x}{A data frame or matrix containing the predictor variables. The +number of rows in \code{x} must be the same as the length of \code{y}.} + +\item{y}{A vector of outcomes. The number of rows in \code{x} must be the +same as the length of \code{y}.} + +\item{offset}{A vector of offset values.} + +\item{misc}{An R object that is simply passed on to the gbm engine. It can be +used for additional data for the specific distribution. Currently it is only +used for passing the censoring indicator for the Cox proportional hazards +model.} + +\item{distribution}{Either a character string specifying the name of the +distribution to use or a list with a component \code{name} specifying the +distribution and any additional parameters needed. If not specified, +\code{gbm} will try to guess: if the response has only 2 unique values, +bernoulli is assumed; otherwise, if the response is a factor, multinomial is +assumed; otherwise, if the response has class \code{"Surv"}, coxph is +assumed; otherwise, gaussian is assumed. 
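A hedged sketch of the x/y interface shown in the usage block above, which skips the model.frame overhead of the formula front-end; the simulated predictors and 0-1 response are invented, and the distribution is passed explicitly rather than relying on the guessing described here:

library(gbm)
set.seed(5)
X <- data.frame(x1 = runif(500), x2 = runif(500))
y <- as.numeric(X$x1 + rnorm(500, sd = 0.2) > 0.5)  # 0-1 outcome
fit_b <- gbm.fit(x = X, y = y, distribution = "bernoulli",
                 n.trees = 100, shrinkage = 0.01, verbose = FALSE)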
+ +Currently available options are \code{"gaussian"} (squared error), +\code{"laplace"} (absolute loss), \code{"tdist"} (t-distribution loss), +\code{"bernoulli"} (logistic regression for 0-1 outcomes), +\code{"huberized"} (huberized hinge loss for 0-1 outcomes), classes), +\code{"adaboost"} (the AdaBoost exponential loss for 0-1 outcomes), +\code{"poisson"} (count outcomes), \code{"coxph"} (right censored +observations), \code{"quantile"}, or \code{"pairwise"} (ranking measure +using the LambdaMart algorithm). + +If quantile regression is specified, \code{distribution} must be a list of +the form \code{list(name = "quantile", alpha = 0.25)} where \code{alpha} is +the quantile to estimate. The current version's quantile regression method +does not handle non-constant weights and will stop. + +If \code{"tdist"} is specified, the default degrees of freedom is 4 and +this can be controlled by specifying +\code{distribution = list(name = "tdist", df = DF)} where \code{DF} is your +chosen degrees of freedom. + +If "pairwise" regression is specified, \code{distribution} must be a list of +the form \code{list(name="pairwise",group=...,metric=...,max.rank=...)} +(\code{metric} and \code{max.rank} are optional, see below). \code{group} is +a character vector with the column names of \code{data} that jointly +indicate the group an instance belongs to (typically a query in Information +Retrieval applications). For training, only pairs of instances from the same +group and with different target labels can be considered. \code{metric} is +the IR measure to use, one of +\describe{ + \item{list("conc")}{Fraction of concordant pairs; for binary labels, this + is equivalent to the Area under the ROC Curve} + \item{:}{Fraction of concordant pairs; for binary labels, this is + equivalent to the Area under the ROC Curve} + \item{list("mrr")}{Mean reciprocal rank of the highest-ranked positive + instance} + \item{:}{Mean reciprocal rank of the highest-ranked positive instance} + \item{list("map")}{Mean average precision, a generalization of \code{mrr} + to multiple positive instances}\item{:}{Mean average precision, a + generalization of \code{mrr} to multiple positive instances} + \item{list("ndcg:")}{Normalized discounted cumulative gain. The score is + the weighted sum (DCG) of the user-supplied target values, weighted + by log(rank+1), and normalized to the maximum achievable value. This + is the default if the user did not specify a metric.} +} + +\code{ndcg} and \code{conc} allow arbitrary target values, while binary +targets {0,1} are expected for \code{map} and \code{mrr}. For \code{ndcg} +and \code{mrr}, a cut-off can be chosen using a positive integer parameter +\code{max.rank}. If left unspecified, all ranks are taken into account. + +Note that splitting of instances into training and validation sets follows +group boundaries and therefore only approximates the specified +\code{train.fraction} ratio (the same applies to cross-validation folds). +Internally, queries are randomly shuffled before training, to avoid bias. + +Weights can be used in conjunction with pairwise metrics, however it is +assumed that they are constant for instances from the same group. + +For details and background on the algorithm, see e.g. 
Burges (2010).} + +\item{w}{A vector of weights of the same length as the \code{y}.} + +\item{var.monotone}{an optional vector, the same length as the number of +predictors, indicating which variables have a monotone increasing (+1), +decreasing (-1), or arbitrary (0) relationship with the outcome.} + +\item{n.trees}{the total number of trees to fit. This is equivalent to the +number of iterations and the number of basis functions in the additive +expansion.} + +\item{interaction.depth}{The maximum depth of variable interactions. A value +of 1 implies an additive model, a value of 2 implies a model with up to 2-way +interactions, etc. Default is \code{1}.} + +\item{n.minobsinnode}{Integer specifying the minimum number of observations +in the trees terminal nodes. Note that this is the actual number of +observations not the total weight.} + +\item{shrinkage}{The shrinkage parameter applied to each tree in the +expansion. Also known as the learning rate or step-size reduction; 0.001 to +0.1 usually work, but a smaller learning rate typically requires more trees. +Default is \code{0.1}.} + +\item{bag.fraction}{The fraction of the training set observations randomly +selected to propose the next tree in the expansion. This introduces +randomnesses into the model fit. If \code{bag.fraction} < 1 then running the +same model twice will result in similar but different fits. \code{gbm} uses +the R random number generator so \code{set.seed} can ensure that the model +can be reconstructed. Preferably, the user can save the returned +\code{\link{gbm.object}} using \code{\link{save}}. Default is \code{0.5}.} + +\item{nTrain}{An integer representing the number of cases on which to train. +This is the preferred way of specification for \code{gbm.fit}; The option +\code{train.fraction} in \code{gbm.fit} is deprecated and only maintained +for backward compatibility. These two parameters are mutually exclusive. If +both are unspecified, all data is used for training.} + +\item{train.fraction}{The first \code{train.fraction * nrows(data)} +observations are used to fit the \code{gbm} and the remainder are used for +computing out-of-sample estimates of the loss function.} + +\item{keep.data}{Logical indicating whether or not to keep the data and an +index of the data stored with the object. Keeping the data and index makes +subsequent calls to \code{\link{gbm.more}} faster at the cost of storing an +extra copy of the dataset.} + +\item{verbose}{Logical indicating whether or not to print out progress and +performance indicators (\code{TRUE}). If this option is left unspecified for +\code{gbm.more}, then it uses \code{verbose} from \code{object}. Default is +\code{FALSE}.} + +\item{var.names}{Vector of strings of length equal to the number of columns +of \code{x} containing the names of the predictor variables.} + +\item{response.name}{Character string label for the response variable.} + +\item{group}{The \code{group} to use when \code{distribution = "pairwise"}.} +} +\value{ +A \code{\link{gbm.object}} object. +} +\description{ +Workhorse function providing the link between R and the C++ gbm engine. +\code{gbm} is a front-end to \code{gbm.fit} that uses the familiar R +modeling formulas. However, \code{\link[stats]{model.frame}} is very slow if +there are many predictor variables. For power-users with many variables use +\code{gbm.fit}. For general practice \code{gbm} is preferable. +} +\details{ +This package implements the generalized boosted modeling framework. 
Boosting +is the process of iteratively adding basis functions in a greedy fashion so +that each additional basis function further reduces the selected loss +function. This implementation closely follows Friedman's Gradient Boosting +Machine (Friedman, 2001). + +In addition to many of the features documented in the Gradient Boosting +Machine, \code{gbm} offers additional features including the out-of-bag +estimator for the optimal number of iterations, the ability to store and +manipulate the resulting \code{gbm} object, and a variety of other loss +functions that had not previously had associated boosting algorithms, +including the Cox partial likelihood for censored data, the poisson +likelihood for count outcomes, and a gradient boosting implementation to +minimize the AdaBoost exponential loss function. +} +\references{ +Y. Freund and R.E. Schapire (1997) \dQuote{A decision-theoretic +generalization of on-line learning and an application to boosting,} +\emph{Journal of Computer and System Sciences,} 55(1):119-139. + +G. Ridgeway (1999). \dQuote{The state of boosting,} \emph{Computing Science +and Statistics} 31:172-181. + +J.H. Friedman, T. Hastie, R. Tibshirani (2000). \dQuote{Additive Logistic +Regression: a Statistical View of Boosting,} \emph{Annals of Statistics} +28(2):337-374. + +J.H. Friedman (2001). \dQuote{Greedy Function Approximation: A Gradient +Boosting Machine,} \emph{Annals of Statistics} 29(5):1189-1232. + +J.H. Friedman (2002). \dQuote{Stochastic Gradient Boosting,} +\emph{Computational Statistics and Data Analysis} 38(4):367-378. + +B. Kriegler (2007). Cost-Sensitive Stochastic Gradient Boosting Within a +Quantitative Regression Framework. Ph.D. Dissertation. University of +California at Los Angeles, Los Angeles, CA, USA. Advisor(s) Richard A. Berk. +url{https://dl.acm.org/citation.cfm?id=1354603}. + +C. Burges (2010). \dQuote{From RankNet to LambdaRank to LambdaMART: An +Overview,} Microsoft Research Technical Report MSR-TR-2010-82. +} +\seealso{ +\code{\link{gbm.object}}, \code{\link{gbm.perf}}, +\code{\link{plot.gbm}}, \code{\link{predict.gbm}}, \code{\link{summary.gbm}}, +and \code{\link{pretty.gbm.tree}}. +} +\author{ +Greg Ridgeway \email{gregridgeway@gmail.com} + +Quantile regression code developed by Brian Kriegler +\email{bk@stat.ucla.edu} + +t-distribution, and multinomial code developed by Harry Southworth and +Daniel Edwards + +Pairwise code developed by Stefan Schroedl \email{schroedl@a9.com} +} diff --git a/man/gbm.more.Rd b/man/gbm.more.Rd new file mode 100644 index 0000000..c2a68aa --- /dev/null +++ b/man/gbm.more.Rd @@ -0,0 +1,136 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gbm.more.R +\name{gbm.more} +\alias{gbm.more} +\title{Generalized Boosted Regression Modeling (GBM)} +\usage{ +gbm.more(object, n.new.trees = 100, data = NULL, weights = NULL, + offset = NULL, verbose = NULL) +} +\arguments{ +\item{object}{A \code{\link{gbm.object}} object created from an initial call +to \code{\link{gbm}}.} + +\item{n.new.trees}{Integer specifying the number of additional trees to add +to \code{object}. Default is 100.} + +\item{data}{An optional data frame containing the variables in the model. By +default the variables are taken from \code{environment(formula)}, typically +the environment from which \code{gbm} is called. If \code{keep.data=TRUE} in +the initial call to \code{gbm} then \code{gbm} stores a copy with the +object. 
If \code{keep.data=FALSE} then subsequent calls to +\code{\link{gbm.more}} must resupply the same dataset. It becomes the user's +responsibility to resupply the same data at this point.} + +\item{weights}{An optional vector of weights to be used in the fitting +process. Must be positive but do not need to be normalized. If +\code{keep.data=FALSE} in the initial call to \code{gbm} then it is the +user's responsibility to resupply the weights to \code{\link{gbm.more}}.} + +\item{offset}{A vector of offset values.} + +\item{verbose}{Logical indicating whether or not to print out progress and +performance indicators (\code{TRUE}). If this option is left unspecified for +\code{gbm.more}, then it uses \code{verbose} from \code{object}. Default is +\code{FALSE}.} +} +\value{ +A \code{\link{gbm.object}} object. +} +\description{ +Adds additional trees to a \code{\link{gbm.object}} object. +} +\examples{ +# +# A least squares regression example +# + +# Simulate data +set.seed(101) # for reproducibility +N <- 1000 +X1 <- runif(N) +X2 <- 2 * runif(N) +X3 <- ordered(sample(letters[1:4], N, replace = TRUE), levels = letters[4:1]) +X4 <- factor(sample(letters[1:6], N, replace = TRUE)) +X5 <- factor(sample(letters[1:3], N, replace = TRUE)) +X6 <- 3 * runif(N) +mu <- c(-1, 0, 1, 2)[as.numeric(X3)] +SNR <- 10 # signal-to-noise ratio +Y <- X1 ^ 1.5 + 2 * (X2 ^ 0.5) + mu +sigma <- sqrt(var(Y) / SNR) +Y <- Y + rnorm(N, 0, sigma) +X1[sample(1:N,size=500)] <- NA # introduce some missing values +X4[sample(1:N,size=300)] <- NA # introduce some missing values +data <- data.frame(Y, X1, X2, X3, X4, X5, X6) + +# Fit a GBM +set.seed(102) # for reproducibility +gbm1 <- gbm(Y ~ ., data = data, var.monotone = c(0, 0, 0, 0, 0, 0), + distribution = "gaussian", n.trees = 100, shrinkage = 0.1, + interaction.depth = 3, bag.fraction = 0.5, train.fraction = 0.5, + n.minobsinnode = 10, cv.folds = 5, keep.data = TRUE, + verbose = FALSE, n.cores = 1) + +# Check performance using the out-of-bag (OOB) error; the OOB error typically +# underestimates the optimal number of iterations +best.iter <- gbm.perf(gbm1, method = "OOB") +print(best.iter) + +# Check performance using the 50\% heldout test set +best.iter <- gbm.perf(gbm1, method = "test") +print(best.iter) + +# Check performance using 5-fold cross-validation +best.iter <- gbm.perf(gbm1, method = "cv") +print(best.iter) + +# Plot relative influence of each variable +par(mfrow = c(1, 2)) +summary(gbm1, n.trees = 1) # using first tree +summary(gbm1, n.trees = best.iter) # using estimated best number of trees + +# Compactly print the first and last trees for curiosity +print(pretty.gbm.tree(gbm1, i.tree = 1)) +print(pretty.gbm.tree(gbm1, i.tree = gbm1$n.trees)) + +# Simulate new data +set.seed(103) # for reproducibility +N <- 1000 +X1 <- runif(N) +X2 <- 2 * runif(N) +X3 <- ordered(sample(letters[1:4], N, replace = TRUE)) +X4 <- factor(sample(letters[1:6], N, replace = TRUE)) +X5 <- factor(sample(letters[1:3], N, replace = TRUE)) +X6 <- 3 * runif(N) +mu <- c(-1, 0, 1, 2)[as.numeric(X3)] +Y <- X1 ^ 1.5 + 2 * (X2 ^ 0.5) + mu + rnorm(N, 0, sigma) +data2 <- data.frame(Y, X1, X2, X3, X4, X5, X6) + +# Predict on the new data using the "best" number of trees; by default, +# predictions will be on the link scale +Yhat <- predict(gbm1, newdata = data2, n.trees = best.iter, type = "link") + +# least squares error +print(sum((data2$Y - Yhat)^2)) + +# Construct univariate partial dependence plots +p1 <- plot(gbm1, i.var = 1, n.trees = best.iter) +p2 <- plot(gbm1, i.var = 2, n.trees = best.iter) 
+p3 <- plot(gbm1, i.var = "X3", n.trees = best.iter) # can use index or name +grid.arrange(p1, p2, p3, ncol = 3) + +# Construct bivariate partial dependence plots +plot(gbm1, i.var = 1:2, n.trees = best.iter) +plot(gbm1, i.var = c("X2", "X3"), n.trees = best.iter) +plot(gbm1, i.var = 3:4, n.trees = best.iter) + +# Construct trivariate partial dependence plots +plot(gbm1, i.var = c(1, 2, 6), n.trees = best.iter, + continuous.resolution = 20) +plot(gbm1, i.var = 1:3, n.trees = best.iter) +plot(gbm1, i.var = 2:4, n.trees = best.iter) +plot(gbm1, i.var = 3:5, n.trees = best.iter) + +# Add more (i.e., 100) boosting iterations to the ensemble +gbm2 <- gbm.more(gbm1, n.new.trees = 100, verbose = FALSE) +} diff --git a/man/gbm.object.Rd b/man/gbm.object.Rd index a3f48e6..a489d12 100644 --- a/man/gbm.object.Rd +++ b/man/gbm.object.Rd @@ -1,45 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gbm.object.R \name{gbm.object} \alias{gbm.object} \title{Generalized Boosted Regression Model Object} -\description{These are objects representing fitted \code{gbm}s.} -\section{Structure}{The following components must be included in a legitimate \code{gbm} object.} \value{ -\item{initF}{the "intercept" term, the initial predicted value to which trees -make adjustments} -\item{fit}{a vector containing the fitted values on the scale of regression -function (e.g. log-odds scale for bernoulli, log scale for poisson)} -\item{train.error}{a vector of length equal to the number of fitted trees -containing the value of the loss function for each boosting iteration -evaluated on the training data} +\item{initF}{the "intercept" term, the initial predicted value to +which trees make adjustments} \item{fit}{a vector containing the fitted +values on the scale of regression function (e.g. log-odds scale for +bernoulli, log scale for poisson)} \item{train.error}{a vector of length +equal to the number of fitted trees containing the value of the loss +function for each boosting iteration evaluated on the training data} \item{valid.error}{a vector of length equal to the number of fitted trees containing the value of the loss function for each boosting iteration -evaluated on the validation data} -\item{cv.error}{if \code{cv.folds}<2 this component is NULL. Otherwise, this -component is a vector of length equal to the number of fitted trees -containing a cross-validated estimate of the loss function for each boosting -iteration} -\item{oobag.improve}{a vector of length equal to the number of fitted trees -containing an out-of-bag estimate of the marginal reduction in the expected -value of the loss function. The out-of-bag estimate uses only the training -data and is useful for estimating the optimal number of boosting iterations. -See \code{\link{gbm.perf}}} +evaluated on the validation data} \item{cv.error}{if \code{cv.folds}<2 this +component is NULL. Otherwise, this component is a vector of length equal to +the number of fitted trees containing a cross-validated estimate of the loss +function for each boosting iteration} \item{oobag.improve}{a vector of +length equal to the number of fitted trees containing an out-of-bag estimate +of the marginal reduction in the expected value of the loss function. The +out-of-bag estimate uses only the training data and is useful for estimating +the optimal number of boosting iterations. See \code{\link{gbm.perf}}} \item{trees}{a list containing the tree structures. 
The components are best -viewed using \code{\link{pretty.gbm.tree}}} -\item{c.splits}{a list of all the categorical splits in the collection of -trees. If the \code{trees[[i]]} component of a \code{gbm} object describes a -categorical split then the splitting value will refer to a component of -\code{c.splits}. That component of \code{c.splits} will be a vector of length -equal to the number of levels in the categorical split variable. -1 indicates -left, +1 indicates right, and 0 indicates that the level was not present in the -training data} +viewed using \code{\link{pretty.gbm.tree}}} \item{c.splits}{a list of all +the categorical splits in the collection of trees. If the \code{trees[[i]]} +component of a \code{gbm} object describes a categorical split then the +splitting value will refer to a component of \code{c.splits}. That component +of \code{c.splits} will be a vector of length equal to the number of levels +in the categorical split variable. -1 indicates left, +1 indicates right, +and 0 indicates that the level was not present in the training data} \item{cv.fitted}{If cross-validation was performed, the cross-validation - predicted values on the scale of the linear predictor. That is, the - fitted values from the ith CV-fold, for the model having been trained on - the data in all other folds.} +predicted values on the scale of the linear predictor. That is, the fitted +values from the ith CV-fold, for the model having been trained on the data +in all other folds.} } -\author{Greg Ridgeway \email{gregridgeway@gmail.com}} +\description{ +These are objects representing fitted \code{gbm}s. +} +\section{Structure}{ + The following components must be included in a +legitimate \code{gbm} object. +} + \seealso{ \code{\link{gbm}} } - +\author{ +Greg Ridgeway \email{gregridgeway@gmail.com} +} \keyword{methods} diff --git a/man/gbm.perf.Rd b/man/gbm.perf.Rd index c9d0640..dada47c 100644 --- a/man/gbm.perf.Rd +++ b/man/gbm.perf.Rd @@ -1,46 +1,52 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gbm.perf.R \name{gbm.perf} \alias{gbm.perf} \title{GBM performance} -\description{ -Estimates the optimal number of boosting iterations for a \code{gbm} object and -optionally plots various performance measures -} \usage{ -gbm.perf(object, - plot.it = TRUE, - oobag.curve = FALSE, - overlay = TRUE, - method) +gbm.perf(object, plot.it = TRUE, oobag.curve = FALSE, overlay = TRUE, + method) } \arguments{ -\item{object}{a \code{\link{gbm.object}} created from an initial call to +\item{object}{A \code{\link{gbm.object}} created from an initial call to \code{\link{gbm}}.} -\item{plot.it}{an indicator of whether or not to plot the performance measures. -Setting \code{plot.it=TRUE} creates two plots. The first plot plots -\code{object$train.error} (in black) and \code{object$valid.error} (in red) -versus the iteration number. The scale of the error measurement, shown on the -left vertical axis, depends on the \code{distribution} argument used in the -initial call to \code{\link{gbm}}.} -\item{oobag.curve}{indicates whether to plot the out-of-bag performance measures -in a second plot.} -\item{overlay}{if TRUE and oobag.curve=TRUE then a right y-axis is added to the -training and test error plot and the estimated cumulative improvement in the loss -function is plotted versus the iteration number.} -\item{method}{indicate the method used to estimate the optimal number -of boosting iterations. 
\code{method="OOB"} computes the out-of-bag -estimate and \code{method="test"} uses the test (or validation) dataset -to compute an out-of-sample estimate. \code{method="cv"} extracts the -optimal number of iterations using cross-validation if \code{gbm} was called -with \code{cv.folds}>1} + +\item{plot.it}{An indicator of whether or not to plot the performance +measures. Setting \code{plot.it = TRUE} creates two plots. The first plot +plots \code{object$train.error} (in black) and \code{object$valid.error} +(in red) versus the iteration number. The scale of the error measurement, +shown on the left vertical axis, depends on the \code{distribution} +argument used in the initial call to \code{\link{gbm}}.} + +\item{oobag.curve}{Indicates whether to plot the out-of-bag performance +measures in a second plot.} + +\item{overlay}{If TRUE and oobag.curve=TRUE then a right y-axis is added to +the training and test error plot and the estimated cumulative improvement +in the loss function is plotted versus the iteration number.} + +\item{method}{Indicate the method used to estimate the optimal number of +boosting iterations. \code{method = "OOB"} computes the out-of-bag estimate +and \code{method = "test"} uses the test (or validation) dataset to compute +an out-of-sample estimate. \code{method = "cv"} extracts the optimal number +of iterations using cross-validation if \code{gbm} was called with +\code{cv.folds} > 1.} } \value{ -\code{gbm.perf} returns the estimated optimal number of iterations. The method -of computation depends on the \code{method} argument.} - -\author{Greg Ridgeway \email{gregridgeway@gmail.com}} -\seealso{\code{\link{gbm}}, \code{\link{gbm.object}}} - +\code{gbm.perf} Returns the estimated optimal number of iterations. + The method of computation depends on the \code{method} argument. +} +\description{ +Estimates the optimal number of boosting iterations for a \code{gbm} object +and optionally plots various performance measures +} +\seealso{ +\code{\link{gbm}}, \code{\link{gbm.object}} +} +\author{ +Greg Ridgeway \email{gregridgeway@gmail.com} +} \keyword{nonlinear} +\keyword{nonparametric} \keyword{survival} -\keyword{nonparametric} \keyword{tree} diff --git a/man/gbm.roc.area.Rd b/man/gbm.roc.area.Rd index e310101..cd428d3 100644 --- a/man/gbm.roc.area.Rd +++ b/man/gbm.roc.area.Rd @@ -1,3 +1,5 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ir.measures.R \name{gbm.roc.area} \alias{gbm.roc.area} \alias{gbm.conc} @@ -7,62 +9,71 @@ \alias{ir.measure.map} \alias{ir.measure.ndcg} \alias{perf.pairwise} -\title{ -Compute Information Retrieval measures. +\title{Compute Information Retrieval measures.} +\usage{ +gbm.roc.area(obs, pred) + +gbm.conc(x) + +ir.measure.conc(y.f, max.rank = 0) + +ir.measure.auc(y.f, max.rank = 0) + +ir.measure.mrr(y.f, max.rank) + +ir.measure.map(y.f, max.rank = 0) + +ir.measure.ndcg(y.f, max.rank) + +perf.pairwise(y, f, group, metric = "ndcg", w = NULL, max.rank = 0) +} +\arguments{ +\item{obs}{Observed value.} + +\item{pred}{Predicted value.} + +\item{x}{?.} + +\item{y, y.f, f, w, group, max.rank}{Used internally.} + +\item{metric}{What type of performance measure to compute.} +} +\value{ +The requested performance measure. } \description{ - Functions to compute Information Retrieval measures for pairwise loss for - a single group. The function returns the respective metric, or a negative value if -it is undefined for the given group. 
-} -\usage{ -gbm.roc.area(obs, pred) -ir.measure.conc(y.f, max.rank) -ir.measure.auc(y.f, max.rank) -ir.measure.mrr(y.f, max.rank) -ir.measure.map(y.f, max.rank) -ir.measure.ndcg(y.f, max.rank) -perf.pairwise(y, f, group, metric="ndcg", w=NULL, max.rank=0) -} -%- maybe also 'usage' for other objects documented here. -\arguments{ - \item{obs}{Observed value} - \item{pred}{Predicted value} -\item{metric}{What type of performance measure to compute.} -\item{y, y.f, f, w, group, max.rank}{Used internally.} +Functions to compute Information Retrieval measures for pairwise loss for a +single group. The function returns the respective metric, or a negative +value if it is undefined for the given group. } \details{ -For simplicity, we have no special handling for ties; -instead, we break ties randomly. This is slightly -inaccurate for individual groups, but should have +For simplicity, we have no special handling for ties; instead, we break ties +randomly. This is slightly inaccurate for individual groups, but should have only a small effect on the overall measure. -\code{gbm.conc} computes the concordance index: -Fraction of all pairs (i,j) with i Define data, use random, +##-- or do help(data=index) for the standard data sets. +} \references{ -C. Burges (2010). "From RankNet to LambdaRank to LambdaMART: An Overview", -Microsoft Research Technical Report MSR-TR-2010-82. +C. Burges (2010). "From RankNet to LambdaRank to LambdaMART: An +Overview", Microsoft Research Technical Report MSR-TR-2010-82. +} +\seealso{ +\code{\link{gbm}} } \author{ Stefan Schroedl } - -\seealso{ -\code{\link{gbm}} -} -\examples{ -##---- Should be DIRECTLY executable !! ---- -##-- ==> Define data, use random, -##-- or do help(data=index) for the standard data sets. - -} -\keyword{ models } - +\keyword{models} diff --git a/man/gbmCrossVal.Rd b/man/gbmCrossVal.Rd index 79c8bf5..9a9f7a4 100644 --- a/man/gbmCrossVal.Rd +++ b/man/gbmCrossVal.Rd @@ -1,3 +1,5 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gbmCrossVal.R \name{gbmCrossVal} \alias{gbmCrossVal} \alias{gbmCrossValModelBuild} @@ -5,79 +7,101 @@ \alias{gbmCrossValErr} \alias{gbmCrossValPredictions} \title{Cross-validate a gbm} -\description{Functions for cross-validating gbm. 
These functions are - used internally and are not intended for end-user direct usage.} \usage{ -gbmCrossVal(cv.folds, nTrain, n.cores, -class.stratify.cv, data, -x, y, offset, distribution, w, var.monotone, -n.trees, interaction.depth, n.minobsinnode, -shrinkage, bag.fraction, -var.names, response.name, group) - -gbmCrossValModelBuild(cv.folds, cv.group, n.cores, -i.train, x, y, offset, -distribution, w, var.monotone, -n.trees, interaction.depth, -n.minobsinnode, shrinkage, -bag.fraction, var.names, -response.name, group) - -gbmDoFold(X, i.train, x, y, offset, distribution, w, var.monotone, n.trees, - interaction.depth, n.minobsinnode, shrinkage, bag.fraction, - cv.group, var.names, response.name, group, s) +gbmCrossVal(cv.folds, nTrain, n.cores, class.stratify.cv, data, x, y, + offset, distribution, w, var.monotone, n.trees, interaction.depth, + n.minobsinnode, shrinkage, bag.fraction, var.names, response.name, group) gbmCrossValErr(cv.models, cv.folds, cv.group, nTrain, n.trees) -gbmCrossValPredictions(cv.models, cv.folds, cv.group, -best.iter.cv, distribution, data, y) +gbmCrossValPredictions(cv.models, cv.folds, cv.group, best.iter.cv, + distribution, data, y) +gbmCrossValModelBuild(cv.folds, cv.group, n.cores, i.train, x, y, offset, + distribution, w, var.monotone, n.trees, interaction.depth, + n.minobsinnode, shrinkage, bag.fraction, var.names, response.name, group) +gbmDoFold(X, i.train, x, y, offset, distribution, w, var.monotone, n.trees, + interaction.depth, n.minobsinnode, shrinkage, bag.fraction, cv.group, + var.names, response.name, group, s) } \arguments{ \item{cv.folds}{The number of cross-validation folds.} + \item{nTrain}{The number of training samples.} + \item{n.cores}{The number of cores to use.} -\item{class.stratify.cv}{Whether or not stratified cross-validation - samples are used.} + +\item{class.stratify.cv}{Whether or not stratified cross-validation samples +are used.} + \item{data}{The data.} + \item{x}{The model matrix.} + \item{y}{The response variable.} + \item{offset}{The offset.} + \item{distribution}{The type of loss function. See \code{\link{gbm}}.} + \item{w}{Observation weights.} + \item{var.monotone}{See \code{\link{gbm}}.} + \item{n.trees}{The number of trees to fit.} -\item{interaction.depth}{The degree of allowed interactions. See \code{\link{gbm}}.} + +\item{interaction.depth}{The degree of allowed interactions. See +\code{\link{gbm}}.} + \item{n.minobsinnode}{See \code{\link{gbm}}.} + \item{shrinkage}{See \code{\link{gbm}}.} + \item{bag.fraction}{See \code{\link{gbm}}.} + \item{var.names}{See \code{\link{gbm}}.} + \item{response.name}{See \code{\link{gbm}}.} + \item{group}{Used when \code{distribution = "pairwise"}. See - \code{\link{gbm}}.} +\code{\link{gbm}}.} + +\item{cv.models}{A list containing the models for each fold.} + +\item{cv.group}{A vector indicating the cross-validation fold for each +member of the training set.} + +\item{best.iter.cv}{The iteration with lowest cross-validation error.} + \item{i.train}{Items in the training set.} -\item{cv.models}{A list containing the models for each fold.} -\item{cv.group}{A vector indicating the cross-validation fold for each - member of the training set.} -\item{best.iter.cv}{The iteration with lowest cross-validation error.} + \item{X}{Index (cross-validation fold) on which to subset.} + \item{s}{Random seed.} - -} % Close arguments +} +\value{ +A list containing the cross-validation error and predictions. +} +\description{ +Functions for cross-validating gbm. 
These functions are used internally and +are not intended for end-user direct usage. +} \details{ These functions are not intended for end-user direct usage, but are used -internally by \code{gbm}.} -\value{A list containing the cross-validation error and predictions.} +internally by \code{gbm}. +} \references{ -J.H. Friedman (2001). "Greedy Function Approximation: A Gradient Boosting -Machine," Annals of Statistics 29(5):1189-1232. +J.H. Friedman (2001). "Greedy Function Approximation: A Gradient +Boosting Machine," Annals of Statistics 29(5):1189-1232. -L. Breiman (2001). \href{https://www.stat.berkeley.edu/~breiman/randomforest2001.pdf}{Random Forests}. - +L. Breiman (2001). +\url{https://www.stat.berkeley.edu/users/breiman/randomforest2001.pdf}. } -\author{Greg Ridgeway \email{gregridgeway@gmail.com}} - -\seealso{ \code{\link{gbm}} } - -\keyword{ models } +\seealso{ +\code{\link{gbm}} +} +\author{ +Greg Ridgeway \email{gregridgeway@gmail.com} +} +\keyword{models} diff --git a/man/grid.arrange.Rd b/man/grid.arrange.Rd new file mode 100644 index 0000000..9effe42 --- /dev/null +++ b/man/grid.arrange.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{grid.arrange} +\alias{grid.arrange} +\title{Arrange multiple grobs on a page} +\usage{ +grid.arrange(..., newpage = TRUE) +} +\description{ +See \code{\link[gridExtra]{grid.arrange}} for more details. +} +\keyword{internal} diff --git a/man/interact.gbm.Rd b/man/interact.gbm.Rd index e7104aa..e25ec2e 100644 --- a/man/interact.gbm.Rd +++ b/man/interact.gbm.Rd @@ -1,49 +1,55 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/interact.gbm.R \name{interact.gbm} \alias{interact.gbm} -\title{ Estimate the strength of interaction effects } -\description{ Computes Friedman's H-statistic to assess the strength of variable interactions. } +\title{Estimate the strength of interaction effects} \usage{ -interact.gbm(x, - data, - i.var = 1, - n.trees = x$n.trees) +interact.gbm(x, data, i.var = 1, n.trees = x$n.trees) } \arguments{ - \item{x}{ a \code{\link{gbm.object}} fitted using a call to \code{\link{gbm}}} - \item{data}{ the dataset used to construct \code{x}. If the original dataset is - large, a random subsample may be used to accelerate the computation in - \code{interact.gbm}} - \item{i.var}{a vector of indices or the names of the variables for compute - the interaction effect. If using indices, the variables are indexed in the - same order that they appear in the initial \code{gbm} formula.} - \item{n.trees}{ the number of trees used to generate the plot. Only the first - \code{n.trees} trees will be used} +\item{x}{A \code{\link{gbm.object}} fitted using a call to \code{\link{gbm}}.} + +\item{data}{The dataset used to construct \code{x}. If the original dataset +is large, a random subsample may be used to accelerate the computation in +\code{interact.gbm}.} + +\item{i.var}{A vector of indices or the names of the variables for compute +the interaction effect. If using indices, the variables are indexed in the +same order that they appear in the initial \code{gbm} formula.} + +\item{n.trees}{The number of trees used to generate the plot. Only the first +\code{n.trees} trees will be used.} +} +\value{ +Returns the value of \eqn{H}. +} +\description{ +Computes Friedman's H-statistic to assess the strength of variable +interactions. 
} \details{ \code{interact.gbm} computes Friedman's H-statistic to assess the relative strength of interaction effects in non-linear models. H is on the scale of -[0-1] with higher values indicating larger interaction effects. To connect to -a more familiar measure, if \eqn{x_1} and \eqn{x_2} are uncorrelated covariates -with mean 0 and variance 1 and the model is of the form -\deqn{y=\beta_0+\beta_1x_1+\beta_2x_2+\beta_3x_3} -then +[0-1] with higher values indicating larger interaction effects. To connect +to a more familiar measure, if \eqn{x_1} and \eqn{x_2} are uncorrelated +covariates with mean 0 and variance 1 and the model is of the form +\deqn{y=\beta_0+\beta_1x_1+\beta_2x_2+\beta_3x_3} then \deqn{H=\frac{\beta_3}{\sqrt{\beta_1^2+\beta_2^2+\beta_3^2}}} -Note that if the main effects are weak, the estimated H will be unstable. For -example, if (in the case of a two-way interaction) neither main effect is in -the selected model (relative influence is zero), the result will be 0/0. Also, -with weak main effects, rounding errors can result in values of H > 1 which are -not possible. -} -\value{ -Returns the value of \eqn{H}. +Note that if the main effects are weak, the estimated H will be unstable. +For example, if (in the case of a two-way interaction) neither main effect +is in the selected model (relative influence is zero), the result will be +0/0. Also, with weak main effects, rounding errors can result in values of H +> 1 which are not possible. } \references{ -J.H. Friedman and B.E. Popescu (2005). \dQuote{Predictive Learning via Rule -Ensembles.} Section 8.1 +J.H. Friedman and B.E. Popescu (2005). \dQuote{Predictive +Learning via Rule Ensembles.} Section 8.1 } -\author{Greg Ridgeway \email{gregridgeway@gmail.com}} - -\seealso{ \code{\link{gbm}}, \code{\link{gbm.object}} } - -\keyword{ methods } +\seealso{ +\code{\link{gbm}}, \code{\link{gbm.object}} +} +\author{ +Greg Ridgeway \email{gregridgeway@gmail.com} +} +\keyword{methods} diff --git a/man/plot.gbm.Rd b/man/plot.gbm.Rd index 83f907e..461f8ad 100644 --- a/man/plot.gbm.Rd +++ b/man/plot.gbm.Rd @@ -1,60 +1,94 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot.gbm.R \name{plot.gbm} \alias{plot.gbm} -\title{ Marginal plots of fitted gbm objects } -\description{ -Plots the marginal effect of the selected variables by "integrating" out the other variables. -} +\title{Marginal plots of fitted gbm objects} \usage{ -\method{plot}{gbm}(x, - i.var = 1, - n.trees = x$n.trees, - continuous.resolution = 100, - return.grid = FALSE, - type = "link", - ...) +\method{plot}{gbm}(x, i.var = 1, n.trees = x$n.trees, + continuous.resolution = 100, return.grid = FALSE, type = c("link", + "response"), level.plot = TRUE, contour = FALSE, number = 4, + overlap = 0.1, col.regions = viridis::viridis, ...) } \arguments{ - \item{x}{ a \code{\link{gbm.object}} fitted using a call to \code{\link{gbm}}} - \item{i.var}{a vector of indices or the names of the variables to plot. If - using indices, the variables are indexed in the same order that they appear - in the initial \code{gbm} formula. - If \code{length(i.var)} is between 1 and 3 then \code{plot.gbm} produces the plots. Otherwise, - \code{plot.gbm} returns only the grid of evaluation points and their average predictions} - \item{n.trees}{ the number of trees used to generate the plot. 
Only the first - \code{n.trees} trees will be used} - \item{continuous.resolution}{ The number of equally space points at which to - evaluate continuous predictors } - \item{return.grid}{ if \code{TRUE} then \code{plot.gbm} produces no graphics and only returns - the grid of evaluation points and their average predictions. This is useful for - customizing the graphics for special variable types or for dimensions greater - than 3 } - \item{type}{ the type of prediction to plot on the vertical axis. See - \code{predict.gbm}} - \item{\dots}{ other arguments passed to the plot function } +\item{x}{A \code{\link{gbm.object}} that was fit using a call to +\code{\link{gbm}}.} + +\item{i.var}{Vector of indices or the names of the variables to plot. If +using indices, the variables are indexed in the same order that they appear +in the initial \code{gbm} formula. If \code{length(i.var)} is between 1 and +3 then \code{plot.gbm} produces the plots. Otherwise, \code{plot.gbm} +returns only the grid of evaluation points and their average predictions} + +\item{n.trees}{Integer specifying the number of trees to use to generate the +plot. Default is to use \code{x$n.trees} (i.e., the entire ensemble).} + +\item{continuous.resolution}{Integer specifying the number of equally space +points at which to evaluate continuous predictors.} + +\item{return.grid}{Logical indicating whether or not to produce graphics +\code{FALSE} or only return the grid of evaluation points and their average +predictions \code{TRUE}. This is useful for customizing the graphics for +special variable types, or for higher dimensional graphs.} + +\item{type}{Character string specifying the type of prediction to plot on the +vertical axis. See \code{\link{predict.gbm}} for details.} + +\item{level.plot}{Logical indicating whether or not to use a false color +level plot (\code{TRUE}) or a 3-D surface (\code{FALSE}). Default is +\code{TRUE}.} + +\item{contour}{Logical indicating whether or not to add contour lines to the +level plot. Only used when \code{level.plot = TRUE}. Default is \code{FALSE}.} + +\item{number}{Integer specifying the number of conditional intervals to use +for the continuous panel variables. See \code{\link[graphics]{co.intervals}} +and \code{\link[lattice]{equal.count}} for further details.} + +\item{overlap}{The fraction of overlap of the conditioning variables. See +\code{\link[graphics]{co.intervals}} and \code{\link[lattice]{equal.count}} +for further details.} + +\item{col.regions}{Color vector to be used if \code{level.plot} is +\code{TRUE}. Defaults to the wonderful Matplotlib 'viridis' color map +provided by the \code{viridis} package. See \code{\link[viridis]{viridis}} +for details.} + +\item{...}{Additional optional arguments to be passed onto +\code{\link[graphics]{plot}}.} +} +\value{ +If \code{return.grid = TRUE}, a grid of evaluation points and their +average predictions. Otherwise, a plot is returned. +} +\description{ +Plots the marginal effect of the selected variables by "integrating" out the +other variables. } \details{ \code{plot.gbm} produces low dimensional projections of the -\code{\link{gbm.object}} by integrating out the variables not included in the -\code{i.var} argument. The function selects a grid of points and uses the -weighted tree traversal method described in Friedman (2001) to do the +\code{\link{gbm.object}} by integrating out the variables not included in +the \code{i.var} argument. 
The function selects a grid of points and uses +the weighted tree traversal method described in Friedman (2001) to do the integration. Based on the variable types included in the projection, \code{plot.gbm} selects an appropriate display choosing amongst line plots, -contour plots, and \code{\link[lattice]{lattice}} plots. If the default graphics -are not sufficient the user may set \code{return.grid=TRUE}, store the result -of the function, and develop another graphic display more appropriate to the -particular example. +contour plots, and \code{\link[lattice]{lattice}} plots. If the default +graphics are not sufficient the user may set \code{return.grid=TRUE}, store +the result of the function, and develop another graphic display more +appropriate to the particular example. } -\value{ -Nothing unless \code{return.grid} is true then \code{plot.gbm} produces no -graphics and only returns the grid of evaluation points and their average -predictions. +\note{ +More flexible plotting is available using the +\code{\link[pdp]{partial}} and \code{\link[pdp]{plotPartial}} functions. } \references{ -J.H. Friedman (2001). "Greedy Function Approximation: A Gradient Boosting -Machine," Annals of Statistics 29(4). +J. H. Friedman (2001). "Greedy Function Approximation: A Gradient +Boosting Machine," Annals of Statistics 29(4). + +B. M. Greenwell (2017). "pdp: An R Package for Constructing +Partial Dependence Plots," The R Journal 9(1), 421--436. +\url{https://journal.r-project.org/archive/2017/RJ-2017-016/index.html}. } -\author{Greg Ridgeway \email{gregridgeway@gmail.com}} - -\seealso{ \code{\link{gbm}}, \code{\link{gbm.object}}, \code{\link[graphics]{plot}} } - -\keyword{ hplot } +\seealso{ +\code{\link[pdp]{partial}}, \code{\link[pdp]{plotPartial}}, +\code{\link{gbm}}, and \code{\link{gbm.object}}. +} diff --git a/man/predict.gbm.Rd b/man/predict.gbm.Rd index 1d88f11..01a80a5 100644 --- a/man/predict.gbm.Rd +++ b/man/predict.gbm.Rd @@ -1,47 +1,61 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/predict.gbm.R \name{predict.gbm} \alias{predict.gbm} -\title{ Predict method for GBM Model Fits } -\description{ - Predicted values based on a generalized boosted model object -} +\title{Predict method for GBM Model Fits} \usage{ -\method{predict}{gbm}(object, - newdata, - n.trees, - type="link", - single.tree=FALSE, - ...) +\method{predict}{gbm}(object, newdata, n.trees, type = "link", + single.tree = FALSE, ...) } \arguments{ - \item{object}{ Object of class inheriting from (\code{\link{gbm.object}}) } - \item{newdata}{ Data frame of observations for which to make predictions } - \item{n.trees}{ Number of trees used in the prediction. \code{n.trees} may - be a vector in which case predictions are returned for each - iteration specified} - \item{type}{ The scale on which gbm makes the predictions } - \item{single.tree}{If \code{single.tree=TRUE} then \code{predict.gbm} returns - only the predictions from tree(s) \code{n.trees}} - \item{\dots}{ further arguments passed to or from other methods } +\item{object}{Object of class inheriting from (\code{\link{gbm.object}})} + +\item{newdata}{Data frame of observations for which to make predictions} + +\item{n.trees}{Number of trees used in the prediction. 
\code{n.trees} may be +a vector in which case predictions are returned for each iteration specified} + +\item{type}{The scale on which gbm makes the predictions} + +\item{single.tree}{If \code{single.tree=TRUE} then \code{predict.gbm} +returns only the predictions from tree(s) \code{n.trees}} + +\item{\dots}{further arguments passed to or from other methods} +} +\value{ +Returns a vector of predictions. By default the predictions are on +the scale of f(x). For example, for the Bernoulli loss the returned value is +on the log odds scale, poisson loss on the log scale, and coxph is on the +log hazard scale. + +If \code{type="response"} then \code{gbm} converts back to the same scale as +the outcome. Currently the only effect this will have is returning +probabilities for bernoulli and expected counts for poisson. For the other +distributions "response" and "link" return the same. +} +\description{ +Predicted values based on a generalized boosted model object } \details{ -\code{predict.gbm} produces predicted values for each observation in \code{newdata} using the the first \code{n.trees} iterations of the boosting sequence. If \code{n.trees} is a vector than the result is a matrix with each column representing the predictions from gbm models with \code{n.trees[1]} iterations, \code{n.trees[2]} iterations, and so on. +\code{predict.gbm} produces predicted values for each observation in +\code{newdata} using the the first \code{n.trees} iterations of the boosting +sequence. If \code{n.trees} is a vector than the result is a matrix with +each column representing the predictions from gbm models with +\code{n.trees[1]} iterations, \code{n.trees[2]} iterations, and so on. -The predictions from \code{gbm} do not include the offset term. The user may add the value of the offset to the predicted value if desired. +The predictions from \code{gbm} do not include the offset term. The user may +add the value of the offset to the predicted value if desired. If \code{object} was fit using \code{\link{gbm.fit}} there will be no -\code{Terms} component. Therefore, the user has greater responsibility to make -sure that \code{newdata} is of the same format (order and number of variables) -as the one originally used to fit the model. +\code{Terms} component. Therefore, the user has greater responsibility to +make sure that \code{newdata} is of the same format (order and number of +variables) as the one originally used to fit the model. } -\value{ -Returns a vector of predictions. By default the predictions are on the scale of f(x). For example, for the Bernoulli loss the returned value is on the log odds scale, poisson loss on the log scale, and coxph is on the log hazard scale. - -If \code{type="response"} then \code{gbm} converts back to the same scale as the outcome. Currently the only effect this will have is returning probabilities for bernoulli and expected counts for poisson. For the other distributions "response" and "link" return the same. 
-} -\author{Greg Ridgeway \email{gregridgeway@gmail.com}} \seealso{ \code{\link{gbm}}, \code{\link{gbm.object}} } - -\keyword{ models } -\keyword{ regression } +\author{ +Greg Ridgeway \email{gregridgeway@gmail.com} +} +\keyword{models} +\keyword{regression} diff --git a/man/pretty.gbm.tree.Rd b/man/pretty.gbm.tree.Rd index 24c93c1..9fdac4d 100644 --- a/man/pretty.gbm.tree.Rd +++ b/man/pretty.gbm.tree.Rd @@ -1,38 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pretty.gbm.tree.R \name{pretty.gbm.tree} \alias{pretty.gbm.tree} -\title{ Print gbm tree components } -\description{ -\code{gbm} stores the collection of trees used to construct the model in a -compact matrix structure. This function extracts the information from a single -tree and displays it in a slightly more readable form. This function is mostly -for debugging purposes and to satisfy some users' curiosity. -} +\title{Print gbm tree components} \usage{ -pretty.gbm.tree(object, i.tree = 1) +\method{pretty}{gbm.tree}(object, i.tree = 1) } \arguments{ - \item{object}{ a \code{\link{gbm.object}} initially fit using \code{\link{gbm}}} - \item{i.tree}{ the index of the tree component to extract from \code{object} - and display } +\item{object}{a \code{\link{gbm.object}} initially fit using +\code{\link{gbm}}} + +\item{i.tree}{the index of the tree component to extract from \code{object} +and display} } \value{ -\code{pretty.gbm.tree} returns a data frame. Each row corresponds to a node in -the tree. Columns indicate -\item{SplitVar}{index of which variable is used to split. -1 indicates a -terminal node.} -\item{SplitCodePred}{if the split variable is continuous then this component -is the split point. If the split variable is categorical then this component -contains the index of \code{object$c.split} that describes the categorical -split. If the node is a terminal node then this is the prediction.} -\item{LeftNode}{the index of the row corresponding to the left node.} -\item{RightNode}{the index of the row corresponding to the right node.} -\item{ErrorReduction}{the reduction in the loss function as a result of -splitting this node.} -\item{Weight}{the total weight of observations in the node. If weights are all -equal to 1 then this is the number of observations in the node.} +\code{pretty.gbm.tree} returns a data frame. Each row corresponds to +a node in the tree. Columns indicate \item{SplitVar}{index of which variable +is used to split. -1 indicates a terminal node.} \item{SplitCodePred}{if the +split variable is continuous then this component is the split point. If the +split variable is categorical then this component contains the index of +\code{object$c.split} that describes the categorical split. If the node is a +terminal node then this is the prediction.} \item{LeftNode}{the index of the +row corresponding to the left node.} \item{RightNode}{the index of the row +corresponding to the right node.} \item{ErrorReduction}{the reduction in the +loss function as a result of splitting this node.} \item{Weight}{the total +weight of observations in the node. If weights are all equal to 1 then this +is the number of observations in the node.} } -\author{Greg Ridgeway \email{gregridgeway@gmail.com}} +\description{ +\code{gbm} stores the collection of trees used to construct the model in a +compact matrix structure. This function extracts the information from a +single tree and displays it in a slightly more readable form. 
This function +is mostly for debugging purposes and to satisfy some users' curiosity. +} \seealso{ \code{\link{gbm}}, \code{\link{gbm.object}} } -\keyword{ print } +\author{ +Greg Ridgeway \email{gregridgeway@gmail.com} +} +\keyword{print} diff --git a/man/print.gbm.Rd b/man/print.gbm.Rd index 934edef..c825f47 100644 --- a/man/print.gbm.Rd +++ b/man/print.gbm.Rd @@ -1,55 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/print.gbm.R \name{print.gbm} \alias{print.gbm} \alias{show.gbm} - -\title{ Print model summary } -\description{ - Display basic information about a \code{gbm} object. -} +\title{Print model summary} \usage{ \method{print}{gbm}(x, ...) -\method{show}{gbm}(x, ...) + +show.gbm(x, ...) } \arguments{ - \item{x}{ an object of class \code{gbm}. } - \item{\dots}{ arguments passed to \code{print.default}. } +\item{x}{an object of class \code{gbm}.} + +\item{\dots}{arguments passed to \code{print.default}.} +} +\description{ +Display basic information about a \code{gbm} object. } \details{ - Prints some information about the model object. In particular, this method - prints the call to \code{gbm()}, the type of loss function - that was used, and the total number of iterations. +Prints some information about the model object. In particular, this method +prints the call to \code{gbm()}, the type of loss function that was used, +and the total number of iterations. - If cross-validation was performed, the 'best' number of trees as - estimated by cross-validation error is displayed. If a test set - was used, the 'best' number - of trees as estimated by the test set error is displayed. +If cross-validation was performed, the 'best' number of trees as estimated +by cross-validation error is displayed. If a test set was used, the 'best' +number of trees as estimated by the test set error is displayed. - The number of available predictors, and the number of those having - non-zero influence on predictions is given (which might be interesting - in data mining applications). +The number of available predictors, and the number of those having non-zero +influence on predictions is given (which might be interesting in data mining +applications). - If multinomial, bernoulli or adaboost was used, - the confusion matrix and prediction accuracy are printed (objects - being allocated to the class with highest probability for multinomial - and bernoulli). These classifications are performed on the entire - training - data using the model with the 'best' number of trees as described - above, or the maximum number of trees if the 'best' cannot be - computed. +If multinomial, bernoulli or adaboost was used, the confusion matrix and +prediction accuracy are printed (objects being allocated to the class with +highest probability for multinomial and bernoulli). These classifications +are performed on the entire training data using the model with the 'best' +number of trees as described above, or the maximum number of trees if the +'best' cannot be computed. - If the 'distribution' was specified as gaussian, laplace, quantile - or t-distribution, a summary of the residuals is displayed. - The residuals are for the training data with the model at the 'best' - number of trees, as - described above, or the maximum number of trees if the 'best' cannot - be computed. +If the 'distribution' was specified as gaussian, laplace, quantile or +t-distribution, a summary of the residuals is displayed. 
The residuals are +for the training data with the model at the 'best' number of trees, as +described above, or the maximum number of trees if the 'best' cannot be +computed. } +\examples{ - -\author{ Harry Southworth, Daniel Edwards } - -\seealso{ \code{\link{gbm}} } -\examples{ data(iris) iris.mod <- gbm(Species ~ ., distribution="multinomial", data=iris, n.trees=2000, shrinkage=0.01, cv.folds=5, @@ -60,7 +55,13 @@ # n.trees=2000, shrinkage=0.01, cv.folds=5,verbose =FALSE) #lung.mod } +\seealso{ +\code{\link{gbm}} +} +\author{ +Harry Southworth, Daniel Edwards +} \keyword{models} \keyword{nonlinear} +\keyword{nonparametric} \keyword{survival} -\keyword{nonparametric} diff --git a/man/quantile.rug.Rd b/man/quantile.rug.Rd index 84f7dea..de06627 100644 --- a/man/quantile.rug.Rd +++ b/man/quantile.rug.Rd @@ -1,27 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/calibrate.plot.R \name{quantile.rug} \alias{quantile.rug} \title{Quantile rug plot} -\description{Marks the quantiles on the axes of the current plot.} \usage{ -quantile.rug(x,prob=(0:10)/10,...) +\method{quantile}{rug}(x, prob = 0:10/10, ...) } \arguments{ -\item{x}{a numeric vector.} -\item{prob}{the quantiles of x to mark on the x-axis.} -\item{...}{additional graphics parameters currently ignored.} +\item{x}{A numeric vector.} + +\item{prob}{The quantiles of x to mark on the x-axis.} + +\item{...}{Additional optional arguments to be passed onto +\code{\link[graphics]{rug}}} } -\value{No return values} -\author{Greg Ridgeway \email{gregridgeway@gmail.com}} -\seealso{ -\code{\link[graphics]{plot}}, -\code{\link[stats]{quantile}}, -\code{\link[base]{jitter}}, -\code{\link[graphics]{rug}}. +\value{ +No return values. +} +\description{ +Marks the quantiles on the axes of the current plot. } \examples{ x <- rnorm(100) y <- rnorm(100) -plot(x,y) +plot(x, y) quantile.rug(x) } +\seealso{ +\code{\link[graphics]{plot}}, \code{\link[stats]{quantile}}, +\code{\link[base]{jitter}}, \code{\link[graphics]{rug}}. +} +\author{ +Greg Ridgeway \email{gregridgeway@gmail.com}. +} \keyword{aplot} diff --git a/man/reconstructGBMdata.Rd b/man/reconstructGBMdata.Rd old file mode 100755 new file mode 100644 index 496d732..557185c 100644 --- a/man/reconstructGBMdata.Rd +++ b/man/reconstructGBMdata.Rd @@ -1,31 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/reconstructGBMdata.R \name{reconstructGBMdata} -\Rdversion{1.1} \alias{reconstructGBMdata} \title{Reconstruct a GBM's Source Data} - - +\usage{ +reconstructGBMdata(x) +} +\arguments{ +\item{x}{a \code{\link{gbm.object}} initially fit using \code{\link{gbm}}} +} +\value{ +Returns a data used to fit the gbm in a format that can subsequently +be used for plots and summaries +} \description{ Helper function to reconstitute the data for plots and summaries. This function is not intended for the user to call directly. 
} -\usage{ -reconstructGBMdata(x) -} -\arguments{ - \item{x}{ -a \code{\link{gbm.object}} initially fit using \code{\link{gbm}} -} -} -\value{ -Returns a data used to fit the gbm in a format that can subsequently be used -for plots and summaries +\seealso{ +\code{\link{gbm}}, \code{\link{gbm.object}} } \author{ Harry Southworth } - -\seealso{ -\code{\link{gbm}}, \code{\link{gbm.object}} -} - -\keyword{ manip } +\keyword{manip} diff --git a/man/relative.influence.Rd b/man/relative.influence.Rd index 0c1d651..1a20052 100644 --- a/man/relative.influence.Rd +++ b/man/relative.influence.Rd @@ -1,50 +1,66 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/relative.influence.R \name{relative.influence} \alias{relative.influence} \alias{permutation.test.gbm} \alias{gbm.loss} -\title{ Methods for estimating relative influence } -\description{ -Helper functions for computing the relative influence of each variable in the gbm object. -} +\title{Methods for estimating relative influence} \usage{ -relative.influence(object, n.trees, scale., sort.) +relative.influence(object, n.trees, scale. = FALSE, sort. = FALSE) + permutation.test.gbm(object, n.trees) -gbm.loss(y,f,w,offset,dist,baseline, group, max.rank) + +gbm.loss(y, f, w, offset, dist, baseline, group = NULL, + max.rank = NULL) } \arguments{ -\item{object}{a \code{gbm} object created from an initial call to \code{\link{gbm}}.} -\item{n.trees}{ the number of trees to use for computations. If not provided, the - the function will guess: if a test set was used in fitting, the number of - trees resulting in lowest test set error will be used; otherwise, if - cross-validation was performed, the number of trees resulting in lowest - cross-validation error will be used; otherwise, all trees will be used.} -\item{scale.}{ whether or not the result should be scaled. Defaults to \code{FALSE}.} -\item{sort.}{ whether or not the results should be (reverse) sorted. - Defaults to \code{FALSE}.} -\item{y,f,w,offset,dist,baseline}{For \code{gbm.loss}: These components are the -outcome, predicted value, observation weight, offset, distribution, and comparison -loss function, respectively.} -\item{group, max.rank}{Used internally when \code{distribution = \'pairwise\'}.} +\item{object}{a \code{gbm} object created from an initial call to +\code{\link{gbm}}.} + +\item{n.trees}{the number of trees to use for computations. If not provided, +the the function will guess: if a test set was used in fitting, the number +of trees resulting in lowest test set error will be used; otherwise, if +cross-validation was performed, the number of trees resulting in lowest +cross-validation error will be used; otherwise, all trees will be used.} + +\item{scale.}{whether or not the result should be scaled. Defaults to +\code{FALSE}.} + +\item{sort.}{whether or not the results should be (reverse) sorted. +Defaults to \code{FALSE}.} + +\item{y, f, w, offset, dist, baseline}{For \code{gbm.loss}: These components are +the outcome, predicted value, observation weight, offset, distribution, and +comparison loss function, respectively.} + +\item{group, max.rank}{Used internally when \code{distribution = +\'pairwise\'}.} +} +\value{ +By default, returns an unprocessed vector of estimated relative +influences. If the \code{scale.} and \code{sort.} arguments are used, +returns a processed version of the same. +} +\description{ +Helper functions for computing the relative influence of each variable in +the gbm object. 
} \details{ This is not intended for end-user use. These functions offer the different methods for computing the relative influence in \code{\link{summary.gbm}}. \code{gbm.loss} is a helper function for \code{permutation.test.gbm}. } -\value{ -By default, returns an unprocessed vector of estimated relative influences. -If the \code{scale.} and \code{sort.} arguments are used, returns a processed -version of the same. +\references{ +J.H. Friedman (2001). "Greedy Function Approximation: A Gradient +Boosting Machine," Annals of Statistics 29(5):1189-1232. + +L. Breiman (2001). +\url{https://www.stat.berkeley.edu/users/breiman/randomforest2001.pdf}. } -\references{ -J.H. Friedman (2001). "Greedy Function Approximation: A Gradient Boosting -Machine," Annals of Statistics 29(5):1189-1232. - -L. Breiman (2001). \href{https://www.stat.berkeley.edu/~breiman/randomforest2001.pdf}{Random Forests}. - +\seealso{ +\code{\link{summary.gbm}} } -\author{Greg Ridgeway \email{gregridgeway@gmail.com}} - -\seealso{ \code{\link{summary.gbm}} } - -\keyword{ hplot } +\author{ +Greg Ridgeway \email{gregridgeway@gmail.com} +} +\keyword{hplot} diff --git a/man/shrink.gbm.Rd b/man/shrink.gbm.Rd index 1f6eed8..8d6717e 100644 --- a/man/shrink.gbm.Rd +++ b/man/shrink.gbm.Rd @@ -1,35 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/shrink.gbm.R \name{shrink.gbm} \alias{shrink.gbm} -\title{ L1 shrinkage of the predictor variables in a GBM } -\description{ -Performs recursive shrinkage in each of the trees in a GBM fit using different shrinkage parameters for each variable. -} +\title{L1 shrinkage of the predictor variables in a GBM} \usage{ -shrink.gbm(object, - n.trees, - lambda = rep(10, length(object$var.names)), - ...) +shrink.gbm(object, n.trees, lambda = rep(10, length(object$var.names)), + ...) } \arguments{ - \item{object}{ A \code{\link{gbm.object}} } - \item{n.trees}{ the number of trees to use } - \item{lambda}{ a vector with length equal to the number of variables containing the shrinkage parameter for each variable } - \item{\dots}{ other parameters (ignored) } +\item{object}{A \code{\link{gbm.object}}.} + +\item{n.trees}{Integer specifying the number of trees to use.} + +\item{lambda}{Vector of length equal to the number of variables containing +the shrinkage parameter for each variable.} + +\item{\dots}{Additional optional arguments. (Currently ignored.)} +} +\value{ +\item{predF}{Predicted values from the shrunken tree} +\item{objective}{The value of the loss function associated with the +predicted values} \item{gradient}{A vector with length equal to the number +of variables containing the derivative of the objective function with +respect to beta, the logit transform of the shrinkage parameter for each +variable} +} +\description{ +Performs recursive shrinkage in each of the trees in a GBM fit using +different shrinkage parameters for each variable. } \details{ -This function is currently experimental. Used in conjunction with a gradient ascent search for inclusion of variables. +This function is currently experimental. Used in conjunction with a gradient +ascent search for inclusion of variables. 
} -\value{ - \item{predF}{Predicted values from the shrunken tree} - \item{objective}{The value of the loss function associated with the predicted values} - \item{gradient}{A vector with length equal to the number of variables containing the derivative of the objective function with respect to beta, the logit transform of the shrinkage parameter for each variable} +\note{ +Warning: This function is experimental. } -\references{ Hastie, T. J., and Pregibon, D. \href{http://www-stat.stanford.edu/~hastie/Papers/shrinktree.ps}{Shrinking Trees}. AT&T Bell Laboratories Technical Report (March 1990).} -\author{ Greg Ridgeway \email{gregridgeway@gmail.com} } - - -\section{Warning}{This function is experimental.} - -\seealso{ \code{\link{shrink.gbm.pred}}, \code{\link{gbm}} } -\keyword{ methods}% at least one, from doc/KEYWORDS - +\references{ +Hastie, T. J., and Pregibon, D. +\url{https://web.stanford.edu/~hastie/Papers/shrink_tree.pdf}. AT&T Bell +Laboratories Technical Report (March 1990). +} +\seealso{ +\code{\link{shrink.gbm.pred}}, \code{\link{gbm}} +} +\author{ +Greg Ridgeway \email{gregridgeway@gmail.com} +} +\keyword{methods} diff --git a/man/shrink.gbm.pred.Rd b/man/shrink.gbm.pred.Rd index 5b76dde..dafc76d 100644 --- a/man/shrink.gbm.pred.Rd +++ b/man/shrink.gbm.pred.Rd @@ -1,34 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/shrink.gbm.pred.R \name{shrink.gbm.pred} \alias{shrink.gbm.pred} -%- Also NEED an '\alias' for EACH other topic documented here. -\title{ Predictions from a shrunked GBM } +\title{Predictions from a shrunked GBM} +\usage{ +shrink.gbm.pred(object, newdata, n.trees, lambda = rep(1, + length(object$var.names)), ...) +} +\arguments{ +\item{object}{a \code{\link{gbm.object}}} + +\item{newdata}{dataset for predictions} + +\item{n.trees}{the number of trees to use} + +\item{lambda}{a vector with length equal to the number of variables +containing the shrinkage parameter for each variable} + +\item{\dots}{other parameters (ignored)} +} +\value{ +A vector with length equal to the number of observations in newdata +containing the predictions +} \description{ - Makes predictions from a shrunken GBM model. +Makes predictions from a shrunken GBM model. } -\usage{ -shrink.gbm.pred(object, - newdata, - n.trees, - lambda = rep(1, length(object$var.names)), - ...) 
+\section{Warning}{ + This function is experimental } -\arguments{ - \item{object}{ a \code{\link{gbm.object}} } - \item{newdata}{ dataset for predictions } - \item{n.trees}{ the number of trees to use } - \item{lambda}{ a vector with length equal to the number of variables containing the shrinkage parameter for each variable } - \item{\dots}{ other parameters (ignored) } +\seealso{ +\code{\link{shrink.gbm}}, \code{\link{gbm}} } - -\value{ -A vector with length equal to the number of observations in newdata containing the predictions +\author{ +Greg Ridgeway \email{gregridgeway@gmail.com} } - -\author{ Greg Ridgeway \email{gregridgeway@gmail.com} } - -\section{Warning}{This function is experimental} - -\seealso{ \code{\link{shrink.gbm}}, \code{\link{gbm}} } - -\keyword{ methods } +\keyword{methods} diff --git a/man/summary.gbm.Rd b/man/summary.gbm.Rd index 731fb8e..da60004 100644 --- a/man/summary.gbm.Rd +++ b/man/summary.gbm.Rd @@ -1,64 +1,71 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/print.gbm.R \name{summary.gbm} \alias{summary.gbm} -\title{ Summary of a gbm object } -\description{ -Computes the relative influence of each variable in the gbm object. -} +\title{Summary of a gbm object} \usage{ -\method{summary}{gbm}(object, - cBars=length(object$var.names), - n.trees=object$n.trees, - plotit=TRUE, - order=TRUE, - method=relative.influence, - normalize=TRUE, - ...) +\method{summary}{gbm}(object, cBars = length(object$var.names), + n.trees = object$n.trees, plotit = TRUE, order = TRUE, + method = relative.influence, normalize = TRUE, ...) } \arguments{ \item{object}{a \code{gbm} object created from an initial call to \code{\link{gbm}}.} -\item{cBars}{ the number of bars to plot. If \code{order=TRUE} the only the -variables with the \code{cBars} largest relative influence will appear in the -barplot. If \code{order=FALSE} then the first \code{cBars} variables will -appear in the plot. In either case, the function will return the relative -influence of all of the variables.} -\item{n.trees}{ the number of trees used to generate the plot. Only the first + +\item{cBars}{the number of bars to plot. If \code{order=TRUE} the only the +variables with the \code{cBars} largest relative influence will appear in +the barplot. If \code{order=FALSE} then the first \code{cBars} variables +will appear in the plot. In either case, the function will return the +relative influence of all of the variables.} + +\item{n.trees}{the number of trees used to generate the plot. Only the first \code{n.trees} trees will be used.} -\item{plotit}{ an indicator as to whether the plot is generated. } -\item{order}{ an indicator as to whether the plotted and/or returned relative -influences are sorted. } -\item{method}{ The function used to compute the relative influence. + +\item{plotit}{an indicator as to whether the plot is generated.} + +\item{order}{an indicator as to whether the plotted and/or returned relative +influences are sorted.} + +\item{method}{The function used to compute the relative influence. \code{\link{relative.influence}} is the default and is the same as that described in Friedman (2001). The other current (and experimental) choice is -\code{\link{permutation.test.gbm}}. This method randomly permutes each predictor -variable at a time and computes the associated reduction in predictive -performance. 
This is similar to the variable importance measures Breiman uses -for random forests, but \code{gbm} currently computes using the entire training -dataset (not the out-of-bag observations).} -\item{normalize}{ if \code{FALSE} then \code{summary.gbm} returns the -unnormalized influence. } -\item{...}{ other arguments passed to the plot function. } +\code{\link{permutation.test.gbm}}. This method randomly permutes each +predictor variable at a time and computes the associated reduction in +predictive performance. This is similar to the variable importance measures +Breiman uses for random forests, but \code{gbm} currently computes using the +entire training dataset (not the out-of-bag observations).} + +\item{normalize}{if \code{FALSE} then \code{summary.gbm} returns the +unnormalized influence.} + +\item{...}{other arguments passed to the plot function.} +} +\value{ +Returns a data frame where the first component is the variable name +and the second is the computed relative influence, normalized to sum to 100. +} +\description{ +Computes the relative influence of each variable in the gbm object. } \details{ -For \code{distribution="gaussian"} this returns exactly the reduction -of squared error attributable to each variable. For other loss functions this -returns the reduction attributeable to each varaible in sum of squared error in -predicting the gradient on each iteration. It describes the relative influence -of each variable in reducing the loss function. See the references below for -exact details on the computation. -} -\value{ -Returns a data frame where the first component is the variable name and the -second is the computed relative influence, normalized to sum to 100. +For \code{distribution="gaussian"} this returns exactly the reduction of +squared error attributable to each variable. For other loss functions this +returns the reduction attributable to each variable in sum of squared error +in predicting the gradient on each iteration. It describes the relative +influence of each variable in reducing the loss function. See the references +below for exact details on the computation. } \references{ -J.H. Friedman (2001). "Greedy Function Approximation: A Gradient Boosting -Machine," Annals of Statistics 29(5):1189-1232. +J.H. Friedman (2001). "Greedy Function Approximation: A Gradient +Boosting Machine," Annals of Statistics 29(5):1189-1232. -L. Breiman (2001).\href{https://www.stat.berkeley.edu/~breiman/randomforest2001.pdf}{Random Forests}. +L. Breiman +(2001).\url{https://www.stat.berkeley.edu/users/breiman/randomforest2001.pdf}. } -\author{Greg Ridgeway \email{gregridgeway@gmail.com}} - -\seealso{ \code{\link{gbm}} } - -\keyword{ hplot } +\seealso{ +\code{\link{gbm}} +} +\author{ +Greg Ridgeway \email{gregridgeway@gmail.com} +} +\keyword{hplot} diff --git a/man/test.gbm.Rd b/man/test.gbm.Rd new file mode 100644 index 0000000..809c45f --- /dev/null +++ b/man/test.gbm.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/test.gbm.R +\name{test.gbm} +\alias{test.gbm} +\alias{validate.gbm} +\alias{test.relative.influence} +\title{Test the \code{gbm} package.} +\usage{ +test.gbm() +} +\value{ +An object of class \code{RUnitTestData}. See the help for +\code{RUnit} for details. +} +\description{ +Run tests on \code{gbm} functions to perform logical checks and +reproducibility. +} +\details{ +The function uses functionality in the \code{RUnit} package. 
A fairly small +validation suite is executed that checks to see that relative influence +identifies sensible variables from simulated data, and that predictions from +GBMs with Gaussian, Cox or binomial distributions are sensible, +} +\note{ +The test suite is not comprehensive. +} +\examples{ + +# Uncomment the following lines to run - commented out to make CRAN happy +#library(RUnit) +#val <- validate.texmex() +#printHTMLProtocol(val, "texmexReport.html") +} +\seealso{ +\code{\link{gbm}} +} +\author{ +Harry Southworth +} +\keyword{models} diff --git a/man/validate.Rd b/man/validate.Rd deleted file mode 100644 index aa50ffa..0000000 --- a/man/validate.Rd +++ /dev/null @@ -1,43 +0,0 @@ -\name{validate.gbm} -\alias{validate.gbm} -\alias{test.gbm} -\alias{test.relative.influence} -%- Also NEED an '\alias' for EACH other topic documented here. -\title{Test the \code{gbm} package.} -\description{ -Run tests on \code{gbm} functions to perform logical checks and -reproducibility. -} -\usage{ -validate.gbm() -} -%- maybe also 'usage' for other objects documented here. -\details{ -The function uses functionality in the \code{RUnit} package. -A fairly small validation suite is executed that checks to see that -relative influence identifies sensible variables from simulated data, -and that predictions from GBMs with Gaussian, Cox or binomial distributions -are sensible, -} -\value{ -An object of class \code{RUnitTestData}. See the help for \code{RUnit} for -details. -} -\author{ -Harry Southworth -} -\note{The test suite is not comprehensive.} - -%% ~Make other sections like Warning with \section{Warning }{....} ~ - -\seealso{ -\code{\link{gbm}} -} -\examples{ -# Uncomment the following lines to run - commented out to make CRAN happy -#library(RUnit) -#val <- validate.texmex() -#printHTMLProtocol(val, "texmexReport.html") -} -\keyword{models} - diff --git a/src/gbm-init.c b/src/gbm-init.c new file mode 100644 index 0000000..d576324 --- /dev/null +++ b/src/gbm-init.c @@ -0,0 +1,30 @@ +#include +#include +#include // for NULL +#include + +/* FIXME: + Check these declarations against the C/Fortran source code. 
+ */ + +/* .Call calls */ +extern SEXP gbm_fit(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP gbm_plot(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP gbm_pred(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP gbm_shrink_gradient(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP gbm_shrink_pred(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); + +static const R_CallMethodDef CallEntries[] = { + {"gbm_fit", (DL_FUNC) &gbm_fit, 22}, + {"gbm_plot", (DL_FUNC) &gbm_plot, 10}, + {"gbm_pred", (DL_FUNC) &gbm_pred, 10}, + {"gbm_shrink_gradient", (DL_FUNC) &gbm_shrink_gradient, 11}, + {"gbm_shrink_pred", (DL_FUNC) &gbm_shrink_pred, 10}, + {NULL, NULL, 0} +}; + +void R_init_gbm(DllInfo *dll) +{ + R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); + R_useDynamicSymbols(dll, FALSE); +} diff --git a/src/gbmentry.cpp b/src/gbmentry.cpp index 14d08c8..759a958 100644 --- a/src/gbmentry.cpp +++ b/src/gbmentry.cpp @@ -7,7 +7,7 @@ #include #include -SEXP gbm +SEXP gbm_fit ( SEXP radY, // outcome or response SEXP radOffset, // offset for f(x), NA for no offset diff --git a/src/init.c b/src/init.c deleted file mode 100644 index c6a86d6..0000000 --- a/src/init.c +++ /dev/null @@ -1,26 +0,0 @@ -#include -#include -#include // for NULL -#include - -/* .Call calls */ -extern SEXP gbm(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -extern SEXP gbm_plot(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -extern SEXP gbm_pred(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -extern SEXP gbm_shrink_gradient(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -extern SEXP gbm_shrink_pred(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); - -static const R_CallMethodDef CallEntries[] = { - {"gbm", (DL_FUNC) &gbm, 22}, - {"gbm_plot", (DL_FUNC) &gbm_plot, 10}, - {"gbm_pred", (DL_FUNC) &gbm_pred, 10}, - {"gbm_shrink_gradient", (DL_FUNC) &gbm_shrink_gradient, 11}, - {"gbm_shrink_pred", (DL_FUNC) &gbm_shrink_pred, 10}, - {NULL, NULL, 0} -}; - -void R_init_gbm(DllInfo *dll) -{ - R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); - R_useDynamicSymbols(dll, FALSE); -} diff --git a/src/locationm.cpp b/src/locationm.cpp index 4e432cb..9762235 100644 --- a/src/locationm.cpp +++ b/src/locationm.cpp @@ -12,7 +12,6 @@ #include "locationm.h" #include -#include // for fmax2 using namespace std; @@ -190,7 +189,7 @@ } double dScale0 = 1.4826 * Median(iN, adDiff, adW); - dScale0 = fmax2(dScale0, mdEps); + dScale0 = fmax(dScale0, mdEps); // Loop over until the error is low enough double dErr = 1.0; @@ -203,7 +202,7 @@ for (ii = 0; ii < iN; ii++) { double dT = fabs(adX[ii] - dBeta0) / dScale0; - dT = fmax2(dT, mdEps); + dT = fmax(dT, mdEps); double dWt = adW[ii] * PsiFun(dT) / dT; dSumWX += dWt * adX[ii]; diff --git a/vignettes/gbm.Rnw b/vignettes/gbm.Rnw new file mode 100644 index 0000000..26cfda0 --- /dev/null +++ b/vignettes/gbm.Rnw @@ -0,0 +1,373 @@ +\documentclass{article} + +\bibliographystyle{plain} + +\newcommand{\EV}{\mathrm{E}} +\newcommand{\Var}{\mathrm{Var}} +\newcommand{\aRule}{\begin{center} \rule{5in}{1mm} \end{center}} + +\title{Generalized Boosted Models:\\A guide to the gbm package} \author{Greg Ridgeway} + +%\VignetteEngine{knitr::knitr} +%\VignetteIndexEntry{Generalized Boosted Models: A guide 
to the gbm package} + +\newcommand{\mathgbf}[1]{{\mbox{\boldmath$#1$\unboldmath}}} + +\begin{document} + +\maketitle + +Boosting takes on various forms with different programs using different loss functions, different base models, and different optimization schemes. The gbm package takes the approach described in \cite{Friedman:2001} and \cite{Friedman:2002}. Some of the terminology differs, mostly due to an effort to cast boosting terms into more standard statistical terminology (e.g. deviance). In addition, the gbm package implements boosting for models commonly used in statistics but not commonly associated with boosting. The Cox proportional hazard model, for example, is an incredibly useful model and the boosting framework applies quite readily with only slight modification \cite{Ridgeway:1999}. Also some algorithms implemented in the gbm package differ from the standard implementation. The AdaBoost algorithm \cite{FreundSchapire:1997} has a particular loss function and a particular optimization algorithm associated with it. The gbm implementation of AdaBoost adopts AdaBoost's exponential loss function (its bound on misclassification rate) but uses Friedman's gradient descent algorithm rather than the original one proposed. So the main purposes of this document is to spell out in detail what the gbm package implements. + +\section{Gradient boosting} + +This section essentially presents the derivation of boosting described in \cite{Friedman:2001}. The gbm package also adopts the stochastic gradient boosting strategy, a small but important tweak on the basic algorithm, described in \cite{Friedman:2002}. + +\subsection{Friedman's gradient boosting machine} \label{sec:GradientBoostingMachine} + +\begin{figure} +\aRule Initialize $\hat f(\mathbf{x})$ to be a constant, $\hat f(\mathbf{x}) = \arg \min_{\rho} \sum_{i=1}^N \Psi(y_i,\rho)$. \\ +For $t$ in $1,\ldots,T$ do +\begin{enumerate} +\item Compute the negative gradient as the working response + \begin{equation} + z_i = -\frac{\partial}{\partial f(\mathbf{x}_i)} \Psi(y_i,f(\mathbf{x}_i)) \mbox{\Huge $|$}_{f(\mathbf{x}_i)=\hat f(\mathbf{x}_i)} + \end{equation} +\item Fit a regression model, $g(\mathbf{x})$, predicting $z_i$ from the covariates $\mathbf{x}_i$. \item Choose a gradient descent step size as + \begin{equation} + \rho = \arg \min_{\rho} \sum_{i=1}^N \Psi(y_i,\hat f(\mathbf{x}_i)+\rho g(\mathbf{x}_i)) + \end{equation} +\item Update the estimate of $f(\mathbf{x})$ as + \begin{equation} + \hat f(\mathbf{x}) \leftarrow \hat f(\mathbf{x}) + \rho g(\mathbf{x}) + \end{equation} +\end{enumerate} \aRule \caption{Friedman's Gradient Boost algorithm} \label{fig:GradientBoost} \end{figure} + +Friedman (2001) and the companion paper Friedman (2002) extended the work of Friedman, Hastie, and Tibshirani (2000) and laid the ground work for a new generation of boosting algorithms. Using the connection between boosting and optimization, this new work proposes the Gradient Boosting Machine. + +In any function estimation problem we wish to find a regression function, $\hat f(\mathbf{x})$, that minimizes the expectation of some loss function, $\Psi(y,f)$, as shown in (\ref{NonparametricRegression1}). 
+ +\begin{eqnarray} +\hspace{0.5in} +\hat f(\mathbf{x}) &=& \arg \min_{f(\mathbf{x})} \EV_{y,\mathbf{x}} \Psi(y,f(\mathbf{x})) \nonumber \\ \label{NonparametricRegression1} +&=& \arg \min_{f(\mathbf{x})} \EV_x \left[ \EV_{y|\mathbf{x}} \Psi(y,f(\mathbf{x})) \Big| \mathbf{x} \right] +\end{eqnarray} + +We will focus on finding estimates of $f(\mathbf{x})$ such that \begin{equation} +\label{NonparametricRegression2} +\hspace{0.5in} +\hat f(\mathbf{x}) = \arg \min_{f(\mathbf{x})} \EV_{y|\mathbf{x}} \left[ \Psi(y,f(\mathbf{x}))|\mathbf{x} \right] +\end{equation} +Parametric regression models assume that $f(\mathbf{x})$ is a function with a finite number of parameters, $\beta$, and estimates them by selecting those values that minimize a loss function (e.g. squared error loss) over a training sample of $N$ observations on $(y,\mathbf{x})$ pairs as in (\ref{eq:Friedman1}). +\begin{equation} +\label{eq:Friedman1} +\hspace{0.5in} +\hat\beta = \arg \min_{\beta} \sum_{i=1}^N \Psi(y_i,f(\mathbf{x}_i;\beta)) +\end{equation} +When we wish to estimate $f(\mathbf{x})$ non-parametrically the task becomes more difficult. Again we can proceed similarly to \cite{FHT:2000} and modify our current estimate of $f(\mathbf{x})$ by adding a new function $f(\mathbf{x})$ in a greedy fashion. Letting $f_i = f(\mathbf{x}_i)$, we see that we want to decrease the $N$ dimensional function +\begin{eqnarray} +\label{EQ:Friedman2} +\hspace{0.5in} +J(\mathbf{f}) &=& \sum_{i=1}^N \Psi(y_i,f(\mathbf{x}_i)) \nonumber \\ + &=& \sum_{i=1}^N \Psi(y_i,F_i). +\end{eqnarray} +The negative gradient of $J(\mathbf{f})$ indicates the direction of the locally greatest decrease in $J(\mathbf{f})$. Gradient descent would then have us modify $\mathbf{f}$ as +\begin{equation} +\label{eq:Friedman3} +\hspace{0.5in} +\hat \mathbf{f} \leftarrow \hat \mathbf{f} - \rho \nabla J(\mathbf{f}) +\end{equation} +where $\rho$ is the size of the step along the direction of greatest descent. Clearly, this step alone is far from our desired goal. First, it only fits $f$ at values of $\mathbf{x}$ for which we have observations. Second, it does not take into account that observations with similar $\mathbf{x}$ are likely to have similar values of $f(\mathbf{x})$. Both these problems would have disastrous effects on generalization error. However, Friedman suggests selecting a class of functions that use the covariate information to approximate the gradient, usually a regression tree. This line of reasoning produces his Gradient Boosting algorithm shown in Figure~\ref{fig:GradientBoost}. At each iteration the algorithm determines the direction, the gradient, in which it needs to improve the fit to the data and selects a particular model from the allowable class of functions that is in most agreement with the direction. In the case of squared-error loss, $\Psi(y_i,f(\mathbf{x}_i)) = \sum_{i=1}^N (y_i-f(\mathbf{x}_i))^2$, this algorithm corresponds exactly to residual fitting. + +There are various ways to extend and improve upon the basic framework suggested in Figure~\ref{fig:GradientBoost}. For example, Friedman (2001) substituted several choices in for $\Psi$ to develop new boosting algorithms for robust regression with least absolute deviation and Huber loss functions. Friedman (2002) showed that a simple subsampling trick can greatly improve predictive performance while simultaneously reduce computation time. Section~\ref{GBMModifications} discusses some of these modifications. 
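To make the connection between Figure~\ref{fig:GradientBoost} and residual fitting concrete, the following chunk is a minimal sketch of the gradient boosting loop for squared-error loss, using regression stumps from the \texttt{rpart} package as the base learner. The simulated data, the choice of \texttt{rpart}, and the fixed number of iterations are assumptions made only for this illustration; the chunk is a sketch of the algorithm in Figure~\ref{fig:GradientBoost}, not the \texttt{gbm} implementation itself.

<<residual-fitting, eval=FALSE>>=
# Minimal sketch (illustration only): gradient boosting with squared-error
# loss, where the negative gradient is simply the vector of residuals.
library(rpart)
set.seed(123)
n <- 500
x <- runif(n)
y <- sin(2 * pi * x) + rnorm(n, sd = 0.3)
dat <- data.frame(x = x, y = y)

n.iter <- 100                  # T, the number of boosting iterations
f.hat  <- rep(mean(y), n)      # initial constant minimizing squared-error loss
for (t in seq_len(n.iter)) {
  dat$z <- y - f.hat           # step 1: negative gradient = residuals
  g <- rpart(z ~ x, data = dat, maxdepth = 1, cp = 0)  # step 2: fit g(x) to z
  f.hat <- f.hat + predict(g, newdata = dat)           # steps 3-4: full step
}
@

For squared-error loss the per-node means returned by the stump already solve the line search in step 3, so taking a full step ($\rho=1$) reproduces the residual-fitting interpretation noted above.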
+ +\section{Improving boosting methods using control of the learning rate, sub-sampling, and a decomposition for interpretation} \label{GBMModifications} + +This section explores the variations of the previous algorithms that have the potential to improve their predictive performance and interpretability. In particular, by controlling the optimization speed or learning rate, introducing low-variance regression methods, and applying ideas from robust regression we can produce non-parametric regression procedures with many desirable properties. As a by-product some of these modifications lead directly into implementations for learning from massive datasets. All these methods take advantage of the general form of boosting +\begin{equation} +\hat f(\mathbf{x}) \leftarrow \hat f(\mathbf{x}) + \EV(z(y,\hat f(\mathbf{x}))|\mathbf{x}). +\end{equation} So far we have taken advantage of this form only by substituting in our favorite regression procedure for $\EV_w(z|\mathbf{x})$. I will discuss some modifications to estimating $\EV_w(z|\mathbf{x})$ that have the potential to improve our algorithm. + +\subsection{Decreasing the learning rate} As several authors have phrased slightly differently, ``...boosting, whatever flavor, seldom seems to overfit, no matter how many terms are included in the additive expansion''. This is not true, as the discussion of \cite{FHT:2000} points out. + +In the update step of any boosting algorithm we can introduce a learning rate to dampen the proposed move. +\begin{equation} +\label{eq:shrinkage} +\hat f(\mathbf{x}) \leftarrow \hat f(\mathbf{x}) + \lambda \EV(z(y,\hat f(\mathbf{x}))|\mathbf{x}). +\end{equation} +By multiplying the gradient step by $\lambda$ as in equation~\ref{eq:shrinkage} we have control over the rate at which the boosting algorithm descends the error surface (or ascends the likelihood surface). When $\lambda=1$ we return to performing full gradient steps. Friedman (2001) relates the learning rate to regularization through shrinkage. + +The optimal number of iterations, $T$, and the learning rate, $\lambda$, depend on each other. In practice I set $\lambda$ to be as small as possible and then select $T$ by cross-validation. Performance is best when $\lambda$ is as small as possible, with decreasing marginal utility for smaller and smaller $\lambda$. Slower learning rates do not simply scale the optimal number of iterations. That is, if the optimal $T$ is 100 iterations when $\lambda=1.0$, this does {\it not} necessarily imply that the optimal $T$ is 1000 iterations when $\lambda=0.1$. + +\subsection{Variance reduction using subsampling} + +Friedman (2002) proposed the stochastic gradient boosting algorithm that simply samples uniformly without replacement from the dataset before estimating the next gradient step. He found that this additional step greatly improved performance. We estimate the regression $\EV(z(y,\hat f(\mathbf{x}))|\mathbf{x})$ using a random subsample of the dataset. + +\subsection{ANOVA decomposition} + +Certain function approximation methods are decomposable in terms of a ``functional ANOVA decomposition''. That is, a function is decomposable as +\begin{equation} +\label{ANOVAdecomp} +f(\mathbf{x}) = \sum_j f_j(x_j) + \sum_{jk} f_{jk}(x_j,x_k) + \sum_{jk\ell} f_{jk\ell}(x_j,x_k,x_\ell) + \cdots. +\end{equation} This applies to boosted trees. Regression stumps (one-split decision trees) depend on only one variable and fall into the first term of \ref{ANOVAdecomp}.
Trees with two splits fall into the second term of \ref{ANOVAdecomp} and so on. By restricting the depth of the trees produced on each boosting iteration we can control the order of approximation. Often additive components are sufficient to approximate a multivariate function well; generalized additive models, the na\"{\i}ve Bayes classifier, and boosted stumps are examples. When the approximation is restricted to first order we can also produce plots of $x_j$ versus $f_j(x_j)$ to demonstrate how changes in $x_j$ might affect changes in the response variable. + +\subsection{Relative influence} Friedman (2001) also develops an extension of a variable's ``relative influence'' for boosted estimates. For tree-based methods the approximate relative influence of a variable $x_j$ is +\begin{equation} +\label{RelInfluence} +\hspace{0.5in} +\hat J_j^2 = \hspace{-0.1in}\sum_{\mathrm{splits~on~}x_j}\hspace{-0.2in}I_t^2 +\end{equation} where $I_t^2$ is the empirical improvement from splitting on $x_j$ at that point. Friedman's extension to boosted models is to average the relative influence of variable $x_j$ across all the trees generated by the boosting algorithm. + +\begin{figure} +\aRule +Select +\begin{itemize} +\item a loss function (\texttt{distribution}) +\item the number of iterations, $T$ (\texttt{n.trees}) +\item the depth of each tree, $K$ (\texttt{interaction.depth}) +\item the shrinkage (or learning rate) parameter, $\lambda$ (\texttt{shrinkage}) +\item the subsampling rate, $p$ (\texttt{bag.fraction}) +\end{itemize} +Initialize $\hat f(\mathbf{x})$ to be a constant, $\hat f(\mathbf{x}) = \arg \min_{\rho} \sum_{i=1}^N \Psi(y_i,\rho)$ \\ +For $t$ in $1,\ldots,T$ do +\begin{enumerate} +\item Compute the negative gradient as the working response + \begin{equation} + z_i = -\frac{\partial}{\partial f(\mathbf{x}_i)} \Psi(y_i,f(\mathbf{x}_i)) \mbox{\Huge $|$}_{f(\mathbf{x}_i)=\hat f(\mathbf{x}_i)} + \end{equation} +\item Randomly select $p\times N$ cases from the dataset +\item Fit a regression tree with $K$ terminal nodes, $g(\mathbf{x})=\EV(z|\mathbf{x})$. This tree is fit using only those randomly selected observations +\item Compute the optimal terminal node predictions, $\rho_1,\ldots,\rho_K$, as + \begin{equation} + \rho_k = \arg \min_{\rho} \sum_{\mathbf{x}_i\in S_k} \Psi(y_i,\hat f(\mathbf{x}_i)+\rho) + \end{equation} +where $S_k$ is the set of $\mathbf{x}$s that define terminal node $k$. Again this step uses only the randomly selected observations. +\item Update $\hat f(\mathbf{x})$ as + \begin{equation} + \hat f(\mathbf{x}) \leftarrow \hat f(\mathbf{x}) + \lambda\rho_{k(\mathbf{x})} + \end{equation} +where $k(\mathbf{x})$ indicates the index of the terminal node into which an observation with features $\mathbf{x}$ would fall. +\end{enumerate} +\aRule +\caption{Boosting as implemented in \texttt{gbm()}} +\label{fig:gbm} +\end{figure} + +\section{Common user options} + +This section discusses the options to gbm that most users will need to change or tune. + +\subsection{Loss function} + +The first and foremost choice is \texttt{distribution}. This should be easily dictated by the application. For most classification problems either \texttt{bernoulli} or \texttt{adaboost} will be appropriate, the former being recommended. For continuous outcomes the choices are \texttt{gaussian} (for minimizing squared error), \texttt{laplace} (for minimizing absolute error), and quantile regression (for estimating percentiles of the conditional distribution of the outcome).
Censored survival outcomes require \texttt{coxph}. Count outcomes may use \texttt{poisson}, although one might also consider \texttt{gaussian} or \texttt{laplace} depending on the analytical goals. + +\subsection{The relationship between shrinkage and number of iterations} The issues that most new users of gbm struggle with are the choices of \texttt{n.trees} and \texttt{shrinkage}. It is important to know that smaller values of \texttt{shrinkage} (almost) always give improved predictive performance. That is, setting \texttt{shrinkage=0.001} will almost certainly result in a model with better out-of-sample predictive performance than setting \texttt{shrinkage=0.01}. However, there are computational costs, both storage and CPU time, associated with setting \texttt{shrinkage} to be low. The model with \texttt{shrinkage=0.001} will likely require ten times as many iterations as the model with \texttt{shrinkage=0.01}, increasing storage and computation time by a factor of 10. Figure~\ref{fig:shrinkViters} shows the relationship between predictive performance, the number of iterations, and the shrinkage parameter. Note that the increase in the optimal number of iterations between two choices for shrinkage is roughly equal to the ratio of the shrinkage parameters. It is generally the case that for small shrinkage parameters, 0.001 for example, there is a fairly long plateau in which predictive performance is at its best. My rule of thumb is to set \texttt{shrinkage} as small as possible while still being able to fit the model in a reasonable amount of time and storage. I usually aim for 3,000 to 10,000 iterations with shrinkage rates between 0.01 and 0.001. + +\begin{figure}[ht] \begin{center} \includegraphics[width=5in]{shrinkage-v-iterations} \end{center} \caption{Out-of-sample predictive performance by number of iterations and shrinkage. Smaller values of the shrinkage parameter offer improved predictive performance, but with decreasing marginal improvement.} \label{fig:shrinkViters} \end{figure} + +\subsection{Estimating the optimal number of iterations} gbm offers three methods for estimating the optimal number of iterations after the gbm model has been fit: an independent test set (\texttt{test}), out-of-bag estimation (\texttt{OOB}), and $v$-fold cross validation (\texttt{cv}). The function \texttt{gbm.perf} computes the iteration estimate. + +Like Friedman's MART software, the independent test set method uses a single holdout test set to select the optimal number of iterations. If \texttt{train.fraction} is set to be less than 1, then only the \textit{first} \texttt{train.fraction}$\times$\texttt{nrow(data)} observations will be used to fit the model. Note that if the data are sorted in a systematic way (such as cases for which $y=1$ come first), then the data should be shuffled before running gbm. Those observations not used in the model fit can be used to get an unbiased estimate of the optimal number of iterations. The downside of this method is that a considerable number of observations are used to estimate the single regularization parameter (number of iterations), leaving a reduced dataset for estimating the entire multivariate model structure. Use \texttt{gbm.perf(...,method="test")} to obtain an estimate of the optimal number of iterations using the held-out test set. + +If \texttt{bag.fraction} is set to be greater than 0 (0.5 is recommended), gbm computes an out-of-bag estimate of the improvement in predictive performance.
It evaluates the reduction in deviance on those observations not used in selecting the next regression tree. The out-of-bag estimator underestimates the reduction in deviance. As a result, it is almost always too conservative in its selection of the optimal number of iterations. The motivation behind this method was to avoid having to set aside a large independent dataset, which reduces the information available for learning the model structure. Use \texttt{gbm.perf(...,method="OOB")} to obtain the OOB estimate. + +Lastly, gbm offers $v$-fold cross validation for estimating the optimal number of iterations. If \texttt{cv.folds=5} is specified when fitting the gbm model, then gbm will do 5-fold cross validation. gbm will fit five gbm models in order to compute the cross validation error estimate and then will fit a sixth and final gbm model with \texttt{n.trees} iterations using all of the data. The returned model object will have a component labeled \texttt{cv.error}. Note that \texttt{gbm.more} will do additional gbm iterations but will not add to the \texttt{cv.error} component. Use \texttt{gbm.perf(...,method="cv")} to obtain the cross validation estimate. A short worked example combining these options appears at the end of the introduction to the next section. + +\begin{figure}[ht] +\begin{center} +\includegraphics[width=5in]{oobperf2} +\end{center} +\caption{Out-of-sample predictive performance of four methods of selecting the optimal number of iterations. The vertical axis plots performance relative to the best. The boxplots indicate relative performance across thirteen real datasets from the UCI repository. See \texttt{demo(OOB-reps)}.} +\label{fig:oobperf} +\end{figure} + +Figure~\ref{fig:oobperf} compares the three methods for estimating the optimal number of iterations across 13 datasets. The boxplots show each method's performance relative to the best method on that dataset. For most datasets the methods perform similarly; however, 5-fold cross validation is consistently the best of them. OOB, using a 33\% test set, and using a 20\% test set all have datasets for which they perform considerably worse than the best method. My recommendation is to use 5- or 10-fold cross validation if you can afford the computing time. Otherwise you may choose among the other options, knowing that OOB is conservative. + +\section{Available distributions} + +This section gives some of the mathematical detail for each of the distribution options that gbm offers. The gbm engine, written in C++, has a class for each of these distributions. Each class contains methods for computing the associated deviance, initial value, the gradient, and the constants to predict in each terminal node. + +In the equations shown below, for non-zero offset terms, replace $f(\mathbf{x}_i)$ with $o_i + f(\mathbf{x}_i)$.
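+
+Before turning to the individual distributions, the following short example shows how the preceding options fit together in practice: choose a \texttt{distribution}, use a small \texttt{shrinkage} with a generous \texttt{n.trees}, and let \texttt{gbm.perf} estimate the optimal number of iterations. The data and the particular parameter values below are invented purely for illustration.
+
+\begin{verbatim}
+library(gbm)
+set.seed(101)
+
+# simulated data, for illustration only
+N <- 2000
+dat <- data.frame(x1 = runif(N), x2 = runif(N))
+dat$y <- rbinom(N, 1, plogis(2 * dat$x1 - dat$x2))
+
+fit <- gbm(y ~ x1 + x2, data = dat, distribution = "bernoulli",
+           n.trees = 3000, shrinkage = 0.01, interaction.depth = 3,
+           bag.fraction = 0.5, cv.folds = 5)
+
+# cross-validated estimate of the optimal number of iterations;
+# method = "OOB", or method = "test" with train.fraction < 1, also work
+best.iter <- gbm.perf(fit, method = "cv")
+\end{verbatim}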
+ +\subsection{Gaussian} + +\begin{tabular}{ll} +Deviance & $\displaystyle \frac{1}{\sum w_i} \sum w_i(y_i-f(\mathbf{x}_i))^2$ \\ +Initial value & $\displaystyle f(\mathbf{x})=\frac{\sum w_i(y_i-o_i)}{\sum w_i}$ \\ +Gradient & $z_i=y_i - f(\mathbf{x}_i)$ \\ +Terminal node estimates & $\displaystyle \frac{\sum w_i(y_i-f(\mathbf{x}_i))}{\sum w_i}$ +\end{tabular} + +\subsection{AdaBoost} + +\begin{tabular}{ll} Deviance & $\displaystyle \frac{1}{\sum w_i} \sum w_i\exp(-(2y_i-1)f(\mathbf{x}_i))$ \\ Initial value & $\displaystyle \frac{1}{2}\log\frac{\sum y_iw_ie^{-o_i}}{\sum (1-y_i)w_ie^{o_i}}$ \\ Gradient & $\displaystyle z_i= -(2y_i-1)\exp(-(2y_i-1)f(\mathbf{x}_i))$ \\ Terminal node estimates & $\displaystyle \frac{\sum (2y_i-1)w_i\exp(-(2y_i-1)f(\mathbf{x}_i))} + {\sum w_i\exp(-(2y_i-1)f(\mathbf{x}_i))}$ +\end{tabular} + +\subsection{Bernoulli} + +\begin{tabular}{ll} Deviance & $\displaystyle -2\frac{1}{\sum w_i} \sum w_i(y_if(\mathbf{x}_i)-\log(1+\exp(f(\mathbf{x}_i))))$ \\ Initial value & $\displaystyle \log\frac{\sum w_iy_i}{\sum w_i(1-y_i)}$ \\ Gradient & $\displaystyle z_i=y_i-\frac{1}{1+\exp(-f(\mathbf{x}_i))}$ \\ Terminal node estimates & $\displaystyle \frac{\sum w_i(y_i-p_i)}{\sum w_ip_i(1-p_i)}$ \\ + & where $\displaystyle p_i = \frac{1}{1+\exp(-f(\mathbf{x}_i))}$ \\ +\end{tabular} + +Notes: \begin{itemize} \item For non-zero offset terms, the computation of the initial value requires Newton-Raphson. Initialize $f_0=0$ and iterate $\displaystyle f_0 \leftarrow f_0 + \frac{\sum w_i(y_i-p_i)}{\sum w_ip_i(1-p_i)}$ where $\displaystyle p_i = \frac{1}{1+\exp(-(o_i+f_0))}$. \end{itemize} + +\subsection{Laplace} + +\begin{tabular}{ll} Deviance & $\frac{1}{\sum w_i} \sum w_i|y_i-f(\mathbf{x}_i)|$ \\ Initial value & $\mbox{median}_w(y)$ \\ Gradient & $z_i=\mbox{sign}(y_i-f(\mathbf{x}_i))$ \\ Terminal node estimates & $\mbox{median}_w(z)$ \end{tabular} + +Notes: \begin{itemize} \item $\mbox{median}_w(y)$ denotes the weighted median, defined as the solution to the equation $\frac{\sum w_iI(y_i\leq m)}{\sum w_i}=\frac{1}{2}$ \item \texttt{gbm()} currently does not implement the weighted median and issues a warning when the user uses weighted data with \texttt{distribution="laplace"}. \end{itemize} + + +\subsection{Quantile regression} + +Contributed by Brian Kriegler (see \cite{Kriegler:2010}). + +\begin{tabular}{ll} Deviance & $\frac{1}{\sum w_i} + \left(\alpha\sum_{y_i>f(\mathbf{x}_i)} w_i(y_i-f(\mathbf{x}_i))\right. +$ \\ + & \hspace{0.5in}$\left.(1-\alpha)\sum_{y_i\leq f(\mathbf{x}_i)} w_i(f(\mathbf{x}_i)-y_i)\right)$ \\ +Initial value & $\mathrm{quantile}^{(\alpha)}_w(y)$ \\ Gradient & $z_i=\alpha I(y_i>f(\mathbf{x}_i))-(1-\alpha)I(y_i\leq f(\mathbf{x}_i))$ \\ Terminal node estimates & $\mathrm{quantile}^{(\alpha)}_w(z)$ \end{tabular} + +Notes: \begin{itemize} \item $\mathrm{quantile}^{(\alpha)}_w(y)$ denotes the weighted quantile, defined as the solution to the equation $\frac{\sum w_iI(y_i\leq q)}{\sum w_i}=\alpha$ \item \texttt{gbm()} currently does not implement the weighted quantile and issues a warning when the user uses weighted data with \texttt{distribution=list(name="quantile")}.
\end{itemize} + + +\subsection{Cox Proportional Hazards} + +\begin{tabular}{ll} Deviance & $-2\sum w_i(\delta_i(f(\mathbf{x}_i)-\log(R_i/w_i)))$\\ Gradient & $\displaystyle z_i=\delta_i - \sum_j \delta_j + \frac{w_jI(t_i\geq t_j)e^{f(\mathbf{x}_i)}} + {\sum_k w_kI(t_k\geq t_j)e^{f(\mathbf{x}_k)}}$ \\ +Initial value & 0 \\ Terminal node estimates & Newton-Raphson algorithm \end{tabular} + +\begin{enumerate} + \item Initialize the terminal node predictions to 0, $\mathgbf{\rho}=0$ + \item Let $\displaystyle + p_i^{(k)}=\frac{\sum_j I(k(j)=k)I(t_j\geq t_i)e^{f(\mathbf{x}_i)+\rho_k}} + {\sum_j I(t_j\geq t_i)e^{f(\mathbf{x}_i)+\rho_k}}$ + \item Let $g_k=\sum w_i\delta_i\left(I(k(i)=k)-p_i^{(k)}\right)$ + \item Let $\mathbf{H}$ be a $k\times k$ matrix with entries + \begin{enumerate} + \item Set diagonal elements $H_{mm}=\sum w_i\delta_i p_i^{(m)}\left(1-p_i^{(m)}\right)$ + \item Set off-diagonal elements $H_{mn}=-\sum w_i\delta_i p_i^{(m)}p_i^{(n)}$ + \end{enumerate} + \item Newton-Raphson update $\mathgbf{\rho} \leftarrow \mathgbf{\rho} - \mathbf{H}^{-1}\mathbf{g}$ + \item Return to step 2 until convergence +\end{enumerate} + +Notes: +\begin{itemize} +\item $t_i$ is the survival time and $\delta_i$ is the death indicator. +\item $R_i$ denotes the hazard for the risk set, $R_i=\sum_{j=1}^N w_jI(t_j\geq t_i)e^{f(\mathbf{x}_j)}$ +\item $k(i)$ indexes the terminal node of observation $i$ +\item For speed, \texttt{gbm()} does only one step of the Newton-Raphson algorithm rather than iterating to convergence. There is no appreciable loss of accuracy, since the next boosting iteration will simply correct for the prior iteration's inadequacy. +\item \texttt{gbm()} initially sorts the data by survival time. Doing this reduces the computation of the risk set from $O(n^2)$ to $O(n)$ at the cost of a single up-front sort on survival time. After the model is fit, the data are then put back in their original order. +\end{itemize} + +\subsection{Poisson} +\begin{tabular}{ll} +Deviance & $\displaystyle -2\frac{1}{\sum w_i} \sum w_i(y_if(\mathbf{x}_i)-\exp(f(\mathbf{x}_i)))$ \\ +Initial value & $\displaystyle f(\mathbf{x})= \log\left(\frac{\sum w_iy_i}{\sum w_ie^{o_i}}\right)$ \\ +Gradient & $z_i=y_i - \exp(f(\mathbf{x}_i))$ \\ +Terminal node estimates & $\displaystyle \log\frac{\sum w_iy_i}{\sum w_i\exp(f(\mathbf{x}_i))}$ +\end{tabular} + +The Poisson class includes special safeguards so that the most extreme predicted values are $e^{-19}$ and $e^{+19}$. This behavior is consistent with \texttt{glm()}. + +\subsection{Pairwise} + +This distribution implements ranking measures following the +\emph{LambdaMart} algorithm \cite{Burges:2010}. Instances belong to +\emph{groups}; all pairs of items with different labels, belonging to +the same group, are used for training. In \emph{Information Retrieval} +applications, groups correspond to user queries, +and items to (feature vectors of) documents in the associated match +set to be ranked. + +For consistency with typical usage, our goal is to \emph{maximize} one +of the \emph{utility} functions listed below. Consider a group with +instances $x_1, \dots, x_n$, ordered such that $f(x_1) \geq f(x_2) +\geq \dots \geq f(x_n)$; i.e., the \emph{rank} of $x_i$ is $i$, where +smaller ranks are preferable. Let $P$ be the set of all ordered pairs +such that $y_i > y_j$. + +\begin{enumerate} +\item[{\bf Concordance:}] Fraction of concordant (i.e., correctly ordered) + pairs. For the special case of binary labels, this is equivalent to + the Area under the ROC Curve.
+$$\left\{ \begin{array}{l l}\frac{\|\{(i,j)\in P | + f(x_i)>f(x_j)\}\|}{\|P\|} + & P \neq \emptyset\\ + 0 & \mbox{otherwise.} + \end{array}\right. +$$ +\item[{\bf MRR:}] Mean reciprocal rank of the highest-ranked positive + instance (it is assumed $y_i\in\{0,1\}$): +$$\left\{ \begin{array}{l l}\frac{1}{\min\{1 \leq i \leq n |y_i=1\}} + & \exists i: \, 1 \leq i \leq n, y_i=1\\ + 0 & \mbox{otherwise.}\end{array}\right.$$ +\item[{\bf MAP:}] Mean average precision, a generalization of + MRR to multiple positive instances: +$$\left\{ \begin{array}{l l} \frac{\sum_{1\leq i\leq n | y_i=1} \|\{1\leq j\leq i + |y_j=1\}\|\,/\,i}{\|\{1\leq i\leq n | y_i=1\}\|} & \exists i: \, + 1 \leq i \leq n, y_i=1\\ + 0 & \mbox{otherwise.}\end{array}\right.$$ +\item[{\bf nDCG:}] Normalized discounted cumulative gain: +$$\frac{\sum_{1\leq i\leq n} y_i\,/\,\log_2(i+1)}{\sum_{1\leq i\leq n} + y'_i\,/\,\log_2(i+1)},$$ where $y'_1, \dots, y'_n$ is a reordering of $y_1, + \dots,y_n$ with $y'_1 \geq y'_2 \geq \dots \geq y'_n$. +\end{enumerate} + +The generalization to multiple (possibly weighted) groups is +straightforward. Sometimes a cut-off rank $k$ is given for \emph{MRR} +and \emph{nDCG}, in which case we replace the outer index $n$ by +$\min(n,k)$. + +The initial value for $f(x_i)$ is always zero. We derive the gradient of +a cost function whose gradient locally approximates the gradient of +the IR measure for a fixed ranking: + +\begin{eqnarray*} +\Phi & = & \sum_{(i,j) \in P} \Phi_{ij}\\ + & = & \sum_{(i,j) \in P} |\Delta Z_{ij}| \log \left( 1 + e^{-(f(x_i) - + f(x_j))}\right), +\end{eqnarray*} +where $|\Delta Z_{ij}|$ is the absolute utility difference when +swapping the ranks of $i$ and $j$, while leaving all other instances +the same. Define +\begin{eqnarray*} + \lambda_{ij} & = & \frac{\partial\Phi_{ij}}{\partial f(x_i)}\\ + & = & - |\Delta Z_{ij}| \frac{1}{1 + e^{f(x_i) - f(x_j)}}\\ +& = & - |\Delta Z_{ij}| \, \rho_{ij}, +\end{eqnarray*} +with +$$ \rho_{ij} = - \frac{\lambda_{ij}}{|\Delta Z_{ij}|} = \frac{1}{1 + e^{f(x_i) - f(x_j)}}$$ + + For the gradient of $\Phi$ with respect to $f(x_i)$, define +\begin{eqnarray*} +\lambda_i & = & \frac{\partial \Phi}{\partial f(x_i)}\\ +& = & \sum_{j|(i,j) \in P} \lambda_{ij} - \sum_{j|(j,i) \in P} \lambda_{ji}\\ +& = & - \sum_{j|(i,j) \in P} |\Delta Z_{ij}| \, \rho_{ij}\\ +& & \mbox{} + \sum_{j|(j,i) \in P} |\Delta Z_{ji}| \, \rho_{ji}. +\end{eqnarray*} + + The second derivative is +\begin{eqnarray*} + \gamma_i & \stackrel{def}{=} & \frac{\partial^2\Phi}{\partial f(x_i)^2}\\ + & = & \sum_{j|(i,j) \in P} |\Delta Z_{ij}| \, \rho_{ij} \, (1-\rho_{ij})\\ +& & \mbox{} + \sum_{j|(j,i) \in P} |\Delta Z_{ji}| \, \rho_{ji} \, (1-\rho_{ji}). +\end{eqnarray*} + +Now consider again all groups with associated weights. For a given terminal node, let $i$ +range over all contained instances. Then its estimate is +$$-\frac{\sum_i v_i\lambda_{i}}{\sum_i v_i \gamma_i},$$ where +$v_i=w(\mbox{\em group}(i))/\|\{(j,k)\in\mbox{\em group}(i)\}\|.$ + +In each iteration, instances are reranked according to the preliminary +scores $f(x_i)$ to determine the $|\Delta Z_{ij}|$. Note that in order +to avoid ranking bias, we break ties by adding a small amount of +random noise. + + + +\bibliography{gbm} + +\end{document} diff --git a/vignettes/gbm.bib b/vignettes/gbm.bib new file mode 100644 index 0000000..144b2c5 --- /dev/null +++ b/vignettes/gbm.bib @@ -0,0 +1,65 @@ +@article{FreundSchapire:1997, + author = {Y. Freund and R. E.
Schapire}, + title = {A decision-theoretic generalization of on-line learning and an application to boosting}, + journal = {Journal of Computer and System Sciences}, + volume = {55}, + number = {1}, + pages = {119--139}, + year = {1997} +} + +@article{Friedman:2001, + author = {J. H. Friedman}, + title = {Greedy Function Approximation: A Gradient Boosting Machine}, + journal = {Annals of Statistics}, + volume = {29}, + number = {5}, + pages = {1189--1232}, + year = {2001} +} + +@article{Friedman:2002, + author = {J. H. Friedman}, + title = {Stochastic Gradient Boosting}, + journal = {Computational Statistics and Data Analysis}, + volume = {38}, + number = {4}, + pages = {367--378}, + year = {2002} +} + +@article{FHT:2000, + author = {J. H. Friedman and T. Hastie and R. Tibshirani}, + title = {Additive Logistic Regression: a Statistical View of Boosting}, + journal = {Annals of Statistics}, + volume = {28}, + number = {2}, + pages = {337--374}, + year = {2000} +} + +@article{Kriegler:2010, + author = {B. Kriegler and R. Berk}, + title = {Small Area Estimation of the Homeless in Los Angeles: An Application of Cost-Sensitive Stochastic Gradient Boosting}, + journal = {Annals of Applied Statistics}, + volume = {4}, + number = {3}, + pages = {1234--1255}, + year = {2010} +} + +@article{Ridgeway:1999, + author = {G. Ridgeway}, + title = {The state of boosting}, + journal = {Computing Science and Statistics}, + volume = {31}, + pages = {172--181}, + year = {1999} +} + +@article{Burges:2010, + author = {C. Burges}, + title = {From RankNet to LambdaRank to LambdaMART: An Overview}, + journal = {Microsoft Research Technical Report MSR-TR-2010-82}, + year = {2010} +} diff --git a/vignettes/oobperf2.pdf b/vignettes/oobperf2.pdf new file mode 100644 index 0000000..67571bb Binary files /dev/null and b/vignettes/oobperf2.pdf differ diff --git a/vignettes/shrinkage-v-iterations.pdf b/vignettes/shrinkage-v-iterations.pdf new file mode 100644 index 0000000..12a3ed4 Binary files /dev/null and b/vignettes/srcltx.sty b/vignettes/srcltx.sty new file mode 100644 index 0000000..a38d206 --- /dev/null +++ b/vignettes/srcltx.sty @@ -0,0 +1,172 @@ +%% +%% This is file `srcltx.sty', +%% generated with the docstrip utility. +%% +%% The original source files were: +%% +%% srcltx.dtx (with options: `package,latex') +%% +%% This package is in the public domain. It comes with no guarantees +%% and no reserved rights. You can use or modify this package at your +%% own risk.
+%% Originally written by: Aleksander Simonic +%% Current maintainer: Stefan Ulrich +%% +\NeedsTeXFormat{LaTeX2e} +\ProvidesPackage{srcltx}[2006/11/12 v1.6 Source specials for inverse search in DVI files] +\newif\ifSRCOK \SRCOKtrue +\newif\ifsrc@debug@ +\newif\ifsrc@dviwin@ +\newif\ifsrc@winedt@\src@winedt@true +\newif\ifsrc@everypar@\src@everypar@true +\newif\ifsrc@everymath@\src@everymath@true +\RequirePackage{ifthen} +\DeclareOption{active}{\SRCOKtrue} +\DeclareOption{inactive}{\SRCOKfalse} +\DeclareOption{nowinedt}{\src@winedt@false} +\DeclareOption{debug}{\src@debug@true} +\DeclareOption{nopar}{\global\src@everypar@false} +\DeclareOption{nomath}{\global\src@everymath@false} +\newcommand*\src@maybe@space{} +\let\src@maybe@space\space +\DeclareOption{dviwin}{\let\src@maybe@space\relax} +\ExecuteOptions{active} +\ProcessOptions +\newcount\src@lastline +\global\src@lastline=-1 +\newcommand*\src@debug{} +\def\src@debug#1{\ifsrc@debug@\typeout{DBG: |#1|}\fi} +\newcommand*\MainFile{} +\def\MainFile{\jobname.tex} +\newcommand*\CurrentInput{} +\gdef\CurrentInput{\MainFile} +\newcommand*\WinEdt{} +\def\WinEdt#1{\ifsrc@winedt@\typeout{:#1}\fi} +\newcommand\src@AfterFi{} +\def\src@AfterFi#1\fi{\fi#1} +\AtBeginDocument{% + \@ifpackageloaded{soul}{% + \let\src@SOUL@\SOUL@ + \def\SOUL@#1{% + \ifSRCOK + \SRCOKfalse\src@SOUL@{#1}\SRCOKtrue + \else + \src@AfterFi\src@SOUL@{#1}% + \fi + }% + }{}% +} +\newcommand*\srcIncludeHook[1]{\protected@xdef\CurrentInput{#1.tex}} +\newcommand*\srcInputHook[1]{% + \src@getfilename@with@ext{#1}% +} +\newcommand*\src@spec{} +\def\src@spec{% + \ifSRCOK + \ifnum\inputlineno>\src@lastline + \global\src@lastline=\inputlineno + \src@debug{% + src:\the\inputlineno\src@maybe@space\CurrentInput}% + \special{src:\the\inputlineno\src@maybe@space\CurrentInput}% + \fi + \fi +} +\newcommand\src@before@file@hook{} +\newcommand\src@after@file@hook{} +\def\src@before@file@hook{% + \WinEdt{<+ \CurrentInput}% + \global\src@lastline=0 + \ifSRCOK\special{src:1\src@maybe@space\CurrentInput}\fi +} +\def\src@after@file@hook#1{% + \WinEdt{<-}% + \global\src@lastline=\inputlineno + \global\advance\src@lastline by -1% + \gdef\CurrentInput{#1}% + \src@spec +} +\newcommand*\src@fname{}% +\newcommand*\src@tempa{}% +\newcommand*\src@extensions@path{}% +\newcommand*\src@getfilename@with@ext{}% +\def\src@extensions@path#1.#2\end{% + \ifthenelse{\equal{#2}{}}{% + \protected@edef\src@extensions@last{#1}% + \let\src@tempa\relax + }{% + \def\src@tempa{\src@extensions@path#2\end}% + }% + \src@tempa +} +\def\src@getfilename@with@ext#1{% + \expandafter\src@extensions@path#1.\end + \ifthenelse{\equal{\src@extensions@last}{tex}}{% + \protected@xdef\CurrentInput{#1}% + }{% + \protected@xdef\CurrentInput{#1.tex}% + }% + \PackageInfo{srcltx}{Expanded filename `#1' to `\CurrentInput'}% +} +\newcommand*\src@include{} +\newcommand*\src@@include{} +\let\src@include\include +\def\include#1{% + \src@spec + \clearpage + \expandafter\src@@include\expandafter{\CurrentInput}{#1}% +}% +\def\src@@include#1#2{% + \srcIncludeHook{#2}% + \src@before@file@hook + \src@include{#2}% + \src@after@file@hook{#1}% +} +\newcommand*\src@input{} +\newcommand*\src@@input{} +\newcommand*\src@@@input{} +\let\src@input\input +\def\input{\src@spec\@ifnextchar\bgroup\src@@input\@@input}% +\def\src@@input#1{% + \expandafter\src@@@input\expandafter{\CurrentInput}{#1}% +} +\def\src@@@input#1#2{% + \srcInputHook{#2}% + \src@before@file@hook + \src@input{#2}% + \src@after@file@hook{#1}% +} +\newcommand\Input{} +\let\Input\input +\ifsrc@everypar@ + 
\newcommand*\src@old@everypar{} + \let\src@old@everypar\everypar + \newtoks\src@new@everypar + \let\everypar\src@new@everypar + \everypar\expandafter{\the\src@old@everypar} + \src@old@everypar{\the\src@new@everypar\src@spec} +\fi +\ifsrc@everymath@ + \def\@tempa#1\the\everymath#2\delimiter{{#1\src@spec\the\everymath#2}} + \frozen@everymath=\expandafter\@tempa\the\frozen@everymath\delimiter +\fi +\newcommand*\src@bibliography{} +\newcommand*\src@@bibliography{} +\let\src@bibliography\bibliography +\def\bibliography#1{% + \expandafter\src@@bibliography\expandafter{\CurrentInput}{#1}% +} +\def\src@@bibliography#1#2{% + \protected@xdef\CurrentInput{\jobname.bbl}% + \src@before@file@hook + \src@bibliography{#2}% + \src@after@file@hook{#1}% +} +\newcommand*\src@old@output{} +\let\src@old@output\output +\newtoks\src@new@output +\let\output\src@new@output +\output\expandafter{\the\src@old@output} +\src@old@output{\SRCOKfalse\the\src@new@output} +\endinput +%% +%% End of file `srcltx.sty'.