New upstream release.
Debian Janitor
2 years ago
0 | # Byte-compiled / optimized / DLL files | |
1 | __pycache__/ | |
2 | *.py[cod] | |
3 | ||
4 | # C extensions | |
5 | *.so | |
6 | ||
7 | # Distribution / packaging | |
8 | .Python | |
9 | env/ | |
10 | build/ | |
11 | develop-eggs/ | |
12 | dist/ | |
13 | downloads/ | |
14 | eggs/ | |
15 | .eggs/ | |
16 | lib64/ | |
17 | parts/ | |
18 | sdist/ | |
19 | var/ | |
20 | *.egg-info/ | |
21 | .installed.cfg | |
22 | *.egg | |
23 | ||
24 | # PyInstaller | |
25 | # Usually these files are written by a python script from a template | |
26 | # before PyInstaller builds the exe, so as to inject date/other infos into it. | |
27 | *.manifest | |
28 | *.spec | |
29 | ||
30 | # Installer logs | |
31 | pip-log.txt | |
32 | pip-delete-this-directory.txt | |
33 | ||
34 | # Unit test / coverage reports | |
35 | htmlcov/ | |
36 | .tox/ | |
37 | .coverage | |
38 | .coverage.* | |
39 | .cache | |
40 | nosetests.xml | |
41 | coverage.xml | |
42 | *,cover | |
43 | ||
44 | # Translations | |
45 | *.mo | |
46 | *.pot | |
47 | ||
48 | # Django stuff: | |
49 | *.log | |
50 | ||
51 | # Sphinx documentation | |
52 | docs/_build/ | |
53 | ||
54 | # PyBuilder | |
55 | target/ | |
56 | ||
57 | *.o | |
58 | #./setup.py sdist creates this | |
59 | MANIFEST | |
60 | ||
61 | *.swp |
0 | language: python | |
1 | python: | |
2 | - "3.5" | |
3 | - "3.6" | |
4 | install: | |
5 | - pip install numpy && python ./setup.py install | |
6 | matrix: | |
7 | include: | |
8 | - python: 3.7 | |
9 | dist: xenial | |
10 | script: nosetests -sv |
0 | Metadata-Version: 2.1 | |
1 | Name: pyBigWig | |
2 | Version: 0.3.18 | |
3 | Summary: A package for accessing bigWig files using libBigWig | |
4 | Home-page: https://github.com/dpryan79/pyBigWig | |
5 | Author: Devon P. Ryan | |
6 | Author-email: ryan@ie-freiburg.mpg.de | |
7 | License: UNKNOWN | |
8 | Download-URL: https://github.com/dpryan79/pyBigWig/tarball/0.3.13 | |
9 | Description: UNKNOWN | |
10 | Keywords: bioinformatics,bigWig,bigBed | |
11 | Platform: UNKNOWN | |
12 | Provides-Extra: numpy input |
6 | 6 | ================= |
7 | 7 | |
8 | 8 | * [Installation](#installation) |
9 | * [Requirements](#requirements) | |
9 | 10 | * [Usage](#usage) |
10 | 11 | * [Load the extension](#load-the-extension) |
11 | 12 | * [Open a bigWig or bigBed file](#open-a-bigwig-or-bigbed-file) |
33 | 34 | |
34 | 35 | or with conda |
35 | 36 | |
36 | conda install pybigwig -c bioconda | |
37 | ||
38 | Note that libcurl (and the `curl-config` command) are required for installation. This is typically already installed on many Linux and OSX systems (if you install with conda then this will happen automatically). | |
37 | conda install pybigwig -c conda-forge -c bioconda | |
38 | ||
39 | ## Requirements | |
40 | ||
41 | The follow non-python requirements must be installed: | |
42 | ||
43 | - libcurl (and the `curl-config` config) | |
44 | - zlib | |
45 | ||
46 | The headers and libraries for these are required. | |
39 | 47 | |
40 | 48 | # Usage |
41 | 49 | Basic usage is as follows: |
0 | pybigwig (0.3.18-1) UNRELEASED; urgency=low | |
1 | ||
2 | * New upstream release. | |
3 | ||
4 | -- Debian Janitor <janitor@jelmer.uk> Sun, 06 Jun 2021 10:02:01 -0000 | |
5 | ||
0 | 6 | pybigwig (0.3.17-1) unstable; urgency=medium |
1 | 7 | |
2 | 8 | [ Steffen Möller ] |
0 | The MIT License (MIT) | |
1 | ||
2 | Copyright (c) 2015 Devon Ryan | |
3 | ||
4 | Permission is hereby granted, free of charge, to any person obtaining a copy | |
5 | of this software and associated documentation files (the "Software"), to deal | |
6 | in the Software without restriction, including without limitation the rights | |
7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
8 | copies of the Software, and to permit persons to whom the Software is | |
9 | furnished to do so, subject to the following conditions: | |
10 | ||
11 | The above copyright notice and this permission notice shall be included in all | |
12 | copies or substantial portions of the Software. | |
13 | ||
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
20 | SOFTWARE. | |
21 |
0 | ![Master build status](https://travis-ci.org/dpryan79/libBigWig.svg?branch=master) [![DOI](https://zenodo.org/badge/doi/10.5281/zenodo.45278.svg)](http://dx.doi.org/10.5281/zenodo.45278) | |
1 | ||
2 | A C library for reading/parsing local and remote bigWig and bigBed files. While Kent's source code is free to use for these purposes, it's really inappropriate as library code since it has the unfortunate habit of calling `exit()` whenever there's an error. If that's then used inside of something like python then the python interpreter gets killed. This library is aimed at resolving these sorts of issues and should also use more standard things like curl and has a friendlier license to boot. | |
3 | ||
4 | Documentation is automatically generated by doxygen and can be found under `docs/html` or online [here](https://cdn.rawgit.com/dpryan79/libBigWig/master/docs/html/index.html). | |
5 | ||
6 | # Example | |
7 | ||
8 | The only functions and structures that end users need to care about are in "bigWig.h". Below is a commented example. You can see the files under `test/` for further examples. | |
9 | ||
10 | #include "bigWig.h" | |
11 | int main(int argc, char *argv[]) { | |
12 | bigWigFile_t *fp = NULL; | |
13 | bwOverlappingIntervals_t *intervals = NULL; | |
14 | double *stats = NULL; | |
15 | if(argc != 2) { | |
16 | fprintf(stderr, "Usage: %s {file.bw|URL://path/file.bw}\n", argv[0]); | |
17 | return 1; | |
18 | } | |
19 | ||
20 | //Initialize enough space to hold 128KiB (1<<17) of data at a time | |
21 | if(bwInit(1<<17) != 0) { | |
22 | fprintf(stderr, "Received an error in bwInit\n"); | |
23 | return 1; | |
24 | } | |
25 | ||
26 | //Open the local/remote file | |
27 | fp = bwOpen(argv[1], NULL, "r"); | |
28 | if(!fp) { | |
29 | fprintf(stderr, "An error occurred while opening %s\n", argv[1]); | |
30 | return 1; | |
31 | } | |
32 | ||
33 | //Get values in a range (0-based, half open) without NAs | |
34 | intervals = bwGetValues(fp, "chr1", 10000000, 10000100, 0); | |
35 | bwDestroyOverlappingIntervals(intervals); //Free allocated memory | |
36 | ||
37 | //Get values in a range (0-based, half open) with NAs | |
38 | intervals = bwGetValues(fp, "chr1", 10000000, 10000100, 1); | |
39 | bwDestroyOverlappingIntervals(intervals); //Free allocated memory | |
40 | ||
41 | //Get the full intervals that overlap | |
42 | intervals = bwGetOverlappingIntervals(fp, "chr1", 10000000, 10000100); | |
43 | bwDestroyOverlappingIntervals(intervals); | |
44 | ||
45 | //Get an example statistic - standard deviation | |
46 | //We want ~4 bins in the range | |
47 | stats = bwStats(fp, "chr1", 10000000, 10000100, 4, dev); | |
48 | if(stats) { | |
49 | printf("chr1:10000000-10000100 std. dev.: %f %f %f %f\n", stats[0], stats[1], stats[2], stats[3]); | |
50 | free(stats); | |
51 | } | |
52 | ||
53 | bwClose(fp); | |
54 | bwCleanup(); | |
55 | return 0; | |
56 | } | |
57 | ||
58 | ##Writing example | |
59 | ||
60 | N.B., creation of bigBed files is not supported (there are no plans to change this). | |
61 | ||
62 | Below is an example of how to write bigWig files. You can also find this file under `test/exampleWrite.c`. Unlike with Kent's tools, you can create bigWig files entry by entry without needing an intermediate wiggle or bedGraph file. Entries in bigWig files are stored in blocks with each entry in a block referring to the same chromosome and having the same type, of which there are three (see the [wiggle specification](http://genome.ucsc.edu/goldenpath/help/wiggle.html) for more information on this). | |
63 | ||
64 | #include "bigWig.h" | |
65 | ||
66 | int main(int argc, char *argv[]) { | |
67 | bigWigFile_t *fp = NULL; | |
68 | char *chroms[] = {"1", "2"}; | |
69 | char *chromsUse[] = {"1", "1", "1"}; | |
70 | uint32_t chrLens[] = {1000000, 1500000}; | |
71 | uint32_t starts[] = {0, 100, 125, | |
72 | 200, 220, 230, | |
73 | 500, 600, 625, | |
74 | 700, 800, 850}; | |
75 | uint32_t ends[] = {5, 120, 126, | |
76 | 205, 226, 231}; | |
77 | float values[] = {0.0f, 1.0f, 200.0f, | |
78 | -2.0f, 150.0f, 25.0f, | |
79 | 0.0f, 1.0f, 200.0f, | |
80 | -2.0f, 150.0f, 25.0f, | |
81 | -5.0f, -20.0f, 25.0f, | |
82 | -5.0f, -20.0f, 25.0f}; | |
83 | ||
84 | if(bwInit(1<<17) != 0) { | |
85 | fprintf(stderr, "Received an error in bwInit\n"); | |
86 | return 1; | |
87 | } | |
88 | ||
89 | fp = bwOpen("example_output.bw", NULL, "w"); | |
90 | if(!fp) { | |
91 | fprintf(stderr, "An error occurred while opening example_output.bw for writingn\n"); | |
92 | return 1; | |
93 | } | |
94 | ||
95 | //Allow up to 10 zoom levels, though fewer will be used in practice | |
96 | if(bwCreateHdr(fp, 10)) goto error; | |
97 | ||
98 | //Create the chromosome lists | |
99 | fp->cl = bwCreateChromList(chroms, chrLens, 2); | |
100 | if(!fp->cl) goto error; | |
101 | ||
102 | //Write the header | |
103 | if(bwWriteHdr(fp)) goto error; | |
104 | ||
105 | //Some example bedGraph-like entries | |
106 | if(bwAddIntervals(fp, chromsUse, starts, ends, values, 3)) goto error; | |
107 | //We can continue appending similarly formatted entries | |
108 | //N.B. you can't append a different chromosome (those always go into different | |
109 | if(bwAppendIntervals(fp, starts+3, ends+3, values+3, 3)) goto error; | |
110 | ||
111 | //Add a new block of entries with a span. Since bwAdd/AppendIntervals was just used we MUST create a new block | |
112 | if(bwAddIntervalSpans(fp, "1", starts+6, 20, values+6, 3)) goto error; | |
113 | //We can continue appending similarly formatted entries | |
114 | if(bwAppendIntervalSpans(fp, starts+9, values+9, 3)) goto error; | |
115 | ||
116 | //Add a new block of fixed-step entries | |
117 | if(bwAddIntervalSpanSteps(fp, "1", 900, 20, 30, values+12, 3)) goto error; | |
118 | //The start is then 760, since that's where the previous step ended | |
119 | if(bwAppendIntervalSpanSteps(fp, values+15, 3)) goto error; | |
120 | ||
121 | //Add a new chromosome | |
122 | chromsUse[0] = "2"; | |
123 | chromsUse[1] = "2"; | |
124 | chromsUse[2] = "2"; | |
125 | if(bwAddIntervals(fp, chromsUse, starts, ends, values, 3)) goto error; | |
126 | ||
127 | //Closing the file causes the zoom levels to be created | |
128 | bwClose(fp); | |
129 | bwCleanup(); | |
130 | ||
131 | return 0; | |
132 | ||
133 | error: | |
134 | fprintf(stderr, "Received an error somewhere!\n"); | |
135 | bwClose(fp); | |
136 | bwCleanup(); | |
137 | return 1; | |
138 | } | |
139 | ||
140 | # Testing file types | |
141 | ||
142 | As of version 0.3.0, this library supports accessing bigBed files, which are related to bigWig files. Applications that need to support both bigWig and bigBed input can use the `bwIsBigWig` and `bbIsBigBed` functions to determine if their inputs are bigWig/bigBed files: | |
143 | ||
144 | ...code... | |
145 | if(bwIsBigWig(input_file_name, NULL)) { | |
146 | //do something | |
147 | } else if(bbIsBigBed(input_file_name, NULL)) { | |
148 | //do something else | |
149 | } else { | |
150 | //handle unknown input | |
151 | } | |
152 | ||
153 | Note that these two functions rely on the "magic number" at the beginning of each file, which differs between bigWig and bigBed files. | |
154 | ||
155 | # bigBed support | |
156 | ||
157 | Support for accessing bigBed files was added in version 0.3.0. The function names used for accessing bigBed files are similar to those used for bigWig files. | |
158 | ||
159 | Function | Use | |
160 | --- | --- | |
161 | bbOpen | Opens a bigBed file | |
162 | bbGetSQL | Returns the SQL string (if it exists) in a bigBed file | |
163 | bbGetOverlappingEntries | Returns all entries overlapping an interval (either with or without their associated strings | |
164 | bbDestroyOverlappingEntries | Free memory allocated by the above command | |
165 | ||
166 | Other functions, such as `bwClose` and `bwInit`, are shared between bigWig and bigBed files. See `test/testBigBed.c` for a full example. | |
167 | ||
168 | # A note on bigBed entries | |
169 | ||
170 | Inside bigBed files, entries are stored as chromosome, start, and end coordinates with an (optional) associated string. For example, a "bedRNAElements" file from Encode has name, score, strand, "level", "significance", and "score2" values associated with each entry. These are stored inside the bigBed files as a single tab-separated character vector (char \*), which makes parsing difficult. The names of the various fields inside of bigBed files is stored as an SQL string, for example: | |
171 | ||
172 | table RnaElements | |
173 | "BED6 + 3 scores for RNA Elements data " | |
174 | ( | |
175 | string chrom; "Reference sequence chromosome or scaffold" | |
176 | uint chromStart; "Start position in chromosome" | |
177 | uint chromEnd; "End position in chromosome" | |
178 | string name; "Name of item" | |
179 | uint score; "Normalized score from 0-1000" | |
180 | char[1] strand; "+ or - or . for unknown" | |
181 | float level; "Expression level such as RPKM or FPKM. Set to -1 for no data." | |
182 | float signif; "Statistical significance such as IDR. Set to -1 for no data." | |
183 | uint score2; "Additional measurement/count e.g. number of reads. Set to 0 for no data." | |
184 | ) | |
185 | ||
186 | Entries will then be of the form (one per line): | |
187 | ||
188 | 59426 115 - 0.021 0.48 218 | |
189 | 51 209 + 0.071 0.74 130 | |
190 | 52 170 + 0.045 0.61 171 | |
191 | 59433 178 - 0.049 0.34 296 | |
192 | 53 156 + 0.038 0.19 593 | |
193 | 59436 186 - 0.054 0.15 1010 | |
194 | 59437 506 - 1.560 0.00 430611 | |
195 | ||
196 | Note that chromosome and start/end intervals are stored separately, so there's no need to parse them out of string. libBigWig can return these entries, either with or without the above associated strings. Parsing these string is left to the application requiring them and is currently outside the scope of this library. | |
197 | ||
198 | # Interval/Entry iterators | |
199 | ||
200 | Sometimes it is desirable to request a large number of intervals from a bigWig file or entries from a bigBed file, but not hold them all in memory at once (e.g., due to saving memory). To support this, libBigWig (since version 0.3.0) supports two kinds of iterators. The general process of using iterators is: (1) iterator creation, (2) traversal, and finally (3) iterator destruction. Only iterator creation differs between bigWig and bigBed files. | |
201 | ||
202 | Importantly, iterators return results by one or more blocks. This is for convenience, since bigWig intervals and bigBed entries are stored in together in fixed-size groups, called blocks. The number of blocks of entries returned, therefore, is an option that can be specified to balance performance and memory usage. | |
203 | ||
204 | ## Iterator creation | |
205 | ||
206 | For bigwig files, iterators are created with the `bwOverlappingIntervalsIterator()`. This function takes chromosomal bounds (chromosome name, start, and end position) as well as a number of blocks. The equivalent function for bigBed files is `bbOverlappingEntriesIterator()`, which additionally takes a `withString` argutment, which dictates whether the returned entries include the associated string values or not. | |
207 | ||
208 | Each of the aforementioned files returns a pointer to a `bwOverlapIterator_t` object. The only important parts of this structure for end users are the following members: `entries`, `intervals`, and `data`. `entries` is a pointer to a `bbOverlappingEntries_t` object, or `NULL` if a bigWig file is being used. Likewise, `intervals` is a pointer to a `bwOverlappingIntervals_t` object, or `NULL` if a bigBed file is being used. `data` is a special pointer, used to signify the end of iteration. Thus, when `data` is a `NULL` pointer, iteration has ended. | |
209 | ||
210 | ## Iterator traversal | |
211 | ||
212 | Regardless of whether a bigWig or bigBed file is being used, the `bwIteratorNext()` function will free currently used memory and load the appropriate intervals or entries for the next block(s). On error, this will return a NULL pointer (memory is already internally freed in this case). | |
213 | ||
214 | ## Iterator destruction | |
215 | ||
216 | `bwOverlapIterator_t` objects MUST be destroyed after use. This can be done with the `bwIteratorDestroy()` function. | |
217 | ||
218 | ## Example | |
219 | ||
220 | A full example is provided in `tests/testIterator.c`, but a small example of iterating over all bigWig intervals in `chr1:0-10000000` in chunks of 5 blocks follows: | |
221 | ||
222 | iter = bwOverlappingIntervalsIterator(fp, "chr1", 0, 10000000, 5); | |
223 | while(iter->data) { | |
224 | //Do stuff with iter->intervals | |
225 | iter = bwIteratorNext(iter); | |
226 | } | |
227 | bwIteratorDestroy(iter); | |
228 | ||
229 | # A note on bigWig statistics | |
230 | ||
231 | The results of `min`, `max`, and `mean` should be the same as those from `BigWigSummary`. `stdev` and `coverage`, however, may differ due to Kent's tools producing incorrect results (at least for `coverage`, though the same appears to be the case for `stdev`). | |
232 | ||
233 | # Python interface | |
234 | ||
235 | There are currently two python interfaces that make use of libBigWig: [pyBigWig](https://github.com/dpryan79/pyBigWig) by me and [bw-python](https://github.com/brentp/bw-python) by Brent Pederson. Those interested are encouraged to give both a try! |
0 | #ifndef LIBBIGWIG_H | |
1 | #define LIBBIGWIG_H | |
2 | ||
3 | #include "bigWigIO.h" | |
4 | #include "bwValues.h" | |
5 | #include <inttypes.h> | |
6 | #include <zlib.h> | |
7 | ||
8 | #ifdef __cplusplus | |
9 | extern "C" { | |
10 | #endif | |
11 | ||
12 | /*! \mainpage libBigWig | |
13 | * | |
14 | * \section Introduction | |
15 | * | |
16 | * libBigWig is a C library for parsing local/remote bigWig and bigBed files. This is similar to Kent's library from UCSC, except | |
17 | * * The license is much more liberal | |
18 | * * This code doesn't call `exit()` on error, thereby killing the calling application. | |
19 | * | |
20 | * External files are accessed using [curl](http://curl.haxx.se/). | |
21 | * | |
22 | * Please submit issues and pull requests [here](https://github.com/dpryan79/libBigWig). | |
23 | * | |
24 | * \section Compilation | |
25 | * | |
26 | * Assuming you already have the curl libraries installed (not just the curl binary!): | |
27 | * | |
28 | * make install prefix=/some/path | |
29 | * | |
30 | * \section Writing bigWig files | |
31 | * | |
32 | * There are three methods for storing values in a bigWig file, further described in the [wiggle format](http://genome.ucsc.edu/goldenpath/help/wiggle.html). The entries within the file are grouped into "blocks" and each such block is limited to storing entries of a single type. So, it is unwise to use a single bedGraph-like endtry followed by a single fixed-step entry followed by a variable-step entry, as that would require three separate blocks, with additional space required for each. | |
33 | * | |
34 | * \section Testing file types | |
35 | * | |
36 | * As of version 0.3.0, libBigWig supports reading bigBed files. If an application needs to support both bigBed and bigWig input, then the `bwIsBigWig` and `bbIsBigBed` functions can be used to determine the file type. These both use the "magic" number at the beginning of the file to determine the file type. | |
37 | * | |
38 | * \section Interval and entry iterators | |
39 | * | |
40 | * As of version 0.3.0, libBigWig supports iterating over intervals in bigWig files and entries in bigBed files. The number of intervals/entries returned with each iteration can be controlled by setting the number of blocks processed in each iteration (intervals and entries are group inside of bigWig and bigBed files into blocks of entries). See `test/testIterator.c` for an example. | |
41 | * | |
42 | * \section Examples | |
43 | * | |
44 | * Please see [README.md](README.md) and the files under `test/` for examples. | |
45 | */ | |
46 | ||
47 | ||
48 | /*! \file bigWig.h | |
49 | * | |
50 | * These are the functions and structured that should be used by external users. While I don't particularly recommend dealing with some of the structures (e.g., a bigWigHdr_t), they're described here in case you need them. | |
51 | * | |
52 | * BTW, this library doesn't switch endianness as appropriate, since I kind of assume that there's only one type produced these days. | |
53 | */ | |
54 | ||
55 | /*! | |
56 | * The library version number | |
57 | */ | |
58 | #define LIBBIGWIG_VERSION 0.4.6 | |
59 | ||
60 | /*! | |
61 | * If 1, then this library was compiled with remote file support. | |
62 | */ | |
63 | #ifdef NOCURL | |
64 | #define LIBBIGWIG_CURL 0 | |
65 | #ifndef CURLTYPE_DEFINED | |
66 | #define CURLTYPE_DEFINED | |
67 | typedef int CURLcode; | |
68 | typedef void CURL; | |
69 | #endif | |
70 | #else | |
71 | #define LIBBIGWIG_CURL 1 | |
72 | #endif | |
73 | ||
74 | /*! | |
75 | * The magic number of a bigWig file. | |
76 | */ | |
77 | #define BIGWIG_MAGIC 0x888FFC26 | |
78 | /*! | |
79 | * The magic number of a bigBed file. | |
80 | */ | |
81 | #define BIGBED_MAGIC 0x8789F2EB | |
82 | /*! | |
83 | * The magic number of a "cirTree" block in a file. | |
84 | */ | |
85 | #define CIRTREE_MAGIC 0x78ca8c91 | |
86 | /*! | |
87 | * The magic number of an index block in a file. | |
88 | */ | |
89 | #define IDX_MAGIC 0x2468ace0 | |
90 | /*! | |
91 | * The default number of children per block. | |
92 | */ | |
93 | #define DEFAULT_nCHILDREN 64 | |
94 | /*! | |
95 | * The default decompression buffer size in bytes. This is used to determin | |
96 | */ | |
97 | #define DEFAULT_BLOCKSIZE 32768 | |
98 | ||
99 | /*! | |
100 | * An enum that dictates the type of statistic to fetch for a given interval | |
101 | */ | |
102 | enum bwStatsType { | |
103 | doesNotExist = -1, /*!< This does nothing */ | |
104 | mean = 0, /*!< The mean value */ | |
105 | average = 0, /*!< The mean value */ | |
106 | stdev = 1, /*!< The standard deviation of the values */ | |
107 | dev = 1, /*!< The standard deviation of the values */ | |
108 | max = 2, /*!< The maximum value */ | |
109 | min = 3, /*!< The minimum value */ | |
110 | cov = 4, /*!< The number of bases covered */ | |
111 | coverage = 4, /*!<The number of bases covered */ | |
112 | sum = 5 /*!< The sum of per-base values */ | |
113 | }; | |
114 | ||
115 | //Should hide this from end users | |
116 | /*! | |
117 | * @brief BigWig files have multiple "zoom" levels, each of which has its own header. This hold those headers | |
118 | * | |
119 | * N.B., there's 4 bytes of padding in the on disk representation of level and dataOffset. | |
120 | */ | |
121 | typedef struct { | |
122 | uint32_t *level; /**<The zoom level, which is an integer starting with 0.*/ | |
123 | //There's 4 bytes of padding between these | |
124 | uint64_t *dataOffset; /**<The offset to the on-disk start of the data. This isn't used currently.*/ | |
125 | uint64_t *indexOffset; /**<The offset to the on-disk start of the index. This *is* used.*/ | |
126 | bwRTree_t **idx; /**<Index for each zoom level. Represented as a tree*/ | |
127 | } bwZoomHdr_t; | |
128 | ||
129 | /*! | |
130 | * @brief The header section of a bigWig file. | |
131 | * | |
132 | * Some of the values aren't currently used for anything. Others may optionally not exist. | |
133 | */ | |
134 | typedef struct { | |
135 | uint16_t version; /**<The version information of the file.*/ | |
136 | uint16_t nLevels; /**<The number of "zoom" levels.*/ | |
137 | uint64_t ctOffset; /**<The offset to the on-disk chromosome tree list.*/ | |
138 | uint64_t dataOffset; /**<The on-disk offset to the first block of data.*/ | |
139 | uint64_t indexOffset; /**<The on-disk offset to the data index.*/ | |
140 | uint16_t fieldCount; /**<Total number of fields.*/ | |
141 | uint16_t definedFieldCount; /**<Number of fixed-format BED fields.*/ | |
142 | uint64_t sqlOffset; /**<The on-disk offset to an SQL string. This is unused.*/ | |
143 | uint64_t summaryOffset; /**<If there's a summary, this is the offset to it on the disk.*/ | |
144 | uint32_t bufSize; /**<The compression buffer size (if the data is compressed).*/ | |
145 | uint64_t extensionOffset; /**<Unused*/ | |
146 | bwZoomHdr_t *zoomHdrs; /**<Pointers to the header for each zoom level.*/ | |
147 | //total Summary | |
148 | uint64_t nBasesCovered; /**<The total bases covered in the file.*/ | |
149 | double minVal; /**<The minimum value in the file.*/ | |
150 | double maxVal; /**<The maximum value in the file.*/ | |
151 | double sumData; /**<The sum of all values in the file.*/ | |
152 | double sumSquared; /**<The sum of the squared values in the file.*/ | |
153 | } bigWigHdr_t; | |
154 | ||
155 | //Should probably replace this with a hash | |
156 | /*! | |
157 | * @brief Holds the chromosomes and their lengths | |
158 | */ | |
159 | typedef struct { | |
160 | int64_t nKeys; /**<The number of chromosomes */ | |
161 | char **chrom; /**<A list of null terminated chromosomes */ | |
162 | uint32_t *len; /**<The lengths of each chromosome */ | |
163 | } chromList_t; | |
164 | ||
165 | //TODO remove from bigWig.h | |
166 | /// @cond SKIP | |
167 | typedef struct bwLL bwLL; | |
168 | struct bwLL { | |
169 | bwRTreeNode_t *node; | |
170 | struct bwLL *next; | |
171 | }; | |
172 | typedef struct bwZoomBuffer_t bwZoomBuffer_t; | |
173 | struct bwZoomBuffer_t { //each individual entry takes 32 bytes | |
174 | void *p; | |
175 | uint32_t l, m; | |
176 | struct bwZoomBuffer_t *next; | |
177 | }; | |
178 | /// @endcond | |
179 | ||
180 | /*! | |
181 | * @brief This is only needed for writing bigWig files (and won't be created otherwise) | |
182 | * This should be removed from bigWig.h | |
183 | */ | |
184 | typedef struct { | |
185 | uint64_t nBlocks; /**<The number of blocks written*/ | |
186 | uint32_t blockSize; /**<The maximum number of children*/ | |
187 | uint64_t nEntries; /**<The number of entries processed. This is used for the first contig and determining how the zoom levels are computed*/ | |
188 | uint64_t runningWidthSum; /**<The running sum of the entry widths for the first contig (again, used for the first contig and computing zoom levels)*/ | |
189 | uint32_t tid; /**<The current TID that's being processed*/ | |
190 | uint32_t start; /**<The start position of the block*/ | |
191 | uint32_t end; /**<The end position of the block*/ | |
192 | uint32_t span; /**<The span of each entry, if applicable*/ | |
193 | uint32_t step; /**<The step size, if applicable*/ | |
194 | uint8_t ltype; /**<The type of the last entry added*/ | |
195 | uint32_t l; /**<The current size of p. This and the type determine the number of items held*/ | |
196 | void *p; /**<A buffer of size hdr->bufSize*/ | |
197 | bwLL *firstIndexNode; /**<The first index node in the linked list*/ | |
198 | bwLL *currentIndexNode; /**<The last index node in a linked list*/ | |
199 | bwZoomBuffer_t **firstZoomBuffer; /**<The first node in a linked list of leaf nodes*/ | |
200 | bwZoomBuffer_t **lastZoomBuffer; /**<The last node in a linked list of leaf nodes*/ | |
201 | uint64_t *nNodes; /**<The number of leaf nodes per zoom level, useful for determining duplicate levels*/ | |
202 | uLongf compressPsz; /**<The size of the compression buffer*/ | |
203 | void *compressP; /**<A compressed buffer of size compressPsz*/ | |
204 | } bwWriteBuffer_t; | |
205 | ||
206 | /*! | |
207 | * @brief A structure that holds everything needed to access a bigWig file. | |
208 | */ | |
209 | typedef struct { | |
210 | URL_t *URL; /**<A pointer that can handle both local and remote files (including a buffer if needed).*/ | |
211 | bigWigHdr_t *hdr; /**<The file header.*/ | |
212 | chromList_t *cl; /**<A list of chromosome names (the order is the ID).*/ | |
213 | bwRTree_t *idx; /**<The index for the full dataset.*/ | |
214 | bwWriteBuffer_t *writeBuffer; /**<The buffer used for writing.*/ | |
215 | int isWrite; /**<0: Opened for reading, 1: Opened for writing.*/ | |
216 | int type; /**<0: bigWig, 1: bigBed.*/ | |
217 | } bigWigFile_t; | |
218 | ||
219 | /*! | |
220 | * @brief Holds interval:value associations | |
221 | */ | |
222 | typedef struct { | |
223 | uint32_t l; /**<Number of intervals held*/ | |
224 | uint32_t m; /**<Maximum number of values/intervals the struct can hold*/ | |
225 | uint32_t *start; /**<The start positions (0-based half open)*/ | |
226 | uint32_t *end; /**<The end positions (0-based half open)*/ | |
227 | float *value; /**<The value associated with each position*/ | |
228 | } bwOverlappingIntervals_t; | |
229 | ||
230 | /*! | |
231 | * @brief Holds interval:str associations | |
232 | */ | |
233 | typedef struct { | |
234 | uint32_t l; /**<Number of intervals held*/ | |
235 | uint32_t m; /**<Maximum number of values/intervals the struct can hold*/ | |
236 | uint32_t *start; /**<The start positions (0-based half open)*/ | |
237 | uint32_t *end; /**<The end positions (0-based half open)*/ | |
238 | char **str; /**<The strings associated with a given entry.*/ | |
239 | } bbOverlappingEntries_t; | |
240 | ||
241 | /*! | |
242 | * @brief A structure to hold iterations | |
243 | * One of intervals and entries should be used to access records from bigWig or bigBed files, respectively. | |
244 | */ | |
245 | typedef struct { | |
246 | bigWigFile_t *bw; /**<Pointer to the bigWig/bigBed file.*/ | |
247 | uint32_t tid; /**<The contig/chromosome ID.*/ | |
248 | uint32_t start; /**<Start position of the query interval.*/ | |
249 | uint32_t end; /**<End position of the query interval.*/ | |
250 | uint64_t offset; /**<Offset into the blocks.*/ | |
251 | uint32_t blocksPerIteration; /**<Number of blocks to use per iteration.*/ | |
252 | int withString; /**<For bigBed entries, whether to return the string with the entries.*/ | |
253 | void *blocks; /**<Overlapping blocks.*/ | |
254 | bwOverlappingIntervals_t *intervals; /**<Overlapping intervals (or NULL).*/ | |
255 | bbOverlappingEntries_t *entries; /**<Overlapping entries (or NULL).*/ | |
256 | void *data; /**<Points to either intervals or entries. If there are no further intervals/entries, then this is NULL. Use this to test for whether to continue iterating.*/ | |
257 | } bwOverlapIterator_t; | |
258 | ||
259 | /*! | |
260 | * @brief Initializes curl and global variables. This *MUST* be called before other functions (at least if you want to connect to remote files). | |
261 | * For remote file, curl must be initialized and regions of a file read into an internal buffer. If the buffer is too small then an excessive number of connections will be made. If the buffer is too large than more data than required is fetched. 128KiB is likely sufficient for most needs. | |
262 | * @param bufSize The internal buffer size used for remote connection. | |
263 | * @see bwCleanup | |
264 | * @return 0 on success and 1 on error. | |
265 | */ | |
266 | int bwInit(size_t bufSize); | |
267 | ||
268 | /*! | |
269 | * @brief The counterpart to bwInit, this cleans up curl. | |
270 | * @see bwInit | |
271 | */ | |
272 | void bwCleanup(void); | |
273 | ||
274 | /*! | |
275 | * @brief Determine if a file is a bigWig file. | |
276 | * This function will quickly check either local or remote files to determine if they appear to be valid bigWig files. This can be determined by reading the first 4 bytes of the file. | |
277 | * @param fname The file name or URL (http, https, and ftp are supported) | |
278 | * @param callBack An optional user-supplied function. This is applied to remote connections so users can specify things like proxy and password information. See `test/testRemote` for an example. | |
279 | * @return 1 if the file appears to be bigWig, otherwise 0. | |
280 | */ | |
281 | int bwIsBigWig(char *fname, CURLcode (*callBack)(CURL*)); | |
282 | ||
283 | /*! | |
284 | * @brief Determine is a file is a bigBed file. | |
285 | * This function will quickly check either local or remote files to determine if they appear to be valid bigWig files. This can be determined by reading the first 4 bytes of the file. | |
286 | * @param fname The file name or URL (http, https, and ftp are supported) | |
287 | * @param callBack An optional user-supplied function. This is applied to remote connections so users can specify things like proxy and password information. See `test/testRemote` for an example. | |
288 | * @return 1 if the file appears to be bigWig, otherwise 0. | |
289 | */ | |
290 | int bbIsBigBed(char *fname, CURLcode (*callBack)(CURL*)); | |
291 | ||
292 | /*! | |
293 | * @brief Opens a local or remote bigWig file. | |
294 | * This will open a local or remote bigWig file. Writing of local bigWig files is also supported. | |
295 | * @param fname The file name or URL (http, https, and ftp are supported) | |
296 | * @param callBack An optional user-supplied function. This is applied to remote connections so users can specify things like proxy and password information. See `test/testRemote` for an example. | |
297 | * @param mode The mode, by default "r". Both local and remote files can be read, but only local files can be written. For files being written the callback function is ignored. If and only if the mode contains "w" will the file be opened for writing (in all other cases the file will be opened for reading. | |
298 | * @return A bigWigFile_t * on success and NULL on error. | |
299 | */ | |
300 | bigWigFile_t *bwOpen(char *fname, CURLcode (*callBack)(CURL*), const char* mode); | |
301 | ||
302 | /*! | |
303 | * @brief Opens a local or remote bigBed file. | |
304 | * This will open a local or remote bigBed file. Note that this file format can only be read and NOT written! | |
305 | * @param fname The file name or URL (http, https, and ftp are supported) | |
306 | * @param callBack An optional user-supplied function. This is applied to remote connections so users can specify things like proxy and password information. See `test/testRemote` for an example. | |
307 | * @return A bigWigFile_t * on success and NULL on error. | |
308 | */ | |
309 | bigWigFile_t *bbOpen(char *fname, CURLcode (*callBack)(CURL*)); | |
310 | ||
311 | /*! | |
312 | * @brief Returns a string containing the SQL entry (or NULL). | |
313 | * The "auto SQL" field contains the names and value types of the entries in | |
314 | * each bigBed entry. If you need to parse a particular value out of each entry, | |
315 | * then you'll need to first parse this. | |
316 | * @param fp The file pointer to a valid bigWigFile_t | |
317 | * @return A char *, which you MUST free! | |
318 | */ | |
319 | char *bbGetSQL(bigWigFile_t *fp); | |
320 | ||
321 | /*! | |
322 | * @brief Closes a bigWigFile_t and frees up allocated memory | |
323 | * This closes both bigWig and bigBed files. | |
324 | * @param fp The file pointer. | |
325 | */ | |
326 | void bwClose(bigWigFile_t *fp); | |
327 | ||
328 | /******************************************************************************* | |
329 | * | |
330 | * The following are in bwStats.c | |
331 | * | |
332 | *******************************************************************************/ | |
333 | ||
334 | /*! | |
335 | * @brief Converts between chromosome name and ID | |
336 | * | |
337 | * @param fp A valid bigWigFile_t pointer | |
338 | * @param chrom A chromosome name | |
339 | * @return An ID, -1 will be returned on error (note that this is an unsigned value, so that's ~4 billion. bigWig/bigBed files can't store that many chromosomes anyway. | |
340 | */ | |
341 | uint32_t bwGetTid(bigWigFile_t *fp, char *chrom); | |
342 | ||
343 | /*! | |
344 | * @brief Frees space allocated by `bwGetOverlappingIntervals` | |
345 | * @param o A valid `bwOverlappingIntervals_t` pointer. | |
346 | * @see bwGetOverlappingIntervals | |
347 | */ | |
348 | void bwDestroyOverlappingIntervals(bwOverlappingIntervals_t *o); | |
349 | ||
350 | /*! | |
351 | * @brief Frees space allocated by `bbGetOverlappingEntries` | |
352 | * @param o A valid `bbOverlappingEntries_t` pointer. | |
353 | * @see bbGetOverlappingEntries | |
354 | */ | |
355 | void bbDestroyOverlappingEntries(bbOverlappingEntries_t *o); | |
356 | ||
357 | /*! | |
358 | * @brief Return bigWig entries overlapping an interval. | |
359 | * Find all bigWig entries overlapping a range and returns them, including their associated values. | |
360 | * @param fp A valid bigWigFile_t pointer. This MUST be for a bigWig file! | |
361 | * @param chrom A valid chromosome name. | |
362 | * @param start The start position of the interval. This is 0-based half open, so 0 is the first base. | |
363 | * @param end The end position of the interval. Again, this is 0-based half open, so 100 will include the 100th base...which is at position 99. | |
364 | * @return NULL on error or no overlapping values, otherwise a `bwOverlappingIntervals_t *` holding the values and intervals. | |
365 | * @see bwOverlappingIntervals_t | |
366 | * @see bwDestroyOverlappingIntervals | |
367 | * @see bwGetValues | |
368 | */ | |
369 | bwOverlappingIntervals_t *bwGetOverlappingIntervals(bigWigFile_t *fp, char *chrom, uint32_t start, uint32_t end); | |
370 | ||
371 | /*! | |
372 | * @brief Return bigBed entries overlapping an interval. | |
373 | * Find all bigBed entries overlapping a range and returns them. | |
374 | * @param fp A valid bigWigFile_t pointer. This MUST be for a bigBed file! | |
375 | * @param chrom A valid chromosome name. | |
376 | * @param start The start position of the interval. This is 0-based half open, so 0 is the first base. | |
377 | * @param end The end position of the interval. Again, this is 0-based half open, so 100 will include the 100th base...which is at position 99. | |
378 | * @param withString If not 0, return the string associated with each entry in the output. If 0, there are no associated strings returned. This is useful if the only information needed are the locations of the entries, which require significantly less memory. | |
379 | * @return NULL on error or no overlapping values, otherwise a `bbOverlappingEntries_t *` holding the intervals and (optionally) the associated string. | |
380 | * @see bbOverlappingEntries_t | |
381 | * @see bbDestroyOverlappingEntries | |
382 | */ | |
383 | bbOverlappingEntries_t *bbGetOverlappingEntries(bigWigFile_t *fp, char *chrom, uint32_t start, uint32_t end, int withString); | |
384 | ||
385 | /*! | |
386 | * @brief Creates an iterator over intervals in a bigWig file | |
387 | * Iterators can be traversed with `bwIteratorNext()` and destroyed with `bwIteratorDestroy()`. | |
388 | * Intervals are in the `intervals` member and `data` can be used to determine when to end iteration. | |
389 | * @param fp A valid bigWigFile_t pointer. This MUST be for a bigWig file! | |
390 | * @param chrom A valid chromosome name. | |
391 | * @param start The start position of the interval. This is 0-based half open, so 0 is the first base. | |
392 | * @param end The end position of the interval. Again, this is 0-based half open, so 100 will include the 100th base...which is at position 99. | |
393 | * @param blocksPerIteration The number of blocks (internal groupings of intervals in bigWig files) to return per iteration. | |
394 | * @return NULL on error, otherwise a bwOverlapIterator_t pointer | |
395 | * @see bwOverlapIterator_t | |
396 | * @see bwIteratorNext | |
397 | * @see bwIteratorDestroy | |
398 | */ | |
399 | bwOverlapIterator_t *bwOverlappingIntervalsIterator(bigWigFile_t *fp, char *chrom, uint32_t start, uint32_t end, uint32_t blocksPerIteration); | |
400 | ||
401 | /*! | |
402 | * @brief Creates an iterator over entries in a bigBed file | |
403 | * Iterators can be traversed with `bwIteratorNext()` and destroyed with `bwIteratorDestroy()`. | |
404 | * Entries are in the `entries` member and `data` can be used to determine when to end iteration. | |
405 | * @param fp A valid bigWigFile_t pointer. This MUST be for a bigBed file! | |
406 | * @param chrom A valid chromosome name. | |
407 | * @param start The start position of the interval. This is 0-based half open, so 0 is the first base. | |
408 | * @param end The end position of the interval. Again, this is 0-based half open, so 100 will include the 100th base...which is at position 99. | |
409 | * @param withString Whether the returned entries should include their associated strings. | |
410 | * @param blocksPerIteration The number of blocks (internal groupings of entries in bigBed files) to return per iteration. | |
411 | * @return NULL on error, otherwise a bwOverlapIterator_t pointer | |
412 | * @see bbGetOverlappingEntries | |
413 | * @see bwOverlapIterator_t | |
414 | * @see bwIteratorNext | |
415 | * @see bwIteratorDestroy | |
416 | */ | |
417 | bwOverlapIterator_t *bbOverlappingEntriesIterator(bigWigFile_t *fp, char *chrom, uint32_t start, uint32_t end, int withString, uint32_t blocksPerIteration); | |
418 | ||
419 | /*! | |
420 | * @brief Traverses to the entries/intervals in the next group of blocks. | |
421 | * @param iter A bwOverlapIterator_t pointer that is updated (or destroyed on error) | |
422 | * @return NULL on error, otherwise a bwOverlapIterator_t pointer with the intervals or entries from the next set of blocks. | |
423 | * @see bwOverlapIterator_t | |
424 | * @see bwIteratorDestroy | |
425 | */ | |
426 | bwOverlapIterator_t *bwIteratorNext(bwOverlapIterator_t *iter); | |
427 | ||
428 | /*! | |
429 | * @brief Destroys a bwOverlapIterator_t | |
430 | * @param iter The bwOverlapIterator_t that should be destroyed | |
431 | */ | |
432 | void bwIteratorDestroy(bwOverlapIterator_t *iter); | |
433 | ||
434 | /*! | |
435 | * @brief Return all per-base bigWig values in a given interval. | |
436 | * Given an interval (e.g., chr1:0-100), return the value at each position in a bigWig file. Positions without associated values are suppressed by default, but may be returned if `includeNA` is not 0. | |
437 | * @param fp A valid bigWigFile_t pointer. | |
438 | * @param chrom A valid chromosome name. | |
439 | * @param start The start position of the interval. This is 0-based half open, so 0 is the first base. | |
440 | * @param end The end position of the interval. Again, this is 0-based half open, so 100 will include the 100th base...which is at position 99. | |
441 | * @param includeNA If not 0, report NA values as well (as NA). | |
442 | * @return NULL on error or no overlapping values, otherwise a `bwOverlappingIntervals_t *` holding the values and positions. | |
443 | * @see bwOverlappingIntervals_t | |
444 | * @see bwDestroyOverlappingIntervals | |
445 | * @see bwGetOverlappingIntervals | |
446 | */ | |
447 | bwOverlappingIntervals_t *bwGetValues(bigWigFile_t *fp, char *chrom, uint32_t start, uint32_t end, int includeNA); | |
448 | ||
449 | /*! | |
450 | * @brief Determines per-interval bigWig statistics | |
451 | * Can determine mean/min/max/coverage/standard deviation of values in one or more intervals in a bigWig file. You can optionally give it an interval and ask for values from X number of sub-intervals. | |
452 | * @param fp The file from which to extract statistics. | |
453 | * @param chrom A valid chromosome name. | |
454 | * @param start The start position of the interval. This is 0-based half open, so 0 is the first base. | |
455 | * @param end The end position of the interval. Again, this is 0-based half open, so 100 will include the 100th base...which is at position 99. | |
456 | * @param nBins The number of bins within the interval to calculate statistics for. | |
457 | * @param type The type of statistic. | |
458 | * @see bwStatsType | |
459 | * @return A pointer to an array of double precission floating point values. Note that bigWig files only hold 32-bit values, so this is done to help prevent overflows. | |
460 | */ | |
461 | double *bwStats(bigWigFile_t *fp, char *chrom, uint32_t start, uint32_t end, uint32_t nBins, enum bwStatsType type); | |
462 | ||
463 | /*! | |
464 | * @brief Determines per-interval bigWig statistics | |
465 | * Can determine mean/min/max/coverage/standard deviation of values in one or more intervals in a bigWig file. You can optionally give it an interval and ask for values from X number of sub-intervals. The difference with bwStats is that zoom levels are never used. | |
466 | * @param fp The file from which to extract statistics. | |
467 | * @param chrom A valid chromosome name. | |
468 | * @param start The start position of the interval. This is 0-based half open, so 0 is the first base. | |
469 | * @param end The end position of the interval. Again, this is 0-based half open, so 100 will include the 100th base...which is at position 99. | |
470 | * @param nBins The number of bins within the interval to calculate statistics for. | |
471 | * @param type The type of statistic. | |
472 | * @see bwStatsType | |
473 | * @return A pointer to an array of double precission floating point values. Note that bigWig files only hold 32-bit values, so this is done to help prevent overflows. | |
474 | */ | |
475 | double *bwStatsFromFull(bigWigFile_t *fp, char *chrom, uint32_t start, uint32_t end, uint32_t nBins, enum bwStatsType type); | |
476 | ||
477 | //Writer functions | |
478 | ||
479 | /*! | |
480 | * @brief Create a largely empty bigWig header | |
481 | * Every bigWig file has a header, this creates the template for one. It also takes care of space allocation in the output write buffer. | |
482 | * @param fp The bigWigFile_t* that you want to write to. | |
483 | * @param maxZooms The maximum number of zoom levels. If you specify 0 then there will be no zoom levels. A value <0 or > 65535 will result in a maximum of 10. | |
484 | * @return 0 on success. | |
485 | */ | |
486 | int bwCreateHdr(bigWigFile_t *fp, int32_t maxZooms); | |
487 | ||
488 | /*! | |
489 | * @brief Take a list of chromosome names and lengths and return a pointer to a chromList_t | |
490 | * This MUST be run before `bwWriteHdr()`. Note that the input is NOT free()d! | |
491 | * @param chroms A list of chromosomes. | |
492 | * @param lengths The length of each chromosome. | |
493 | * @param n The number of chromosomes (thus, the length of `chroms` and `lengths`) | |
494 | * @return A pointer to a chromList_t or NULL on error. | |
495 | */ | |
496 | chromList_t *bwCreateChromList(char **chroms, uint32_t *lengths, int64_t n); | |
497 | ||
498 | /*! | |
499 | * @brief Write a the header to a bigWig file. | |
500 | * You must have already opened the output file, created a header and a chromosome list. | |
501 | * @param bw The output bigWigFile_t pointer. | |
502 | * @see bwCreateHdr | |
503 | * @see bwCreateChromList | |
504 | */ | |
505 | int bwWriteHdr(bigWigFile_t *bw); | |
506 | ||
507 | /*! | |
508 | * @brief Write a new block of bedGraph-like intervals to a bigWig file | |
509 | * Adds entries of the form: | |
510 | * chromosome start end value | |
511 | * to the file. These will always be added in a new block, so you may have previously used a different storage type. | |
512 | * | |
513 | * In general it's more efficient to use the bwAppend* functions, but then you MUST know that the previously written block is of the same type. In other words, you can only use bwAppendIntervals() after bwAddIntervals() or a previous bwAppendIntervals(). | |
514 | * @param fp The output file pointer. | |
515 | * @param chrom A list of chromosomes, of length `n`. | |
516 | * @param start A list of start positions of length`n`. | |
517 | * @param end A list of end positions of length`n`. | |
518 | * @param values A list of values of length`n`. | |
519 | * @param n The length of the aforementioned lists. | |
520 | * @return 0 on success and another value on error. | |
521 | * @see bwAppendIntervals | |
522 | */ | |
523 | int bwAddIntervals(bigWigFile_t *fp, char **chrom, uint32_t *start, uint32_t *end, float *values, uint32_t n); | |
524 | ||
525 | /*! | |
526 | * @brief Append bedGraph-like intervals to a previous block of bedGraph-like intervals in a bigWig file. | |
527 | * If you have previously used bwAddIntervals() then this will append additional entries into the previous block (or start a new one if needed). | |
528 | * @param fp The output file pointer. | |
529 | * @param start A list of start positions of length`n`. | |
530 | * @param end A list of end positions of length`n`. | |
531 | * @param values A list of values of length`n`. | |
532 | * @param n The length of the aforementioned lists. | |
533 | * @return 0 on success and another value on error. | |
534 | * @warning Do NOT use this after `bwAddIntervalSpanSteps()`, `bwAppendIntervalSpanSteps()`, `bwAddIntervalSpanSteps()`, or `bwAppendIntervalSpanSteps()`. | |
535 | * @see bwAddIntervals | |
536 | */ | |
537 | int bwAppendIntervals(bigWigFile_t *fp, uint32_t *start, uint32_t *end, float *values, uint32_t n); | |
538 | ||
539 | /*! | |
540 | * @brief Add a new block of variable-step entries to a bigWig file | |
541 | * Adds entries for the form | |
542 | * chromosome start value | |
543 | * to the file. Each block of such entries has an associated "span", so each value describes the region chromosome:start-(start+span) | |
544 | * | |
545 | * This will always start a new block of values. | |
546 | * @param fp The output file pointer. | |
547 | * @param chrom A list of chromosomes, of length `n`. | |
548 | * @param start A list of start positions of length`n`. | |
549 | * @param span The span of each entry (the must all be the same). | |
550 | * @param values A list of values of length`n`. | |
551 | * @param n The length of the aforementioned lists. | |
552 | * @return 0 on success and another value on error. | |
553 | * @see bwAppendIntervalSpans | |
554 | */ | |
555 | int bwAddIntervalSpans(bigWigFile_t *fp, char *chrom, uint32_t *start, uint32_t span, float *values, uint32_t n); | |
556 | ||
557 | /*! | |
558 | * @brief Append to a previous block of variable-step entries. | |
559 | * If you previously used `bwAddIntervalSpans()`, this will continue appending more values to the block(s) it created. | |
560 | * @param fp The output file pointer. | |
561 | * @param start A list of start positions of length`n`. | |
562 | * @param values A list of values of length`n`. | |
563 | * @param n The length of the aforementioned lists. | |
564 | * @return 0 on success and another value on error. | |
565 | * @warning Do NOT use this after `bwAddIntervals()`, `bwAppendIntervals()`, `bwAddIntervalSpanSteps()` or `bwAppendIntervalSpanSteps()` | |
566 | * @see bwAddIntervalSpans | |
567 | */ | |
568 | int bwAppendIntervalSpans(bigWigFile_t *fp, uint32_t *start, float *values, uint32_t n); | |
569 | ||
570 | /*! | |
571 | * @brief Add a new block of fixed-step entries to a bigWig file | |
572 | * Adds entries for the form | |
573 | * value | |
574 | * to the file. Each block of such entries has an associated "span", "step", chromosome and start position. See the wiggle format for more details. | |
575 | * | |
576 | * This will always start a new block of values. | |
577 | * @param fp The output file pointer. | |
578 | * @param chrom The chromosome that the entries describe. | |
579 | * @param start The starting position of the block of entries. | |
580 | * @param span The span of each entry (i.e., the number of bases it describes). | |
581 | * @param step The step between entry start positions. | |
582 | * @param values A list of values of length`n`. | |
583 | * @param n The length of the aforementioned lists. | |
584 | * @return 0 on success and another value on error. | |
585 | * @see bwAddIntervalSpanSteps | |
586 | */ | |
587 | int bwAddIntervalSpanSteps(bigWigFile_t *fp, char *chrom, uint32_t start, uint32_t span, uint32_t step, float *values, uint32_t n); | |
588 | ||
589 | /*! | |
590 | * @brief Append to a previous block of fixed-step entries. | |
591 | * If you previously used `bwAddIntervalSpanSteps()`, this will continue appending more values to the block(s) it created. | |
592 | * @param fp The output file pointer. | |
593 | * @param values A list of values of length`n`. | |
594 | * @param n The length of the aforementioned lists. | |
595 | * @return 0 on success and another value on error. | |
596 | * @warning Do NOT use this after `bwAddIntervals()`, `bwAppendIntervals()`, `bwAddIntervalSpans()` or `bwAppendIntervalSpans()` | |
597 | * @see bwAddIntervalSpanSteps | |
598 | */ | |
599 | int bwAppendIntervalSpanSteps(bigWigFile_t *fp, float *values, uint32_t n); | |
600 | ||
601 | #ifdef __cplusplus | |
602 | } | |
603 | #endif | |
604 | ||
605 | #endif // LIBBIGWIG_H |
0 | #ifndef LIBBIGWIG_IO_H | |
1 | #define LIBBIGWIG_IO_H | |
2 | ||
3 | #ifndef NOCURL | |
4 | #include <curl/curl.h> | |
5 | #else | |
6 | #include <stdio.h> | |
7 | #ifndef CURLTYPE_DEFINED | |
8 | #define CURLTYPE_DEFINED | |
9 | typedef int CURLcode; | |
10 | typedef void CURL; | |
11 | #endif | |
12 | #define CURLE_OK 0 | |
13 | #define CURLE_FAILED_INIT 1 | |
14 | #endif | |
15 | /*! \file bigWigIO.h | |
16 | * These are (typically internal) IO functions, so there's generally no need for you to directly use them! | |
17 | */ | |
18 | ||
19 | /*! | |
20 | * The size of the buffer used for remote files. | |
21 | */ | |
22 | extern size_t GLOBAL_DEFAULTBUFFERSIZE; | |
23 | ||
24 | /*! | |
25 | * The enumerated values that indicate the connection type used to access a file. | |
26 | */ | |
27 | enum bigWigFile_type_enum { | |
28 | BWG_FILE = 0, | |
29 | BWG_HTTP = 1, | |
30 | BWG_HTTPS = 2, | |
31 | BWG_FTP = 3 | |
32 | }; | |
33 | ||
34 | /*! | |
35 | * @brief This structure holds the file pointers and buffers needed for raw access to local and remote files. | |
36 | */ | |
37 | typedef struct { | |
38 | union { | |
39 | #ifndef NOCURL | |
40 | CURL *curl; /**<The CURL * file pointer for remote files.*/ | |
41 | #endif | |
42 | FILE *fp; /**<The FILE * file pointer for local files.**/ | |
43 | } x; /**<A union holding curl and fp.*/ | |
44 | void *memBuf; /**<A void * pointing to memory of size bufSize.*/ | |
45 | size_t filePos; /**<Current position inside the file.*/ | |
46 | size_t bufPos; /**<Curent position inside the buffer.*/ | |
47 | size_t bufSize; /**<The size of the buffer.*/ | |
48 | size_t bufLen; /**<The actual size of the buffer used.*/ | |
49 | enum bigWigFile_type_enum type; /**<The connection type*/ | |
50 | int isCompressed; /**<1 if the file is compressed, otherwise 0*/ | |
51 | char *fname; /**<Only needed for remote connections. The original URL/filename requested, since we need to make multiple connections.*/ | |
52 | } URL_t; | |
53 | ||
54 | /*! | |
55 | * @brief Reads data into the given buffer. | |
56 | * | |
57 | * This function will store bufSize data into buf for both local and remote files. For remote files an internal buffer is used to store a (typically larger) segment of the remote file. | |
58 | * | |
59 | * @param URL A URL_t * pointing to a valid opened file or remote URL. | |
60 | * @param buf The buffer in memory that you would like filled. It must be able to hold bufSize bytes! | |
61 | * @param bufSize The number of bytes to transfer to buf. | |
62 | * | |
63 | * @return Returns the number of bytes stored in buf, which should be bufSize on success and something else on error. | |
64 | * | |
65 | * @warning Note that on error, URL for remote files is left in an unusable state. You can get around this by running urlSeek() to a position outside of the range held by the internal buffer. | |
66 | */ | |
67 | size_t urlRead(URL_t *URL, void *buf, size_t bufSize); | |
68 | ||
69 | /*! | |
70 | * @brief Seeks to a given position in a local or remote file. | |
71 | * | |
72 | * For local files, this will set the file position indicator for the file pointer to the desired position. For remote files, it sets the position to start downloading data for the next urlRead(). Note that for remote files that running urlSeek() with a pos within the current buffer will simply modify the internal offset. | |
73 | * | |
74 | * @param URL A URL_t * pointing to a valid opened file or remote URL. | |
75 | * @param pos The position to seek to. | |
76 | * | |
77 | * @return CURLE_OK on success and a different CURLE_XXX on error. For local files, the error return value is always CURLE_FAILED_INIT | |
78 | */ | |
79 | CURLcode urlSeek(URL_t *URL, size_t pos); | |
80 | ||
81 | /*! | |
82 | * @brief Open a local or remote file | |
83 | * | |
84 | * Opens a local or remote file. Currently, http, https, and ftp are the only supported protocols and the URL must then begin with "http://", "https://", or "ftp://" as appropriate. | |
85 | * | |
86 | * For remote files, an internal buffer is used to hold file contents, to avoid downloading entire files before starting. The size of this buffer and various variable related to connection timeout are set with bwInit(). | |
87 | * | |
88 | * Note that you **must** run urlClose() on this when finished. However, you would typically just use bwOpen() rather than directly calling this function. | |
89 | * | |
90 | * @param fname The file name or URL to open. | |
91 | * @param callBack An optional user-supplied function. This is applied to remote connections so users can specify things like proxy and password information. | |
92 | * @param mode "r", "w" or NULL. If and only if the mode contains the character "w" will the file be opened for writing. | |
93 | * | |
94 | * @return A URL_t * or NULL on error. | |
95 | */ | |
96 | URL_t *urlOpen(char *fname, CURLcode (*callBack)(CURL*), const char* mode); | |
97 | ||
98 | /*! | |
99 | * @brief Close a local/remote file | |
100 | * | |
101 | * This will perform the cleanup required on a URL_t*, releasing memory as needed. | |
102 | * | |
103 | * @param URL A URL_t * pointing to a valid opened file or remote URL. | |
104 | * | |
105 | * @warning URL will no longer point to a valid location in memory! | |
106 | */ | |
107 | void urlClose(URL_t *URL); | |
108 | ||
109 | #endif // LIBBIGWIG_IO_H |
0 | /*! \file bwCommon.h | |
1 | * | |
2 | * You have no reason to use these functions. They may change without warning because there's no reason for them to be used outside of libBigWig's internals. | |
3 | * | |
4 | * These are structures and functions from a variety of files that are used across files internally but don't need to be see by libBigWig users. | |
5 | */ | |
6 | ||
7 | /*! | |
8 | * @brief Like fsetpos, but for local or remote bigWig files. | |
9 | * This will set the file position indicator to the specified point. For local files this literally is `fsetpos`, while for remote files it fills a memory buffer with data starting at the desired position. | |
10 | * @param fp A valid opened bigWigFile_t. | |
11 | * @param pos The position within the file to seek to. | |
12 | * @return 0 on success and -1 on error. | |
13 | */ | |
14 | int bwSetPos(bigWigFile_t *fp, size_t pos); | |
15 | ||
16 | /*! | |
17 | * @brief A local/remote version of `fread`. | |
18 | * Reads data from either local or remote bigWig files. | |
19 | * @param data An allocated memory block big enough to hold the data. | |
20 | * @param sz The size of each member that should be copied. | |
21 | * @param nmemb The number of members to copy. | |
22 | * @param fp The bigWigFile_t * from which to copy the data. | |
23 | * @see bwSetPos | |
24 | * @return For nmemb==1, the size of the copied data. For nmemb>1, the number of members fully copied (this is equivalent to `fread`). | |
25 | */ | |
26 | size_t bwRead(void *data, size_t sz, size_t nmemb, bigWigFile_t *fp); | |
27 | ||
28 | /*! | |
29 | * @brief Determine what the file position indicator say. | |
30 | * This is equivalent to `ftell` for local or remote files. | |
31 | * @param fp The file. | |
32 | * @return The position in the file. | |
33 | */ | |
34 | long bwTell(bigWigFile_t *fp); | |
35 | ||
36 | /*! | |
37 | * @brief Reads a data index (either full data or a zoom level) from a bigWig file. | |
38 | * There is little reason for end users to use this function. This must be freed with `bwDestroyIndex` | |
39 | * @param fp A valid bigWigFile_t pointer | |
40 | * @param offset The file offset where the index begins | |
41 | * @return A bwRTree_t pointer or NULL on error. | |
42 | */ | |
43 | bwRTree_t *bwReadIndex(bigWigFile_t *fp, uint64_t offset); | |
44 | ||
45 | /*! | |
46 | * @brief Destroy an bwRTreeNode_t and all of its children. | |
47 | * @param node The node to destroy. | |
48 | */ | |
49 | void bwDestroyIndexNode(bwRTreeNode_t *node); | |
50 | ||
51 | /*! | |
52 | * @brief Frees space allocated by `bwReadIndex` | |
53 | * There is generally little reason to use this, since end users should typically not need to run `bwReadIndex` themselves. | |
54 | * @param idx A bwRTree_t pointer allocated by `bwReadIndex`. | |
55 | */ | |
56 | void bwDestroyIndex(bwRTree_t *idx); | |
57 | ||
58 | /// @cond SKIP | |
59 | bwOverlapBlock_t *walkRTreeNodes(bigWigFile_t *bw, bwRTreeNode_t *root, uint32_t tid, uint32_t start, uint32_t end); | |
60 | void destroyBWOverlapBlock(bwOverlapBlock_t *b); | |
61 | /// @endcond | |
62 | ||
63 | /*! | |
64 | * @brief Finishes what's needed to write a bigWigFile | |
65 | * Flushes the buffer, converts the index linked list to a tree, writes that to disk, handles zoom level stuff, writes magic at the end | |
66 | * @param fp A valid bigWigFile_t pointer | |
67 | * @return 0 on success | |
68 | */ | |
69 | int bwFinalize(bigWigFile_t *fp); |
0 | #include "bigWig.h" | |
1 | #include "bwCommon.h" | |
2 | #include <stdlib.h> | |
3 | #include <math.h> | |
4 | #include <string.h> | |
5 | #include <stdio.h> | |
6 | ||
7 | static uint64_t readChromBlock(bigWigFile_t *bw, chromList_t *cl, uint32_t keySize); | |
8 | ||
9 | //Return the position in the file | |
10 | long bwTell(bigWigFile_t *fp) { | |
11 | if(fp->URL->type == BWG_FILE) return ftell(fp->URL->x.fp); | |
12 | return (long) (fp->URL->filePos + fp->URL->bufPos); | |
13 | } | |
14 | ||
15 | //Seek to a given position, always from the beginning of the file | |
16 | //Return 0 on success and -1 on error | |
17 | //To do, use the return code of urlSeek() in a more useful way. | |
18 | int bwSetPos(bigWigFile_t *fp, size_t pos) { | |
19 | CURLcode rv = urlSeek(fp->URL, pos); | |
20 | if(rv == CURLE_OK) return 0; | |
21 | return -1; | |
22 | } | |
23 | ||
24 | //returns the number of full members read (nmemb on success, something less on error) | |
25 | size_t bwRead(void *data, size_t sz, size_t nmemb, bigWigFile_t *fp) { | |
26 | size_t i, rv; | |
27 | for(i=0; i<nmemb; i++) { | |
28 | rv = urlRead(fp->URL, data+i*sz, sz); | |
29 | if(rv != sz) return i; | |
30 | } | |
31 | return nmemb; | |
32 | } | |
33 | ||
34 | //Initializes curl and sets global variables | |
35 | //Returns 0 on success and 1 on error | |
36 | //This should be called only once and bwCleanup() must be called when finished. | |
37 | int bwInit(size_t defaultBufSize) { | |
38 | //set the buffer size, number of iterations, sleep time between iterations, etc. | |
39 | GLOBAL_DEFAULTBUFFERSIZE = defaultBufSize; | |
40 | ||
41 | //call curl_global_init() | |
42 | #ifndef NOCURL | |
43 | CURLcode rv; | |
44 | rv = curl_global_init(CURL_GLOBAL_ALL); | |
45 | if(rv != CURLE_OK) return 1; | |
46 | #endif | |
47 | return 0; | |
48 | } | |
49 | ||
50 | //This should be called before quiting, to release memory acquired by curl | |
51 | void bwCleanup() { | |
52 | #ifndef NOCURL | |
53 | curl_global_cleanup(); | |
54 | #endif | |
55 | } | |
56 | ||
57 | static bwZoomHdr_t *bwReadZoomHdrs(bigWigFile_t *bw) { | |
58 | if(bw->isWrite) return NULL; | |
59 | uint16_t i; | |
60 | bwZoomHdr_t *zhdr = malloc(sizeof(bwZoomHdr_t)); | |
61 | if(!zhdr) return NULL; | |
62 | uint32_t *level = malloc(bw->hdr->nLevels * sizeof(uint64_t)); | |
63 | if(!level) { | |
64 | free(zhdr); | |
65 | return NULL; | |
66 | } | |
67 | uint32_t padding = 0; | |
68 | uint64_t *dataOffset = malloc(sizeof(uint64_t) * bw->hdr->nLevels); | |
69 | if(!dataOffset) { | |
70 | free(zhdr); | |
71 | free(level); | |
72 | return NULL; | |
73 | } | |
74 | uint64_t *indexOffset = malloc(sizeof(uint64_t) * bw->hdr->nLevels); | |
75 | if(!indexOffset) { | |
76 | free(zhdr); | |
77 | free(level); | |
78 | free(dataOffset); | |
79 | return NULL; | |
80 | } | |
81 | ||
82 | for(i=0; i<bw->hdr->nLevels; i++) { | |
83 | if(bwRead((void*) &(level[i]), sizeof(uint32_t), 1, bw) != 1) goto error; | |
84 | if(bwRead((void*) &padding, sizeof(uint32_t), 1, bw) != 1) goto error; | |
85 | if(bwRead((void*) &(dataOffset[i]), sizeof(uint64_t), 1, bw) != 1) goto error; | |
86 | if(bwRead((void*) &(indexOffset[i]), sizeof(uint64_t), 1, bw) != 1) goto error; | |
87 | } | |
88 | ||
89 | zhdr->level = level; | |
90 | zhdr->dataOffset = dataOffset; | |
91 | zhdr->indexOffset = indexOffset; | |
92 | zhdr->idx = calloc(bw->hdr->nLevels, sizeof(bwRTree_t*)); | |
93 | if(!zhdr->idx) goto error; | |
94 | ||
95 | return zhdr; | |
96 | ||
97 | error: | |
98 | for(i=0; i<bw->hdr->nLevels; i++) { | |
99 | if(zhdr->idx[i]) bwDestroyIndex(zhdr->idx[i]); | |
100 | } | |
101 | free(zhdr); | |
102 | free(level); | |
103 | free(dataOffset); | |
104 | free(indexOffset); | |
105 | return NULL; | |
106 | } | |
107 | ||
108 | static void bwHdrDestroy(bigWigHdr_t *hdr) { | |
109 | int i; | |
110 | if(hdr->zoomHdrs) { | |
111 | free(hdr->zoomHdrs->level); | |
112 | free(hdr->zoomHdrs->dataOffset); | |
113 | free(hdr->zoomHdrs->indexOffset); | |
114 | for(i=0; i<hdr->nLevels; i++) { | |
115 | if(hdr->zoomHdrs->idx[i]) bwDestroyIndex(hdr->zoomHdrs->idx[i]); | |
116 | } | |
117 | free(hdr->zoomHdrs->idx); | |
118 | free(hdr->zoomHdrs); | |
119 | } | |
120 | free(hdr); | |
121 | } | |
122 | ||
123 | static void bwHdrRead(bigWigFile_t *bw) { | |
124 | uint32_t magic; | |
125 | if(bw->isWrite) return; | |
126 | bw->hdr = calloc(1, sizeof(bigWigHdr_t)); | |
127 | if(!bw->hdr) return; | |
128 | ||
129 | if(bwRead((void*) &magic, sizeof(uint32_t), 1, bw) != 1) goto error; //0x0 | |
130 | if(magic != BIGWIG_MAGIC && magic != BIGBED_MAGIC) goto error; | |
131 | ||
132 | if(bwRead((void*) &(bw->hdr->version), sizeof(uint16_t), 1, bw) != 1) goto error; //0x4 | |
133 | if(bwRead((void*) &(bw->hdr->nLevels), sizeof(uint16_t), 1, bw) != 1) goto error; //0x6 | |
134 | if(bwRead((void*) &(bw->hdr->ctOffset), sizeof(uint64_t), 1, bw) != 1) goto error; //0x8 | |
135 | if(bwRead((void*) &(bw->hdr->dataOffset), sizeof(uint64_t), 1, bw) != 1) goto error; //0x10 | |
136 | if(bwRead((void*) &(bw->hdr->indexOffset), sizeof(uint64_t), 1, bw) != 1) goto error; //0x18 | |
137 | if(bwRead((void*) &(bw->hdr->fieldCount), sizeof(uint16_t), 1, bw) != 1) goto error; //0x20 | |
138 | if(bwRead((void*) &(bw->hdr->definedFieldCount), sizeof(uint16_t), 1, bw) != 1) goto error; //0x22 | |
139 | if(bwRead((void*) &(bw->hdr->sqlOffset), sizeof(uint64_t), 1, bw) != 1) goto error; //0x24 | |
140 | if(bwRead((void*) &(bw->hdr->summaryOffset), sizeof(uint64_t), 1, bw) != 1) goto error; //0x2c | |
141 | if(bwRead((void*) &(bw->hdr->bufSize), sizeof(uint32_t), 1, bw) != 1) goto error; //0x34 | |
142 | if(bwRead((void*) &(bw->hdr->extensionOffset), sizeof(uint64_t), 1, bw) != 1) goto error; //0x38 | |
143 | ||
144 | //zoom headers | |
145 | if(bw->hdr->nLevels) { | |
146 | if(!(bw->hdr->zoomHdrs = bwReadZoomHdrs(bw))) goto error; | |
147 | } | |
148 | ||
149 | //File summary information | |
150 | if(bw->hdr->summaryOffset) { | |
151 | if(urlSeek(bw->URL, bw->hdr->summaryOffset) != CURLE_OK) goto error; | |
152 | if(bwRead((void*) &(bw->hdr->nBasesCovered), sizeof(uint64_t), 1, bw) != 1) goto error; | |
153 | if(bwRead((void*) &(bw->hdr->minVal), sizeof(uint64_t), 1, bw) != 1) goto error; | |
154 | if(bwRead((void*) &(bw->hdr->maxVal), sizeof(uint64_t), 1, bw) != 1) goto error; | |
155 | if(bwRead((void*) &(bw->hdr->sumData), sizeof(uint64_t), 1, bw) != 1) goto error; | |
156 | if(bwRead((void*) &(bw->hdr->sumSquared), sizeof(uint64_t), 1, bw) != 1) goto error; | |
157 | } | |
158 | ||
159 | //In case of uncompressed remote files, let the IO functions know to request larger chunks | |
160 | bw->URL->isCompressed = (bw->hdr->bufSize > 0)?1:0; | |
161 | ||
162 | return; | |
163 | ||
164 | error: | |
165 | bwHdrDestroy(bw->hdr); | |
166 | fprintf(stderr, "[bwHdrRead] There was an error while reading in the header!\n"); | |
167 | bw->hdr = NULL; | |
168 | } | |
169 | ||
170 | static void destroyChromList(chromList_t *cl) { | |
171 | uint32_t i; | |
172 | if(!cl) return; | |
173 | if(cl->nKeys && cl->chrom) { | |
174 | for(i=0; i<cl->nKeys; i++) { | |
175 | if(cl->chrom[i]) free(cl->chrom[i]); | |
176 | } | |
177 | } | |
178 | if(cl->chrom) free(cl->chrom); | |
179 | if(cl->len) free(cl->len); | |
180 | free(cl); | |
181 | } | |
182 | ||
183 | static uint64_t readChromLeaf(bigWigFile_t *bw, chromList_t *cl, uint32_t valueSize) { | |
184 | uint16_t nVals, i; | |
185 | uint32_t idx; | |
186 | char *chrom = NULL; | |
187 | ||
188 | if(bwRead((void*) &nVals, sizeof(uint16_t), 1, bw) != 1) return -1; | |
189 | chrom = calloc(valueSize+1, sizeof(char)); | |
190 | if(!chrom) return -1; | |
191 | ||
192 | for(i=0; i<nVals; i++) { | |
193 | if(bwRead((void*) chrom, sizeof(char), valueSize, bw) != valueSize) goto error; | |
194 | if(bwRead((void*) &idx, sizeof(uint32_t), 1, bw) != 1) goto error; | |
195 | if(bwRead((void*) &(cl->len[idx]), sizeof(uint32_t), 1, bw) != 1) goto error; | |
196 | cl->chrom[idx] = strdup(chrom); | |
197 | if(!(cl->chrom[idx])) goto error; | |
198 | } | |
199 | ||
200 | free(chrom); | |
201 | return nVals; | |
202 | ||
203 | error: | |
204 | free(chrom); | |
205 | return -1; | |
206 | } | |
207 | ||
208 | static uint64_t readChromNonLeaf(bigWigFile_t *bw, chromList_t *cl, uint32_t keySize) { | |
209 | uint64_t offset , rv = 0, previous; | |
210 | uint16_t nVals, i; | |
211 | ||
212 | if(bwRead((void*) &nVals, sizeof(uint16_t), 1, bw) != 1) return -1; | |
213 | ||
214 | previous = bwTell(bw) + keySize; | |
215 | for(i=0; i<nVals; i++) { | |
216 | if(bwSetPos(bw, previous)) return -1; | |
217 | if(bwRead((void*) &offset, sizeof(uint64_t), 1, bw) != 1) return -1; | |
218 | if(bwSetPos(bw, offset)) return -1; | |
219 | rv += readChromBlock(bw, cl, keySize); | |
220 | previous += 8 + keySize; | |
221 | } | |
222 | ||
223 | return rv; | |
224 | } | |
225 | ||
226 | static uint64_t readChromBlock(bigWigFile_t *bw, chromList_t *cl, uint32_t keySize) { | |
227 | uint8_t isLeaf, padding; | |
228 | ||
229 | if(bwRead((void*) &isLeaf, sizeof(uint8_t), 1, bw) != 1) return -1; | |
230 | if(bwRead((void*) &padding, sizeof(uint8_t), 1, bw) != 1) return -1; | |
231 | ||
232 | if(isLeaf) { | |
233 | return readChromLeaf(bw, cl, keySize); | |
234 | } else { //I've never actually observed one of these, which is good since they're pointless | |
235 | return readChromNonLeaf(bw, cl, keySize); | |
236 | } | |
237 | } | |
238 | ||
239 | static chromList_t *bwReadChromList(bigWigFile_t *bw) { | |
240 | chromList_t *cl = NULL; | |
241 | uint32_t magic, keySize, valueSize, itemsPerBlock; | |
242 | uint64_t rv, itemCount; | |
243 | if(bw->isWrite) return NULL; | |
244 | if(bwSetPos(bw, bw->hdr->ctOffset)) return NULL; | |
245 | ||
246 | cl = calloc(1, sizeof(chromList_t)); | |
247 | if(!cl) return NULL; | |
248 | ||
249 | if(bwRead((void*) &magic, sizeof(uint32_t), 1, bw) != 1) goto error; | |
250 | if(magic != CIRTREE_MAGIC) goto error; | |
251 | ||
252 | if(bwRead((void*) &itemsPerBlock, sizeof(uint32_t), 1, bw) != 1) goto error; | |
253 | if(bwRead((void*) &keySize, sizeof(uint32_t), 1, bw) != 1) goto error; | |
254 | if(bwRead((void*) &valueSize, sizeof(uint32_t), 1, bw) != 1) goto error; | |
255 | if(bwRead((void*) &itemCount, sizeof(uint64_t), 1, bw) != 1) goto error; | |
256 | ||
257 | cl->nKeys = itemCount; | |
258 | cl->chrom = calloc(itemCount, sizeof(char*)); | |
259 | cl->len = calloc(itemCount, sizeof(uint32_t)); | |
260 | if(!cl->chrom) goto error; | |
261 | if(!cl->len) goto error; | |
262 | ||
263 | if(bwRead((void*) &magic, sizeof(uint32_t), 1, bw) != 1) goto error; | |
264 | if(bwRead((void*) &magic, sizeof(uint32_t), 1, bw) != 1) goto error; | |
265 | ||
266 | //Read in the blocks | |
267 | rv = readChromBlock(bw, cl, keySize); | |
268 | if(rv == (uint64_t) -1) goto error; | |
269 | if(rv != itemCount) goto error; | |
270 | ||
271 | return cl; | |
272 | ||
273 | error: | |
274 | destroyChromList(cl); | |
275 | return NULL; | |
276 | } | |
277 | ||
278 | //This is here mostly for convenience | |
279 | static void bwDestroyWriteBuffer(bwWriteBuffer_t *wb) { | |
280 | if(wb->p) free(wb->p); | |
281 | if(wb->compressP) free(wb->compressP); | |
282 | if(wb->firstZoomBuffer) free(wb->firstZoomBuffer); | |
283 | if(wb->lastZoomBuffer) free(wb->lastZoomBuffer); | |
284 | if(wb->nNodes) free(wb->nNodes); | |
285 | free(wb); | |
286 | } | |
287 | ||
288 | void bwClose(bigWigFile_t *fp) { | |
289 | if(!fp) return; | |
290 | if(bwFinalize(fp)) { | |
291 | fprintf(stderr, "[bwClose] There was an error while finishing writing a bigWig file! The output is likely truncated.\n"); | |
292 | } | |
293 | if(fp->URL) urlClose(fp->URL); | |
294 | if(fp->hdr) bwHdrDestroy(fp->hdr); | |
295 | if(fp->cl) destroyChromList(fp->cl); | |
296 | if(fp->idx) bwDestroyIndex(fp->idx); | |
297 | if(fp->writeBuffer) bwDestroyWriteBuffer(fp->writeBuffer); | |
298 | free(fp); | |
299 | } | |
300 | ||
301 | int bwIsBigWig(char *fname, CURLcode (*callBack) (CURL*)) { | |
302 | uint32_t magic = 0; | |
303 | URL_t *URL = NULL; | |
304 | ||
305 | URL = urlOpen(fname, *callBack, NULL); | |
306 | ||
307 | if(!URL) return 0; | |
308 | if(urlRead(URL, (void*) &magic, sizeof(uint32_t)) != sizeof(uint32_t)) magic = 0; | |
309 | urlClose(URL); | |
310 | if(magic == BIGWIG_MAGIC) return 1; | |
311 | return 0; | |
312 | } | |
313 | ||
314 | char *bbGetSQL(bigWigFile_t *bw) { | |
315 | char *o = NULL; | |
316 | uint64_t len; | |
317 | if(!bw->hdr->sqlOffset) return NULL; | |
318 | len = bw->hdr->summaryOffset - bw->hdr->sqlOffset; //This includes the NULL terminator | |
319 | o = malloc(sizeof(char) * len); | |
320 | if(!o) goto error; | |
321 | if(bwSetPos(bw, bw->hdr->sqlOffset)) goto error; | |
322 | if(bwRead((void*) o, len, 1, bw) != 1) goto error; | |
323 | return o; | |
324 | ||
325 | error: | |
326 | if(o) free(o); | |
327 | printf("Got an error in bbGetSQL!\n"); | |
328 | return NULL; | |
329 | } | |
330 | ||
331 | int bbIsBigBed(char *fname, CURLcode (*callBack) (CURL*)) { | |
332 | uint32_t magic = 0; | |
333 | URL_t *URL = NULL; | |
334 | ||
335 | URL = urlOpen(fname, *callBack, NULL); | |
336 | ||
337 | if(!URL) return 0; | |
338 | if(urlRead(URL, (void*) &magic, sizeof(uint32_t)) != sizeof(uint32_t)) magic = 0; | |
339 | urlClose(URL); | |
340 | if(magic == BIGBED_MAGIC) return 1; | |
341 | return 0; | |
342 | } | |
343 | ||
344 | bigWigFile_t *bwOpen(char *fname, CURLcode (*callBack) (CURL*), const char *mode) { | |
345 | bigWigFile_t *bwg = calloc(1, sizeof(bigWigFile_t)); | |
346 | if(!bwg) { | |
347 | fprintf(stderr, "[bwOpen] Couldn't allocate space to create the output object!\n"); | |
348 | return NULL; | |
349 | } | |
350 | if((!mode) || (strchr(mode, 'w') == NULL)) { | |
351 | bwg->isWrite = 0; | |
352 | bwg->URL = urlOpen(fname, *callBack, NULL); | |
353 | if(!bwg->URL) { | |
354 | fprintf(stderr, "[bwOpen] urlOpen is NULL!\n"); | |
355 | goto error; | |
356 | } | |
357 | ||
358 | //Attempt to read in the fixed header | |
359 | bwHdrRead(bwg); | |
360 | if(!bwg->hdr) { | |
361 | fprintf(stderr, "[bwOpen] bwg->hdr is NULL!\n"); | |
362 | goto error; | |
363 | } | |
364 | ||
365 | //Read in the chromosome list | |
366 | bwg->cl = bwReadChromList(bwg); | |
367 | if(!bwg->cl) { | |
368 | fprintf(stderr, "[bwOpen] bwg->cl is NULL (%s)!\n", fname); | |
369 | goto error; | |
370 | } | |
371 | ||
372 | //Read in the index | |
373 | if(bwg->hdr->indexOffset) { | |
374 | bwg->idx = bwReadIndex(bwg, 0); | |
375 | if(!bwg->idx) { | |
376 | fprintf(stderr, "[bwOpen] bwg->idx is NULL bwg->hdr->dataOffset 0x%"PRIx64"!\n", bwg->hdr->dataOffset); | |
377 | goto error; | |
378 | } | |
379 | } | |
380 | } else { | |
381 | bwg->isWrite = 1; | |
382 | bwg->URL = urlOpen(fname, NULL, "w+"); | |
383 | if(!bwg->URL) goto error; | |
384 | bwg->writeBuffer = calloc(1,sizeof(bwWriteBuffer_t)); | |
385 | if(!bwg->writeBuffer) goto error; | |
386 | bwg->writeBuffer->l = 24; | |
387 | } | |
388 | ||
389 | return bwg; | |
390 | ||
391 | error: | |
392 | bwClose(bwg); | |
393 | return NULL; | |
394 | } | |
395 | ||
396 | bigWigFile_t *bbOpen(char *fname, CURLcode (*callBack) (CURL*)) { | |
397 | bigWigFile_t *bb = calloc(1, sizeof(bigWigFile_t)); | |
398 | if(!bb) { | |
399 | fprintf(stderr, "[bbOpen] Couldn't allocate space to create the output object!\n"); | |
400 | return NULL; | |
401 | } | |
402 | ||
403 | //Set the type to 1 for bigBed | |
404 | bb->type = 1; | |
405 | ||
406 | bb->URL = urlOpen(fname, *callBack, NULL); | |
407 | if(!bb->URL) goto error; | |
408 | ||
409 | //Attempt to read in the fixed header | |
410 | bwHdrRead(bb); | |
411 | if(!bb->hdr) goto error; | |
412 | ||
413 | //Read in the chromosome list | |
414 | bb->cl = bwReadChromList(bb); | |
415 | if(!bb->cl) goto error; | |
416 | ||
417 | //Read in the index | |
418 | bb->idx = bwReadIndex(bb, 0); | |
419 | if(!bb->idx) goto error; | |
420 | ||
421 | return bb; | |
422 | ||
423 | error: | |
424 | bwClose(bb); | |
425 | return NULL; | |
426 | } |
0 | #include "bigWig.h" | |
1 | #include "bwCommon.h" | |
2 | #include <errno.h> | |
3 | #include <stdlib.h> | |
4 | #include <zlib.h> | |
5 | #include <math.h> | |
6 | #include <string.h> | |
7 | ||
8 | //Returns -1 if there are no applicable levels, otherwise an integer indicating the most appropriate level. | |
9 | //Like Kent's library, this divides the desired bin size by 2 to minimize the effect of blocks overlapping multiple bins | |
10 | static int32_t determineZoomLevel(bigWigFile_t *fp, int basesPerBin) { | |
11 | int32_t out = -1; | |
12 | int64_t diff; | |
13 | uint32_t bestDiff = -1; | |
14 | uint16_t i; | |
15 | ||
16 | basesPerBin/=2; | |
17 | for(i=0; i<fp->hdr->nLevels; i++) { | |
18 | diff = basesPerBin - (int64_t) fp->hdr->zoomHdrs->level[i]; | |
19 | if(diff >= 0 && diff < bestDiff) { | |
20 | bestDiff = diff; | |
21 | out = i; | |
22 | } | |
23 | } | |
24 | return out; | |
25 | } | |
26 | ||
27 | /// @cond SKIP | |
28 | struct val_t { | |
29 | uint32_t nBases; | |
30 | float min, max, sum, sumsq; | |
31 | double scalar; | |
32 | }; | |
33 | ||
34 | struct vals_t { | |
35 | uint32_t n; | |
36 | struct val_t **vals; | |
37 | }; | |
38 | /// @endcond | |
39 | ||
40 | void destroyVals_t(struct vals_t *v) { | |
41 | uint32_t i; | |
42 | if(!v) return; | |
43 | for(i=0; i<v->n; i++) free(v->vals[i]); | |
44 | if(v->vals) free(v->vals); | |
45 | free(v); | |
46 | } | |
47 | ||
48 | //Determine the base-pair overlap between an interval and a block | |
49 | double getScalar(uint32_t i_start, uint32_t i_end, uint32_t b_start, uint32_t b_end) { | |
50 | double rv = 0.0; | |
51 | if(b_start <= i_start) { | |
52 | if(b_end > i_start) rv = ((double)(b_end - i_start))/(b_end-b_start); | |
53 | } else if(b_start < i_end) { | |
54 | if(b_end < i_end) rv = ((double)(b_end - b_start))/(b_end-b_start); | |
55 | else rv = ((double)(i_end - b_start))/(b_end-b_start); | |
56 | } | |
57 | ||
58 | return rv; | |
59 | } | |
60 | ||
61 | //Returns NULL on error | |
62 | static struct vals_t *getVals(bigWigFile_t *fp, bwOverlapBlock_t *o, int i, uint32_t tid, uint32_t start, uint32_t end) { | |
63 | void *buf = NULL, *compBuf = NULL; | |
64 | uLongf sz = fp->hdr->bufSize; | |
65 | int compressed = 0, rv; | |
66 | uint32_t *p, vtid, vstart, vend; | |
67 | struct vals_t *vals = NULL; | |
68 | struct val_t *v = NULL; | |
69 | ||
70 | if(sz) { | |
71 | compressed = 1; | |
72 | buf = malloc(sz); | |
73 | } | |
74 | sz = 0; //This is now the size of the compressed buffer | |
75 | ||
76 | if(bwSetPos(fp, o->offset[i])) goto error; | |
77 | ||
78 | vals = calloc(1,sizeof(struct vals_t)); | |
79 | if(!vals) goto error; | |
80 | ||
81 | v = malloc(sizeof(struct val_t)); | |
82 | if(!v) goto error; | |
83 | ||
84 | if(sz < o->size[i]) compBuf = malloc(o->size[i]); | |
85 | if(!compBuf) goto error; | |
86 | ||
87 | if(bwRead(compBuf, o->size[i], 1, fp) != 1) goto error; | |
88 | if(compressed) { | |
89 | sz = fp->hdr->bufSize; | |
90 | rv = uncompress(buf, &sz, compBuf, o->size[i]); | |
91 | if(rv != Z_OK) goto error; | |
92 | } else { | |
93 | buf = compBuf; | |
94 | sz = o->size[i]; | |
95 | } | |
96 | ||
97 | p = buf; | |
98 | while(((uLongf) ((void*)p-buf)) < sz) { | |
99 | vtid = p[0]; | |
100 | vstart = p[1]; | |
101 | vend = p[2]; | |
102 | v->nBases = p[3]; | |
103 | v->min = ((float*) p)[4]; | |
104 | v->max = ((float*) p)[5]; | |
105 | v->sum = ((float*) p)[6]; | |
106 | v->sumsq = ((float*) p)[7]; | |
107 | v->scalar = getScalar(start, end, vstart, vend); | |
108 | ||
109 | if(tid == vtid) { | |
110 | if((start <= vstart && end > vstart) || (start < vend && start >= vstart)) { | |
111 | vals->vals = realloc(vals->vals, sizeof(struct val_t*)*(vals->n+1)); | |
112 | if(!vals->vals) goto error; | |
113 | vals->vals[vals->n++] = v; | |
114 | v = malloc(sizeof(struct val_t)); | |
115 | if(!v) goto error; | |
116 | } | |
117 | if(vstart > end) break; | |
118 | } else if(vtid > tid) { | |
119 | break; | |
120 | } | |
121 | p+=8; | |
122 | } | |
123 | ||
124 | free(v); | |
125 | free(buf); | |
126 | if(compressed) free(compBuf); | |
127 | return vals; | |
128 | ||
129 | error: | |
130 | if(buf) free(buf); | |
131 | if(compBuf && compressed) free(compBuf); | |
132 | if(v) free(v); | |
133 | destroyVals_t(vals); | |
134 | return NULL; | |
135 | } | |
136 | ||
137 | //On error, errno is set to ENOMEM and NaN is returned (though NaN can be returned normally) | |
138 | static double blockMean(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) { | |
139 | uint32_t i, j; | |
140 | double output = 0.0, coverage = 0.0; | |
141 | struct vals_t *v = NULL; | |
142 | ||
143 | if(!blocks->n) return strtod("NaN", NULL); | |
144 | ||
145 | //Iterate over the blocks | |
146 | for(i=0; i<blocks->n; i++) { | |
147 | v = getVals(fp, blocks, i, tid, start, end); | |
148 | if(!v) goto error; | |
149 | for(j=0; j<v->n; j++) { | |
150 | output += v->vals[j]->sum * v->vals[j]->scalar; | |
151 | coverage += v->vals[j]->nBases * v->vals[j]->scalar; | |
152 | } | |
153 | destroyVals_t(v); | |
154 | } | |
155 | ||
156 | ||
157 | if(!coverage) return strtod("NaN", NULL); | |
158 | ||
159 | return output/coverage; | |
160 | ||
161 | error: | |
162 | if(v) free(v); | |
163 | errno = ENOMEM; | |
164 | return strtod("NaN", NULL); | |
165 | } | |
166 | ||
167 | static double intMean(bwOverlappingIntervals_t* ints, uint32_t start, uint32_t end) { | |
168 | double sum = 0.0; | |
169 | uint32_t nBases = 0, i, start_use, end_use; | |
170 | ||
171 | if(!ints->l) return strtod("NaN", NULL); | |
172 | ||
173 | for(i=0; i<ints->l; i++) { | |
174 | start_use = ints->start[i]; | |
175 | end_use = ints->end[i]; | |
176 | if(ints->start[i] < start) start_use = start; | |
177 | if(ints->end[i] > end) end_use = end; | |
178 | nBases += end_use-start_use; | |
179 | sum += (end_use-start_use)*((double) ints->value[i]); | |
180 | } | |
181 | ||
182 | return sum/nBases; | |
183 | } | |
184 | ||
185 | //Does UCSC compensate for partial block/range overlap? | |
186 | static double blockDev(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) { | |
187 | uint32_t i, j; | |
188 | double mean = 0.0, ssq = 0.0, coverage = 0.0, diff; | |
189 | struct vals_t *v = NULL; | |
190 | ||
191 | if(!blocks->n) return strtod("NaN", NULL); | |
192 | ||
193 | //Iterate over the blocks | |
194 | for(i=0; i<blocks->n; i++) { | |
195 | v = getVals(fp, blocks, i, tid, start, end); | |
196 | if(!v) goto error; | |
197 | for(j=0; j<v->n; j++) { | |
198 | coverage += v->vals[j]->nBases * v->vals[j]->scalar; | |
199 | mean += v->vals[j]->sum * v->vals[j]->scalar; | |
200 | ssq += v->vals[j]->sumsq * v->vals[j]->scalar; | |
201 | } | |
202 | destroyVals_t(v); | |
203 | v = NULL; | |
204 | } | |
205 | ||
206 | if(coverage<=1.0) return strtod("NaN", NULL); | |
207 | diff = ssq-mean*mean/coverage; | |
208 | if(coverage > 1.0) diff /= coverage-1; | |
209 | if(fabs(diff) > 1e-8) { //Ignore floating point differences | |
210 | return sqrt(diff); | |
211 | } else { | |
212 | return 0.0; | |
213 | } | |
214 | ||
215 | error: | |
216 | if(v) destroyVals_t(v); | |
217 | errno = ENOMEM; | |
218 | return strtod("NaN", NULL); | |
219 | } | |
220 | ||
221 | //This uses compensated summation to account for finite precision math | |
222 | static double intDev(bwOverlappingIntervals_t* ints, uint32_t start, uint32_t end) { | |
223 | double v1 = 0.0, mean, rv; | |
224 | uint32_t nBases = 0, i, start_use, end_use; | |
225 | ||
226 | if(!ints->l) return strtod("NaN", NULL); | |
227 | mean = intMean(ints, start, end); | |
228 | ||
229 | for(i=0; i<ints->l; i++) { | |
230 | start_use = ints->start[i]; | |
231 | end_use = ints->end[i]; | |
232 | if(ints->start[i] < start) start_use = start; | |
233 | if(ints->end[i] > end) end_use = end; | |
234 | nBases += end_use-start_use; | |
235 | v1 += (end_use-start_use) * pow(ints->value[i]-mean, 2.0); //running sum of squared difference | |
236 | } | |
237 | ||
238 | if(nBases>=2) rv = sqrt(v1/(nBases-1)); | |
239 | else if(nBases==1) rv = sqrt(v1); | |
240 | else rv = strtod("NaN", NULL); | |
241 | ||
242 | return rv; | |
243 | } | |
244 | ||
245 | static double blockMax(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) { | |
246 | uint32_t i, j, isNA = 1; | |
247 | double o = strtod("NaN", NULL); | |
248 | struct vals_t *v = NULL; | |
249 | ||
250 | if(!blocks->n) return o; | |
251 | ||
252 | //Iterate the blocks | |
253 | for(i=0; i<blocks->n; i++) { | |
254 | v = getVals(fp, blocks, i, tid, start, end); | |
255 | if(!v) goto error; | |
256 | for(j=0; j<v->n; j++) { | |
257 | if(isNA) { | |
258 | o = v->vals[j]->max; | |
259 | isNA = 0; | |
260 | } else if(v->vals[j]->max > o) { | |
261 | o = v->vals[j]->max; | |
262 | } | |
263 | } | |
264 | destroyVals_t(v); | |
265 | } | |
266 | ||
267 | return o; | |
268 | ||
269 | error: | |
270 | destroyVals_t(v); | |
271 | errno = ENOMEM; | |
272 | return strtod("NaN", NULL); | |
273 | } | |
274 | ||
275 | static double intMax(bwOverlappingIntervals_t* ints) { | |
276 | uint32_t i; | |
277 | double o; | |
278 | ||
279 | if(ints->l < 1) return strtod("NaN", NULL); | |
280 | ||
281 | o = ints->value[0]; | |
282 | for(i=1; i<ints->l; i++) { | |
283 | if(ints->value[i] > o) o = ints->value[i]; | |
284 | } | |
285 | ||
286 | return o; | |
287 | } | |
288 | ||
289 | static double blockMin(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) { | |
290 | uint32_t i, j, isNA = 1; | |
291 | double o = strtod("NaN", NULL); | |
292 | struct vals_t *v = NULL; | |
293 | ||
294 | if(!blocks->n) return o; | |
295 | ||
296 | //Iterate the blocks | |
297 | for(i=0; i<blocks->n; i++) { | |
298 | v = getVals(fp, blocks, i, tid, start, end); | |
299 | if(!v) goto error; | |
300 | for(j=0; j<v->n; j++) { | |
301 | if(isNA) { | |
302 | o = v->vals[j]->min; | |
303 | isNA = 0; | |
304 | } else if(v->vals[j]->min < o) o = v->vals[j]->min; | |
305 | } | |
306 | destroyVals_t(v); | |
307 | } | |
308 | ||
309 | return o; | |
310 | ||
311 | error: | |
312 | destroyVals_t(v); | |
313 | errno = ENOMEM; | |
314 | return strtod("NaN", NULL); | |
315 | } | |
316 | ||
317 | static double intMin(bwOverlappingIntervals_t* ints) { | |
318 | uint32_t i; | |
319 | double o; | |
320 | ||
321 | if(ints->l < 1) return strtod("NaN", NULL); | |
322 | ||
323 | o = ints->value[0]; | |
324 | for(i=1; i<ints->l; i++) { | |
325 | if(ints->value[i] < o) o = ints->value[i]; | |
326 | } | |
327 | ||
328 | return o; | |
329 | } | |
330 | ||
331 | //Does UCSC compensate for only partial block/interval overlap? | |
332 | static double blockCoverage(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) { | |
333 | uint32_t i, j; | |
334 | double o = 0.0; | |
335 | struct vals_t *v = NULL; | |
336 | ||
337 | if(!blocks->n) return strtod("NaN", NULL); | |
338 | ||
339 | //Iterate over the blocks | |
340 | for(i=0; i<blocks->n; i++) { | |
341 | v = getVals(fp, blocks, i, tid, start, end); | |
342 | if(!v) goto error; | |
343 | for(j=0; j<v->n; j++) { | |
344 | o+= v->vals[j]->nBases * v->vals[j]->scalar; | |
345 | } | |
346 | destroyVals_t(v); | |
347 | } | |
348 | ||
349 | if(o == 0.0) return strtod("NaN", NULL); | |
350 | return o; | |
351 | ||
352 | error: | |
353 | destroyVals_t(v); | |
354 | errno = ENOMEM; | |
355 | return strtod("NaN", NULL); | |
356 | } | |
357 | ||
358 | static double intCoverage(bwOverlappingIntervals_t* ints, uint32_t start, uint32_t end) { | |
359 | uint32_t i, start_use, end_use; | |
360 | double o = 0.0; | |
361 | ||
362 | if(!ints->l) return strtod("NaN", NULL); | |
363 | ||
364 | for(i=0; i<ints->l; i++) { | |
365 | start_use = ints->start[i]; | |
366 | end_use = ints->end[i]; | |
367 | if(start_use < start) start_use = start; | |
368 | if(end_use > end) end_use = end; | |
369 | o += end_use - start_use; | |
370 | } | |
371 | ||
372 | return o/(end-start); | |
373 | } | |
374 | ||
375 | static double blockSum(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) { | |
376 | uint32_t i, j, sizeUse; | |
377 | double o = 0.0; | |
378 | struct vals_t *v = NULL; | |
379 | ||
380 | if(!blocks->n) return strtod("NaN", NULL); | |
381 | ||
382 | //Iterate over the blocks | |
383 | for(i=0; i<blocks->n; i++) { | |
384 | v = getVals(fp, blocks, i, tid, start, end); | |
385 | if(!v) goto error; | |
386 | for(j=0; j<v->n; j++) { | |
387 | //Multiply the block average by min(bases covered, block overlap with interval) | |
388 | sizeUse = v->vals[j]->scalar; | |
389 | if(sizeUse > v->vals[j]->nBases) sizeUse = v->vals[j]->nBases; | |
390 | o+= (v->vals[j]->sum * sizeUse) / v->vals[j]->nBases; | |
391 | } | |
392 | destroyVals_t(v); | |
393 | } | |
394 | ||
395 | if(o == 0.0) return strtod("NaN", NULL); | |
396 | return o; | |
397 | ||
398 | error: | |
399 | destroyVals_t(v); | |
400 | errno = ENOMEM; | |
401 | return strtod("NaN", NULL); | |
402 | } | |
403 | ||
404 | static double intSum(bwOverlappingIntervals_t* ints, uint32_t start, uint32_t end) { | |
405 | uint32_t i, start_use, end_use; | |
406 | double o = 0.0; | |
407 | ||
408 | if(!ints->l) return strtod("NaN", NULL); | |
409 | ||
410 | for(i=0; i<ints->l; i++) { | |
411 | start_use = ints->start[i]; | |
412 | end_use = ints->end[i]; | |
413 | if(start_use < start) start_use = start; | |
414 | if(end_use > end) end_use = end; | |
415 | o += (end_use - start_use) * ints->value[i]; | |
416 | } | |
417 | ||
418 | return o; | |
419 | } | |
420 | ||
421 | //Returns NULL on error, otherwise a double* that needs to be free()d | |
422 | double *bwStatsFromZoom(bigWigFile_t *fp, int32_t level, uint32_t tid, uint32_t start, uint32_t end, uint32_t nBins, enum bwStatsType type) { | |
423 | bwOverlapBlock_t *blocks = NULL; | |
424 | double *output = NULL; | |
425 | uint32_t pos = start, i, end2; | |
426 | ||
427 | if(!fp->hdr->zoomHdrs->idx[level]) { | |
428 | fp->hdr->zoomHdrs->idx[level] = bwReadIndex(fp, fp->hdr->zoomHdrs->indexOffset[level]); | |
429 | if(!fp->hdr->zoomHdrs->idx[level]) return NULL; | |
430 | } | |
431 | errno = 0; //Sometimes libCurls sets and then doesn't unset errno on errors | |
432 | ||
433 | output = malloc(sizeof(double)*nBins); | |
434 | if(!output) return NULL; | |
435 | ||
436 | for(i=0, pos=start; i<nBins; i++) { | |
437 | end2 = start + ((double)(end-start)*(i+1))/((int) nBins); | |
438 | blocks = walkRTreeNodes(fp, fp->hdr->zoomHdrs->idx[level]->root, tid, pos, end2); | |
439 | if(!blocks) goto error; | |
440 | ||
441 | switch(type) { | |
442 | case 0: | |
443 | //mean | |
444 | output[i] = blockMean(fp, blocks, tid, pos, end2); | |
445 | break; | |
446 | case 1: | |
447 | //stdev | |
448 | output[i] = blockDev(fp, blocks, tid, pos, end2); | |
449 | break; | |
450 | case 2: | |
451 | //max | |
452 | output[i] = blockMax(fp, blocks, tid, pos, end2); | |
453 | break; | |
454 | case 3: | |
455 | //min | |
456 | output[i] = blockMin(fp, blocks, tid, pos, end2); | |
457 | break; | |
458 | case 4: | |
459 | //cov | |
460 | output[i] = blockCoverage(fp, blocks, tid, pos, end2)/(end2-pos); | |
461 | break; | |
462 | case 5: | |
463 | //sum | |
464 | output[i] = blockSum(fp, blocks, tid, pos, end2); | |
465 | break; | |
466 | default: | |
467 | goto error; | |
468 | break; | |
469 | } | |
470 | if(errno) goto error; | |
471 | destroyBWOverlapBlock(blocks); | |
472 | pos = end2; | |
473 | } | |
474 | ||
475 | return output; | |
476 | ||
477 | error: | |
478 | fprintf(stderr, "got an error in bwStatsFromZoom in the range %"PRIu32"-%"PRIu32": %s\n", pos, end2, strerror(errno)); | |
479 | if(blocks) destroyBWOverlapBlock(blocks); | |
480 | if(output) free(output); | |
481 | return NULL; | |
482 | } | |
483 | ||
484 | double *bwStatsFromFull(bigWigFile_t *fp, char *chrom, uint32_t start, uint32_t end, uint32_t nBins, enum bwStatsType type) { | |
485 | bwOverlappingIntervals_t *ints = NULL; | |
486 | double *output = malloc(sizeof(double)*nBins); | |
487 | uint32_t i, pos = start, end2; | |
488 | if(!output) return NULL; | |
489 | ||
490 | for(i=0; i<nBins; i++) { | |
491 | end2 = start + ((double)(end-start)*(i+1))/((int) nBins); | |
492 | ints = bwGetOverlappingIntervals(fp, chrom, pos, end2); | |
493 | ||
494 | if(!ints) { | |
495 | output[i] = strtod("NaN", NULL); | |
496 | continue; | |
497 | } | |
498 | ||
499 | switch(type) { | |
500 | default : | |
501 | case 0: | |
502 | output[i] = intMean(ints, pos, end2); | |
503 | break; | |
504 | case 1: | |
505 | output[i] = intDev(ints, pos, end2); | |
506 | break; | |
507 | case 2: | |
508 | output[i] = intMax(ints); | |
509 | break; | |
510 | case 3: | |
511 | output[i] = intMin(ints); | |
512 | break; | |
513 | case 4: | |
514 | output[i] = intCoverage(ints, pos, end2); | |
515 | break; | |
516 | case 5: | |
517 | output[i] = intSum(ints, pos, end2); | |
518 | break; | |
519 | } | |
520 | bwDestroyOverlappingIntervals(ints); | |
521 | pos = end2; | |
522 | } | |
523 | ||
524 | return output; | |
525 | } | |
526 | ||
527 | //Returns a list of floats of length nBins that must be free()d | |
528 | //On error, NULL is returned | |
529 | double *bwStats(bigWigFile_t *fp, char *chrom, uint32_t start, uint32_t end, uint32_t nBins, enum bwStatsType type) { | |
530 | int32_t level = determineZoomLevel(fp, ((double)(end-start))/((int) nBins)); | |
531 | uint32_t tid = bwGetTid(fp, chrom); | |
532 | if(tid == (uint32_t) -1) return NULL; | |
533 | ||
534 | if(level == -1) return bwStatsFromFull(fp, chrom, start, end, nBins, type); | |
535 | return bwStatsFromZoom(fp, level, tid, start, end, nBins, type); | |
536 | } |
0 | #include "bigWig.h" | |
1 | #include "bwCommon.h" | |
2 | #include <stdlib.h> | |
3 | #include <string.h> | |
4 | #include <zlib.h> | |
5 | #include <errno.h> | |
6 | ||
7 | static uint32_t roundup(uint32_t v) { | |
8 | v--; | |
9 | v |= v >> 1; | |
10 | v |= v >> 2; | |
11 | v |= v >> 4; | |
12 | v |= v >> 8; | |
13 | v |= v >> 16; | |
14 | v++; | |
15 | return v; | |
16 | } | |
17 | ||
18 | //Returns the root node on success and NULL on error | |
19 | static bwRTree_t *readRTreeIdx(bigWigFile_t *fp, uint64_t offset) { | |
20 | uint32_t magic; | |
21 | bwRTree_t *node; | |
22 | ||
23 | if(!offset) { | |
24 | if(bwSetPos(fp, fp->hdr->indexOffset)) return NULL; | |
25 | } else { | |
26 | if(bwSetPos(fp, offset)) return NULL; | |
27 | } | |
28 | ||
29 | if(bwRead(&magic, sizeof(uint32_t), 1, fp) != 1) return NULL; | |
30 | if(magic != IDX_MAGIC) { | |
31 | fprintf(stderr, "[readRTreeIdx] Mismatch in the magic number!\n"); | |
32 | return NULL; | |
33 | } | |
34 | ||
35 | node = calloc(1, sizeof(bwRTree_t)); | |
36 | if(!node) return NULL; | |
37 | ||
38 | if(bwRead(&(node->blockSize), sizeof(uint32_t), 1, fp) != 1) goto error; | |
39 | if(bwRead(&(node->nItems), sizeof(uint64_t), 1, fp) != 1) goto error; | |
40 | if(bwRead(&(node->chrIdxStart), sizeof(uint32_t), 1, fp) != 1) goto error; | |
41 | if(bwRead(&(node->baseStart), sizeof(uint32_t), 1, fp) != 1) goto error; | |
42 | if(bwRead(&(node->chrIdxEnd), sizeof(uint32_t), 1, fp) != 1) goto error; | |
43 | if(bwRead(&(node->baseEnd), sizeof(uint32_t), 1, fp) != 1) goto error; | |
44 | if(bwRead(&(node->idxSize), sizeof(uint64_t), 1, fp) != 1) goto error; | |
45 | if(bwRead(&(node->nItemsPerSlot), sizeof(uint32_t), 1, fp) != 1) goto error; | |
46 | //Padding | |
47 | if(bwRead(&(node->blockSize), sizeof(uint32_t), 1, fp) != 1) goto error; | |
48 | node->rootOffset = bwTell(fp); | |
49 | ||
50 | //For remote files, libCurl sometimes sets errno to 115 and doesn't clear it | |
51 | errno = 0; | |
52 | ||
53 | return node; | |
54 | ||
55 | error: | |
56 | free(node); | |
57 | return NULL; | |
58 | } | |
59 | ||
60 | //Returns a bwRTreeNode_t on success and NULL on an error | |
61 | //For the root node, set offset to 0 | |
62 | static bwRTreeNode_t *bwGetRTreeNode(bigWigFile_t *fp, uint64_t offset) { | |
63 | bwRTreeNode_t *node = NULL; | |
64 | uint8_t padding; | |
65 | uint16_t i; | |
66 | if(offset) { | |
67 | if(bwSetPos(fp, offset)) return NULL; | |
68 | } else { | |
69 | //seek | |
70 | if(bwSetPos(fp, fp->idx->rootOffset)) return NULL; | |
71 | } | |
72 | ||
73 | node = calloc(1, sizeof(bwRTreeNode_t)); | |
74 | if(!node) return NULL; | |
75 | ||
76 | if(bwRead(&(node->isLeaf), sizeof(uint8_t), 1, fp) != 1) goto error; | |
77 | if(bwRead(&padding, sizeof(uint8_t), 1, fp) != 1) goto error; | |
78 | if(bwRead(&(node->nChildren), sizeof(uint16_t), 1, fp) != 1) goto error; | |
79 | ||
80 | node->chrIdxStart = malloc(sizeof(uint32_t)*(node->nChildren)); | |
81 | if(!node->chrIdxStart) goto error; | |
82 | node->baseStart = malloc(sizeof(uint32_t)*(node->nChildren)); | |
83 | if(!node->baseStart) goto error; | |
84 | node->chrIdxEnd = malloc(sizeof(uint32_t)*(node->nChildren)); | |
85 | if(!node->chrIdxEnd) goto error; | |
86 | node->baseEnd = malloc(sizeof(uint32_t)*(node->nChildren)); | |
87 | if(!node->baseEnd) goto error; | |
88 | node->dataOffset = malloc(sizeof(uint64_t)*(node->nChildren)); | |
89 | if(!node->dataOffset) goto error; | |
90 | if(node->isLeaf) { | |
91 | node->x.size = malloc(node->nChildren * sizeof(uint64_t)); | |
92 | if(!node->x.size) goto error; | |
93 | } else { | |
94 | node->x.child = calloc(node->nChildren, sizeof(struct bwRTreeNode_t *)); | |
95 | if(!node->x.child) goto error; | |
96 | } | |
97 | for(i=0; i<node->nChildren; i++) { | |
98 | if(bwRead(&(node->chrIdxStart[i]), sizeof(uint32_t), 1, fp) != 1) goto error; | |
99 | if(bwRead(&(node->baseStart[i]), sizeof(uint32_t), 1, fp) != 1) goto error; | |
100 | if(bwRead(&(node->chrIdxEnd[i]), sizeof(uint32_t), 1, fp) != 1) goto error; | |
101 | if(bwRead(&(node->baseEnd[i]), sizeof(uint32_t), 1, fp) != 1) goto error; | |
102 | if(bwRead(&(node->dataOffset[i]), sizeof(uint64_t), 1, fp) != 1) goto error; | |
103 | if(node->isLeaf) { | |
104 | if(bwRead(&(node->x.size[i]), sizeof(uint64_t), 1, fp) != 1) goto error; | |
105 | } | |
106 | } | |
107 | ||
108 | return node; | |
109 | ||
110 | error: | |
111 | if(node->chrIdxStart) free(node->chrIdxStart); | |
112 | if(node->baseStart) free(node->baseStart); | |
113 | if(node->chrIdxEnd) free(node->chrIdxEnd); | |
114 | if(node->baseEnd) free(node->baseEnd); | |
115 | if(node->dataOffset) free(node->dataOffset); | |
116 | if(node->isLeaf && node->x.size) free(node->x.size); | |
117 | else if((!node->isLeaf) && node->x.child) free(node->x.child); | |
118 | free(node); | |
119 | return NULL; | |
120 | } | |
121 | ||
122 | void destroyBWOverlapBlock(bwOverlapBlock_t *b) { | |
123 | if(!b) return; | |
124 | if(b->size) free(b->size); | |
125 | if(b->offset) free(b->offset); | |
126 | free(b); | |
127 | } | |
128 | ||
129 | //Returns a bwOverlapBlock_t * object or NULL on error. | |
130 | static bwOverlapBlock_t *overlapsLeaf(bwRTreeNode_t *node, uint32_t tid, uint32_t start, uint32_t end) { | |
131 | uint16_t i, idx = 0; | |
132 | bwOverlapBlock_t *o = calloc(1, sizeof(bwOverlapBlock_t)); | |
133 | if(!o) return NULL; | |
134 | ||
135 | for(i=0; i<node->nChildren; i++) { | |
136 | if(tid < node->chrIdxStart[i]) break; | |
137 | if(tid > node->chrIdxEnd[i]) continue; | |
138 | ||
139 | /* | |
140 | The individual blocks can theoretically span multiple contigs. | |
141 | So if we treat the first/last contig in the range as special | |
142 | but anything in the middle is a guaranteed match | |
143 | */ | |
144 | if(node->chrIdxStart[i] != node->chrIdxEnd[i]) { | |
145 | if(tid == node->chrIdxStart[i]) { | |
146 | if(node->baseStart[i] >= end) break; | |
147 | } else if(tid == node->chrIdxEnd[i]) { | |
148 | if(node->baseEnd[i] <= start) continue; | |
149 | } | |
150 | } else { | |
151 | if(node->baseStart[i] >= end || node->baseEnd[i] <= start) continue; | |
152 | } | |
153 | o->n++; | |
154 | } | |
155 | ||
156 | if(o->n) { | |
157 | o->offset = malloc(sizeof(uint64_t) * (o->n)); | |
158 | if(!o->offset) goto error; | |
159 | o->size = malloc(sizeof(uint64_t) * (o->n)); | |
160 | if(!o->size) goto error; | |
161 | ||
162 | for(i=0; i<node->nChildren; i++) { | |
163 | if(tid < node->chrIdxStart[i]) break; | |
164 | if(tid < node->chrIdxStart[i] || tid > node->chrIdxEnd[i]) continue; | |
165 | if(node->chrIdxStart[i] != node->chrIdxEnd[i]) { | |
166 | if(tid == node->chrIdxStart[i]) { | |
167 | if(node->baseStart[i] >= end) continue; | |
168 | } else if(tid == node->chrIdxEnd[i]) { | |
169 | if(node->baseEnd[i] <= start) continue; | |
170 | } | |
171 | } else { | |
172 | if(node->baseStart[i] >= end || node->baseEnd[i] <= start) continue; | |
173 | } | |
174 | o->offset[idx] = node->dataOffset[i]; | |
175 | o->size[idx++] = node->x.size[i]; | |
176 | if(idx >= o->n) break; | |
177 | } | |
178 | } | |
179 | ||
180 | if(idx != o->n) { //This should never happen | |
181 | fprintf(stderr, "[overlapsLeaf] Mismatch between number of overlaps calculated and found!\n"); | |
182 | goto error; | |
183 | } | |
184 | ||
185 | return o; | |
186 | ||
187 | error: | |
188 | if(o) destroyBWOverlapBlock(o); | |
189 | return NULL; | |
190 | } | |
191 | ||
192 | //This will free l2 unless there's an error! | |
193 | //Returns NULL on error, otherwise the merged lists | |
194 | static bwOverlapBlock_t *mergeOverlapBlocks(bwOverlapBlock_t *b1, bwOverlapBlock_t *b2) { | |
195 | uint64_t i,j; | |
196 | if(!b2) return b1; | |
197 | if(!b2->n) { | |
198 | destroyBWOverlapBlock(b2); | |
199 | return b1; | |
200 | } | |
201 | if(!b1->n) { | |
202 | destroyBWOverlapBlock(b1); | |
203 | return b2; | |
204 | } | |
205 | j = b1->n; | |
206 | b1->n += b2->n; | |
207 | b1->offset = realloc(b1->offset, sizeof(uint64_t) * (b1->n+b2->n)); | |
208 | if(!b1->offset) goto error; | |
209 | b1->size = realloc(b1->size, sizeof(uint64_t) * (b1->n+b2->n)); | |
210 | if(!b1->size) goto error; | |
211 | ||
212 | for(i=0; i<b2->n; i++) { | |
213 | b1->offset[j+i] = b2->offset[i]; | |
214 | b1->size[j+i] = b2->size[i]; | |
215 | } | |
216 | destroyBWOverlapBlock(b2); | |
217 | return b1; | |
218 | ||
219 | error: | |
220 | destroyBWOverlapBlock(b1); | |
221 | return NULL; | |
222 | } | |
223 | ||
224 | //Returns NULL and sets nOverlaps to >0 on error, otherwise nOverlaps is the number of file offsets returned | |
225 | //The output needs to be free()d if not NULL (likewise with *sizes) | |
226 | static bwOverlapBlock_t *overlapsNonLeaf(bigWigFile_t *fp, bwRTreeNode_t *node, uint32_t tid, uint32_t start, uint32_t end) { | |
227 | uint16_t i; | |
228 | bwOverlapBlock_t *nodeBlocks, *output = calloc(1, sizeof(bwOverlapBlock_t)); | |
229 | if(!output) return NULL; | |
230 | ||
231 | for(i=0; i<node->nChildren; i++) { | |
232 | if(tid < node->chrIdxStart[i]) break; | |
233 | if(tid < node->chrIdxStart[i] || tid > node->chrIdxEnd[i]) continue; | |
234 | if(node->chrIdxStart[i] != node->chrIdxEnd[i]) { //child spans contigs | |
235 | if(tid == node->chrIdxStart[i]) { | |
236 | if(node->baseStart[i] >= end) continue; | |
237 | } else if(tid == node->chrIdxEnd[i]) { | |
238 | if(node->baseEnd[i] <= start) continue; | |
239 | } | |
240 | } else { | |
241 | if(end <= node->baseStart[i] || start >= node->baseEnd[i]) continue; | |
242 | } | |
243 | ||
244 | //We have an overlap! | |
245 | if(!node->x.child[i]) | |
246 | node->x.child[i] = bwGetRTreeNode(fp, node->dataOffset[i]); | |
247 | if(!node->x.child[i]) goto error; | |
248 | ||
249 | if(node->x.child[i]->isLeaf) { //leaf | |
250 | nodeBlocks = overlapsLeaf(node->x.child[i], tid, start, end); | |
251 | } else { //non-leaf | |
252 | nodeBlocks = overlapsNonLeaf(fp, node->x.child[i], tid, start, end); | |
253 | } | |
254 | ||
255 | //The output is processed the same regardless of leaf/non-leaf | |
256 | if(!nodeBlocks) goto error; | |
257 | else { | |
258 | output = mergeOverlapBlocks(output, nodeBlocks); | |
259 | if(!output) { | |
260 | destroyBWOverlapBlock(nodeBlocks); | |
261 | goto error; | |
262 | } | |
263 | } | |
264 | } | |
265 | ||
266 | return output; | |
267 | ||
268 | error: | |
269 | destroyBWOverlapBlock(output); | |
270 | return NULL; | |
271 | } | |
272 | ||
273 | //Returns NULL and sets nOverlaps to >0 on error, otherwise nOverlaps is the number of file offsets returned | |
274 | //The output must be free()d | |
275 | bwOverlapBlock_t *walkRTreeNodes(bigWigFile_t *bw, bwRTreeNode_t *root, uint32_t tid, uint32_t start, uint32_t end) { | |
276 | if(root->isLeaf) return overlapsLeaf(root, tid, start, end); | |
277 | return overlapsNonLeaf(bw, root, tid, start, end); | |
278 | } | |
279 | ||
280 | //In reality, a hash or some sort of tree structure is probably faster... | |
281 | //Return -1 (AKA 0xFFFFFFFF...) on "not there", so we can hold (2^32)-1 items. | |
282 | uint32_t bwGetTid(bigWigFile_t *fp, char *chrom) { | |
283 | uint32_t i; | |
284 | if(!chrom) return -1; | |
285 | for(i=0; i<fp->cl->nKeys; i++) { | |
286 | if(strcmp(chrom, fp->cl->chrom[i]) == 0) return i; | |
287 | } | |
288 | return -1; | |
289 | } | |
290 | ||
291 | static bwOverlapBlock_t *bwGetOverlappingBlocks(bigWigFile_t *fp, char *chrom, uint32_t start, uint32_t end) { | |
292 | uint32_t tid = bwGetTid(fp, chrom); | |
293 | ||
294 | if(tid == (uint32_t) -1) { | |
295 | fprintf(stderr, "[bwGetOverlappingBlocks] Non-existent contig: %s\n", chrom); | |
296 | return NULL; | |
297 | } | |
298 | ||
299 | //Get the info if needed | |
300 | if(!fp->idx) { | |
301 | fp->idx = readRTreeIdx(fp, fp->hdr->indexOffset); | |
302 | if(!fp->idx) { | |
303 | return NULL; | |
304 | } | |
305 | } | |
306 | ||
307 | if(!fp->idx->root) fp->idx->root = bwGetRTreeNode(fp, 0); | |
308 | if(!fp->idx->root) return NULL; | |
309 | ||
310 | return walkRTreeNodes(fp, fp->idx->root, tid, start, end); | |
311 | } | |
312 | ||
313 | void bwFillDataHdr(bwDataHeader_t *hdr, void *b) { | |
314 | hdr->tid = ((uint32_t*)b)[0]; | |
315 | hdr->start = ((uint32_t*)b)[1]; | |
316 | hdr->end = ((uint32_t*)b)[2]; | |
317 | hdr->step = ((uint32_t*)b)[3]; | |
318 | hdr->span = ((uint32_t*)b)[4]; | |
319 | hdr->type = ((uint8_t*)b)[20]; | |
320 | hdr->nItems = ((uint16_t*)b)[11]; | |
321 | } | |
322 | ||
323 | void bwDestroyOverlappingIntervals(bwOverlappingIntervals_t *o) { | |
324 | if(!o) return; | |
325 | if(o->start) free(o->start); | |
326 | if(o->end) free(o->end); | |
327 | if(o->value) free(o->value); | |
328 | free(o); | |
329 | } | |
330 | ||
331 | void bbDestroyOverlappingEntries(bbOverlappingEntries_t *o) { | |
332 | uint32_t i; | |
333 | if(!o) return; | |
334 | if(o->start) free(o->start); | |
335 | if(o->end) free(o->end); | |
336 | if(o->str) { | |
337 | for(i=0; i<o->l; i++) { | |
338 | if(o->str[i]) free(o->str[i]); | |
339 | } | |
340 | free(o->str); | |
341 | } | |
342 | free(o); | |
343 | } | |
344 | ||
345 | //Returns NULL on error, in which case o has been free()d | |
346 | static bwOverlappingIntervals_t *pushIntervals(bwOverlappingIntervals_t *o, uint32_t start, uint32_t end, float value) { | |
347 | if(o->l+1 >= o->m) { | |
348 | o->m = roundup(o->l+1); | |
349 | o->start = realloc(o->start, o->m * sizeof(uint32_t)); | |
350 | if(!o->start) goto error; | |
351 | o->end = realloc(o->end, o->m * sizeof(uint32_t)); | |
352 | if(!o->end) goto error; | |
353 | o->value = realloc(o->value, o->m * sizeof(float)); | |
354 | if(!o->value) goto error; | |
355 | } | |
356 | o->start[o->l] = start; | |
357 | o->end[o->l] = end; | |
358 | o->value[o->l++] = value; | |
359 | return o; | |
360 | ||
361 | error: | |
362 | bwDestroyOverlappingIntervals(o); | |
363 | return NULL; | |
364 | } | |
365 | ||
366 | static bbOverlappingEntries_t *pushBBIntervals(bbOverlappingEntries_t *o, uint32_t start, uint32_t end, char *str, int withString) { | |
367 | if(o->l+1 >= o->m) { | |
368 | o->m = roundup(o->l+1); | |
369 | o->start = realloc(o->start, o->m * sizeof(uint32_t)); | |
370 | if(!o->start) goto error; | |
371 | o->end = realloc(o->end, o->m * sizeof(uint32_t)); | |
372 | if(!o->end) goto error; | |
373 | if(withString) { | |
374 | o->str = realloc(o->str, o->m * sizeof(char**)); | |
375 | if(!o->str) goto error; | |
376 | } | |
377 | } | |
378 | o->start[o->l] = start; | |
379 | o->end[o->l] = end; | |
380 | if(withString) o->str[o->l] = strdup(str); | |
381 | o->l++; | |
382 | return o; | |
383 | ||
384 | error: | |
385 | bbDestroyOverlappingEntries(o); | |
386 | return NULL; | |
387 | } | |
388 | ||
389 | //Returns NULL on error | |
390 | bwOverlappingIntervals_t *bwGetOverlappingIntervalsCore(bigWigFile_t *fp, bwOverlapBlock_t *o, uint32_t tid, uint32_t ostart, uint32_t oend) { | |
391 | uint64_t i; | |
392 | uint16_t j; | |
393 | int compressed = 0, rv; | |
394 | uLongf sz = fp->hdr->bufSize, tmp; | |
395 | void *buf = NULL, *compBuf = NULL; | |
396 | uint32_t start = 0, end , *p; | |
397 | float value; | |
398 | bwDataHeader_t hdr; | |
399 | bwOverlappingIntervals_t *output = calloc(1, sizeof(bwOverlappingIntervals_t)); | |
400 | ||
401 | if(!output) goto error; | |
402 | ||
403 | if(!o) return output; | |
404 | if(!o->n) return output; | |
405 | ||
406 | if(sz) { | |
407 | compressed = 1; | |
408 | buf = malloc(sz); | |
409 | } | |
410 | sz = 0; //This is now the size of the compressed buffer | |
411 | ||
412 | for(i=0; i<o->n; i++) { | |
413 | if(bwSetPos(fp, o->offset[i])) goto error; | |
414 | ||
415 | if(sz < o->size[i]) { | |
416 | compBuf = realloc(compBuf, o->size[i]); | |
417 | sz = o->size[i]; | |
418 | } | |
419 | if(!compBuf) goto error; | |
420 | ||
421 | if(bwRead(compBuf, o->size[i], 1, fp) != 1) goto error; | |
422 | if(compressed) { | |
423 | tmp = fp->hdr->bufSize; //This gets over-written by uncompress | |
424 | rv = uncompress(buf, (uLongf *) &tmp, compBuf, o->size[i]); | |
425 | if(rv != Z_OK) goto error; | |
426 | } else { | |
427 | buf = compBuf; | |
428 | } | |
429 | ||
430 | //TODO: ensure that tmp is large enough! | |
431 | bwFillDataHdr(&hdr, buf); | |
432 | ||
433 | p = ((uint32_t*) buf); | |
434 | p += 6; | |
435 | if(hdr.tid != tid) continue; | |
436 | ||
437 | if(hdr.type == 3) start = hdr.start - hdr.step; | |
438 | ||
439 | //FIXME: We should ensure that sz is large enough to hold nItems of the given type | |
440 | for(j=0; j<hdr.nItems; j++) { | |
441 | switch(hdr.type) { | |
442 | case 1: | |
443 | start = *p; | |
444 | p++; | |
445 | end = *p; | |
446 | p++; | |
447 | value = *((float *)p); | |
448 | p++; | |
449 | break; | |
450 | case 2: | |
451 | start = *p; | |
452 | p++; | |
453 | end = start + hdr.span; | |
454 | value = *((float *)p); | |
455 | p++; | |
456 | break; | |
457 | case 3: | |
458 | start += hdr.step; | |
459 | end = start+hdr.span; | |
460 | value = *((float *)p); | |
461 | p++; | |
462 | break; | |
463 | default : | |
464 | goto error; | |
465 | break; | |
466 | } | |
467 | ||
468 | if(end <= ostart || start >= oend) continue; | |
469 | //Push the overlap | |
470 | if(!pushIntervals(output, start, end, value)) goto error; | |
471 | } | |
472 | } | |
473 | ||
474 | if(compressed && buf) free(buf); | |
475 | if(compBuf) free(compBuf); | |
476 | return output; | |
477 | ||
478 | error: | |
479 | fprintf(stderr, "[bwGetOverlappingIntervalsCore] Got an error\n"); | |
480 | if(output) bwDestroyOverlappingIntervals(output); | |
481 | if(compressed && buf) free(buf); | |
482 | if(compBuf) free(compBuf); | |
483 | return NULL; | |
484 | } | |
485 | ||
486 | bbOverlappingEntries_t *bbGetOverlappingEntriesCore(bigWigFile_t *fp, bwOverlapBlock_t *o, uint32_t tid, uint32_t ostart, uint32_t oend, int withString) { | |
487 | uint64_t i; | |
488 | int compressed = 0, rv, slen; | |
489 | uLongf sz = fp->hdr->bufSize, tmp = 0; | |
490 | void *buf = NULL, *bufEnd = NULL, *compBuf = NULL; | |
491 | uint32_t entryTid = 0, start = 0, end; | |
492 | char *str; | |
493 | bbOverlappingEntries_t *output = calloc(1, sizeof(bbOverlappingEntries_t)); | |
494 | ||
495 | if(!output) goto error; | |
496 | ||
497 | if(!o) return output; | |
498 | if(!o->n) return output; | |
499 | ||
500 | if(sz) { | |
501 | compressed = 1; | |
502 | buf = malloc(sz); | |
503 | } | |
504 | sz = 0; //This is now the size of the compressed buffer | |
505 | ||
506 | for(i=0; i<o->n; i++) { | |
507 | if(bwSetPos(fp, o->offset[i])) goto error; | |
508 | ||
509 | if(sz < o->size[i]) { | |
510 | compBuf = realloc(compBuf, o->size[i]); | |
511 | sz = o->size[i]; | |
512 | } | |
513 | if(!compBuf) goto error; | |
514 | ||
515 | if(bwRead(compBuf, o->size[i], 1, fp) != 1) goto error; | |
516 | if(compressed) { | |
517 | tmp = fp->hdr->bufSize; //This gets over-written by uncompress | |
518 | rv = uncompress(buf, (uLongf *) &tmp, compBuf, o->size[i]); | |
519 | if(rv != Z_OK) goto error; | |
520 | } else { | |
521 | buf = compBuf; | |
522 | tmp = o->size[i]; //TODO: Is this correct? Do non-gzipped bigBeds exist? | |
523 | } | |
524 | ||
525 | bufEnd = buf + tmp; | |
526 | while(buf < bufEnd) { | |
527 | entryTid = ((uint32_t*)buf)[0]; | |
528 | start = ((uint32_t*)buf)[1]; | |
529 | end = ((uint32_t*)buf)[2]; | |
530 | buf += 12; | |
531 | str = (char*)buf; | |
532 | slen = strlen(str) + 1; | |
533 | buf += slen; | |
534 | ||
535 | if(entryTid < tid) continue; | |
536 | if(entryTid > tid) break; | |
537 | if(end <= ostart) continue; | |
538 | if(start >= oend) break; | |
539 | ||
540 | //Push the overlap | |
541 | if(!pushBBIntervals(output, start, end, str, withString)) goto error; | |
542 | } | |
543 | ||
544 | buf = bufEnd - tmp; //reset the buffer pointer | |
545 | } | |
546 | ||
547 | if(compressed && buf) free(buf); | |
548 | if(compBuf) free(compBuf); | |
549 | return output; | |
550 | ||
551 | error: | |
552 | fprintf(stderr, "[bbGetOverlappingEntriesCore] Got an error\n"); | |
553 | buf = bufEnd - tmp; | |
554 | if(output) bbDestroyOverlappingEntries(output); | |
555 | if(compressed && buf) free(buf); | |
556 | if(compBuf) free(compBuf); | |
557 | return NULL; | |
558 | } | |
559 | ||
560 | //Returns NULL on error OR no intervals, which is a bad design... | |
561 | bwOverlappingIntervals_t *bwGetOverlappingIntervals(bigWigFile_t *fp, char *chrom, uint32_t start, uint32_t end) { | |
562 | bwOverlappingIntervals_t *output; | |
563 | uint32_t tid = bwGetTid(fp, chrom); | |
564 | if(tid == (uint32_t) -1) return NULL; | |
565 | bwOverlapBlock_t *blocks = bwGetOverlappingBlocks(fp, chrom, start, end); | |
566 | if(!blocks) return NULL; | |
567 | output = bwGetOverlappingIntervalsCore(fp, blocks, tid, start, end); | |
568 | destroyBWOverlapBlock(blocks); | |
569 | return output; | |
570 | } | |
571 | ||
572 | //Like above, but for bigBed files | |
573 | bbOverlappingEntries_t *bbGetOverlappingEntries(bigWigFile_t *fp, char *chrom, uint32_t start, uint32_t end, int withString) { | |
574 | bbOverlappingEntries_t *output; | |
575 | uint32_t tid = bwGetTid(fp, chrom); | |
576 | if(tid == (uint32_t) -1) return NULL; | |
577 | bwOverlapBlock_t *blocks = bwGetOverlappingBlocks(fp, chrom, start, end); | |
578 | if(!blocks) return NULL; | |
579 | output = bbGetOverlappingEntriesCore(fp, blocks, tid, start, end, withString); | |
580 | destroyBWOverlapBlock(blocks); | |
581 | return output; | |
582 | } | |
583 | ||
584 | //Returns NULL on error | |
585 | bwOverlapIterator_t *bwOverlappingIntervalsIterator(bigWigFile_t *bw, char *chrom, uint32_t start, uint32_t end, uint32_t blocksPerIteration) { | |
586 | bwOverlapIterator_t *output = NULL; | |
587 | uint64_t n; | |
588 | uint32_t tid = bwGetTid(bw, chrom); | |
589 | if(tid == (uint32_t) -1) return output; | |
590 | output = calloc(1, sizeof(bwOverlapIterator_t)); | |
591 | if(!output) return output; | |
592 | bwOverlapBlock_t *blocks = bwGetOverlappingBlocks(bw, chrom, start, end); | |
593 | ||
594 | output->bw = bw; | |
595 | output->tid = tid; | |
596 | output->start = start; | |
597 | output->end = end; | |
598 | output->blocks = blocks; | |
599 | output->blocksPerIteration = blocksPerIteration; | |
600 | ||
601 | if(blocks) { | |
602 | n = blocks->n; | |
603 | if(n>blocksPerIteration) blocks->n = blocksPerIteration; | |
604 | output->intervals = bwGetOverlappingIntervalsCore(bw, blocks,tid, start, end); | |
605 | blocks->n = n; | |
606 | output->offset = blocksPerIteration; | |
607 | } | |
608 | output->data = output->intervals; | |
609 | return output; | |
610 | } | |
611 | ||
612 | //Returns NULL on error | |
613 | bwOverlapIterator_t *bbOverlappingEntriesIterator(bigWigFile_t *bw, char *chrom, uint32_t start, uint32_t end, int withString, uint32_t blocksPerIteration) { | |
614 | bwOverlapIterator_t *output = NULL; | |
615 | uint64_t n; | |
616 | uint32_t tid = bwGetTid(bw, chrom); | |
617 | if(tid == (uint32_t) -1) return output; | |
618 | output = calloc(1, sizeof(bwOverlapIterator_t)); | |
619 | if(!output) return output; | |
620 | bwOverlapBlock_t *blocks = bwGetOverlappingBlocks(bw, chrom, start, end); | |
621 | ||
622 | output->bw = bw; | |
623 | output->tid = tid; | |
624 | output->start = start; | |
625 | output->end = end; | |
626 | output->blocks = blocks; | |
627 | output->blocksPerIteration = blocksPerIteration; | |
628 | output->withString = withString; | |
629 | ||
630 | if(blocks) { | |
631 | n = blocks->n; | |
632 | if(n>blocksPerIteration) blocks->n = blocksPerIteration; | |
633 | output->entries = bbGetOverlappingEntriesCore(bw, blocks,tid, start, end, withString); | |
634 | blocks->n = n; | |
635 | output->offset = blocksPerIteration; | |
636 | } | |
637 | output->data = output->entries; | |
638 | return output; | |
639 | } | |
640 | ||
641 | void bwIteratorDestroy(bwOverlapIterator_t *iter) { | |
642 | if(!iter) return; | |
643 | if(iter->blocks) destroyBWOverlapBlock((bwOverlapBlock_t*) iter->blocks); | |
644 | if(iter->intervals) bwDestroyOverlappingIntervals(iter->intervals); | |
645 | if(iter->entries) bbDestroyOverlappingEntries(iter->entries); | |
646 | free(iter); | |
647 | } | |
648 | ||
649 | //On error, points to NULL and destroys the input | |
650 | bwOverlapIterator_t *bwIteratorNext(bwOverlapIterator_t *iter) { | |
651 | uint64_t n, *offset, *size; | |
652 | bwOverlapBlock_t *blocks = iter->blocks; | |
653 | ||
654 | if(iter->intervals) { | |
655 | bwDestroyOverlappingIntervals(iter->intervals); | |
656 | iter->intervals = NULL; | |
657 | } | |
658 | if(iter->entries) { | |
659 | bbDestroyOverlappingEntries(iter->entries); | |
660 | iter->entries = NULL; | |
661 | } | |
662 | iter->data = NULL; | |
663 | ||
664 | if(iter->offset < blocks->n) { | |
665 | //store the previous values | |
666 | n = blocks->n; | |
667 | offset = blocks->offset; | |
668 | size = blocks->size; | |
669 | ||
670 | //Move the start of the blocks | |
671 | blocks->offset += iter->offset; | |
672 | blocks->size += iter->offset; | |
673 | if(iter->offset + iter->blocksPerIteration > n) { | |
674 | blocks->n = blocks->n - iter->offset; | |
675 | } else { | |
676 | blocks->n = iter->blocksPerIteration; | |
677 | } | |
678 | ||
679 | //Get the intervals or entries, as appropriate | |
680 | if(iter->bw->type == 0) { | |
681 | //bigWig | |
682 | iter->intervals = bwGetOverlappingIntervalsCore(iter->bw, blocks, iter->tid, iter->start, iter->end); | |
683 | iter->data = iter->intervals; | |
684 | } else { | |
685 | //bigBed | |
686 | iter->entries = bbGetOverlappingEntriesCore(iter->bw, blocks, iter->tid, iter->start, iter->end, iter->withString); | |
687 | iter->data = iter->entries; | |
688 | } | |
689 | iter->offset += iter->blocksPerIteration; | |
690 | ||
691 | //reset the values in iter->blocks | |
692 | blocks->n = n; | |
693 | blocks->offset = offset; | |
694 | blocks->size = size; | |
695 | ||
696 | //Check for error | |
697 | if(!iter->intervals && !iter->entries) goto error; | |
698 | } | |
699 | ||
700 | return iter; | |
701 | ||
702 | error: | |
703 | bwIteratorDestroy(iter); | |
704 | return NULL; | |
705 | } | |
706 | ||
707 | //This is like bwGetOverlappingIntervals, except it returns 1 base windows. If includeNA is not 0, then a value will be returned for every position in the range (defaulting to NAN). | |
708 | //The ->end member is NULL | |
709 | //If includeNA is not 0 then ->start is also NULL, since it's implied | |
710 | //Note that bwDestroyOverlappingIntervals() will work in either case | |
711 | bwOverlappingIntervals_t *bwGetValues(bigWigFile_t *fp, char *chrom, uint32_t start, uint32_t end, int includeNA) { | |
712 | uint32_t i, j, n; | |
713 | bwOverlappingIntervals_t *output = NULL; | |
714 | bwOverlappingIntervals_t *intermediate = bwGetOverlappingIntervals(fp, chrom, start, end); | |
715 | if(!intermediate) return NULL; | |
716 | ||
717 | output = calloc(1, sizeof(bwOverlappingIntervals_t)); | |
718 | if(!output) goto error; | |
719 | if(includeNA) { | |
720 | output->l = end-start; | |
721 | output->value = malloc((end-start)*sizeof(float)); | |
722 | if(!output->value) goto error; | |
723 | for(i=0; i<end-start; i++) output->value[i] = strtod("NaN", NULL); | |
724 | for(i=0; i<intermediate->l; i++) { | |
725 | for(j=intermediate->start[i]; j<intermediate->end[i]; j++) { | |
726 | if(j < start || j >= end) continue; | |
727 | output->value[j-start] = intermediate->value[i]; | |
728 | } | |
729 | } | |
730 | } else { | |
731 | n = 0; | |
732 | for(i=0; i<intermediate->l; i++) { | |
733 | if(intermediate->start[i] < start) intermediate->start[i] = start; | |
734 | if(intermediate->end[i] > end) intermediate->end[i] = end; | |
735 | n += intermediate->end[i]-intermediate->start[i]; | |
736 | } | |
737 | output->l = n; | |
738 | output->start = malloc(sizeof(uint32_t)*n); | |
739 | if(!output->start) goto error; | |
740 | output->value = malloc(sizeof(float)*n); | |
741 | if(!output->value) goto error; | |
742 | n = 0; //this is now the index | |
743 | for(i=0; i<intermediate->l; i++) { | |
744 | for(j=intermediate->start[i]; j<intermediate->end[i]; j++) { | |
745 | if(j < start || j >= end) continue; | |
746 | output->start[n] = j; | |
747 | output->value[n++] = intermediate->value[i]; | |
748 | } | |
749 | } | |
750 | } | |
751 | ||
752 | bwDestroyOverlappingIntervals(intermediate); | |
753 | return output; | |
754 | ||
755 | error: | |
756 | if(intermediate) bwDestroyOverlappingIntervals(intermediate); | |
757 | if(output) bwDestroyOverlappingIntervals(output); | |
758 | return NULL; | |
759 | } | |
760 | ||
761 | void bwDestroyIndexNode(bwRTreeNode_t *node) { | |
762 | uint16_t i; | |
763 | ||
764 | if(!node) return; | |
765 | ||
766 | free(node->chrIdxStart); | |
767 | free(node->baseStart); | |
768 | free(node->chrIdxEnd); | |
769 | free(node->baseEnd); | |
770 | free(node->dataOffset); | |
771 | if(!node->isLeaf) { | |
772 | for(i=0; i<node->nChildren; i++) { | |
773 | bwDestroyIndexNode(node->x.child[i]); | |
774 | } | |
775 | free(node->x.child); | |
776 | } else { | |
777 | free(node->x.size); | |
778 | } | |
779 | free(node); | |
780 | } | |
781 | ||
782 | void bwDestroyIndex(bwRTree_t *idx) { | |
783 | bwDestroyIndexNode(idx->root); | |
784 | free(idx); | |
785 | } | |
786 | ||
787 | //Returns a pointer to the requested index (@offset, unless it's 0, in which case the index for the values is returned | |
788 | //Returns NULL on error | |
789 | bwRTree_t *bwReadIndex(bigWigFile_t *fp, uint64_t offset) { | |
790 | bwRTree_t *idx = readRTreeIdx(fp, offset); | |
791 | if(!idx) return NULL; | |
792 | ||
793 | //Read in the root node | |
794 | idx->root = bwGetRTreeNode(fp, idx->rootOffset); | |
795 | ||
796 | if(!idx->root) { | |
797 | bwDestroyIndex(idx); | |
798 | return NULL; | |
799 | } | |
800 | return idx; | |
801 | } |
0 | #ifndef LIBBIGWIG_VALUES_H | |
1 | #define LIBBIGWIG_VALUES_H | |
2 | ||
3 | #include <inttypes.h> | |
4 | /*! \file bwValues.h | |
5 | * | |
6 | * You should not directly use functions and structures defined here. They're really meant for internal use only. | |
7 | * | |
8 | * All of the structures here need to be destroyed or you'll leak memory! There are methods available to destroy anything that you need to take care of yourself. | |
9 | */ | |
10 | ||
11 | //N.B., coordinates are still 0-based half open! | |
12 | /*! | |
13 | * @brief A node within an R-tree holding the index for data. | |
14 | * | |
15 | * Note that there are two types of nodes: leaf and twig. Leaf nodes point to where data actually is. Twig nodes point to additional index nodes, which may or may not be leaves. Each of these nodes has additional children, which may span multiple chromosomes/contigs. | |
16 | * | |
17 | * With the start/end position, these positions refer specifically to the chromosomes specified in chrIdxStart/chrIdxEnd. Any chromosomes between these are completely spanned by a given child. | |
18 | */ | |
19 | typedef struct bwRTreeNode_t { | |
20 | uint8_t isLeaf; /**<Is this node a leaf?*/ | |
21 | //1 byte of padding | |
22 | uint16_t nChildren; /**<The number of children of this node, all lists have this length.*/ | |
23 | uint32_t *chrIdxStart; /**<A list of the starting chromosome indices of each child.*/ | |
24 | uint32_t *baseStart; /**<A list of the start position of each child.*/ | |
25 | uint32_t *chrIdxEnd; /**<A list of the end chromosome indices of each child.*/ | |
26 | uint32_t *baseEnd; /**<A list of the end position of each child.*/ | |
27 | uint64_t *dataOffset; /**<For leaves, the offset to the on-disk data. For twigs, the offset to the child node.*/ | |
28 | union { | |
29 | uint64_t *size; /**<Leaves only: The size of the data block.*/ | |
30 | struct bwRTreeNode_t **child; /**<Twigs only: The child node(s).*/ | |
31 | } x; /**<A union holding either size or child*/ | |
32 | } bwRTreeNode_t; | |
33 | ||
34 | /*! | |
35 | * A header and index that points to an R-tree that in turn points to data blocks. | |
36 | */ | |
37 | //TODO rootOffset is pointless, it's 48bytes after the indexOffset | |
38 | typedef struct { | |
39 | uint32_t blockSize; /**<The maximum number of children a node can have*/ | |
40 | uint64_t nItems; /**<The total number of data blocks pointed to by the tree. This is completely redundant.*/ | |
41 | uint32_t chrIdxStart; /**<The index to the first chromosome described.*/ | |
42 | uint32_t baseStart; /**<The first position on chrIdxStart with a value.*/ | |
43 | uint32_t chrIdxEnd; /**<The index of the last chromosome with an entry.*/ | |
44 | uint32_t baseEnd; /**<The last position on chrIdxEnd with an entry.*/ | |
45 | uint64_t idxSize; /**<This is actually the offset of the index rather than the size?!? Yes, it's completely redundant.*/ | |
46 | uint32_t nItemsPerSlot; /**<This is always 1!*/ | |
47 | //There's 4 bytes of padding in the file here | |
48 | uint64_t rootOffset; /**<The offset to the root node of the R-Tree (on disk). Yes, this is redundant.*/ | |
49 | bwRTreeNode_t *root; /**<A pointer to the root node.*/ | |
50 | } bwRTree_t; | |
51 | ||
52 | /*! | |
53 | * @brief This structure holds the data blocks that overlap a given interval. | |
54 | */ | |
55 | typedef struct { | |
56 | uint64_t n; /**<The number of blocks that overlap. This *MAY* be 0!.*/ | |
57 | uint64_t *offset; /**<The offset to the on-disk position of the block.*/ | |
58 | uint64_t *size; /**<The size of each block on disk (in bytes).*/ | |
59 | } bwOverlapBlock_t; | |
60 | ||
61 | /*! | |
62 | * @brief The header section of a given data block. | |
63 | * | |
64 | * There are 3 types of data blocks in bigWig files, each with slightly different needs. This is all taken care of internally. | |
65 | */ | |
66 | typedef struct { | |
67 | uint32_t tid; /**<The chromosome ID.*/ | |
68 | uint32_t start; /**<The start position of a block*/ | |
69 | uint32_t end; /**<The end position of a block*/ | |
70 | uint32_t step; /**<The step size of the values*/ | |
71 | uint32_t span; /**<The span of each data value*/ | |
72 | uint8_t type; /**<The block type: 1, bedGraph; 2, variable step; 3, fixed step.*/ | |
73 | uint16_t nItems; /**<The number of values in a given block.*/ | |
74 | } bwDataHeader_t; | |
75 | ||
76 | #endif // LIBBIGWIG_VALUES_H |
0 | #include <limits.h> | |
1 | #include <float.h> | |
2 | #include <stdlib.h> | |
3 | #include <string.h> | |
4 | #include <math.h> | |
5 | #include "bigWig.h" | |
6 | #include "bwCommon.h" | |
7 | ||
8 | /// @cond SKIP | |
9 | struct val_t { | |
10 | uint32_t tid; | |
11 | uint32_t start; | |
12 | uint32_t nBases; | |
13 | float min, max, sum, sumsq; | |
14 | double scalar; | |
15 | struct val_t *next; | |
16 | }; | |
17 | /// @endcond | |
18 | ||
19 | //Create a chromList_t and attach it to a bigWigFile_t *. Returns NULL on error | |
20 | //Note that chroms and lengths are duplicated, so you MUST free the input | |
21 | chromList_t *bwCreateChromList(char **chroms, uint32_t *lengths, int64_t n) { | |
22 | int64_t i = 0; | |
23 | chromList_t *cl = calloc(1, sizeof(chromList_t)); | |
24 | if(!cl) return NULL; | |
25 | ||
26 | cl->nKeys = n; | |
27 | cl->chrom = malloc(sizeof(char*)*n); | |
28 | cl->len = malloc(sizeof(uint32_t)*n); | |
29 | if(!cl->chrom) goto error; | |
30 | if(!cl->len) goto error; | |
31 | ||
32 | for(i=0; i<n; i++) { | |
33 | cl->len[i] = lengths[i]; | |
34 | cl->chrom[i] = strdup(chroms[i]); | |
35 | if(!cl->chrom[i]) goto error; | |
36 | } | |
37 | ||
38 | return cl; | |
39 | ||
40 | error: | |
41 | if(i) { | |
42 | int64_t j; | |
43 | for(j=0; j<i; j++) free(cl->chrom[j]); | |
44 | } | |
45 | if(cl) { | |
46 | if(cl->chrom) free(cl->chrom); | |
47 | if(cl->len) free(cl->len); | |
48 | free(cl); | |
49 | } | |
50 | return NULL; | |
51 | } | |
52 | ||
53 | //If maxZooms == 0, then 0 is used (i.e., there are no zoom levels). If maxZooms < 0 or > 65535 then 10 is used. | |
54 | //TODO allow changing bufSize and blockSize | |
55 | int bwCreateHdr(bigWigFile_t *fp, int32_t maxZooms) { | |
56 | if(!fp->isWrite) return 1; | |
57 | bigWigHdr_t *hdr = calloc(1, sizeof(bigWigHdr_t)); | |
58 | if(!hdr) return 2; | |
59 | ||
60 | hdr->version = 4; | |
61 | if(maxZooms < 0 || maxZooms > 65535) { | |
62 | hdr->nLevels = 10; | |
63 | } else { | |
64 | hdr->nLevels = maxZooms; | |
65 | } | |
66 | ||
67 | hdr->bufSize = 32768; //When the file is finalized this is reset if fp->writeBuffer->compressPsz is 0! | |
68 | hdr->minVal = DBL_MAX; | |
69 | hdr->maxVal = DBL_MIN; | |
70 | fp->hdr = hdr; | |
71 | fp->writeBuffer->blockSize = 64; | |
72 | ||
73 | //Allocate the writeBuffer buffers | |
74 | fp->writeBuffer->compressPsz = compressBound(hdr->bufSize); | |
75 | fp->writeBuffer->compressP = malloc(fp->writeBuffer->compressPsz); | |
76 | if(!fp->writeBuffer->compressP) return 3; | |
77 | fp->writeBuffer->p = calloc(1,hdr->bufSize); | |
78 | if(!fp->writeBuffer->p) return 4; | |
79 | ||
80 | return 0; | |
81 | } | |
82 | ||
83 | //return 0 on success | |
84 | static int writeAtPos(void *ptr, size_t sz, size_t nmemb, size_t pos, FILE *fp) { | |
85 | size_t curpos = ftell(fp); | |
86 | if(fseek(fp, pos, SEEK_SET)) return 1; | |
87 | if(fwrite(ptr, sz, nmemb, fp) != nmemb) return 2; | |
88 | if(fseek(fp, curpos, SEEK_SET)) return 3; | |
89 | return 0; | |
90 | } | |
91 | ||
92 | //We lose keySize bytes on error | |
93 | static int writeChromList(FILE *fp, chromList_t *cl) { | |
94 | uint16_t k; | |
95 | uint32_t j, magic = CIRTREE_MAGIC; | |
96 | uint32_t nperblock = (cl->nKeys > 0x7FFF) ? 0x7FFF : cl->nKeys; //Items per leaf/non-leaf, there are no unsigned ints in java :( | |
97 | uint32_t nblocks, keySize = 0, valSize = 8; //In theory valSize could be optimized, in practice that'd be annoying | |
98 | uint64_t i, nonLeafEnd, leafSize, nextLeaf; | |
99 | uint8_t eight; | |
100 | int64_t i64; | |
101 | char *chrom; | |
102 | size_t l; | |
103 | ||
104 | if(cl->nKeys > 1073676289) { | |
105 | fprintf(stderr, "[writeChromList] Error: Currently only 1,073,676,289 contigs are supported. If you really need more then please post a request on github.\n"); | |
106 | return 1; | |
107 | } | |
108 | nblocks = cl->nKeys/nperblock; | |
109 | nblocks += ((cl->nKeys % nperblock) > 0)?1:0; | |
110 | ||
111 | for(i64=0; i64<cl->nKeys; i64++) { | |
112 | l = strlen(cl->chrom[i64]); | |
113 | if(l>keySize) keySize = l; | |
114 | } | |
115 | l--; //We don't null terminate strings, because schiess mich tot | |
116 | chrom = calloc(keySize, sizeof(char)); | |
117 | ||
118 | //Write the root node of a largely pointless tree | |
119 | if(fwrite(&magic, sizeof(uint32_t), 1, fp) != 1) return 1; | |
120 | if(fwrite(&nperblock, sizeof(uint32_t), 1, fp) != 1) return 2; | |
121 | if(fwrite(&keySize, sizeof(uint32_t), 1, fp) != 1) return 3; | |
122 | if(fwrite(&valSize, sizeof(uint32_t), 1, fp) != 1) return 4; | |
123 | if(fwrite(&(cl->nKeys), sizeof(uint64_t), 1, fp) != 1) return 5; | |
124 | ||
125 | //Padding? | |
126 | i=0; | |
127 | if(fwrite(&i, sizeof(uint64_t), 1, fp) != 1) return 6; | |
128 | ||
129 | //Do we need a non-leaf node? | |
130 | if(nblocks > 1) { | |
131 | eight = 0; | |
132 | if(fwrite(&eight, sizeof(uint8_t), 1, fp) != 1) return 7; | |
133 | if(fwrite(&eight, sizeof(uint8_t), 1, fp) != 1) return 8; //padding | |
134 | if(fwrite(&nblocks, sizeof(uint16_t), 1, fp) != 1) return 8; | |
135 | nonLeafEnd = ftell(fp) + nperblock * (keySize + 8); | |
136 | leafSize = nperblock * (keySize + 8) + 4; | |
137 | for(i=0; i<nblocks; i++) { //Why yes, this is pointless | |
138 | chrom = strncpy(chrom, cl->chrom[i * nperblock], keySize); | |
139 | nextLeaf = nonLeafEnd + i * leafSize; | |
140 | if(fwrite(chrom, keySize, 1, fp) != 1) return 9; | |
141 | if(fwrite(&nextLeaf, sizeof(uint64_t), 1, fp) != 1) return 10; | |
142 | } | |
143 | for(i=0; i<keySize; i++) chrom[i] = '\0'; | |
144 | nextLeaf = 0; | |
145 | for(i=nblocks; i<nperblock; i++) { | |
146 | if(fwrite(chrom, keySize, 1, fp) != 1) return 9; | |
147 | if(fwrite(&nextLeaf, sizeof(uint64_t), 1, fp) != 1) return 10; | |
148 | } | |
149 | } | |
150 | ||
151 | //Write the leaves | |
152 | nextLeaf = 0; | |
153 | for(i=0, j=0; i<nblocks; i++) { | |
154 | eight = 1; | |
155 | if(fwrite(&eight, sizeof(uint8_t), 1, fp) != 1) return 11; | |
156 | eight = 0; | |
157 | if(fwrite(&eight, sizeof(uint8_t), 1, fp) != 1) return 12; | |
158 | if(cl->nKeys - j < nperblock) { | |
159 | k = cl->nKeys - j; | |
160 | if(fwrite(&k, sizeof(uint16_t), 1, fp) != 1) return 13; | |
161 | } else { | |
162 | if(fwrite(&nperblock, sizeof(uint16_t), 1, fp) != 1) return 13; | |
163 | } | |
164 | for(k=0; k<nperblock; k++) { | |
165 | if(j>=cl->nKeys) { | |
166 | if(chrom[0]) { | |
167 | for(l=0; l<keySize; l++) chrom[l] = '\0'; | |
168 | } | |
169 | if(fwrite(chrom, keySize, 1, fp) != 1) return 15; | |
170 | if(fwrite(&nextLeaf, sizeof(uint64_t), 1, fp) != 1) return 16; | |
171 | } else { | |
172 | chrom = strncpy(chrom, cl->chrom[j], keySize); | |
173 | if(fwrite(chrom, keySize, 1, fp) != 1) return 15; | |
174 | if(fwrite(&j, sizeof(uint32_t), 1, fp) != 1) return 16; | |
175 | if(fwrite(&(cl->len[j++]), sizeof(uint32_t), 1, fp) != 1) return 17; | |
176 | } | |
177 | } | |
178 | } | |
179 | ||
180 | free(chrom); | |
181 | return 0; | |
182 | } | |
183 | ||
184 | //returns 0 on success | |
185 | //Still need to fill in indexOffset | |
186 | int bwWriteHdr(bigWigFile_t *bw) { | |
187 | uint32_t magic = BIGWIG_MAGIC; | |
188 | uint16_t two = 4; | |
189 | FILE *fp; | |
190 | void *p = calloc(58, sizeof(uint8_t)); //58 bytes of nothing | |
191 | if(!bw->isWrite) return 1; | |
192 | ||
193 | //The header itself, largely just reserving space... | |
194 | fp = bw->URL->x.fp; | |
195 | if(!fp) return 2; | |
196 | if(fseek(fp, 0, SEEK_SET)) return 3; | |
197 | if(fwrite(&magic, sizeof(uint32_t), 1, fp) != 1) return 4; | |
198 | if(fwrite(&two, sizeof(uint16_t), 1, fp) != 1) return 5; | |
199 | if(fwrite(p, sizeof(uint8_t), 58, fp) != 58) return 6; | |
200 | ||
201 | //Empty zoom headers | |
202 | if(bw->hdr->nLevels) { | |
203 | for(two=0; two<bw->hdr->nLevels; two++) { | |
204 | if(fwrite(p, sizeof(uint8_t), 24, fp) != 24) return 9; | |
205 | } | |
206 | } | |
207 | ||
208 | //Update summaryOffset and write an empty summary block | |
209 | bw->hdr->summaryOffset = ftell(fp); | |
210 | if(fwrite(p, sizeof(uint8_t), 40, fp) != 40) return 10; | |
211 | if(writeAtPos(&(bw->hdr->summaryOffset), sizeof(uint64_t), 1, 0x2c, fp)) return 11; | |
212 | ||
213 | //Write the chromosome list as a stupid freaking tree (because let's TREE ALL THE THINGS!!!) | |
214 | bw->hdr->ctOffset = ftell(fp); | |
215 | if(writeChromList(fp, bw->cl)) return 7; | |
216 | if(writeAtPos(&(bw->hdr->ctOffset), sizeof(uint64_t), 1, 0x8, fp)) return 8; | |
217 | ||
218 | //Update the dataOffset | |
219 | bw->hdr->dataOffset = ftell(fp); | |
220 | if(writeAtPos(&bw->hdr->dataOffset, sizeof(uint64_t), 1, 0x10, fp)) return 12; | |
221 | ||
222 | //Save space for the number of blocks | |
223 | if(fwrite(p, sizeof(uint8_t), 8, fp) != 8) return 13; | |
224 | ||
225 | free(p); | |
226 | return 0; | |
227 | } | |
228 | ||
229 | static int insertIndexNode(bigWigFile_t *fp, bwRTreeNode_t *leaf) { | |
230 | bwLL *l = malloc(sizeof(bwLL)); | |
231 | if(!l) return 1; | |
232 | l->node = leaf; | |
233 | l->next = NULL; | |
234 | ||
235 | if(!fp->writeBuffer->firstIndexNode) { | |
236 | fp->writeBuffer->firstIndexNode = l; | |
237 | } else { | |
238 | fp->writeBuffer->currentIndexNode->next = l; | |
239 | } | |
240 | fp->writeBuffer->currentIndexNode = l; | |
241 | return 0; | |
242 | } | |
243 | ||
244 | //0 on success | |
245 | static int appendIndexNodeEntry(bigWigFile_t *fp, uint32_t tid0, uint32_t tid1, uint32_t start, uint32_t end, uint64_t offset, uint64_t size) { | |
246 | bwLL *n = fp->writeBuffer->currentIndexNode; | |
247 | if(!n) return 1; | |
248 | if(n->node->nChildren >= fp->writeBuffer->blockSize) return 2; | |
249 | ||
250 | n->node->chrIdxStart[n->node->nChildren] = tid0; | |
251 | n->node->baseStart[n->node->nChildren] = start; | |
252 | n->node->chrIdxEnd[n->node->nChildren] = tid1; | |
253 | n->node->baseEnd[n->node->nChildren] = end; | |
254 | n->node->dataOffset[n->node->nChildren] = offset; | |
255 | n->node->x.size[n->node->nChildren] = size; | |
256 | n->node->nChildren++; | |
257 | return 0; | |
258 | } | |
259 | ||
260 | //Returns 0 on success | |
261 | static int addIndexEntry(bigWigFile_t *fp, uint32_t tid0, uint32_t tid1, uint32_t start, uint32_t end, uint64_t offset, uint64_t size) { | |
262 | bwRTreeNode_t *node; | |
263 | ||
264 | if(appendIndexNodeEntry(fp, tid0, tid1, start, end, offset, size)) { | |
265 | //The last index node is full, we need to add a new one | |
266 | node = calloc(1, sizeof(bwRTreeNode_t)); | |
267 | if(!node) return 1; | |
268 | ||
269 | //Allocate and set the fields | |
270 | node->isLeaf = 1; | |
271 | node->nChildren = 1; | |
272 | node->chrIdxStart = malloc(sizeof(uint32_t)*fp->writeBuffer->blockSize); | |
273 | if(!node->chrIdxStart) goto error; | |
274 | node->baseStart = malloc(sizeof(uint32_t)*fp->writeBuffer->blockSize); | |
275 | if(!node->baseStart) goto error; | |
276 | node->chrIdxEnd = malloc(sizeof(uint32_t)*fp->writeBuffer->blockSize); | |
277 | if(!node->chrIdxEnd) goto error; | |
278 | node->baseEnd = malloc(sizeof(uint32_t)*fp->writeBuffer->blockSize); | |
279 | if(!node->baseEnd) goto error; | |
280 | node->dataOffset = malloc(sizeof(uint64_t)*fp->writeBuffer->blockSize); | |
281 | if(!node->dataOffset) goto error; | |
282 | node->x.size = malloc(sizeof(uint64_t)*fp->writeBuffer->blockSize); | |
283 | if(!node->x.size) goto error; | |
284 | ||
285 | node->chrIdxStart[0] = tid0; | |
286 | node->baseStart[0] = start; | |
287 | node->chrIdxEnd[0] = tid1; | |
288 | node->baseEnd[0] = end; | |
289 | node->dataOffset[0] = offset; | |
290 | node->x.size[0] = size; | |
291 | ||
292 | if(insertIndexNode(fp, node)) goto error; | |
293 | } | |
294 | ||
295 | return 0; | |
296 | ||
297 | error: | |
298 | if(node->chrIdxStart) free(node->chrIdxStart); | |
299 | if(node->baseStart) free(node->baseStart); | |
300 | if(node->chrIdxEnd) free(node->chrIdxEnd); | |
301 | if(node->baseEnd) free(node->baseEnd); | |
302 | if(node->dataOffset) free(node->dataOffset); | |
303 | if(node->x.size) free(node->x.size); | |
304 | return 2; | |
305 | } | |
306 | ||
307 | /* | |
308 | * TODO: | |
309 | * The buffer size and compression sz need to be determined elsewhere (and p and compressP filled in!) | |
310 | */ | |
311 | static int flushBuffer(bigWigFile_t *fp) { | |
312 | bwWriteBuffer_t *wb = fp->writeBuffer; | |
313 | uLongf sz = wb->compressPsz; | |
314 | uint16_t nItems; | |
315 | if(!fp->writeBuffer->l) return 0; | |
316 | if(!wb->ltype) return 0; | |
317 | ||
318 | //Fill in the header | |
319 | if(!memcpy(wb->p, &(wb->tid), sizeof(uint32_t))) return 1; | |
320 | if(!memcpy(wb->p+4, &(wb->start), sizeof(uint32_t))) return 2; | |
321 | if(!memcpy(wb->p+8, &(wb->end), sizeof(uint32_t))) return 3; | |
322 | if(!memcpy(wb->p+12, &(wb->step), sizeof(uint32_t))) return 4; | |
323 | if(!memcpy(wb->p+16, &(wb->span), sizeof(uint32_t))) return 5; | |
324 | if(!memcpy(wb->p+20, &(wb->ltype), sizeof(uint8_t))) return 6; | |
325 | //1 byte padding | |
326 | //Determine the number of items | |
327 | switch(wb->ltype) { | |
328 | case 1: | |
329 | nItems = (wb->l-24)/12; | |
330 | break; | |
331 | case 2: | |
332 | nItems = (wb->l-24)/8; | |
333 | break; | |
334 | case 3: | |
335 | nItems = (wb->l-24)/4; | |
336 | break; | |
337 | default: | |
338 | return 7; | |
339 | } | |
340 | if(!memcpy(wb->p+22, &nItems, sizeof(uint16_t))) return 8; | |
341 | ||
342 | if(sz) { | |
343 | //compress | |
344 | if(compress(wb->compressP, &sz, wb->p, wb->l) != Z_OK) return 9; | |
345 | ||
346 | //write the data to disk | |
347 | if(fwrite(wb->compressP, sizeof(uint8_t), sz, fp->URL->x.fp) != sz) return 10; | |
348 | } else { | |
349 | sz = wb->l; | |
350 | if(fwrite(wb->p, sizeof(uint8_t), wb->l, fp->URL->x.fp) != wb->l) return 10; | |
351 | } | |
352 | ||
353 | //Add an entry into the index | |
354 | if(addIndexEntry(fp, wb->tid, wb->tid, wb->start, wb->end, bwTell(fp)-sz, sz)) return 11; | |
355 | ||
356 | wb->nBlocks++; | |
357 | wb->l = 24; | |
358 | return 0; | |
359 | } | |
360 | ||
361 | static void updateStats(bigWigFile_t *fp, uint32_t span, float val) { | |
362 | if(val < fp->hdr->minVal) fp->hdr->minVal = val; | |
363 | else if(val > fp->hdr->maxVal) fp->hdr->maxVal = val; | |
364 | fp->hdr->nBasesCovered += span; | |
365 | fp->hdr->sumData += span*val; | |
366 | fp->hdr->sumSquared += span*pow(val,2); | |
367 | ||
368 | fp->writeBuffer->nEntries++; | |
369 | fp->writeBuffer->runningWidthSum += span; | |
370 | } | |
371 | ||
372 | //12 bytes per entry | |
373 | int bwAddIntervals(bigWigFile_t *fp, char **chrom, uint32_t *start, uint32_t *end, float *values, uint32_t n) { | |
374 | uint32_t tid = 0, i; | |
375 | char *lastChrom = NULL; | |
376 | bwWriteBuffer_t *wb = fp->writeBuffer; | |
377 | if(!n) return 0; //Not an error per se | |
378 | if(!fp->isWrite) return 1; | |
379 | if(!wb) return 2; | |
380 | ||
381 | //Flush if needed | |
382 | if(wb->ltype != 1) if(flushBuffer(fp)) return 3; | |
383 | if(wb->l+36 > fp->hdr->bufSize) if(flushBuffer(fp)) return 4; | |
384 | lastChrom = chrom[0]; | |
385 | tid = bwGetTid(fp, chrom[0]); | |
386 | if(tid == (uint32_t) -1) return 5; | |
387 | if(tid != wb->tid) { | |
388 | if(flushBuffer(fp)) return 6; | |
389 | wb->tid = tid; | |
390 | wb->start = start[0]; | |
391 | wb->end = end[0]; | |
392 | } | |
393 | ||
394 | //Ensure that everything is set correctly | |
395 | wb->ltype = 1; | |
396 | if(wb->l <= 24) { | |
397 | wb->start = start[0]; | |
398 | wb->span = 0; | |
399 | wb->step = 0; | |
400 | } | |
401 | if(!memcpy(wb->p+wb->l, start, sizeof(uint32_t))) return 7; | |
402 | if(!memcpy(wb->p+wb->l+4, end, sizeof(uint32_t))) return 8; | |
403 | if(!memcpy(wb->p+wb->l+8, values, sizeof(float))) return 9; | |
404 | updateStats(fp, end[0]-start[0], values[0]); | |
405 | wb->l += 12; | |
406 | ||
407 | for(i=1; i<n; i++) { | |
408 | if(strcmp(chrom[i],lastChrom) != 0) { | |
409 | wb->end = end[i-1]; | |
410 | flushBuffer(fp); | |
411 | lastChrom = chrom[i]; | |
412 | tid = bwGetTid(fp, chrom[i]); | |
413 | if(tid == (uint32_t) -1) return 10; | |
414 | wb->tid = tid; | |
415 | wb->start = start[i]; | |
416 | } | |
417 | if(wb->l+12 > fp->hdr->bufSize) { //12 bytes/entry | |
418 | wb->end = end[i-1]; | |
419 | flushBuffer(fp); | |
420 | wb->start = start[i]; | |
421 | } | |
422 | if(!memcpy(wb->p+wb->l, &(start[i]), sizeof(uint32_t))) return 11; | |
423 | if(!memcpy(wb->p+wb->l+4, &(end[i]), sizeof(uint32_t))) return 12; | |
424 | if(!memcpy(wb->p+wb->l+8, &(values[i]), sizeof(float))) return 13; | |
425 | updateStats(fp, end[i]-start[i], values[i]); | |
426 | wb->l += 12; | |
427 | } | |
428 | wb->end = end[i-1]; | |
429 | ||
430 | return 0; | |
431 | } | |
432 | ||
433 | int bwAppendIntervals(bigWigFile_t *fp, uint32_t *start, uint32_t *end, float *values, uint32_t n) { | |
434 | uint32_t i; | |
435 | bwWriteBuffer_t *wb = fp->writeBuffer; | |
436 | if(!n) return 0; | |
437 | if(!fp->isWrite) return 1; | |
438 | if(!wb) return 2; | |
439 | if(wb->ltype != 1) return 3; | |
440 | ||
441 | for(i=0; i<n; i++) { | |
442 | if(wb->l+12 > fp->hdr->bufSize) { | |
443 | if(i>0) { //otherwise it's already set | |
444 | wb->end = end[i-1]; | |
445 | } | |
446 | flushBuffer(fp); | |
447 | wb->start = start[i]; | |
448 | } | |
449 | if(!memcpy(wb->p+wb->l, &(start[i]), sizeof(uint32_t))) return 4; | |
450 | if(!memcpy(wb->p+wb->l+4, &(end[i]), sizeof(uint32_t))) return 5; | |
451 | if(!memcpy(wb->p+wb->l+8, &(values[i]), sizeof(float))) return 6; | |
452 | updateStats(fp, end[i]-start[i], values[i]); | |
453 | wb->l += 12; | |
454 | } | |
455 | wb->end = end[i-1]; | |
456 | ||
457 | return 0; | |
458 | } | |
459 | ||
460 | //8 bytes per entry | |
461 | int bwAddIntervalSpans(bigWigFile_t *fp, char *chrom, uint32_t *start, uint32_t span, float *values, uint32_t n) { | |
462 | uint32_t i, tid; | |
463 | bwWriteBuffer_t *wb = fp->writeBuffer; | |
464 | if(!n) return 0; | |
465 | if(!fp->isWrite) return 1; | |
466 | if(!wb) return 2; | |
467 | if(wb->ltype != 2) if(flushBuffer(fp)) return 3; | |
468 | if(flushBuffer(fp)) return 4; | |
469 | ||
470 | tid = bwGetTid(fp, chrom); | |
471 | if(tid == (uint32_t) -1) return 5; | |
472 | wb->tid = tid; | |
473 | wb->start = start[0]; | |
474 | wb->step = 0; | |
475 | wb->span = span; | |
476 | wb->ltype = 2; | |
477 | ||
478 | for(i=0; i<n; i++) { | |
479 | if(wb->l + 8 >= fp->hdr->bufSize) { //8 bytes/entry | |
480 | if(i) wb->end = start[i-1]+span; | |
481 | flushBuffer(fp); | |
482 | wb->start = start[i]; | |
483 | } | |
484 | if(!memcpy(wb->p+wb->l, &(start[i]), sizeof(uint32_t))) return 5; | |
485 | if(!memcpy(wb->p+wb->l+4, &(values[i]), sizeof(float))) return 6; | |
486 | updateStats(fp, span, values[i]); | |
487 | wb->l += 8; | |
488 | } | |
489 | wb->end = start[n-1] + span; | |
490 | ||
491 | return 0; | |
492 | } | |
493 | ||
494 | int bwAppendIntervalSpans(bigWigFile_t *fp, uint32_t *start, float *values, uint32_t n) { | |
495 | uint32_t i; | |
496 | bwWriteBuffer_t *wb = fp->writeBuffer; | |
497 | if(!n) return 0; | |
498 | if(!fp->isWrite) return 1; | |
499 | if(!wb) return 2; | |
500 | if(wb->ltype != 2) return 3; | |
501 | ||
502 | for(i=0; i<n; i++) { | |
503 | if(wb->l + 8 >= fp->hdr->bufSize) { | |
504 | if(i) wb->end = start[i-1]+wb->span; | |
505 | flushBuffer(fp); | |
506 | wb->start = start[i]; | |
507 | } | |
508 | if(!memcpy(wb->p+wb->l, &(start[i]), sizeof(uint32_t))) return 4; | |
509 | if(!memcpy(wb->p+wb->l+4, &(values[i]), sizeof(float))) return 5; | |
510 | updateStats(fp, wb->span, values[i]); | |
511 | wb->l += 8; | |
512 | } | |
513 | wb->end = start[n-1] + wb->span; | |
514 | ||
515 | return 0; | |
516 | } | |
517 | ||
518 | //4 bytes per entry | |
519 | int bwAddIntervalSpanSteps(bigWigFile_t *fp, char *chrom, uint32_t start, uint32_t span, uint32_t step, float *values, uint32_t n) { | |
520 | uint32_t i, tid; | |
521 | bwWriteBuffer_t *wb = fp->writeBuffer; | |
522 | if(!n) return 0; | |
523 | if(!fp->isWrite) return 1; | |
524 | if(!wb) return 2; | |
525 | if(wb->ltype != 3) flushBuffer(fp); | |
526 | if(flushBuffer(fp)) return 3; | |
527 | ||
528 | tid = bwGetTid(fp, chrom); | |
529 | if(tid == (uint32_t) -1) return 4; | |
530 | wb->tid = tid; | |
531 | wb->start = start; | |
532 | wb->step = step; | |
533 | wb->span = span; | |
534 | wb->ltype = 3; | |
535 | ||
536 | for(i=0; i<n; i++) { | |
537 | if(wb->l + 4 >= fp->hdr->bufSize) { | |
538 | wb->end = wb->start + ((wb->l-24)>>2) * step; | |
539 | flushBuffer(fp); | |
540 | wb->start = wb->end; | |
541 | } | |
542 | if(!memcpy(wb->p+wb->l, &(values[i]), sizeof(float))) return 5; | |
543 | updateStats(fp, wb->span, values[i]); | |
544 | wb->l += 4; | |
545 | } | |
546 | wb->end = wb->start + (wb->l>>2) * step; | |
547 | ||
548 | return 0; | |
549 | } | |
550 | ||
551 | int bwAppendIntervalSpanSteps(bigWigFile_t *fp, float *values, uint32_t n) { | |
552 | uint32_t i; | |
553 | bwWriteBuffer_t *wb = fp->writeBuffer; | |
554 | if(!n) return 0; | |
555 | if(!fp->isWrite) return 1; | |
556 | if(!wb) return 2; | |
557 | if(wb->ltype != 3) return 3; | |
558 | ||
559 | for(i=0; i<n; i++) { | |
560 | if(wb->l + 4 >= fp->hdr->bufSize) { | |
561 | wb->end = wb->start + ((wb->l-24)>>2) * wb->step; | |
562 | flushBuffer(fp); | |
563 | wb->start = wb->end; | |
564 | } | |
565 | if(!memcpy(wb->p+wb->l, &(values[i]), sizeof(float))) return 4; | |
566 | updateStats(fp, wb->span, values[i]); | |
567 | wb->l += 4; | |
568 | } | |
569 | wb->end = wb->start + (wb->l>>2) * wb->step; | |
570 | ||
571 | return 0; | |
572 | } | |
573 | ||
574 | //0 on success | |
575 | int writeSummary(bigWigFile_t *fp) { | |
576 | if(writeAtPos(&(fp->hdr->nBasesCovered), sizeof(uint64_t), 1, fp->hdr->summaryOffset, fp->URL->x.fp)) return 1; | |
577 | if(writeAtPos(&(fp->hdr->minVal), sizeof(double), 1, fp->hdr->summaryOffset+8, fp->URL->x.fp)) return 2; | |
578 | if(writeAtPos(&(fp->hdr->maxVal), sizeof(double), 1, fp->hdr->summaryOffset+16, fp->URL->x.fp)) return 3; | |
579 | if(writeAtPos(&(fp->hdr->sumData), sizeof(double), 1, fp->hdr->summaryOffset+24, fp->URL->x.fp)) return 4; | |
580 | if(writeAtPos(&(fp->hdr->sumSquared), sizeof(double), 1, fp->hdr->summaryOffset+32, fp->URL->x.fp)) return 5; | |
581 | return 0; | |
582 | } | |
583 | ||
584 | static bwRTreeNode_t *makeEmptyNode(uint32_t blockSize) { | |
585 | bwRTreeNode_t *n = calloc(1, sizeof(bwRTreeNode_t)); | |
586 | if(!n) return NULL; | |
587 | ||
588 | n->chrIdxStart = malloc(blockSize*sizeof(uint32_t)); | |
589 | if(!n->chrIdxStart) goto error; | |
590 | n->baseStart = malloc(blockSize*sizeof(uint32_t)); | |
591 | if(!n->baseStart) goto error; | |
592 | n->chrIdxEnd = malloc(blockSize*sizeof(uint32_t)); | |
593 | if(!n->chrIdxEnd) goto error; | |
594 | n->baseEnd = malloc(blockSize*sizeof(uint32_t)); | |
595 | if(!n->baseEnd) goto error; | |
596 | n->dataOffset = calloc(blockSize,sizeof(uint64_t)); //This MUST be 0 for node writing! | |
597 | if(!n->dataOffset) goto error; | |
598 | n->x.child = malloc(blockSize*sizeof(uint64_t)); | |
599 | if(!n->x.child) goto error; | |
600 | ||
601 | return n; | |
602 | ||
603 | error: | |
604 | if(n->chrIdxStart) free(n->chrIdxStart); | |
605 | if(n->baseStart) free(n->baseStart); | |
606 | if(n->chrIdxEnd) free(n->chrIdxEnd); | |
607 | if(n->baseEnd) free(n->baseEnd); | |
608 | if(n->dataOffset) free(n->dataOffset); | |
609 | if(n->x.child) free(n->x.child); | |
610 | free(n); | |
611 | return NULL; | |
612 | } | |
613 | ||
614 | //Returns 0 on success. This doesn't attempt to clean up! | |
615 | static bwRTreeNode_t *addLeaves(bwLL **ll, uint64_t *sz, uint64_t toProcess, uint32_t blockSize) { | |
616 | uint32_t i; | |
617 | uint64_t foo; | |
618 | bwRTreeNode_t *n = makeEmptyNode(blockSize); | |
619 | if(!n) return NULL; | |
620 | ||
621 | if(toProcess <= blockSize) { | |
622 | for(i=0; i<toProcess; i++) { | |
623 | n->chrIdxStart[i] = (*ll)->node->chrIdxStart[0]; | |
624 | n->baseStart[i] = (*ll)->node->baseStart[0]; | |
625 | n->chrIdxEnd[i] = (*ll)->node->chrIdxEnd[(*ll)->node->nChildren-1]; | |
626 | n->baseEnd[i] = (*ll)->node->baseEnd[(*ll)->node->nChildren-1]; | |
627 | n->x.child[i] = (*ll)->node; | |
628 | *sz += 4 + 32*(*ll)->node->nChildren; | |
629 | *ll = (*ll)->next; | |
630 | n->nChildren++; | |
631 | } | |
632 | } else { | |
633 | for(i=0; i<blockSize; i++) { | |
634 | foo = ceil(((double) toProcess)/((double) blockSize-i)); | |
635 | if(!ll) break; | |
636 | n->x.child[i] = addLeaves(ll, sz, foo, blockSize); | |
637 | if(!n->x.child[i]) goto error; | |
638 | n->chrIdxStart[i] = n->x.child[i]->chrIdxStart[0]; | |
639 | n->baseStart[i] = n->x.child[i]->baseStart[0]; | |
640 | n->chrIdxEnd[i] = n->x.child[i]->chrIdxEnd[n->x.child[i]->nChildren-1]; | |
641 | n->baseEnd[i] = n->x.child[i]->baseEnd[n->x.child[i]->nChildren-1]; | |
642 | n->nChildren++; | |
643 | toProcess -= foo; | |
644 | } | |
645 | } | |
646 | ||
647 | *sz += 4 + 24*n->nChildren; | |
648 | return n; | |
649 | ||
650 | error: | |
651 | bwDestroyIndexNode(n); | |
652 | return NULL; | |
653 | } | |
654 | ||
655 | //Returns 1 on error | |
656 | int writeIndexTreeNode(FILE *fp, bwRTreeNode_t *n, uint8_t *wrote, int level) { | |
657 | uint8_t one = 0; | |
658 | uint32_t i, j, vector[6] = {0, 0, 0, 0, 0, 0}; //The last 8 bytes are left as 0 | |
659 | ||
660 | if(n->isLeaf) return 0; | |
661 | ||
662 | for(i=0; i<n->nChildren; i++) { | |
663 | if(n->dataOffset[i]) { //traverse into child | |
664 | if(n->isLeaf) return 0; //Only write leaves once! | |
665 | if(writeIndexTreeNode(fp, n->x.child[i], wrote, level+1)) return 1; | |
666 | } else { | |
667 | n->dataOffset[i] = ftell(fp); | |
668 | if(fwrite(&(n->x.child[i]->isLeaf), sizeof(uint8_t), 1, fp) != 1) return 1; | |
669 | if(fwrite(&one, sizeof(uint8_t), 1, fp) != 1) return 1; //one byte of padding | |
670 | if(fwrite(&(n->x.child[i]->nChildren), sizeof(uint16_t), 1, fp) != 1) return 1; | |
671 | for(j=0; j<n->x.child[i]->nChildren; j++) { | |
672 | vector[0] = n->x.child[i]->chrIdxStart[j]; | |
673 | vector[1] = n->x.child[i]->baseStart[j]; | |
674 | vector[2] = n->x.child[i]->chrIdxEnd[j]; | |
675 | vector[3] = n->x.child[i]->baseEnd[j]; | |
676 | if(n->x.child[i]->isLeaf) { | |
677 | //Include the offset and size | |
678 | if(fwrite(vector, sizeof(uint32_t), 4, fp) != 4) return 1; | |
679 | if(fwrite(&(n->x.child[i]->dataOffset[j]), sizeof(uint64_t), 1, fp) != 1) return 1; | |
680 | if(fwrite(&(n->x.child[i]->x.size[j]), sizeof(uint64_t), 1, fp) != 1) return 1; | |
681 | } else { | |
682 | if(fwrite(vector, sizeof(uint32_t), 6, fp) != 6) return 1; | |
683 | } | |
684 | } | |
685 | *wrote = 1; | |
686 | } | |
687 | } | |
688 | ||
689 | return 0; | |
690 | } | |
691 | ||
692 | //returns 1 on success | |
693 | int writeIndexOffsets(FILE *fp, bwRTreeNode_t *n, uint64_t offset) { | |
694 | uint32_t i; | |
695 | ||
696 | if(n->isLeaf) return 0; | |
697 | for(i=0; i<n->nChildren; i++) { | |
698 | if(writeIndexOffsets(fp, n->x.child[i], n->dataOffset[i])) return 1; | |
699 | if(writeAtPos(&(n->dataOffset[i]), sizeof(uint64_t), 1, offset+20+24*i, fp)) return 2; | |
700 | } | |
701 | return 0; | |
702 | } | |
703 | ||
704 | //Returns 0 on success | |
705 | int writeIndexTree(bigWigFile_t *fp) { | |
706 | uint64_t offset; | |
707 | uint8_t wrote = 0; | |
708 | int rv; | |
709 | ||
710 | while((rv = writeIndexTreeNode(fp->URL->x.fp, fp->idx->root, &wrote, 0)) == 0) { | |
711 | if(!wrote) break; | |
712 | wrote = 0; | |
713 | } | |
714 | ||
715 | if(rv || wrote) return 1; | |
716 | ||
717 | //Save the file position | |
718 | offset = bwTell(fp); | |
719 | ||
720 | //Write the offsets | |
721 | if(writeIndexOffsets(fp->URL->x.fp, fp->idx->root, fp->idx->rootOffset)) return 2; | |
722 | ||
723 | //Move the file pointer back to the end | |
724 | bwSetPos(fp, offset); | |
725 | ||
726 | return 0; | |
727 | } | |
728 | ||
729 | //Returns 0 on success. The original state SHOULD be preserved on error | |
730 | int writeIndex(bigWigFile_t *fp) { | |
731 | uint32_t four = IDX_MAGIC; | |
732 | uint64_t idxSize = 0, foo; | |
733 | uint8_t one = 0; | |
734 | uint32_t i, vector[6] = {0, 0, 0, 0, 0, 0}; //The last 8 bytes are left as 0 | |
735 | bwLL *ll = fp->writeBuffer->firstIndexNode, *p; | |
736 | bwRTreeNode_t *root = NULL; | |
737 | ||
738 | if(!fp->writeBuffer->nBlocks) return 0; | |
739 | fp->idx = malloc(sizeof(bwRTree_t)); | |
740 | if(!fp->idx) return 2; | |
741 | fp->idx->root = root; | |
742 | ||
743 | //Update the file header to indicate the proper index position | |
744 | foo = bwTell(fp); | |
745 | if(writeAtPos(&foo, sizeof(uint64_t), 1, 0x18, fp->URL->x.fp)) return 3; | |
746 | ||
747 | //Make the tree | |
748 | if(ll == fp->writeBuffer->currentIndexNode) { | |
749 | root = ll->node; | |
750 | idxSize = 4 + 24*root->nChildren; | |
751 | } else { | |
752 | root = addLeaves(&ll, &idxSize, ceil(((double)fp->writeBuffer->nBlocks)/fp->writeBuffer->blockSize), fp->writeBuffer->blockSize); | |
753 | } | |
754 | if(!root) return 4; | |
755 | fp->idx->root = root; | |
756 | ||
757 | ll = fp->writeBuffer->firstIndexNode; | |
758 | while(ll) { | |
759 | p = ll->next; | |
760 | free(ll); | |
761 | ll=p; | |
762 | } | |
763 | ||
764 | //write the header | |
765 | if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 5; | |
766 | if(fwrite(&(fp->writeBuffer->blockSize), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 6; | |
767 | if(fwrite(&(fp->writeBuffer->nBlocks), sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 7; | |
768 | if(fwrite(&(root->chrIdxStart[0]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 8; | |
769 | if(fwrite(&(root->baseStart[0]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 9; | |
770 | if(fwrite(&(root->chrIdxEnd[root->nChildren-1]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 10; | |
771 | if(fwrite(&(root->baseEnd[root->nChildren-1]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 11; | |
772 | if(fwrite(&idxSize, sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 12; | |
773 | four = 1; | |
774 | if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 13; | |
775 | four = 0; | |
776 | if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 14; //padding | |
777 | fp->idx->rootOffset = bwTell(fp); | |
778 | ||
779 | //Write the root node, since writeIndexTree writes the children and fills in the offset | |
780 | if(fwrite(&(root->isLeaf), sizeof(uint8_t), 1, fp->URL->x.fp) != 1) return 16; | |
781 | if(fwrite(&one, sizeof(uint8_t), 1, fp->URL->x.fp) != 1) return 17; //one byte of padding | |
782 | if(fwrite(&(root->nChildren), sizeof(uint16_t), 1, fp->URL->x.fp) != 1) return 18; | |
783 | for(i=0; i<root->nChildren; i++) { | |
784 | vector[0] = root->chrIdxStart[i]; | |
785 | vector[1] = root->baseStart[i]; | |
786 | vector[2] = root->chrIdxEnd[i]; | |
787 | vector[3] = root->baseEnd[i]; | |
788 | if(root->isLeaf) { | |
789 | //Include the offset and size | |
790 | if(fwrite(vector, sizeof(uint32_t), 4, fp->URL->x.fp) != 4) return 19; | |
791 | if(fwrite(&(root->dataOffset[i]), sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 20; | |
792 | if(fwrite(&(root->x.size[i]), sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 21; | |
793 | } else { | |
794 | root->dataOffset[i] = 0; //FIXME: Something upstream is setting this to impossible values (e.g., 0x21?!?!?) | |
795 | if(fwrite(vector, sizeof(uint32_t), 6, fp->URL->x.fp) != 6) return 22; | |
796 | } | |
797 | } | |
798 | ||
799 | //Write each level | |
800 | if(writeIndexTree(fp)) return 23; | |
801 | ||
802 | return 0; | |
803 | } | |
804 | ||
805 | //The first zoom level has a resolution of 4x mean entry size | |
806 | //This may or may not produce the requested number of zoom levels | |
807 | int makeZoomLevels(bigWigFile_t *fp) { | |
808 | uint32_t meanBinSize, i; | |
809 | uint32_t multiplier = 4, zoom = 10, maxZoom = 0; | |
810 | uint16_t nLevels = 0; | |
811 | ||
812 | meanBinSize = ((double) fp->writeBuffer->runningWidthSum)/(fp->writeBuffer->nEntries); | |
813 | //In reality, one level is skipped | |
814 | meanBinSize *= 4; | |
815 | //N.B., we must ALWAYS check that the zoom doesn't overflow a uint32_t! | |
816 | if(((uint32_t)-1)>>2 < meanBinSize) return 0; //No zoom levels! | |
817 | if(meanBinSize*4 > zoom) zoom = multiplier*meanBinSize; | |
818 | ||
819 | fp->hdr->zoomHdrs = calloc(1, sizeof(bwZoomHdr_t)); | |
820 | if(!fp->hdr->zoomHdrs) return 1; | |
821 | fp->hdr->zoomHdrs->level = malloc(fp->hdr->nLevels * sizeof(uint32_t)); | |
822 | fp->hdr->zoomHdrs->dataOffset = calloc(fp->hdr->nLevels, sizeof(uint64_t)); | |
823 | fp->hdr->zoomHdrs->indexOffset = calloc(fp->hdr->nLevels, sizeof(uint64_t)); | |
824 | fp->hdr->zoomHdrs->idx = calloc(fp->hdr->nLevels, sizeof(bwRTree_t*)); | |
825 | if(!fp->hdr->zoomHdrs->level) return 2; | |
826 | if(!fp->hdr->zoomHdrs->dataOffset) return 3; | |
827 | if(!fp->hdr->zoomHdrs->indexOffset) return 4; | |
828 | if(!fp->hdr->zoomHdrs->idx) return 5; | |
829 | ||
830 | //There's no point in having a zoom level larger than the largest chromosome | |
831 | //This will none the less allow at least one zoom level, which is generally needed for IGV et al. | |
832 | for(i=0; i<fp->cl->nKeys; i++) { | |
833 | if(fp->cl->len[i] > maxZoom) maxZoom = fp->cl->len[i]; | |
834 | } | |
835 | if(zoom > maxZoom) zoom = maxZoom; | |
836 | ||
837 | for(i=0; i<fp->hdr->nLevels; i++) { | |
838 | if(zoom > maxZoom) break; //prevent absurdly large zoom levels | |
839 | fp->hdr->zoomHdrs->level[i] = zoom; | |
840 | nLevels++; | |
841 | if(((uint32_t)-1)/multiplier < zoom) break; | |
842 | zoom *= multiplier; | |
843 | } | |
844 | fp->hdr->nLevels = nLevels; | |
845 | ||
846 | fp->writeBuffer->firstZoomBuffer = calloc(nLevels,sizeof(bwZoomBuffer_t*)); | |
847 | if(!fp->writeBuffer->firstZoomBuffer) goto error; | |
848 | fp->writeBuffer->lastZoomBuffer = calloc(nLevels,sizeof(bwZoomBuffer_t*)); | |
849 | if(!fp->writeBuffer->lastZoomBuffer) goto error; | |
850 | fp->writeBuffer->nNodes = calloc(nLevels, sizeof(uint64_t)); | |
851 | ||
852 | for(i=0; i<fp->hdr->nLevels; i++) { | |
853 | fp->writeBuffer->firstZoomBuffer[i] = calloc(1, sizeof(bwZoomBuffer_t)); | |
854 | if(!fp->writeBuffer->firstZoomBuffer[i]) goto error; | |
855 | fp->writeBuffer->firstZoomBuffer[i]->p = calloc(fp->hdr->bufSize/32, 32); | |
856 | if(!fp->writeBuffer->firstZoomBuffer[i]->p) goto error; | |
857 | fp->writeBuffer->firstZoomBuffer[i]->m = fp->hdr->bufSize; | |
858 | ((uint32_t*)fp->writeBuffer->firstZoomBuffer[i]->p)[0] = 0; | |
859 | ((uint32_t*)fp->writeBuffer->firstZoomBuffer[i]->p)[1] = 0; | |
860 | ((uint32_t*)fp->writeBuffer->firstZoomBuffer[i]->p)[2] = fp->hdr->zoomHdrs->level[i]; | |
861 | if(fp->hdr->zoomHdrs->level[i] > fp->cl->len[0]) ((uint32_t*)fp->writeBuffer->firstZoomBuffer[i]->p)[2] = fp->cl->len[0]; | |
862 | fp->writeBuffer->lastZoomBuffer[i] = fp->writeBuffer->firstZoomBuffer[i]; | |
863 | } | |
864 | ||
865 | return 0; | |
866 | ||
867 | error: | |
868 | if(fp->writeBuffer->firstZoomBuffer) { | |
869 | for(i=0; i<fp->hdr->nLevels; i++) { | |
870 | if(fp->writeBuffer->firstZoomBuffer[i]) { | |
871 | if(fp->writeBuffer->firstZoomBuffer[i]->p) free(fp->writeBuffer->firstZoomBuffer[i]->p); | |
872 | free(fp->writeBuffer->firstZoomBuffer[i]); | |
873 | } | |
874 | } | |
875 | free(fp->writeBuffer->firstZoomBuffer); | |
876 | } | |
877 | if(fp->writeBuffer->lastZoomBuffer) free(fp->writeBuffer->lastZoomBuffer); | |
878 | if(fp->writeBuffer->nNodes) free(fp->writeBuffer->lastZoomBuffer); | |
879 | return 6; | |
880 | } | |
881 | ||
882 | //Given an interval start, calculate the next one at a zoom level | |
883 | void nextPos(bigWigFile_t *fp, uint32_t size, uint32_t *pos, uint32_t desiredTid) { | |
884 | uint32_t *tid = pos; | |
885 | uint32_t *start = pos+1; | |
886 | uint32_t *end = pos+2; | |
887 | *start += size; | |
888 | if(*start >= fp->cl->len[*tid]) { | |
889 | (*start) = 0; | |
890 | (*tid)++; | |
891 | } | |
892 | ||
893 | //prevent needless iteration when changing chromosomes | |
894 | if(*tid < desiredTid) { | |
895 | *tid = desiredTid; | |
896 | *start = 0; | |
897 | } | |
898 | ||
899 | (*end) = *start+size; | |
900 | if(*end > fp->cl->len[*tid]) (*end) = fp->cl->len[*tid]; | |
901 | } | |
902 | ||
903 | //Return the number of bases two intervals overlap | |
904 | uint32_t overlapsInterval(uint32_t tid0, uint32_t start0, uint32_t end0, uint32_t tid1, uint32_t start1, uint32_t end1) { | |
905 | if(tid0 != tid1) return 0; | |
906 | if(end0 <= start1) return 0; | |
907 | if(end1 <= start0) return 0; | |
908 | if(end0 <= end1) { | |
909 | if(start1 > start0) return end0-start1; | |
910 | return end0-start0; | |
911 | } else { | |
912 | if(start1 > start0) return end1-start1; | |
913 | return end1-start0; | |
914 | } | |
915 | } | |
916 | ||
917 | //Returns the number of bases of the interval written | |
918 | uint32_t updateInterval(bigWigFile_t *fp, bwZoomBuffer_t *buffer, double *sum, double *sumsq, uint32_t size, uint32_t tid, uint32_t start, uint32_t end, float value) { | |
919 | uint32_t *p2 = (uint32_t*) buffer->p; | |
920 | float *fp2 = (float*) p2; | |
921 | uint32_t rv = 0, offset = 0; | |
922 | if(!buffer) return 0; | |
923 | if(buffer->l+32 >= buffer->m) return 0; | |
924 | ||
925 | //Make sure that we don't overflow a uint32_t by adding some huge value to start | |
926 | if(start + size < start) size = ((uint32_t) -1) - start; | |
927 | ||
928 | if(buffer->l) { | |
929 | offset = buffer->l/32; | |
930 | } else { | |
931 | p2[0] = tid; | |
932 | p2[1] = start; | |
933 | if(start+size < end) p2[2] = start+size; | |
934 | else p2[2] = end; | |
935 | } | |
936 | ||
937 | //Do we have any overlap with the previously added interval? | |
938 | if(offset) { | |
939 | rv = overlapsInterval(p2[8*(offset-1)], p2[8*(offset-1)+1], p2[8*(offset-1)+1] + size, tid, start, end); | |
940 | if(rv) { | |
941 | p2[8*(offset-1)+2] = start + rv; | |
942 | p2[8*(offset-1)+3] += rv; | |
943 | if(fp2[8*(offset-1)+4] > value) fp2[8*(offset-1)+4] = value; | |
944 | if(fp2[8*(offset-1)+5] < value) fp2[8*(offset-1)+5] = value; | |
945 | *sum += rv*value; | |
946 | *sumsq += rv*pow(value, 2.0); | |
947 | return rv; | |
948 | } else { | |
949 | fp2[8*(offset-1)+6] = *sum; | |
950 | fp2[8*(offset-1)+7] = *sumsq; | |
951 | *sum = 0.0; | |
952 | *sumsq = 0.0; | |
953 | } | |
954 | } | |
955 | ||
956 | //If we move to a new interval then skip iterating over a bunch of obviously non-overlapping intervals | |
957 | if(offset && p2[8*offset+2] == 0) { | |
958 | p2[8*offset] = tid; | |
959 | p2[8*offset+1] = start; | |
960 | if(start+size < end) p2[8*offset+2] = start+size; | |
961 | else p2[8*offset+2] = end; | |
962 | //nextPos(fp, size, p2+8*offset, tid); //We can actually skip uncovered intervals | |
963 | } | |
964 | ||
965 | //Add a new entry | |
966 | while(!(rv = overlapsInterval(p2[8*offset], p2[8*offset+1], p2[8*offset+1] + size, tid, start, end))) { | |
967 | p2[8*offset] = tid; | |
968 | p2[8*offset+1] = start; | |
969 | if(start+size < end) p2[8*offset+2] = start+size; | |
970 | else p2[8*offset+2] = end; | |
971 | //nextPos(fp, size, p2+8*offset, tid); | |
972 | } | |
973 | p2[8*offset+3] = rv; | |
974 | fp2[8*offset+4] = value; //min | |
975 | fp2[8*offset+5] = value; //max | |
976 | *sum += rv * value; | |
977 | *sumsq += rv * pow(value,2.0); | |
978 | buffer->l += 32; | |
979 | return rv; | |
980 | } | |
981 | ||
982 | //Returns 0 on success | |
983 | int addIntervalValue(bigWigFile_t *fp, uint64_t *nEntries, double *sum, double *sumsq, bwZoomBuffer_t *buffer, uint32_t itemsPerSlot, uint32_t zoom, uint32_t tid, uint32_t start, uint32_t end, float value) { | |
984 | bwZoomBuffer_t *newBuffer = NULL; | |
985 | uint32_t rv; | |
986 | ||
987 | while(start < end) { | |
988 | rv = updateInterval(fp, buffer, sum, sumsq, zoom, tid, start, end, value); | |
989 | if(!rv) { | |
990 | //Allocate a new buffer | |
991 | newBuffer = calloc(1, sizeof(bwZoomBuffer_t)); | |
992 | if(!newBuffer) return 1; | |
993 | newBuffer->p = calloc(itemsPerSlot, 32); | |
994 | if(!newBuffer->p) goto error; | |
995 | newBuffer->m = itemsPerSlot*32; | |
996 | memcpy(newBuffer->p, buffer->p+buffer->l-32, 4); | |
997 | memcpy(newBuffer->p+4, buffer->p+buffer->l-28, 4); | |
998 | ((uint32_t*) newBuffer->p)[2] = ((uint32_t*) newBuffer->p)[1] + zoom; | |
999 | *sum = *sumsq = 0.0; | |
1000 | rv = updateInterval(fp, newBuffer, sum, sumsq, zoom, tid, start, end, value); | |
1001 | if(!rv) goto error; | |
1002 | buffer->next = newBuffer; | |
1003 | buffer = buffer->next; | |
1004 | *nEntries += 1; | |
1005 | } | |
1006 | start += rv; | |
1007 | } | |
1008 | ||
1009 | return 0; | |
1010 | ||
1011 | error: | |
1012 | if(newBuffer) { | |
1013 | if(newBuffer->m) free(newBuffer->p); | |
1014 | free(newBuffer); | |
1015 | } | |
1016 | return 2; | |
1017 | } | |
1018 | ||
1019 | //Get all of the intervals and add them to the appropriate zoomBuffer | |
1020 | int constructZoomLevels(bigWigFile_t *fp) { | |
1021 | bwOverlappingIntervals_t *intervals = NULL; | |
1022 | double *sum = NULL, *sumsq = NULL; | |
1023 | uint32_t i, j, k; | |
1024 | ||
1025 | sum = calloc(fp->hdr->nLevels, sizeof(double)); | |
1026 | sumsq = calloc(fp->hdr->nLevels, sizeof(double)); | |
1027 | if(!sum || !sumsq) goto error; | |
1028 | ||
1029 | for(i=0; i<fp->cl->nKeys; i++) { | |
1030 | intervals = bwGetOverlappingIntervals(fp, fp->cl->chrom[i], 0, fp->cl->len[i]); | |
1031 | if(!intervals) goto error; | |
1032 | for(j=0; j<intervals->l; j++) { | |
1033 | for(k=0; k<fp->hdr->nLevels; k++) { | |
1034 | if(addIntervalValue(fp, &(fp->writeBuffer->nNodes[k]), sum+k, sumsq+k, fp->writeBuffer->lastZoomBuffer[k], fp->hdr->bufSize/32, fp->hdr->zoomHdrs->level[k], i, intervals->start[j], intervals->end[j], intervals->value[j])) goto error; | |
1035 | while(fp->writeBuffer->lastZoomBuffer[k]->next) fp->writeBuffer->lastZoomBuffer[k] = fp->writeBuffer->lastZoomBuffer[k]->next; | |
1036 | } | |
1037 | } | |
1038 | bwDestroyOverlappingIntervals(intervals); | |
1039 | } | |
1040 | ||
1041 | //Make an index for each zoom level | |
1042 | for(i=0; i<fp->hdr->nLevels; i++) { | |
1043 | fp->hdr->zoomHdrs->idx[i] = calloc(1, sizeof(bwRTree_t)); | |
1044 | if(!fp->hdr->zoomHdrs->idx[i]) return 1; | |
1045 | fp->hdr->zoomHdrs->idx[i]->blockSize = fp->writeBuffer->blockSize; | |
1046 | } | |
1047 | ||
1048 | free(sum); | |
1049 | free(sumsq); | |
1050 | ||
1051 | return 0; | |
1052 | ||
1053 | error: | |
1054 | if(intervals) bwDestroyOverlappingIntervals(intervals); | |
1055 | if(sum) free(sum); | |
1056 | if(sumsq) free(sumsq); | |
1057 | return 1; | |
1058 | } | |
1059 | ||
1060 | int writeZoomLevels(bigWigFile_t *fp) { | |
1061 | uint64_t offset1, offset2, idxSize = 0; | |
1062 | uint32_t i, j, four = 0, last, vector[6] = {0, 0, 0, 0, 0, 0}; //The last 8 bytes are left as 0; | |
1063 | uint8_t wrote, one = 0; | |
1064 | uint16_t actualNLevels = 0; | |
1065 | int rv; | |
1066 | bwLL *ll, *p; | |
1067 | bwRTreeNode_t *root; | |
1068 | bwZoomBuffer_t *zb, *zb2; | |
1069 | bwWriteBuffer_t *wb = fp->writeBuffer; | |
1070 | uLongf sz; | |
1071 | ||
1072 | for(i=0; i<fp->hdr->nLevels; i++) { | |
1073 | if(i) { | |
1074 | //Is this a duplicate level? | |
1075 | if(fp->writeBuffer->nNodes[i] == fp->writeBuffer->nNodes[i-1]) break; | |
1076 | } | |
1077 | actualNLevels++; | |
1078 | ||
1079 | //reserve a uint32_t for the number of blocks | |
1080 | fp->hdr->zoomHdrs->dataOffset[i] = bwTell(fp); | |
1081 | fp->writeBuffer->nBlocks = 0; | |
1082 | fp->writeBuffer->l = 24; | |
1083 | if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 1; | |
1084 | zb = fp->writeBuffer->firstZoomBuffer[i]; | |
1085 | fp->writeBuffer->firstIndexNode = NULL; | |
1086 | fp->writeBuffer->currentIndexNode = NULL; | |
1087 | while(zb) { | |
1088 | sz = fp->hdr->bufSize; | |
1089 | if(compress(wb->compressP, &sz, zb->p, zb->l) != Z_OK) return 2; | |
1090 | ||
1091 | //write the data to disk | |
1092 | if(fwrite(wb->compressP, sizeof(uint8_t), sz, fp->URL->x.fp) != sz) return 3; | |
1093 | ||
1094 | //Add an entry into the index | |
1095 | last = (zb->l - 32)>>2; | |
1096 | if(addIndexEntry(fp, ((uint32_t*)zb->p)[0], ((uint32_t*)zb->p)[last], ((uint32_t*)zb->p)[1], ((uint32_t*)zb->p)[last+2], bwTell(fp)-sz, sz)) return 4; | |
1097 | ||
1098 | wb->nBlocks++; | |
1099 | wb->l = 24; | |
1100 | zb = zb->next; | |
1101 | } | |
1102 | if(writeAtPos(&(wb->nBlocks), sizeof(uint32_t), 1, fp->hdr->zoomHdrs->dataOffset[i], fp->URL->x.fp)) return 5; | |
1103 | ||
1104 | //Make the tree | |
1105 | ll = fp->writeBuffer->firstIndexNode; | |
1106 | if(ll == fp->writeBuffer->currentIndexNode) { | |
1107 | root = ll->node; | |
1108 | idxSize = 4 + 24*root->nChildren; | |
1109 | } else { | |
1110 | root = addLeaves(&ll, &idxSize, ceil(((double)fp->writeBuffer->nBlocks)/fp->writeBuffer->blockSize), fp->writeBuffer->blockSize); | |
1111 | } | |
1112 | if(!root) return 4; | |
1113 | fp->hdr->zoomHdrs->idx[i]->root = root; | |
1114 | ||
1115 | ll = fp->writeBuffer->firstIndexNode; | |
1116 | while(ll) { | |
1117 | p = ll->next; | |
1118 | free(ll); | |
1119 | ll=p; | |
1120 | } | |
1121 | ||
1122 | ||
1123 | //write the index | |
1124 | wrote = 0; | |
1125 | fp->hdr->zoomHdrs->indexOffset[i] = bwTell(fp); | |
1126 | four = IDX_MAGIC; | |
1127 | if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 1; | |
1128 | root = fp->hdr->zoomHdrs->idx[i]->root; | |
1129 | if(fwrite(&(fp->writeBuffer->blockSize), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 6; | |
1130 | if(fwrite(&(fp->writeBuffer->nBlocks), sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 7; | |
1131 | if(fwrite(&(root->chrIdxStart[0]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 8; | |
1132 | if(fwrite(&(root->baseStart[0]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 9; | |
1133 | if(fwrite(&(root->chrIdxEnd[root->nChildren-1]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 10; | |
1134 | if(fwrite(&(root->baseEnd[root->nChildren-1]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 11; | |
1135 | if(fwrite(&idxSize, sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 12; | |
1136 | four = fp->hdr->bufSize/32; | |
1137 | if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 13; | |
1138 | four = 0; | |
1139 | if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 14; //padding | |
1140 | fp->hdr->zoomHdrs->idx[i]->rootOffset = bwTell(fp); | |
1141 | ||
1142 | //Write the root node, since writeIndexTree writes the children and fills in the offset | |
1143 | offset1 = bwTell(fp); | |
1144 | if(fwrite(&(root->isLeaf), sizeof(uint8_t), 1, fp->URL->x.fp) != 1) return 16; | |
1145 | if(fwrite(&one, sizeof(uint8_t), 1, fp->URL->x.fp) != 1) return 17; //one byte of padding | |
1146 | if(fwrite(&(root->nChildren), sizeof(uint16_t), 1, fp->URL->x.fp) != 1) return 18; | |
1147 | for(j=0; j<root->nChildren; j++) { | |
1148 | vector[0] = root->chrIdxStart[j]; | |
1149 | vector[1] = root->baseStart[j]; | |
1150 | vector[2] = root->chrIdxEnd[j]; | |
1151 | vector[3] = root->baseEnd[j]; | |
1152 | if(root->isLeaf) { | |
1153 | //Include the offset and size | |
1154 | if(fwrite(vector, sizeof(uint32_t), 4, fp->URL->x.fp) != 4) return 19; | |
1155 | if(fwrite(&(root->dataOffset[j]), sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 20; | |
1156 | if(fwrite(&(root->x.size[j]), sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 21; | |
1157 | } else { | |
1158 | if(fwrite(vector, sizeof(uint32_t), 6, fp->URL->x.fp) != 6) return 22; | |
1159 | } | |
1160 | } | |
1161 | ||
1162 | while((rv = writeIndexTreeNode(fp->URL->x.fp, fp->hdr->zoomHdrs->idx[i]->root, &wrote, 0)) == 0) { | |
1163 | if(!wrote) break; | |
1164 | wrote = 0; | |
1165 | } | |
1166 | ||
1167 | if(rv || wrote) return 6; | |
1168 | ||
1169 | //Save the file position | |
1170 | offset2 = bwTell(fp); | |
1171 | ||
1172 | //Write the offsets | |
1173 | if(writeIndexOffsets(fp->URL->x.fp, root, offset1)) return 2; | |
1174 | ||
1175 | //Move the file pointer back to the end | |
1176 | bwSetPos(fp, offset2); | |
1177 | ||
1178 | ||
1179 | //Free the linked list | |
1180 | zb = fp->writeBuffer->firstZoomBuffer[i]; | |
1181 | while(zb) { | |
1182 | if(zb->p) free(zb->p); | |
1183 | zb2 = zb->next; | |
1184 | free(zb); | |
1185 | zb = zb2; | |
1186 | } | |
1187 | fp->writeBuffer->firstZoomBuffer[i] = NULL; | |
1188 | } | |
1189 | ||
1190 | //Free unused zoom levels | |
1191 | for(i=actualNLevels; i<fp->hdr->nLevels; i++) { | |
1192 | zb = fp->writeBuffer->firstZoomBuffer[i]; | |
1193 | while(zb) { | |
1194 | if(zb->p) free(zb->p); | |
1195 | zb2 = zb->next; | |
1196 | free(zb); | |
1197 | zb = zb2; | |
1198 | } | |
1199 | fp->writeBuffer->firstZoomBuffer[i] = NULL; | |
1200 | } | |
1201 | ||
1202 | //Write the zoom headers to disk | |
1203 | offset1 = bwTell(fp); | |
1204 | if(bwSetPos(fp, 0x40)) return 7; | |
1205 | four = 0; | |
1206 | for(i=0; i<actualNLevels; i++) { | |
1207 | if(fwrite(&(fp->hdr->zoomHdrs->level[i]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 8; | |
1208 | if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 9; | |
1209 | if(fwrite(&(fp->hdr->zoomHdrs->dataOffset[i]), sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 10; | |
1210 | if(fwrite(&(fp->hdr->zoomHdrs->indexOffset[i]), sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 11; | |
1211 | } | |
1212 | ||
1213 | //Write the number of levels if needed | |
1214 | if(bwSetPos(fp, 0x6)) return 12; | |
1215 | if(fwrite(&actualNLevels, sizeof(uint16_t), 1, fp->URL->x.fp) != 1) return 13; | |
1216 | ||
1217 | if(bwSetPos(fp, offset1)) return 14; | |
1218 | ||
1219 | return 0; | |
1220 | } | |
1221 | ||
1222 | //0 on success | |
1223 | int bwFinalize(bigWigFile_t *fp) { | |
1224 | uint32_t four; | |
1225 | uint64_t offset; | |
1226 | if(!fp->isWrite) return 0; | |
1227 | ||
1228 | //Flush the buffer | |
1229 | if(flushBuffer(fp)) return 1; //Valgrind reports a problem here! | |
1230 | ||
1231 | //Update the data section with the number of blocks written | |
1232 | if(fp->hdr) { | |
1233 | if(writeAtPos(&(fp->writeBuffer->nBlocks), sizeof(uint64_t), 1, fp->hdr->dataOffset, fp->URL->x.fp)) return 2; | |
1234 | } else { | |
1235 | //The header wasn't written! | |
1236 | return 1; | |
1237 | } | |
1238 | ||
1239 | //write the bufferSize | |
1240 | if(fp->hdr->bufSize) { | |
1241 | if(writeAtPos(&(fp->hdr->bufSize), sizeof(uint32_t), 1, 0x34, fp->URL->x.fp)) return 2; | |
1242 | } | |
1243 | ||
1244 | //write the summary information | |
1245 | if(writeSummary(fp)) return 3; | |
1246 | ||
1247 | //Convert the linked-list to a tree and write to disk | |
1248 | if(writeIndex(fp)) return 4; | |
1249 | ||
1250 | //Zoom level stuff here? | |
1251 | if(fp->hdr->nLevels && fp->writeBuffer->nBlocks) { | |
1252 | offset = bwTell(fp); | |
1253 | if(makeZoomLevels(fp)) return 5; | |
1254 | if(constructZoomLevels(fp)) return 6; | |
1255 | bwSetPos(fp, offset); | |
1256 | if(writeZoomLevels(fp)) return 7; //This write nLevels as well | |
1257 | } | |
1258 | ||
1259 | //write magic at the end of the file | |
1260 | four = BIGWIG_MAGIC; | |
1261 | if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 9; | |
1262 | ||
1263 | return 0; | |
1264 | } | |
1265 | ||
1266 | /* | |
1267 | data chunk: | |
1268 | uint64_t number of blocks (2 / 110851) | |
1269 | some blocks | |
1270 | ||
1271 | an uncompressed data block (24 byte header) | |
1272 | uint32_t Tid 0-4 | |
1273 | uint32_t start 4-8 | |
1274 | uint32_t end 8-12 | |
1275 | uint32_t step 12-16 | |
1276 | uint32_t span 16-20 | |
1277 | uint8_t type 20 | |
1278 | uint8_t padding | |
1279 | uint16_t nItems 22 | |
1280 | nItems of: | |
1281 | type 1: //12 bytes | |
1282 | uint32_t start | |
1283 | uint32_t end | |
1284 | float value | |
1285 | type 2: //8 bytes | |
1286 | uint32_t start | |
1287 | float value | |
1288 | type 3: //4 bytes | |
1289 | float value | |
1290 | ||
1291 | data block index header | |
1292 | uint32_t magic | |
1293 | uint32_t blockSize (256 in the example) maximum number of children | |
1294 | uint64_t number of blocks (2 / 110851) | |
1295 | uint32_t startTid | |
1296 | uint32_t startPos | |
1297 | uint32_t endTid | |
1298 | uint32_t endPos | |
1299 | uint64_t index size? (0x1E7 / 0x1AF0401F) index address? | |
1300 | uint32_t itemsPerBlock (1 / 1) 1024 for zoom headers 1024 for zoom headers | |
1301 | uint32_t padding | |
1302 | ||
1303 | data block index node non-leaf (4 bytes + 24*nChildren) | |
1304 | uint8_t isLeaf | |
1305 | uint8_t padding | |
1306 | uint16_t nChildren (2, 256) | |
1307 | uint32_t startTid | |
1308 | uint32_t startPos | |
1309 | uint32_t endTid | |
1310 | uint32_t endPos | |
1311 | uint64_t dataOffset (0x1AF05853, 0x1AF07057) | |
1312 | ||
1313 | data block index node leaf (4 bytes + 32*nChildren) | |
1314 | uint8_t isLeaf | |
1315 | uint8_t padding | |
1316 | uint16_t nChildren (2) | |
1317 | uint32_t startTid | |
1318 | uint32_t startPos | |
1319 | uint32_t endTid | |
1320 | uint32_t endPos | |
1321 | uint64_t dataOffset (0x198, 0x1CF) | |
1322 | uint64_t dataSize (55, 24) | |
1323 | ||
1324 | zoom data block | |
1325 | uint32_t number of blocks (10519766) | |
1326 | some data blocks | |
1327 | */ |
0 | #ifndef NOCURL | |
1 | #include <curl/curl.h> | |
2 | #endif | |
3 | #include <stdio.h> | |
4 | #include <stdlib.h> | |
5 | #include <string.h> | |
6 | #include <unistd.h> | |
7 | #include "bigWigIO.h" | |
8 | #include <inttypes.h> | |
9 | #include <errno.h> | |
10 | ||
11 | size_t GLOBAL_DEFAULTBUFFERSIZE; | |
12 | ||
13 | #ifndef NOCURL | |
14 | uint64_t getContentLength(URL_t *URL) { | |
15 | double size; | |
16 | if(curl_easy_getinfo(URL->x.curl, CURLINFO_CONTENT_LENGTH_DOWNLOAD, &size) != CURLE_OK) { | |
17 | return 0; | |
18 | } | |
19 | if(size== -1.0) return 0; | |
20 | return (uint64_t) size; | |
21 | } | |
22 | ||
23 | //Fill the buffer, note that URL may be left in an unusable state on error! | |
24 | CURLcode urlFetchData(URL_t *URL, unsigned long bufSize) { | |
25 | CURLcode rv; | |
26 | char range[1024]; | |
27 | ||
28 | if(URL->filePos != (size_t) -1) URL->filePos += URL->bufLen; | |
29 | else URL->filePos = 0; | |
30 | ||
31 | URL->bufPos = URL->bufLen = 0; //Otherwise, we can't copy anything into the buffer! | |
32 | sprintf(range,"%lu-%lu", URL->filePos, URL->filePos+bufSize-1); | |
33 | rv = curl_easy_setopt(URL->x.curl, CURLOPT_RANGE, range); | |
34 | if(rv != CURLE_OK) { | |
35 | fprintf(stderr, "[urlFetchData] Couldn't set the range (%s)\n", range); | |
36 | return rv; | |
37 | } | |
38 | ||
39 | rv = curl_easy_perform(URL->x.curl); | |
40 | errno = 0; //Sometimes curl_easy_perform leaves a random errno remnant | |
41 | return rv; | |
42 | } | |
43 | ||
44 | //Read data into a buffer, ideally from a buffer already in memory | |
45 | //The loop is likely no longer needed. | |
46 | size_t url_fread(void *obuf, size_t obufSize, URL_t *URL) { | |
47 | size_t remaining = obufSize, fetchSize; | |
48 | void *p = obuf; | |
49 | CURLcode rv; | |
50 | ||
51 | while(remaining) { | |
52 | if(!URL->bufLen) { | |
53 | rv = urlFetchData(URL, URL->bufSize); | |
54 | if(rv != CURLE_OK) { | |
55 | fprintf(stderr, "[url_fread] urlFetchData (A) returned %s\n", curl_easy_strerror(rv)); | |
56 | return 0; | |
57 | } | |
58 | } else if(URL->bufLen < URL->bufPos + remaining) { //Copy the remaining buffer and reload the buffer as needed | |
59 | p = memcpy(p, URL->memBuf+URL->bufPos, URL->bufLen - URL->bufPos); | |
60 | if(!p) return 0; | |
61 | p += URL->bufLen - URL->bufPos; | |
62 | remaining -= URL->bufLen - URL->bufPos; | |
63 | if(remaining) { | |
64 | if(!URL->isCompressed) { | |
65 | fetchSize = URL->bufSize; | |
66 | } else { | |
67 | fetchSize = (remaining<URL->bufSize)?remaining:URL->bufSize; | |
68 | } | |
69 | rv = urlFetchData(URL, fetchSize); | |
70 | if(rv != CURLE_OK) { | |
71 | fprintf(stderr, "[url_fread] urlFetchData (B) returned %s\n", curl_easy_strerror(rv)); | |
72 | return 0; | |
73 | } | |
74 | } | |
75 | } else { | |
76 | p = memcpy(p, URL->memBuf+URL->bufPos, remaining); | |
77 | if(!p) return 0; | |
78 | URL->bufPos += remaining; | |
79 | remaining = 0; | |
80 | } | |
81 | } | |
82 | return obufSize; | |
83 | } | |
84 | #endif | |
85 | ||
86 | //Returns the number of bytes requested or a smaller number on error | |
87 | //Note that in the case of remote files, the actual amount read may be less than the return value! | |
88 | size_t urlRead(URL_t *URL, void *buf, size_t bufSize) { | |
89 | #ifndef NOCURL | |
90 | if(URL->type==0) { | |
91 | return fread(buf, bufSize, 1, URL->x.fp)*bufSize; | |
92 | } else { | |
93 | return url_fread(buf, bufSize, URL); | |
94 | } | |
95 | #else | |
96 | return fread(buf, bufSize, 1, URL->x.fp)*bufSize; | |
97 | #endif | |
98 | } | |
99 | ||
100 | size_t bwFillBuffer(void *inBuf, size_t l, size_t nmemb, void *pURL) { | |
101 | URL_t *URL = (URL_t*) pURL; | |
102 | void *p = URL->memBuf; | |
103 | size_t copied = l*nmemb; | |
104 | if(!p) return 0; | |
105 | ||
106 | p += URL->bufLen; | |
107 | if(l*nmemb > URL->bufSize - URL->bufPos) { //We received more than we can store! | |
108 | copied = URL->bufSize - URL->bufLen; | |
109 | } | |
110 | memcpy(p, inBuf, copied); | |
111 | URL->bufLen += copied; | |
112 | ||
113 | if(!URL->memBuf) return 0; //signal error | |
114 | return copied; | |
115 | } | |
116 | ||
117 | //Seek to an arbitrary location, returning a CURLcode | |
118 | //Note that a local file returns CURLE_OK on success or CURLE_FAILED_INIT on any error; | |
119 | CURLcode urlSeek(URL_t *URL, size_t pos) { | |
120 | #ifndef NOCURL | |
121 | char range[1024]; | |
122 | CURLcode rv; | |
123 | ||
124 | if(URL->type == BWG_FILE) { | |
125 | #endif | |
126 | if(fseek(URL->x.fp, pos, SEEK_SET) == 0) { | |
127 | errno = 0; | |
128 | return CURLE_OK; | |
129 | } else { | |
130 | return CURLE_FAILED_INIT; //This is arbitrary | |
131 | } | |
132 | #ifndef NOCURL | |
133 | } else { | |
134 | //If the location is covered by the buffer then don't seek! | |
135 | if(pos < URL->filePos || pos >= URL->filePos+URL->bufLen) { | |
136 | URL->filePos = pos; | |
137 | URL->bufLen = 0; //Otherwise, filePos will get incremented on the next read! | |
138 | URL->bufPos = 0; | |
139 | //Maybe this works for FTP? | |
140 | sprintf(range,"%lu-%lu", pos, pos+URL->bufSize-1); | |
141 | rv = curl_easy_setopt(URL->x.curl, CURLOPT_RANGE, range); | |
142 | if(rv != CURLE_OK) { | |
143 | fprintf(stderr, "[urlSeek] Couldn't set the range (%s)\n", range); | |
144 | return rv; | |
145 | } | |
146 | rv = curl_easy_perform(URL->x.curl); | |
147 | if(rv != CURLE_OK) { | |
148 | fprintf(stderr, "[urlSeek] curl_easy_perform received an error!\n"); | |
149 | } | |
150 | errno = 0; //Don't propogate remnant resolved libCurl errors | |
151 | return rv; | |
152 | } else { | |
153 | URL->bufPos = pos-URL->filePos; | |
154 | return CURLE_OK; | |
155 | } | |
156 | } | |
157 | #endif | |
158 | } | |
159 | ||
160 | URL_t *urlOpen(char *fname, CURLcode (*callBack)(CURL*), const char *mode) { | |
161 | URL_t *URL = calloc(1, sizeof(URL_t)); | |
162 | if(!URL) return NULL; | |
163 | char *url = NULL, *req = NULL; | |
164 | #ifndef NOCURL | |
165 | CURLcode code; | |
166 | char range[1024]; | |
167 | #endif | |
168 | ||
169 | URL->fname = fname; | |
170 | ||
171 | if((!mode) || (strchr(mode, 'w') == 0)) { | |
172 | //Set the protocol | |
173 | #ifndef NOCURL | |
174 | if(strncmp(fname, "http://", 7) == 0) URL->type = BWG_HTTP; | |
175 | else if(strncmp(fname, "https://", 8) == 0) URL->type = BWG_HTTPS; | |
176 | else if(strncmp(fname, "ftp://", 6) == 0) URL->type = BWG_FTP; | |
177 | else URL->type = BWG_FILE; | |
178 | #else | |
179 | URL->type = BWG_FILE; | |
180 | #endif | |
181 | ||
182 | //local file? | |
183 | if(URL->type == BWG_FILE) { | |
184 | URL->filePos = -1; //This signals that nothing has been read | |
185 | URL->x.fp = fopen(fname, "rb"); | |
186 | if(!(URL->x.fp)) { | |
187 | free(URL); | |
188 | fprintf(stderr, "[urlOpen] Couldn't open %s for reading\n", fname); | |
189 | return NULL; | |
190 | } | |
191 | #ifndef NOCURL | |
192 | } else { | |
193 | //Remote file, set up the memory buffer and get CURL ready | |
194 | URL->memBuf = malloc(GLOBAL_DEFAULTBUFFERSIZE); | |
195 | if(!(URL->memBuf)) { | |
196 | free(URL); | |
197 | fprintf(stderr, "[urlOpen] Couldn't allocate enough space for the file buffer!\n"); | |
198 | return NULL; | |
199 | } | |
200 | URL->bufSize = GLOBAL_DEFAULTBUFFERSIZE; | |
201 | URL->x.curl = curl_easy_init(); | |
202 | if(!(URL->x.curl)) { | |
203 | fprintf(stderr, "[urlOpen] curl_easy_init() failed!\n"); | |
204 | goto error; | |
205 | } | |
206 | //Negotiate a reasonable HTTP authentication method | |
207 | if(curl_easy_setopt(URL->x.curl, CURLOPT_HTTPAUTH, CURLAUTH_ANY) != CURLE_OK) { | |
208 | fprintf(stderr, "[urlOpen] Failed instructing curl to use any HTTP authentication it finds to be suitable!\n"); | |
209 | goto error; | |
210 | } | |
211 | //Follow redirects | |
212 | if(curl_easy_setopt(URL->x.curl, CURLOPT_FOLLOWLOCATION, 1L) != CURLE_OK) { | |
213 | fprintf(stderr, "[urlOpen] Failed instructing curl to follow redirects!\n"); | |
214 | goto error; | |
215 | } | |
216 | //Set the URL | |
217 | if(curl_easy_setopt(URL->x.curl, CURLOPT_URL, fname) != CURLE_OK) { | |
218 | fprintf(stderr, "[urlOpen] Couldn't set CURLOPT_URL!\n"); | |
219 | goto error; | |
220 | } | |
221 | //Set the range, which doesn't do anything for HTTP | |
222 | sprintf(range, "0-%lu", URL->bufSize-1); | |
223 | if(curl_easy_setopt(URL->x.curl, CURLOPT_RANGE, range) != CURLE_OK) { | |
224 | fprintf(stderr, "[urlOpen] Couldn't set CURLOPT_RANGE (%s)!\n", range); | |
225 | goto error; | |
226 | } | |
227 | //Set the callback info, which means we no longer need to directly deal with sockets and header! | |
228 | if(curl_easy_setopt(URL->x.curl, CURLOPT_WRITEFUNCTION, bwFillBuffer) != CURLE_OK) { | |
229 | fprintf(stderr, "[urlOpen] Couldn't set CURLOPT_WRITEFUNCTION!\n"); | |
230 | goto error; | |
231 | } | |
232 | if(curl_easy_setopt(URL->x.curl, CURLOPT_WRITEDATA, (void*)URL) != CURLE_OK) { | |
233 | fprintf(stderr, "[urlOpen] Couldn't set CURLOPT_WRITEDATA!\n"); | |
234 | goto error; | |
235 | } | |
236 | //Ignore certificate errors with https, libcurl just isn't reliable enough with conda | |
237 | if(curl_easy_setopt(URL->x.curl, CURLOPT_SSL_VERIFYPEER, 0) != CURLE_OK) { | |
238 | fprintf(stderr, "[urlOpen] Couldn't set CURLOPT_SSL_VERIFYPEER to 0!\n"); | |
239 | goto error; | |
240 | } | |
241 | if(curl_easy_setopt(URL->x.curl, CURLOPT_SSL_VERIFYHOST, 0) != CURLE_OK) { | |
242 | fprintf(stderr, "[urlOpen] Couldn't set CURLOPT_SSL_VERIFYHOST to 0!\n"); | |
243 | goto error; | |
244 | } | |
245 | if(callBack) { | |
246 | code = callBack(URL->x.curl); | |
247 | if(code != CURLE_OK) { | |
248 | fprintf(stderr, "[urlOpen] The user-supplied call back function returned an error: %s\n", curl_easy_strerror(code)); | |
249 | goto error; | |
250 | } | |
251 | } | |
252 | code = curl_easy_perform(URL->x.curl); | |
253 | errno = 0; //Sometimes curl_easy_perform leaves a random errno remnant | |
254 | if(code != CURLE_OK) { | |
255 | fprintf(stderr, "[urlOpen] curl_easy_perform received an error: %s\n", curl_easy_strerror(code)); | |
256 | goto error; | |
257 | } | |
258 | #endif | |
259 | } | |
260 | } else { | |
261 | URL->type = BWG_FILE; | |
262 | URL->x.fp = fopen(fname, mode); | |
263 | if(!(URL->x.fp)) { | |
264 | free(URL); | |
265 | fprintf(stderr, "[urlOpen] Couldn't open %s for writing\n", fname); | |
266 | return NULL; | |
267 | } | |
268 | } | |
269 | if(url) free(url); | |
270 | if(req) free(req); | |
271 | return URL; | |
272 | ||
273 | #ifndef NOCURL | |
274 | error: | |
275 | if(url) free(url); | |
276 | if(req) free(req); | |
277 | free(URL->memBuf); | |
278 | curl_easy_cleanup(URL->x.curl); | |
279 | free(URL); | |
280 | return NULL; | |
281 | #endif | |
282 | } | |
283 | ||
284 | //Performs the necessary free() operations and handles cleaning up curl | |
285 | void urlClose(URL_t *URL) { | |
286 | if(URL->type == BWG_FILE) { | |
287 | fclose(URL->x.fp); | |
288 | #ifndef NOCURL | |
289 | } else { | |
290 | free(URL->memBuf); | |
291 | curl_easy_cleanup(URL->x.curl); | |
292 | #endif | |
293 | } | |
294 | free(URL); | |
295 | } |
196 | 196 | |
197 | 197 | //Return 1 if there are any entries at all |
198 | 198 | int hasEntries(bigWigFile_t *bw) { |
199 | if(bw->hdr->nBasesCovered > 0) return 1; | |
199 | if(bw->hdr->indexOffset != 0) return 1; // No index, no entries pyBigWig issue #111 | |
200 | //if(bw->hdr->nBasesCovered > 0) return 1; // Sometimes headers are broken | |
200 | 201 | return 0; |
201 | 202 | } |
202 | 203 | |
377 | 378 | double *val; |
378 | 379 | uint32_t start, end = -1, tid; |
379 | 380 | unsigned long startl = 0, endl = -1; |
380 | static char *kwd_list[] = {"chrom", "start", "end", "type", "nBins", "exact", NULL}; | |
381 | static char *kwd_list[] = {"chrom", "start", "end", "type", "nBins", "exact", "numpy", NULL}; | |
381 | 382 | char *chrom, *type = "mean"; |
382 | 383 | PyObject *ret, *exact = Py_False, *starto = NULL, *endo = NULL; |
384 | PyObject *outputNumpy = Py_False; | |
383 | 385 | int i, nBins = 1; |
384 | 386 | errno = 0; //In the off-chance that something elsewhere got an error and didn't clear it... |
385 | 387 | |
398 | 400 | return NULL; |
399 | 401 | } |
400 | 402 | |
401 | if(!PyArg_ParseTupleAndKeywords(args, kwds, "s|OOsiO", kwd_list, &chrom, &starto, &endo, &type, &nBins, &exact)) { | |
403 | if(!PyArg_ParseTupleAndKeywords(args, kwds, "s|OOsiOO", kwd_list, &chrom, &starto, &endo, &type, &nBins, &exact, &outputNumpy)) { | |
402 | 404 | PyErr_SetString(PyExc_RuntimeError, "You must supply at least a chromosome!"); |
403 | 405 | return NULL; |
404 | 406 | } |
463 | 465 | |
464 | 466 | //Return a list of None if there are no entries at all |
465 | 467 | if(!hasEntries(bw)) { |
466 | ret = PyList_New(nBins); | |
467 | for(i=0; i<nBins; i++) { | |
468 | Py_INCREF(Py_None); | |
469 | PyList_SetItem(ret, i, Py_None); | |
470 | } | |
468 | #ifdef WITHNUMPY | |
469 | if(outputNumpy == Py_True) { | |
470 | val = malloc(sizeof(double)*nBins); | |
471 | for(i=0; i<nBins; i++) { | |
472 | val[i] = NPY_NAN; | |
473 | } | |
474 | npy_intp len = nBins; | |
475 | ret = PyArray_SimpleNewFromData(1, &len, NPY_FLOAT64, (void *) val); | |
476 | //This will break if numpy ever stops using malloc! | |
477 | PyArray_ENABLEFLAGS((PyArrayObject*) ret, NPY_ARRAY_OWNDATA); | |
478 | } else { | |
479 | #endif | |
480 | ret = PyList_New(nBins); | |
481 | for(i=0; i<nBins; i++) { | |
482 | Py_INCREF(Py_None); | |
483 | PyList_SetItem(ret, i, Py_None); | |
484 | } | |
485 | #ifdef WITHNUMPY | |
486 | } | |
487 | #endif | |
471 | 488 | return ret; |
472 | 489 | } |
473 | 490 | |
483 | 500 | return NULL; |
484 | 501 | } |
485 | 502 | |
486 | ret = PyList_New(nBins); | |
487 | for(i=0; i<nBins; i++) { | |
488 | if(isnan(val[i])) { | |
489 | Py_INCREF(Py_None); | |
490 | PyList_SetItem(ret, i, Py_None); | |
491 | } else { | |
492 | PyList_SetItem(ret, i, PyFloat_FromDouble(val[i])); | |
493 | } | |
494 | } | |
495 | free(val); | |
496 | ||
503 | #ifdef WITHNUMPY | |
504 | if(outputNumpy == Py_True) { | |
505 | npy_intp len = nBins; | |
506 | ret = PyArray_SimpleNewFromData(1, &len, NPY_FLOAT64, (void *) val); | |
507 | //This will break if numpy ever stops using malloc! | |
508 | PyArray_ENABLEFLAGS((PyArrayObject*) ret, NPY_ARRAY_OWNDATA); | |
509 | } else { | |
510 | #endif | |
511 | ret = PyList_New(nBins); | |
512 | for(i=0; i<nBins; i++) { | |
513 | if(isnan(val[i])) { | |
514 | Py_INCREF(Py_None); | |
515 | PyList_SetItem(ret, i, Py_None); | |
516 | } else { | |
517 | PyList_SetItem(ret, i, PyFloat_FromDouble(val[i])); | |
518 | } | |
519 | } | |
520 | free(val); | |
521 | #ifdef WITHNUMPY | |
522 | } | |
523 | #endif | |
497 | 524 | return ret; |
498 | 525 | } |
499 | 526 |
0 | LICENSE.txt | |
1 | MANIFEST.in | |
2 | README.md | |
3 | pyBigWig.c | |
4 | pyBigWig.h | |
5 | setup.cfg | |
6 | setup.py | |
7 | libBigWig/LICENSE | |
8 | libBigWig/README.md | |
9 | libBigWig/bigWig.h | |
10 | libBigWig/bigWigIO.h | |
11 | libBigWig/bwCommon.h | |
12 | libBigWig/bwRead.c | |
13 | libBigWig/bwStats.c | |
14 | libBigWig/bwValues.c | |
15 | libBigWig/bwValues.h | |
16 | libBigWig/bwWrite.c | |
17 | libBigWig/io.c | |
18 | pyBigWigTest/__init__.py | |
19 | pyBigWigTest/test.bigBed | |
20 | pyBigWigTest/test.bw | |
21 | pyBigWigTest/test.py⏎ |
1 | 1 | #include <structmember.h> |
2 | 2 | #include "bigWig.h" |
3 | 3 | |
4 | #define pyBigWigVersion "0.3.17" | |
4 | #define pyBigWigVersion "0.3.18" | |
5 | 5 | |
6 | 6 | typedef struct { |
7 | 7 | PyObject_HEAD |
307 | 307 | |
308 | 308 | bw.close() |
309 | 309 | os.remove("/tmp/delete.bw") |
310 | ||
311 | def testNumpyValues(self): | |
312 | if pyBigWig.numpy == 0: | |
313 | return 0 | |
314 | import numpy as np | |
315 | ||
316 | fname = "http://raw.githubusercontent.com/dpryan79/pyBigWig/master/pyBigWigTest/test.bw" | |
317 | bw = pyBigWig.open(fname, "r") | |
318 | ||
319 | assert np.allclose( | |
320 | bw.values("1", 0, 20, numpy=True), | |
321 | np.array(bw.values("1", 0, 20), dtype=np.float32), | |
322 | equal_nan=True | |
323 | ) | |
324 | ||
325 | assert np.allclose( | |
326 | bw.stats("1", 0, 20, "mean", 5, numpy=True), | |
327 | np.array(bw.stats("1", 0, 20, "mean", 5), dtype=np.float64), | |
328 | equal_nan=True | |
329 | ) |