Codebase list tika / b887c01
Imported Upstream version 1.5 Emmanuel Bourg 9 years ago
552 changed file(s) with 135839 addition(s) and 0 deletion(s). Raw diff Collapse all Expand all
0 tika-parsers/src/test/resources/test-documents/testARofText.ar eol=lf
1 tika-parsers/src/test/resources/test-documents/testEMLX.emlx eol=lf
2 tika-parsers/src/test/resources/test-documents/testTXT.txt eol=lf
3 tika-parsers/src/test/resources/test-documents/testHTML.html eol=lf
0 target
1 .idea
2 .classpath
3 .project
4 .settings
5 *.iml
6 *.ipr
7 *.iws
0 Release 1.5 - 02/04/2014
1
2 * Fixed bug in handling of embedded file processing in PDFs (TIKA-1228).
3
4 * Added SourceCodeParser to support Java, Groovy, C++ files (TIKA-1224).
5
6 * Updated Tika Server to support multipart/form-data payloads (TIKA-1198).
7
8 * Updated Tika Server to CXF 2.7.8 (TIKA-1197).
9
10 * Updated Tika Server to accept requests over wildcard addresses (TIKA-1196).
11
12 * Added option to use alternate NonSequentialPDFParser (TIKA-1201).
13
14 * Content from PDF AcroForms is now extracted (TIKA-973).
15
16 * Fixed invalid asterisks from master slide in PPT (TIKA-1171).
17
18 * Added test cases to confirm handling of auto-date in PPT and PPTX (TIKA-817).
19
20 * Text from tables in PPT files is once again extracted correctly (TIKA-1076).
21
22 * Text is extracted from text boxes in XLSX (TIKA-1100).
23
24 * Tika no longer hangs when processing Excel files with custom fraction format (TIKA-1132).
25
26 * Disconcerting stacktrace from missing beans no longer printed for some DOCX files (TIKA-792).
27
28 * Upgraded POI to 3.10-beta2 (TIKA-1173).
29
30 * Upgraded PDFBox to 1.8.4 (TIKA-1230).
31
32 * Made HtmlEncodingDetector more flexible in finding meta
33 header charset (TIKA-1001).
34
35 * Added sanitized test HTML file for local file test (TIKA-1139).
36
37 * Fixed bug that prevented attachments within a PDF from being processed
38 if the PDF itself was an attachment (TIKA-1124).
39
40 * Text from paragraph-level structured document tags in DOCX files is now extracted (TIKA-1130).
41
42 * RTF: Fixed ArrayIndexOutOfBoundsException when parsing list override (TIKA-1192).
43
44 * CLI: TikaCLI now escapes invalid filename characters as hex
45 characters (TIKA-1078).
46
47 Release 1.4 - 06/15/2013
48
49 * Removed a test HTML file with a poorly chosen GPL text in it (TIKA-1129).
50
51 * Improvements to tika-server to allow it to produce text/html and
52 text/xml content (TIKA-1126, TIKA-1127).
53
54 * Improvements were made to the Compressor Parser to handle g'zipped files
55 that require the decompressConcatenated option set to true (TIKA-1096).
56
57 * Addressed a typographic error that was preventing detection of
58 awk files (TIKA-1081).
59
60 * Added a new end-point to Tika's JAX-RS REST server that only detects
61 the media-type based on a small portion of the document submitted
62 (TIKA-1047).
63
64 * RTF: Ordered and unordered lists are now extracted (TIKA-1062).
65
66 * MP3: Audio duration is now extracted (TIKA-991)
67
68 * Java .class files: upgraded from ASM 3.1 to ASM 4.1 for parsing
69 the Java bytecodes (TIKA-1053).
70
71 * Mime Types: Definitions extended to optionally include Link (URL) and
72 UTI, along with details for several common formats (TIKA-1012 / TIKA-1083)
73
74 * Exceptions when parsing OLE10 embedded documents, when parsing
75 summary information from Office documents, and when saving
76 embedded documents in TikaCLI are now logged instead
77 of aborting extraction (TIKA-1074)
78
79 * MS Word: line tabular character is now replaced with newline
80 (TIKA-1128)
81
82 * XML: ElementMetadataHandlers can now optionally accept duplicate
83 and empty values (TIKA-1133)
84
85 Release 1.3 - 01/19/2013
86
87 * Mimetype definitions added for more common programming languages,
88 including common extensions, but not magic patterns. (TIKA-1055)
89
90 * MS Word: When a Word (.doc) document contains embedded files or
91 links to external documents, Tika now places a <div
92 class="embedded" id="_XXX"/> placeholder into the XHTML so you can
93 see where in the main text the embedded document occurred
94 (TIKA-956, TIKA-1019). Embedded Wordpad/RTF documents are now
95 recognized (TIKA-982).
96
97 * PDF: Text from pop-up annotations is now extracted (TIKA-981).
98 Text from bookmarks is now extracted (TIKA-1035).
99
100 * PKCS7: Detached signatures no longer throw NullPointerException
101 (TIKA-986).
102
103 * iWork: The chart name for charts embedded in numbers documents is
104 now extracted (TIKA-918).
105
106 * CLI: TikaCLI -m now handles multi-valued metadata keys correctly
107 (previously it only printed the first value). (TIKA-920)
108
109 * MS Word (.docx): When a Word (.docx) document contains embedded
110 files, Tika now places a <div class="embedded" id="XXX"/> into the
111 XHTML so you can see where in the main text the embedded document
112 occurred. The id (rId) is included in the Metadata of each
113 embedded document as the new Metadata.EMBEDDED_RELATIONSHIP_ID
114 key, and TikaCLI prepends the rId (if present) onto the filename
115 it extracts (TIKA-989). Fixed NullPointerException when style is
116 null (TIKA-1006). Text inside text boxes is now extracted
117 (TIKA-1005).
118
119 * RTF: Page, word, character count and creation date metadata are
120 now extracted for RTF documents (TIKA-999).
121
122 * MS PowerPoint (.pptx): When a PowerPoint (.pptx) document contains
123 embedded files, Tika now places a <div class="embedded" id="XXX"/> into the
124 XHTML so you can see where in the main text the embedded document
125 occurred. The id (rId) is included in the Metadata of each
126 embedded document as the new Metadata.EMBEDDED_RELATIONSHIP_ID
127 key, and TikaCLI prepends the rId (if present) onto the filename
128 it extracts (TIKA-997, TIKA-1032).
129
130 * MS PowerPoint (.ppt): When a PowerPoint (.ppt) document contains
131 embedded files, Tika now places a <div class="embedded" id="XXX"/> into the
132 XHTML so you can see where in the main text the embedded document
133 occurred (TIKA-1025). Text from the master slide is now extracted
134 (TIKA-712).
135
136 * MHTML: fixed Null charset name exception when a mime part has an
137 unrecognized charset (TIKA-1011).
138
139 * MP3: if an ID3 tag was encoded in UTF-16 with only the BOM then on
140 certain JVMs this would incorrectly extract the BOM as the tag's
141 value (TIKA-1024).
142
143 * ZIP: placeholders (<div class="embedded" id="<entry name>"/>) are
144 now left in the XHTML so you can see where each archive member
145 appears (TIKA-1036). TikaCLI would hit FileNotFoundException when
146 extracting files that were under sub-directories from a ZIP
147 archive, because it failed to create the parent directories first
148 (TIKA-1031).
149
150 * XML: a space character is now added before each element
151 (TIKA-1048)
152
153 Release 1.2 - 07/10/2012
154 ---------------------------------
155
156 * Tika's JAX-RS based Network server now is based on Apache CXF,
157 which is available in Maven Central and now allows the server
158 module to be packaged and included in our release
159 (TIKA-593, TIKA-901).
160
161 * Tika: parseToString now lets you specify the max string length
162 per-call, in addition to per-Tika-instance. (TIKA-870)
163
164 * Tika now has the ability to detect FITS (Flexible Image Transport System)
165 files (TIKA-874).
166
167 * Images: Fixed file handle leak in ImageParser. (TIKA-875)
168
169 * iWork: Comments in Pages files are now extracted (TIKA-907).
170 Headers, footers and footnotes in Pages files are now extracted
171 (TIKA-906). Don't throw NullPointerException on password
172 protected iWork files, even though we can't parse their contents
173 yet (TIKA-903). Text extracted from Keynote text boxes and bullet
174 points no longer runs together (TIKA-910). Also extract text for
175 Pages documents created in layout mode (TIKA-904). Table names
176 are now extracted in Numbers documents (TIKA-924). Content added
177 to master slides is also extracted (TIKA-923).
178
179 * Archive and compression formats: The Commons Compress dependency was
180 upgraded from 1.3 to 1.4.1. With this change Tika can now parse also
181 Unix dump archives and documents compressed using the XZ and Pack200
182 compression formats. (TIKA-932)
183
184 * KML: Tika now has basic support for Keyhole Markup Language documents
185 (KML and KMZ) used by tools like Google Earth. See also
186 http://www.opengeospatial.org/standards/kml/. (TIKA-941)
187
188 * CLI: You can now use the TIKA_PASSWORD environment variable or the
189 --password=X command line option to specify the password that Tika CLI
190 should use for opening encrypted documents (TIKA-943).
191
192 * Character encodings: Tika's character encoding detection mechanism was
193 improved by adding integration to the juniversalchardet library that
194 implements Mozilla's universal charset detection algorithm. The slower
195 ICU4J algorithms are still used as a fallback thanks to their wider
196 coverage of custom character encodings. (TIKA-322, TIKA-471)
197
198 * Charset parameter: Related to the character encoding improvements
199 mentioned above, Tika now returns the detected character encoding as
200 a "charset" parameter of the content type metadata field for text/plain
201 and text/html documents. For example, instead of just "text/plain", the
202 returned content type will be something like "text/plain; charset=UTF-8"
203 for a UTF-8 encoded text document. Character encoding information is still
204 present also in the content encoding metadata field for backwards
205 compatibility, but that field should be considered deprecated. (TIKA-431)
206
207 * Extraction of embedded resources from OLE2 Office Documents, where
208 the resource isn't another office document, has been fixed (TIKA-948)
209
210 Release 1.1 - 3/7/2012
211 ---------------------------------
212
213 * Link Extraction: The rel attribute is now extracted from
214 links per the LinkContentHandler. (TIKA-824)
215
216 * MP3: Fixed handling of UTF-16 (two byte) ID3v2 tags (previously
217 the last character in a UTF-16 tag could be corrupted) (TIKA-793)
218
219 * Performance: Loading of the default media type registry is now
220 significantly faster. (TIKA-780)
221
222 * PDF: Allow controlling whether overlapping duplicated text should
223 be removed. Disabling this (the default) can give big
224 speedups to text extraction and may work around cases where
225 non-duplicated characters were incorrectly removed (TIKA-767).
226 Allow controlling whether text tokens should be sorted by their x/y
227 position before extracting text (TIKA-612); this is necessary for
228 certain PDFs. Fixed cases where too many </p> tags appear in the
229 XHTML output, causing NPE when opening some PDFs with the GUI
230 (TIKA-778).
231
232 * RTF: Fixed case where a font change would result in processing
233 bytes in the wrong font's charset, producing bogus text output
234 (TIKA-777). Don't output whitespace in ignored group states,
235 avoiding excessive whitespace output (TIKA-781). Binary embedded
236 content (using \bin control word) is now skipped correctly;
237 previously it could cause the parser to incorrectly extract binary
238 content as text (TIKA-782).
239
240 * CLI: New TikaCLI option "--list-detectors", which displays the
241 mimetype detectors that are available, similar to the existing
242 "--list-parsers" option for parsers. (TIKA-785).
243
244 * Detectors: The order of detectors, as supplied via the service
245 registry loader, is now controlled. User supplied detectors are
246 preferred, then Tika detectors (such as the container aware ones),
247 and finally the core Tika MimeTypes is used as a backup. This
248 allows for specific, detailed detectors to take preference over
249 the default mime magic + filename detector. (TIKA-786)
250
251 * Microsoft Project (MPP): Filetype detection has been fixed,
252 and basic metadata (but no text) is now extracted. (TIKA-789)
253
254 * Outlook: fixed NullPointerException in TikaGUI when messages with
255 embedded RTF or HTML content were filtered (TIKA-801).
256
257 * Ogg Vorbis and FLAC: Parser added for Ogg Vorbis and FLAC audio
258 files, which extract audio metadata and tags (TIKA-747)
259
260 * MP4: Improved mime magic detection for MP4 based formats (including
261 QuickTime, MP4 Video and Audio, and 3GPP) (TIKA-851)
262
263 * MP4: Basic metadata extracting parser for MP4 files added, which includes
264 limited audio and video metadata, along with the iTunes media metadata
265 (such as Artist and Title) (TIKA-852)
266
267 * Document Passwords: A new ParseContext object, PasswordProvider,
268 has been added. This provides a way to supply the password for
269 a document during processing. Currently, only password protected
270 PDFs and Microsoft OOXML Files are supported. (TIKA-850)
271
272 Release 1.0 - 11/4/2011
273 ---------------------------------
274
275 The most notable changes in Tika 1.0 over previous releases are:
276
277 * API: All methods, classes and interfaces that were marked as
278 deprecated in Tika 0.10 have been removed to clean up the API
279 (TIKA-703). You may need to adjust and recompile client code
280 accordingly. The declared OSGi package versions are now 1.0, and
281 will thus not resolve for client bundles that still refer to 0.x
282 versions (TIKA-565).
283
284 * Configuration: The context class loader of the current thread is
285 no longer used as the default for loading configured parser and
286 detector classes. You can still pass an explicit class loader
287 to the configuration mechanism to get the previous behaviour.
288 (TIKA-565)
289
290 * OSGi: The tika-core bundle will now automatically pick up and use
291 any available Parser and Detector services when deployed to an OSGi
292 environment. The tika-parsers bundle provides such services
293 for all the supported file formats for which the upstream parser library
294 is available. If you don't want to track all the parser libraries as
295 separate OSGi bundles, you can use the tika-bundle bundle that packages
296 tika-parsers together with all its upstream dependencies. (TIKA-565)
297
298 * RTF: Hyperlinks in RTF documents are now extracted as an <a
299 href=...>...</a> element (TIKA-632). The RTF parser is also now
300 more robust when encountering too many closing }'s vs. opening {'s
301 (TIKA-733).
302
303 * MS Word: From Word (.doc) documents we now extract optional hyphen
304 as Unicode zero-width space (U+200B), and non-breaking hyphen as
305 Unicode non-breaking hyphen (U+2011). (TIKA-711)
306
307 * Outlook: Tika can now process also attachments in Outlook messages.
308 (TIKA-396)
309
310 * MS Office: Performance of extracting embedded office docs was improved.
311 (TIKA-753)
312
313 * PDF: The PDF parser now extracts paragraphs within each page
314 (TIKA-742) and can now optionally extract text from PDF
315 annotations (TIKA-738). There's also an option to enable (the
316 default) or disable auto-space insertion (TIKA-724).
317
318 * Language detection: Tika can now detect Belarusian, Catalan,
319 Esperanto, Galician, Lithuanian (TIKA-582), Romanian, Slovak,
320 Slovenian, and Ukrainian (TIKA-681).
321
322 * Java: Tika no longer ships retrotranslated Java 1.4 binaries along
323 with the normal ones that work with Java 5 and higher. (TIKA-744)
324
325 * OpenOffice documents: header/footer text is now extracted for text,
326 presentation and spreadsheet documents (TIKA-736)
327
328 Tika 1.0 relies on the following set of major dependencies (generated using
329 mvn dependency:tree from tika-parsers):
330
331 org.apache.tika:tika-parsers:bundle:1.0
332 +- org.apache.tika:tika-core:jar:1.0:compile
333 +- edu.ucar:netcdf:jar:4.2-min:compile
334 | \- org.slf4j:slf4j-api:jar:1.5.6:compile
335 +- org.apache.james:apache-mime4j-core:jar:0.7:compile
336 +- org.apache.james:apache-mime4j-dom:jar:0.7:compile
337 +- org.apache.commons:commons-compress:jar:1.3:compile
338 +- commons-codec:commons-codec:jar:1.5:compile
339 +- org.apache.pdfbox:pdfbox:jar:1.6.0:compile
340 | +- org.apache.pdfbox:fontbox:jar:1.6.0:compile
341 | +- org.apache.pdfbox:jempbox:jar:1.6.0:compile
342 | \- commons-logging:commons-logging:jar:1.1.1:compile
343 +- org.bouncycastle:bcmail-jdk15:jar:1.45:compile
344 +- org.bouncycastle:bcprov-jdk15:jar:1.45:compile
345 +- org.apache.poi:poi:jar:3.8-beta4:compile
346 +- org.apache.poi:poi-scratchpad:jar:3.8-beta4:compile
347 +- org.apache.poi:poi-ooxml:jar:3.8-beta4:compile
348 | +- org.apache.poi:poi-ooxml-schemas:jar:3.8-beta4:compile
349 | | \- org.apache.xmlbeans:xmlbeans:jar:2.3.0:compile
350 | \- dom4j:dom4j:jar:1.6.1:compile
351 +- org.apache.geronimo.specs:geronimo-stax-api_1.0_spec:jar:1.0.1:compile
352 +- org.ccil.cowan.tagsoup:tagsoup:jar:1.2.1:compile
353 +- asm:asm:jar:3.1:compile
354 +- com.drewnoakes:metadata-extractor:jar:2.4.0-beta-1:compile
355 +- de.l3s.boilerpipe:boilerpipe:jar:1.1.0:compile
356 +- rome:rome:jar:0.9:compile
357 \- jdom:jdom:jar:1.0:compile
358
359 The following people have contributed to Tika 1.0 by submitting or commenting
360 on the issues resolved in this release:
361
362 Andrzej Bialecki
363 Antoni Mylka
364 Benson Margulies
365 Chris A. Mattmann
366 Cristian Vat
367 Dave Meikle
368 David Smiley
369 Dennis Adler
370 Erik Hetzner
371 Ingo Renner
372 Jeremias Maerki
373 Jeremy Anderson
374 Jeroen van Vianen
375 John Bartak
376 Jukka Zitting
377 Julien Nioche
378 Ken Krugler
379 Mark Butler
380 Maxim Valyanskiy
381 Michael Bryant
382 Michael McCandless
383 Nick Burch
384 Pablo Queixalos
385 Uwe Schindler
386 Žygimantas Medelis
387
388
389 See http://s.apache.org/Zk6 for more details on these contributions.
390
391
392 Release 0.10 - 09/25/2011
393 -------------------------
394
395 The most notable changes in Tika 0.10 over previous releases are:
396
397 * A parser for CHM help files was added. (TIKA-245)
398
399 * TIKA-698: Invalid characters are now replaced with the Unicode
400 replacement character (U+FFFD), whereas before such characters were
401 replaced with spaces, so you may need to change your processing of
402 Tika's output to now handle U+FFFD.
403
404 * The RTF parser was rewritten to perform its own direct shallow
405 parse of the RTF content, instead of using RTFEditorKit from
406 javax.swing. This fixes several issues in the old parser,
407 including doubling of Unicode characters in certain cases
408 (TIKA-683), exceptions on mal-formed RTF docs (TIKA-666), and
409 missing text from some elements (header/footer, hyperlinks,
410 footnotes, text inside pictures).
411
412 * Handling of temporary files within Tika was much improved
413 (TIKA-701, TIKA-654, TIKA-645, TIKA-153)
414
415 * The Tika GUI got a facelift and some extra features (TIKA-635)
416
417 * The apache-mime4j dependency of the email message parser was upgraded
418 from version 0.6 to 0.7 (TIKA-716). The parser also now accepts a
419 MimeConfig object in the ParseContext as configuration (TIKA-640).
420
421 Tika 0.10 relies on the following set of major dependencies (generated using
422 mvn dependency:tree from tika-parsers):
423
424 org.apache.tika:tika-parsers:bundle:0.10
425 +- org.apache.tika:tika-core:jar:0.10:compile
426 +- edu.ucar:netcdf:jar:4.2-min:compile
427 | \- org.slf4j:slf4j-api:jar:1.5.6:compile
428 +- org.apache.james:apache-mime4j-core:jar:0.7:compile
429 +- org.apache.james:apache-mime4j-dom:jar:0.7:compile
430 +- org.apache.commons:commons-compress:jar:1.1:compile
431 +- commons-codec:commons-codec:jar:1.4:compile
432 +- org.apache.pdfbox:pdfbox:jar:1.6.0:compile
433 | +- org.apache.pdfbox:fontbox:jar:1.6.0:compile
434 | +- org.apache.pdfbox:jempbox:jar:1.6.0:compile
435 | \- commons-logging:commons-logging:jar:1.1.1:compile
436 +- org.bouncycastle:bcmail-jdk15:jar:1.45:compile
437 +- org.bouncycastle:bcprov-jdk15:jar:1.45:compile
438 +- org.apache.poi:poi:jar:3.8-beta4:compile
439 +- org.apache.poi:poi-scratchpad:jar:3.8-beta4:compile
440 +- org.apache.poi:poi-ooxml:jar:3.8-beta4:compile
441 | +- org.apache.poi:poi-ooxml-schemas:jar:3.8-beta4:compile
442 | | \- org.apache.xmlbeans:xmlbeans:jar:2.3.0:compile
443 | \- dom4j:dom4j:jar:1.6.1:compile
444 +- org.apache.geronimo.specs:geronimo-stax-api_1.0_spec:jar:1.0.1:compile
445 +- org.ccil.cowan.tagsoup:tagsoup:jar:1.2.1:compile
446 +- asm:asm:jar:3.1:compile
447 +- com.drewnoakes:metadata-extractor:jar:2.4.0-beta-1:compile
448 +- de.l3s.boilerpipe:boilerpipe:jar:1.1.0:compile
449 +- rome:rome:jar:0.9:compile
450 \- jdom:jdom:jar:1.0:compile
451
452 The following people have contributed to Tika 0.10 by submitting or commenting
453 on the issues resolved in this release:
454
455 Alain Viret
456 Alex Ott
457 Alexander Chow
458 Andreas Kemkes
459 Andrew Khoury
460 Babak Farhang
461 Benjamin Douglas
462 Benson Margulies
463 Chris A. Mattmann
464 chris hudson
465 Chris Lott
466 Cristian Vat
467 Curt Arnold
468 Cynthia L Wong
469 Dave Brosius
470 David Benson
471 Enrico Donelli
472 Erik Hetzner
473 Erna de Groot
474 Gabriele Columbro
475 Gavin
476 Geoff Jarrad
477 Gregory Kanevsky
478 gunter rombauts
479 Henning Gross
480 Henri Bergius
481 Ingo Renner
482 Ingo Wiarda
483 Izaak Alpert
484 Jan Høydahl
485 Jens Wilmer
486 Jeremy Anderson
487 Joseph Vychtrle
488 Joshua Turner
489 Jukka Zitting
490 Julien Nioche
491 Karl Heinz Marbaise
492 Ken Krugler
493 Kostya Gribov
494 Luciano Leggieri
495 Mads Hansen
496 Mark Butler
497 Matt Sheppard
498 Maxim Valyanskiy
499 Michael McCandless
500 Michael Pisula
501 Murad Shahid
502 Nick Burch
503 Oleg Tikhonov
504 Pablo Queixalos
505 Paul Jakubik
506 Raimund Merkert
507 Rajiv Kumar
508 Robert Trickey
509 Sami Siren
510 samraj
511 Selva Ganesan
512 Sjoerd Smeets
513 Stephen Duncan Jr
514 Tran Nam Quang
515 Uwe Schindler
516 Vitaliy Filippov
517
518 See http://s.apache.org/vR for more details on these contributions.
519
520
521 Release 0.9 - 02/13/2011
522 ------------------------
523
524 The most notable changes in Tika 0.9 over previous releases are:
525
526 * A critical bugfix preventing metadata from printing to the
527 command line when the underlying Parser didn't generate
528 XHTML output was fixed. (TIKA-596)
529
530 * The 0.8 version of Tika included a NetCDF jar file that pulled
531 in tremendous amounts of redundant dependencies. This has
532 been addressed in Tika 0.9 by republishing a minimal NetCDF
533 jar and changing Tika to depend on that. (TIKA-556)
534
535 * MIME detection for iWork, and OpenXML documents has been
536 improved. (TIKA-533, TIKA-562, TIKA-588)
537
538 * A critical backwards incompatible bug in PDF parsing that
539 was introduced in Tika 0.8 has been fixed. (TIKA-548)
540
541 * Support for forked parsing in separate processes was added.
542 (TIKA-416)
543
544 * Tika's language identifier now supports the Lithuanian
545 language. (TIKA-582)
546
547 Tika 0.9 relies on the following set of major dependencies (generated using
548 mvn dependency:tree from tika-parsers):
549
550 org.apache.tika:tika-parsers:bundle:0.9
551 +- org.apache.tika:tika-core:jar:0.9:compile
552 +- edu.ucar:netcdf:jar:4.2-min:compile
553 | \- org.slf4j:slf4j-api:jar:1.5.6:compile
554 +- commons-httpclient:commons-httpclient:jar:3.1:compile
555 | +- commons-logging:commons-logging:jar:1.1.1:compile (version managed from 1.0.4)
556 | \- commons-codec:commons-codec:jar:1.2:compile
557 +- org.apache.james:apache-mime4j:jar:0.6:compile
558 +- org.apache.commons:commons-compress:jar:1.1:compile
559 +- org.apache.pdfbox:pdfbox:jar:1.4.0:compile
560 | +- org.apache.pdfbox:fontbox:jar:1.4.0:compile
561 | \- org.apache.pdfbox:jempbox:jar:1.4.0:compile
562 +- org.bouncycastle:bcmail-jdk15:jar:1.45:compile
563 +- org.bouncycastle:bcprov-jdk15:jar:1.45:compile
564 +- org.apache.poi:poi:jar:3.7:compile
565 +- org.apache.poi:poi-scratchpad:jar:3.7:compile
566 +- org.apache.poi:poi-ooxml:jar:3.7:compile
567 | +- org.apache.poi:poi-ooxml-schemas:jar:3.7:compile
568 | | \- org.apache.xmlbeans:xmlbeans:jar:2.3.0:compile
569 | \- dom4j:dom4j:jar:1.6.1:compile
570 +- org.apache.geronimo.specs:geronimo-stax-api_1.0_spec:jar:1.0.1:compile
571 +- org.ccil.cowan.tagsoup:tagsoup:jar:1.2:compile
572 +- asm:asm:jar:3.1:compile
573 +- com.drewnoakes:metadata-extractor:jar:2.4.0-beta-1:compile
574 +- de.l3s.boilerpipe:boilerpipe:jar:1.1.0:compile
575 +- rome:rome:jar:0.9:compile
576 \- jdom:jdom:jar:1.0:compile
577
578 The following people have contributed to Tika 0.9 by submitting or commenting
579 on the issues resolved in this release:
580
581 Alex Skochin
582 Alexander Chow
583 Antoine L.
584 Antoni Mylka
585 Benjamin Douglas
586 Benson Margulies
587 Chris A. Mattmann
588 Cristian Vat
589 Cyriel Vringer
590 David Benson
591 Erik Hetzner
592 Gabriel Miklos
593 Geoff Jarrad
594 Jukka Zitting
595 Ken Krugler
596 Kostya Gribov
597 Leszek Piotrowicz
598 Martijn van Groningen
599 Maxim Valyanskiy
600 Michel Tremblay
601 Nick Burch
602 paul
603 Paul Pearcy
604 Peter van Raamsdonk
605 Piotr Bartosiewicz
606 Reinhard Schwab
607 Scott Severtson
608 Shinsuke Sugaya
609 Staffan Olsson
610 Steve Kearns
611 Tom Klonikowski
612 Žygimantas Medelis
613
614 See http://s.apache.org/qi for more details on these contributions.
615
616
617 Release 0.8 - 11/07/2010
618 ------------------------
619
620 The most notable changes in Tika 0.8 over previous releases are:
621
622 * Language identification is now dynamically configurable,
623 managed via a config file loaded from the classpath. (TIKA-490)
624
625 * Tika now supports parsing Feeds by wrapping the underlying
626 Rome library. (TIKA-466)
627
628 * A quick-start guide for Tika parsing was contributed. (TIKA-464)
629
630 * An approach for plumbing through XHTML attributes was added. (TIKA-379)
631
632 * Media type hierarchy information is now taken into account when
633 selecting the best parser for a given input document. (TIKA-298)
634
635 * Support for parsing common scientific data formats including netCDF
636 and HDF4/5 was added (TIKA-400 and TIKA-399).
637
638 * Unit tests for Windows have been fixed, allowing TestParsers
639 to complete. (TIKA-398)
640
641 Tika 0.8 relies on the following set of major dependencies (generated using
642 mvn dependency:tree from tika-parsers):
643
644 org.apache.tika:tika-parsers:bundle:0.8
645 +- org.apache.tika:tika-core:jar:0.8:compile
646 +- edu.ucar:netcdf:jar:4.2:compile
647 | \- org.slf4j:slf4j-api:jar:1.5.6:compile
648 +- commons-httpclient:commons-httpclient:jar:3.1:compile
649 | +- commons-logging:commons-logging:jar:1.1.1:compile (version managed from 1.0.4)
650 | \- commons-codec:commons-codec:jar:1.2:compile
651 +- org.apache.commons:commons-compress:jar:1.1:compile
652 +- org.apache.pdfbox:pdfbox:jar:1.3.1:compile
653 | +- org.apache.pdfbox:fontbox:jar:1.3.1:compile
654 | \- org.apache.pdfbox:jempbox:jar:1.3.1:compile
655 +- org.bouncycastle:bcmail-jdk15:jar:1.45:compile
656 +- org.bouncycastle:bcprov-jdk15:jar:1.45:compile
657 +- org.apache.poi:poi:jar:3.7:compile
658 +- org.apache.poi:poi-scratchpad:jar:3.7:compile
659 +- org.apache.poi:poi-ooxml:jar:3.7:compile
660 | +- org.apache.poi:poi-ooxml-schemas:jar:3.7:compile
661 | | \- org.apache.xmlbeans:xmlbeans:jar:2.3.0:compile
662 | \- dom4j:dom4j:jar:1.6.1:compile
663 +- org.apache.geronimo.specs:geronimo-stax-api_1.0_spec:jar:1.0.1:compile
664 +- org.ccil.cowan.tagsoup:tagsoup:jar:1.2:compile
665 +- asm:asm:jar:3.1:compile
666 +- com.drewnoakes:metadata-extractor:jar:2.4.0-beta-1:compile
667 +- de.l3s.boilerpipe:boilerpipe:jar:1.1.0:compile
668 +- rome:rome:jar:0.9:compile
669 \- jdom:jdom:jar:1.0:compile
670
671 The following people have contributed to Tika 0.8 by submitting or commenting
672 on the issues resolved in this release:
673
674 Łukasz Wiktor
675 Adam Wilmer
676 Alex Baranau
677 Alex Ott
678 André Ricardo
679 Andrey Barhatov
680 Andrey Sidorenko
681 Antoni Mylka
682 Arturo Beltran
683 Attila Király
684 Brad Greenlee
685 Bruno Dumon
686 Chris A. Mattmann
687 Chris Bamford
688 Christophe Gourmelon
689 Dave Meikle
690 David Weekly
691 Dmitry Kuzmenko
692 Erik Hetzner
693 Geoff Jarrad
694 Gerd Bremer
695 Grant Ingersoll
696 Jan Høydahl
697 Jean-Philippe Ricard
698 Jeremias Maerki
699 Joao Garcia
700 Jukka Zitting
701 Julien Nioche
702 Ken Krugler
703 Liam O'Boyle
704 Mads Hansen
705 Marcel May
706 Markus Goldbach
707 Martijn van Groningen
708 Maxim Valyanskiy
709 Mike Hays
710 Miroslav Pokorny
711 Nick Burch
712 Otis Gospodnetic
713 Peter van Raamsdonk
714 Peter Wolanin
715 Peter_Lenahan@ibi.com
716 Piotr Bartosiewicz
717 Radek
718 Rajiv Kumar
719 Reinhard Schwab
720 rick cameron
721 Robert Muir
722 Sanjeev Rao
723 Simon Tyler
724 Sjoerd Smeets
725 Slavomir Varchula
726 Staffan Olsson
727 Tom De Leu
728 Uwe Schindler
729 Victor Kazakov
730
731 See http://s.apache.org/ab0 for more details on these contributions.
732
733
734 Release 0.7 - 3/31/2010
735 -----------------------
736
737 The most notable changes in Tika 0.7 over previous releases are:
738
739 * MP3 file parsing was improved, including Channel and SampleRate
740 extraction and ID3v2 support (TIKA-368, TIKA-372). Further, audio
741 parsing mime detection was also improved for the MIDI format. (TIKA-199)
742
743 * Tika no longer relies on X11 for its RTF parsing functionality. (TIKA-386)
744
745 * A thread-safety bug in the AutoDetectParser was discovered and
746 addressed. (TIKA-374)
747
748 * Upgrade to PDFBox 1.0.0. The new PDFBox version improves PDF parsing
749 performance and fixes a number of text extraction issues. (TIKA-380)
750
751 The following people have contributed to Tika 0.7 by submitting or commenting
752 on the issues resolved in this release:
753
754 Adam Rauch
755 Benson Margulies
756 Brett S.
757 Chris A. Mattmann
758 Daan de Wit
759 Dave Meikle
760 Durville
761 Ingo Renner
762 Jukka Zitting
763 Ken Krugler
764 Kenny Neal
765 Markus Goldbach
766 Maxim Valyanskiy
767 Nick Burch
768 Sami Siren
769 Uwe Schindler
770
771 See http://tinyurl.com/yklopby for more details on these contributions.
772
773
774 Release 0.6 - 01/20/2010
775 ------------------------
776
777 The most notable changes in Tika 0.6 over the previous release are:
778
779 * Mime-type detection for HTML (and all types) has been improved, so that malformed
780 HTML files and those HTML files that require a bit more observed content
781 before the type is properly detected are now correctly identified by
782 the AutoDetectParser. (TIKA-327, TIKA-357, TIKA-366, TIKA-367)
783
784 * Tika now has an additional OSGi bundle packaging that includes all the
785 required parser libraries. This bundle package makes it easy to use all
786 Tika features in an OSGi environment. (TIKA-340, TIKA-342)
787
788 * The Apache POI dependency used for parsing Microsoft Office file formats
789 has been upgraded to version 3.6. The most visible improvement in this
790 version is the notably reduced ooxml jar file size. The tika-app jar size
791 is now down to 15MB from the 25MB in Tika 0.5. (TIKA-353)
792
793 * Handling of character encoding information in input metadata and HTML
794 <meta> tags has been improved. When no applicable encoding information is
795 available, the encoding is detected by looking at the input data.
796 (TIKA-332, TIKA-334, TIKA-335, TIKA-341)
797
798 * Some document types like Excel spreadsheets contain content like
799 numbers or formulas whose exact text format depends on the current locale.
800 So far Tika has used the platform default locale in such cases, but
801 clients can now explicitly specify the locale by passing a Locale instance
802 in the parse context. (TIKA-125)
803
804 * The default text output encoding of the tika-app jar is now UTF-8
805 when running on Mac OS X. This is because the default encoding used
806 by Java is not compatible with the console application in Mac OS X.
807 On all other platforms the text output from tika-app still uses
808 the platform default encoding. (TIKA-324)
809
810 * A flash video (video/x-flv) parser has been added. (TIKA-328)
811
812 * The handling of Number and Date cell formatting within the Microsoft Excel
813 documents has been added. This includes currencies, percentages and
814 scientific formats. (TIKA-103)
815
816 The following people have contributed to Tika 0.6 by submitting or commenting
817 on the issues resolved in this release:
818
819 Andrzej Bialecki
820 Bertrand Delacretaz
821 Chris A. Mattmann
822 Dave Meikle
823 Erik Hetzner
824 Felix Meschberger
825 Jukka Zitting
826 Julien Nioche
827 Ken Krugler
828 Luke Nezda
829 Maxim Valyanskiy
830 Niall Pemberton
831 Peter Wolanin
832 Piotr B.
833 Sami Siren
834 Yuan-Fang Li
835
836 See http://tinyurl.com/yc3dk67 for more details on these contributions.
837
838
839 Release 0.5 - 11/14/2009
840 ------------------------
841
842 The most notable changes in Tika 0.5 over the previous release are:
843
844 * Improved RDF/OWL mime detection using both MIME magic as well as
845 pattern matching (TIKA-309)
846
847 * An org.apache.tika.Tika facade class has been added to simplify common
848 text extraction and type detection use cases. (TIKA-269)
849
850 * A new parse context argument was added to the Parser.parse() method.
851 This context map can be used to pass things like a delegate parser or
852 other settings to the parsing process. The previous parse() method
853 signature has been deprecated and will be removed in Tika 1.0. (TIKA-275)
854
855 * A simple ngram-based language detection mechanism has been added along
856 with predefined language profiles for 18 languages. (TIKA-209)
857
858 * The media type registry in Tika was synchronized with the MIME type
859 configuration in the Apache HTTP Server. Tika now knows about 1274
860 different media types and can detect 672 of those using 927 file
861 extension and 280 magic byte patterns. (TIKA-285)
862
863 * Tika now uses the Apache PDFBox version 0.8.0-incubating for parsing PDF
864 documents. This version is notably better than the 0.7.3 release used
865 earlier. (TIKA-158)
866
867 The following people have contributed to Tika 0.5 by submitting or commenting
868 on the issues resolved in this release:
869
870 Alex Baranov
871 Bart Hanssens
872 Benson Margulies
873 Chris A. Mattmann
874 Daan de Wit
875 Erik Hetzner
876 Frank Hellwig
877 Jeff Cadow
878 Joachim Zittmayr
879 Jukka Zitting
880 Julien Nioche
881 Ken Krugler
882 Maxim Valyanskiy
883 MRIT64
884 Paul Borgermans
885 Piotr B.
886 Robert Newson
887 Sascha Szott
888 Ted Dunning
889 Thilo Goetz
890 Uwe Schindler
891 Yuan-Fang Li
892
893 See http://tinyurl.com/yl9prwp for more details on these contributions.
894
895
896 Release 0.4 - 07/14/2009
897 ------------------------
898
899 The most notable changes in Tika 0.4 over the previous release are:
900
901 * Tika has been split into three different components for increased
902 modularity. The tika-core component contains the key interfaces and
903 core functionality of Tika, tika-parsers contains all the adapters
904 to external parser libraries, and tika-app bundles everything together
905 in a single executable jar file. (TIKA-219)
906
907 * All the three Tika components are packaged as OSGi bundles. (TIKA-228)
908
909 * Tika now uses the new Commons Compress library for improved support
910 of compression and packaging formats like gzip, bzip2, tar, cpio,
911 ar, zip and jar. (TIKA-204)
912
913 * The memory use of parsing Excel sheets with lots of numbers
914 has been considerably reduced. (TIKA-211)
915
916 * The AutoDetectParser now has basic protection against "zip bomb"
917 attacks, where a specially crafted input document can expand to
918 a practically infinite amount of output text. (TIKA-216)
919
920 * The ParsingReader class can now use a thread pool or a more complex
921 execution model (java.util.concurrent.Executor) for the background
922 parsing task. (TIKA-215)
923
924 * Automatic type detection of text- and XML-based documents has been
925 improved. (TIKA-225)
926
927 * Charset detection functionality from the ICU4J library was inlined
928 in Tika to avoid the dependency on the large ICU4J jar. (TIKA-229)
929
930 * Composite parsers like the AutoDetectParser now make sure that any
931 RuntimeExceptions, IOExceptions or SAXExceptions unrelated to the given
932 document stream or content handler are converted to TikaExceptions
933 before being passed to the client. (TIKA-198, TIKA-237)
934
935 The following people have contributed to Tika 0.4 by submitting or commenting
936 on the issues resolved in this release:
937
938 Chris A. Mattmann
939 Daan de Wit
940 Dave Meikle
941 David Weekly
942 Jeremias Maerki
943 Jonathan Koren
944 Jukka Zitting
945 Karl Heinz Marbaise
946 Keith R. Bennett
947 Maxim Valyanskiy
948 Niall Pemberton
949 Robert Burrell Donkin
950 Sami Siren
951 Siddharth Gargate
952 Uwe Schindler
953
954 See http://tinyurl.com/mgv9o3 for more details on these contributions.
955
956
957 Release 0.3 - 03/09/2009
958 ------------------------
959
960 The most notable changes in Tika 0.3 over the previous release are:
961
962 * Tika now supports mime type glob patterns specified using
963 standard JDK 1.4 (and beyond) syntax via the isregex attribute
964 on the glob tag. See:
965
966 http://java.sun.com/j2se/1.4.2/docs/api/java/util/regex/Pattern.html
967
968 for more information. (TIKA-194)
969
970 * Tika now supports the Office Open XML format used by
971 Microsoft Office 2007. (TIKA-152)
972
973 * All the metadata keys for Microsoft Office document properties are now
974 included as constants in the MSOffice interface. Clients should use
975 these constants instead of the raw string values to refer to specific
976 metadata items. (TIKA-186)
977
978 * Automatic detection of document types in Tika has been improved.
979 For example Tika can now detect plain text just by looking at the first
980 few bytes of the document. (TIKA-154)
981
982 * Tika now disables the loading of all external entities in XML files
983 that it parses as input documents. This improves security and avoids
984 problems with potentially broken references. (TIKA-185)
985
986 * Tika now replaces all invalid XML characters in the extracted text
987 content with spaces. This prevents problems when output from Tika
988 is processed with XML tools. (TIKA-180)
989
990 * The Tika CLI now correctly flushes its buffers when invoked with the
991 --text argument. This prevents the end of the text output from being
992 lost. (TIKA-179)
993
994 * Embedded text in MIDI files is now extracted. For example many karaoke
995 files contain song lyrics embedded as MIDI text.
996
997 * The text content of Microsoft Outlook message files no longer appears as
998 multiple copies in the extracted text. (TIKA-197)
999
1000 * The ParsingReader class now makes most document metadata available
1001 already before any of the extracted text is consumed. This makes it
1002 easier for example to construct Lucene Document instances that contain
1003 both extracted text and metadata. (TIKA-203)
1004
1005 See http://tinyurl.com/tika-0-3-changes for a list of all changes in Tika 0.3.
1006
1007 The following people have contributed to Tika 0.3 by submitting or commenting
1008 on the issues resolved in this release:
1009
1010 Andrzej Rusin
1011 Chris A. Mattmann
1012 Dave Meikle
1013 Georger Araújo
1014 Guillermo Arribas
1015 Jonathan Koren
1016 Jukka Zitting
1017 Karl Heinz Marbaise
1018 Kumar Raja Jana
1019 Paul Borgermans
1020 Peter Becker
1021 Sébastien Michel
1022 Uwe Schindler
1023
1024 See http://tinyurl.com/tika-0-3-contributions for more details on
1025 these contributions.
1026
1027
1028 Release 0.2 - 12/04/2008
1029 ------------------------
1030
1031 1. TIKA-109 - WordParser fails on some Word files (Dave Meikle)
1032
1033 2. TIKA-105 - Excel parser implementation based on POI's Event API
1034 (Niall Pemberton)
1035
1036 3. TIKA-116 - Streaming parser for OpenDocument files (Jukka Zitting)
1037
1038 4. TIKA-117 - Drop JDOM and Jaxen dependencies (Jukka Zitting)
1039
1040 5. TIKA-115 - Tika package with all the dependencies (Jukka Zitting)
1041
1042 6. TIKA-97 - Tika GUI (Jukka Zitting)
1043
1044 7. TIKA-96 - Tika CLI (Jukka Zitting)
1045
1046 8. TIKA-112 - Use Commons IO 1.4 (Jukka Zitting)
1047
1048 9. TIKA-127 - Add support for Visio files (Jukka Zitting)
1049
1050 10. TIKA-129 - node() support for the streaming XPath utility (Jukka Zitting)
1051
1052 11. TIKA-130 - self-or-descendant axis does not match self in streaming XPath
1053 (Jukka Zitting)
1054
1055 12. TIKA-131 - Lazy XHTML prefix generation (Jukka Zitting)
1056
1057 13. TIKA-128 - HTML parser should produce XHTML SAX events (Jukka Zitting)
1058
1059 14. TIKA-133 - TeeContentHandler constructor should use varargs (Jukka Zitting)
1060
1061 15. TIKA-132 - Refactor Excel extractor to parse per sheet and add
1062 hyperlink support (Niall Pemberton)
1063
1064 16. TIKA-134 - mvn package does not produce packages for bin/src
1065 (Karl Heinz Marbaise)
1066
1067 17. TIKA-138 - Ignore HTML style and script content (Jukka Zitting)
1068
1069 18. TIKA-113 - Metadata (such as title) should not be part of content
1070 (Jukka Zitting)
1071
1072 19. TIKA-139 - Add a composite parser (Jukka Zitting)
1073
1074 20. TIKA-142 - Include application/xhtml+xml as valid mime type for XMLParser
1075 (mattmann)
1076
1077 21. TIKA-143 - Add ParsingReader (Jukka Zitting)
1078
1079 22. TIKA-144 - Upgrade nekohtml dependency (Jukka Zitting)
1080
1081 23. TIKA-145 - Separate NOTICEs and LICENSEs for binary and source packages
1082 (Jukka Zitting)
1083
1084 24. TIKA-146 - Upgrade to POI 3.1 (Jukka Zitting)
1085
1086 25. TIKA-99 - Support external parser programs (Jukka Zitting)
1087
1088 26. TIKA-149 - Parser for Zip files (Dave Meikle & Jukka Zitting)
1089
1090 27. TIKA-150 - Parser for tar files (Jukka Zitting)
1091
1092 28. TIKA-151 - Stream compression support (Jukka Zitting)
1093
1094 29. TIKA-156 - Some MIME magic patterns are ignored by MimeTypes
1095 (Jukka Zitting)
1096
1097 30. TIKA-155 - Java class file parser (Dave Brosius & Jukka Zitting)
1098
1099 31. TIKA-108 - New Tika logos (Yongqian Li & Jukka Zitting)
1100
1101 32. TIKA-120 - Add support for retrieving ID3 tags from MP3 files
1102 (Dave Meikle & Jukka Zitting)
1103
1104 33. TIKA-54 - Outlook msg parser
1105 (Rida Benjelloun, Dave Meikle & Jukka Zitting)
1106
1107 34. TIKA-114 - PDFParser : Getting content of the document using
1108 "writer.ToString ()" , some words are stuck together
1109 (Dave Meikle)
1110
1111 35. TIKA-161 - Enable PMD reports (Jukka Zitting)
1112
1113 36. TIKA-159 - Add support for parsing basic audio types: wav, aiff, au, midi
1114 (Sami Siren)
1115
1116 37. TIKA-140 - HTML parser unable to extract text
1117 (Julien Nioche & Jukka Zitting)
1118
1119 38. TIKA-163 - GUI does not support drag and drop in Gnome or KDE (Dave Meikle)
1120
1121 39. TIKA-166 - Update HTMLParser to parse contents of meta tags (Dave Meikle)
1122
1123 40. TIKA-164 - Upgrade of the nekohtml dependency to 1.9.9 (Jukka Zitting)
1124
1125 41. TIKA-165 - Upgrade of the ICU4J dependency to version 3.8 (Jukka Zitting)
1126
1127 42. TIKA-172 - New Open Document Parser that emits structured XHTML content
1128 (Uwe Schindler & Jukka Zitting)
1129
1130 43. TIKA-175 - Retrotranslate Tika for use in Java 1.4 environments (Jukka Zitting)
1131
1132 44. TIKA-177 - Improvements to build instruction in README (Chris Hostetter & Jukka Zitting)
1133
1134 45. TIKA-171 - New ContentHandler for plain text output that has no problem with
1135 missing white space after XHTML block tags (Uwe Schindler & Jukka Zitting)
1136
1137
1138 Release 0.1-incubating - 12/27/2007
1139 -----------------------------------
1140
1141 1. TIKA-5 - Port Metadata Framework from Nutch (mattmann)
1142
1143 2. TIKA-11 - Consolidate test classes into a src/test/java directory tree (mattmann)
1144
1145 3. TIKA-15 - Utils.print does not print a Content having no value (jukka)
1146
1147 4. TIKA-19 - org.apache.tika.TestParsers fails (bdelacretaz)
1148
1149 5. TIKA-16 - Issues with data files used for testing by TestParsers (bdelacretaz)
1150
1151 6. TIKA-14 - MimeTypeUtils.getMimeType() returns the default mime type for
1152 .odt (Open Office) file (bdelacretaz)
1153
1154 7. TIKA-12 - Add URL capability to MimeTypesUtils (jukka)
1155
1156 8. TIKA-13 - Fix obsolete package names in config.xml (siren)
1157
1158 9. TIKA-10 - Remove MimeInfoException catch clauses and import from TestParsers (siren)
1159
1160 10. TIKA-8 - Replaced the jmimeinfo dependency with a trivial mime type detector (jukka)
1161
1162 11. TIKA-7 - Added the Lius Lite code. Added missing dependencies to POM (jukka)
1163
1164 12. TIKA-18 - "Office" interface should be renamed "MSOffice" (mattmann)
1165
1166 13. TIKA-23 - Decouple Parser from ParserConfig (jukka)
1167
1168 14. TIKA-6 - Port Nutch (or better) MimeType detection system into Tika (J. Charron & mattmann)
1169
1170 15. TIKA-25 - Removed hardcoded reference to C:\oo.xml in OpenOfficeParser (K. Bennett & jukka)
1171
1172 16. TIKA-17 - Need to support URL's for input resources. (K. Bennett & mattmann)
1173
1174 17. TIKA-22 - Remove @author tags from the java source (mattmann)
1175
1176 18. TIKA-21 - Simplified configuration code (jukka)
1177
1178 19. TIKA-17 - Rename all "Lius" classes to be "Tika" classes (jukka)
1179
1180 20. TIKA-30 - Added utility constructors to TikaConfig (K. Bennett & jukka)
1181
1182 21. TIKA-28 - Rename config.xml to tika-config.xml or similar (mattmann)
1183
1184 22. TIKA-26 - Use Map<String, Content> instead of List<Content> (jukka)
1185
1186 23. TIKA-31 - protected Parser.parse(InputStream stream,
1187 Iterable<Content> contents) (jukka & K. Bennett)
1188
1189 24. TIKA-36 - A convenience method for getting a document's content's text
1190 would be helpful (K. Bennett & mattmann)
1191
1192 25. TIKA-33 - Stateless parsers (jukka)
1193
1194 26. TIKA-38 - TXTParser adds a space to the content it reads from a file (K. Bennett & ridabenjelloun)
1195
1196 27. TIKA-35 - Extract MsOffice properties, use RereadableInputStream developed by K. Bennett (ridabenjelloun & K. Bennett)
1197
1198 28. TIKA-39 - Excel parsing improvements (siren & ridabenjelloun)
1199
1200 29. TIKA-34 - Provide a method that will return a default configuration
1201 (TikaConfig) (K. Bennett & mattmann)
1202
1203 30. TIKA-42 - Content class needs (String, String, String) constructor (K. Bennett)
1204
1205 31. TIKA-43 - Parser interface (jukka)
1206
1207 32. TIKA-47 - Remove TikaLogger (jukka)
1208
1209 33. TIKA-46 - Use Metadata in Parser (jukka & mattmann)
1210
1211 34. TIKA-48 - Merge MS Extractors and Parsers (jukka)
1212
1213 35. TIKA-45 - RereadableInputStream needs to be able to read to
1214 the end of the original stream on first rewind. (K. Bennett)
1215
1216 36. TIKA-41 - Resource files occur twice in jar file. (jukka)
1217
1218 37. TIKA-49 - Some files have old-style license headers, fixed (Robert Burrell Donkin & bdelacretaz)
1219
1220 38. TIKA-51 - Leftover temp files after running Tika tests, fixed (bdelacretaz)
1221
1222 39. TIKA-40 - Tika needs to support diverse character encodings (jukka)
1223
1224 40. TIKA-55 - ParseUtils.getParser() method variants should have consistent parameter orders
1225 (K. Bennett)
1226
1227 41. TIKA-52 - RereadableInputStream needs to support not closing the input stream it wraps.
1228 (K. Bennett via bdelacretaz)
1229
1230 42. TIKA-53 - XHTML SAX events from parsers (jukka)
1231
1232 43. TIKA-57 - Rename org.apache.tika.ms to org.apache.tika.parser.ms (jukka)
1233
1234 44. TIKA-62 - Use TikaConfig.getDefaultConfig() instead of a hardcoded
1235 config path in TestParsers (jukka)
1236
1237 45. TIKA-58 - Replace jtidy html parser with nekohtml based parser (siren)
1238
1239 46. TIKA-60 - Rename Microsoft parser classes (jukka)
1240
1241 47. TIKA-63 - Avoid multiple passes over the input stream in Microsoft parsers
1242 (jukka)
1243
1244 48. TIKA-66 - Use Java 5 features in org.apache.tika.mime (jukka)
1245
1246 49. TIKA-56 - Mime type detection fails with upper case file extensions such as "PDF"
1247 (mattmann)
1248
1249 50. TIKA-65 - Add encode detection support for HTML parser (siren)
1250
1251 51. TIKA-68 - Add dummy parser classes to be used as sentinels (jukka)
1252
1253 52. TIKA-67 - Add an auto-detecting Parser implementation (jukka)
1254
1255 53. TIKA-70 - Better MIME information for the Open Document formats (jukka)
1256
1257 54. TIKA-71 - Remove ParserConfig and ParserFactory (jukka)
1258
1259 55. TIKA-83 - Create a org.apache.tika.sax package for SAX utilities (jukka)
1260
1261 56. TIKA-84 - Add MimeTypes.getMimeType(InputStream) (jukka)
1262
1263 57. TIKA-85 - Add glob patterns from the ASF svn:eol-style documentation (jukka)
1264
1265 58. TIKA-100 - Structured PDF parsing (jukka)
1266
1267 59. TIKA-101 - Improve site and build (mattmann)
1268
1269 60. TIKA-102 - Parser implementations loading a large amount of content
1270 into a single String could be problematic (Niall Pemberton)
1271
1272 61. TIKA-107 - Remove use of assertions for argument checking (Niall Pemberton)
1273
1274 62. TIKA-104 - Add utility methods to throw IOException with the cause
1275 initialized (jukka & Niall Pemberton)
1276
1277 63. TIKA-106 - Remove dependency on Jakarta ORO - use JDK 1.4 Regex
1278 (Niall Pemberton)
1279
1280 64. TIKA-111 - Missing license headers (jukka)
1281
1282 65. TIKA-112 - XMLParser improvement (ridabenjelloun)
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
+283
-0
KEYS less more
0 This file contains the PGP keys of various developers.
1 Please don't use them for email unless you have to. Their main
2 purpose is code signing.
3
4 Users:
5 pgp < KEYS
6 gpg --import KEYS
7
8 Developers:
9 pgp -kxa <your name> # and append to KEYS (this file)
10 (pgpk -ll <your name> && pgpk -xa <your name>) >> KEYS
11 (gpg --list-sigs <your name> && gpg --armor --export <your name>) >> KEYS
12
13 ----
14
15 pub 1024D/A355A63E 2006-09-19
16 uid Jukka Zitting <jukka@apache.org>
17 sig 3 A355A63E 2006-09-19 Jukka Zitting <jukka@apache.org>
18 sig 3 A355A63E 2006-09-19 Jukka Zitting <jukka@apache.org>
19 sig E04F9A89 2006-10-13 Roy T. Fielding <fielding@gbiv.com>
20 sig AB821FBC 2006-10-14 Andrew McIntyre <fuzzylogic@apache.org>
21 sig 9992ADFC 2006-10-17 Jeff McAdams (General purpose key) <jeffm@iglou.com>
22 sig 5F7898A9 2006-10-17 William Glass-Husain <wglass@apache.org>
23 sig 08C975E5 2006-10-18 Jim Jagielski <jim@apache.org>
24 sig BB550746 2006-10-23 J. Daniel Kulp <dan@kulp.com>
25 sig 9BCFCE2F 2006-10-24 Garrett Rooney <rooneg@electricjellyfish.net>
26 sig BFD0307C 2006-10-25 J Aaron Farr <aaron.farr@jadetower.com>
27 sig 2C5E4EC0 2006-10-28 Henri Yandell (For signing ASF releases) <bayard@apache.org>
28 sig D872E270 2006-10-30 Bozhong Lin <blin@iona.com>
29 sig 3 669C4AB5 2006-10-16 Cezar Cristian Andrei <cezar@apache.org>
30 sig 665E0760 2006-12-15 Afkham Azeez (WSO2 Email) <azeez@wso2.com>
31 sig 329D80D2 2006-12-16 Sanka Samaranayake (Oxygenating The Web Service Platform) <sanka@wso2.com>
32 sig E41EDC7E 2006-12-17 Carsten Ziegeler <cziegeler@apache.org>
33 sig 9C85222B 2007-05-14 Henning Schmiedehausen <hps@intermeta.de>
34 sub 2048g/2542C54A 2006-09-19
35 sig A355A63E 2006-09-19 Jukka Zitting <jukka@apache.org>
36
37 -----BEGIN PGP PUBLIC KEY BLOCK-----
38 Version: GnuPG v1.4.5 (GNU/Linux)
39
40 mQGiBEUQYOcRBADsCu4zTVaB4TOhV7NyTvHhG1bqN+3Va5t4vpGQJg4M4U0Yu0ut
41 4bCZP8I6rlXGj+TqDKVUx9kfGpIKX6Kw2TvZUYbHIDWh3UhQO1hD4xy4b8rOak1w
42 3vDAMYA5L/jsZshNHp++aTgmvSoXGS5S1xsjrbrOics6iTtHXMV9TCRelwCgkwGj
43 WHzPJnlSj0z4jAlRG4ZubAkD/3LOfrXtti1oirfDTnBZcxhvldyCT2yiE3LRe8N2
44 ijmtNO6fl2fqXSWuP0L125ytlOvww1r6Gd8sVXiVwt2oKZVQ+A5028BbHa0u4e3y
45 54nA21OBKLCC/hJvyOkPf9/kZk6S7fV+Tour/auixX4WqUg+siMRe/EwHw6bQDD/
46 1OK8A/4rVPPCDTLvcQbT+B3z0IEfryMkivJMu7dEoENDXDK7N5KginugnCpJd+g9
47 GbYgTYI2YPNB6A2eaR4lH8yQCMyDXC5+bGL+1NL3SP1qR4JE//nUcbx+iMTYR5uX
48 kwmaGMXRl7z47OgtaWM/dVipNuNaqkD9WkuMGb4rdSNHrI+amrQgSnVra2EgWml0
49 dGluZyA8anVra2FAYXBhY2hlLm9yZz6IXgQTEQIAHgUCRRBg5wIbAwYLCQgHAwID
50 FQIDAxYCAQIeAQIXgAAKCRCnMFKco1WmPjMkAKCDzBKvBWCxE+NWc98LsnvMRXxy
51 1gCdFNXW4qAnNrjwJeNZa9YqUy1e9+6IXgQTEQIAHgUCRRBg5wIbAwYLCQgHAwID
52 FQIDAxYCAQIeAQIXgAAKCRCnMFKco1WmPjMkAJ9orvc5AMXyquJ3LTMw4ZZpfu2p
53 cgCeNCIzVWrUnsm8fwq6QCZ2E0zC0aWIRgQQEQIABgUCRS8wYAAKCRBbloAQ4E+a
54 ia2JAJ9JQKm04OY7mmePtbo3X3duAIJ2+ACdGjapH2csm0eN0ASY8LmZfbQ+2x+I
55 RgQQEQIABgUCRTCZoQAKCRAN8HRdq4IfvC7dAJ0TbVWoLCMFJlJJEIgI+7SdIUXP
56 FACgoNoQqRE1ddJwlvShuAr/TGkPWFWIRgQQEQIABgUCRTQd3AAKCRBeRSbOmZKt
57 /N55AKDi8FaRU4mZE6Y1UeI4tedJjQuiyQCgyPQ4TNh1kQVAobJOaKfH5oOaq9SI
58 RgQQEQIABgUCRTQwXwAKCRCZLEP9X3iYqZFSAJ4t6A3DaBAH38U2894lcxF4i89Z
59 owCdGLGJemsJYJ0tG35klJ6LgptPJCGIRgQQEQIABgUCRTZPigAKCRCLOmAfCMl1
60 5XQnAKDbXvf1/Ikkwk/Mm2gU/Oww3nenDQCeMXbNLANP61ZQVbiq2ptLprB9GIyI
61 RgQQEQIABgUCRTzy9gAKCRCryO45u1UHRokqAKCSnWGzdBc7rUA8Hk+dSnotxAAk
62 CACghNdwrn9yZOj9PkaC5AZHooTApbiIRgQQEQIABgUCRT4RZQAKCRDJHbjam8/O
63 L25YAJ9PvcRqiI28JTR+2nPklshBIl4RPQCg4RVhfHfYYhrnp/PTPvsXwLwbmCiI
64 RgQQEQIABgUCRT61ywAKCRBr4o6uv9AwfCzwAJ9B0LfpHiNiEcu4fiKwYRkrq23W
65 2QCfQMXeANfjrLF5rOLBTlybG/iFVoOIRgQQEQIABgUCRUKpkgAKCRASQbyHLF5O
66 wHbRAJ4kAkT2/s9+jxnQaYGxvGdhAWAv8wCghnhoDt5wEVO9g5B354EeQtBkgQKI
67 RgQQEQIABgUCRUWYPwAKCRCXe/3Q2HLicG8SAJ9MsVvqPH2GtW9z6JGYwd72dL6q
68 awCgulT1Osiy3BMvsS3GFLeojlxu1XaIRgQTEQIABgUCRTQR+QAKCRB4voXZZpxK
69 ta2vAJ9LnSiN1u4vEI8gMfqSE45w+8laKgCfSFhq6NE6E+6pLWRjb/vaMxG34bKI
70 RgQQEQIABgUCRYKMPAAKCRBK4siHZl4HYEhpAJ9E5g6gYEoB1o9eFreeeHEfJr2+
71 SgCgt0rvIcRJt33Ya34WXjW2ec7ZYnGIRgQQEQIABgUCRYQagwAKCRD8d3QRMp2A
72 0kLOAJ9zBUwMNkOQq5Zi5XymM8z3lkKE0ACdEqTWfzT30T7Gqcqam7rlnvjMlk2I
73 RgQQEQIABgUCRYWMRQAKCRATLknU5B7cflKmAKCQp7dA9URTfyZeATsuuu1zrkMO
74 7ACfRg4Ygus7DlFZpa8tDGTqCBkZByiIcQQQEQIAMQUCRkjN6CocSGVubmluZyBT
75 Y2htaWVkZWhhdXNlbiA8aHBzQGludGVybWV0YS5kZT4ACgkQMoZOQZyFIiuU5ACf
76 TfCWT2kwScYYl42+uhwXK8oUcQMAningCO/sTpoHOxSXI1PvlNXOFGXHuQINBEUQ
77 YQMQCACkfsPxWTFYmAfnMI7sjWbYhMF/ym78xwwK+kIdHuqBCnjK8sGJD4BeRtA9
78 E+fE7y9A2VIvvA4N9gj2EoOSiLmkU2XOEEj2K96FSt93He4wiOjBmT5QwGFqIfg+
79 lrOTDcVXsmEqbSsD1Vfkee6MaZmNbKYT7JVEmFH13LRdCj5vjr3VV7wJ70vuPSPw
80 HMJwLACFM60QrEGeKifjoouCvRxoNrz0LUcbxAkKpCs34dFt+PTtmzgeMmXYK4lj
81 qqX9/UaUbElhlPQXF8iy3TV0B00WmkPwknUj49gStZ2CSbWtAOQBlcQ3uvC2KEeP
82 oDQQSquzWiCRfPJezTl8m6tS0jn/AAUTB/0eq/QnmTLNIDeZWFLNJjWk6F6JU/ql
83 ZYoKxnrEsz3Q9Z6QAewQ2+8XSa2gAA5LodgqiDmoMcmZ7uOvXTB1qXKdJ+AIMLO3
84 WqbZ29e8STr2Pu+ouaoCQoSKv//Vdo+UYrEDcRiOXhTwfF2o6HD8UClZGYmdx7Op
85 4T5H64Piky6CLsMudVr3SGU+9E1h8G61ddLynt2G6IJrl5s/FyW9xboX+Z/pkPAN
86 K6GR108O7s4Pfp5uaIPpEcnCx+f+47PixEbmiBg2qIHk46Qo7nMFXO2ByZZzcALM
87 0JvKSKcOY0ecBp6I9ZHfFOnxnA1rQEMsYFflFSxSE/nueFCvSJ+WW5uyiEkEGBEC
88 AAkFAkUQYQMCGwwACgkQpzBSnKNVpj6lWACeMtA8UqDpB2tSJhgJBpy1zYbH06UA
89 n2vnHhk+T9mBMOTPjgR9lLrdsnYg
90 =Mj6z
91 -----END PGP PUBLIC KEY BLOCK-----
92 pub 1024D/B876884A 2007-12-24
93 uid Chris Mattmann (CODE SIGNING KEY) <mattmann@apache.org>
94 sig 3 B876884A 2007-12-24 Chris Mattmann (CODE SIGNING KEY) <mattmann@apache.org>
95 sub 2048g/D3B4F350 2007-12-24
96 sig B876884A 2007-12-24 Chris Mattmann (CODE SIGNING KEY) <mattmann@apache.org>
97
98 -----BEGIN PGP PUBLIC KEY BLOCK-----
99 Version: GnuPG v1.4.8 (Darwin)
100
101 mQGiBEdvL9QRBACuaV06by+pxZHXIxBsfAFYJk7XJgsqR23m5ClCDPusMeaI4XGB
102 eU8Nw4iVwgG3p5VLWLXeMIm/KPz3pmxiNyEP/dHoDxOPR+hAqlP5v03D1iK19H7q
103 46BIecIwo8q0ei70fBLvMQN+apIFlvYDqVCTm1lxoCQafagqd9p2JtTf+wCg70yM
104 nGtrejB+ZTTcb08f7SAHsLED/11vIdcxViN3u+3klhbb99bd/g9KvCU/I/7+MDx1
105 3zrSvJV2b2wrxabUJ1Oxsb4/4BXq8A1FyhC1h/d2PsawqiY0GZ02cucbzEmdXH51
106 UnrRLM9/txtZ2b7V6YkDmPf0k6rD0SjqAAy1ERekEVUOxnY4sPGmJoyac4j9+pO9
107 1vH/A/9LRoJlPTfv/mFYty6/Egckhv48YoRUBo1dNh6IPQY0oVpAFbcXc3GiTyCu
108 5iQp7utxP7hoJTUM2Hn5tF9D7IniRC9wsrcW8Gi/f82O4HlmyV4+Tt75nWx018oI
109 ObGmwitT27EkOnFcQc9F+Q53nKr+a22SBbpfffF9Xdbkw7V73bQ3Q2hyaXMgTWF0
110 dG1hbm4gKENPREUgU0lHTklORyBLRVkpIDxtYXR0bWFubkBhcGFjaGUub3JnPohg
111 BBMRAgAgBQJHby/UAhsDBgsJCAcDAgQVAggDBBYCAwECHgECF4AACgkQcPCcxrh2
112 iEr8KwCffMIKMu3TBrGZVu1BPLbMBhjsrl8AoI15rg+tzYZZmZJD6tDS40klTsVA
113 uQINBEdvL9QQCAClHjwXMu38iDR3nvbYkWmcz5rfBFvDm/KVQGLnnY96C1r890Ir
114 cHxAlSpbGb6qPi5n27v87LoS2bYEitqCUUwB7AQLOgqmLvqMJ4qp5HUfTQ/wH9Br
115 wK2LX1oGFJXH14lbZ7xW36n9A/JtXHY8vGz3GuDvKYqbdOCFo8fBLwotdFOHhNYy
116 bBYS1G4gtmemXwzH8kcuoIW6LuoRNxluHi1tJGFC1F1uBoxKir7F7BC38DDNvhak
117 dSJpm3WxFkEEkIUyIERVGVRoFzLlk72W0R3kZVvnXbtgPklTg/2Sy13Gb+MzTBYt
118 5TF841neM/kHdgt45EgBhchHN3Ys3ljabihbAAMFB/4ke4Xe573V78UR/WTMUzfw
119 pIysMUzEjNKqOfnAoNnR4WDDca4MwIUl62QqGTRrWZxTD8fAGYxc+m0qmygGKtYq
120 LUYB5N/pLGu1sg2j23G8aBKthiCCE+jOr3uebU/j0BTzN/BwXCqIGogELFlPC5Tj
121 Hr6c8LpkRFIOjVfuYB2TV4o2RfSFzrSFHCbrU82ojxhYSwyqDGAdD6EGtbbqaEMX
122 tGZzHaMVm2gDeV9W2veurxOulgndNg2+FXvgUlOa+KZ2J2DxNBcJv1uBtDAWDyR9
123 dTgTbK62ZnSjsnRYbgf0HdA+kW9n9XBMEHwgYk0q+doOWUOQFqC84TgrrhyDd1XZ
124 iEkEGBECAAkFAkdvL9QCGwwACgkQcPCcxrh2iEplXwCgraY3ELlDStqpJDSUzVsN
125 rGuNiwsAoKz92ycEjcMnoLnX8AaPADdo1m/P
126 =zEfO
127 -----END PGP PUBLIC KEY BLOCK-----
128 pub 1024D/9740DD55 2008-10-15 [revoked: 2010-03-27]
129 rev 9740DD55 2010-03-27 David Meikle (CODE SIGNING KEY) <dmeikle@apache.org>
130 uid David Meikle (CODE SIGNING KEY) <dmeikle@apache.org>
131 sig 3 9740DD55 2008-10-15 David Meikle (CODE SIGNING KEY) <dmeikle@apache.org>
132
133 pub 4096R/AEA8C6AB 2010-03-27 [revoked: 2010-03-27]
134 rev AEA8C6AB 2010-03-27 David Meikle (CODE SIGNING KEY) <dmeikle@apache.org>
135 uid David Meikle (CODE SIGNING KEY) <dmeikle@apache.org>
136 sig 3 AEA8C6AB 2010-03-27 David Meikle (CODE SIGNING KEY) <dmeikle@apache.org>
137
138 -----BEGIN PGP PUBLIC KEY BLOCK-----
139 Version: GnuPG v1
140
141 mQGiBEj2VswRBACppF4XWDc5n0oSwNj8qXlv9kYp/iZn/T4hzFCjaGvAUn6fBHW/
142 AYiwOneS/8vzSDfmURw9DpRFNt8W2Qsrzg2nuQxz+vcYk8qQp3bXpKVlpbMGYT8C
143 hdYPg7C0FGHrpNUWGqc8nkRaqQq7whZZcL5ciSOqnxcDckOlA+8jw8tVcwCghxzk
144 D3Sx+kraGE9kH6ZDFerhnmcD/R02g+eYIYrFbEauoErBoCpSRj/d9+wxlWvGGYen
145 IBcu27xHfuaEAOtRXZOTs9M9poz6Peo1wZnp+mkJjXCLWCHBDL9mVUCYkHOr5cmc
146 PdvkWxEUrRK9poRuYLjOcq4QWEin3P+mLmxVaT9XHvbdtS1zIRvnIvv9F3t0/8oa
147 hyQcBACB4BJEqPD7SMz354Zl+Ke4R2sraMzbkW1TocG1S63bzPInNhJnPTQed9IY
148 5gNpRKHjowzOX3ALAZf9pasHb2QiVTjm2zItn/BCrXFSIyCha97bOE5o+gYCwpQS
149 a2czj6ZQRDNqh+eRfYLLdKQVcuHqCwkrkpq96Z28GLDbF8tURohJBCARAgAJBQJL
150 rjRkAh0CAAoJEFoN7e+XQN1VKKAAn3izyNsH8InqQ2QoJS4qRtfFVMa3AJsHvUKM
151 6DEgp9Db+OG8nFtGtStIqrQ0RGF2aWQgTWVpa2xlIChDT0RFIFNJR05JTkcgS0VZ
152 KSA8ZG1laWtsZUBhcGFjaGUub3JnPohgBBMRAgAgBQJI9lbMAhsDBgsJCAcDAgQV
153 AggDBBYCAwECHgECF4AACgkQWg3t75dA3VUkYACeL6o1ljt8Z8+pU1UaYGQnXZ83
154 1bcAnRakxk95qY5rPZbXg7dcWP0D7A3euQINBEj2Vs8QCAC27/Nhska9l9g9mAm/
155 qrLCGIMGiDKdVoYxkmRBW77HGB5WrXNv+0aO1NJIyOjmJCMkM2tCDKaVBRHkIEs/
156 eaERuf23wUs/UilYZKEUgljtZDJh1VkmxkA1HWLX33jwweZgiz6SCj1w7FGeYqCZ
157 kS3ivfbaH67xMCLnsai7rw9RWy6+tDwxBOM1FQxK+al+rM47LXdLWqbVL2+ziUci
158 TVyxD5w9bEImZFiUEdcR026AVLfNGyY/EFrUDGA+eJn+QwsuhXX4xk2MSCajvRVU
159 YsYD2Ud/mfzE1W4OeszJPMdp8tCm5n11UtE6SVh1gFVjzGEncEgpMekJS+/ZUj9D
160 x50rAAMFB/9B7f3+mCzIHEZzYuzlqNjJVZaBXhPolX/Jb8I92TheqJnBoOcSqJ+J
161 6rs4Nvm06XbVQbDB9287NQuarbtTC4jbBPwcBnd8VVLYLWbc/y9M7ajGCAIIAS9T
162 kpgjLhb09VzWK5nZS2MPYUiqiaRilIhJg2fPZ1AtTtLlisfmteIRBIBpkJs8Jsxh
163 bvP4h7w3veQd6HGPeaXtkshBN3ul+Mlc6U+8sJUtot5fClxToR1EkWuSw31W/Ygl
164 H1ZimoIlwWCwwktHw0oS68OFVxbknEVB6W6q3v1hWdyikTwJF86Y13nNqKofEc65
165 JZ3kSqhFrurhEmktNh0DXxG4kOtoKPy+iEkEGBECAAkFAkj2Vs8CGwwACgkQWg3t
166 75dA3VU4PQCeNMHJYVMojKYy9zNB13vOztD7HCMAmQFUTckcVLR2qWXAWAzAmv8l
167 awZGmQINBEuuMQwBEADs/GZ6i/YJxep0pi/wA80qQ2hb1pdYvghZFhUQT0iTAesM
168 N3QIfdntd7Q89NJmg22uSVBtDuasFXF/AwsMlW7KMO+mE7z0G82rLmgTgKGPuNkU
169 DOy/7/d7ltP9NQvcho2nBujS24R/UvcynPE4YgjyoE0yrbgV3ipmXBhn7fFrne34
170 dXSvUI01tGEMmsCgD4Fof6XzGrrPLMZDF8eEIYk8wmzJ/3kGHaynMOvBnIFCtuyh
171 Y4tCcUWC23VNk1TqCXTjA4/Xw8/WocxSq44o+sfCmu1eCEBCVFc/9TvEDtAF/wLs
172 fYQrDKntc9PL9JZve4yCxV4N8q50zh5P5gP3zTou78uC35L33axO+6dd2SyI/hQ3
173 s7wUHnLfnscCOz076F+IIhTIMUzpSrHTNrEUnjhSsYDAyJMgCWBq8DL6CSNb2JZ3
174 SfbSUAdTaLF4HoWo5nkhx67tHiXINtqw2IZK0ASjJ8EpK7MFyQOnLIY88A8N7/3g
175 irMhkwCnbzKzh4JHgpSE+DHFl+8hv8EW7d9dXh6mkrACAnR8wQBaBfTnWTJetOoe
176 z5pqrJpieZCqPeGbhFfdfMymKAB+FWGJQznKEJYWXncqfdK8DcNvYRp925CIOzr/
177 Zm2JoLoou1LKreoquCKu5nMDDGma3Z7NYVpKnaD2xmOehQ7O0XdSVO5P0kjDxwAR
178 AQABiQIfBCABAgAJBQJLrjSDAh0CAAoJEGBlJ+auqMarJnsP/j6E+hQ9vkdvMncX
179 bQXX9auQeI0tRQDvgoKmQYk9T16QyhXANZoJDzuLEmE/8kMNwr0U5ay3lEV0KsJZ
180 e+z8fnsEmfNoV6xACNwa/DT4V4dQnVvy++K9z8CndX/3QNimduvuKzCZhEbMltwd
181 QSYhLr7JUWBerayMD3XR8Jl+bYnU47FapI4pgDNOKbLsdhEPhlEcZUhqBy/8d/p/
182 NjI6NLzOpBieSbYhYYh5O8wjR0JJw5gtYf//IO7GQYoAGzbGLa9m/MCsNSNU9M3H
183 YQs0dnBYxS3yGOk+rIRrfb/MR6ySjfSLiNb6IvSrSQ2oaJfjm3NnBYD4z1QcAq+J
184 mfFmb0Zq7uAEaxrovzkFX3mwNmxeoMTqJP0nr6napU5y7maJYzsLO/tHf+NeIiZI
185 uoK8Q/GQE1QJW3nQ2/LOWwAyXTLoMR6/IP+HO9Lk6ad9hXPswqPKw6vGTJrMJJS+
186 eyTRAFoxY8nQ24rkE6uT37hZUPeoZdZ6sGDIJBDWUXTez/U/TgCFrbpoMhMC2oLO
187 WdA0Qr/vgz6wdb/8x/CUe49eKzdXhIacAI7aYXtLOQwVprARDTUqSrfB9ijwz80B
188 krL1OWqgI1iHcsQD0e8eqzVQCndehvLMhezRQ2NSQyZNF0AsBP39OtyK5ATUNrrq
189 6g8LraF/l9VwsC/63dtfJyvq0sLEtDREYXZpZCBNZWlrbGUgKENPREUgU0lHTklO
190 RyBLRVkpIDxkbWVpa2xlQGFwYWNoZS5vcmc+iQI3BBMBAgAhAhsDAh4BAheABQJL
191 rjHwBQsJCAcDBRUKCQgLBRYCAwEAAAoJEGBlJ+auqMarJ9sP/RD9Qa3GS/S5c4ES
192 bpshfrqmD2BF3HJgbAJms49/V2pyC6lpVNARretn7G070WRqcBVIPNW0LLGkUeML
193 rtUMTiL+IIUz8I5REAd0e5NXXx0dnUMkkicJJ0IW6Xu6YS1NgTB5IoTonhKS3eIj
194 uDUYWGHS1ESEr0WI6Hv65IukkWFohl0awvgcqrS6unXFLtzEWedJn5hYyoFGVpeu
195 IBjrId6rZvTOJCFomdsgoN6rCJdLdK3RCaPyTk3ArVz0zuMcs29iKBq9liVkt5Cy
196 vSagsTrlXv5gD6+pgRas7oE2ZvRBHjg0qBqvhibfqDEXFg+jcke131OS1e7Knrep
197 0G32oXwszDDj2uGUk0Tw9RdzqZDZISehpQrhQ5uxNVyhF2Ms7sFnhjNd8G0lOfQk
198 eVi1lkS3utdMya5bRI4ursnY6Kam658ZqzviYoXoHextpOlEPGHAiGfDlS3uKU3F
199 qBX6jqZY3uxLpe/YAdxZcN8TjYnjXYTdcaTuXdvVvO3+xPVKG/STPoY3PU72u6xp
200 jMhf/jSzwi5BfMSKw20q63zcUMHk1Das2zUTUoXgKPJJAKaCU8THcBX8TEJ8v8NE
201 PEmtJe6ZnkXdt4b+9EqJC3h+LnzqoIZEiKnVirdatvuIMkTOln19WFRKDgOBLM5q
202 zDMWO/C2onb46vCf9rIiGmyoHVghuQINBEuuMQwBEAC4aY7Ecu8JmuRForjm/nbR
203 Sk4uy7XHvGBzVO4gSfeSmPFRog+raLWjmh61nA95nGJOK+C8ztZ3dPmSmQ2TGIsT
204 wRd1nb60Gz+G7bgiSbLXWZ5r8k9bGKCAHtXJF6uKXzdc1bM3vDYeXTSVEtKQsiyC
205 IgQVVmNP1cqzOVWbpDhb5RfvN7ZMJNguJXI5M8OM7h8wUejmR3TJ05kE/uxRhve5
206 YvX+fYZkOmKp8kHE5TQ98SDSm6tGvbdwQp4kgna7IpIkvYiNdQq75jElIwh/m/wu
207 K1r3ip6w6FvhfqNt9nHkLigYZJ7DLNv4blFx7OhIGp2cl7jYNDFyzQJ0LrfmytPC
208 tcWmSVTRNCxOvG+ry4v+b+HcSdf7rTKr+itPOSNDxgjK1d9nS1eFqfFD4qeCup/X
209 kvDGAo3gD7115Tsj/lzcjdqztG3oeVhadIikvJkQ5QWAwV5feBm9qqiDTxZRyPyj
210 mAC8xIjgweDBVbYUK+IvEdS4xnFCm/tpNqq46mCdkZtj11Viy7zHkBk/YJK3MRii
211 8iGTFKd8UOxjzJ5ADab22hnJSTv/3bvKC0AlwSo6J9Y5PExB7ZOqvDqOhMAOAHVa
212 xz+R/d+MAkaoK/MBTRI7RG3fyIdYQDTi1MJJEkdp/krtm1pTjTe/2E2Qeda4YY3l
213 x6uSbdFZpD46oSANeIuEBQARAQABiQIfBBgBAgAJBQJLrjEMAhsMAAoJEGBlJ+au
214 qMarO+YQAJgYbXbmK6WRBNj4YukelrJEGQpd8lBaD1VoVjsx4nEfUU5i56Kd1rem
215 Es+AyttEdfc//jDuCKOEuWZyIsezE0pu6syL3X4EESR0VnQtwgwZj83/DrpwObN4
216 48OfBKf7w2/jEFomuT7LaNCB2PFUd09kAySPfIvOYVLRjsnTL+qUyKKt5q/yxC1B
217 LDstQ2NWppWly4I47zhbFbXSZ0Ggov3KS0GlW3LxEbCEUSxMLBmN4/J7s44qvDLu
218 AhMx7lrnFwRYBzJPbyQwt5DaFUOGQAZqR7h5c/TRvcpsj+WlY4BODXs7//FhBCPf
219 Li+cutvmi5bwONgTdR/xmBdWX+0YgH2LGLj92oOoqVQMb38+U4EFgEBzhD6ASgFd
220 CtqOdLGnKqTRB9UNWIy4Lj47Nlw7LF1ZpqdeKkMTUo4kzljzOT0611MZMA9z4EGV
221 97e8OG1j29BghlfQoriEo9HWhZ/jnYDf4YEV4RUJr5te8F9XAG4MEPAgg7A4tFwS
222 f6oikLFa1AGBrgyEyD3kBsV/tBdbpeC70kVKto5KSgRB4JajrBIthpLdCJlFkfdr
223 wjmlka3INRFZ6pd9nzsS2oX542c+5m1g1yuIMAsbqTH60UD+pfG6ZUiXZBlPOXzK
224 g4YqqSSvCKVQSOzFPkfcMMIkYzfKOk1PAgvcemNciWspXcFk3kBt
225 =hghl
226 -----END PGP PUBLIC KEY BLOCK-----
227 pub 4096R/0EB30B07 2014-02-04
228 uid David Meikle (CODE SIGNING KEY) <dmeikle@apache.org>
229 sub 4096R/84C15C40 2014-02-04
230
231 -----BEGIN PGP PUBLIC KEY BLOCK-----
232 Version: GnuPG v1
233
234 mQINBFLxaSYBEADUywK+vv9sbxjLrW5aAM5bSxyZdPLgv8xUphG40XEGQPAamGiL
235 aDg9cgob1eZNcxmzMmp/O4vHdcdjzHN0iRMUpsYaSlm9YjqbK3sYynrXqahmHJFa
236 o2f8YS3O9MLfDTC/bbB8F2kTTeURlBCkRoP8jCHTpEkk9D0gz3+beXZnDKF4M+dY
237 I2mjZa0mCK1LWN7gm/lFDneogYMrJUBDL3jbZp0ASCMK9gH11BsuqpqhLC++IuCi
238 8GvKvap06d7YyfrF3DjsqN6IIK1WDq4zrm6QxIOU+t8KkMGywO4HwlQH0/rj66Kv
239 mgzB5CDgsjpEdAcYiTZgaCfqiudTaA8AvIcVF4fYA/su2nwg0urInBVeM+aRHSxD
240 pTi3wY1p/8YUxW5QqkzmVN5miHimDCGT/2KE3cWigK+iSr399PY9XYsD/TRsbsyU
241 hdxO+h+F/x+YOfUS+YUe+/EN8QZaCyUc3STzjWwd2z7UDOufy10iTHWuewPHMKU/
242 n451DTjiXtnXGu/Vz/kidKlmRQnGnkjj1qK5zTmwAfo81DTT/iezaAv9HS6OCJdx
243 35o0J1aPow3oA5JC/XR7EfV1b14QwVWy8Fv7MlBkbgLqtPdQSUYN41Iz/u7ucGLi
244 8JbW21gzUbxl+/EiyoWLzAAZh2wQMywqHWJb58A1JYWWXYmoVFDb3jl/TwARAQAB
245 tDREYXZpZCBNZWlrbGUgKENPREUgU0lHTklORyBLRVkpIDxkbWVpa2xlQGFwYWNo
246 ZS5vcmc+iQI3BBMBCgAhBQJS8WkmAhsDBQsJCAcDBRUKCQgLBRYCAwEAAh4BAheA
247 AAoJEFJBSwsOswsHOL0QALbL9kkMz3aKmc3wYjGk8789bNz7RzAGaliZZyLbDUwx
248 sHlC5YOFoshMxZM3w7kI7wFQh3W55yX+kzLErTjJFgMpCiKEeGvOtKjSI1baOyb1
249 DNxWVxuWrm+7U0kkPYDtVTMt13KstAPIto4we9Q9kzcsXqzPH0Y1iqagmBalGQn/
250 UkgfrXGEH2Zdp8DNkSQZRKTZxyOk4GVIt4FJyXeuSxNW0PvwzLe+NO38KxxCMd7u
251 HCjyyXoxvgI7GeijZ1N3iOVG6efhDHVydQd9GL5dHM70nkeCLxLXk34VM6vrkoxm
252 0D5mEKoNre+yZP7awxLZR+anaEd99xxp27QiZEAw9Nq87NLgu7T9auULz82gPa60
253 kYegJj6ohWdH5sJ1uYM9XNEIko6iqqFveb5HbjlHiu/U0ZT02Yy+aQ+MCMd5i0Mr
254 yTpZZplU3QS7sMrAPDGrTP7A0pHkcMxLu+EKfnAYgtWKcPHmwdpWEHwcJaaYD5LU
255 U3+oNL01iP7fdTp+Nu6eHqCg3GXIkCEwN88Vr9IbAkoQD3DrRWerh35X9zOeb56i
256 GT+UulAJayBWIgypp6j+uiDqOtDWysOQBn1wQxkERSHzsHtKJ4OXTqXudZ+gNhAK
257 cQPDzrm1vaT/WoGLxL/hvjf1jo0UD/UtCKnFbCphjKXifuXiRFmr0MkI12ui79rk
258 uQINBFLxaSYBEACoQN98rA1Nj2shfaWDe3Pjhpd7f6qin86ziKbw8Eu/AxdiG5Xh
259 PpZbYm63+GKOinAwP0T4V1Fln+j+XH650Ysee1dexa8gXufChf85FKq/rDGjTPG0
260 RFvI6DkGDP+u4sJJdyAjkZjoZrUOR6ai3kSLIcVAsBRT/NLnlDnfljVfK1hbrE1Y
261 pLVxKmeTbJsvZOjQA1MgCigAlH3AAXcfZE9UY9HlPHrNDBNazvc89fzktgyUBS7b
262 H99J9DxmtWIn/XqpsFF/lQ86zeirWSSofJvfk6G66yxS5ApKB7O15GZ1AyqWkAzD
263 KfsLfd4gAfezRfKDJcWrY8/DyjsyRfbqF6HoBnV0UOLpzqZ8UbjP5+64WDGqajUc
264 6pUoW0lBJUxu1ZYvjCy7O/m1GNQD/v5Atoi0M2MUOLui30yN1AxC9qhbKFZB8vb1
265 F4cMuTuCtfF+GeOk8Ib2gl8RhSVmIlrbek7p5wk6pPR8Jf/9ngwySr4dhQBs3GdJ
266 Hh8fBge4zkg6kFo1pCo99bFzinB3hSbCV3hTnf2uLAb2LCizr125uAc5UML/jeHq
267 SXSZtHwfcy1wYKKmaynLcz8iMMM+CaCuNTUVLS2VI4P0p5eHUH1SAL1qT9T2S3gp
268 gM+5pyZmeDOWigtbUOJ79QBM17lbqlexvNQ/t1TcyWPHk9bXXvdAjoNBnwARAQAB
269 iQIfBBgBCgAJBQJS8WkmAhsMAAoJEFJBSwsOswsHJUAQAKslPy2ZNJwMvFyKaHfk
270 F0ki+tGDy3Zy/qYW8aQFxnzsR4fTl/BJNXBCh2EpYTGpSESfdvZeMXk8+jUVatq/
271 C8tLfFoDzwrPudwAemwynEJ+MqK8kJKbmMvmXddzqLJKrHXjzZCmi706ssAkioqI
272 XSJYZt+d93tTcZsmVrWwoyXUE+ZBbQfmds32HqeziepmfGXasWnRj1/fhdY5XZWJ
273 rCIZnwCadpxKlPj74XxfEastcKVsI+EWr7Sj83B7RsGS1IPjclY77PBNTCBSB1U3
274 kBTD7l/dYDLXpiJZ5OAyRO9MrXW7A876XhaBsRdcRxUqEe98NFLVhWuJO6RTb89L
275 if3LGxsvuaHpIQD0NgkNNTYL9PfIbRjTZpJPQtWErt+eRK4QuZRA1HYKT/htYVH6
276 /oqsMKlrIh9AoVXPQ+ExCw8TqNo0jTSh0Kdy4Sj0vASLDzckpzFVz2cMoPBfgsjr
277 U1h+Fc3lMPTjWW/YZUaPt7V+m1jrI4ikm+EC4TIWKtgE+VvHAXHnUPKm2G4a2a0Y
278 +a5Hwbtz+6hSCxpa2WMIV4wBhaoEeoRrgfllqfovNJ2BJHpZAvGqavFf/yqnqXst
279 JN4T06+JEcezrCUTDyZ/M0GNGVNq/cdNApc1XIl+tukUCYS/mg6uUWhVzfWenukM
280 WDWdnyjxFv/9quOxHM976hkg
281 =CmKg
282 -----END PGP PUBLIC KEY BLOCK-----
0
1 Apache License
2 Version 2.0, January 2004
3 http://www.apache.org/licenses/
4
5 TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
7 1. Definitions.
8
9 "License" shall mean the terms and conditions for use, reproduction,
10 and distribution as defined by Sections 1 through 9 of this document.
11
12 "Licensor" shall mean the copyright owner or entity authorized by
13 the copyright owner that is granting the License.
14
15 "Legal Entity" shall mean the union of the acting entity and all
16 other entities that control, are controlled by, or are under common
17 control with that entity. For the purposes of this definition,
18 "control" means (i) the power, direct or indirect, to cause the
19 direction or management of such entity, whether by contract or
20 otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 outstanding shares, or (iii) beneficial ownership of such entity.
22
23 "You" (or "Your") shall mean an individual or Legal Entity
24 exercising permissions granted by this License.
25
26 "Source" form shall mean the preferred form for making modifications,
27 including but not limited to software source code, documentation
28 source, and configuration files.
29
30 "Object" form shall mean any form resulting from mechanical
31 transformation or translation of a Source form, including but
32 not limited to compiled object code, generated documentation,
33 and conversions to other media types.
34
35 "Work" shall mean the work of authorship, whether in Source or
36 Object form, made available under the License, as indicated by a
37 copyright notice that is included in or attached to the work
38 (an example is provided in the Appendix below).
39
40 "Derivative Works" shall mean any work, whether in Source or Object
41 form, that is based on (or derived from) the Work and for which the
42 editorial revisions, annotations, elaborations, or other modifications
43 represent, as a whole, an original work of authorship. For the purposes
44 of this License, Derivative Works shall not include works that remain
45 separable from, or merely link (or bind by name) to the interfaces of,
46 the Work and Derivative Works thereof.
47
48 "Contribution" shall mean any work of authorship, including
49 the original version of the Work and any modifications or additions
50 to that Work or Derivative Works thereof, that is intentionally
51 submitted to Licensor for inclusion in the Work by the copyright owner
52 or by an individual or Legal Entity authorized to submit on behalf of
53 the copyright owner. For the purposes of this definition, "submitted"
54 means any form of electronic, verbal, or written communication sent
55 to the Licensor or its representatives, including but not limited to
56 communication on electronic mailing lists, source code control systems,
57 and issue tracking systems that are managed by, or on behalf of, the
58 Licensor for the purpose of discussing and improving the Work, but
59 excluding communication that is conspicuously marked or otherwise
60 designated in writing by the copyright owner as "Not a Contribution."
61
62 "Contributor" shall mean Licensor and any individual or Legal Entity
63 on behalf of whom a Contribution has been received by Licensor and
64 subsequently incorporated within the Work.
65
66 2. Grant of Copyright License. Subject to the terms and conditions of
67 this License, each Contributor hereby grants to You a perpetual,
68 worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 copyright license to reproduce, prepare Derivative Works of,
70 publicly display, publicly perform, sublicense, and distribute the
71 Work and such Derivative Works in Source or Object form.
72
73 3. Grant of Patent License. Subject to the terms and conditions of
74 this License, each Contributor hereby grants to You a perpetual,
75 worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 (except as stated in this section) patent license to make, have made,
77 use, offer to sell, sell, import, and otherwise transfer the Work,
78 where such license applies only to those patent claims licensable
79 by such Contributor that are necessarily infringed by their
80 Contribution(s) alone or by combination of their Contribution(s)
81 with the Work to which such Contribution(s) was submitted. If You
82 institute patent litigation against any entity (including a
83 cross-claim or counterclaim in a lawsuit) alleging that the Work
84 or a Contribution incorporated within the Work constitutes direct
85 or contributory patent infringement, then any patent licenses
86 granted to You under this License for that Work shall terminate
87 as of the date such litigation is filed.
88
89 4. Redistribution. You may reproduce and distribute copies of the
90 Work or Derivative Works thereof in any medium, with or without
91 modifications, and in Source or Object form, provided that You
92 meet the following conditions:
93
94 (a) You must give any other recipients of the Work or
95 Derivative Works a copy of this License; and
96
97 (b) You must cause any modified files to carry prominent notices
98 stating that You changed the files; and
99
100 (c) You must retain, in the Source form of any Derivative Works
101 that You distribute, all copyright, patent, trademark, and
102 attribution notices from the Source form of the Work,
103 excluding those notices that do not pertain to any part of
104 the Derivative Works; and
105
106 (d) If the Work includes a "NOTICE" text file as part of its
107 distribution, then any Derivative Works that You distribute must
108 include a readable copy of the attribution notices contained
109 within such NOTICE file, excluding those notices that do not
110 pertain to any part of the Derivative Works, in at least one
111 of the following places: within a NOTICE text file distributed
112 as part of the Derivative Works; within the Source form or
113 documentation, if provided along with the Derivative Works; or,
114 within a display generated by the Derivative Works, if and
115 wherever such third-party notices normally appear. The contents
116 of the NOTICE file are for informational purposes only and
117 do not modify the License. You may add Your own attribution
118 notices within Derivative Works that You distribute, alongside
119 or as an addendum to the NOTICE text from the Work, provided
120 that such additional attribution notices cannot be construed
121 as modifying the License.
122
123 You may add Your own copyright statement to Your modifications and
124 may provide additional or different license terms and conditions
125 for use, reproduction, or distribution of Your modifications, or
126 for any such Derivative Works as a whole, provided Your use,
127 reproduction, and distribution of the Work otherwise complies with
128 the conditions stated in this License.
129
130 5. Submission of Contributions. Unless You explicitly state otherwise,
131 any Contribution intentionally submitted for inclusion in the Work
132 by You to the Licensor shall be under the terms and conditions of
133 this License, without any additional terms or conditions.
134 Notwithstanding the above, nothing herein shall supersede or modify
135 the terms of any separate license agreement you may have executed
136 with Licensor regarding such Contributions.
137
138 6. Trademarks. This License does not grant permission to use the trade
139 names, trademarks, service marks, or product names of the Licensor,
140 except as required for reasonable and customary use in describing the
141 origin of the Work and reproducing the content of the NOTICE file.
142
143 7. Disclaimer of Warranty. Unless required by applicable law or
144 agreed to in writing, Licensor provides the Work (and each
145 Contributor provides its Contributions) on an "AS IS" BASIS,
146 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 implied, including, without limitation, any warranties or conditions
148 of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 PARTICULAR PURPOSE. You are solely responsible for determining the
150 appropriateness of using or redistributing the Work and assume any
151 risks associated with Your exercise of permissions under this License.
152
153 8. Limitation of Liability. In no event and under no legal theory,
154 whether in tort (including negligence), contract, or otherwise,
155 unless required by applicable law (such as deliberate and grossly
156 negligent acts) or agreed to in writing, shall any Contributor be
157 liable to You for damages, including any direct, indirect, special,
158 incidental, or consequential damages of any character arising as a
159 result of this License or out of the use or inability to use the
160 Work (including but not limited to damages for loss of goodwill,
161 work stoppage, computer failure or malfunction, or any and all
162 other commercial damages or losses), even if such Contributor
163 has been advised of the possibility of such damages.
164
165 9. Accepting Warranty or Additional Liability. While redistributing
166 the Work or Derivative Works thereof, You may choose to offer,
167 and charge a fee for, acceptance of support, warranty, indemnity,
168 or other liability obligations and/or rights consistent with this
169 License. However, in accepting such obligations, You may act only
170 on Your own behalf and on Your sole responsibility, not on behalf
171 of any other Contributor, and only if You agree to indemnify,
172 defend, and hold each Contributor harmless for any liability
173 incurred by, or claims asserted against, such Contributor by reason
174 of your accepting any such warranty or additional liability.
175
176 END OF TERMS AND CONDITIONS
177
178 APPENDIX: How to apply the Apache License to your work.
179
180 To apply the Apache License to your work, attach the following
181 boilerplate notice, with the fields enclosed by brackets "[]"
182 replaced with your own identifying information. (Don't include
183 the brackets!) The text should be enclosed in the appropriate
184 comment syntax for the file format. We also recommend that a
185 file or class name and description of purpose be included on the
186 same "printed page" as the copyright notice for easier
187 identification within third-party archives.
188
189 Copyright [yyyy] [name of copyright owner]
190
191 Licensed under the Apache License, Version 2.0 (the "License");
192 you may not use this file except in compliance with the License.
193 You may obtain a copy of the License at
194
195 http://www.apache.org/licenses/LICENSE-2.0
196
197 Unless required by applicable law or agreed to in writing, software
198 distributed under the License is distributed on an "AS IS" BASIS,
199 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 See the License for the specific language governing permissions and
201 limitations under the License.
202
203
204
205 APACHE TIKA SUBCOMPONENTS
206
207 Apache Tika includes a number of subcomponents with separate copyright notices
208 and license terms. Your use of these subcomponents is subject to the terms and
209 conditions of the following licenses.
210
211 MIME type information from file-4.26.tar.gz (http://www.darwinsys.com/file/)
212
213 Copyright (c) Ian F. Darwin 1986, 1987, 1989, 1990, 1991, 1992, 1994, 1995.
214 Software written by Ian F. Darwin and others;
215 maintained 1994- Christos Zoulas.
216
217 This software is not subject to any export provision of the United States
218 Department of Commerce, and may be exported to any country or planet.
219
220 Redistribution and use in source and binary forms, with or without
221 modification, are permitted provided that the following conditions
222 are met:
223 1. Redistributions of source code must retain the above copyright
224 notice immediately at the beginning of the file, without modification,
225 this list of conditions, and the following disclaimer.
226 2. Redistributions in binary form must reproduce the above copyright
227 notice, this list of conditions and the following disclaimer in the
228 documentation and/or other materials provided with the distribution.
229
230 THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
231 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
232 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
233 ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
234 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
235 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
236 OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
237 HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
238 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
239 OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
240 SUCH DAMAGE.
241
242 Charset detection code from ICU4J (http://site.icu-project.org/)
243
244 Copyright (c) 1995-2009 International Business Machines Corporation
245 and others
246
247 All rights reserved.
248
249 Permission is hereby granted, free of charge, to any person obtaining
250 a copy of this software and associated documentation files (the
251 "Software"), to deal in the Software without restriction, including
252 without limitation the rights to use, copy, modify, merge, publish,
253 distribute, and/or sell copies of the Software, and to permit persons
254 to whom the Software is furnished to do so, provided that the above
255 copyright notice(s) and this permission notice appear in all copies
256 of the Software and that both the above copyright notice(s) and this
257 permission notice appear in supporting documentation.
258
259 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
260 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
261 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
262 IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
263 BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
264 OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
265 WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
266 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
267 SOFTWARE.
268
269 Except as contained in this notice, the name of a copyright holder shall
270 not be used in advertising or otherwise to promote the sale, use or other
271 dealings in this Software without prior written authorization of the
272 copyright holder.
273
274
275 Parsing functionality provided by the NetCDF Java Library (http://www.unidata.ucar.edu/software/netcdf-java/)
276
277 Copyright 1993-2010 University Corporation for Atmospheric Research/Unidata
278
279 Portions of this software were developed by the Unidata Program at the University
280 Corporation for Atmospheric Research.
281
282 Access and use of this software shall impose the following obligations and understandings
283 on the user. The user is granted the right, without any fee or cost, to use, copy, modify,
284 alter, enhance and distribute this software, and any derivative works thereof, and its
285 supporting documentation for any purpose whatsoever, provided that this entire notice
286 appears in all copies of the software, derivative works and supporting documentation. Further,
287 UCAR requests that the user credit UCAR/Unidata in any publications that result from the use
288 of this software or in any product that includes this software, although this is not an obligation.
289 The names UCAR and/or Unidata, however, may not be used in any advertising or publicity to endorse
290 or promote any products or commercial entity unless specific written permission is obtained from
291 UCAR/Unidata. The user also understands that UCAR/Unidata is not obligated to provide the user with
292 any support, consulting, training or assistance of any kind with regard to the use, operation and
293 performance of this software nor to provide the user with any updates, revisions, new versions or
294 "bug fixes."
295
296 THIS SOFTWARE IS PROVIDED BY UCAR/UNIDATA "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
297 BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
298 ARE DISCLAIMED. IN NO EVENT SHALL UCAR/UNIDATA BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL
299 DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
300 OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE ACCESS,
301 USE OR PERFORMANCE OF THIS SOFTWARE.
302
303
304 IPTC Photo Metadata descriptions are taken from the IPTC Photo Metadata
305 Standard, July 2010, Copyright 2010 International Press Telecommunications
306 Council.
307
308 1. The Specifications and Materials are licensed for use only on the condition that you agree to be bound by the terms of this license. Subject to this and other licensing requirements contained herein, you may, on a non-exclusive basis, use the Specifications and Materials.
309 2. The IPTC openly provides the Specifications and Materials for voluntary use by individuals, partnerships, companies, corporations, organizations and any other entity for use at the entity's own risk. This disclaimer, license and release is intended to apply to the IPTC, its officers, directors, agents, representatives, members, contributors, affiliates, contractors, or co-venturers acting jointly or severally.
310 3. The Document and translations thereof may be copied and furnished to others, and derivative works that comment on or otherwise explain it or assist in its implementation may be prepared, copied, published and distributed, in whole or in part, without restriction of any kind, provided that the copyright and license notices and references to the IPTC appearing in the Document and the terms of this Specifications License Agreement are included on all such copies and derivative works. Further, upon the receipt of written permission from the IPTC, the Document may be modified for the purpose of developing applications that use IPTC Specifications or as required to translate the Document into languages other than English.
311 4. Any use, duplication, distribution, or exploitation of the Document and Specifications and Materials in any manner is at your own risk.
312 5. NO WARRANTY, EXPRESSED OR IMPLIED, IS MADE REGARDING THE ACCURACY, ADEQUACY, COMPLETENESS, LEGALITY, RELIABILITY OR USEFULNESS OF ANY INFORMATION CONTAINED IN THE DOCUMENT OR IN ANY SPECIFICATION OR OTHER PRODUCT OR SERVICE PRODUCED OR SPONSORED BY THE IPTC. THE DOCUMENT AND THE INFORMATION CONTAINED HEREIN AND INCLUDED IN ANY SPECIFICATION OR OTHER PRODUCT OR SERVICE OF THE IPTC IS PROVIDED ON AN "AS IS" BASIS. THE IPTC DISCLAIMS ALL WARRANTIES OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, ANY ACTUAL OR ASSERTED WARRANTY OF NON-INFRINGEMENT OF PROPRIETARY RIGHTS, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. NEITHER THE IPTC NOR ITS CONTRIBUTORS SHALL BE HELD LIABLE FOR ANY IMPROPER OR INCORRECT USE OF INFORMATION. NEITHER THE IPTC NOR ITS CONTRIBUTORS ASSUME ANY RESPONSIBILITY FOR ANYONE'S USE OF INFORMATION PROVIDED BY THE IPTC. IN NO EVENT SHALL THE IPTC OR ITS CONTRIBUTORS BE LIABLE TO ANYONE FOR DAMAGES OF ANY KIND, INCLUDING BUT NOT LIMITED TO, COMPENSATORY DAMAGES, LOST PROFITS, LOST DATA OR ANY FORM OF SPECIAL, INCIDENTAL, INDIRECT, CONSEQUENTIAL OR PUNITIVE DAMAGES OF ANY KIND WHETHER BASED ON BREACH OF CONTRACT OR WARRANTY, TORT, PRODUCT LIABILITY OR OTHERWISE.
313 6. The IPTC takes no position regarding the validity or scope of any Intellectual Property or other rights that might be claimed to pertain to the implementation or use of the technology described in the Document or the extent to which any license under such rights might or might not be available. The IPTC does not represent that it has made any effort to identify any such rights. Copies of claims of rights made available for publication, assurances of licenses to be made available, or the result of an attempt made to obtain a general license or permission for the use of such proprietary rights by implementers or users of the Specifications and Materials, can be obtained from the Managing Director of the IPTC.
314 7. By using the Specifications and Materials including the Document in any manner or for any purpose, you release the IPTC from all liabilities, claims, causes of action, allegations, losses, injuries, damages, or detriments of any nature arising from or relating to the use of the Specifications, Materials or any portion thereof. You further agree not to file a lawsuit, make a claim, or take any other formal or informal legal action against the IPTC, resulting from your acquisition, use, duplication, distribution, or exploitation of the Specifications, Materials or any portion thereof. Finally, you hereby agree that the IPTC is not liable for any direct, indirect, special or consequential damages arising from or relating to your acquisition, use, duplication, distribution, or exploitation of the Specifications, Materials or any portion thereof.
315 8. Specifications and Materials may be downloaded or copied provided that ALL copies retain the ownership, copyright and license notices.
316 9. Materials may not be edited, modified, or presented in a context that creates a misleading or false impression or statement as to the positions, actions, or statements of the IPTC.
317 10. The name and trademarks of the IPTC may not be used in advertising, publicity, or in relation to products or services and their names without the specific, written prior permission of the IPTC. Any permitted use of the trademarks of the IPTC, whether registered or not, shall be accompanied by an appropriate mark and attribution, as agreed with the IPTC.
318 11. Specifications may be extended by both members and non-members to provide additional functionality (Extension Specifications) provided that there is a clear recognition of the IPTC IP and its ownership in the Extension Specifications and the related documentation and provided that the extensions are clearly identified and provided that a perpetual license is granted by the creator of the Extension Specifications for other members and non-members to use the Extension Specifications and to continue extensions of the Extension Specifications. The IPTC does not waive any of its rights in the Specifications and Materials in this context. The Extension Specifications may be considered the intellectual property of their creator. The IPTC expressly disclaims any responsibility for damage caused by an extension to the Specifications.
319 12. Specifications and Materials may be included in derivative work of both members and non-members provided that there is a clear recognition of the IPTC IP and its ownership in the derivative work and its related documentation. The IPTC does not waive any of its rights in the Specifications and Materials in this context. Derivative work in its entirety may be considered the intellectual property of the creator of the work .The IPTC expressly disclaims any responsibility for damage caused when its IP is used in a derivative context.
320 13. This Specifications License Agreement is perpetual subject to your conformance to the terms of this Agreement. The IPTC may terminate this Specifications License Agreement immediately upon your breach of this Agreement and, upon such termination you will cease all use, duplication, distribution, and/or exploitation in any manner of the Specifications and Materials.
321 14. This Specifications License Agreement reflects the entire agreement of the parties regarding the subject matter hereof and supersedes all prior agreements or representations regarding such matters, whether written or oral. To the extent any portion or provision of this Specifications License Agreement is found to be illegal or unenforceable, then the remaining provisions of this Specifications License Agreement will remain in full force and effect and the illegal or unenforceable provision will be construed to give it such effect as it may properly have that is consistent with the intentions of the parties.
322 15. This Specifications License Agreement may only be modified in writing signed by an authorized representative of the IPTC.
323 16. This Specifications License Agreement is governed by the law of United Kingdom, as such law is applied to contracts made and fully performed in the United Kingdom. Any disputes arising from or relating to this Specifications License Agreement will be resolved in the courts of the United Kingdom. You consent to the jurisdiction of such courts over you and covenant not to assert before such courts any objection to proceeding in such forums.
0 Apache Tika
1 Copyright 2011 The Apache Software Foundation
2
3 This product includes software developed at
4 The Apache Software Foundation (http://www.apache.org/).
5
6 Copyright 1993-2010 University Corporation for Atmospheric Research/Unidata
7 This software contains code derived from UCAR/Unidata's NetCDF library.
8
9 Tika-server component uses CDDL-licensed dependencies: jersey (http://jersey.java.net/) and
10 Grizzly (http://grizzly.java.net/)
11
12 OpenCSV: Copyright 2005 Bytecode Pty Ltd. Licensed under the Apache License, Version 2.0
13
14 IPTC Photo Metadata descriptions Copyright 2010 International Press Telecommunications Council.
0 =================================================
1 Welcome to Apache Tika <http://tika.apache.org/>
2 =================================================
3
4 Apache Tika(TM) is a toolkit for detecting and extracting metadata and
5 structured text content from various documents using existing parser
6 libraries.
7
8 Tika is a project of the Apache Software Foundation <http://www.apache.org/>.
9
10 Apache Tika, Tika, Apache, the Apache feather logo, and the Apache Tika
11 project logo are trademarks of The Apache Software Foundation.
12
13 Getting Started
14 ===============
15
16 Tika is based on Java 5 and uses the Maven 2 <http://maven.apache.org/>
17 build system. To build Tika, use the following command in this directory:
18
19 mvn clean install
20
21 The build consists of a number of components, including a standalone runnable
22 jar that you can use to try out Tika features. You can run it like this:
23
24 java -jar tika-app/target/tika-app-*.jar --help
25
26 License (see also LICENSE.txt)
27 ==============================
28
29 Collective work: Copyright 2011 The Apache Software Foundation.
30
31 Licensed to the Apache Software Foundation (ASF) under one or more
32 contributor license agreements. See the NOTICE file distributed with
33 this work for additional information regarding copyright ownership.
34 The ASF licenses this file to You under the Apache License, Version 2.0
35 (the "License"); you may not use this file except in compliance with
36 the License. You may obtain a copy of the License at
37
38 http://www.apache.org/licenses/LICENSE-2.0
39
40 Unless required by applicable law or agreed to in writing, software
41 distributed under the License is distributed on an "AS IS" BASIS,
42 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
43 See the License for the specific language governing permissions and
44 limitations under the License.
45
46 Apache Tika includes a number of subcomponents with separate copyright
47 notices and license terms. Your use of these subcomponents is subject to
48 the terms and conditions of the licenses listed in the LICENSE.txt file.
49
50 Export control
51 ==============
52
53 This distribution includes cryptographic software. The country in which
54 you currently reside may have restrictions on the import, possession, use,
55 and/or re-export to another country, of encryption software. BEFORE using
56 any encryption software, please check your country's laws, regulations and
57 policies concerning the import, possession, or use, and re-export of
58 encryption software, to see if this is permitted. See
59 <http://www.wassenaar.org/> for more information.
60
61 The U.S. Government Department of Commerce, Bureau of Industry and
62 Security (BIS), has classified this software as Export Commodity Control
63 Number (ECCN) 5D002.C.1, which includes information security software using
64 or performing cryptographic functions with asymmetric algorithms. The form
65 and manner of this Apache Software Foundation distribution makes it eligible
66 for export under the License Exception ENC Technology Software Unrestricted
67 (TSU) exception (see the BIS Export Administration Regulations, Section
68 740.13) for both object code and source code.
69
70 The following provides more details on the included cryptographic software:
71
72 Apache Tika uses the Bouncy Castle generic encryption libraries for
73 extracting text content and metadata from encrypted PDF files.
74 See http://www.bouncycastle.org/ for more details on Bouncy Castle.
75
76 Mailing Lists
77 =============
78
79 Discussion about Tika takes place on the following mailing lists:
80
81 user@tika.apache.org - About using Tika
82 dev@tika.apache.org - About developing Tika
83
84 Notifications of all code changes are sent to the following mailing list:
85
86 commits@tika.apache.org
87
88 The mailing lists are open to anyone and publicly archived.
89
90 You can subscribe to the mailing lists by sending a message to
91 <LIST>-subscribe@tika.apache.org (for example user-subscribe@...).
92 To unsubscribe, send a message to <LIST>-unsubscribe@tika.apache.org.
93 For more instructions, send a message to <LIST>-help@tika.apache.org.
94
95 Issue Tracker
96 =============
97
98 If you encounter errors in Tika or want to suggest an improvement or
99 a new feature, please visit the Tika issue tracker at
100 https://issues.apache.org/jira/browse/TIKA. There you can also find the
101 latest information on known issues and recent bug fixes and enhancements.
0 <!--
1 Licensed to the Apache Software Foundation (ASF) under one or more
2 contributor license agreements. See the NOTICE file distributed with
3 this work for additional information regarding copyright ownership.
4 The ASF licenses this file to You under the Apache License, Version 2.0
5 (the "License"); you may not use this file except in compliance with
6 the License. You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10 Unless required by applicable law or agreed to in writing, software
11 distributed under the License is distributed on an "AS IS" BASIS,
12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 See the License for the specific language governing permissions and
14 limitations under the License.
15 -->
16 <assembly>
17 <id>src</id>
18 <formats>
19 <format>zip</format>
20 </formats>
21 <fileSets>
22 <fileSet>
23 <directory>${project.basedir}</directory>
24 <outputDirectory></outputDirectory>
25 <excludes>
26 <exclude>**/target/**</exclude>
27 <exclude>**/.*/**</exclude>
28 </excludes>
29 </fileSet>
30 </fileSets>
31 </assembly>
0 <?xml version="1.0" encoding="UTF-8"?>
1
2 <!--
3 Licensed to the Apache Software Foundation (ASF) under one
4 or more contributor license agreements. See the NOTICE file
5 distributed with this work for additional information
6 regarding copyright ownership. The ASF licenses this file
7 to you under the Apache License, Version 2.0 (the
8 "License"); you may not use this file except in compliance
9 with the License. You may obtain a copy of the License at
10
11 http://www.apache.org/licenses/LICENSE-2.0
12
13 Unless required by applicable law or agreed to in writing,
14 software distributed under the License is distributed on an
15 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 KIND, either express or implied. See the License for the
17 specific language governing permissions and limitations
18 under the License.
19 -->
20
21 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
22 <modelVersion>4.0.0</modelVersion>
23
24 <parent>
25 <groupId>org.apache.tika</groupId>
26 <artifactId>tika-parent</artifactId>
27 <version>1.5</version>
28 <relativePath>tika-parent/pom.xml</relativePath>
29 </parent>
30
31 <artifactId>tika</artifactId>
32 <packaging>pom</packaging>
33 <name>Apache Tika</name>
34 <url>http://tika.apache.org</url>
35
36 <scm>
37 <connection>
38 scm:svn:http://svn.apache.org/repos/asf/tika/tags/1.5/
39 </connection>
40 <developerConnection>
41 scm:svn:https://svn.apache.org/repos/asf/tika/tags/1.5/
42 </developerConnection>
43 <url>http://svn.apache.org/viewvc/tika/tags/1.5/</url>
44 </scm>
45
46 <modules>
47 <module>tika-parent</module>
48 <module>tika-core</module>
49 <module>tika-parsers</module>
50 <module>tika-xmp</module>
51 <module>tika-app</module>
52 <module>tika-bundle</module>
53 <module>tika-server</module>
54 </modules>
55
56 <build>
57 <plugins>
58 <plugin>
59 <artifactId>maven-deploy-plugin</artifactId>
60 <configuration>
61 <skip>true</skip> <!-- No need to deploy the reactor -->
62 </configuration>
63 </plugin>
64 <plugin>
65 <artifactId>maven-site-plugin</artifactId>
66 <configuration>
67 <templateDirectory>src/site</templateDirectory>
68 <template>site.vm</template>
69 </configuration>
70 </plugin>
71 <plugin>
72 <groupId>org.apache.rat</groupId>
73 <artifactId>apache-rat-plugin</artifactId>
74 <configuration>
75 <excludes>
76 <exclude>.*/**</exclude>
77 <exclude>CHANGES.txt</exclude>
78 <exclude>tika-dotnet/AssemblyInfo.cs</exclude>
79 <exclude>tika-dotnet/Tika.csproj</exclude>
80 <exclude>tika-dotnet/Tika.sln</exclude>
81 <exclude>tika-dotnet/Tika.sln.cache</exclude>
82 <exclude>tika-dotnet/obj/**</exclude>
83 <exclude>tika-dotnet/target/**</exclude>
84 </excludes>
85 </configuration>
86 </plugin>
87 </plugins>
88 </build>
89
90 <profiles>
91 <profile>
92 <id>apache-release</id>
93 <properties>
94 <username>${user.name}</username>
95 </properties>
96 <build>
97 <plugins>
98 <plugin>
99 <artifactId>maven-assembly-plugin</artifactId>
100 <executions>
101 <execution>
102 <id>src</id>
103 <goals>
104 <goal>single</goal>
105 </goals>
106 <phase>package</phase>
107 <configuration>
108 <descriptors>
109 <descriptor>assembly.xml</descriptor>
110 </descriptors>
111 </configuration>
112 </execution>
113 <execution>
114 <id>source-release-assembly</id>
115 <configuration>
116 <skipAssembly>true</skipAssembly>
117 </configuration>
118 </execution>
119 </executions>
120 </plugin>
121 <plugin>
122 <artifactId>maven-antrun-plugin</artifactId>
123 <executions>
124 <execution>
125 <goals>
126 <goal>run</goal>
127 </goals>
128 <phase>deploy</phase>
129 <configuration>
130 <tasks>
131 <mkdir dir="${basedir}/target/${project.version}" />
132 <copy todir="${basedir}/target/${project.version}" flatten="true">
133 <fileset dir="${basedir}">
134 <include name="CHANGES.txt" />
135 <include name="target/*-src.zip*" />
136 <include name="tika-app/target/*-${project.version}.jar*" />
137 </fileset>
138 </copy>
139 <checksum algorithm="MD5" fileext=".md5">
140 <fileset dir="${basedir}/target/${project.version}">
141 <include name="*.zip" />
142 <include name="*.?ar" />
143 </fileset>
144 </checksum>
145 <checksum algorithm="SHA1" fileext=".sha">
146 <fileset dir="${basedir}/target/${project.version}">
147 <include name="*.zip" />
148 <include name="*.?ar" />
149 </fileset>
150 </checksum>
151 <checksum file="${basedir}/target/${project.version}/tika-${project.version}-src.zip" algorithm="SHA1" property="checksum" />
152 <echo file="${basedir}/target/vote.txt">
153 From: ${username}@apache.org
154 To: dev@tika.apache.org
155 Subject: [VOTE] Release Apache Tika ${project.version}
156
157 A candidate for the Tika ${project.version} release is available at:
158
159 http://people.apache.org/~${username}/tika/${project.version}/
160
161 The release candidate is a zip archive of the sources in:
162
163 http://svn.apache.org/repos/asf/tika/tags/${project.version}/
164
165 The SHA1 checksum of the archive is ${checksum}.
166
167 Please vote on releasing this package as Apache Tika ${project.version}.
168 The vote is open for the next 72 hours and passes if a majority of at
169 least three +1 Tika PMC votes are cast.
170
171 [ ] +1 Release this package as Apache Tika ${project.version}
172 [ ] -1 Do not release this package because...${line.separator}
173 </echo>
174 <echo />
175 <echo>
176 The release candidate has been prepared in:
177
178 ${basedir}/target/${project.version}
179
180 Please deploy it to people.apache.org like this:
181
182 scp -r ${basedir}/target/${project.version} people.apache.org:public_html/tika/
183
184 A release vote template has been generated for you:
185
186 file://${basedir}/target/vote.txt
187 </echo>
188 <echo />
189 </tasks>
190 </configuration>
191 </execution>
192 </executions>
193 <dependencies>
194 <dependency>
195 <groupId>org.apache.ant</groupId>
196 <artifactId>ant-nodeps</artifactId>
197 <version>1.8.1</version>
198 </dependency>
199 </dependencies>
200 </plugin>
201 </plugins>
202 </build>
203 </profile>
204 <profile>
205 <id>java7</id>
206 <activation>
207 <jdk>[1.7,]</jdk>
208 </activation>
209 <modules>
210 <module>tika-java7</module>
211 </modules>
212 </profile>
213 </profiles>
214
215 <description>The Apache Tika™ toolkit detects and extracts metadata and structured text content from various documents using existing parser libraries. </description>
216 <organization>
217 <name>The Apache Software Foundation</name>
218 <url>http://www.apache.org</url>
219 </organization>
220 <issueManagement>
221 <system>JIRA</system>
222 <url>https://issues.apache.org/jira/browse/TIKA</url>
223 </issueManagement>
224 <ciManagement>
225 <system>Jenkins</system>
226 <url>https://builds.apache.org/job/Tika-trunk/</url>
227 </ciManagement>
228 </project>
0 -----------------
1 Content Detection
2 -----------------
3
4 ~~ Licensed to the Apache Software Foundation (ASF) under one or more
5 ~~ contributor license agreements. See the NOTICE file distributed with
6 ~~ this work for additional information regarding copyright ownership.
7 ~~ The ASF licenses this file to You under the Apache License, Version 2.0
8 ~~ (the "License"); you may not use this file except in compliance with
9 ~~ the License. You may obtain a copy of the License at
10 ~~
11 ~~ http://www.apache.org/licenses/LICENSE-2.0
12 ~~
13 ~~ Unless required by applicable law or agreed to in writing, software
14 ~~ distributed under the License is distributed on an "AS IS" BASIS,
15 ~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 ~~ See the License for the specific language governing permissions and
17 ~~ limitations under the License.
18
19 Content Detection
20
21 This page gives you information on how content and language detection
22 works with Apache Tika, and how to tune the behaviour of Tika.
23
24 %{toc|section=1|fromDepth=1}
25
26 * {The Detector Interface}
27
28 The
29 {{{./api/org/apache/tika/detect/Detector.html}org.apache.tika.detect.Detector}}
30 interface is the basis for most of the content type detection in Apache
31 Tika. All the different ways of detecting content implement the
32 same common method:
33
34 ---
35 MediaType detect(java.io.InputStream input,
36 Metadata metadata) throws java.io.IOException
37 ---
38
39 The <<<detect>>> method takes the stream to inspect, and a
40 <<<Metadata>>> object that holds any additional information on
41 the content. The detector will return a
42 {{{./api/org/apache/tika/mime/MediaType.html}MediaType}} object describing
43 its best guess as to the type of the file.
44
45 In general, only two keys on the Metadata object are used by Detectors.
46 These are <<<Metadata.RESOURCE_NAME_KEY>>> which should hold the name
47 of the file (where known), and <<<Metadata.CONTENT_TYPE>>> which should
48 hold the advertised content type of the file (eg from a webserver or
49 a content repository).
50
51
52 * {Mime Magic Detection}
53
54 By looking for special ("magic") patterns of bytes near the start of
55 the file, it is often possible to detect the type of the file. For
56 some file types, this is a simple process. For others, typically
57 container based formats, the magic detection may not be enough. (More
58 detail on detecting container formats below)
59
60 Tika is able to make use of a mime magic info file, in the
61 {{{http://www.freedesktop.org/standards/shared-mime-info}Freedesktop MIME-info}}
62 format to perform mime magic detection.
63
64 This is provided within Tika by
65 {{{./api/org/apache/tika/detect/MagicDetector.html}org.apache.tika.detect.MagicDetector}}. It is most commonly accessed via
66 {{{./api/org/apache/tika/mime/MimeTypes.html}org.apache.tika.mime.MimeTypes}},
67 normally sourced from the <<<tika-mimetypes.xml>>> file.
68
69
70 * {Resource Name Based Detection}
71
72 Where the name of the file is known, it is sometimes possible to guess
73 the file type from the name or extension. Within the
74 <<<tika-mimetypes.xml>>> file is a list of patterns which are used to
75 identify the type from the filename.
76
77 However, because files may be renamed, this method of detection is quick
78 but not always as accurate.
79
80 This is provided within Tika by
81 {{{./api/org/apache/tika/detect/NameDetector.html}org.apache.tika.detect.NameDetector}}.
82
83
84 * {Known Content Type "Detection"}
85
86 Sometimes, the mime type for a file is already known, such as when
87 downloading from a webserver, or when retrieving from a content store.
88 This information can be used by detectors, such as
89 {{{./api/org/apache/tika/mime/MimeTypes.html}org.apache.tika.mime.MimeTypes}}.
90
91
92 * {The default Mime Types Detector}
93
94 By default, the mime type detection in Tika is provided by
95 {{{./api/org/apache/tika/mime/MimeTypes.html}org.apache.tika.mime.MimeTypes}}.
96 This detector makes use of <<<tika-mimetypes.xml>>> to power
97 magic based and filename based detection.
98
99 Firstly, magic based detection is used on the start of the file.
100 If the file is an XML file, then the start of the XML is processed
101 to look for root elements. Next, if available, the filename
102 (from <<<Metadata.RESOURCE_NAME_KEY>>>) is
103 then used to improve the detail of the detection, such as when magic
104 detects a text file, and the filename hints it's really a CSV. Finally,
105 if available, the supplied content type (from <<<Metadata.CONTENT_TYPE>>>)
106 is used to further refine the type.
107
108
109 * {Container Aware Detection}
110
111 Several common file formats are actually held within a common container
112 format. One example is the PowerPoint .ppt and Word .doc formats, which
113 are both held within an OLE2 container. Another is Apple iWork formats,
114 which are actually a series of XML files within a Zip file.
115
116 Using magic detection, it is easy to spot that a given file is an OLE2
117 document, or a Zip file. Using magic detection alone, it is very difficult
118 (and often impossible) to tell what kind of file lives inside the container.
119
120 For some use cases, speed is important, so having a quick way to know the
121 container type is sufficient. For other cases however, you don't mind
122 spending a bit of time (and memory!) processing the container to get a
123 more accurate answer on its contents. For these cases, a container
124 aware detector should be used.
125
126 Tika provides a wrapping detector in the parsers bundle, of
127 {{{./api/org/apache/tika/detect/ContainerAwareDetector.html}org.apache.tika.detect.ContainerAwareDetector}}.
128 This detector will check for certain known containers, and if found,
129 will open them and detect the appropriate type based on the contents.
130 If the file isn't a known container, it will fall back to another
131 detector for the answer (most commonly the default
132 <<<MimeTypes>>> detector).
133
134 Because this detector needs to read the whole file to process the
135 container, it must be used with a
136 {{{./api/org/apache/tika/io/TikaInputStream.html}org.apache.tika.io.TikaInputStream}}.
137 If called with a regular <<<InputStream>>>, then all work will be done
138 by the fallback detector.
139
140 For more information on container formats and Tika, see
141 {{{http://wiki.apache.org/tika/MetadataDiscussion}}}
142
143
144 * {Language Detection}
145
146 Tika is able to help identify the language of a piece of text, which
147 is useful when extracting text from document formats which do not include
148 language information in their metadata.
149
150 The language detection is provided by
151 {{{./api/org/apache/tika/language/LanguageIdentifier.html}org.apache.tika.language.LanguageIdentifier}}
0 --------------------------
1 Supported Document Formats
2 --------------------------
3
4 ~~ Licensed to the Apache Software Foundation (ASF) under one or more
5 ~~ contributor license agreements. See the NOTICE file distributed with
6 ~~ this work for additional information regarding copyright ownership.
7 ~~ The ASF licenses this file to You under the Apache License, Version 2.0
8 ~~ (the "License"); you may not use this file except in compliance with
9 ~~ the License. You may obtain a copy of the License at
10 ~~
11 ~~ http://www.apache.org/licenses/LICENSE-2.0
12 ~~
13 ~~ Unless required by applicable law or agreed to in writing, software
14 ~~ distributed under the License is distributed on an "AS IS" BASIS,
15 ~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 ~~ See the License for the specific language governing permissions and
17 ~~ limitations under the License.
18
19 Supported Document Formats
20
21 This page lists all the document formats supported by Apache Tika 0.6.
22 Follow the links to the various parser class javadocs for more detailed
23 information about each document format and how it is parsed by Tika.
24
25 %{toc|section=1|fromDepth=1}
26
27 * {HyperText Markup Language}
28
29 The HyperText Markup Language (HTML) is the lingua franca of the web.
30 Tika uses the {{{http://home.ccil.org/~cowan/XML/tagsoup/}TagSoup}}
31 library to support virtually any kind of HTML found on the web.
32 The output from the
33 {{{api/org/apache/tika/parser/html/HtmlParser.html}HtmlParser}} class
34 is guaranteed to be well-formed and valid XHTML, and various heuristics
35 are used to prevent things like inline scripts from cluttering the
36 extracted text content.
37
38 * {XML and derived formats}
39
40 The Extensible Markup Language (XML) format is a generic format that can
41 be used for all kinds of content. Tika has custom parsers for some widely
42 used XML vocabularies like XHTML, OOXML and ODF, but the default
43 {{{api/org/apache/tika/parser/xml/DcXMLParser.html}DcXMLParser}}
44 class simply extracts the text content of the document and ignores any XML
45 structure. The only exception to this rule are Dublin Core metadata
46 elements that are used for the document metadata.
47
48 * {Microsoft Office document formats}
49
50 Microsoft Office and some related applications produce documents in the
51 generic OLE 2 Compound Document and Office Open XML (OOXML) formats. The
52 older OLE 2 format was introduced in Microsoft Office version 97 and was
53 the default format until Office version 2007 and the new XML-based
54 OOXML format. The
55 {{{api/org/apache/tika/parser/microsoft/OfficeParser.html}OfficeParser}}
56 and
57 {{{api/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.html}OOXMLParser}}
58 classes use {{{http://poi.apache.org/}Apache POI}} libraries to support
59 text and metadata extraction from both OLE2 and OOXML documents.
60
61 * {OpenDocument Format}
62
63 The OpenDocument format (ODF) is used most notably as the default format
64 of the OpenOffice.org office suite. The
65 {{{api/org/apache/tika/parser/odf/OpenDocumentParser.html}OpenDocumentParser}}
66 class supports this format and the earlier OpenOffice 1.0 format on which
67 ODF is based.
68
69 * {Portable Document Format}
70
71 The {{{api/org/apache/tika/parser/pdf/PDFParser.html}PDFParser}} class
72 parses Portable Document Format (PDF) documents using the
73 {{{http://pdfbox.apache.org/}Apache PDFBox}} library.
74
75 * {Electronic Publication Format}
76
77 The {{{api/org/apache/tika/parser/epub/EpubParser.html}EpubParser}} class
78 supports the Electronic Publication Format (EPUB) used for many digital
79 books.
80
81 * {Rich Text Format}
82
83 The {{{api/org/apache/tika/parser/rtf/RTFParser.html}RTFParser}} class
84 uses the standard javax.swing.text.rtf feature to extract text content
85 from Rich Text Format (RTF) documents.
86
87 * {Compression and packaging formats}
88
89 Tika uses the {{{http://commons.apache.org/compress/}Commons Compress}}
90 library to support various compression and packaging formats. The
91 {{{api/org/apache/tika/parser/pkg/PackageParser.html}PackageParser}}
92 class and its subclasses first parse the top level compression or
93 packaging format and then pass the unpacked document streams to a
94 second parsing stage using the parser instance specified in the
95 parse context.
96
97 * {Text formats}
98
99 Extracting text content from plain text files seems like a simple task
100 until you start thinking of all the possible character encodings. The
101 {{{api/org/apache/tika/parser/txt/TXTParser.html}TXTParser}} class uses
102 encoding detection code from the {{{http://site.icu-project.org/}ICU}}
103 project to automatically detect the character encoding of a text document.
104
105 * {Audio formats}
106
107 Tika can detect several common audio formats and extract metadata
108 from them. Even text extraction is supported for some audio files that
109 contain lyrics or other textual content. The
110 {{{api/org/apache/tika/parser/audio/AudioParser.html}AudioParser}}
111 and {{{api/org/apache/tika/parser/audio/MidiParser.html}MidiParser}}
112 classes use standard javax.sound features to process simple audio
113 formats, and the
114 {{{api/org/apache/tika/parser/mp3/Mp3Parser.html}Mp3Parser}} class
115 adds support for the widely used MP3 format.
116
117 * {Image formats}
118
119 The {{{api/org/apache/tika/parser/image/ImageParser.html}ImageParser}}
120 class uses the standard javax.imageio feature to extract simple metadata
121 from image formats supported by the Java platform. More complex image
122 metadata is available through the
123 {{{api/org/apache/tika/parser/jpeg/JpegParser.html}JpegParser}} class
124 that uses the metadata-extractor library to support Exif metadata
125 extraction from Jpeg images.
126
127 * {Video formats}
128
129 Currently Tika only supports the Flash video format using a simple
130 parsing algorithm implemented in the
131 {{{api/org/apache/tika/parser/flv/FLVParser.html}FLVParser}} class.
132
133 * {Java class files and archives}
134
135 The {{{api/org/apache/tika/parser/asm/ClassParser.html}ClassParser}} class
136 extracts class names and method signatures from Java class files, and
137 the {{{api/org/apache/tika/parser/pkg/ZipParser.html}ZipParser}} class
138 also supports jar archives.
139
140 * {The mbox format}
141
142 The {{{api/org/apache/tika/parser/mbox/MboxParser.html}MboxParser}} can
143 extract email messages from the mbox format used by many email archives
144 and Unix-style mailboxes.
0 --------------------------------
1 Getting Started with Apache Tika
2 --------------------------------
3
4 ~~ Licensed to the Apache Software Foundation (ASF) under one or more
5 ~~ contributor license agreements. See the NOTICE file distributed with
6 ~~ this work for additional information regarding copyright ownership.
7 ~~ The ASF licenses this file to You under the Apache License, Version 2.0
8 ~~ (the "License"); you may not use this file except in compliance with
9 ~~ the License. You may obtain a copy of the License at
10 ~~
11 ~~ http://www.apache.org/licenses/LICENSE-2.0
12 ~~
13 ~~ Unless required by applicable law or agreed to in writing, software
14 ~~ distributed under the License is distributed on an "AS IS" BASIS,
15 ~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 ~~ See the License for the specific language governing permissions and
17 ~~ limitations under the License.
18
19 Getting Started with Apache Tika
20
21 This document describes how to build Apache Tika from sources and
22 how to start using Tika in an application.
23
24 Getting and building the sources
25
26 To build Tika from sources you first need to either
27 {{{../download.html}download}} a source release or
28 {{{../source-repository.html}checkout}} the latest sources from
29 version control.
30
31 Once you have the sources, you can build them using the
32 {{{http://maven.apache.org/}Maven 2}} build system. Executing the
33 following command in the base directory will build the sources
34 and install the resulting artifacts in your local Maven repository.
35
36 ---
37 mvn install
38 ---
39
40 See the Maven documentation for more information about the available
41 build options.
42
43 Note that you need Java 5 or higher to build Tika.
44
45 Build artifacts
46
47 The Tika build consists of a number of components and produces
48 the following main binaries:
49
50 [tika-core/target/tika-core-*.jar]
51 Tika core library. Contains the core interfaces and classes of Tika,
52 but none of the parser implementations. Depends only on Java 5.
53
54 [tika-parsers/target/tika-parsers-*.jar]
55 Tika parsers. Collection of classes that implement the Tika Parser
56 interface based on various external parser libraries.
57
58 [tika-app/target/tika-app-*.jar]
59 Tika application. Combines the above components and all the external
60 parser libraries into a single runnable jar with a GUI and a command
61 line interface.
62
63 [tika-bundle/target/tika-bundle-*.jar]
64 Tika bundle. An OSGi bundle that combines tika-parsers with non-OSGified
65 parser libraries to make them easy to deploy in an OSGi environment.
66
67 Using Tika as a Maven dependency
68
69 The core library, tika-core, contains the key interfaces and classes of Tika
70 and can be used by itself if you don't need the full set of parsers from
71 the tika-parsers component. The tika-core dependency looks like this:
72
73 ---
74 <dependency>
75 <groupId>org.apache.tika</groupId>
76 <artifactId>tika-core</artifactId>
77 <version>...</version>
78 </dependency>
79 ---
80
81 If you want to use Tika to parse documents (instead of simply detecting
82 document types, etc.), you'll want to depend on tika-parsers instead:
83
84 ---
85 <dependency>
86 <groupId>org.apache.tika</groupId>
87 <artifactId>tika-parsers</artifactId>
88 <version>...</version>
89 </dependency>
90 ---
91
92 Note that adding this dependency will introduce a number of
93 transitive dependencies to your project, including one on tika-core.
94 You need to make sure that these dependencies won't conflict with your
95 existing project dependencies. You can use the following command in
96 the tika-parsers directory to get a full listing of all the dependencies.
97
98 ---
99 $ mvn dependency:tree | grep :compile
100 ---
101
102 Using Tika in an Ant project
103
104 Unless you use a dependency manager tool like
105 {{{http://ant.apache.org/ivy/}Apache Ivy}}, the easiest way to use
106 Tika is to include either the tika-core or the tika-app jar in your
107 classpath, depending on whether you want just the core functionality
108 or also all the parser implementations.
109
110 ---
111 <classpath>
112 ... <!-- your other classpath entries -->
113
114 <!-- either: -->
115 <pathelement location="path/to/tika-core-${tika.version}.jar"/>
116 <!-- or: -->
117 <pathelement location="path/to/tika-app-${tika.version}.jar"/>
118
119 </classpath>
120 ---
121
122 Using Tika as a command line utility
123
124 The Tika application jar (tika-app-*.jar) can be used as a command
125 line utility for extracting text content and metadata from all sorts of
126 files. This runnable jar contains all the dependencies it needs, so
127 you don't need to worry about classpath settings to run it.
128
129 The usage instructions are shown below.
130
131 ---
132 usage: java -jar tika-app.jar [option...] [file|port...]
133
134 Options:
135 -? or --help Print this usage message
136 -v or --verbose Print debug level messages
137 -V or --version Print the Apache Tika version number
138
139 -g or --gui Start the Apache Tika GUI
140 -s or --server Start the Apache Tika server
141 -f or --fork Use Fork Mode for out-of-process extraction
142
143 -x or --xml Output XHTML content (default)
144 -h or --html Output HTML content
145 -t or --text Output plain text content
146 -T or --text-main Output plain text content (main content only)
147 -m or --metadata Output only metadata
148 -j or --json Output metadata in JSON
149 -y or --xmp Output metadata in XMP
150 -l or --language Output only language
151 -d or --detect Detect document type
152 -eX or --encoding=X Use output encoding X
153 -pX or --password=X Use document password X
154 -z or --extract Extract all attachements into current directory
155 --extract-dir=<dir> Specify target directory for -z
156 -r or --pretty-print For XML and XHTML outputs, adds newlines and
157 whitespace, for better readability
158
159 --create-profile=X
160 Create NGram profile, where X is a profile name
161 --list-parsers
162 List the available document parsers
163 --list-parser-details
164 List the available document parsers, and their supported mime types
165 --list-detectors
166 List the available document detectors
167 --list-met-models
168 List the available metadata models, and their supported keys
169 --list-supported-types
170 List all known media types and related information
171
172 Description:
173 Apache Tika will parse the file(s) specified on the
174 command line and output the extracted text content
175 or metadata to standard output.
176
177 Instead of a file name you can also specify the URL
178 of a document to be parsed.
179
180 If no file name or URL is specified (or the special
181 name "-" is used), then the standard input stream
182 is parsed. If no arguments were given and no input
183 data is available, the GUI is started instead.
184
185 - GUI mode
186
187 Use the "--gui" (or "-g") option to start the
188 Apache Tika GUI. You can drag and drop files from
189 a normal file explorer to the GUI window to extract
190 text content and metadata from the files.
191
192 - Server mode
193
194 Use the "--server" (or "-s") option to start the
195 Apache Tika server. The server will listen to the
196 ports you specify as one or more arguments.
197 ---
198
199 You can also use the jar as a component in a Unix pipeline or
200 as an external tool in many scripting languages.
201
202 ---
203 # Check if an Internet resource contains a specific keyword
204 curl http://.../document.doc \
205 | java -jar tika-app.jar --text \
206 | grep -q keyword
207 ---
0 ---------------
1 Apache Tika 1.3
2 ---------------
3
4 ~~ Licensed to the Apache Software Foundation (ASF) under one or more
5 ~~ contributor license agreements. See the NOTICE file distributed with
6 ~~ this work for additional information regarding copyright ownership.
7 ~~ The ASF licenses this file to You under the Apache License, Version 2.0
8 ~~ (the "License"); you may not use this file except in compliance with
9 ~~ the License. You may obtain a copy of the License at
10 ~~
11 ~~ http://www.apache.org/licenses/LICENSE-2.0
12 ~~
13 ~~ Unless required by applicable law or agreed to in writing, software
14 ~~ distributed under the License is distributed on an "AS IS" BASIS,
15 ~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 ~~ See the License for the specific language governing permissions and
17 ~~ limitations under the License.
18
19 Apache Tika 1.3
20
21 The most notable changes in Tika 1.3 over the previous release are:
22
23 * TBD
24
25 The following people have contributed to Tika 1.3 by submitting or
26 commenting on the issues resolved in this release:
27
28 * TBD
29
30 See TBD for more details on these contributions.
0 --------------------
1 The Parser interface
2 --------------------
3
4 ~~ Licensed to the Apache Software Foundation (ASF) under one or more
5 ~~ contributor license agreements. See the NOTICE file distributed with
6 ~~ this work for additional information regarding copyright ownership.
7 ~~ The ASF licenses this file to You under the Apache License, Version 2.0
8 ~~ (the "License"); you may not use this file except in compliance with
9 ~~ the License. You may obtain a copy of the License at
10 ~~
11 ~~ http://www.apache.org/licenses/LICENSE-2.0
12 ~~
13 ~~ Unless required by applicable law or agreed to in writing, software
14 ~~ distributed under the License is distributed on an "AS IS" BASIS,
15 ~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 ~~ See the License for the specific language governing permissions and
17 ~~ limitations under the License.
18
19 The Parser interface
20
21 The
22 {{{api/org/apache/tika/parser/Parser.html}org.apache.tika.parser.Parser}}
23 interface is the key concept of Apache Tika. It hides the complexity of
24 different file formats and parsing libraries while providing a simple and
25 powerful mechanism for client applications to extract structured text
26 content and metadata from all sorts of documents. All this is achieved
27 with a single method:
28
29 ---
30 void parse(
31 InputStream stream, ContentHandler handler, Metadata metadata,
32 ParseContext context) throws IOException, SAXException, TikaException;
33 ---
34
35 The <<<parse>>> method takes the document to be parsed and related metadata
36 as input and outputs the results as XHTML SAX events and extra metadata.
37 The parse context argument is used to specify context information (like
38 the current locale) that is not related to any individual document.
39 The main criteria that lead to this design were:
40
41 [Streamed parsing] The interface should require neither the client
42 application nor the parser implementation to keep the full document
43 content in memory or spooled to disk. This allows even huge documents
44 to be parsed without excessive resource requirements.
45
46 [Structured content] A parser implementation should be able to
47 include structural information (headings, links, etc.) in the extracted
48 content. A client application can use this information for example to
49 better judge the relevance of different parts of the parsed document.
50
51 [Input metadata] A client application should be able to include metadata
52 like the file name or declared content type with the document to be
53 parsed. The parser implementation can use this information to better
54 guide the parsing process.
55
56 [Output metadata] A parser implementation should be able to return
57 document metadata in addition to document content. Many document
58 formats contain metadata like the name of the author that may be useful
59 to client applications.
60
61 [Context sensitivity] While the default settings and behaviour of Tika
62 parsers should work well for most use cases, there are still situations
63 where more fine-grained control over the parsing process is desirable.
64 It should be easy to inject such context-specific information to the
65 parsing process without breaking the layers of abstraction.
66
67 []
68
69 These criteria are reflected in the arguments of the <<<parse>>> method.
70
71 * Document input stream
72
73 The first argument is an
74 {{{http://java.sun.com/j2se/1.5.0/docs/api/java/io/InputStream.html}InputStream}}
75 for reading the document to be parsed.
76
77 If this document stream can not be read, then parsing stops and the thrown
78 {{{http://java.sun.com/j2se/1.5.0/docs/api/java/io/IOException.html}IOException}}
79 is passed up to the client application. If the stream can be read but
80 not parsed (for example if the document is corrupted), then the parser
81 throws a {{{api/org/apache/tika/exception/TikaException.html}TikaException}}.
82
83 The parser implementation will consume this stream but <will not close it>.
84 Closing the stream is the responsibility of the client application that
85 opened it in the first place. The recommended pattern for using streams
86 with the <<<parse>>> method is:
87
88 ---
89 InputStream stream = ...; // open the stream
90 try {
91 parser.parse(stream, ...); // parse the stream
92 } finally {
93 stream.close(); // close the stream
94 }
95 ---
96
97 Some document formats like the OLE2 Compound Document Format used by
98 Microsoft Office are best parsed as random access files. In such cases the
99 content of the input stream is automatically spooled to a temporary file
100 that gets removed once parsed. A future version of Tika may make it possible
101 to avoid this extra file if the input document is already a file in the
102 local file system. See
103 {{{https://issues.apache.org/jira/browse/TIKA-153}TIKA-153}} for the status
104 of this feature request.
105
106 * XHTML SAX events
107
108 The parsed content of the document stream is returned to the client
109 application as a sequence of XHTML SAX events. XHTML is used to express
110 structured content of the document and SAX events enable streamed
111 processing. Note that the XHTML format is used here only to convey
112 structural information, not to render the documents for browsing!
113
114 The XHTML SAX events produced by the parser implementation are sent to a
115 {{{http://java.sun.com/j2se/1.5.0/docs/api/org/xml/sax/ContentHandler.html}ContentHandler}}
116 instance given to the <<<parse>>> method. If the content handler
117 fails to process an event, then parsing stops and the thrown
118 {{{http://java.sun.com/j2se/1.5.0/docs/api/org/xml/sax/SAXException.html}SAXException}}
119 is passed up to the client application.
120
121 The overall structure of the generated event stream is (with indenting
122 added for clarity):
123
124 ---
125 <html xmlns="http://www.w3.org/1999/xhtml">
126 <head>
127 <title>...</title>
128 </head>
129 <body>
130 ...
131 </body>
132 </html>
133 ---
134
135 Parser implementations typically use the
136 {{{api/org/apache/tika/sax/XHTMLContentHandler.html}XHTMLContentHandler}}
137 utility class to generate the XHTML output.
138
139 Dealing with the raw SAX events can be a bit complex, so Apache Tika
140 comes with a number of utility classes that can be used to process and
141 convert the event stream to other representations.
142
143 For example, the
144 {{{api/org/apache/tika/sax/BodyContentHandler.html}BodyContentHandler}}
145 class can be used to extract just the body part of the XHTML output and
146 feed it either as SAX events to another content handler or as characters
147 to an output stream, a writer, or simply a string. The following code
148 snippet parses a document from the standard input stream and outputs the
149 extracted text content to standard output:
150
151 ---
152 ContentHandler handler = new BodyContentHandler(System.out);
153 parser.parse(System.in, handler, ...);
154 ---
155
156 Another useful class is
157 {{{api/org/apache/tika/parser/ParsingReader.html}ParsingReader}} that
158 uses a background thread to parse the document and returns the extracted
159 text content as a character stream:
160
161 ---
162 InputStream stream = ...; // the document to be parsed
163 Reader reader = new ParsingReader(parser, stream, ...);
164 try {
165 ...; // read the document text using the reader
166 } finally {
167 reader.close(); // the document stream is closed automatically
168 }
169 ---
170
171 * Document metadata
172
173 The third argument to the <<<parse>>> method is used to pass document
174 metadata both in and out of the parser. Document metadata is expressed
175 as an {{{api/org/apache/tika/metadata/Metadata.html}Metadata}} object.
176
177 The following are some of the more interesting metadata properties:
178
179 [Metadata.RESOURCE_NAME_KEY] The name of the file or resource that contains
180 the document.
181
182 A client application can set this property to allow the parser to use
183 file name heuristics to determine the format of the document.
184
185 The parser implementation may set this property if the file format
186 contains the canonical name of the file (for example the Gzip format
187 has a slot for the file name).
188
189 [Metadata.CONTENT_TYPE] The declared content type of the document.
190
191 A client application can set this property based on, for example, an HTTP
192 Content-Type header. The declared content type may help the parser to
193 correctly interpret the document.
194
195 The parser implementation sets this property to the content type according
196 to which the document was parsed.
197
198 [Metadata.TITLE] The title of the document.
199
200 The parser implementation sets this property if the document format
201 contains an explicit title field.
202
203 [Metadata.AUTHOR] The name of the author of the document.
204
205 The parser implementation sets this property if the document format
206 contains an explicit author field.
207
208 []
209
210 Note that metadata handling is still being discussed by the Tika development
211 team, and it is likely that there will be some (backwards incompatible)
212 changes in metadata handling before Tika 1.0.
213
214 * Parse context
215
216 The final argument to the <<<parse>>> method is used to inject
217 context-specific information to the parsing process. This is useful
218 for example when dealing with locale-specific date and number formats
219 in Microsoft Excel spreadsheets. Another important use of the parse
220 context is passing in the delegate parser instance to be used by
221 two-phase parsers like the
222 {{{api/org/apache/tika/parser/pkg/PackageParser.html}PackageParser}} subclasses.
223 Some parser classes allow customization of the parsing process through
224 strategy objects in the parse context.
225
226 * Parser implementations
227
228 Apache Tika comes with a number of parser classes for parsing
229 {{{formats.html}various document formats}}. You can also extend Tika
230 with your own parsers, and of course any contributions to Tika are
231 warmly welcome.
232
233 The goal of Tika is to reuse existing parser libraries like
234 {{{http://www.pdfbox.org/}PDFBox}} or
235 {{{http://poi.apache.org/}Apache POI}} as much as possible, and so most
236 of the parser classes in Tika are adapters to such external libraries.
237
238 Tika also contains some general purpose parser implementations that are
239 not targeted at any specific document formats. The most notable of these
240 is the {{{api/org/apache/tika/parser/AutoDetectParser.html}AutoDetectParser}}
241 class that encapsulates all Tika functionality into a single parser that
242 can handle any type of document. This parser will automatically determine
243 the type of the incoming document based on various heuristics and will then
244 parse the document accordingly.
0 --------------------------------------------
1 Get Tika parsing up and running in 5 minutes
2 --------------------------------------------
3 Arturo Beltran
4 --------------------------------------------
5
6 ~~ Licensed to the Apache Software Foundation (ASF) under one or more
7 ~~ contributor license agreements. See the NOTICE file distributed with
8 ~~ this work for additional information regarding copyright ownership.
9 ~~ The ASF licenses this file to You under the Apache License, Version 2.0
10 ~~ (the "License"); you may not use this file except in compliance with
11 ~~ the License. You may obtain a copy of the License at
12 ~~
13 ~~ http://www.apache.org/licenses/LICENSE-2.0
14 ~~
15 ~~ Unless required by applicable law or agreed to in writing, software
16 ~~ distributed under the License is distributed on an "AS IS" BASIS,
17 ~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 ~~ See the License for the specific language governing permissions and
19 ~~ limitations under the License.
20
21 Get Tika parsing up and running in 5 minutes
22
23 This page is a quick start guide showing how to add a new parser to Apache Tika.
24 By following the simple steps listed below, you can have your new parser running in only 5 minutes.
25
26 %{toc|section=1|fromDepth=1}
27
28 * {Getting Started}
29
30 The {{{gettingstarted.html}Getting Started}} document describes how to
31 build Apache Tika from sources and how to start using Tika in an application. Pay close attention
32 and follow the instructions in the "Getting and building the sources" section.
33
34
35 * {Add your MIME-Type}
36
37 You first need to modify {{{http://svn.apache.org/repos/asf/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml}tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml}}
38 so that Tika can map the file extension to its MIME-Type. You should add something like this:
39
40 ---
41 <mime-type type="application/hello">
42 <glob pattern="*.hi"/>
43 </mime-type>
44 ---
45
46 * {Create your Parser class}
47
48 Now, you need to create your new parser. This is a class that must implement the Parser interface
49 offered by Tika. A very simple Tika Parser looks like this:
50
51 ---
52 /*
53 * Licensed to the Apache Software Foundation (ASF) under one or more
54 * contributor license agreements. See the NOTICE file distributed with
55 * this work for additional information regarding copyright ownership.
56 * The ASF licenses this file to You under the Apache License, Version 2.0
57 * (the "License"); you may not use this file except in compliance with
58 * the License. You may obtain a copy of the License at
59 *
60 * http://www.apache.org/licenses/LICENSE-2.0
61 *
62 * Unless required by applicable law or agreed to in writing, software
63 * distributed under the License is distributed on an "AS IS" BASIS,
64 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
65 * See the License for the specific language governing permissions and
66 * limitations under the License.
67 *
68 * @Author: Arturo Beltran
69 */
70 package org.apache.tika.parser.hello;
71
72 import java.io.IOException;
73 import java.io.InputStream;
74 import java.util.Collections;
75 import java.util.Set;
76
77 import org.apache.tika.exception.TikaException;
78 import org.apache.tika.metadata.Metadata;
79 import org.apache.tika.mime.MediaType;
80 import org.apache.tika.parser.ParseContext;
81 import org.apache.tika.parser.Parser;
82 import org.apache.tika.sax.XHTMLContentHandler;
83 import org.xml.sax.ContentHandler;
84 import org.xml.sax.SAXException;
85
86 public class HelloParser implements Parser {
87
88 private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("hello"));
89 public static final String HELLO_MIME_TYPE = "application/hello";
90
91 public Set<MediaType> getSupportedTypes(ParseContext context) {
92 return SUPPORTED_TYPES;
93 }
94
95 public void parse(
96 InputStream stream, ContentHandler handler,
97 Metadata metadata, ParseContext context)
98 throws IOException, SAXException, TikaException {
99
100 metadata.set(Metadata.CONTENT_TYPE, HELLO_MIME_TYPE);
101 metadata.set("Hello", "World");
102
103 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
104 xhtml.startDocument();
105 xhtml.endDocument();
106 }
107
108 /**
109 * @deprecated This method will be removed in Apache Tika 1.0.
110 */
111 public void parse(
112 InputStream stream, ContentHandler handler, Metadata metadata)
113 throws IOException, SAXException, TikaException {
114 parse(stream, handler, metadata, new ParseContext());
115 }
116 }
117 ---
118
119 Pay special attention to the definition of the SUPPORTED_TYPES static class
120 field in the parser class that defines what MIME-Types it supports.
121
122 The "parse" method is where you will do all your work: that is, extract
123 the information from the resource and then set the metadata.
124
125 * {List the new parser}
126
127 Finally, you should explicitly tell the AutoDetectParser to include your new
128 parser. This step is only needed if you want to use the AutoDetectParser functionality.
129 If you figure out the correct parser in a different way, it isn't needed.
130
131 List your new parser in:
132 {{{http://svn.apache.org/repos/asf/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser}tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser}}
133
134
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one
2 * or more contributor license agreements. See the NOTICE file
3 * distributed with this work for additional information
4 * regarding copyright ownership. The ASF licenses this file
5 * to you under the Apache License, Version 2.0 (the
6 * "License"); you may not use this file except in compliance
7 * with the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing,
12 * software distributed under the License is distributed on an
13 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 * KIND, either express or implied. See the License for the
15 * specific language governing permissions and limitations
16 * under the License.
17 */
18
19 #search {
20 position: relative;
21 right: 10px;
22 width: 100%;
23 font-size: 70%;
24 white-space: nowrap;
25 text-align: right;
26 z-index:0;
27
28 bottom: -1px; /* compensate for IE rendering issue */
29 }
30
31 #bookpromo {
32 position: relative;
33 top: 35px;
34 left: 10px;
35 width: 100%;
36 white-space: nowrap;
37 text-align: center;
38 z-index:0;
39 bottom: -1px;
40 }
41
42 #searchform {
43 }
44
45 body {
46 margin: 0px;
47 padding: 0px 0px 10px 0px;
48 }
49
50 /* From maven-theme.css */
51
52 body, td, select, input, li {
53 font-family: Verdana, Helvetica, Arial, sans-serif;
54 font-size: 13px;
55 }
56
57 code{
58 font-family: Courier, monospace;
59 font-size: 13px;
60 }
61 a {
62 text-decoration: none;
63 }
64 a:link {
65 color:#36a;
66 }
67 a:visited {
68 color:#47a;
69 }
70 a:active, a:hover {
71 color:#69c;
72 }
73 #legend li.externalLink {
74 background: url(../images/external.png) left top no-repeat;
75 padding-left: 18px;
76 }
77 a.externalLink, a.externalLink:link, a.externalLink:visited, a.externalLink:active, a.externalLink:hover {
78 background: url(../images/external.png) right center no-repeat;
79 padding-right: 18px;
80 }
81 #legend li.newWindow {
82 background: url(../images/newwindow.png) left top no-repeat;
83 padding-left: 18px;
84 }
85 a.newWindow, a.newWindow:link, a.newWindow:visited, a.newWindow:active, a.newWindow:hover {
86 background: url(../images/newwindow.png) right center no-repeat;
87 padding-right: 18px;
88 }
89 h2 {
90 padding: 4px 4px 4px 6px;
91 border: 1px solid #999;
92 color: #900;
93 background-color: #ddd;
94 font-weight:900;
95 font-size: x-large;
96 }
97 h3 {
98 padding: 4px 4px 4px 6px;
99 border: 1px solid #aaa;
100 color: #900;
101 background-color: #eee;
102 font-weight: normal;
103 font-size: large;
104 }
105 h4 {
106 padding: 4px 4px 4px 6px;
107 border: 1px solid #bbb;
108 color: #900;
109 background-color: #fff;
110 font-weight: normal;
111 font-size: large;
112 }
113 h5 {
114 padding: 4px 4px 4px 6px;
115 color: #900;
116 font-size: normal;
117 }
118 p {
119 line-height: 1.3em;
120 font-size: small;
121 }
122 #breadcrumbs {
123 border-top: 1px solid #aaa;
124 border-bottom: 1px solid #aaa;
125 background-color: #ccc;
126 }
127 #leftColumn {
128 margin: 10px 0 0 5px;
129 border: 1px solid #999;
130 background-color: #eee;
131 }
132 #navcolumn h5 {
133 font-size: smaller;
134 border-bottom: 1px solid #aaaaaa;
135 padding-top: 2px;
136 color: #000;
137 }
138
139 table.bodyTable th {
140 color: white;
141 background-color: #bbb;
142 text-align: left;
143 font-weight: bold;
144 }
145
146 table.bodyTable th, table.bodyTable td {
147 font-size: 1em;
148 }
149
150 table.bodyTable tr.a {
151 background-color: #ddd;
152 }
153
154 table.bodyTable tr.b {
155 background-color: #eee;
156 }
157
158 .source {
159 border: 1px solid #999;
160 }
161 dl {
162 padding: 4px 4px 4px 6px;
163 border: 1px solid #aaa;
164 background-color: #ffc;
165 }
166 dt {
167 color: #900;
168 }
169 #organizationLogo img, #projectLogo img, #projectLogo span{
170 margin: 8px;
171 }
172 #banner {
173 border-bottom: 1px solid #fff;
174 }
175 .errormark, .warningmark, .donemark, .infomark {
176 background: url(../images/icon_error_sml.gif) no-repeat;
177 }
178
179 .warningmark {
180 background-image: url(../images/icon_warning_sml.gif);
181 }
182
183 .donemark {
184 background-image: url(../images/icon_success_sml.gif);
185 }
186
187 .infomark {
188 background-image: url(../images/icon_info_sml.gif);
189 }
190
191 /* From maven-base.css */
192
193 img {
194 border:none;
195 }
196 table {
197 padding:0px;
198 width: 100%;
199 margin-left: -2px;
200 margin-right: -2px;
201 }
202 acronym {
203 cursor: help;
204 border-bottom: 1px dotted #feb;
205 }
206 table.bodyTable th, table.bodyTable td {
207 padding: 2px 4px 2px 4px;
208 vertical-align: top;
209 }
210 div.clear{
211 clear:both;
212 visibility: hidden;
213 }
214 div.clear hr{
215 display: none;
216 }
217 #bannerLeft, #bannerRight {
218 font-size: xx-large;
219 font-weight: bold;
220 }
221 #bannerLeft img, #bannerRight img {
222 margin: 0px;
223 }
224 .xleft, #bannerLeft img {
225 float:left;
226 text-shadow: #7CFC00 1px 1px 1px;
227 }
228 .xright, #bannerRight {
229 float:right;
230 text-shadow: #7CFC00 1px 1px 1px;
231 }
232 #banner {
233 padding: 0px;
234 }
235 #banner img {
236 border: none;
237 }
238 #breadcrumbs {
239 padding: 3px 10px 3px 10px;
240 }
241 #leftColumn {
242 width: 170px;
243 float:left;
244 overflow: auto;
245 }
246 #bodyColumn {
247 margin-right: 1.5em;
248 margin-left: 197px;
249 }
250 #legend {
251 padding: 8px 0 8px 0;
252 }
253 #navcolumn {
254 padding: 8px 4px 0 8px;
255 }
256 #navcolumn h5 {
257 margin: 0;
258 padding: 0;
259 font-size: small;
260 }
261 #navcolumn ul {
262 margin: 0;
263 padding: 0;
264 font-size: small;
265 }
266 #navcolumn li {
267 list-style-type: none;
268 background-image: none;
269 background-repeat: no-repeat;
270 background-position: 0 0.4em;
271 padding-left: 16px;
272 list-style-position: outside;
273 line-height: 1.2em;
274 font-size: smaller;
275 }
276 #navcolumn li.expanded {
277 background-image: url(../images/expanded.gif);
278 }
279 #navcolumn li.collapsed {
280 background-image: url(../images/collapsed.gif);
281 }
282 #navcolumn img {
283 margin-top: 10px;
284 margin-bottom: 3px;
285 }
286 #search img {
287 margin: 0px;
288 display: block;
289 }
290 #search #q, #search #btnG {
291 border: 1px solid #999;
292 margin-bottom:10px;
293 }
294 #search form {
295 margin: 0px;
296 }
297 #lastPublished {
298 font-size: x-small;
299 }
300 .navSection {
301 margin-bottom: 2px;
302 padding: 8px;
303 }
304 .navSectionHead {
305 font-weight: bold;
306 font-size: x-small;
307 }
308 .section {
309 padding: 4px;
310 }
311 #footer p {
312 padding: 3px 10px 3px 10px;
313 font-size: x-small;
314 text-align: center;
315 }
316 .source {
317 padding: 12px;
318 margin: 1em 7px 1em 7px;
319 }
320 .source pre {
321 margin: 0px;
322 padding: 0px;
323 }
0 <?xml version="1.0" encoding="UTF-8" standalone="no"?>
1 <!--
2 Licensed to the Apache Software Foundation (ASF) under one or more
3 contributor license agreements. See the NOTICE file distributed with
4 this work for additional information regarding copyright ownership.
5 The ASF licenses this file to You under the Apache License, Version 2.0
6 (the "License"); you may not use this file except in compliance with
7 the License. You may obtain a copy of the License at
8
9 http://www.apache.org/licenses/LICENSE-2.0
10
11 Unless required by applicable law or agreed to in writing, software
12 distributed under the License is distributed on an "AS IS" BASIS,
13 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 See the License for the specific language governing permissions and
15 limitations under the License.
16 -->
17 <svg
18 xmlns:dc="http://purl.org/dc/elements/1.1/"
19 xmlns:cc="http://web.resource.org/cc/"
20 xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
21 xmlns:svg="http://www.w3.org/2000/svg"
22 xmlns="http://www.w3.org/2000/svg"
23 xmlns:xlink="http://www.w3.org/1999/xlink"
24 xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
25 xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
26 width="363px"
27 height="94px"
28 viewBox="82px 238px 363px 94px"
29 id="svg2"
30 sodipodi:version="0.32"
31 inkscape:version="0.45.1"
32 sodipodi:docname="tika logo v10.svg"
33 inkscape:output_extension="org.inkscape.output.svg.inkscape"
34 sodipodi:docbase="C:\Users\Jukka Zitting\Desktop"
35 inkscape:export-filename="C:\Users\Jukka Zitting\Desktop\tika logo v10.png"
36 inkscape:export-xdpi="76.820831"
37 inkscape:export-ydpi="76.820831">
38 <metadata
39 id="metadata1566">
40 <rdf:RDF>
41 <cc:Work
42 rdf:about="">
43 <dc:format>image/svg+xml</dc:format>
44 <dc:type
45 rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
46 </cc:Work>
47 </rdf:RDF>
48 </metadata>
49 <defs
50 id="defs1564">
51 <linearGradient
52 id="XMLID_1_"
53 gradientUnits="userSpaceOnUse"
54 x1="-3662.4312"
55 y1="-3617.1401"
56 x2="-3663.4963"
57 y2="-3588.9297"
58 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
59 <stop
60 offset="0"
61 style="stop-color:#500C81"
62 id="stop173" />
63 <stop
64 offset="0.0374"
65 style="stop-color:#5F0B7A"
66 id="stop175" />
67 <stop
68 offset="0.1836"
69 style="stop-color:#96075F"
70 id="stop177" />
71 <stop
72 offset="0.3196"
73 style="stop-color:#C1044A"
74 id="stop179" />
75 <stop
76 offset="0.4412"
77 style="stop-color:#E0023B"
78 id="stop181" />
79 <stop
80 offset="0.5441"
81 style="stop-color:#F30032"
82 id="stop183" />
83 <stop
84 offset="0.6158"
85 style="stop-color:#FA002F"
86 id="stop185" />
87 <stop
88 offset="1"
89 style="stop-color:#F7EE5F"
90 id="stop187" />
91 </linearGradient>
92 <linearGradient
93 id="XMLID_2_"
94 gradientUnits="userSpaceOnUse"
95 x1="-3672.1465"
96 y1="-3607.502"
97 x2="-3673.0225"
98 y2="-3584.3003"
99 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
100 <stop
101 offset="0"
102 style="stop-color:#500C81"
103 id="stop192" />
104 <stop
105 offset="0.0374"
106 style="stop-color:#5F0B7A"
107 id="stop194" />
108 <stop
109 offset="0.1836"
110 style="stop-color:#96075F"
111 id="stop196" />
112 <stop
113 offset="0.3196"
114 style="stop-color:#C1044A"
115 id="stop198" />
116 <stop
117 offset="0.4412"
118 style="stop-color:#E0023B"
119 id="stop200" />
120 <stop
121 offset="0.5441"
122 style="stop-color:#F30032"
123 id="stop202" />
124 <stop
125 offset="0.6158"
126 style="stop-color:#FA002F"
127 id="stop204" />
128 <stop
129 offset="1"
130 style="stop-color:#F7EE5F"
131 id="stop206" />
132 </linearGradient>
133 <linearGradient
134 id="XMLID_3_"
135 gradientUnits="userSpaceOnUse"
136 x1="-3681.2422"
137 y1="-3614.8345"
138 x2="-3682.1765"
139 y2="-3590.0845"
140 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
141 <stop
142 offset="0"
143 style="stop-color:#500C81"
144 id="stop211" />
145 <stop
146 offset="0.0374"
147 style="stop-color:#5F0B7A"
148 id="stop213" />
149 <stop
150 offset="0.1836"
151 style="stop-color:#96075F"
152 id="stop215" />
153 <stop
154 offset="0.3196"
155 style="stop-color:#C1044A"
156 id="stop217" />
157 <stop
158 offset="0.4412"
159 style="stop-color:#E0023B"
160 id="stop219" />
161 <stop
162 offset="0.5441"
163 style="stop-color:#F30032"
164 id="stop221" />
165 <stop
166 offset="0.6158"
167 style="stop-color:#FA002F"
168 id="stop223" />
169 <stop
170 offset="1"
171 style="stop-color:#F7EE5F"
172 id="stop225" />
173 </linearGradient>
174 <linearGradient
175 id="XMLID_4_"
176 gradientUnits="userSpaceOnUse"
177 x1="-3689.5493"
178 y1="-3608.1592"
179 x2="-3690.4253"
180 y2="-3584.9575"
181 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
182 <stop
183 offset="0"
184 style="stop-color:#500C81"
185 id="stop230" />
186 <stop
187 offset="0.0374"
188 style="stop-color:#5F0B7A"
189 id="stop232" />
190 <stop
191 offset="0.1836"
192 style="stop-color:#96075F"
193 id="stop234" />
194 <stop
195 offset="0.3196"
196 style="stop-color:#C1044A"
197 id="stop236" />
198 <stop
199 offset="0.4412"
200 style="stop-color:#E0023B"
201 id="stop238" />
202 <stop
203 offset="0.5441"
204 style="stop-color:#F30032"
205 id="stop240" />
206 <stop
207 offset="0.6158"
208 style="stop-color:#FA002F"
209 id="stop242" />
210 <stop
211 offset="1"
212 style="stop-color:#F7EE5F"
213 id="stop244" />
214 </linearGradient>
215 <linearGradient
216 id="XMLID_5_"
217 gradientUnits="userSpaceOnUse"
218 x1="-3699.7769"
219 y1="-3613.4175"
220 x2="-3700.6736"
221 y2="-3589.6692"
222 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
223 <stop
224 offset="0"
225 style="stop-color:#500C81"
226 id="stop249" />
227 <stop
228 offset="0.0374"
229 style="stop-color:#5F0B7A"
230 id="stop251" />
231 <stop
232 offset="0.1836"
233 style="stop-color:#96075F"
234 id="stop253" />
235 <stop
236 offset="0.3196"
237 style="stop-color:#C1044A"
238 id="stop255" />
239 <stop
240 offset="0.4412"
241 style="stop-color:#E0023B"
242 id="stop257" />
243 <stop
244 offset="0.5441"
245 style="stop-color:#F30032"
246 id="stop259" />
247 <stop
248 offset="0.6158"
249 style="stop-color:#FA002F"
250 id="stop261" />
251 <stop
252 offset="1"
253 style="stop-color:#F7EE5F"
254 id="stop263" />
255 </linearGradient>
256 <linearGradient
257 id="XMLID_6_"
258 gradientUnits="userSpaceOnUse"
259 x1="-3706.9673"
260 y1="-3608.8169"
261 x2="-3707.8433"
262 y2="-3585.6152"
263 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
264 <stop
265 offset="0"
266 style="stop-color:#500C81"
267 id="stop268" />
268 <stop
269 offset="0.0374"
270 style="stop-color:#5F0B7A"
271 id="stop270" />
272 <stop
273 offset="0.1836"
274 style="stop-color:#96075F"
275 id="stop272" />
276 <stop
277 offset="0.3196"
278 style="stop-color:#C1044A"
279 id="stop274" />
280 <stop
281 offset="0.4412"
282 style="stop-color:#E0023B"
283 id="stop276" />
284 <stop
285 offset="0.5441"
286 style="stop-color:#F30032"
287 id="stop278" />
288 <stop
289 offset="0.6158"
290 style="stop-color:#FA002F"
291 id="stop280" />
292 <stop
293 offset="1"
294 style="stop-color:#F7EE5F"
295 id="stop282" />
296 </linearGradient>
297 <linearGradient
298 id="XMLID_7_"
299 gradientUnits="userSpaceOnUse"
300 x1="-3716.0044"
301 y1="-3611.9541"
302 x2="-3716.7644"
303 y2="-3591.8245"
304 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
305 <stop
306 offset="0"
307 style="stop-color:#500C81"
308 id="stop287" />
309 <stop
310 offset="0.0374"
311 style="stop-color:#5F0B7A"
312 id="stop289" />
313 <stop
314 offset="0.1836"
315 style="stop-color:#96075F"
316 id="stop291" />
317 <stop
318 offset="0.3196"
319 style="stop-color:#C1044A"
320 id="stop293" />
321 <stop
322 offset="0.4412"
323 style="stop-color:#E0023B"
324 id="stop295" />
325 <stop
326 offset="0.5441"
327 style="stop-color:#F30032"
328 id="stop297" />
329 <stop
330 offset="0.6158"
331 style="stop-color:#FA002F"
332 id="stop299" />
333 <stop
334 offset="1"
335 style="stop-color:#F7EE5F"
336 id="stop301" />
337 </linearGradient>
338 <linearGradient
339 id="XMLID_8_"
340 gradientUnits="userSpaceOnUse"
341 x1="-3725.0508"
342 y1="-3609.4995"
343 x2="-3725.9268"
344 y2="-3586.2979"
345 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
346 <stop
347 offset="0"
348 style="stop-color:#500C81"
349 id="stop306" />
350 <stop
351 offset="0.0374"
352 style="stop-color:#5F0B7A"
353 id="stop308" />
354 <stop
355 offset="0.1836"
356 style="stop-color:#96075F"
357 id="stop310" />
358 <stop
359 offset="0.3196"
360 style="stop-color:#C1044A"
361 id="stop312" />
362 <stop
363 offset="0.4412"
364 style="stop-color:#E0023B"
365 id="stop314" />
366 <stop
367 offset="0.5441"
368 style="stop-color:#F30032"
369 id="stop316" />
370 <stop
371 offset="0.6158"
372 style="stop-color:#FA002F"
373 id="stop318" />
374 <stop
375 offset="1"
376 style="stop-color:#F7EE5F"
377 id="stop320" />
378 </linearGradient>
379 <linearGradient
380 id="XMLID_9_"
381 gradientUnits="userSpaceOnUse"
382 x1="-3733.1665"
383 y1="-3613.2212"
384 x2="-3733.397"
385 y2="-3602.4053"
386 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
387 <stop
388 offset="0"
389 style="stop-color:#F7EE5F"
390 id="stop325" />
391 <stop
392 offset="0.1872"
393 style="stop-color:#F6D65D"
394 id="stop327" />
395 <stop
396 offset="0.3829"
397 style="stop-color:#F4C35B"
398 id="stop329" />
399 <stop
400 offset="0.5198"
401 style="stop-color:#F4BC5A"
402 id="stop331" />
403 <stop
404 offset="0.7816"
405 style="stop-color:#F6DA5D"
406 id="stop333" />
407 <stop
408 offset="1"
409 style="stop-color:#F7EE5F"
410 id="stop335" />
411 </linearGradient>
412 <linearGradient
413 id="XMLID_10_"
414 gradientUnits="userSpaceOnUse"
415 x1="-3742.7129"
416 y1="-3619.499"
417 x2="-3739.1846"
418 y2="-3596.0273"
419 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
420 <stop
421 offset="0"
422 style="stop-color:#F7EE5F"
423 id="stop340" />
424 <stop
425 offset="0.1872"
426 style="stop-color:#F6D65D"
427 id="stop342" />
428 <stop
429 offset="0.3829"
430 style="stop-color:#F4C35B"
431 id="stop344" />
432 <stop
433 offset="0.5198"
434 style="stop-color:#F4BC5A"
435 id="stop346" />
436 <stop
437 offset="0.7816"
438 style="stop-color:#F6DA5D"
439 id="stop348" />
440 <stop
441 offset="1"
442 style="stop-color:#F7EE5F"
443 id="stop350" />
444 </linearGradient>
445 <linearGradient
446 id="XMLID_11_"
447 gradientUnits="userSpaceOnUse"
448 x1="-3747.291"
449 y1="-3613.5225"
450 x2="-3747.5215"
451 y2="-3602.7065"
452 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
453 <stop
454 offset="0"
455 style="stop-color:#F7EE5F"
456 id="stop355" />
457 <stop
458 offset="0.1872"
459 style="stop-color:#F6D65D"
460 id="stop357" />
461 <stop
462 offset="0.3829"
463 style="stop-color:#F4C35B"
464 id="stop359" />
465 <stop
466 offset="0.5198"
467 style="stop-color:#F4BC5A"
468 id="stop361" />
469 <stop
470 offset="0.7816"
471 style="stop-color:#F6DA5D"
472 id="stop363" />
473 <stop
474 offset="1"
475 style="stop-color:#F7EE5F"
476 id="stop365" />
477 </linearGradient>
478 <linearGradient
479 id="XMLID_12_"
480 gradientUnits="userSpaceOnUse"
481 x1="-3756.9526"
482 y1="-3617.3584"
483 x2="-3753.4243"
484 y2="-3593.8867"
485 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
486 <stop
487 offset="0"
488 style="stop-color:#F7EE5F"
489 id="stop370" />
490 <stop
491 offset="0.1872"
492 style="stop-color:#F6D65D"
493 id="stop372" />
494 <stop
495 offset="0.3829"
496 style="stop-color:#F4C35B"
497 id="stop374" />
498 <stop
499 offset="0.5198"
500 style="stop-color:#F4BC5A"
501 id="stop376" />
502 <stop
503 offset="0.7816"
504 style="stop-color:#F6DA5D"
505 id="stop378" />
506 <stop
507 offset="1"
508 style="stop-color:#F7EE5F"
509 id="stop380" />
510 </linearGradient>
511 <linearGradient
512 id="XMLID_13_"
513 gradientUnits="userSpaceOnUse"
514 x1="-3759.7539"
515 y1="-3613.7876"
516 x2="-3759.9844"
517 y2="-3602.9717"
518 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
519 <stop
520 offset="0"
521 style="stop-color:#F7EE5F"
522 id="stop385" />
523 <stop
524 offset="0.1872"
525 style="stop-color:#F6D65D"
526 id="stop387" />
527 <stop
528 offset="0.3829"
529 style="stop-color:#F4C35B"
530 id="stop389" />
531 <stop
532 offset="0.5198"
533 style="stop-color:#F4BC5A"
534 id="stop391" />
535 <stop
536 offset="0.7816"
537 style="stop-color:#F6DA5D"
538 id="stop393" />
539 <stop
540 offset="1"
541 style="stop-color:#F7EE5F"
542 id="stop395" />
543 </linearGradient>
544 <linearGradient
545 id="XMLID_14_"
546 gradientUnits="userSpaceOnUse"
547 x1="-3763.1528"
548 y1="-3613.8604"
549 x2="-3763.3833"
550 y2="-3603.0444"
551 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
552 <stop
553 offset="0"
554 style="stop-color:#F7EE5F"
555 id="stop400" />
556 <stop
557 offset="0.1872"
558 style="stop-color:#F6D65D"
559 id="stop402" />
560 <stop
561 offset="0.3829"
562 style="stop-color:#F4C35B"
563 id="stop404" />
564 <stop
565 offset="0.5198"
566 style="stop-color:#F4BC5A"
567 id="stop406" />
568 <stop
569 offset="0.7816"
570 style="stop-color:#F6DA5D"
571 id="stop408" />
572 <stop
573 offset="1"
574 style="stop-color:#F7EE5F"
575 id="stop410" />
576 </linearGradient>
577 <linearGradient
578 id="XMLID_15_"
579 gradientUnits="userSpaceOnUse"
580 x1="-3766.9253"
581 y1="-3606.5742"
582 x2="-3767.0735"
583 y2="-3599.615"
584 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
585 <stop
586 offset="0"
587 style="stop-color:#F7EE5F"
588 id="stop415" />
589 <stop
590 offset="0.1872"
591 style="stop-color:#F6D65D"
592 id="stop417" />
593 <stop
594 offset="0.3829"
595 style="stop-color:#F4C35B"
596 id="stop419" />
597 <stop
598 offset="0.5198"
599 style="stop-color:#F4BC5A"
600 id="stop421" />
601 <stop
602 offset="0.7816"
603 style="stop-color:#F6DA5D"
604 id="stop423" />
605 <stop
606 offset="1"
607 style="stop-color:#F7EE5F"
608 id="stop425" />
609 </linearGradient>
610 <linearGradient
611 id="XMLID_16_"
612 gradientUnits="userSpaceOnUse"
613 x1="-3767.0049"
614 y1="-3613.9424"
615 x2="-3767.2354"
616 y2="-3603.1265"
617 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
618 <stop
619 offset="0"
620 style="stop-color:#F7EE5F"
621 id="stop430" />
622 <stop
623 offset="0.1872"
624 style="stop-color:#F6D65D"
625 id="stop432" />
626 <stop
627 offset="0.3829"
628 style="stop-color:#F4C35B"
629 id="stop434" />
630 <stop
631 offset="0.5198"
632 style="stop-color:#F4BC5A"
633 id="stop436" />
634 <stop
635 offset="0.7816"
636 style="stop-color:#F6DA5D"
637 id="stop438" />
638 <stop
639 offset="1"
640 style="stop-color:#F7EE5F"
641 id="stop440" />
642 </linearGradient>
643 <linearGradient
644 id="XMLID_17_"
645 gradientUnits="userSpaceOnUse"
646 x1="-3664.459"
647 y1="-3609.6206"
648 x2="-3674.479"
649 y2="-3642.6406"
650 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
651 <stop
652 offset="0"
653 style="stop-color:#500C81"
654 id="stop445" />
655 <stop
656 offset="0.0374"
657 style="stop-color:#5F0B7A"
658 id="stop447" />
659 <stop
660 offset="0.1836"
661 style="stop-color:#96075F"
662 id="stop449" />
663 <stop
664 offset="0.3196"
665 style="stop-color:#C1044A"
666 id="stop451" />
667 <stop
668 offset="0.4412"
669 style="stop-color:#E0023B"
670 id="stop453" />
671 <stop
672 offset="0.5441"
673 style="stop-color:#F30032"
674 id="stop455" />
675 <stop
676 offset="0.6158"
677 style="stop-color:#FA002F"
678 id="stop457" />
679 <stop
680 offset="1"
681 style="stop-color:#F7EE5F"
682 id="stop459" />
683 </linearGradient>
684 <linearGradient
685 id="XMLID_18_"
686 gradientUnits="userSpaceOnUse"
687 x1="-3679.1553"
688 y1="-3625.2759"
689 x2="-3697.104"
690 y2="-3651.5425"
691 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
692 <stop
693 offset="0"
694 style="stop-color:#500C81"
695 id="stop464" />
696 <stop
697 offset="0.0374"
698 style="stop-color:#5F0B7A"
699 id="stop466" />
700 <stop
701 offset="0.1836"
702 style="stop-color:#96075F"
703 id="stop468" />
704 <stop
705 offset="0.3196"
706 style="stop-color:#C1044A"
707 id="stop470" />
708 <stop
709 offset="0.4412"
710 style="stop-color:#E0023B"
711 id="stop472" />
712 <stop
713 offset="0.5441"
714 style="stop-color:#F30032"
715 id="stop474" />
716 <stop
717 offset="0.6158"
718 style="stop-color:#FA002F"
719 id="stop476" />
720 <stop
721 offset="1"
722 style="stop-color:#F7EE5F"
723 id="stop478" />
724 </linearGradient>
725 <linearGradient
726 id="XMLID_19_"
727 gradientUnits="userSpaceOnUse"
728 x1="-3680.7471"
729 y1="-3604.6782"
730 x2="-3690.7671"
731 y2="-3637.6982"
732 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
733 <stop
734 offset="0"
735 style="stop-color:#500C81"
736 id="stop483" />
737 <stop
738 offset="0.0374"
739 style="stop-color:#5F0B7A"
740 id="stop485" />
741 <stop
742 offset="0.1836"
743 style="stop-color:#96075F"
744 id="stop487" />
745 <stop
746 offset="0.3196"
747 style="stop-color:#C1044A"
748 id="stop489" />
749 <stop
750 offset="0.4412"
751 style="stop-color:#E0023B"
752 id="stop491" />
753 <stop
754 offset="0.5441"
755 style="stop-color:#F30032"
756 id="stop493" />
757 <stop
758 offset="0.6158"
759 style="stop-color:#FA002F"
760 id="stop495" />
761 <stop
762 offset="1"
763 style="stop-color:#F7EE5F"
764 id="stop497" />
765 </linearGradient>
766 <linearGradient
767 id="XMLID_20_"
768 gradientUnits="userSpaceOnUse"
769 x1="-3693.1763"
770 y1="-3615.6953"
771 x2="-3711.125"
772 y2="-3641.9619"
773 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
774 <stop
775 offset="0"
776 style="stop-color:#500C81"
777 id="stop502" />
778 <stop
779 offset="0.0374"
780 style="stop-color:#5F0B7A"
781 id="stop504" />
782 <stop
783 offset="0.1836"
784 style="stop-color:#96075F"
785 id="stop506" />
786 <stop
787 offset="0.3196"
788 style="stop-color:#C1044A"
789 id="stop508" />
790 <stop
791 offset="0.4412"
792 style="stop-color:#E0023B"
793 id="stop510" />
794 <stop
795 offset="0.5441"
796 style="stop-color:#F30032"
797 id="stop512" />
798 <stop
799 offset="0.6158"
800 style="stop-color:#FA002F"
801 id="stop514" />
802 <stop
803 offset="1"
804 style="stop-color:#F7EE5F"
805 id="stop516" />
806 </linearGradient>
807 <linearGradient
808 id="XMLID_21_"
809 gradientUnits="userSpaceOnUse"
810 x1="-3698.873"
811 y1="-3599.1777"
812 x2="-3708.8931"
813 y2="-3632.1978"
814 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
815 <stop
816 offset="0"
817 style="stop-color:#500C81"
818 id="stop521" />
819 <stop
820 offset="0.0374"
821 style="stop-color:#5F0B7A"
822 id="stop523" />
823 <stop
824 offset="0.1836"
825 style="stop-color:#96075F"
826 id="stop525" />
827 <stop
828 offset="0.3196"
829 style="stop-color:#C1044A"
830 id="stop527" />
831 <stop
832 offset="0.4412"
833 style="stop-color:#E0023B"
834 id="stop529" />
835 <stop
836 offset="0.5441"
837 style="stop-color:#F30032"
838 id="stop531" />
839 <stop
840 offset="0.6158"
841 style="stop-color:#FA002F"
842 id="stop533" />
843 <stop
844 offset="1"
845 style="stop-color:#F7EE5F"
846 id="stop535" />
847 </linearGradient>
848 <linearGradient
849 id="XMLID_22_"
850 gradientUnits="userSpaceOnUse"
851 x1="-3707.5356"
852 y1="-3605.8828"
853 x2="-3725.4844"
854 y2="-3632.1494"
855 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
856 <stop
857 offset="0"
858 style="stop-color:#500C81"
859 id="stop540" />
860 <stop
861 offset="0.0374"
862 style="stop-color:#5F0B7A"
863 id="stop542" />
864 <stop
865 offset="0.1836"
866 style="stop-color:#96075F"
867 id="stop544" />
868 <stop
869 offset="0.3196"
870 style="stop-color:#C1044A"
871 id="stop546" />
872 <stop
873 offset="0.4412"
874 style="stop-color:#E0023B"
875 id="stop548" />
876 <stop
877 offset="0.5441"
878 style="stop-color:#F30032"
879 id="stop550" />
880 <stop
881 offset="0.6158"
882 style="stop-color:#FA002F"
883 id="stop552" />
884 <stop
885 offset="1"
886 style="stop-color:#F7EE5F"
887 id="stop554" />
888 </linearGradient>
889 <linearGradient
890 id="XMLID_23_"
891 gradientUnits="userSpaceOnUse"
892 x1="-3718.1577"
893 y1="-3593.3257"
894 x2="-3728.1777"
895 y2="-3626.3457"
896 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
897 <stop
898 offset="0"
899 style="stop-color:#500C81"
900 id="stop559" />
901 <stop
902 offset="0.0374"
903 style="stop-color:#5F0B7A"
904 id="stop561" />
905 <stop
906 offset="0.1836"
907 style="stop-color:#96075F"
908 id="stop563" />
909 <stop
910 offset="0.3196"
911 style="stop-color:#C1044A"
912 id="stop565" />
913 <stop
914 offset="0.4412"
915 style="stop-color:#E0023B"
916 id="stop567" />
917 <stop
918 offset="0.5441"
919 style="stop-color:#F30032"
920 id="stop569" />
921 <stop
922 offset="0.6158"
923 style="stop-color:#FA002F"
924 id="stop571" />
925 <stop
926 offset="1"
927 style="stop-color:#F7EE5F"
928 id="stop573" />
929 </linearGradient>
930 <linearGradient
931 id="XMLID_24_"
932 gradientUnits="userSpaceOnUse"
933 x1="-3740.3467"
934 y1="-3613.374"
935 x2="-3740.5771"
936 y2="-3602.5581"
937 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
938 <stop
939 offset="0"
940 style="stop-color:#F7EE5F"
941 id="stop578" />
942 <stop
943 offset="0.1872"
944 style="stop-color:#F6D65D"
945 id="stop580" />
946 <stop
947 offset="0.3829"
948 style="stop-color:#F4C35B"
949 id="stop582" />
950 <stop
951 offset="0.5198"
952 style="stop-color:#F4BC5A"
953 id="stop584" />
954 <stop
955 offset="0.7816"
956 style="stop-color:#F6DA5D"
957 id="stop586" />
958 <stop
959 offset="1"
960 style="stop-color:#F7EE5F"
961 id="stop588" />
962 </linearGradient>
963 <linearGradient
964 id="XMLID_25_"
965 gradientUnits="userSpaceOnUse"
966 x1="-3735.0215"
967 y1="-3620.6553"
968 x2="-3731.4932"
969 y2="-3597.1836"
970 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
971 <stop
972 offset="0"
973 style="stop-color:#F7EE5F"
974 id="stop593" />
975 <stop
976 offset="0.1872"
977 style="stop-color:#F6D65D"
978 id="stop595" />
979 <stop
980 offset="0.3829"
981 style="stop-color:#F4C35B"
982 id="stop597" />
983 <stop
984 offset="0.5198"
985 style="stop-color:#F4BC5A"
986 id="stop599" />
987 <stop
988 offset="0.7816"
989 style="stop-color:#F6DA5D"
990 id="stop601" />
991 <stop
992 offset="1"
993 style="stop-color:#F7EE5F"
994 id="stop603" />
995 </linearGradient>
996 <linearGradient
997 id="XMLID_26_"
998 gradientUnits="userSpaceOnUse"
999 x1="-3748.5347"
1000 y1="-3618.624"
1001 x2="-3745.0063"
1002 y2="-3595.1523"
1003 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1004 <stop
1005 offset="0"
1006 style="stop-color:#F7EE5F"
1007 id="stop608" />
1008 <stop
1009 offset="0.1872"
1010 style="stop-color:#F6D65D"
1011 id="stop610" />
1012 <stop
1013 offset="0.3829"
1014 style="stop-color:#F4C35B"
1015 id="stop612" />
1016 <stop
1017 offset="0.5198"
1018 style="stop-color:#F4BC5A"
1019 id="stop614" />
1020 <stop
1021 offset="0.7816"
1022 style="stop-color:#F6DA5D"
1023 id="stop616" />
1024 <stop
1025 offset="1"
1026 style="stop-color:#F7EE5F"
1027 id="stop618" />
1028 </linearGradient>
1029 <linearGradient
1030 id="XMLID_27_"
1031 gradientUnits="userSpaceOnUse"
1032 x1="-3761.7197"
1033 y1="-3616.6421"
1034 x2="-3758.1914"
1035 y2="-3593.1704"
1036 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1037 <stop
1038 offset="0"
1039 style="stop-color:#F7EE5F"
1040 id="stop623" />
1041 <stop
1042 offset="0.1872"
1043 style="stop-color:#F6D65D"
1044 id="stop625" />
1045 <stop
1046 offset="0.3829"
1047 style="stop-color:#F4C35B"
1048 id="stop627" />
1049 <stop
1050 offset="0.5198"
1051 style="stop-color:#F4BC5A"
1052 id="stop629" />
1053 <stop
1054 offset="0.7816"
1055 style="stop-color:#F6DA5D"
1056 id="stop631" />
1057 <stop
1058 offset="1"
1059 style="stop-color:#F7EE5F"
1060 id="stop633" />
1061 </linearGradient>
1062 <linearGradient
1063 id="XMLID_28_"
1064 gradientUnits="userSpaceOnUse"
1065 x1="-3763.478"
1066 y1="-3613.8672"
1067 x2="-3763.7085"
1068 y2="-3603.0513"
1069 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1070 <stop
1071 offset="0"
1072 style="stop-color:#F7EE5F"
1073 id="stop638" />
1074 <stop
1075 offset="0.1872"
1076 style="stop-color:#F6D65D"
1077 id="stop640" />
1078 <stop
1079 offset="0.3829"
1080 style="stop-color:#F4C35B"
1081 id="stop642" />
1082 <stop
1083 offset="0.5198"
1084 style="stop-color:#F4BC5A"
1085 id="stop644" />
1086 <stop
1087 offset="0.7816"
1088 style="stop-color:#F6DA5D"
1089 id="stop646" />
1090 <stop
1091 offset="1"
1092 style="stop-color:#F7EE5F"
1093 id="stop648" />
1094 </linearGradient>
1095 <linearGradient
1096 id="XMLID_29_"
1097 gradientUnits="userSpaceOnUse"
1098 x1="-3754.6851"
1099 y1="-3613.6797"
1100 x2="-3754.9155"
1101 y2="-3602.8638"
1102 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1103 <stop
1104 offset="0"
1105 style="stop-color:#F7EE5F"
1106 id="stop653" />
1107 <stop
1108 offset="0.1872"
1109 style="stop-color:#F6D65D"
1110 id="stop655" />
1111 <stop
1112 offset="0.3829"
1113 style="stop-color:#F4C35B"
1114 id="stop657" />
1115 <stop
1116 offset="0.5198"
1117 style="stop-color:#F4BC5A"
1118 id="stop659" />
1119 <stop
1120 offset="0.7816"
1121 style="stop-color:#F6DA5D"
1122 id="stop661" />
1123 <stop
1124 offset="1"
1125 style="stop-color:#F7EE5F"
1126 id="stop663" />
1127 </linearGradient>
1128 <linearGradient
1129 id="XMLID_30_"
1130 gradientUnits="userSpaceOnUse"
1131 x1="-3511.8975"
1132 y1="-3643.6323"
1133 x2="-3504.0176"
1134 y2="-3620.4302"
1135 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1136 <stop
1137 offset="0"
1138 style="stop-color:#691183"
1139 id="stop668" />
1140 <stop
1141 offset="0.0485"
1142 style="stop-color:#840E73"
1143 id="stop670" />
1144 <stop
1145 offset="0.125"
1146 style="stop-color:#A80A5E"
1147 id="stop672" />
1148 <stop
1149 offset="0.2057"
1150 style="stop-color:#C6064D"
1151 id="stop674" />
1152 <stop
1153 offset="0.2906"
1154 style="stop-color:#DD0340"
1155 id="stop676" />
1156 <stop
1157 offset="0.3816"
1158 style="stop-color:#ED0136"
1159 id="stop678" />
1160 <stop
1161 offset="0.483"
1162 style="stop-color:#F70031"
1163 id="stop680" />
1164 <stop
1165 offset="0.6158"
1166 style="stop-color:#FA002F"
1167 id="stop682" />
1168 <stop
1169 offset="1"
1170 style="stop-color:#F7EE5F"
1171 id="stop684" />
1172 </linearGradient>
1173 <linearGradient
1174 id="XMLID_31_"
1175 gradientUnits="userSpaceOnUse"
1176 x1="-3517.9097"
1177 y1="-3641.5903"
1178 x2="-3510.0298"
1179 y2="-3618.3882"
1180 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1181 <stop
1182 offset="0"
1183 style="stop-color:#691183"
1184 id="stop689" />
1185 <stop
1186 offset="0.0485"
1187 style="stop-color:#840E73"
1188 id="stop691" />
1189 <stop
1190 offset="0.125"
1191 style="stop-color:#A80A5E"
1192 id="stop693" />
1193 <stop
1194 offset="0.2057"
1195 style="stop-color:#C6064D"
1196 id="stop695" />
1197 <stop
1198 offset="0.2906"
1199 style="stop-color:#DD0340"
1200 id="stop697" />
1201 <stop
1202 offset="0.3816"
1203 style="stop-color:#ED0136"
1204 id="stop699" />
1205 <stop
1206 offset="0.483"
1207 style="stop-color:#F70031"
1208 id="stop701" />
1209 <stop
1210 offset="0.6158"
1211 style="stop-color:#FA002F"
1212 id="stop703" />
1213 <stop
1214 offset="1"
1215 style="stop-color:#F7EE5F"
1216 id="stop705" />
1217 </linearGradient>
1218 <linearGradient
1219 id="XMLID_32_"
1220 gradientUnits="userSpaceOnUse"
1221 x1="-3524.9941"
1222 y1="-3639.1846"
1223 x2="-3517.1143"
1224 y2="-3615.9824"
1225 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1226 <stop
1227 offset="0"
1228 style="stop-color:#691183"
1229 id="stop710" />
1230 <stop
1231 offset="0.0485"
1232 style="stop-color:#840E73"
1233 id="stop712" />
1234 <stop
1235 offset="0.125"
1236 style="stop-color:#A80A5E"
1237 id="stop714" />
1238 <stop
1239 offset="0.2057"
1240 style="stop-color:#C6064D"
1241 id="stop716" />
1242 <stop
1243 offset="0.2906"
1244 style="stop-color:#DD0340"
1245 id="stop718" />
1246 <stop
1247 offset="0.3816"
1248 style="stop-color:#ED0136"
1249 id="stop720" />
1250 <stop
1251 offset="0.483"
1252 style="stop-color:#F70031"
1253 id="stop722" />
1254 <stop
1255 offset="0.6158"
1256 style="stop-color:#FA002F"
1257 id="stop724" />
1258 <stop
1259 offset="1"
1260 style="stop-color:#F7EE5F"
1261 id="stop726" />
1262 </linearGradient>
1263 <linearGradient
1264 id="XMLID_33_"
1265 gradientUnits="userSpaceOnUse"
1266 x1="-3532.7456"
1267 y1="-3636.5518"
1268 x2="-3524.8657"
1269 y2="-3613.3496"
1270 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1271 <stop
1272 offset="0"
1273 style="stop-color:#691183"
1274 id="stop731" />
1275 <stop
1276 offset="0.0485"
1277 style="stop-color:#840E73"
1278 id="stop733" />
1279 <stop
1280 offset="0.125"
1281 style="stop-color:#A80A5E"
1282 id="stop735" />
1283 <stop
1284 offset="0.2057"
1285 style="stop-color:#C6064D"
1286 id="stop737" />
1287 <stop
1288 offset="0.2906"
1289 style="stop-color:#DD0340"
1290 id="stop739" />
1291 <stop
1292 offset="0.3816"
1293 style="stop-color:#ED0136"
1294 id="stop741" />
1295 <stop
1296 offset="0.483"
1297 style="stop-color:#F70031"
1298 id="stop743" />
1299 <stop
1300 offset="0.6158"
1301 style="stop-color:#FA002F"
1302 id="stop745" />
1303 <stop
1304 offset="1"
1305 style="stop-color:#F7EE5F"
1306 id="stop747" />
1307 </linearGradient>
1308 <linearGradient
1309 id="XMLID_34_"
1310 gradientUnits="userSpaceOnUse"
1311 x1="-3612.71"
1312 y1="-3562.9639"
1313 x2="-3607.5464"
1314 y2="-3533.6799"
1315 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,105.42691,1026.1774)">
1316 <stop
1317 offset="0"
1318 style="stop-color:#691183"
1319 id="stop752" />
1320 <stop
1321 offset="0.082"
1322 style="stop-color:#711282"
1323 id="stop754" />
1324 <stop
1325 offset="0.211"
1326 style="stop-color:#88137F"
1327 id="stop756" />
1328 <stop
1329 offset="0.3709"
1330 style="stop-color:#AD167A"
1331 id="stop758" />
1332 <stop
1333 offset="0.5538"
1334 style="stop-color:#DF1A73"
1335 id="stop760" />
1336 <stop
1337 offset="0.6158"
1338 style="stop-color:#F21B71"
1339 id="stop762" />
1340 <stop
1341 offset="1"
1342 style="stop-color:#F7EE5F"
1343 id="stop764" />
1344 </linearGradient>
1345 <linearGradient
1346 id="XMLID_35_"
1347 gradientUnits="userSpaceOnUse"
1348 x1="-3543.3057"
1349 y1="-3626.2104"
1350 x2="-3540.6792"
1351 y2="-3608.2617"
1352 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1353 <stop
1354 offset="0"
1355 style="stop-color:#691183"
1356 id="stop769" />
1357 <stop
1358 offset="0.0485"
1359 style="stop-color:#840E73"
1360 id="stop771" />
1361 <stop
1362 offset="0.125"
1363 style="stop-color:#A80A5E"
1364 id="stop773" />
1365 <stop
1366 offset="0.2057"
1367 style="stop-color:#C6064D"
1368 id="stop775" />
1369 <stop
1370 offset="0.2906"
1371 style="stop-color:#DD0340"
1372 id="stop777" />
1373 <stop
1374 offset="0.3816"
1375 style="stop-color:#ED0136"
1376 id="stop779" />
1377 <stop
1378 offset="0.483"
1379 style="stop-color:#F70031"
1380 id="stop781" />
1381 <stop
1382 offset="0.6158"
1383 style="stop-color:#FA002F"
1384 id="stop783" />
1385 <stop
1386 offset="1"
1387 style="stop-color:#F7EE5F"
1388 id="stop785" />
1389 </linearGradient>
1390 <linearGradient
1391 id="XMLID_36_"
1392 gradientUnits="userSpaceOnUse"
1393 x1="-3551.1382"
1394 y1="-3625.064"
1395 x2="-3548.5117"
1396 y2="-3607.1152"
1397 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1398 <stop
1399 offset="0"
1400 style="stop-color:#691183"
1401 id="stop790" />
1402 <stop
1403 offset="0.0485"
1404 style="stop-color:#840E73"
1405 id="stop792" />
1406 <stop
1407 offset="0.125"
1408 style="stop-color:#A80A5E"
1409 id="stop794" />
1410 <stop
1411 offset="0.2057"
1412 style="stop-color:#C6064D"
1413 id="stop796" />
1414 <stop
1415 offset="0.2906"
1416 style="stop-color:#DD0340"
1417 id="stop798" />
1418 <stop
1419 offset="0.3816"
1420 style="stop-color:#ED0136"
1421 id="stop800" />
1422 <stop
1423 offset="0.483"
1424 style="stop-color:#F70031"
1425 id="stop802" />
1426 <stop
1427 offset="0.6158"
1428 style="stop-color:#FA002F"
1429 id="stop804" />
1430 <stop
1431 offset="1"
1432 style="stop-color:#F7EE5F"
1433 id="stop806" />
1434 </linearGradient>
1435 <linearGradient
1436 id="XMLID_37_"
1437 gradientUnits="userSpaceOnUse"
1438 x1="-3558.2891"
1439 y1="-3625.9536"
1440 x2="-3554.1504"
1441 y2="-3586.3257"
1442 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1443 <stop
1444 offset="0"
1445 style="stop-color:#691183"
1446 id="stop811" />
1447 <stop
1448 offset="0.0485"
1449 style="stop-color:#840E73"
1450 id="stop813" />
1451 <stop
1452 offset="0.125"
1453 style="stop-color:#A80A5E"
1454 id="stop815" />
1455 <stop
1456 offset="0.2057"
1457 style="stop-color:#C6064D"
1458 id="stop817" />
1459 <stop
1460 offset="0.2906"
1461 style="stop-color:#DD0340"
1462 id="stop819" />
1463 <stop
1464 offset="0.3816"
1465 style="stop-color:#ED0136"
1466 id="stop821" />
1467 <stop
1468 offset="0.483"
1469 style="stop-color:#F70031"
1470 id="stop823" />
1471 <stop
1472 offset="0.6158"
1473 style="stop-color:#FA002F"
1474 id="stop825" />
1475 <stop
1476 offset="1"
1477 style="stop-color:#F7EE5F"
1478 id="stop827" />
1479 </linearGradient>
1480 <linearGradient
1481 id="XMLID_38_"
1482 gradientUnits="userSpaceOnUse"
1483 x1="-3568.6143"
1484 y1="-3622.5068"
1485 x2="-3565.9878"
1486 y2="-3604.5581"
1487 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1488 <stop
1489 offset="0"
1490 style="stop-color:#691183"
1491 id="stop832" />
1492 <stop
1493 offset="0.0485"
1494 style="stop-color:#840E73"
1495 id="stop834" />
1496 <stop
1497 offset="0.125"
1498 style="stop-color:#A80A5E"
1499 id="stop836" />
1500 <stop
1501 offset="0.2057"
1502 style="stop-color:#C6064D"
1503 id="stop838" />
1504 <stop
1505 offset="0.2906"
1506 style="stop-color:#DD0340"
1507 id="stop840" />
1508 <stop
1509 offset="0.3816"
1510 style="stop-color:#ED0136"
1511 id="stop842" />
1512 <stop
1513 offset="0.483"
1514 style="stop-color:#F70031"
1515 id="stop844" />
1516 <stop
1517 offset="0.6158"
1518 style="stop-color:#FA002F"
1519 id="stop846" />
1520 <stop
1521 offset="1"
1522 style="stop-color:#F7EE5F"
1523 id="stop848" />
1524 </linearGradient>
1525 <linearGradient
1526 id="XMLID_39_"
1527 gradientUnits="userSpaceOnUse"
1528 x1="-3576.6631"
1529 y1="-3624.0347"
1530 x2="-3572.5244"
1531 y2="-3584.4067"
1532 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1533 <stop
1534 offset="0"
1535 style="stop-color:#691183"
1536 id="stop853" />
1537 <stop
1538 offset="0.0485"
1539 style="stop-color:#840E73"
1540 id="stop855" />
1541 <stop
1542 offset="0.125"
1543 style="stop-color:#A80A5E"
1544 id="stop857" />
1545 <stop
1546 offset="0.2057"
1547 style="stop-color:#C6064D"
1548 id="stop859" />
1549 <stop
1550 offset="0.2906"
1551 style="stop-color:#DD0340"
1552 id="stop861" />
1553 <stop
1554 offset="0.3816"
1555 style="stop-color:#ED0136"
1556 id="stop863" />
1557 <stop
1558 offset="0.483"
1559 style="stop-color:#F70031"
1560 id="stop865" />
1561 <stop
1562 offset="0.6158"
1563 style="stop-color:#FA002F"
1564 id="stop867" />
1565 <stop
1566 offset="1"
1567 style="stop-color:#F7EE5F"
1568 id="stop869" />
1569 </linearGradient>
1570 <linearGradient
1571 id="XMLID_40_"
1572 gradientUnits="userSpaceOnUse"
1573 x1="-3583.9473"
1574 y1="-3623.7251"
1575 x2="-3582.5107"
1576 y2="-3603.1548"
1577 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1578 <stop
1579 offset="0"
1580 style="stop-color:#691183"
1581 id="stop874" />
1582 <stop
1583 offset="0.0485"
1584 style="stop-color:#840E73"
1585 id="stop876" />
1586 <stop
1587 offset="0.125"
1588 style="stop-color:#A80A5E"
1589 id="stop878" />
1590 <stop
1591 offset="0.2057"
1592 style="stop-color:#C6064D"
1593 id="stop880" />
1594 <stop
1595 offset="0.2906"
1596 style="stop-color:#DD0340"
1597 id="stop882" />
1598 <stop
1599 offset="0.3816"
1600 style="stop-color:#ED0136"
1601 id="stop884" />
1602 <stop
1603 offset="0.483"
1604 style="stop-color:#F70031"
1605 id="stop886" />
1606 <stop
1607 offset="0.6158"
1608 style="stop-color:#FA002F"
1609 id="stop888" />
1610 <stop
1611 offset="1"
1612 style="stop-color:#F7EE5F"
1613 id="stop890" />
1614 </linearGradient>
1615 <linearGradient
1616 id="XMLID_41_"
1617 gradientUnits="userSpaceOnUse"
1618 x1="-3591.7266"
1619 y1="-3619.7495"
1620 x2="-3589.9756"
1621 y2="-3592.6074"
1622 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1623 <stop
1624 offset="0"
1625 style="stop-color:#691183"
1626 id="stop895" />
1627 <stop
1628 offset="0.0485"
1629 style="stop-color:#840E73"
1630 id="stop897" />
1631 <stop
1632 offset="0.125"
1633 style="stop-color:#A80A5E"
1634 id="stop899" />
1635 <stop
1636 offset="0.2057"
1637 style="stop-color:#C6064D"
1638 id="stop901" />
1639 <stop
1640 offset="0.2906"
1641 style="stop-color:#DD0340"
1642 id="stop903" />
1643 <stop
1644 offset="0.3816"
1645 style="stop-color:#ED0136"
1646 id="stop905" />
1647 <stop
1648 offset="0.483"
1649 style="stop-color:#F70031"
1650 id="stop907" />
1651 <stop
1652 offset="0.6158"
1653 style="stop-color:#FA002F"
1654 id="stop909" />
1655 <stop
1656 offset="1"
1657 style="stop-color:#F7EE5F"
1658 id="stop911" />
1659 </linearGradient>
1660 <linearGradient
1661 id="XMLID_42_"
1662 gradientUnits="userSpaceOnUse"
1663 x1="-3599.0938"
1664 y1="-3619.2744"
1665 x2="-3597.3428"
1666 y2="-3592.1323"
1667 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1668 <stop
1669 offset="0"
1670 style="stop-color:#691183"
1671 id="stop916" />
1672 <stop
1673 offset="0.0485"
1674 style="stop-color:#840E73"
1675 id="stop918" />
1676 <stop
1677 offset="0.125"
1678 style="stop-color:#A80A5E"
1679 id="stop920" />
1680 <stop
1681 offset="0.2057"
1682 style="stop-color:#C6064D"
1683 id="stop922" />
1684 <stop
1685 offset="0.2906"
1686 style="stop-color:#DD0340"
1687 id="stop924" />
1688 <stop
1689 offset="0.3816"
1690 style="stop-color:#ED0136"
1691 id="stop926" />
1692 <stop
1693 offset="0.483"
1694 style="stop-color:#F70031"
1695 id="stop928" />
1696 <stop
1697 offset="0.6158"
1698 style="stop-color:#FA002F"
1699 id="stop930" />
1700 <stop
1701 offset="1"
1702 style="stop-color:#F7EE5F"
1703 id="stop932" />
1704 </linearGradient>
1705 <linearGradient
1706 id="XMLID_43_"
1707 gradientUnits="userSpaceOnUse"
1708 x1="-3606.6357"
1709 y1="-3620.9043"
1710 x2="-3602.4971"
1711 y2="-3581.2764"
1712 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1713 <stop
1714 offset="0"
1715 style="stop-color:#691183"
1716 id="stop937" />
1717 <stop
1718 offset="0.0485"
1719 style="stop-color:#840E73"
1720 id="stop939" />
1721 <stop
1722 offset="0.125"
1723 style="stop-color:#A80A5E"
1724 id="stop941" />
1725 <stop
1726 offset="0.2057"
1727 style="stop-color:#C6064D"
1728 id="stop943" />
1729 <stop
1730 offset="0.2906"
1731 style="stop-color:#DD0340"
1732 id="stop945" />
1733 <stop
1734 offset="0.3816"
1735 style="stop-color:#ED0136"
1736 id="stop947" />
1737 <stop
1738 offset="0.483"
1739 style="stop-color:#F70031"
1740 id="stop949" />
1741 <stop
1742 offset="0.6158"
1743 style="stop-color:#FA002F"
1744 id="stop951" />
1745 <stop
1746 offset="1"
1747 style="stop-color:#F7EE5F"
1748 id="stop953" />
1749 </linearGradient>
1750 <linearGradient
1751 id="XMLID_44_"
1752 gradientUnits="userSpaceOnUse"
1753 x1="-3616.3291"
1754 y1="-3619.8916"
1755 x2="-3612.1904"
1756 y2="-3580.2637"
1757 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1758 <stop
1759 offset="0"
1760 style="stop-color:#691183"
1761 id="stop958" />
1762 <stop
1763 offset="0.0485"
1764 style="stop-color:#840E73"
1765 id="stop960" />
1766 <stop
1767 offset="0.125"
1768 style="stop-color:#A80A5E"
1769 id="stop962" />
1770 <stop
1771 offset="0.2057"
1772 style="stop-color:#C6064D"
1773 id="stop964" />
1774 <stop
1775 offset="0.2906"
1776 style="stop-color:#DD0340"
1777 id="stop966" />
1778 <stop
1779 offset="0.3816"
1780 style="stop-color:#ED0136"
1781 id="stop968" />
1782 <stop
1783 offset="0.483"
1784 style="stop-color:#F70031"
1785 id="stop970" />
1786 <stop
1787 offset="0.6158"
1788 style="stop-color:#FA002F"
1789 id="stop972" />
1790 <stop
1791 offset="1"
1792 style="stop-color:#F7EE5F"
1793 id="stop974" />
1794 </linearGradient>
1795 <linearGradient
1796 id="XMLID_45_"
1797 gradientUnits="userSpaceOnUse"
1798 x1="-3622.9634"
1799 y1="-3614.5537"
1800 x2="-3620.3369"
1801 y2="-3596.605"
1802 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1803 <stop
1804 offset="0"
1805 style="stop-color:#691183"
1806 id="stop979" />
1807 <stop
1808 offset="0.0485"
1809 style="stop-color:#840E73"
1810 id="stop981" />
1811 <stop
1812 offset="0.125"
1813 style="stop-color:#A80A5E"
1814 id="stop983" />
1815 <stop
1816 offset="0.2057"
1817 style="stop-color:#C6064D"
1818 id="stop985" />
1819 <stop
1820 offset="0.2906"
1821 style="stop-color:#DD0340"
1822 id="stop987" />
1823 <stop
1824 offset="0.3816"
1825 style="stop-color:#ED0136"
1826 id="stop989" />
1827 <stop
1828 offset="0.483"
1829 style="stop-color:#F70031"
1830 id="stop991" />
1831 <stop
1832 offset="0.6158"
1833 style="stop-color:#FA002F"
1834 id="stop993" />
1835 <stop
1836 offset="1"
1837 style="stop-color:#F7EE5F"
1838 id="stop995" />
1839 </linearGradient>
1840 <linearGradient
1841 id="XMLID_46_"
1842 gradientUnits="userSpaceOnUse"
1843 x1="-3631.832"
1844 y1="-3618.2729"
1845 x2="-3627.6934"
1846 y2="-3578.645"
1847 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1848 <stop
1849 offset="0"
1850 style="stop-color:#691183"
1851 id="stop1000" />
1852 <stop
1853 offset="0.0485"
1854 style="stop-color:#840E73"
1855 id="stop1002" />
1856 <stop
1857 offset="0.125"
1858 style="stop-color:#A80A5E"
1859 id="stop1004" />
1860 <stop
1861 offset="0.2057"
1862 style="stop-color:#C6064D"
1863 id="stop1006" />
1864 <stop
1865 offset="0.2906"
1866 style="stop-color:#DD0340"
1867 id="stop1008" />
1868 <stop
1869 offset="0.3816"
1870 style="stop-color:#ED0136"
1871 id="stop1010" />
1872 <stop
1873 offset="0.483"
1874 style="stop-color:#F70031"
1875 id="stop1012" />
1876 <stop
1877 offset="0.6158"
1878 style="stop-color:#FA002F"
1879 id="stop1014" />
1880 <stop
1881 offset="1"
1882 style="stop-color:#F7EE5F"
1883 id="stop1016" />
1884 </linearGradient>
1885 <linearGradient
1886 id="XMLID_47_"
1887 gradientUnits="userSpaceOnUse"
1888 x1="-3638.7656"
1889 y1="-3612.2417"
1890 x2="-3636.1392"
1891 y2="-3594.293"
1892 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1893 <stop
1894 offset="0"
1895 style="stop-color:#691183"
1896 id="stop1021" />
1897 <stop
1898 offset="0.0485"
1899 style="stop-color:#840E73"
1900 id="stop1023" />
1901 <stop
1902 offset="0.125"
1903 style="stop-color:#A80A5E"
1904 id="stop1025" />
1905 <stop
1906 offset="0.2057"
1907 style="stop-color:#C6064D"
1908 id="stop1027" />
1909 <stop
1910 offset="0.2906"
1911 style="stop-color:#DD0340"
1912 id="stop1029" />
1913 <stop
1914 offset="0.3816"
1915 style="stop-color:#ED0136"
1916 id="stop1031" />
1917 <stop
1918 offset="0.483"
1919 style="stop-color:#F70031"
1920 id="stop1033" />
1921 <stop
1922 offset="0.6158"
1923 style="stop-color:#FA002F"
1924 id="stop1035" />
1925 <stop
1926 offset="1"
1927 style="stop-color:#F7EE5F"
1928 id="stop1037" />
1929 </linearGradient>
1930 <linearGradient
1931 id="XMLID_48_"
1932 gradientUnits="userSpaceOnUse"
1933 x1="-3647.9785"
1934 y1="-3616.5864"
1935 x2="-3643.8398"
1936 y2="-3576.9585"
1937 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1938 <stop
1939 offset="0"
1940 style="stop-color:#691183"
1941 id="stop1042" />
1942 <stop
1943 offset="0.0485"
1944 style="stop-color:#840E73"
1945 id="stop1044" />
1946 <stop
1947 offset="0.125"
1948 style="stop-color:#A80A5E"
1949 id="stop1046" />
1950 <stop
1951 offset="0.2057"
1952 style="stop-color:#C6064D"
1953 id="stop1048" />
1954 <stop
1955 offset="0.2906"
1956 style="stop-color:#DD0340"
1957 id="stop1050" />
1958 <stop
1959 offset="0.3816"
1960 style="stop-color:#ED0136"
1961 id="stop1052" />
1962 <stop
1963 offset="0.483"
1964 style="stop-color:#F70031"
1965 id="stop1054" />
1966 <stop
1967 offset="0.6158"
1968 style="stop-color:#FA002F"
1969 id="stop1056" />
1970 <stop
1971 offset="1"
1972 style="stop-color:#F7EE5F"
1973 id="stop1058" />
1974 </linearGradient>
1975 <linearGradient
1976 id="XMLID_49_"
1977 gradientUnits="userSpaceOnUse"
1978 x1="-3655.8188"
1979 y1="-3609.7461"
1980 x2="-3653.1924"
1981 y2="-3591.7974"
1982 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1983 <stop
1984 offset="0"
1985 style="stop-color:#691183"
1986 id="stop1063" />
1987 <stop
1988 offset="0.0485"
1989 style="stop-color:#840E73"
1990 id="stop1065" />
1991 <stop
1992 offset="0.125"
1993 style="stop-color:#A80A5E"
1994 id="stop1067" />
1995 <stop
1996 offset="0.2057"
1997 style="stop-color:#C6064D"
1998 id="stop1069" />
1999 <stop
2000 offset="0.2906"
2001 style="stop-color:#DD0340"
2002 id="stop1071" />
2003 <stop
2004 offset="0.3816"
2005 style="stop-color:#ED0136"
2006 id="stop1073" />
2007 <stop
2008 offset="0.483"
2009 style="stop-color:#F70031"
2010 id="stop1075" />
2011 <stop
2012 offset="0.6158"
2013 style="stop-color:#FA002F"
2014 id="stop1077" />
2015 <stop
2016 offset="1"
2017 style="stop-color:#F7EE5F"
2018 id="stop1079" />
2019 </linearGradient>
2020 <linearGradient
2021 id="XMLID_50_"
2022 gradientUnits="userSpaceOnUse"
2023 x1="-3513.0679"
2024 y1="-3655.561"
2025 x2="-3523.0879"
2026 y2="-3688.5811"
2027 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2028 <stop
2029 offset="0"
2030 style="stop-color:#500C81"
2031 id="stop1084" />
2032 <stop
2033 offset="0.0374"
2034 style="stop-color:#5F0B7A"
2035 id="stop1086" />
2036 <stop
2037 offset="0.1836"
2038 style="stop-color:#96075F"
2039 id="stop1088" />
2040 <stop
2041 offset="0.3196"
2042 style="stop-color:#C1044A"
2043 id="stop1090" />
2044 <stop
2045 offset="0.4412"
2046 style="stop-color:#E0023B"
2047 id="stop1092" />
2048 <stop
2049 offset="0.5441"
2050 style="stop-color:#F30032"
2051 id="stop1094" />
2052 <stop
2053 offset="0.6158"
2054 style="stop-color:#FA002F"
2055 id="stop1096" />
2056 <stop
2057 offset="1"
2058 style="stop-color:#F7EE5F"
2059 id="stop1098" />
2060 </linearGradient>
2061 <linearGradient
2062 id="XMLID_51_"
2063 gradientUnits="userSpaceOnUse"
2064 x1="-3518.1055"
2065 y1="-3653.4717"
2066 x2="-3529.9253"
2067 y2="-3685.4292"
2068 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2069 <stop
2070 offset="0"
2071 style="stop-color:#500C81"
2072 id="stop1103" />
2073 <stop
2074 offset="0.0374"
2075 style="stop-color:#5F0B7A"
2076 id="stop1105" />
2077 <stop
2078 offset="0.1836"
2079 style="stop-color:#96075F"
2080 id="stop1107" />
2081 <stop
2082 offset="0.3196"
2083 style="stop-color:#C1044A"
2084 id="stop1109" />
2085 <stop
2086 offset="0.4412"
2087 style="stop-color:#E0023B"
2088 id="stop1111" />
2089 <stop
2090 offset="0.5441"
2091 style="stop-color:#F30032"
2092 id="stop1113" />
2093 <stop
2094 offset="0.6158"
2095 style="stop-color:#FA002F"
2096 id="stop1115" />
2097 <stop
2098 offset="1"
2099 style="stop-color:#F7EE5F"
2100 id="stop1117" />
2101 </linearGradient>
2102 <linearGradient
2103 id="XMLID_52_"
2104 gradientUnits="userSpaceOnUse"
2105 x1="-3524.8926"
2106 y1="-3651.9727"
2107 x2="-3534.9126"
2108 y2="-3684.9927"
2109 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2110 <stop
2111 offset="0"
2112 style="stop-color:#500C81"
2113 id="stop1122" />
2114 <stop
2115 offset="0.0374"
2116 style="stop-color:#5F0B7A"
2117 id="stop1124" />
2118 <stop
2119 offset="0.1836"
2120 style="stop-color:#96075F"
2121 id="stop1126" />
2122 <stop
2123 offset="0.3196"
2124 style="stop-color:#C1044A"
2125 id="stop1128" />
2126 <stop
2127 offset="0.4412"
2128 style="stop-color:#E0023B"
2129 id="stop1130" />
2130 <stop
2131 offset="0.5441"
2132 style="stop-color:#F30032"
2133 id="stop1132" />
2134 <stop
2135 offset="0.6158"
2136 style="stop-color:#FA002F"
2137 id="stop1134" />
2138 <stop
2139 offset="1"
2140 style="stop-color:#F7EE5F"
2141 id="stop1136" />
2142 </linearGradient>
2143 <linearGradient
2144 id="XMLID_53_"
2145 gradientUnits="userSpaceOnUse"
2146 x1="-3531.4272"
2147 y1="-3648.5444"
2148 x2="-3543.2471"
2149 y2="-3680.502"
2150 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2151 <stop
2152 offset="0"
2153 style="stop-color:#500C81"
2154 id="stop1141" />
2155 <stop
2156 offset="0.0374"
2157 style="stop-color:#5F0B7A"
2158 id="stop1143" />
2159 <stop
2160 offset="0.1836"
2161 style="stop-color:#96075F"
2162 id="stop1145" />
2163 <stop
2164 offset="0.3196"
2165 style="stop-color:#C1044A"
2166 id="stop1147" />
2167 <stop
2168 offset="0.4412"
2169 style="stop-color:#E0023B"
2170 id="stop1149" />
2171 <stop
2172 offset="0.5441"
2173 style="stop-color:#F30032"
2174 id="stop1151" />
2175 <stop
2176 offset="0.6158"
2177 style="stop-color:#FA002F"
2178 id="stop1153" />
2179 <stop
2180 offset="1"
2181 style="stop-color:#F7EE5F"
2182 id="stop1155" />
2183 </linearGradient>
2184 <linearGradient
2185 id="XMLID_54_"
2186 gradientUnits="userSpaceOnUse"
2187 x1="-3540.4351"
2188 y1="-3647.2563"
2189 x2="-3550.4551"
2190 y2="-3680.2764"
2191 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2192 <stop
2193 offset="0"
2194 style="stop-color:#500C81"
2195 id="stop1160" />
2196 <stop
2197 offset="0.0374"
2198 style="stop-color:#5F0B7A"
2199 id="stop1162" />
2200 <stop
2201 offset="0.1836"
2202 style="stop-color:#96075F"
2203 id="stop1164" />
2204 <stop
2205 offset="0.3196"
2206 style="stop-color:#C1044A"
2207 id="stop1166" />
2208 <stop
2209 offset="0.4412"
2210 style="stop-color:#E0023B"
2211 id="stop1168" />
2212 <stop
2213 offset="0.5441"
2214 style="stop-color:#F30032"
2215 id="stop1170" />
2216 <stop
2217 offset="0.6158"
2218 style="stop-color:#FA002F"
2219 id="stop1172" />
2220 <stop
2221 offset="1"
2222 style="stop-color:#F7EE5F"
2223 id="stop1174" />
2224 </linearGradient>
2225 <linearGradient
2226 id="XMLID_55_"
2227 gradientUnits="userSpaceOnUse"
2228 x1="-3546.5405"
2229 y1="-3642.9551"
2230 x2="-3558.3604"
2231 y2="-3674.9126"
2232 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2233 <stop
2234 offset="0"
2235 style="stop-color:#500C81"
2236 id="stop1179" />
2237 <stop
2238 offset="0.0374"
2239 style="stop-color:#5F0B7A"
2240 id="stop1181" />
2241 <stop
2242 offset="0.1836"
2243 style="stop-color:#96075F"
2244 id="stop1183" />
2245 <stop
2246 offset="0.3196"
2247 style="stop-color:#C1044A"
2248 id="stop1185" />
2249 <stop
2250 offset="0.4412"
2251 style="stop-color:#E0023B"
2252 id="stop1187" />
2253 <stop
2254 offset="0.5441"
2255 style="stop-color:#F30032"
2256 id="stop1189" />
2257 <stop
2258 offset="0.6158"
2259 style="stop-color:#FA002F"
2260 id="stop1191" />
2261 <stop
2262 offset="1"
2263 style="stop-color:#F7EE5F"
2264 id="stop1193" />
2265 </linearGradient>
2266 <linearGradient
2267 id="XMLID_56_"
2268 gradientUnits="userSpaceOnUse"
2269 x1="-3556.0371"
2270 y1="-3642.522"
2271 x2="-3566.0571"
2272 y2="-3675.542"
2273 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2274 <stop
2275 offset="0"
2276 style="stop-color:#500C81"
2277 id="stop1198" />
2278 <stop
2279 offset="0.0374"
2280 style="stop-color:#5F0B7A"
2281 id="stop1200" />
2282 <stop
2283 offset="0.1836"
2284 style="stop-color:#96075F"
2285 id="stop1202" />
2286 <stop
2287 offset="0.3196"
2288 style="stop-color:#C1044A"
2289 id="stop1204" />
2290 <stop
2291 offset="0.4412"
2292 style="stop-color:#E0023B"
2293 id="stop1206" />
2294 <stop
2295 offset="0.5441"
2296 style="stop-color:#F30032"
2297 id="stop1208" />
2298 <stop
2299 offset="0.6158"
2300 style="stop-color:#FA002F"
2301 id="stop1210" />
2302 <stop
2303 offset="1"
2304 style="stop-color:#F7EE5F"
2305 id="stop1212" />
2306 </linearGradient>
2307 <linearGradient
2308 id="XMLID_57_"
2309 gradientUnits="userSpaceOnUse"
2310 x1="-3565.9399"
2311 y1="-3639.5166"
2312 x2="-3575.96"
2313 y2="-3672.5366"
2314 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2315 <stop
2316 offset="0"
2317 style="stop-color:#500C81"
2318 id="stop1217" />
2319 <stop
2320 offset="0.0374"
2321 style="stop-color:#5F0B7A"
2322 id="stop1219" />
2323 <stop
2324 offset="0.1836"
2325 style="stop-color:#96075F"
2326 id="stop1221" />
2327 <stop
2328 offset="0.3196"
2329 style="stop-color:#C1044A"
2330 id="stop1223" />
2331 <stop
2332 offset="0.4412"
2333 style="stop-color:#E0023B"
2334 id="stop1225" />
2335 <stop
2336 offset="0.5441"
2337 style="stop-color:#F30032"
2338 id="stop1227" />
2339 <stop
2340 offset="0.6158"
2341 style="stop-color:#FA002F"
2342 id="stop1229" />
2343 <stop
2344 offset="1"
2345 style="stop-color:#F7EE5F"
2346 id="stop1231" />
2347 </linearGradient>
2348 <linearGradient
2349 id="XMLID_58_"
2350 gradientUnits="userSpaceOnUse"
2351 x1="-3572.1245"
2352 y1="-3633.4922"
2353 x2="-3583.9443"
2354 y2="-3665.4497"
2355 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2356 <stop
2357 offset="0"
2358 style="stop-color:#500C81"
2359 id="stop1236" />
2360 <stop
2361 offset="0.0374"
2362 style="stop-color:#5F0B7A"
2363 id="stop1238" />
2364 <stop
2365 offset="0.1836"
2366 style="stop-color:#96075F"
2367 id="stop1240" />
2368 <stop
2369 offset="0.3196"
2370 style="stop-color:#C1044A"
2371 id="stop1242" />
2372 <stop
2373 offset="0.4412"
2374 style="stop-color:#E0023B"
2375 id="stop1244" />
2376 <stop
2377 offset="0.5441"
2378 style="stop-color:#F30032"
2379 id="stop1246" />
2380 <stop
2381 offset="0.6158"
2382 style="stop-color:#FA002F"
2383 id="stop1248" />
2384 <stop
2385 offset="1"
2386 style="stop-color:#F7EE5F"
2387 id="stop1250" />
2388 </linearGradient>
2389 <linearGradient
2390 id="XMLID_59_"
2391 gradientUnits="userSpaceOnUse"
2392 x1="-3581.0527"
2393 y1="-3634.9307"
2394 x2="-3591.0728"
2395 y2="-3667.9507"
2396 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2397 <stop
2398 offset="0"
2399 style="stop-color:#500C81"
2400 id="stop1255" />
2401 <stop
2402 offset="0.0374"
2403 style="stop-color:#5F0B7A"
2404 id="stop1257" />
2405 <stop
2406 offset="0.1836"
2407 style="stop-color:#96075F"
2408 id="stop1259" />
2409 <stop
2410 offset="0.3196"
2411 style="stop-color:#C1044A"
2412 id="stop1261" />
2413 <stop
2414 offset="0.4412"
2415 style="stop-color:#E0023B"
2416 id="stop1263" />
2417 <stop
2418 offset="0.5441"
2419 style="stop-color:#F30032"
2420 id="stop1265" />
2421 <stop
2422 offset="0.6158"
2423 style="stop-color:#FA002F"
2424 id="stop1267" />
2425 <stop
2426 offset="1"
2427 style="stop-color:#F7EE5F"
2428 id="stop1269" />
2429 </linearGradient>
2430 <linearGradient
2431 id="XMLID_60_"
2432 gradientUnits="userSpaceOnUse"
2433 x1="-3586.04"
2434 y1="-3628.3457"
2435 x2="-3597.8599"
2436 y2="-3660.3032"
2437 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2438 <stop
2439 offset="0"
2440 style="stop-color:#500C81"
2441 id="stop1274" />
2442 <stop
2443 offset="0.0374"
2444 style="stop-color:#5F0B7A"
2445 id="stop1276" />
2446 <stop
2447 offset="0.1836"
2448 style="stop-color:#96075F"
2449 id="stop1278" />
2450 <stop
2451 offset="0.3196"
2452 style="stop-color:#C1044A"
2453 id="stop1280" />
2454 <stop
2455 offset="0.4412"
2456 style="stop-color:#E0023B"
2457 id="stop1282" />
2458 <stop
2459 offset="0.5441"
2460 style="stop-color:#F30032"
2461 id="stop1284" />
2462 <stop
2463 offset="0.6158"
2464 style="stop-color:#FA002F"
2465 id="stop1286" />
2466 <stop
2467 offset="1"
2468 style="stop-color:#F7EE5F"
2469 id="stop1288" />
2470 </linearGradient>
2471 <linearGradient
2472 id="XMLID_61_"
2473 gradientUnits="userSpaceOnUse"
2474 x1="-3595.1567"
2475 y1="-3630.6509"
2476 x2="-3605.1768"
2477 y2="-3663.6709"
2478 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2479 <stop
2480 offset="0"
2481 style="stop-color:#500C81"
2482 id="stop1293" />
2483 <stop
2484 offset="0.0374"
2485 style="stop-color:#5F0B7A"
2486 id="stop1295" />
2487 <stop
2488 offset="0.1836"
2489 style="stop-color:#96075F"
2490 id="stop1297" />
2491 <stop
2492 offset="0.3196"
2493 style="stop-color:#C1044A"
2494 id="stop1299" />
2495 <stop
2496 offset="0.4412"
2497 style="stop-color:#E0023B"
2498 id="stop1301" />
2499 <stop
2500 offset="0.5441"
2501 style="stop-color:#F30032"
2502 id="stop1303" />
2503 <stop
2504 offset="0.6158"
2505 style="stop-color:#FA002F"
2506 id="stop1305" />
2507 <stop
2508 offset="1"
2509 style="stop-color:#F7EE5F"
2510 id="stop1307" />
2511 </linearGradient>
2512 <linearGradient
2513 id="XMLID_62_"
2514 gradientUnits="userSpaceOnUse"
2515 x1="-3598.8574"
2516 y1="-3623.605"
2517 x2="-3610.6772"
2518 y2="-3655.5625"
2519 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2520 <stop
2521 offset="0"
2522 style="stop-color:#500C81"
2523 id="stop1312" />
2524 <stop
2525 offset="0.0374"
2526 style="stop-color:#5F0B7A"
2527 id="stop1314" />
2528 <stop
2529 offset="0.1836"
2530 style="stop-color:#96075F"
2531 id="stop1316" />
2532 <stop
2533 offset="0.3196"
2534 style="stop-color:#C1044A"
2535 id="stop1318" />
2536 <stop
2537 offset="0.4412"
2538 style="stop-color:#E0023B"
2539 id="stop1320" />
2540 <stop
2541 offset="0.5441"
2542 style="stop-color:#F30032"
2543 id="stop1322" />
2544 <stop
2545 offset="0.6158"
2546 style="stop-color:#FA002F"
2547 id="stop1324" />
2548 <stop
2549 offset="1"
2550 style="stop-color:#F7EE5F"
2551 id="stop1326" />
2552 </linearGradient>
2553 <linearGradient
2554 id="XMLID_63_"
2555 gradientUnits="userSpaceOnUse"
2556 x1="-3610.3208"
2557 y1="-3626.0493"
2558 x2="-3620.3408"
2559 y2="-3659.0693"
2560 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2561 <stop
2562 offset="0"
2563 style="stop-color:#500C81"
2564 id="stop1331" />
2565 <stop
2566 offset="0.0374"
2567 style="stop-color:#5F0B7A"
2568 id="stop1333" />
2569 <stop
2570 offset="0.1836"
2571 style="stop-color:#96075F"
2572 id="stop1335" />
2573 <stop
2574 offset="0.3196"
2575 style="stop-color:#C1044A"
2576 id="stop1337" />
2577 <stop
2578 offset="0.4412"
2579 style="stop-color:#E0023B"
2580 id="stop1339" />
2581 <stop
2582 offset="0.5441"
2583 style="stop-color:#F30032"
2584 id="stop1341" />
2585 <stop
2586 offset="0.6158"
2587 style="stop-color:#FA002F"
2588 id="stop1343" />
2589 <stop
2590 offset="1"
2591 style="stop-color:#F7EE5F"
2592 id="stop1345" />
2593 </linearGradient>
2594 <linearGradient
2595 id="XMLID_64_"
2596 gradientUnits="userSpaceOnUse"
2597 x1="-3613.4258"
2598 y1="-3618.2168"
2599 x2="-3625.2456"
2600 y2="-3650.1743"
2601 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2602 <stop
2603 offset="0"
2604 style="stop-color:#500C81"
2605 id="stop1350" />
2606 <stop
2607 offset="0.0374"
2608 style="stop-color:#5F0B7A"
2609 id="stop1352" />
2610 <stop
2611 offset="0.1836"
2612 style="stop-color:#96075F"
2613 id="stop1354" />
2614 <stop
2615 offset="0.3196"
2616 style="stop-color:#C1044A"
2617 id="stop1356" />
2618 <stop
2619 offset="0.4412"
2620 style="stop-color:#E0023B"
2621 id="stop1358" />
2622 <stop
2623 offset="0.5441"
2624 style="stop-color:#F30032"
2625 id="stop1360" />
2626 <stop
2627 offset="0.6158"
2628 style="stop-color:#FA002F"
2629 id="stop1362" />
2630 <stop
2631 offset="1"
2632 style="stop-color:#F7EE5F"
2633 id="stop1364" />
2634 </linearGradient>
2635 <linearGradient
2636 id="XMLID_65_"
2637 gradientUnits="userSpaceOnUse"
2638 x1="-3625.0215"
2639 y1="-3621.5884"
2640 x2="-3635.0415"
2641 y2="-3654.6084"
2642 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2643 <stop
2644 offset="0"
2645 style="stop-color:#500C81"
2646 id="stop1369" />
2647 <stop
2648 offset="0.0374"
2649 style="stop-color:#5F0B7A"
2650 id="stop1371" />
2651 <stop
2652 offset="0.1836"
2653 style="stop-color:#96075F"
2654 id="stop1373" />
2655 <stop
2656 offset="0.3196"
2657 style="stop-color:#C1044A"
2658 id="stop1375" />
2659 <stop
2660 offset="0.4412"
2661 style="stop-color:#E0023B"
2662 id="stop1377" />
2663 <stop
2664 offset="0.5441"
2665 style="stop-color:#F30032"
2666 id="stop1379" />
2667 <stop
2668 offset="0.6158"
2669 style="stop-color:#FA002F"
2670 id="stop1381" />
2671 <stop
2672 offset="1"
2673 style="stop-color:#F7EE5F"
2674 id="stop1383" />
2675 </linearGradient>
2676 <linearGradient
2677 id="XMLID_66_"
2678 gradientUnits="userSpaceOnUse"
2679 x1="-3631.4028"
2680 y1="-3619.6519"
2681 x2="-3641.4229"
2682 y2="-3652.6719"
2683 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2684 <stop
2685 offset="0"
2686 style="stop-color:#500C81"
2687 id="stop1388" />
2688 <stop
2689 offset="0.0374"
2690 style="stop-color:#5F0B7A"
2691 id="stop1390" />
2692 <stop
2693 offset="0.1836"
2694 style="stop-color:#96075F"
2695 id="stop1392" />
2696 <stop
2697 offset="0.3196"
2698 style="stop-color:#C1044A"
2699 id="stop1394" />
2700 <stop
2701 offset="0.4412"
2702 style="stop-color:#E0023B"
2703 id="stop1396" />
2704 <stop
2705 offset="0.5441"
2706 style="stop-color:#F30032"
2707 id="stop1398" />
2708 <stop
2709 offset="0.6158"
2710 style="stop-color:#FA002F"
2711 id="stop1400" />
2712 <stop
2713 offset="1"
2714 style="stop-color:#F7EE5F"
2715 id="stop1402" />
2716 </linearGradient>
2717 <linearGradient
2718 id="XMLID_67_"
2719 gradientUnits="userSpaceOnUse"
2720 x1="-3639.3081"
2721 y1="-3617.2529"
2722 x2="-3649.3281"
2723 y2="-3650.2729"
2724 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2725 <stop
2726 offset="0"
2727 style="stop-color:#500C81"
2728 id="stop1407" />
2729 <stop
2730 offset="0.0374"
2731 style="stop-color:#5F0B7A"
2732 id="stop1409" />
2733 <stop
2734 offset="0.1836"
2735 style="stop-color:#96075F"
2736 id="stop1411" />
2737 <stop
2738 offset="0.3196"
2739 style="stop-color:#C1044A"
2740 id="stop1413" />
2741 <stop
2742 offset="0.4412"
2743 style="stop-color:#E0023B"
2744 id="stop1415" />
2745 <stop
2746 offset="0.5441"
2747 style="stop-color:#F30032"
2748 id="stop1417" />
2749 <stop
2750 offset="0.6158"
2751 style="stop-color:#FA002F"
2752 id="stop1419" />
2753 <stop
2754 offset="1"
2755 style="stop-color:#F7EE5F"
2756 id="stop1421" />
2757 </linearGradient>
2758 <linearGradient
2759 id="XMLID_68_"
2760 gradientUnits="userSpaceOnUse"
2761 x1="-3644.0654"
2762 y1="-3622.0806"
2763 x2="-3668.9834"
2764 y2="-3650.0459"
2765 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2766 <stop
2767 offset="0"
2768 style="stop-color:#500C81"
2769 id="stop1426" />
2770 <stop
2771 offset="0.0374"
2772 style="stop-color:#5F0B7A"
2773 id="stop1428" />
2774 <stop
2775 offset="0.1836"
2776 style="stop-color:#96075F"
2777 id="stop1430" />
2778 <stop
2779 offset="0.3196"
2780 style="stop-color:#C1044A"
2781 id="stop1432" />
2782 <stop
2783 offset="0.4412"
2784 style="stop-color:#E0023B"
2785 id="stop1434" />
2786 <stop
2787 offset="0.5441"
2788 style="stop-color:#F30032"
2789 id="stop1436" />
2790 <stop
2791 offset="0.6158"
2792 style="stop-color:#FA002F"
2793 id="stop1438" />
2794 <stop
2795 offset="1"
2796 style="stop-color:#F7EE5F"
2797 id="stop1440" />
2798 </linearGradient>
2799 <linearGradient
2800 id="XMLID_69_"
2801 gradientUnits="userSpaceOnUse"
2802 x1="-3649.6221"
2803 y1="-3617.1294"
2804 x2="-3674.54"
2805 y2="-3645.0947"
2806 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2807 <stop
2808 offset="0"
2809 style="stop-color:#500C81"
2810 id="stop1445" />
2811 <stop
2812 offset="0.0374"
2813 style="stop-color:#5F0B7A"
2814 id="stop1447" />
2815 <stop
2816 offset="0.1836"
2817 style="stop-color:#96075F"
2818 id="stop1449" />
2819 <stop
2820 offset="0.3196"
2821 style="stop-color:#C1044A"
2822 id="stop1451" />
2823 <stop
2824 offset="0.4412"
2825 style="stop-color:#E0023B"
2826 id="stop1453" />
2827 <stop
2828 offset="0.5441"
2829 style="stop-color:#F30032"
2830 id="stop1455" />
2831 <stop
2832 offset="0.6158"
2833 style="stop-color:#FA002F"
2834 id="stop1457" />
2835 <stop
2836 offset="1"
2837 style="stop-color:#F7EE5F"
2838 id="stop1459" />
2839 </linearGradient>
2840 <linearGradient
2841 id="XMLID_79_"
2842 gradientUnits="userSpaceOnUse"
2843 x1="-3586.4019"
2844 y1="-3601.3828"
2845 x2="-3570.2441"
2846 y2="-3549.8223"
2847 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
2848 <stop
2849 offset="0"
2850 style="stop-color:#D8E7EB"
2851 id="stop1709" />
2852 <stop
2853 offset="0.0684"
2854 style="stop-color:#D0DFE4"
2855 id="stop1711" />
2856 <stop
2857 offset="0.1761"
2858 style="stop-color:#B9CAD0"
2859 id="stop1713" />
2860 <stop
2861 offset="0.3096"
2862 style="stop-color:#94A7B0"
2863 id="stop1715" />
2864 <stop
2865 offset="0.4622"
2866 style="stop-color:#627784"
2867 id="stop1717" />
2868 <stop
2869 offset="0.5537"
2870 style="stop-color:#405766"
2871 id="stop1719" />
2872 <stop
2873 offset="0.6113"
2874 style="stop-color:#607682"
2875 id="stop1721" />
2876 <stop
2877 offset="0.6983"
2878 style="stop-color:#8B9EA8"
2879 id="stop1723" />
2880 <stop
2881 offset="0.7829"
2882 style="stop-color:#ADBEC5"
2883 id="stop1725" />
2884 <stop
2885 offset="0.8633"
2886 style="stop-color:#C5D5DA"
2887 id="stop1727" />
2888 <stop
2889 offset="0.9376"
2890 style="stop-color:#D3E2E7"
2891 id="stop1729" />
2892 <stop
2893 offset="1"
2894 style="stop-color:#D8E7EB"
2895 id="stop1731" />
2896 </linearGradient>
2897 <linearGradient
2898 id="XMLID_78_"
2899 gradientUnits="userSpaceOnUse"
2900 x1="-3581.9321"
2901 y1="-3602.7837"
2902 x2="-3565.7744"
2903 y2="-3551.2231"
2904 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
2905 <stop
2906 offset="0"
2907 style="stop-color:#D8E7EB"
2908 id="stop1682" />
2909 <stop
2910 offset="0.0684"
2911 style="stop-color:#D0DFE4"
2912 id="stop1684" />
2913 <stop
2914 offset="0.1761"
2915 style="stop-color:#B9CAD0"
2916 id="stop1686" />
2917 <stop
2918 offset="0.3096"
2919 style="stop-color:#94A7B0"
2920 id="stop1688" />
2921 <stop
2922 offset="0.4622"
2923 style="stop-color:#627784"
2924 id="stop1690" />
2925 <stop
2926 offset="0.5537"
2927 style="stop-color:#405766"
2928 id="stop1692" />
2929 <stop
2930 offset="0.6113"
2931 style="stop-color:#607682"
2932 id="stop1694" />
2933 <stop
2934 offset="0.6983"
2935 style="stop-color:#8B9EA8"
2936 id="stop1696" />
2937 <stop
2938 offset="0.7829"
2939 style="stop-color:#ADBEC5"
2940 id="stop1698" />
2941 <stop
2942 offset="0.8633"
2943 style="stop-color:#C5D5DA"
2944 id="stop1700" />
2945 <stop
2946 offset="0.9376"
2947 style="stop-color:#D3E2E7"
2948 id="stop1702" />
2949 <stop
2950 offset="1"
2951 style="stop-color:#D8E7EB"
2952 id="stop1704" />
2953 </linearGradient>
2954 <linearGradient
2955 id="XMLID_77_"
2956 gradientUnits="userSpaceOnUse"
2957 x1="-3576.8662"
2958 y1="-3604.3711"
2959 x2="-3560.7085"
2960 y2="-3552.8105"
2961 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
2962 <stop
2963 offset="0"
2964 style="stop-color:#D8E7EB"
2965 id="stop1655" />
2966 <stop
2967 offset="0.0684"
2968 style="stop-color:#D0DFE4"
2969 id="stop1657" />
2970 <stop
2971 offset="0.1761"
2972 style="stop-color:#B9CAD0"
2973 id="stop1659" />
2974 <stop
2975 offset="0.3096"
2976 style="stop-color:#94A7B0"
2977 id="stop1661" />
2978 <stop
2979 offset="0.4622"
2980 style="stop-color:#627784"
2981 id="stop1663" />
2982 <stop
2983 offset="0.5537"
2984 style="stop-color:#405766"
2985 id="stop1665" />
2986 <stop
2987 offset="0.6113"
2988 style="stop-color:#607682"
2989 id="stop1667" />
2990 <stop
2991 offset="0.6983"
2992 style="stop-color:#8B9EA8"
2993 id="stop1669" />
2994 <stop
2995 offset="0.7829"
2996 style="stop-color:#ADBEC5"
2997 id="stop1671" />
2998 <stop
2999 offset="0.8633"
3000 style="stop-color:#C5D5DA"
3001 id="stop1673" />
3002 <stop
3003 offset="0.9376"
3004 style="stop-color:#D3E2E7"
3005 id="stop1675" />
3006 <stop
3007 offset="1"
3008 style="stop-color:#D8E7EB"
3009 id="stop1677" />
3010 </linearGradient>
3011 <linearGradient
3012 id="XMLID_76_"
3013 gradientUnits="userSpaceOnUse"
3014 x1="-3577.7891"
3015 y1="-3604.082"
3016 x2="-3561.6313"
3017 y2="-3552.5215"
3018 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
3019 <stop
3020 offset="0"
3021 style="stop-color:#D8E7EB"
3022 id="stop1628" />
3023 <stop
3024 offset="0.0684"
3025 style="stop-color:#D0DFE4"
3026 id="stop1630" />
3027 <stop
3028 offset="0.1761"
3029 style="stop-color:#B9CAD0"
3030 id="stop1632" />
3031 <stop
3032 offset="0.3096"
3033 style="stop-color:#94A7B0"
3034 id="stop1634" />
3035 <stop
3036 offset="0.4622"
3037 style="stop-color:#627784"
3038 id="stop1636" />
3039 <stop
3040 offset="0.5537"
3041 style="stop-color:#405766"
3042 id="stop1638" />
3043 <stop
3044 offset="0.6113"
3045 style="stop-color:#607682"
3046 id="stop1640" />
3047 <stop
3048 offset="0.6983"
3049 style="stop-color:#8B9EA8"
3050 id="stop1642" />
3051 <stop
3052 offset="0.7829"
3053 style="stop-color:#ADBEC5"
3054 id="stop1644" />
3055 <stop
3056 offset="0.8633"
3057 style="stop-color:#C5D5DA"
3058 id="stop1646" />
3059 <stop
3060 offset="0.9376"
3061 style="stop-color:#D3E2E7"
3062 id="stop1648" />
3063 <stop
3064 offset="1"
3065 style="stop-color:#D8E7EB"
3066 id="stop1650" />
3067 </linearGradient>
3068 <linearGradient
3069 id="XMLID_75_"
3070 gradientUnits="userSpaceOnUse"
3071 x1="-3584.2759"
3072 y1="-3602.0488"
3073 x2="-3568.1182"
3074 y2="-3550.4883"
3075 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
3076 <stop
3077 offset="0"
3078 style="stop-color:#D8E7EB"
3079 id="stop1601" />
3080 <stop
3081 offset="0.0684"
3082 style="stop-color:#D0DFE4"
3083 id="stop1603" />
3084 <stop
3085 offset="0.1761"
3086 style="stop-color:#B9CAD0"
3087 id="stop1605" />
3088 <stop
3089 offset="0.3096"
3090 style="stop-color:#94A7B0"
3091 id="stop1607" />
3092 <stop
3093 offset="0.4622"
3094 style="stop-color:#627784"
3095 id="stop1609" />
3096 <stop
3097 offset="0.5537"
3098 style="stop-color:#405766"
3099 id="stop1611" />
3100 <stop
3101 offset="0.6113"
3102 style="stop-color:#607682"
3103 id="stop1613" />
3104 <stop
3105 offset="0.6983"
3106 style="stop-color:#8B9EA8"
3107 id="stop1615" />
3108 <stop
3109 offset="0.7829"
3110 style="stop-color:#ADBEC5"
3111 id="stop1617" />
3112 <stop
3113 offset="0.8633"
3114 style="stop-color:#C5D5DA"
3115 id="stop1619" />
3116 <stop
3117 offset="0.9376"
3118 style="stop-color:#D3E2E7"
3119 id="stop1621" />
3120 <stop
3121 offset="1"
3122 style="stop-color:#D8E7EB"
3123 id="stop1623" />
3124 </linearGradient>
3125 <linearGradient
3126 id="XMLID_74_"
3127 gradientUnits="userSpaceOnUse"
3128 x1="-3578.7671"
3129 y1="-3603.7754"
3130 x2="-3562.6094"
3131 y2="-3552.2148"
3132 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
3133 <stop
3134 offset="0"
3135 style="stop-color:#D8E7EB"
3136 id="stop1574" />
3137 <stop
3138 offset="0.0684"
3139 style="stop-color:#D0DFE4"
3140 id="stop1576" />
3141 <stop
3142 offset="0.1761"
3143 style="stop-color:#B9CAD0"
3144 id="stop1578" />
3145 <stop
3146 offset="0.3096"
3147 style="stop-color:#94A7B0"
3148 id="stop1580" />
3149 <stop
3150 offset="0.4622"
3151 style="stop-color:#627784"
3152 id="stop1582" />
3153 <stop
3154 offset="0.5537"
3155 style="stop-color:#405766"
3156 id="stop1584" />
3157 <stop
3158 offset="0.6113"
3159 style="stop-color:#607682"
3160 id="stop1586" />
3161 <stop
3162 offset="0.6983"
3163 style="stop-color:#8B9EA8"
3164 id="stop1588" />
3165 <stop
3166 offset="0.7829"
3167 style="stop-color:#ADBEC5"
3168 id="stop1590" />
3169 <stop
3170 offset="0.8633"
3171 style="stop-color:#C5D5DA"
3172 id="stop1592" />
3173 <stop
3174 offset="0.9376"
3175 style="stop-color:#D3E2E7"
3176 id="stop1594" />
3177 <stop
3178 offset="1"
3179 style="stop-color:#D8E7EB"
3180 id="stop1596" />
3181 </linearGradient>
3182 <linearGradient
3183 id="XMLID_73_"
3184 gradientUnits="userSpaceOnUse"
3185 x1="-3584.6758"
3186 y1="-3601.9238"
3187 x2="-3568.5181"
3188 y2="-3550.3633"
3189 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
3190 <stop
3191 offset="0"
3192 style="stop-color:#D8E7EB"
3193 id="stop1547" />
3194 <stop
3195 offset="0.0684"
3196 style="stop-color:#D0DFE4"
3197 id="stop1549" />
3198 <stop
3199 offset="0.1761"
3200 style="stop-color:#B9CAD0"
3201 id="stop1551" />
3202 <stop
3203 offset="0.3096"
3204 style="stop-color:#94A7B0"
3205 id="stop1553" />
3206 <stop
3207 offset="0.4622"
3208 style="stop-color:#627784"
3209 id="stop1555" />
3210 <stop
3211 offset="0.5537"
3212 style="stop-color:#405766"
3213 id="stop1557" />
3214 <stop
3215 offset="0.6113"
3216 style="stop-color:#607682"
3217 id="stop1559" />
3218 <stop
3219 offset="0.6983"
3220 style="stop-color:#8B9EA8"
3221 id="stop1561" />
3222 <stop
3223 offset="0.7829"
3224 style="stop-color:#ADBEC5"
3225 id="stop1563" />
3226 <stop
3227 offset="0.8633"
3228 style="stop-color:#C5D5DA"
3229 id="stop1565" />
3230 <stop
3231 offset="0.9376"
3232 style="stop-color:#D3E2E7"
3233 id="stop1567" />
3234 <stop
3235 offset="1"
3236 style="stop-color:#D8E7EB"
3237 id="stop1569" />
3238 </linearGradient>
3239 <linearGradient
3240 id="XMLID_72_"
3241 gradientUnits="userSpaceOnUse"
3242 x1="-3580.1533"
3243 y1="-3603.3408"
3244 x2="-3563.9956"
3245 y2="-3551.7803"
3246 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
3247 <stop
3248 offset="0"
3249 style="stop-color:#D8E7EB"
3250 id="stop1520" />
3251 <stop
3252 offset="0.0684"
3253 style="stop-color:#D0DFE4"
3254 id="stop1522" />
3255 <stop
3256 offset="0.1761"
3257 style="stop-color:#B9CAD0"
3258 id="stop1524" />
3259 <stop
3260 offset="0.3096"
3261 style="stop-color:#94A7B0"
3262 id="stop1526" />
3263 <stop
3264 offset="0.4622"
3265 style="stop-color:#627784"
3266 id="stop1528" />
3267 <stop
3268 offset="0.5537"
3269 style="stop-color:#405766"
3270 id="stop1530" />
3271 <stop
3272 offset="0.6113"
3273 style="stop-color:#607682"
3274 id="stop1532" />
3275 <stop
3276 offset="0.6983"
3277 style="stop-color:#8B9EA8"
3278 id="stop1534" />
3279 <stop
3280 offset="0.7829"
3281 style="stop-color:#ADBEC5"
3282 id="stop1536" />
3283 <stop
3284 offset="0.8633"
3285 style="stop-color:#C5D5DA"
3286 id="stop1538" />
3287 <stop
3288 offset="0.9376"
3289 style="stop-color:#D3E2E7"
3290 id="stop1540" />
3291 <stop
3292 offset="1"
3293 style="stop-color:#D8E7EB"
3294 id="stop1542" />
3295 </linearGradient>
3296 <linearGradient
3297 id="XMLID_71_"
3298 gradientUnits="userSpaceOnUse"
3299 x1="-3578.5146"
3300 y1="-3603.8545"
3301 x2="-3562.3569"
3302 y2="-3552.2939"
3303 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
3304 <stop
3305 offset="0"
3306 style="stop-color:#D8E7EB"
3307 id="stop1493" />
3308 <stop
3309 offset="0.0684"
3310 style="stop-color:#D0DFE4"
3311 id="stop1495" />
3312 <stop
3313 offset="0.1761"
3314 style="stop-color:#B9CAD0"
3315 id="stop1497" />
3316 <stop
3317 offset="0.3096"
3318 style="stop-color:#94A7B0"
3319 id="stop1499" />
3320 <stop
3321 offset="0.4622"
3322 style="stop-color:#627784"
3323 id="stop1501" />
3324 <stop
3325 offset="0.5537"
3326 style="stop-color:#405766"
3327 id="stop1503" />
3328 <stop
3329 offset="0.6113"
3330 style="stop-color:#607682"
3331 id="stop1505" />
3332 <stop
3333 offset="0.6983"
3334 style="stop-color:#8B9EA8"
3335 id="stop1507" />
3336 <stop
3337 offset="0.7829"
3338 style="stop-color:#ADBEC5"
3339 id="stop1509" />
3340 <stop
3341 offset="0.8633"
3342 style="stop-color:#C5D5DA"
3343 id="stop1511" />
3344 <stop
3345 offset="0.9376"
3346 style="stop-color:#D3E2E7"
3347 id="stop1513" />
3348 <stop
3349 offset="1"
3350 style="stop-color:#D8E7EB"
3351 id="stop1515" />
3352 </linearGradient>
3353 <linearGradient
3354 id="XMLID_70_"
3355 gradientUnits="userSpaceOnUse"
3356 x1="-3581.9316"
3357 y1="-3602.7837"
3358 x2="-3565.7739"
3359 y2="-3551.2231"
3360 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
3361 <stop
3362 offset="0"
3363 style="stop-color:#D8E7EB"
3364 id="stop1466" />
3365 <stop
3366 offset="0.0684"
3367 style="stop-color:#D0DFE4"
3368 id="stop1468" />
3369 <stop
3370 offset="0.1761"
3371 style="stop-color:#B9CAD0"
3372 id="stop1470" />
3373 <stop
3374 offset="0.3096"
3375 style="stop-color:#94A7B0"
3376 id="stop1472" />
3377 <stop
3378 offset="0.4622"
3379 style="stop-color:#627784"
3380 id="stop1474" />
3381 <stop
3382 offset="0.5537"
3383 style="stop-color:#405766"
3384 id="stop1476" />
3385 <stop
3386 offset="0.6113"
3387 style="stop-color:#607682"
3388 id="stop1478" />
3389 <stop
3390 offset="0.6983"
3391 style="stop-color:#8B9EA8"
3392 id="stop1480" />
3393 <stop
3394 offset="0.7829"
3395 style="stop-color:#ADBEC5"
3396 id="stop1482" />
3397 <stop
3398 offset="0.8633"
3399 style="stop-color:#C5D5DA"
3400 id="stop1484" />
3401 <stop
3402 offset="0.9376"
3403 style="stop-color:#D3E2E7"
3404 id="stop1486" />
3405 <stop
3406 offset="1"
3407 style="stop-color:#D8E7EB"
3408 id="stop1488" />
3409 </linearGradient>
3410 <linearGradient
3411 inkscape:collect="always"
3412 id="linearGradient2902">
3413 <stop
3414 style="stop-color:#000000;stop-opacity:1;"
3415 offset="0"
3416 id="stop2904" />
3417 <stop
3418 style="stop-color:#000000;stop-opacity:0;"
3419 offset="1"
3420 id="stop2906" />
3421 </linearGradient>
3422 <linearGradient
3423 id="linearGradient5149">
3424 <stop
3425 style="stop-color:#3cc23c;stop-opacity:1;"
3426 offset="0"
3427 id="stop5151" />
3428 <stop
3429 style="stop-color:#b4ffb4;stop-opacity:1;"
3430 offset="1"
3431 id="stop5153" />
3432 </linearGradient>
3433 <linearGradient
3434 inkscape:collect="always"
3435 xlink:href="#linearGradient5149"
3436 id="linearGradient5155"
3437 x1="209.40289"
3438 y1="196.93959"
3439 x2="402.86024"
3440 y2="201.399"
3441 gradientUnits="userSpaceOnUse"
3442 gradientTransform="matrix(0.3124104,-0.1174332,0.1174332,0.3124104,43.759948,18.537169)" />
3443 <linearGradient
3444 id="linearGradient3195">
3445 <stop
3446 style="stop-color:#0be254;stop-opacity:1;"
3447 offset="0"
3448 id="stop3197" />
3449 <stop
3450 style="stop-color:#ffffff;stop-opacity:1;"
3451 offset="1"
3452 id="stop3199" />
3453 </linearGradient>
3454 <linearGradient
3455 inkscape:collect="always"
3456 xlink:href="#linearGradient3195"
3457 id="linearGradient3201"
3458 x1="372.93762"
3459 y1="201.399"
3460 x2="191.42549"
3461 y2="205.16872"
3462 gradientUnits="userSpaceOnUse"
3463 gradientTransform="matrix(0.3124104,-0.1174332,0.1174332,0.3124104,43.759948,18.537169)" />
3464 <linearGradient
3465 inkscape:collect="always"
3466 xlink:href="#linearGradient2902"
3467 id="linearGradient2908"
3468 x1="-111.49345"
3469 y1="98.656258"
3470 x2="14.666016"
3471 y2="89.682144"
3472 gradientUnits="userSpaceOnUse" />
3473 <linearGradient
3474 inkscape:collect="always"
3475 xlink:href="#linearGradient2902"
3476 id="linearGradient2914"
3477 gradientUnits="userSpaceOnUse"
3478 x1="78.814453"
3479 y1="146.72023"
3480 x2="78.814453"
3481 y2="32.644054" />
3482 <linearGradient
3483 inkscape:collect="always"
3484 xlink:href="#linearGradient2902"
3485 id="linearGradient3669"
3486 gradientUnits="userSpaceOnUse"
3487 x1="78.814453"
3488 y1="194.78421"
3489 x2="78.814453"
3490 y2="18.507591" />
3491 <linearGradient
3492 inkscape:collect="always"
3493 xlink:href="#linearGradient2902"
3494 id="linearGradient9226"
3495 gradientUnits="userSpaceOnUse"
3496 x1="78.814453"
3497 y1="194.78421"
3498 x2="78.814453"
3499 y2="18.507591" />
3500 <linearGradient
3501 inkscape:collect="always"
3502 xlink:href="#linearGradient2902"
3503 id="linearGradient9228"
3504 gradientUnits="userSpaceOnUse"
3505 x1="78.814453"
3506 y1="194.78421"
3507 x2="78.814453"
3508 y2="18.507591" />
3509 <linearGradient
3510 inkscape:collect="always"
3511 xlink:href="#linearGradient2902"
3512 id="linearGradient9230"
3513 gradientUnits="userSpaceOnUse"
3514 x1="78.814453"
3515 y1="194.78421"
3516 x2="78.814453"
3517 y2="18.507591" />
3518 </defs>
3519 <sodipodi:namedview
3520 inkscape:window-height="764"
3521 inkscape:window-width="1264"
3522 inkscape:pageshadow="2"
3523 inkscape:pageopacity="0.0"
3524 guidetolerance="10.0"
3525 gridtolerance="10.0"
3526 objecttolerance="10.0"
3527 borderopacity="1.0"
3528 bordercolor="#666666"
3529 pagecolor="#ffffff"
3530 id="base"
3531 inkscape:zoom="5.2984309"
3532 inkscape:cx="189.61448"
3533 inkscape:cy="23.24562"
3534 inkscape:window-x="0"
3535 inkscape:window-y="0"
3536 inkscape:current-layer="g6" />
3537 <title
3538 id="title4">generated by pstoedit version:3.45 from Z:/asf_logo_1999.eps</title>
3539 <g
3540 xml:space="preserve"
3541 id="g6"
3542 transform="translate(-2.6691177,-22.954412)">
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705 <clipPath
3706 id="clippath1"> <path
3707 d="M 201.762,284.016 L 201.762,284.016 L 200.309,284.512 L 198.891,284.629 L 197.473,284.402 L 196.055,283.828 L 194.676,283.059 L 193.297,282.066 L 191.992,280.922 L 190.691,279.695 L 189.465,278.434 L 188.316,277.168 L 187.203,275.945 L 186.168,274.797 L 185.211,273.84 L 184.328,273.078 L 183.563,272.578 L 182.871,272.348 L 182.871,272.348 L 183.449,272.387 L 184.059,272.422 L 184.75,272.5 L 185.438,272.574 L 186.129,272.652 L 186.82,272.73 L 187.508,272.766 L 188.121,272.844 L 188.121,272.844 L 189.078,273.723 L 189.996,274.566 L 190.801,275.406 L 191.531,276.25 L 192.258,277.051 L 192.949,277.855 L 193.598,278.617 L 194.289,279.348 L 194.98,280.074 L 195.711,280.727 L 196.477,281.375 L 197.355,281.984 L 198.277,282.562 L 199.309,283.098 L 200.461,283.594 L 201.762,284.016"
3708 id="path657" />
3709 </clipPath>
3710 <path
3711 d="M 146.41619,62.696027 C 142.62485,65.569644 129.70379,77.249461 123.31721,81.838063 C 118.18169,85.530297 112.55653,88.526567 107.06272,91.260526 C 105.64199,91.967054 103.02634,93.1178 101.17315,93.67506 C 99.827079,93.854801 97.128165,94.825804 95.80891,94.506847 C 107.56985,90.318569 111.82247,88.150989 123.07148,80.688964 C 127.64352,77.674484 142.56869,64.986455 146.41619,62.696027 z "
3712 id="path8874"
3713 style="stroke:#000000;stroke-width:0.20699584" /><path
3714 d="M 136.08322,75.96456 L 135.84827,76.444287 L 135.52884,76.821806 L 135.12422,77.113635 L 134.63488,77.320749 L 134.10787,77.471469 L 133.52446,77.566277 L 132.92233,77.605399 L 132.30147,77.623884 L 131.69022,77.615944 L 131.0965,77.587142 L 130.52269,77.559544 L 129.98606,77.531729 L 129.51547,77.531972 L 129.11155,77.56102 L 128.80004,77.626292 L 128.57468,77.73983 L 128.72516,77.607097 L 128.88474,77.466238 L 129.07313,77.315738 L 129.26058,77.164773 L 129.44943,77.0143 L 129.6376,76.863585 L 129.81641,76.703506 L 129.9856,76.571758 L 130.43701,76.552809 L 130.8702,76.533371 L 131.27483,76.542489 L 131.66146,76.570577 L 132.03683,76.58881 L 132.40377,76.617369 L 132.75175,76.644745 L 133.10069,76.653861 L 133.44845,76.663222 L 133.78756,76.643787 L 134.13578,76.615231 L 134.5025,76.548532 L 134.86991,76.464317 L 135.2556,76.341686 L 135.65997,76.181138 L 136.08322,75.96456 z "
3715 id="path189"
3716 style="fill:url(#XMLID_1_);stroke:#000000;stroke-width:0.0882756" /><path
3717 d="M 137.20254,74.749258 L 136.85432,75.230434 L 136.47802,75.625449 L 136.08347,75.94608 L 135.66037,76.199765 L 135.21832,76.379263 L 134.77506,76.501913 L 134.30469,76.595989 L 133.83456,76.634152 L 133.35455,76.662957 L 132.87482,76.65359 L 132.37563,76.62575 L 131.89566,76.598397 L 131.4068,76.579448 L 130.91696,76.561459 L 130.44683,76.562175 L 129.9858,76.590236 L 130.08852,76.505751 L 130.19245,76.401822 L 130.29637,76.317112 L 130.40844,76.223287 L 130.52171,76.128252 L 130.64433,76.04401 L 130.75617,75.949472 L 130.86943,75.856358 L 130.982,75.762036 L 131.10438,75.657889 L 131.22725,75.5734 L 131.34917,75.46972 L 131.4711,75.366015 L 131.59327,75.262087 L 131.70631,75.149042 L 131.82872,75.045852 L 132.04545,75.092174 L 132.30872,75.149067 L 132.62862,75.204972 L 132.96773,75.261373 L 133.35386,75.308188 L 133.74914,75.344902 L 134.16287,75.382355 L 134.57661,75.400589 L 134.99134,75.400589 L 135.39548,75.391965 L 135.79123,75.352841 L 136.14857,75.296443 L 136.47785,75.210994 L 136.76992,75.088833 L 137.014,74.937894 L 137.20259,74.749307 L 137.20254,74.749258 z "
3718 id="path208"
3719 style="fill:url(#XMLID_2_);stroke:#000000;stroke-width:0.0882756" /><path
3720 d="M 137.98324,73.986796 L 137.77612,74.250536 L 137.56903,74.495078 L 137.34342,74.702441 L 137.08952,74.881494 L 136.82604,75.051407 L 136.53442,75.174036 L 136.22436,75.277225 L 135.86631,75.353061 L 135.48976,75.40993 L 135.08539,75.43895 L 134.64238,75.429344 L 134.17249,75.411082 L 133.64498,75.354439 L 133.08052,75.280049 L 132.47839,75.186224 L 131.81938,75.055192 L 131.97897,74.894617 L 132.1393,74.75351 L 132.30898,74.621273 L 132.47819,74.489525 L 132.6476,74.358491 L 132.81609,74.225543 L 132.9853,74.056343 L 133.16386,73.87778 L 133.46576,73.876824 L 133.76645,73.895795 L 134.05902,73.905159 L 134.36907,73.933 L 134.67027,73.951483 L 134.97124,73.989402 L 135.27219,74.007142 L 135.57458,74.044569 L 135.87505,74.063764 L 136.17601,74.082488 L 136.48631,74.091855 L 136.77885,74.100748 L 137.07933,74.081555 L 137.38073,74.062365 L 137.68143,74.043393 L 137.98324,73.986796 z "
3721 id="path227"
3722 style="fill:url(#XMLID_3_);stroke:#000000;stroke-width:0.0882756" /><path
3723 d="M 138.7627,73.129575 L 138.67896,73.403388 L 138.51959,73.638103 L 138.29281,73.807768 L 138.00195,73.93064 L 137.6717,74.015131 L 137.28578,74.062167 L 136.88187,74.090722 L 136.43932,74.082564 L 135.99726,74.053764 L 135.53599,74.026878 L 135.09345,73.97935 L 134.66052,73.941922 L 134.2367,73.895379 L 133.85129,73.868028 L 133.50282,73.858441 L 133.20113,73.858686 L 134.30219,72.945284 L 134.56617,72.982247 L 134.83856,73.009872 L 135.10254,73.048746 L 135.36485,73.085461 L 135.62886,73.123379 L 135.90221,73.150978 L 136.16595,73.188186 L 136.43858,73.216497 L 136.71143,73.225366 L 136.99366,73.243602 L 137.26747,73.253922 L 137.55834,73.243356 L 137.84993,73.233992 L 138.15207,73.215044 L 138.45301,73.177594 L 138.7627,73.129575 z "
3724 id="path8879"
3725 style="fill:url(#XMLID_4_);stroke:#000000;stroke-width:0.0882756" /><path
3726 d="M 140.21128,71.246126 L 140.10759,71.726591 L 139.95783,72.122101 L 139.74182,72.451845 L 139.47784,72.715829 L 139.17688,72.922702 L 138.84811,73.083006 L 138.48115,73.185481 L 138.07652,73.252693 L 137.64408,73.271887 L 137.2008,73.28221 L 136.73955,73.233721 L 136.26892,73.197472 L 135.78916,73.131956 L 135.30006,73.057568 L 134.82008,72.973104 L 134.34922,72.897758 L 134.54673,72.756869 L 134.68759,72.634243 L 134.79104,72.549041 L 134.88535,72.473941 L 134.9792,72.399306 L 135.06392,72.314103 L 135.19565,72.200593 L 135.36485,72.049628 L 135.79804,72.124752 L 136.21131,72.180909 L 136.61666,72.228191 L 137.01191,72.227945 L 137.37886,72.218116 L 137.7367,72.199168 L 138.08443,72.151886 L 138.40482,72.094994 L 138.70552,72.019403 L 138.99759,71.9342 L 139.25128,71.830763 L 139.48673,71.727329 L 139.71257,71.614287 L 139.90096,71.500747 L 140.06991,71.368758 L 140.21128,71.246126 z "
3727 id="path265"
3728 style="fill:url(#XMLID_5_);stroke:#000000;stroke-width:0.0882756" /><path
3729 d="M 141.02917,70.313512 L 140.7652,70.652841 L 140.49354,70.963863 L 140.19188,71.227133 L 139.88204,71.462558 L 139.57174,71.661492 L 139.23312,71.830692 L 138.89401,71.962436 L 138.53642,72.056978 L 138.16972,72.142398 L 137.80227,72.189681 L 137.41658,72.218237 L 137.02182,72.21799 L 136.6268,72.199267 L 136.22169,72.153187 L 135.80673,72.096789 L 135.393,72.021932 L 135.59099,71.843149 L 135.8166,71.654024 L 136.03236,71.475708 L 136.23972,71.28707 L 136.42833,71.136351 L 136.57809,71.003891 L 136.69115,70.910064 L 136.74729,70.872148 L 136.78521,70.872615 L 136.87904,70.872367 L 137.02976,70.8719 L 137.21815,70.872638 L 137.43463,70.881755 L 137.69811,70.881758 L 137.9808,70.880798 L 138.29111,70.872419 L 138.62036,70.842191 L 138.96788,70.814347 L 139.3077,70.776895 L 139.66504,70.720003 L 140.02192,70.644412 L 140.36993,70.559456 L 140.70928,70.446191 L 141.02917,70.313512 z "
3730 id="path284"
3731 style="fill:url(#XMLID_6_);stroke:#000000;stroke-width:0.0882756" /><path
3732 d="M 142.25166,69.221573 L 142.04502,69.522765 L 141.8185,69.786745 L 141.53675,70.012811 L 141.23555,70.219931 L 140.89741,70.390085 L 140.54917,70.530457 L 140.18223,70.653088 L 139.78675,70.748116 L 139.40157,70.814592 L 139.00632,70.870523 L 138.61153,70.908689 L 138.21579,70.928131 L 137.82029,70.928618 L 137.44424,70.928864 L 137.08594,70.909892 L 136.73771,70.88183 L 136.85097,70.768561 L 137.00192,70.637285 L 137.16101,70.49593 L 137.32109,70.335846 L 137.4898,70.203119 L 137.65059,70.062479 L 137.78212,69.950168 L 137.90453,69.864724 L 137.92347,69.883672 L 138.00796,69.892319 L 138.18676,69.883917 L 138.42195,69.855114 L 138.71378,69.826336 L 139.05218,69.789374 L 139.42006,69.742094 L 139.79588,69.684517 L 140.19066,69.627157 L 140.58617,69.571471 L 140.96248,69.514826 L 141.31141,69.448106 L 141.63108,69.391485 L 141.90397,69.325474 L 142.11084,69.268364 L 142.25166,69.221573 z "
3733 id="path303"
3734 style="fill:url(#XMLID_7_);stroke:#000000;stroke-width:0.0882756" /><path
3735 d="M 143.52144,67.931146 L 143.52144,68.232088 L 143.38896,68.495606 L 143.14441,68.740639 L 142.80625,68.948251 L 142.39178,69.136644 L 141.92284,69.287826 L 141.4047,69.428691 L 140.87745,69.542693 L 140.33171,69.63679 L 139.80517,69.71312 L 139.30719,69.77913 L 138.85432,69.817267 L 138.46888,69.864796 L 138.16794,69.883524 L 137.97042,69.893847 L 137.88521,69.903208 L 138.01696,69.790186 L 138.15807,69.666821 L 138.3083,69.534827 L 138.45902,69.404263 L 138.62774,69.271068 L 138.78831,69.129713 L 138.94836,68.989316 L 139.11802,68.838377 L 139.39018,68.772121 L 139.66281,68.70685 L 139.9453,68.648535 L 140.20927,68.593338 L 140.49081,68.535983 L 140.77374,68.497816 L 141.05549,68.441663 L 141.32884,68.393917 L 141.62042,68.346168 L 141.89353,68.299597 L 142.17552,68.243195 L 142.44815,68.195669 L 142.72078,68.129905 L 142.99439,68.063675 L 143.25741,68.00654 L 143.52144,67.931146 z "
3736 id="path8884"
3737 style="fill:url(#XMLID_8_);stroke:#000000;stroke-width:0.0882756" /><path
3738 d="M 144.48045,67.008161 L 144.37655,67.224864 L 144.24527,67.412759 L 144.09454,67.581713 L 143.90687,67.715132 L 143.69929,67.827215 L 143.46482,67.931636 L 143.19218,68.016127 L 142.89051,68.091716 L 142.55186,68.167308 L 142.18491,68.233316 L 141.78029,68.300036 L 141.33798,68.384772 L 140.84914,68.479089 L 140.32189,68.573386 L 139.75743,68.686649 L 139.14521,68.828497 L 139.31537,68.677556 L 139.50353,68.508607 L 139.66313,68.348524 L 139.83234,68.198051 L 139.98255,68.066062 L 140.10471,67.962626 L 140.18967,67.896889 L 140.2271,67.8772 L 140.27436,67.886565 L 140.38741,67.88681 L 140.54771,67.8772 L 140.76344,67.867839 L 141.02647,67.848645 L 141.32884,67.829922 L 141.63868,67.800872 L 141.98716,67.753834 L 142.33514,67.706307 L 142.69272,67.649661 L 143.03113,67.573823 L 143.37982,67.489115 L 143.69898,67.394553 L 143.99994,67.281754 L 144.26321,67.149517 L 144.48045,67.008161 z "
3739 id="path337"
3740 style="fill:url(#XMLID_9_);stroke:#000000;stroke-width:0.0882756" /><path
3741 d="M 145.06337,66.217387 L 144.93186,66.517867 L 144.73387,66.790721 L 144.47085,67.018211 L 144.15117,67.206359 L 143.79358,67.3758 L 143.41729,67.507326 L 143.00379,67.621797 L 142.58933,67.696921 L 142.18472,67.763152 L 141.77987,67.810432 L 141.4038,67.848596 L 141.0551,67.857468 L 140.75391,67.877374 L 140.50961,67.876418 L 140.33178,67.886739 L 140.21756,67.886736 L 140.37764,67.745383 L 140.5293,67.613389 L 140.67881,67.482114 L 140.83841,67.35899 L 140.98,67.237542 L 141.13094,67.104836 L 141.29918,66.973067 L 141.45926,66.831714 L 141.6755,66.822351 L 141.9018,66.802421 L 142.1178,66.793301 L 142.35347,66.783718 L 142.59824,66.783716 L 142.83319,66.774353 L 143.07773,66.755409 L 143.32253,66.73572 L 143.55821,66.707906 L 143.79292,66.680058 L 144.02834,66.631846 L 144.24387,66.584806 L 144.47068,66.509682 L 144.67705,66.433846 L 144.87529,66.330654 L 145.06337,66.217387 z "
3742 id="path352"
3743 style="fill:url(#XMLID_10_);stroke:#000000;stroke-width:0.0882756" /><path
3744 d="M 145.62687,65.125916 L 145.5705,65.445344 L 145.46777,65.717486 L 145.31729,65.943797 L 145.13852,66.141797 L 144.91267,66.293228 L 144.66741,66.424262 L 144.39503,66.528163 L 144.09458,66.603508 L 143.79292,66.660401 L 143.45455,66.698318 L 143.12456,66.726875 L 142.78569,66.746069 L 142.44778,66.765288 L 142.10845,66.765288 L 141.77942,66.77514 L 141.45857,66.793621 L 141.57234,66.700015 L 141.7041,66.605697 L 141.83538,66.492653 L 141.97651,66.370736 L 142.1361,66.247613 L 142.29618,66.106753 L 142.47472,65.964685 L 142.63431,65.823329 L 142.79486,65.889314 L 142.97318,65.936594 L 143.14333,65.973804 L 143.33197,65.992529 L 143.51964,65.992995 L 143.71787,65.982921 L 143.91467,65.973333 L 144.1117,65.926544 L 144.31017,65.879263 L 144.50793,65.813035 L 144.71455,65.737933 L 144.90319,65.643365 L 145.09157,65.530815 L 145.27855,65.417768 L 145.45782,65.275949 L 145.62687,65.125916 z "
3745 id="path367"
3746 style="fill:url(#XMLID_11_);stroke:#000000;stroke-width:0.0882756" /><path
3747 d="M 146.2295,64.24009 L 146.13447,64.465664 L 146.01233,64.682856 L 145.86233,64.889482 L 145.66456,65.08773 L 145.46681,65.266757 L 145.23164,65.445319 L 144.97772,65.586918 L 144.72431,65.728027 L 144.45118,65.832177 L 144.17783,65.916886 L 143.91456,65.973287 L 143.64194,66.002579 L 143.37795,66.00209 L 143.12427,65.974488 L 142.87947,65.899117 L 142.66252,65.795462 L 142.78539,65.692248 L 142.90756,65.58832 L 143.04845,65.484884 L 143.18069,65.371374 L 143.31173,65.259064 L 143.45282,65.136192 L 143.60355,65.00469 L 143.77272,64.853259 L 143.93304,64.881571 L 144.10271,64.900543 L 144.26255,64.910124 L 144.41279,64.910127 L 144.56327,64.909907 L 144.70487,64.880611 L 144.85534,64.862597 L 144.9967,64.833552 L 145.13734,64.786982 L 145.28781,64.73009 L 145.43803,64.673446 L 145.58875,64.598566 L 145.7397,64.522482 L 145.8988,64.438238 L 146.05909,64.334558 L 146.2295,64.24009 z "
3748 id="path8889"
3749 style="fill:url(#XMLID_12_);stroke:#000000;stroke-width:0.0882756" /><path
3750 d="M 146.48292,63.722199 L 146.48266,63.835243 L 146.43489,63.957873 L 146.3509,64.080255 L 146.23785,64.193299 L 146.09675,64.315685 L 145.92828,64.428952 L 145.73003,64.532139 L 145.53299,64.636043 L 145.31604,64.72053 L 145.08974,64.796146 L 144.85526,64.862623 L 144.62871,64.900049 L 144.39353,64.928853 L 144.16818,64.928605 L 143.97042,64.900272 L 143.7727,64.853234 L 143.86701,64.779066 L 143.95172,64.712595 L 144.05515,64.627886 L 144.14923,64.552047 L 144.24333,64.477165 L 144.32755,64.410691 L 144.39355,64.345171 L 144.45932,64.278919 L 144.49698,64.316592 L 144.65637,64.306986 L 144.90164,64.269063 L 145.21242,64.203056 L 145.55968,64.09962 L 145.90887,63.995695 L 146.22829,63.863479 L 146.48292,63.722199 z "
3751 id="path397"
3752 style="fill:url(#XMLID_13_);stroke:#000000;stroke-width:0.0882756" /><path
3753 d="M 146.49156,63.18558 L 146.60411,63.336523 L 146.64133,63.468292 L 146.6046,63.581338 L 146.52014,63.703252 L 146.37904,63.807398 L 146.19064,63.901228 L 145.99362,63.985938 L 145.76754,64.061776 L 145.53186,64.127535 L 145.29715,64.17504 L 145.06218,64.222793 L 144.86372,64.250855 L 144.68518,64.278698 L 144.55344,64.297644 L 144.46897,64.307255 L 144.44088,64.297892 L 145.25852,63.647762 L 145.3521,63.629036 L 145.49417,63.581533 L 145.66337,63.525626 L 145.84238,63.458902 L 146.03993,63.393386 L 146.20865,63.317794 L 146.36921,63.251811 L 146.49156,63.18558 z "
3754 id="path412"
3755 style="fill:url(#XMLID_14_);stroke:#000000;stroke-width:0.0882756" /><path
3756 d="M 145.25849,63.647834 L 145.42818,63.497364 L 145.57891,63.365861 L 145.71951,63.24299 L 145.85102,63.129722 L 145.9739,63.026506 L 146.10492,62.913217 L 146.24626,62.809785 L 146.41617,62.69605 L 146.56638,62.90388 L 146.56686,63.072833 L 146.43511,63.205068 L 146.21888,63.327724 L 145.96423,63.411966 L 145.69138,63.497167 L 145.44659,63.572536 L 145.25849,63.647834 z "
3757 id="path427"
3758 style="fill:url(#XMLID_15_);stroke:#000000;stroke-width:0.0882756" /><path
3759 d="M 145.19322,63.526385 L 145.38137,63.393437 L 145.54096,63.271299 L 145.69121,63.177227 L 145.83256,63.072834 L 145.9732,62.988366 L 146.11432,62.903878 L 146.27486,62.800445 L 146.45364,62.697007 L 146.25564,62.480281 L 146.03919,62.433245 L 145.85991,62.517957 L 145.67295,62.688111 L 145.52201,62.913462 L 145.38139,63.149138 L 145.27796,63.365863 L 145.19322,63.526385 z "
3760 id="path8894"
3761 style="fill:url(#XMLID_16_);stroke:#000000;stroke-width:0.0882756" /><path
3762 d="M 131.57981,69.493713 L 131.00598,69.747379 L 130.50706,70.040409 L 130.11275,70.379026 L 129.78325,70.74598 L 129.52023,71.16068 L 129.32296,71.583542 L 129.18183,72.045281 L 129.06928,72.515889 L 128.9944,73.005938 L 128.94734,73.504634 L 128.91111,74.012206 L 128.89192,74.520982 L 128.86383,75.019436 L 128.82711,75.508772 L 128.78008,75.989211 L 128.70568,76.460063 L 128.81797,76.383733 L 128.93078,76.28966 L 129.04382,76.195834 L 129.15662,76.120487 L 129.27016,76.026172 L 129.38295,75.932095 L 129.50462,75.828168 L 129.61768,75.734342 L 129.73,75.63931 L 129.85334,75.536122 L 129.98462,75.442297 L 130.09742,75.347732 L 130.22051,75.244293 L 130.35106,75.150223 L 130.48256,75.036955 L 130.61477,74.942639 L 130.57732,74.717528 L 130.53844,74.435314 L 130.4919,74.105568 L 130.47246,73.747731 L 130.4439,73.362051 L 130.43453,72.957179 L 130.44317,72.533829 L 130.46164,72.119348 L 130.50917,71.695531 L 130.57445,71.292354 L 130.65844,70.905986 L 130.78131,70.538788 L 130.92193,70.209042 L 131.10071,69.916971 L 131.32607,69.672398 L 131.57981,69.493713 z "
3763 id="path461"
3764 style="fill:url(#XMLID_17_);stroke:#000000;stroke-width:0.0882756" /><path
3765 d="M 132.44545,68.946309 L 132.12482,69.097518 L 131.84307,69.267898 L 131.5899,69.44621 L 131.34416,69.654065 L 131.14735,69.888758 L 130.96882,70.16186 L 130.81835,70.443856 L 130.69646,70.7736 L 130.59348,71.140556 L 130.52772,71.546138 L 130.48045,71.987947 L 130.46246,72.477771 L 130.46295,73.02304 L 130.50109,73.606942 L 130.55823,74.246748 L 130.63357,74.943201 L 130.81284,74.783148 L 130.98083,74.650688 L 131.13205,74.518693 L 131.28206,74.406141 L 131.43277,74.292878 L 131.59236,74.170739 L 131.78075,74.038033 L 131.99745,73.878938 L 132.02579,73.568382 L 132.03491,73.257832 L 132.0438,72.927837 L 132.03373,72.61682 L 132.03373,72.297641 L 132.03347,71.978215 L 132.02365,71.666947 L 132.02365,71.346786 L 132.03325,71.036235 L 132.06111,70.726174 L 132.07959,70.424488 L 132.10767,70.114185 L 132.16386,69.813238 L 132.24832,69.520919 L 132.33302,69.229339 L 132.44545,68.946309 z "
3766 id="path480"
3767 style="fill:url(#XMLID_18_);stroke:#000000;stroke-width:0.0882756" /><path
3768 d="M 133.44239,68.36263 L 133.1136,68.390964 L 132.83978,68.514086 L 132.60532,68.712084 L 132.43637,68.975112 L 132.29501,69.30461 L 132.19228,69.671808 L 132.11694,70.085552 L 132.0699,70.5281 L 132.04159,70.989369 L 132.02382,71.45976 L 132.01516,71.92103 L 132.02455,72.382519 L 132.03414,72.805383 L 132.02552,73.209515 L 132.01616,73.5587 L 131.9876,73.869475 L 133.13572,72.96519 L 133.11673,72.682729 L 133.0973,72.399797 L 133.06897,72.126698 L 133.04977,71.845436 L 133.03105,71.562728 L 133.00224,71.271365 L 132.99215,70.979537 L 132.98207,70.706411 L 132.99168,70.414587 L 133.00129,70.123249 L 133.02936,69.83071 L 133.07593,69.539374 L 133.13255,69.238431 L 133.21677,68.946357 L 133.32019,68.654778 L 133.44239,68.36263 z "
3769 id="path499"
3770 style="fill:url(#XMLID_19_);stroke:#000000;stroke-width:0.0882756" /><path
3771 d="M 135.64356,67.476607 L 135.08824,67.485724 L 134.6181,67.561783 L 134.22308,67.694241 L 133.88421,67.882414 L 133.61158,68.136812 L 133.39534,68.428886 L 133.23601,68.777354 L 133.1136,69.14453 L 133.03874,69.558274 L 133.00154,69.990496 L 132.98305,70.461598 L 133.00202,70.932209 L 133.02125,71.421788 L 133.069,71.920244 L 133.11649,72.418941 L 133.16355,72.917641 L 133.35266,72.748196 L 133.51154,72.606619 L 133.64353,72.493844 L 133.77552,72.399306 L 133.90679,72.30548 L 134.04791,72.201329 L 134.20751,72.080128 L 134.40525,71.937567 L 134.34837,71.486152 L 134.31069,71.072188 L 134.30059,70.667559 L 134.30996,70.300631 L 134.35652,69.953118 L 134.41243,69.632736 L 134.47794,69.340688 L 134.57276,69.058224 L 134.6757,68.803603 L 134.7885,68.578004 L 134.92001,68.35194 L 135.05201,68.14507 L 135.19167,67.966043 L 135.33398,67.787258 L 135.49357,67.627668 L 135.64356,67.476607 z "
3772 id="path8899"
3773 style="fill:url(#XMLID_20_);stroke:#000000;stroke-width:0.0882756" /><path
3774 d="M 136.99879,66.628254 L 136.57426,66.826258 L 136.19819,67.033349 L 135.86004,67.278361 L 135.56845,67.532048 L 135.2956,67.805395 L 135.0705,68.106833 L 134.87296,68.418097 L 134.71337,68.748086 L 134.57177,69.095576 L 134.47794,69.453389 L 134.39348,69.83929 L 134.34718,70.224474 L 134.31912,70.629816 L 134.31954,71.04309 L 134.34809,71.467648 L 134.39561,71.909699 L 134.62119,71.740749 L 134.87437,71.54228 L 135.12899,71.344282 L 135.38361,71.146278 L 135.61832,70.967251 L 135.7971,70.826854 L 135.92885,70.713343 L 135.9754,70.665814 L 135.9754,70.628855 L 135.985,70.543901 L 135.98525,70.412126 L 135.98453,70.242933 L 135.98526,70.035814 L 136.0028,69.809992 L 136.02224,69.546723 L 136.04983,69.254649 L 136.10598,68.953238 L 136.16213,68.633594 L 136.24662,68.303355 L 136.35055,67.956089 L 136.47247,67.625872 L 136.62247,67.287724 L 136.79119,66.948121 L 136.99879,66.628254 z "
3775 id="path537"
3776 style="fill:url(#XMLID_21_);stroke:#000000;stroke-width:0.0882756" /><path
3777 d="M 138.55942,65.403567 L 138.20254,65.554995 L 137.8728,65.734051 L 137.5721,65.978129 L 137.3086,66.242113 L 137.06455,66.544262 L 136.84832,66.872803 L 136.66976,67.221273 L 136.50034,67.598056 L 136.36882,67.974347 L 136.2469,68.360249 L 136.16244,68.76487 L 136.07822,69.169252 L 136.03192,69.556358 L 135.99399,69.9514 L 135.97574,70.327225 L 135.9767,70.684814 L 136.10748,70.57226 L 136.27669,70.440512 L 136.46461,70.289549 L 136.66233,70.129715 L 136.85984,69.969167 L 137.03887,69.82759 L 137.18885,69.695845 L 137.29254,69.592162 L 137.27383,69.572968 L 137.27383,69.478898 L 137.30143,69.319775 L 137.34798,69.0841 L 137.41423,68.811468 L 137.48911,68.490838 L 137.58294,68.133028 L 137.68687,67.767034 L 137.78889,67.399612 L 137.90142,67.022585 L 138.02477,66.65541 L 138.13756,66.318001 L 138.25037,66.015134 L 138.36221,65.752603 L 138.46611,65.553647 L 138.55942,65.403567 z "
3778 id="path556"
3779 style="fill:url(#XMLID_22_);stroke:#000000;stroke-width:0.0882756" /><path
3780 d="M 140.14937,64.357952 L 139.8105,64.283071 L 139.49037,64.358443 L 139.1995,64.557157 L 138.9175,64.839155 L 138.67271,65.215418 L 138.43753,65.657965 L 138.22154,66.157157 L 138.03389,66.664974 L 137.85556,67.201834 L 137.71467,67.719016 L 137.58343,68.209307 L 137.47953,68.670333 L 137.39625,69.055989 L 137.32977,69.348063 L 137.29184,69.553974 L 137.26401,69.639419 L 137.42455,69.535986 L 137.59351,69.404925 L 137.77204,69.262368 L 137.95084,69.102284 L 138.1486,68.942919 L 138.34611,68.78188 L 138.54337,68.641236 L 138.74113,68.500104 L 138.8352,68.236121 L 138.9187,67.981721 L 139.00438,67.70884 L 139.07876,67.445323 L 139.16347,67.190701 L 139.2386,66.927433 L 139.32306,66.654328 L 139.39844,66.391306 L 139.48196,66.135946 L 139.55709,65.873166 L 139.65187,65.609899 L 139.74572,65.365352 L 139.82972,65.110458 L 139.93341,64.857021 L 140.03567,64.602153 L 140.14937,64.357952 z "
3781 id="path575"
3782 style="fill:url(#XMLID_23_);stroke:#000000;stroke-width:0.0882756" /><path
3783 d="M 142.28431,63.123433 L 141.92672,63.180567 L 141.60681,63.330575 L 141.3253,63.538649 L 141.06253,63.801921 L 140.8355,64.122084 L 140.63874,64.489748 L 140.46928,64.866062 L 140.31978,65.261325 L 140.20648,65.656836 L 140.09417,66.051854 L 140.01932,66.428633 L 139.95356,66.757886 L 139.89762,67.059079 L 139.86955,67.313234 L 139.8415,67.491549 L 139.83262,67.595478 L 140.00133,67.463238 L 140.16139,67.341075 L 140.30177,67.238132 L 140.43403,67.124129 L 140.57488,67.001499 L 140.71553,66.89831 L 140.87585,66.775437 L 141.04409,66.643664 L 141.09208,66.427676 L 141.1384,66.210482 L 141.1763,65.966185 L 141.22334,65.731002 L 141.26989,65.477093 L 141.31716,65.222941 L 141.36372,64.968539 L 141.42011,64.723997 L 141.47675,64.478713 L 141.56029,64.243999 L 141.64571,64.008817 L 141.73932,63.800985 L 141.85209,63.595072 L 141.97447,63.415088 L 142.12469,63.264861 L 142.28431,63.123433 z "
3784 id="path8904"
3785 style="fill:url(#XMLID_24_);stroke:#000000;stroke-width:0.0882756" /><path
3786 d="M 141.39156,63.472395 L 141.12707,63.529019 L 140.88324,63.623087 L 140.68503,63.745469 L 140.49759,63.896435 L 140.33726,64.074999 L 140.19591,64.291209 L 140.06393,64.555658 L 139.93361,64.837162 L 139.81121,65.167404 L 139.68834,65.515873 L 139.56692,65.9205 L 139.43588,66.353461 L 139.28494,66.824069 L 139.11648,67.332844 L 138.93817,67.886958 L 138.72171,68.48069 L 138.88227,68.3398 L 139.06969,68.188347 L 139.25809,68.03834 L 139.45559,67.896025 L 139.6255,67.764502 L 139.757,67.671385 L 139.85061,67.595084 L 139.87963,67.548759 L 139.87867,67.51084 L 139.89812,67.397821 L 139.90698,67.237272 L 139.93531,67.039986 L 139.97248,66.79544 L 140.02913,66.513194 L 140.08527,66.211296 L 140.15991,65.891402 L 140.25352,65.553032 L 140.34806,65.214166 L 140.47838,64.875058 L 140.61924,64.545557 L 140.78028,64.23525 L 140.96723,63.951827 L 141.16593,63.698413 L 141.39156,63.472395 z "
3787 id="path605"
3788 style="fill:url(#XMLID_25_);stroke:#000000;stroke-width:0.0882756" /><path
3789 d="M 143.51738,62.849373 L 143.15933,62.831136 L 142.8399,62.868342 L 142.5668,62.953791 L 142.32225,63.085536 L 142.12475,63.264811 L 141.94597,63.481044 L 141.78588,63.735691 L 141.65486,63.999674 L 141.55216,64.309737 L 141.45759,64.610681 L 141.38246,64.949298 L 141.30782,65.288899 L 141.23246,65.627763 L 141.1763,65.966133 L 141.12063,66.305955 L 141.04527,66.625136 L 141.1768,66.531065 L 141.30854,66.418042 L 141.47821,66.305488 L 141.6467,66.172539 L 141.82572,66.032387 L 142.01338,65.880711 L 142.1924,65.740072 L 142.37143,65.598003 L 142.30542,65.419686 L 142.26775,65.249998 L 142.23895,65.071435 L 142.2392,64.883045 L 142.24878,64.704483 L 142.28549,64.535308 L 142.34189,64.345958 L 142.40765,64.167397 L 142.49237,63.988618 L 142.59509,63.810054 L 142.7086,63.640365 L 142.83964,63.470452 L 142.99035,63.301503 L 143.15089,63.141447 L 143.32871,62.999599 L 143.51738,62.849373 z "
3790 id="path620"
3791 style="fill:url(#XMLID_26_);stroke:#000000;stroke-width:0.0882756" /><path
3792 d="M 145.29545,62.311059 L 145.1728,62.283462 L 145.03192,62.292823 L 144.87233,62.340596 L 144.72161,62.415476 L 144.56177,62.519134 L 144.40243,62.660736 L 144.25195,62.811698 L 144.09162,62.990263 L 143.95989,63.178183 L 143.82861,63.385796 L 143.72444,63.602251 L 143.64046,63.818264 L 143.57444,64.035455 L 143.53725,64.260806 L 143.53725,64.46817 L 143.55691,64.675042 L 143.65988,64.609525 L 143.78228,64.524569 L 143.90419,64.440105 L 144.02637,64.354411 L 144.1293,64.269704 L 144.21498,64.204186 L 144.27111,64.148035 L 144.29007,64.129085 L 144.30854,64.072217 L 144.34551,63.92197 L 144.43047,63.686321 L 144.53437,63.413934 L 144.67501,63.121616 L 144.85354,62.81106 L 145.05081,62.537958 L 145.29545,62.311059 z "
3793 id="path635"
3794 style="fill:url(#XMLID_27_);stroke:#000000;stroke-width:0.0882756" /><path
3795 d="M 145.9072,62.470673 L 145.76631,62.312043 L 145.61561,62.236205 L 145.47379,62.227331 L 145.32356,62.283489 L 145.18222,62.386456 L 145.05094,62.537885 L 144.91895,62.707328 L 144.78766,62.914446 L 144.68422,63.130682 L 144.58126,63.347381 L 144.48718,63.554747 L 144.4123,63.742917 L 144.3559,63.912607 L 144.29975,64.044598 L 144.29015,64.12906 L 144.27982,64.157127 L 145.16417,63.534841 L 145.20281,63.441014 L 145.2772,63.309024 L 145.35232,63.158547 L 145.44639,62.988638 L 145.54983,62.828579 L 145.66312,62.678821 L 145.78525,62.556195 L 145.9072,62.470673 z "
3796 id="path8909"
3797 style="fill:url(#XMLID_28_);stroke:#000000;stroke-width:0.0882756" /><path
3798 d="M 144.65565,62.462758 L 144.38279,62.490848 L 144.12864,62.55686 L 143.86562,62.651886 L 143.611,62.792774 L 143.37531,62.953103 L 143.15092,63.123237 L 142.94381,63.330842 L 142.75495,63.555239 L 142.59584,63.791156 L 142.46457,64.054894 L 142.36088,64.309269 L 142.28552,64.572784 L 142.2392,64.84564 L 142.23897,65.108912 L 142.28625,65.363066 L 142.3712,65.617001 L 142.50345,65.503978 L 142.6347,65.41062 L 142.76645,65.296618 L 142.91692,65.183596 L 143.06764,65.071264 L 143.21786,64.93927 L 143.39664,64.817112 L 143.57424,64.675043 L 143.54736,64.534424 L 143.53754,64.393068 L 143.54713,64.251959 L 143.56514,64.10149 L 143.60256,63.950773 L 143.64936,63.790688 L 143.706,63.639482 L 143.78159,63.489476 L 143.85646,63.33876 L 143.95033,63.188283 L 144.04464,63.056516 L 144.15743,62.924055 L 144.27048,62.792776 L 144.39216,62.670616 L 144.52415,62.557348 L 144.65565,62.462758 z "
3799 id="path665"
3800 style="fill:url(#XMLID_29_);stroke:#000000;stroke-width:0.0882756" /><path
3801 d="M 106.64878,91.467152 L 106.65864,91.514901 L 106.68669,91.637529 L 106.74334,91.825208 L 106.80955,92.06039 L 106.89332,92.333737 L 106.9975,92.625563 L 107.13861,92.936584 L 107.28982,93.236569 L 107.46958,93.528151 L 107.6762,93.773654 L 107.91188,93.989177 L 108.16603,94.148765 L 108.45836,94.233721 L 108.78714,94.242347 L 109.14449,94.148053 L 109.53112,93.950054 L 109.34153,93.855513 L 109.1625,93.734089 L 108.98443,93.611706 L 108.79626,93.460988 L 108.61721,93.301152 L 108.43889,93.122837 L 108.27834,92.944054 L 108.10843,92.755909 L 107.94813,92.558154 L 107.78829,92.36185 L 107.64719,92.16363 L 107.52453,91.96612 L 107.39229,91.778196 L 107.28886,91.598922 L 107.19407,91.41149 L 107.10986,91.250942 L 107.04433,91.279498 L 106.99658,91.307587 L 106.93107,91.335651 L 106.88379,91.365191 L 106.82717,91.383916 L 106.7806,91.411268 L 106.71437,91.439111 L 106.64878,91.467152 z "
3802 id="path686"
3803 style="fill:url(#XMLID_30_);stroke:#000000;stroke-width:0.0882756" /><path
3804 d="M 107.1002,91.279228 L 107.48706,91.928644 L 107.86362,92.492859 L 108.22167,92.944054 L 108.57015,93.310762 L 108.89964,93.602367 L 109.21978,93.809214 L 109.5212,93.959932 L 109.79407,94.06241 L 110.05783,94.100082 L 110.30191,94.099838 L 110.50949,94.062656 L 110.69668,94.006502 L 110.87643,93.920834 L 111.00795,93.82676 L 111.12026,93.732688 L 111.21457,93.638369 L 111.00745,93.544051 L 110.80059,93.449979 L 110.58411,93.328063 L 110.37675,93.196536 L 110.15092,93.045574 L 109.9342,92.886942 L 109.71725,92.727107 L 109.50151,92.548325 L 109.29343,92.3604 L 109.08607,92.153529 L 108.89864,91.945945 L 108.70951,91.720838 L 108.54009,91.495241 L 108.37114,91.249959 L 108.22931,91.015489 L 108.08796,90.761336 L 107.93773,90.835724 L 107.78655,90.912054 L 107.64544,90.978307 L 107.51441,91.052721 L 107.40111,91.110324 L 107.2792,91.175619 L 107.18535,91.232019 L 107.1002,91.279228 z "
3805 id="path707"
3806 style="fill:url(#XMLID_31_);stroke:#000000;stroke-width:0.0882756" /><path
3807 d="M 108.09782,90.751481 L 108.22885,90.996247 L 108.40883,91.287607 L 108.63441,91.58951 L 108.90797,91.898613 L 109.21804,92.228827 L 109.5382,92.548005 L 109.88642,92.839119 L 110.24497,93.103104 L 110.60327,93.327965 L 110.96061,93.515888 L 111.31843,93.628442 L 111.64721,93.675945 L 111.96807,93.637313 L 112.24907,93.505567 L 112.49483,93.279477 L 112.69186,92.931743 L 112.41828,92.865514 L 112.13628,92.790387 L 111.86293,92.687444 L 111.60903,92.584229 L 111.35537,92.443365 L 111.10986,92.311594 L 110.87467,92.151268 L 110.639,91.991433 L 110.42255,91.812871 L 110.20656,91.615608 L 110.00881,91.417854 L 109.81992,91.192994 L 109.63178,90.967397 L 109.4712,90.731969 L 109.30106,90.486957 L 109.16019,90.233295 L 109.02869,90.28994 L 108.87871,90.365065 L 108.72777,90.440658 L 108.57705,90.516029 L 108.4268,90.591867 L 108.2953,90.648021 L 108.18154,90.704174 L 108.09782,90.751481 z "
3808 id="path728"
3809 style="fill:url(#XMLID_32_);stroke:#000000;stroke-width:0.0882756" /><path
3810 d="M 114.39445,92.261265 L 114.1595,92.535571 L 113.87655,92.742688 L 113.56696,92.865071 L 113.23768,92.931301 L 112.87002,92.939926 L 112.49373,92.88424 L 112.11768,92.771686 L 111.72242,92.621681 L 111.33626,92.424173 L 110.94941,92.188991 L 110.5827,91.916848 L 110.24359,91.616125 L 109.92368,91.2967 L 109.63163,90.948474 L 109.37725,90.5813 L 109.16943,90.204984 L 109.31054,90.138731 L 109.45285,90.072255 L 109.59303,89.988258 L 109.74421,89.911928 L 109.89468,89.837295 L 110.05452,89.752806 L 110.20499,89.677189 L 110.36457,89.592455 L 110.56209,89.789963 L 110.77832,90.005706 L 111.03365,90.24138 L 111.29716,90.485679 L 111.56069,90.729978 L 111.83357,90.984132 L 112.135,91.228455 L 112.4165,91.454274 L 112.70921,91.670411 L 112.98183,91.868165 L 113.265,92.036626 L 113.52877,92.168864 L 113.77307,92.261977 L 114.00921,92.309482 L 114.21559,92.30899 L 114.39445,92.261265 z "
3811 id="path749"
3812 style="fill:url(#XMLID_33_);stroke:#000000;stroke-width:0.0882756" /><path
3813 d="M 110.383,89.592356 L 110.515,89.535711 L 110.63716,89.469728 L 110.74971,89.413795 L 110.86345,89.337957 L 110.98537,89.272219 L 111.0984,89.214862 L 111.22128,89.149099 L 111.35232,89.093412 L 111.55941,89.262611 L 111.77587,89.458915 L 112.01154,89.67537 L 112.25633,89.901435 L 112.49201,90.136617 L 112.74569,90.389812 L 113.01038,90.634848 L 113.26432,90.869563 L 113.52808,91.095874 L 113.80069,91.311863 L 114.07403,91.490154 L 114.32844,91.65048 L 114.60035,91.772643 L 114.86387,91.866248 L 115.11898,91.913997 L 115.37265,91.904167 L 115.0345,92.110794 L 114.7146,92.242539 L 114.39519,92.299897 L 114.08414,92.309014 L 113.77312,92.261979 L 113.46303,92.159746 L 113.16208,92.009053 L 112.8693,91.830492 L 112.56737,91.605386 L 112.26689,91.360103 L 111.97458,91.087002 L 111.66332,90.79638 L 111.35325,90.504552 L 111.03308,90.203608 L 110.72229,89.893301 L 110.383,89.592356 z "
3814 id="path766"
3815 style="fill:url(#XMLID_34_);stroke:#000000;stroke-width:0.0882756" /><path
3816 d="M 116.89653,91.111725 L 116.50199,91.469806 L 116.10673,91.715309 L 115.72154,91.856172 L 115.34476,91.912793 L 114.96844,91.875366 L 114.60964,91.763304 L 114.24348,91.602977 L 113.88543,91.377379 L 113.54631,91.114108 L 113.20841,90.832382 L 112.8777,90.521829 L 112.56739,90.212013 L 112.24653,89.89163 L 111.9458,89.591398 L 111.64436,89.327414 L 111.3619,89.083336 L 111.49343,89.007989 L 111.61534,88.94176 L 111.72838,88.886319 L 111.85125,88.819131 L 111.97293,88.753148 L 112.08548,88.697215 L 112.19875,88.621844 L 112.33077,88.546005 L 112.349,88.583458 L 112.39653,88.667947 L 112.48126,88.791067 L 112.58516,88.951148 L 112.73588,89.138334 L 112.90533,89.346165 L 113.1122,89.572254 L 113.36682,89.807215 L 113.6493,90.033525 L 113.97856,90.26679 L 114.35608,90.473908 L 114.76022,90.671662 L 115.22196,90.831473 L 115.73913,90.97236 L 116.28487,91.065475 L 116.89653,91.111725 z "
3817 id="path787"
3818 style="fill:url(#XMLID_35_);stroke:#000000;stroke-width:0.0882756" /><path
3819 d="M 118.58105,90.235039 L 118.37369,90.536475 L 118.1013,90.772886 L 117.7718,90.932476 L 117.39502,91.045274 L 116.98199,91.102165 L 116.53944,91.111749 L 116.06904,91.056087 L 115.58834,90.953143 L 115.11797,90.801958 L 114.63776,90.604941 L 114.16641,90.360617 L 113.72458,90.070461 L 113.30963,89.750078 L 112.92324,89.383838 L 112.60333,88.989066 L 112.32108,88.555639 L 113.4968,87.943429 L 113.63839,88.103264 L 113.8083,88.29141 L 114.01446,88.478843 L 114.26021,88.685223 L 114.52325,88.893053 L 114.82467,89.099924 L 115.13548,89.316159 L 115.4837,89.5132 L 115.84154,89.700633 L 116.20871,89.860936 L 116.58525,90.010941 L 116.981,90.123985 L 117.37601,90.218058 L 117.78111,90.264603 L 118.18549,90.273475 L 118.58105,90.235039 z "
3820 id="path808"
3821 style="fill:url(#XMLID_36_);stroke:#000000;stroke-width:0.0882756" /><path
3822 d="M 113.45967,87.943159 L 113.6471,87.848864 L 113.87316,87.716873 L 114.13715,87.584414 L 114.39968,87.434188 L 114.66391,87.302417 L 114.91711,87.160595 L 115.13358,87.038188 L 115.29389,86.934506 L 115.5291,87.13226 L 115.77413,87.356654 L 116.02828,87.573847 L 116.28219,87.809029 L 116.55553,88.024772 L 116.82934,88.260691 L 117.10196,88.47668 L 117.38442,88.683798 L 117.66618,88.871967 L 117.9496,89.041166 L 118.25054,89.190435 L 118.5419,89.312816 L 118.84381,89.407355 L 119.13513,89.453927 L 119.42672,89.482015 L 119.72816,89.463778 L 119.5681,89.680017 L 119.36148,89.867915 L 119.11717,90.018632 L 118.83518,90.132144 L 118.51479,90.206778 L 118.1661,90.254527 L 117.79003,90.255682 L 117.36646,90.2084 L 116.94287,90.12369 L 116.4912,89.972481 L 116.0201,89.785515 L 115.52285,89.531607 L 115.02199,89.221299 L 114.51467,88.864669 L 113.98646,88.431954 L 113.45967,87.943159 z "
3823 id="path829"
3824 style="fill:url(#XMLID_37_);stroke:#000000;stroke-width:0.0882756" /><path
3825 d="M 121.14868,88.416151 L 120.7448,88.859902 L 120.3493,89.17957 L 119.93579,89.387623 L 119.52999,89.472334 L 119.12629,89.48194 L 118.72096,89.416424 L 118.33529,89.275069 L 117.9398,89.069157 L 117.56325,88.843805 L 117.18647,88.560603 L 116.83849,88.269735 L 116.48977,87.97862 L 116.16027,87.667846 L 115.84925,87.395212 L 115.55789,87.140812 L 115.28455,86.925069 L 115.44416,86.821633 L 115.58623,86.736186 L 115.72685,86.670914 L 115.85717,86.595788 L 115.99973,86.510833 L 116.13076,86.434507 L 116.29059,86.350507 L 116.4694,86.246114 L 116.75307,86.472671 L 117.02451,86.688903 L 117.29784,86.905139 L 117.57097,87.101443 L 117.83567,87.309029 L 118.09894,87.496951 L 118.3615,87.666372 L 118.62574,87.835101 L 118.89909,87.976458 L 119.18132,88.10749 L 119.4729,88.211419 L 119.77458,88.305737 L 120.10312,88.370785 L 120.43311,88.417577 L 120.78134,88.426917 L 121.14868,88.416151 z "
3826 id="path850"
3827 style="fill:url(#XMLID_38_);stroke:#000000;stroke-width:0.0882756" /><path
3828 d="M 122.49332,87.484272 L 122.13598,87.880001 L 121.75944,88.161974 L 121.3649,88.351078 L 120.95044,88.445864 L 120.53644,88.464344 L 120.11313,88.417307 L 119.68979,88.29635 L 119.26598,88.136513 L 118.85199,87.947634 L 118.44737,87.713407 L 118.07034,87.468398 L 117.70341,87.196965 L 117.35542,86.942567 L 117.04416,86.688412 L 116.75187,86.453229 L 116.5066,86.246853 L 116.66789,86.143171 L 116.80828,86.05797 L 116.93979,85.983088 L 117.06265,85.898111 L 117.18387,85.832594 L 117.3065,85.746434 L 117.4656,85.662189 L 117.63551,85.5489 L 118.05045,85.869284 L 118.45507,86.161109 L 118.84097,86.41455 L 119.20818,86.649733 L 119.5569,86.846281 L 119.90488,87.024842 L 120.22477,87.156589 L 120.53603,87.279217 L 120.82737,87.382407 L 121.11032,87.456574 L 121.37384,87.494494 L 121.61839,87.531675 L 121.86293,87.550623 L 122.07938,87.540549 L 122.29585,87.512237 L 122.49332,87.484272 z "
3829 id="path871"
3830 style="fill:url(#XMLID_39_);stroke:#000000;stroke-width:0.0882756" /><path
3831 d="M 123.70721,86.551186 L 123.36834,86.928436 L 123.00185,87.201783 L 122.61475,87.380591 L 122.20126,87.493633 L 121.78774,87.513072 L 121.36417,87.485007 L 120.93098,87.391156 L 120.49851,87.260613 L 120.07466,87.099823 L 119.66068,86.893194 L 119.26492,86.667843 L 118.87926,86.433375 L 118.52082,86.207777 L 118.19107,85.972593 L 117.9002,85.756581 L 117.63548,85.568189 L 117.76701,85.474117 L 117.90862,85.389628 L 118.05021,85.304206 L 118.19132,85.219718 L 118.33314,85.134518 L 118.47329,85.049559 L 118.61441,84.964608 L 118.76538,84.871244 L 118.95281,85.03995 L 119.17913,85.20915 L 119.42392,85.378597 L 119.71553,85.538183 L 120.02629,85.716498 L 120.35531,85.857854 L 120.70378,86.017201 L 121.05272,86.158776 L 121.42015,86.280935 L 121.78731,86.384127 L 122.13458,86.468834 L 122.48278,86.533396 L 122.8219,86.590285 L 123.14252,86.608987 L 123.44275,86.590039 L 123.70721,86.551186 z "
3832 id="path892"
3833 style="fill:url(#XMLID_40_);stroke:#000000;stroke-width:0.0882756" /><path
3834 d="M 125.09819,85.591244 L 124.77951,86.004986 L 124.39337,86.297057 L 123.96089,86.485694 L 123.49003,86.580259 L 122.99156,86.609257 L 122.48303,86.553348 L 121.95697,86.440059 L 121.43837,86.299663 L 120.93993,86.102644 L 120.46857,85.895283 L 120.03562,85.688412 L 119.63124,85.47267 L 119.31061,85.26533 L 119.04709,85.096376 L 118.85916,84.964629 L 118.76531,84.87127 L 118.93352,84.757758 L 119.09383,84.654078 L 119.26279,84.559982 L 119.42284,84.456544 L 119.59202,84.362224 L 119.74273,84.267193 L 119.89321,84.173342 L 120.03503,84.069908 L 120.24215,84.182951 L 120.47683,84.324059 L 120.74922,84.483651 L 121.06073,84.661718 L 121.39073,84.840529 L 121.73825,85.019556 L 122.09533,85.188017 L 122.47332,85.357707 L 122.84962,85.507464 L 123.21633,85.629847 L 123.58398,85.733531 L 123.93196,85.799292 L 124.27058,85.816817 L 124.57153,85.798089 L 124.85398,85.722989 L 125.09819,85.591244 z "
3835 id="path913"
3836 style="fill:url(#XMLID_41_);stroke:#000000;stroke-width:0.0882756" /><path
3837 d="M 125.90718,84.687204 L 125.64414,85.100945 L 125.3439,85.401669 L 125.00479,85.609746 L 124.6191,85.732377 L 124.22337,85.770048 L 123.80913,85.751322 L 123.36802,85.667327 L 122.925,85.54492 L 122.49134,85.376213 L 122.05914,85.188777 L 121.65404,84.972322 L 121.25804,84.765452 L 120.9007,84.558801 L 120.58943,84.361269 L 120.30719,84.192315 L 120.08206,84.060322 L 120.20373,83.976106 L 120.31702,83.88154 L 120.42886,83.806661 L 120.53279,83.740184 L 120.64584,83.664596 L 120.74999,83.598832 L 120.8431,83.542186 L 120.93788,83.467553 L 121.18292,83.654493 L 121.45531,83.833744 L 121.73777,84.001982 L 122.01063,84.163021 L 122.31207,84.312779 L 122.61374,84.444061 L 122.91494,84.557594 L 123.23507,84.651202 L 123.55499,84.745028 L 123.88449,84.810547 L 124.22357,84.84726 L 124.54278,84.866455 L 124.89171,84.856845 L 125.2205,84.828509 L 125.55961,84.771621 L 125.90718,84.687204 z "
3838 id="path934"
3839 style="fill:url(#XMLID_42_);stroke:#000000;stroke-width:0.0882756" /><path
3840 d="M 127.14889,83.782454 L 126.86739,84.064944 L 126.56574,84.309981 L 126.23694,84.507244 L 125.88775,84.667789 L 125.52177,84.771716 L 125.12625,84.847557 L 124.73195,84.866995 L 124.31846,84.866747 L 123.90302,84.81035 L 123.47968,84.726354 L 123.04602,84.594607 L 122.61382,84.444135 L 122.19001,84.246846 L 121.76618,84.012109 L 121.34211,83.758444 L 120.93795,83.467579 L 121.08819,83.373037 L 121.22953,83.269136 L 121.38959,83.165699 L 121.52974,83.062019 L 121.6711,82.959046 L 121.81149,82.873845 L 121.95332,82.769918 L 122.09443,82.68543 L 122.34905,82.826317 L 122.60295,82.986155 L 122.85758,83.127042 L 123.1026,83.276554 L 123.35674,83.417419 L 123.61016,83.55902 L 123.87365,83.671819 L 124.15611,83.784372 L 124.44769,83.869574 L 124.75871,83.95308 L 125.08846,83.999625 L 125.44555,84.018818 L 125.81322,84.009208 L 126.22671,83.971511 L 126.6786,83.895918 L 127.14889,83.782454 z "
3841 id="path955"
3842 style="fill:url(#XMLID_43_);stroke:#000000;stroke-width:0.0882756" /><path
3843 d="M 128.85116,82.434913 L 128.51278,82.83138 L 128.13671,83.169999 L 127.75106,83.442855 L 127.34621,83.660046 L 126.93246,83.829464 L 126.49953,83.943223 L 126.05698,84.008495 L 125.60604,84.00874 L 125.16396,83.981389 L 124.7118,83.906018 L 124.25919,83.794201 L 123.81809,83.634833 L 123.37578,83.436365 L 122.94214,83.211504 L 122.5284,82.948454 L 122.13169,82.66646 L 122.25433,82.581257 L 122.39618,82.497014 L 122.52651,82.403162 L 122.65824,82.308379 L 122.79024,82.214773 L 122.9412,82.119989 L 123.08209,82.016549 L 123.23328,81.921985 L 123.50542,82.119272 L 123.79724,82.29781 L 124.10925,82.457647 L 124.42868,82.588927 L 124.76658,82.702215 L 125.11554,82.804445 L 125.47336,82.88004 L 125.83982,82.927814 L 126.21708,82.965265 L 126.59267,82.964061 L 126.9697,82.944155 L 127.35513,82.897608 L 127.73167,82.821525 L 128.11757,82.718334 L 128.48477,82.595461 L 128.85116,82.434913 z "
3844 id="path976"
3845 style="fill:url(#XMLID_44_);stroke:#000000;stroke-width:0.0882756" /><path
3846 d="M 129.7818,81.690932 L 129.45278,81.983005 L 129.10506,82.237626 L 128.72827,82.464156 L 128.32439,82.642941 L 127.91929,82.784543 L 127.48732,82.897829 L 127.05414,82.972957 L 126.61183,83.001511 L 126.17001,82.992639 L 125.71881,82.955679 L 125.27482,82.870726 L 124.84234,82.757929 L 124.41973,82.598803 L 124.00549,82.410658 L 123.60878,82.184842 L 123.2332,81.92206 L 123.35437,81.837353 L 123.45806,81.751904 L 123.55263,81.676557 L 123.64648,81.601922 L 123.74056,81.526083 L 123.85333,81.451205 L 123.98436,81.357133 L 124.13532,81.24313 L 124.54066,81.384482 L 124.9069,81.525125 L 125.26498,81.657116 L 125.59449,81.779253 L 125.924,81.882693 L 126.22592,81.976295 L 126.52686,82.050709 L 126.8278,82.106886 L 127.13882,82.153924 L 127.44913,82.162798 L 127.78799,82.163263 L 128.12612,82.125591 L 128.50315,82.068229 L 128.89816,81.973667 L 129.32102,81.851751 L 129.7818,81.690932 z "
3847 id="path997"
3848 style="fill:url(#XMLID_45_);stroke:#000000;stroke-width:0.0882756" /><path
3849 d="M 131.38111,80.278343 L 130.93879,80.758559 L 130.48761,81.173755 L 130.0362,81.494137 L 129.57493,81.748047 L 129.10504,81.936656 L 128.64353,82.058329 L 128.17316,82.135123 L 127.703,82.153139 L 127.23263,82.13537 L 126.76174,82.079218 L 126.30983,81.985854 L 125.84856,81.862489 L 125.3974,81.731923 L 124.96399,81.582165 L 124.53031,81.412965 L 124.11681,81.243303 L 124.20994,81.167931 L 124.34239,81.073856 L 124.50198,80.951718 L 124.66206,80.829091 L 124.84109,80.706685 L 124.99181,80.593417 L 125.12262,80.500057 L 125.21647,80.423509 L 125.59351,80.574447 L 125.98012,80.715799 L 126.36622,80.81828 L 126.74181,80.912131 L 127.11859,80.968309 L 127.50379,81.015098 L 127.88057,81.033331 L 128.2763,81.034045 L 128.65237,81.015098 L 129.03876,80.966611 L 129.43331,80.910459 L 129.81994,80.825258 L 130.20513,80.721354 L 130.60018,80.58936 L 130.9856,80.447762 L 131.38111,80.278343 z "
3850 id="path1018"
3851 style="fill:url(#XMLID_46_);stroke:#000000;stroke-width:0.0882756" /><path
3852 d="M 132.40611,79.514972 L 132.14284,79.760011 L 131.84214,79.985851 L 131.49342,80.202553 L 131.09914,80.390947 L 130.6854,80.561345 L 130.24237,80.702948 L 129.77222,80.816459 L 129.28338,80.910776 L 128.77532,80.967642 L 128.26703,81.005561 L 127.7496,80.996939 L 127.22139,80.95973 L 126.71358,80.883894 L 126.20601,80.77134 L 125.70684,80.611998 L 125.22613,80.414489 L 125.38692,80.292569 L 125.54652,80.169451 L 125.70587,80.028341 L 125.87528,79.896815 L 126.01614,79.774189 L 126.14836,79.679868 L 126.2609,79.604768 L 126.3365,79.585798 L 126.63769,79.547657 L 126.94849,79.519565 L 127.29672,79.50974 L 127.64469,79.499664 L 128.0114,79.509247 L 128.39683,79.517897 L 128.79259,79.536868 L 129.18881,79.555814 L 129.59294,79.583659 L 129.99732,79.593022 L 130.40195,79.602139 L 130.81591,79.600936 L 131.22991,79.601178 L 131.62492,79.581028 L 132.02043,79.562795 L 132.40611,79.514972 z "
3853 id="path1039"
3854 style="fill:url(#XMLID_47_);stroke:#000000;stroke-width:0.0882756" /><path
3855 d="M 133.77909,78.14003 L 133.69485,78.545858 L 133.50695,78.865284 L 133.25304,79.119685 L 132.92353,79.31766 L 132.52851,79.449187 L 132.08575,79.533896 L 131.58755,79.581891 L 131.06076,79.60086 L 130.49558,79.582381 L 129.91242,79.563653 L 129.30042,79.536303 L 128.68917,79.509198 L 128.08679,79.490472 L 127.47505,79.481108 L 126.89119,79.500545 L 126.33655,79.548294 L 126.44095,79.4439 L 126.58059,79.340714 L 126.73156,79.208472 L 126.90099,79.096169 L 127.07067,78.963928 L 127.24899,78.841302 L 127.41795,78.728967 L 127.55929,78.624573 L 128.0105,78.605848 L 128.46191,78.586899 L 128.91453,78.585915 L 129.36617,78.585919 L 129.80775,78.594545 L 130.25101,78.603904 L 130.69332,78.613735 L 131.10728,78.612532 L 131.52102,78.611548 L 131.91653,78.593314 L 132.28323,78.564512 L 132.63146,78.517722 L 132.96074,78.451493 L 133.26193,78.3759 L 133.53432,78.271998 L 133.77909,78.14003 z "
3856 id="path1060"
3857 style="fill:url(#XMLID_48_);stroke:#000000;stroke-width:0.0882756" /><path
3858 d="M 134.96487,77.198074 L 134.57873,77.584712 L 134.17482,77.914209 L 133.7505,78.149393 L 133.30869,78.34835 L 132.86615,78.488972 L 132.40562,78.573703 L 131.93501,78.631553 L 131.46439,78.650035 L 130.97506,78.650771 L 130.4862,78.631823 L 130.00548,78.603732 L 129.50678,78.576384 L 129.00808,78.549031 L 128.50963,78.539177 L 128.02005,78.558393 L 127.53094,78.596314 L 127.63459,78.512314 L 127.75627,78.408879 L 127.91632,78.285291 L 128.06703,78.154726 L 128.22685,78.030891 L 128.34901,77.927456 L 128.45292,77.843706 L 128.50907,77.787059 L 128.70635,77.664187 L 128.94156,77.579233 L 129.20508,77.523571 L 129.5161,77.494275 L 129.85425,77.49354 L 130.23103,77.512265 L 130.62654,77.531483 L 131.04029,77.567459 L 131.49194,77.586653 L 131.94361,77.604422 L 132.43343,77.604173 L 132.92277,77.586184 L 133.4306,77.547798 L 133.9389,77.471959 L 134.4465,77.358447 L 134.96487,77.198074 z "
3859 id="path1081"
3860 style="fill:url(#XMLID_49_);stroke:#000000;stroke-width:0.0882756" /><path
3861 d="M 106.27173,90.394629 L 106.23405,90.356955 L 106.12103,90.263595 L 105.95208,90.113369 L 105.74403,89.924978 L 105.49924,89.699405 L 105.25445,89.43638 L 105.00029,89.1635 L 104.75455,88.863047 L 104.55704,88.552739 L 104.38809,88.232111 L 104.26474,87.922049 L 104.2266,87.602624 L 104.2554,87.309838 L 104.38718,87.027375 L 104.63165,86.763587 L 104.99811,86.547378 L 104.99786,86.73626 L 105.02642,86.933302 L 105.08282,87.159121 L 105.15843,87.38494 L 105.253,87.629263 L 105.35621,87.864937 L 105.4798,88.118132 L 105.61083,88.382116 L 105.74332,88.626415 L 105.89453,88.871206 L 106.04404,89.116218 L 106.18586,89.350687 L 106.33633,89.577489 L 106.47746,89.793477 L 106.60921,89.981402 L 106.73209,90.159963 L 106.67618,90.197636 L 106.62914,90.226439 L 106.56338,90.253815 L 106.5161,90.282862 L 106.45945,90.30063 L 106.40281,90.320314 L 106.34641,90.358479 L 106.27173,90.394629 z "
3862 id="path1100"
3863 style="fill:url(#XMLID_50_);stroke:#000000;stroke-width:0.0882756" /><path
3864 d="M 106.76059,90.18847 L 106.27101,89.454565 L 105.8844,88.804902 L 105.57338,88.230833 L 105.33746,87.733338 L 105.17787,87.308782 L 105.07372,86.942075 L 105.03531,86.641598 L 105.03578,86.396807 L 105.08233,86.17986 L 105.15721,86.029633 L 105.27096,85.896658 L 105.39335,85.812661 L 105.53372,85.747145 L 105.67458,85.700354 L 105.81643,85.671307 L 105.9482,85.652826 L 105.93864,85.906808 L 105.94847,86.178705 L 105.96719,86.424454 L 105.996,86.696105 L 106.04303,86.951438 L 106.09869,87.214463 L 106.16517,87.468592 L 106.25059,87.722992 L 106.34466,87.986042 L 106.45769,88.249313 L 106.57097,88.513296 L 106.70297,88.776813 L 106.85415,89.039372 L 107.00438,89.302889 L 107.16539,89.575277 L 107.34346,89.848845 L 107.21221,89.942204 L 107.11863,90.018043 L 107.04257,90.056208 L 106.98594,90.074934 L 106.93962,90.103023 L 106.88248,90.122708 L 106.8352,90.150772 L 106.76059,90.18847 z "
3865 id="path1119"
3866 style="fill:url(#XMLID_51_);stroke:#000000;stroke-width:0.0882756" /><path
3867 d="M 107.40977,89.895931 L 107.21201,89.642489 L 107.00536,89.340341 L 106.79753,88.982751 L 106.58945,88.588447 L 106.40178,88.155019 L 106.24074,87.713187 L 106.09938,87.271379 L 105.99548,86.82908 L 105.92996,86.405015 L 105.91075,86.009998 L 105.94817,85.6529 L 106.05065,85.341634 L 106.2297,85.087726 L 106.48384,84.908452 L 106.81333,84.824209 L 107.24577,84.823251 L 107.19896,85.114833 L 107.18073,85.416268 L 107.17112,85.708095 L 107.17161,85.990805 L 107.18083,86.28219 L 107.22836,86.57355 L 107.28546,86.85626 L 107.35123,87.128402 L 107.43618,87.421212 L 107.54033,87.693846 L 107.65337,87.97631 L 107.79473,88.248699 L 107.9452,88.530204 L 108.12374,88.803797 L 108.31284,89.08604 L 108.51898,89.368503 L 108.38866,89.442646 L 108.23796,89.518484 L 108.07766,89.60344 L 107.90824,89.678787 L 107.74841,89.745483 L 107.60705,89.810999 L 107.48536,89.858282 L 107.40977,89.895931 z "
3868 id="path1138"
3869 style="fill:url(#XMLID_52_);stroke:#000000;stroke-width:0.0882756" /><path
3870 d="M 108.60998,83.759157 L 108.23369,83.834504 L 107.92314,83.97608 L 107.65965,84.182951 L 107.46214,84.43708 L 107.30205,84.747855 L 107.19911,85.096548 L 107.15185,85.482694 L 107.14249,85.887566 L 107.18112,86.300572 L 107.25671,86.752455 L 107.38871,87.204117 L 107.54925,87.64595 L 107.74797,88.088495 L 107.9918,88.520964 L 108.25578,88.935199 L 108.56655,89.320386 L 108.70791,89.235651 L 108.84927,89.151409 L 108.99927,89.075325 L 109.15024,89.000199 L 109.31943,88.925074 L 109.46991,88.830288 L 109.62013,88.7559 L 109.76099,88.651997 L 109.62971,88.350807 L 109.46892,88.040254 L 109.32782,87.710999 L 109.17808,87.39037 L 109.0264,87.052466 L 108.88409,86.704264 L 108.75305,86.365376 L 108.63089,86.036342 L 108.52579,85.707063 L 108.45066,85.38668 L 108.39355,85.067501 L 108.36595,84.774937 L 108.37436,84.483822 L 108.41274,84.219839 L 108.48713,83.975539 L 108.60998,83.759157 z "
3871 id="path1157"
3872 style="fill:url(#XMLID_53_);stroke:#000000;stroke-width:0.0882756" /><path
3873 d="M 109.92628,83.00488 L 109.40935,83.203125 L 109.00495,83.438553 L 108.71312,83.711654 L 108.5065,84.013335 L 108.38414,84.342566 L 108.34598,84.700401 L 108.36637,85.076446 L 108.44171,85.471463 L 108.56387,85.876335 L 108.73403,86.289832 L 108.91306,86.71461 L 109.1017,87.127861 L 109.29994,87.550722 L 109.50706,87.964711 L 109.68609,88.369337 L 109.83702,88.7646 L 109.94958,88.670281 L 110.07221,88.604273 L 110.19413,88.538043 L 110.32611,88.480907 L 110.44878,88.416349 L 110.58029,88.359237 L 110.71182,88.28389 L 110.83373,88.198442 L 110.80589,88.151873 L 110.72118,88.048437 L 110.62689,87.897965 L 110.48554,87.700433 L 110.33484,87.4566 L 110.175,87.164306 L 110.00534,86.84439 L 109.85437,86.487046 L 109.72165,86.111467 L 109.60836,85.715959 L 109.53274,85.282777 L 109.49458,84.850087 L 109.5049,84.38857 L 109.57953,83.937153 L 109.71128,83.465586 L 109.92628,83.00488 z "
3874 id="path8936"
3875 style="fill:url(#XMLID_54_);stroke:#000000;stroke-width:0.0882756" /><path
3876 d="M 111.45958,81.865661 L 111.0828,81.922279 L 110.73457,82.064127 L 110.43413,82.289234 L 110.18044,82.56258 L 109.95413,82.901198 L 109.78517,83.296682 L 109.66397,83.720255 L 109.57951,84.181993 L 109.55144,84.679979 L 109.57064,85.189245 L 109.65584,85.705933 L 109.78783,86.233433 L 109.9671,86.75012 L 110.21189,87.258895 L 110.5138,87.728792 L 110.88146,88.190038 L 112.10396,87.473434 L 112.00028,87.257936 L 111.88653,86.993486 L 111.76439,86.721097 L 111.64174,86.41079 L 111.52893,86.07146 L 111.40627,85.724191 L 111.30213,85.35606 L 111.20804,84.980236 L 111.13316,84.584751 L 111.08517,84.180617 L 111.06643,83.7856 L 111.06523,83.38987 L 111.10219,82.995344 L 111.18715,82.608954 L 111.29995,82.232169 L 111.45958,81.865661 z "
3877 id="path1195"
3878 style="fill:url(#XMLID_55_);stroke:#000000;stroke-width:0.0882756" /><path
3879 d="M 112.12294,87.51069 L 112.29188,87.398357 L 112.50788,87.257224 L 112.74355,87.134841 L 112.99842,86.992773 L 113.25162,86.870612 L 113.47792,86.738374 L 113.68409,86.625354 L 113.85326,86.531035 L 113.7119,86.202026 L 113.57079,85.854046 L 113.42969,85.505573 L 113.27825,85.147763 L 113.12802,84.790664 L 112.99555,84.433075 L 112.87337,84.066147 L 112.7505,83.698948 L 112.65594,83.342319 L 112.58922,82.993871 L 112.54268,82.6454 L 112.5242,82.306755 L 112.52373,81.986616 L 112.57052,81.675842 L 112.6641,81.374407 L 112.78649,81.101774 L 112.50353,81.15913 L 112.23135,81.26232 L 111.97672,81.403676 L 111.742,81.60143 L 111.53513,81.845757 L 111.35632,82.138296 L 111.21521,82.467549 L 111.11298,82.852982 L 111.04626,83.277268 L 111.02835,83.747411 L 111.06626,84.274419 L 111.15218,84.829272 L 111.30289,85.431628 L 111.5009,86.080331 L 111.78336,86.77703 L 112.12294,87.51069 z "
3880 id="path1214"
3881 style="fill:url(#XMLID_56_);stroke:#000000;stroke-width:0.0882756" /><path
3882 d="M 114.37572,80.319874 L 113.78294,80.499394 L 113.31283,80.724746 L 112.96437,81.017038 L 112.7203,81.356151 L 112.57919,81.722366 L 112.50432,82.137066 L 112.51464,82.560637 L 112.58062,83.022129 L 112.70324,83.483156 L 112.85443,83.953521 L 113.04307,84.423884 L 113.24105,84.884418 L 113.42921,85.33608 L 113.6176,85.769238 L 113.78727,86.164009 L 113.92958,86.530494 L 114.07959,86.417942 L 114.21999,86.33274 L 114.34311,86.248497 L 114.47488,86.172905 L 114.6064,86.097558 L 114.74751,86.01307 L 114.88862,85.909413 L 115.05687,85.796367 L 114.92583,85.419585 L 114.78376,85.052412 L 114.6525,84.695068 L 114.52963,84.347089 L 114.407,83.998863 L 114.31243,83.660956 L 114.21883,83.321602 L 114.14251,83.001929 L 114.08564,82.662355 L 114.05708,82.33379 L 114.03859,82.014364 L 114.0475,81.684862 L 114.08519,81.344797 L 114.15025,81.016255 L 114.25393,80.667805 L 114.37572,80.319874 z "
3883 id="path8940"
3884 style="fill:url(#XMLID_57_);stroke:#000000;stroke-width:0.0882756" /><path
3885 d="M 115.88115,79.490248 L 115.34456,79.651042 L 114.92145,79.886006 L 114.5922,80.178298 L 114.3383,80.526277 L 114.17845,80.912643 L 114.08512,81.344871 L 114.04769,81.797022 L 114.06664,82.268589 L 114.14248,82.757681 L 114.23702,83.246305 L 114.36014,83.726057 L 114.51038,84.196908 L 114.67114,84.658159 L 114.82136,85.072367 L 114.98262,85.457307 L 115.11388,85.796418 L 115.28283,85.68315 L 115.43401,85.588587 L 115.5749,85.503385 L 115.69706,85.418676 L 115.83792,85.334434 L 115.9795,85.249476 L 116.12997,85.136459 L 116.309,85.014049 L 116.10142,84.468071 L 115.92239,83.950645 L 115.76135,83.471851 L 115.62935,83.019699 L 115.51705,82.605465 L 115.42177,82.228461 L 115.3654,81.872073 L 115.30828,81.532745 L 115.29868,81.22219 L 115.30804,80.930119 L 115.33589,80.657509 L 115.39229,80.394237 L 115.47723,80.158099 L 115.57946,79.92385 L 115.72104,79.705945 L 115.88115,79.490248 z "
3886 id="path1252"
3887 style="fill:url(#XMLID_58_);stroke:#000000;stroke-width:0.0882756" /><path
3888 d="M 117.01941,78.858821 L 116.52096,79.037604 L 116.12428,79.283106 L 115.82429,79.583587 L 115.5891,79.933484 L 115.43863,80.309063 L 115.34552,80.723761 L 115.31792,81.166088 L 115.32704,81.627332 L 115.39351,82.087891 L 115.48758,82.558254 L 115.6011,83.028397 L 115.74221,83.470945 L 115.88477,83.91302 L 116.04461,84.318604 L 116.18643,84.684598 L 116.31866,85.023707 L 116.4593,84.938752 L 116.59034,84.863406 L 116.71321,84.777985 L 116.83583,84.712469 L 116.95822,84.627981 L 117.0799,84.543267 L 117.22124,84.4576 L 117.36211,84.354633 L 117.23108,84.109866 L 117.10749,83.818285 L 117.0038,83.508224 L 116.90015,83.160244 L 116.80587,82.802896 L 116.73963,82.416507 L 116.67315,82.030853 L 116.62587,81.626006 L 116.60715,81.231482 L 116.59707,80.825898 L 116.60619,80.440487 L 116.64339,80.082677 L 116.69979,79.724841 L 116.77516,79.404704 L 116.87813,79.113589 L 117.01941,78.858821 z "
3889 id="path1271"
3890 style="fill:url(#XMLID_59_);stroke:#000000;stroke-width:0.0882756" /><path
3891 d="M 118.35473,78.066848 L 117.81933,78.227884 L 117.3955,78.482287 L 117.06626,78.830756 L 116.84114,79.244995 L 116.68106,79.706019 L 116.59587,80.223198 L 116.56848,80.76075 L 116.58771,81.305526 L 116.64506,81.851974 L 116.73913,82.379008 L 116.84401,82.876992 L 116.97577,83.328406 L 117.09865,83.714775 L 117.21096,84.034446 L 117.31461,84.251391 L 117.38084,84.373311 L 117.55026,84.26051 L 117.70047,84.165969 L 117.85117,84.071899 L 118.00236,83.977336 L 118.15258,83.883727 L 118.28361,83.789163 L 118.4252,83.685483 L 118.56608,83.582048 L 118.49049,83.337476 L 118.38731,83.045648 L 118.28315,82.716396 L 118.18884,82.358091 L 118.07532,81.963789 L 117.97213,81.558672 L 117.88597,81.154758 L 117.80172,80.730692 L 117.74462,80.316461 L 117.71653,79.911589 L 117.71606,79.517038 L 117.75302,79.159471 L 117.82839,78.820119 L 117.95105,78.528045 L 118.12937,78.274872 L 118.35473,78.066848 z "
3892 id="path1290"
3893 style="fill:url(#XMLID_60_);stroke:#000000;stroke-width:0.0882756" /><path
3894 d="M 119.42748,77.595989 L 118.90959,77.70014 L 118.49609,77.888039 L 118.18579,78.142659 L 117.95012,78.471913 L 117.80011,78.848451 L 117.71635,79.272517 L 117.6878,79.714814 L 117.71734,80.194075 L 117.78357,80.675031 L 117.87669,81.163654 L 118.00002,81.64412 L 118.13226,82.113771 L 118.27386,82.537587 L 118.40608,82.932849 L 118.52824,83.261882 L 118.62231,83.544105 L 118.75429,83.450007 L 118.87623,83.384266 L 118.98902,83.32809 L 119.09246,83.280341 L 119.20575,83.223227 L 119.30894,83.176656 L 119.40372,83.120261 L 119.50595,83.054006 L 119.35573,82.734801 L 119.22373,82.395938 L 119.10111,82.065945 L 118.98855,81.727302 L 118.89472,81.369493 L 118.81792,81.031565 L 118.76128,80.673481 L 118.72335,80.315895 L 118.70441,79.958057 L 118.7229,79.601179 L 118.75983,79.262317 L 118.82609,78.913844 L 118.92903,78.565149 L 119.05141,78.235898 L 119.22062,77.915513 L 119.42748,77.595989 z "
3895 id="path1309"
3896 style="fill:url(#XMLID_61_);stroke:#000000;stroke-width:0.0882756" /><path
3897 d="M 120.63176,76.841933 L 120.25447,77.011845 L 119.91657,77.21921 L 119.6154,77.482481 L 119.37062,77.765657 L 119.15485,78.095159 L 118.98589,78.433527 L 118.86349,78.819428 L 118.76964,79.233882 L 118.72331,79.657483 L 118.72378,80.109145 L 118.75209,80.570416 L 118.81857,81.050633 L 118.92272,81.548866 L 119.08252,82.047563 L 119.2618,82.565234 L 119.48786,83.072094 L 119.62919,82.968656 L 119.76959,82.86571 L 119.92101,82.7709 L 120.08086,82.686412 L 120.23108,82.592807 L 120.39115,82.508096 L 120.54162,82.414245 L 120.68272,82.309606 L 120.56009,82.01849 L 120.42764,81.71755 L 120.30572,81.42523 L 120.17277,81.124285 L 120.06958,80.813731 L 119.97477,80.513036 L 119.89101,80.183043 L 119.8339,79.86337 L 119.81447,79.524752 L 119.80487,79.176774 L 119.84234,78.818717 L 119.90784,78.451766 L 120.02065,78.074981 L 120.17114,77.67972 L 120.37681,77.265754 L 120.63176,76.841933 z "
3898 id="path1328"
3899 style="fill:url(#XMLID_62_);stroke:#000000;stroke-width:0.0882756" /><path
3900 d="M 122.35296,75.777373 L 121.8456,75.985204 L 121.39299,76.248698 L 121.01717,76.531379 L 120.67879,76.851518 L 120.41505,77.20933 L 120.19882,77.595475 L 120.0392,77.999857 L 119.92639,78.433259 L 119.88033,78.874843 L 119.86185,79.344961 L 119.89926,79.816774 L 119.9756,80.305867 L 120.09847,80.804099 L 120.26742,81.293656 L 120.45656,81.783238 L 120.69149,82.281223 L 120.82324,82.187372 L 120.95501,82.093053 L 121.10594,81.980498 L 121.2473,81.875612 L 121.39752,81.763775 L 121.548,81.649772 L 121.69826,81.53697 L 121.84851,81.424171 L 121.65099,81.094648 L 121.50078,80.756276 L 121.35917,80.408297 L 121.25453,80.041588 L 121.17941,79.683755 L 121.13237,79.316554 L 121.10381,78.949599 L 121.11268,78.582181 L 121.15012,78.205866 L 121.2341,77.839649 L 121.33729,77.471494 L 121.46905,77.114149 L 121.64736,76.765926 L 121.84536,76.417231 L 122.08007,76.088444 L 122.35296,75.777373 z "
3901 id="path8947"
3902 style="fill:url(#XMLID_63_);stroke:#000000;stroke-width:0.0882756" /><path
3903 d="M 123.1142,75.165383 L 122.7475,75.438509 L 122.4096,75.720729 L 122.1089,76.040644 L 121.84517,76.380218 L 121.61981,76.756266 L 121.43167,77.133541 L 121.28141,77.528559 L 121.16839,77.943016 L 121.10287,78.366585 L 121.07578,78.808419 L 121.09422,79.241602 L 121.15112,79.693017 L 121.26463,80.144927 L 121.41559,80.596344 L 121.62247,81.048471 L 121.8778,81.490524 L 121.9995,81.38613 L 122.10293,81.30142 L 122.2157,81.226542 L 122.32896,81.15117 L 122.44152,81.075086 L 122.55453,81.000453 L 122.69567,80.915497 L 122.83631,80.812308 L 122.7233,80.379124 L 122.61028,79.96467 L 122.50564,79.598454 L 122.39356,79.240619 L 122.31772,78.901732 L 122.24258,78.581838 L 122.17631,78.271332 L 122.14729,77.961026 L 122.13768,77.650471 L 122.16553,77.339922 L 122.22264,77.019759 L 122.30713,76.690504 L 122.44728,76.341564 L 122.61648,75.984219 L 122.84207,75.58871 L 123.1142,75.165383 z "
3904 id="path1366"
3905 style="fill:url(#XMLID_64_);stroke:#000000;stroke-width:0.0882756" /><path
3906 d="M 124.80708,73.865618 L 124.22438,74.2237 L 123.71633,74.600233 L 123.29321,74.986383 L 122.95482,75.400615 L 122.68193,75.82419 L 122.47529,76.257371 L 122.32457,76.689815 L 122.24969,77.142903 L 122.20337,77.602972 L 122.21393,78.064241 L 122.26073,78.525759 L 122.33702,78.996148 L 122.44024,79.456927 L 122.57248,79.927562 L 122.7232,80.379222 L 122.88375,80.840223 L 122.96821,80.773994 L 123.08052,80.679923 L 123.22163,80.595435 L 123.36348,80.491041 L 123.5137,80.397431 L 123.6553,80.31201 L 123.76712,80.217939 L 123.86168,80.142592 L 123.71986,79.719975 L 123.61618,79.31486 L 123.54083,78.900623 L 123.49356,78.496709 L 123.46475,78.091617 L 123.4645,77.695862 L 123.50217,77.30156 L 123.55834,76.906074 L 123.64186,76.519684 L 123.74579,76.134028 L 123.87756,75.757493 L 124.0182,75.372085 L 124.18738,74.995057 L 124.37529,74.618985 L 124.58266,74.241715 L 124.80708,73.865618 z "
3907 id="path1385"
3908 style="fill:url(#XMLID_65_);stroke:#000000;stroke-width:0.0882756" /><path
3909 d="M 125.79416,73.064772 L 125.48503,73.280758 L 125.20231,73.545235 L 124.91073,73.855541 L 124.64771,74.213134 L 124.40292,74.590874 L 124.17782,75.004615 L 123.97984,75.446917 L 123.81088,75.917796 L 123.6705,76.416246 L 123.57737,76.924532 L 123.51211,77.441929 L 123.4845,77.978327 L 123.51257,78.515189 L 123.57856,79.051098 L 123.70168,79.587468 L 123.88049,80.123839 L 124.02209,80.001459 L 124.17184,79.889148 L 124.33239,79.785223 L 124.49198,79.681318 L 124.63262,79.596363 L 124.75598,79.530603 L 124.83998,79.464348 L 124.87766,79.408438 L 124.95349,79.106513 L 125.00893,78.78709 L 125.05573,78.438859 L 125.0934,78.062297 L 125.12148,77.676156 L 125.14859,77.271774 L 125.18603,76.857071 L 125.21409,76.434926 L 125.25176,76.001249 L 125.2885,75.568068 L 125.33578,75.126264 L 125.4013,74.702196 L 125.47593,74.269013 L 125.56018,73.86417 L 125.67273,73.45018 L 125.79416,73.064772 z "
3910 id="path1404"
3911 style="fill:url(#XMLID_66_);stroke:#000000;stroke-width:0.0882756" /><path
3912 d="M 127.40354,72.168646 L 126.9334,72.169361 L 126.53861,72.28287 L 126.21798,72.49021 L 125.96429,72.781792 L 125.75767,73.139602 L 125.59807,73.5819 L 125.48551,74.072219 L 125.38208,74.607635 L 125.3264,75.191287 L 125.26978,75.793666 L 125.23306,76.415461 L 125.19563,77.036543 L 125.14931,77.667012 L 125.08426,78.278758 L 125.00051,78.871306 L 124.88722,79.436725 L 125.00889,79.351523 L 125.14063,79.238014 L 125.30119,79.097616 L 125.46101,78.956017 L 125.62948,78.805791 L 125.79003,78.645242 L 125.93113,78.504131 L 126.0528,78.381478 L 126.11881,77.939181 L 126.16538,77.497592 L 126.21171,77.054337 L 126.23043,76.621868 L 126.25849,76.198294 L 126.27723,75.783126 L 126.29521,75.369627 L 126.32306,74.965002 L 126.37968,74.569984 L 126.43558,74.193916 L 126.5203,73.826494 L 126.63237,73.468659 L 126.77349,73.120186 L 126.94294,72.781324 L 127.15938,72.470304 L 127.40354,72.168646 z "
3913 id="path1423"
3914 style="fill:url(#XMLID_67_);stroke:#000000;stroke-width:0.0882756" /><path
3915 d="M 128.45684,71.339955 L 127.96748,71.642105 L 127.56311,71.971627 L 127.23409,72.338093 L 126.97083,72.734805 L 126.77403,73.15813 L 126.61419,73.600674 L 126.50189,74.071041 L 126.41742,74.551012 L 126.37036,75.049709 L 126.32429,75.5489 L 126.30556,76.056717 L 126.27725,76.574608 L 126.24004,77.082672 L 126.20357,77.590486 L 126.11862,78.089677 L 126.02573,78.579013 L 126.1282,78.49477 L 126.26906,78.390866 L 126.42987,78.268459 L 126.58033,78.135731 L 126.74042,78.014525 L 126.88105,77.909419 L 126.98449,77.826134 L 127.05047,77.778385 L 127.21894,77.608939 L 127.34202,77.39271 L 127.43515,77.1287 L 127.50137,76.818639 L 127.53856,76.479531 L 127.56689,76.09363 L 127.58466,75.679418 L 127.60312,75.246234 L 127.62231,74.794082 L 127.65015,74.315046 L 127.70608,73.825708 L 127.78093,73.336128 L 127.89253,72.827597 L 128.03367,72.328898 L 128.21245,71.829952 L 128.45684,71.339955 z "
3916 id="path1442"
3917 style="fill:url(#XMLID_68_);stroke:#000000;stroke-width:0.0882756" /><path
3918 d="M 129.97143,70.482956 L 129.42545,70.633426 L 128.97402,70.916848 L 128.61644,71.27444 L 128.32606,71.716491 L 128.10024,72.225513 L 127.93057,72.790194 L 127.80915,73.383459 L 127.71557,73.985347 L 127.66899,74.596843 L 127.6318,75.199469 L 127.58572,75.774009 L 127.54757,76.300304 L 127.50103,76.761576 L 127.4259,77.156371 L 127.31335,77.458053 L 127.16358,77.664925 L 127.33255,77.533864 L 127.52093,77.401163 L 127.72734,77.250444 L 127.94404,77.091347 L 128.15067,76.921191 L 128.36737,76.761109 L 128.55552,76.591686 L 128.72492,76.440505 L 128.78996,75.97948 L 128.83724,75.555904 L 128.86484,75.151526 L 128.89243,74.766361 L 128.9013,74.39845 L 128.9013,74.041326 L 128.91979,73.683024 L 128.93803,73.344627 L 128.96658,73.015126 L 129.02249,72.676018 L 129.09712,72.337402 L 129.20103,71.989665 L 129.33254,71.631611 L 129.50175,71.274267 L 129.71749,70.88763 L 129.97143,70.482956 z "
3919 id="path1461"
3920 style="fill:url(#XMLID_69_);stroke:#000000;stroke-width:0.0882756" /><g
3921 style="stroke:#000000"
3922 id="g1463"
3923 transform="matrix(0.2457491,-0.2457491,0.2457491,0.2457491,95.022367,94.120824)">
3924 <linearGradient
3925 id="linearGradient8956"
3926 gradientUnits="userSpaceOnUse"
3927 x1="-3581.9316"
3928 y1="-3602.7837"
3929 x2="-3565.7739"
3930 y2="-3551.2231"
3931 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
3932 <stop
3933 offset="0"
3934 style="stop-color:#D8E7EB"
3935 id="stop8958" />
3936 <stop
3937 offset="0.0684"
3938 style="stop-color:#D0DFE4"
3939 id="stop8960" />
3940 <stop
3941 offset="0.1761"
3942 style="stop-color:#B9CAD0"
3943 id="stop8962" />
3944 <stop
3945 offset="0.3096"
3946 style="stop-color:#94A7B0"
3947 id="stop8964" />
3948 <stop
3949 offset="0.4622"
3950 style="stop-color:#627784"
3951 id="stop8966" />
3952 <stop
3953 offset="0.5537"
3954 style="stop-color:#405766"
3955 id="stop8968" />
3956 <stop
3957 offset="0.6113"
3958 style="stop-color:#607682"
3959 id="stop8970" />
3960 <stop
3961 offset="0.6983"
3962 style="stop-color:#8B9EA8"
3963 id="stop8972" />
3964 <stop
3965 offset="0.7829"
3966 style="stop-color:#ADBEC5"
3967 id="stop8974" />
3968 <stop
3969 offset="0.8633"
3970 style="stop-color:#C5D5DA"
3971 id="stop8976" />
3972 <stop
3973 offset="0.9376"
3974 style="stop-color:#D3E2E7"
3975 id="stop8978" />
3976 <stop
3977 offset="1"
3978 style="stop-color:#D8E7EB"
3979 id="stop8980" />
3980 </linearGradient>
3981 <path
3982 d="M 26.439,7.3066 L 26.2476,6.9238 L 26.0943,6.5039 L 25.942,6.081 L 25.8287,5.6601 L 25.7115,5.2011 L 25.5963,4.705 L 25.5192,4.246 L 25.4772,3.746 L 25.4411,3.287 L 25.4059,2.789 L 25.4049,2.29 L 25.4401,1.83 L 25.4762,1.371 L 25.5563,0.914 L 25.6686,0.4531 L 25.7829,0.0332 L 25.1716,0.8369 L 24.7126,1.7187 L 24.3679,2.5605 L 24.1784,3.4404 L 24.1022,4.3593 L 24.1022,5.2792 L 24.2184,6.1972 L 24.4127,7.1171 L 24.6402,8.0351 L 24.9498,8.9179 L 25.2926,9.7968 L 25.6354,10.6767 L 26.0202,11.5195 L 26.3669,12.3613 L 26.7126,13.1259 L 27.0554,13.8925 L 26.9812,13.7402 L 26.9441,13.4716 L 26.9021,13.206 L 26.9021,12.8613 L 26.865,12.4375 L 26.865,12.0166 L 26.865,11.5566 L 26.864,11.0605 L 26.8259,10.5615 L 26.8259,10.0635 L 26.7868,9.5293 L 26.7468,9.0313 L 26.7087,8.5704 L 26.6306,8.1095 L 26.5554,7.6896 L 26.439,7.3066 z "
3983 id="path1490"
3984 style="fill:url(#XMLID_70_);stroke-width:0.25400001" />
3985 <linearGradient
3986 id="linearGradient8983"
3987 gradientUnits="userSpaceOnUse"
3988 x1="-3578.5146"
3989 y1="-3603.8545"
3990 x2="-3562.3569"
3991 y2="-3552.2939"
3992 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
3993 <stop
3994 offset="0"
3995 style="stop-color:#D8E7EB"
3996 id="stop8985" />
3997 <stop
3998 offset="0.0684"
3999 style="stop-color:#D0DFE4"
4000 id="stop8987" />
4001 <stop
4002 offset="0.1761"
4003 style="stop-color:#B9CAD0"
4004 id="stop8989" />
4005 <stop
4006 offset="0.3096"
4007 style="stop-color:#94A7B0"
4008 id="stop8991" />
4009 <stop
4010 offset="0.4622"
4011 style="stop-color:#627784"
4012 id="stop8993" />
4013 <stop
4014 offset="0.5537"
4015 style="stop-color:#405766"
4016 id="stop8995" />
4017 <stop
4018 offset="0.6113"
4019 style="stop-color:#607682"
4020 id="stop8997" />
4021 <stop
4022 offset="0.6983"
4023 style="stop-color:#8B9EA8"
4024 id="stop8999" />
4025 <stop
4026 offset="0.7829"
4027 style="stop-color:#ADBEC5"
4028 id="stop9001" />
4029 <stop
4030 offset="0.8633"
4031 style="stop-color:#C5D5DA"
4032 id="stop9003" />
4033 <stop
4034 offset="0.9376"
4035 style="stop-color:#D3E2E7"
4036 id="stop9005" />
4037 <stop
4038 offset="1"
4039 style="stop-color:#D8E7EB"
4040 id="stop9007" />
4041 </linearGradient>
4042 <path
4043 d="M 14.77,23.209 L 15.4985,22.7871 L 16.228,22.3252 L 16.9145,21.9043 L 17.603,21.4824 L 18.2534,21.0996 L 18.9048,20.6777 L 19.521,20.2929 L 20.1304,19.873 L 20.7837,19.4892 L 21.3911,19.1054 L 22.0054,18.6835 L 22.6577,18.3017 L 23.269,17.916 L 23.9233,17.5742 L 24.6088,17.1885 L 25.2983,16.8067 L 24.688,17.417 L 24.0747,18.0332 L 23.4614,18.6445 L 22.8491,19.2207 L 22.1987,19.832 L 21.5473,20.3681 L 20.8969,20.9072 L 20.2094,21.4043 L 19.558,21.8652 L 18.8685,22.248 L 18.2171,22.5937 L 17.5296,22.8623 L 16.8411,23.0918 L 16.1526,23.207 L 15.4612,23.2441 L 14.77,23.209 z "
4044 id="path1517"
4045 style="fill:url(#XMLID_71_);stroke-width:0.25400001" />
4046 <linearGradient
4047 id="linearGradient9010"
4048 gradientUnits="userSpaceOnUse"
4049 x1="-3580.1533"
4050 y1="-3603.3408"
4051 x2="-3563.9956"
4052 y2="-3551.7803"
4053 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
4054 <stop
4055 offset="0"
4056 style="stop-color:#D8E7EB"
4057 id="stop9012" />
4058 <stop
4059 offset="0.0684"
4060 style="stop-color:#D0DFE4"
4061 id="stop9014" />
4062 <stop
4063 offset="0.1761"
4064 style="stop-color:#B9CAD0"
4065 id="stop9016" />
4066 <stop
4067 offset="0.3096"
4068 style="stop-color:#94A7B0"
4069 id="stop9018" />
4070 <stop
4071 offset="0.4622"
4072 style="stop-color:#627784"
4073 id="stop9020" />
4074 <stop
4075 offset="0.5537"
4076 style="stop-color:#405766"
4077 id="stop9022" />
4078 <stop
4079 offset="0.6113"
4080 style="stop-color:#607682"
4081 id="stop9024" />
4082 <stop
4083 offset="0.6983"
4084 style="stop-color:#8B9EA8"
4085 id="stop9026" />
4086 <stop
4087 offset="0.7829"
4088 style="stop-color:#ADBEC5"
4089 id="stop9028" />
4090 <stop
4091 offset="0.8633"
4092 style="stop-color:#C5D5DA"
4093 id="stop9030" />
4094 <stop
4095 offset="0.9376"
4096 style="stop-color:#D3E2E7"
4097 id="stop9032" />
4098 <stop
4099 offset="1"
4100 style="stop-color:#D8E7EB"
4101 id="stop9034" />
4102 </linearGradient>
4103 <path
4104 d="M 20.4663,7.8848 L 21.0024,8.1924 L 21.5024,8.5371 L 21.9975,8.8818 L 22.4584,9.2256 L 22.9184,9.5684 L 23.3373,9.9512 L 23.7611,10.3721 L 24.184,10.7549 L 24.5678,11.1768 L 24.9516,11.5987 L 25.3334,12.0196 L 25.6781,12.4766 L 26.0609,12.8985 L 26.4066,13.3575 L 26.7504,13.8165 L 27.0961,14.2755 L 26.9428,13.7794 L 26.7504,13.2804 L 26.559,12.7433 L 26.3295,12.2091 L 26.0981,11.673 L 25.8335,11.174 L 25.5239,10.676 L 25.1782,10.1809 L 24.7964,9.7209 L 24.3355,9.34 L 23.8375,8.9552 L 23.3004,8.6114 L 22.688,8.3428 L 22.0357,8.1133 L 21.271,7.9609 L 20.4663,7.8848 z "
4105 id="path1544"
4106 style="fill:url(#XMLID_72_);stroke-width:0.25400001" />
4107 <linearGradient
4108 id="linearGradient9037"
4109 gradientUnits="userSpaceOnUse"
4110 x1="-3584.6758"
4111 y1="-3601.9238"
4112 x2="-3568.5181"
4113 y2="-3550.3633"
4114 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
4115 <stop
4116 offset="0"
4117 style="stop-color:#D8E7EB"
4118 id="stop9039" />
4119 <stop
4120 offset="0.0684"
4121 style="stop-color:#D0DFE4"
4122 id="stop9041" />
4123 <stop
4124 offset="0.1761"
4125 style="stop-color:#B9CAD0"
4126 id="stop9043" />
4127 <stop
4128 offset="0.3096"
4129 style="stop-color:#94A7B0"
4130 id="stop9045" />
4131 <stop
4132 offset="0.4622"
4133 style="stop-color:#627784"
4134 id="stop9047" />
4135 <stop
4136 offset="0.5537"
4137 style="stop-color:#405766"
4138 id="stop9049" />
4139 <stop
4140 offset="0.6113"
4141 style="stop-color:#607682"
4142 id="stop9051" />
4143 <stop
4144 offset="0.6983"
4145 style="stop-color:#8B9EA8"
4146 id="stop9053" />
4147 <stop
4148 offset="0.7829"
4149 style="stop-color:#ADBEC5"
4150 id="stop9055" />
4151 <stop
4152 offset="0.8633"
4153 style="stop-color:#C5D5DA"
4154 id="stop9057" />
4155 <stop
4156 offset="0.9376"
4157 style="stop-color:#D3E2E7"
4158 id="stop9059" />
4159 <stop
4160 offset="1"
4161 style="stop-color:#D8E7EB"
4162 id="stop9061" />
4163 </linearGradient>
4164 <path
4165 d="M 26.1694,3.2852 L 26.3227,4.0518 L 26.437,4.8184 L 26.5913,5.584 L 26.7075,6.2735 L 26.7837,7 L 26.898,7.6895 L 26.9771,8.379 L 27.0923,9.0685 L 27.1675,9.757 L 27.2847,10.4465 L 27.3589,11.0979 L 27.4771,11.7874 L 27.5933,12.4769 L 27.7085,13.1664 L 27.8589,13.8559 L 28.0161,14.5825 L 28.0923,13.7407 L 28.2056,12.936 L 28.2827,12.0932 L 28.3589,11.2524 L 28.396,10.4458 L 28.4341,9.6411 L 28.395,8.8359 L 28.356,8.0723 L 28.2798,7.3057 L 28.1636,6.6172 L 27.9693,5.9277 L 27.7408,5.2763 L 27.4722,4.7031 L 27.0884,4.166 L 26.6665,3.706 L 26.1694,3.2852 z "
4166 id="path1571"
4167 style="fill:url(#XMLID_73_);stroke-width:0.25400001" />
4168 <linearGradient
4169 id="linearGradient9064"
4170 gradientUnits="userSpaceOnUse"
4171 x1="-3578.7671"
4172 y1="-3603.7754"
4173 x2="-3562.6094"
4174 y2="-3552.2148"
4175 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
4176 <stop
4177 offset="0"
4178 style="stop-color:#D8E7EB"
4179 id="stop9066" />
4180 <stop
4181 offset="0.0684"
4182 style="stop-color:#D0DFE4"
4183 id="stop9068" />
4184 <stop
4185 offset="0.1761"
4186 style="stop-color:#B9CAD0"
4187 id="stop9070" />
4188 <stop
4189 offset="0.3096"
4190 style="stop-color:#94A7B0"
4191 id="stop9072" />
4192 <stop
4193 offset="0.4622"
4194 style="stop-color:#627784"
4195 id="stop9074" />
4196 <stop
4197 offset="0.5537"
4198 style="stop-color:#405766"
4199 id="stop9076" />
4200 <stop
4201 offset="0.6113"
4202 style="stop-color:#607682"
4203 id="stop9078" />
4204 <stop
4205 offset="0.6983"
4206 style="stop-color:#8B9EA8"
4207 id="stop9080" />
4208 <stop
4209 offset="0.7829"
4210 style="stop-color:#ADBEC5"
4211 id="stop9082" />
4212 <stop
4213 offset="0.8633"
4214 style="stop-color:#C5D5DA"
4215 id="stop9084" />
4216 <stop
4217 offset="0.9376"
4218 style="stop-color:#D3E2E7"
4219 id="stop9086" />
4220 <stop
4221 offset="1"
4222 style="stop-color:#D8E7EB"
4223 id="stop9088" />
4224 </linearGradient>
4225 <path
4226 d="M 22.0698,2.0273 L 21.3774,2.6757 L 20.9614,3.4042 L 20.772,4.1699 L 20.732,4.9746 L 20.8853,5.7783 L 21.1919,6.6211 L 21.6157,7.4649 L 22.1137,8.3057 L 22.6889,9.1094 L 23.3002,9.9141 L 23.9535,10.6778 L 24.6039,11.3673 L 25.2182,12.0568 L 25.7524,12.6291 L 26.2153,13.1682 L 26.5981,13.5862 L 26.521,12.4387 L 26.3667,11.4436 L 26.1753,10.5998 L 25.8687,9.8361 L 25.563,9.1855 L 25.1773,8.6113 L 24.7955,8.0761 L 24.3717,7.5751 L 23.9889,7.079 L 23.5649,6.584 L 23.1831,6.0459 L 22.8765,5.4326 L 22.5699,4.7441 L 22.3394,3.9785 L 22.145,3.0596 L 22.0698,2.0273 z "
4227 id="path1598"
4228 style="fill:url(#XMLID_74_);stroke-width:0.25400001" />
4229 <linearGradient
4230 id="linearGradient9091"
4231 gradientUnits="userSpaceOnUse"
4232 x1="-3584.2759"
4233 y1="-3602.0488"
4234 x2="-3568.1182"
4235 y2="-3550.4883"
4236 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
4237 <stop
4238 offset="0"
4239 style="stop-color:#D8E7EB"
4240 id="stop9093" />
4241 <stop
4242 offset="0.0684"
4243 style="stop-color:#D0DFE4"
4244 id="stop9095" />
4245 <stop
4246 offset="0.1761"
4247 style="stop-color:#B9CAD0"
4248 id="stop9097" />
4249 <stop
4250 offset="0.3096"
4251 style="stop-color:#94A7B0"
4252 id="stop9099" />
4253 <stop
4254 offset="0.4622"
4255 style="stop-color:#627784"
4256 id="stop9101" />
4257 <stop
4258 offset="0.5537"
4259 style="stop-color:#405766"
4260 id="stop9103" />
4261 <stop
4262 offset="0.6113"
4263 style="stop-color:#607682"
4264 id="stop9105" />
4265 <stop
4266 offset="0.6983"
4267 style="stop-color:#8B9EA8"
4268 id="stop9107" />
4269 <stop
4270 offset="0.7829"
4271 style="stop-color:#ADBEC5"
4272 id="stop9109" />
4273 <stop
4274 offset="0.8633"
4275 style="stop-color:#C5D5DA"
4276 id="stop9111" />
4277 <stop
4278 offset="0.9376"
4279 style="stop-color:#D3E2E7"
4280 id="stop9113" />
4281 <stop
4282 offset="1"
4283 style="stop-color:#D8E7EB"
4284 id="stop9115" />
4285 </linearGradient>
4286 <path
4287 d="M 26.981,14.2754 L 26.981,13.4707 L 27.0201,12.7061 L 27.0543,11.9415 L 27.0943,11.1739 L 27.1304,10.4454 L 27.2075,9.7188 L 27.2837,8.9922 L 27.3569,8.3008 L 27.4741,7.6123 L 27.5884,6.8848 L 27.7779,6.1953 L 27.9302,5.5058 L 28.1236,4.8163 L 28.3912,4.1268 L 28.6197,3.4373 L 28.9254,2.7478 L 27.814,3.8213 L 26.896,4.8555 L 26.2466,5.8145 L 25.7896,6.7344 L 25.481,7.6133 L 25.3296,8.418 L 25.2925,9.2237 L 25.3696,9.9503 L 25.5649,10.6398 L 25.7514,11.2912 L 26.0209,11.9045 L 26.2914,12.4377 L 26.5219,12.9748 L 26.7504,13.4338 L 26.9027,13.8928 L 26.981,14.2754 z "
4288 id="path1625"
4289 style="fill:url(#XMLID_75_);stroke-width:0.25400001" />
4290 <linearGradient
4291 id="linearGradient9118"
4292 gradientUnits="userSpaceOnUse"
4293 x1="-3577.7891"
4294 y1="-3604.082"
4295 x2="-3561.6313"
4296 y2="-3552.5215"
4297 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
4298 <stop
4299 offset="0"
4300 style="stop-color:#D8E7EB"
4301 id="stop9120" />
4302 <stop
4303 offset="0.0684"
4304 style="stop-color:#D0DFE4"
4305 id="stop9122" />
4306 <stop
4307 offset="0.1761"
4308 style="stop-color:#B9CAD0"
4309 id="stop9124" />
4310 <stop
4311 offset="0.3096"
4312 style="stop-color:#94A7B0"
4313 id="stop9126" />
4314 <stop
4315 offset="0.4622"
4316 style="stop-color:#627784"
4317 id="stop9128" />
4318 <stop
4319 offset="0.5537"
4320 style="stop-color:#405766"
4321 id="stop9130" />
4322 <stop
4323 offset="0.6113"
4324 style="stop-color:#607682"
4325 id="stop9132" />
4326 <stop
4327 offset="0.6983"
4328 style="stop-color:#8B9EA8"
4329 id="stop9134" />
4330 <stop
4331 offset="0.7829"
4332 style="stop-color:#ADBEC5"
4333 id="stop9136" />
4334 <stop
4335 offset="0.8633"
4336 style="stop-color:#C5D5DA"
4337 id="stop9138" />
4338 <stop
4339 offset="0.9376"
4340 style="stop-color:#D3E2E7"
4341 id="stop9140" />
4342 <stop
4343 offset="1"
4344 style="stop-color:#D8E7EB"
4345 id="stop9142" />
4346 </linearGradient>
4347 <path
4348 d="M 15.7612,16.9268 L 16.3745,17.0411 L 16.9888,17.1573 L 17.6001,17.2325 L 18.2134,17.3077 L 18.8228,17.3458 L 19.439,17.3829 L 20.0523,17.3829 L 20.6275,17.3829 L 21.2398,17.3438 L 21.814,17.3438 L 22.4273,17.2667 L 23.0005,17.2286 L 23.6138,17.1886 L 24.189,17.1134 L 24.8013,17.0343 L 25.3755,16.9591 L 24.8775,17.1876 L 24.3795,17.38 L 23.8443,17.6114 L 23.3072,17.8018 L 22.733,17.9952 L 22.1568,18.1466 L 21.5464,18.3029 L 20.936,18.3791 L 20.3237,18.4182 L 19.6723,18.4182 L 19.0209,18.3791 L 18.3666,18.2287 L 17.7152,18.0373 L 17.0638,17.7678 L 16.4154,17.3869 L 15.7612,16.9268 z "
4349 id="path1652"
4350 style="fill:url(#XMLID_76_);stroke-width:0.25400001" />
4351 <linearGradient
4352 id="linearGradient9145"
4353 gradientUnits="userSpaceOnUse"
4354 x1="-3576.8662"
4355 y1="-3604.3711"
4356 x2="-3560.7085"
4357 y2="-3552.8105"
4358 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
4359 <stop
4360 offset="0"
4361 style="stop-color:#D8E7EB"
4362 id="stop9147" />
4363 <stop
4364 offset="0.0684"
4365 style="stop-color:#D0DFE4"
4366 id="stop9149" />
4367 <stop
4368 offset="0.1761"
4369 style="stop-color:#B9CAD0"
4370 id="stop9151" />
4371 <stop
4372 offset="0.3096"
4373 style="stop-color:#94A7B0"
4374 id="stop9153" />
4375 <stop
4376 offset="0.4622"
4377 style="stop-color:#627784"
4378 id="stop9155" />
4379 <stop
4380 offset="0.5537"
4381 style="stop-color:#405766"
4382 id="stop9157" />
4383 <stop
4384 offset="0.6113"
4385 style="stop-color:#607682"
4386 id="stop9159" />
4387 <stop
4388 offset="0.6983"
4389 style="stop-color:#8B9EA8"
4390 id="stop9161" />
4391 <stop
4392 offset="0.7829"
4393 style="stop-color:#ADBEC5"
4394 id="stop9163" />
4395 <stop
4396 offset="0.8633"
4397 style="stop-color:#C5D5DA"
4398 id="stop9165" />
4399 <stop
4400 offset="0.9376"
4401 style="stop-color:#D3E2E7"
4402 id="stop9167" />
4403 <stop
4404 offset="1"
4405 style="stop-color:#D8E7EB"
4406 id="stop9169" />
4407 </linearGradient>
4408 <path
4409 d="M 19.3267,20.0264 L 18.8658,20.1045 L 18.4469,20.1797 L 17.986,20.2949 L 17.529,20.4101 L 17.0661,20.6044 L 16.61,20.7558 L 16.1891,20.9472 L 15.7291,21.1777 L 15.3082,21.4082 L 14.8873,21.6748 L 14.5045,21.9453 L 14.1188,22.2129 L 13.776,22.5195 L 13.4684,22.8261 L 13.1647,23.1327 L 12.8952,23.4765 L 13.1638,22.5595 L 13.5857,21.7519 L 14.0798,21.0644 L 14.695,20.413 L 15.3825,19.8769 L 16.1501,19.3769 L 16.9899,18.956 L 17.8717,18.6093 L 18.7906,18.3036 L 19.7095,17.997 L 20.6646,17.7656 L 21.6226,17.5351 L 22.5796,17.3417 L 23.4985,17.1522 L 24.3413,16.9979 L 25.1831,16.8055 L 24.7632,16.9588 L 24.189,17.2645 L 23.4986,17.7274 L 22.6578,18.2245 L 21.815,18.7597 L 20.9361,19.2587 L 20.0943,19.7187 L 19.3267,20.0264 z "
4410 id="path1679"
4411 style="fill:url(#XMLID_77_);stroke-width:0.25400001" />
4412 <linearGradient
4413 id="linearGradient9172"
4414 gradientUnits="userSpaceOnUse"
4415 x1="-3581.9321"
4416 y1="-3602.7837"
4417 x2="-3565.7744"
4418 y2="-3551.2231"
4419 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
4420 <stop
4421 offset="0"
4422 style="stop-color:#D8E7EB"
4423 id="stop9174" />
4424 <stop
4425 offset="0.0684"
4426 style="stop-color:#D0DFE4"
4427 id="stop9176" />
4428 <stop
4429 offset="0.1761"
4430 style="stop-color:#B9CAD0"
4431 id="stop9178" />
4432 <stop
4433 offset="0.3096"
4434 style="stop-color:#94A7B0"
4435 id="stop9180" />
4436 <stop
4437 offset="0.4622"
4438 style="stop-color:#627784"
4439 id="stop9182" />
4440 <stop
4441 offset="0.5537"
4442 style="stop-color:#405766"
4443 id="stop9184" />
4444 <stop
4445 offset="0.6113"
4446 style="stop-color:#607682"
4447 id="stop9186" />
4448 <stop
4449 offset="0.6983"
4450 style="stop-color:#8B9EA8"
4451 id="stop9188" />
4452 <stop
4453 offset="0.7829"
4454 style="stop-color:#ADBEC5"
4455 id="stop9190" />
4456 <stop
4457 offset="0.8633"
4458 style="stop-color:#C5D5DA"
4459 id="stop9192" />
4460 <stop
4461 offset="0.9376"
4462 style="stop-color:#D3E2E7"
4463 id="stop9194" />
4464 <stop
4465 offset="1"
4466 style="stop-color:#D8E7EB"
4467 id="stop9196" />
4468 </linearGradient>
4469 <path
4470 d="M 17.5698,26.8828 L 17.2251,26.0019 L 17.1079,25.1581 L 17.2241,24.3163 L 17.5307,23.5507 L 17.9877,22.8222 L 18.6,22.0956 L 19.3275,21.4052 L 20.1312,20.7909 L 20.974,20.1776 L 21.8529,19.6415 L 22.7718,19.1034 L 23.6556,18.6444 L 24.4564,18.2245 L 25.223,17.8378 L 25.8744,17.494 L 26.3705,17.1854 L 25.8383,18.2987 L 25.3002,19.2176 L 24.727,19.9451 L 24.1909,20.5974 L 23.6167,21.0954 L 23.0425,21.5544 L 22.4683,21.9001 L 21.8941,22.2458 L 21.3228,22.5915 L 20.7457,22.9733 L 20.1734,23.3571 L 19.6373,23.8171 L 19.1002,24.3903 L 18.565,25.0817 L 18.0679,25.8874 L 17.5698,26.8828 z "
4471 id="path1706"
4472 style="fill:url(#XMLID_78_);stroke-width:0.25400001" />
4473 <linearGradient
4474 id="linearGradient9199"
4475 gradientUnits="userSpaceOnUse"
4476 x1="-3586.4019"
4477 y1="-3601.3828"
4478 x2="-3570.2441"
4479 y2="-3549.8223"
4480 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
4481 <stop
4482 offset="0"
4483 style="stop-color:#D8E7EB"
4484 id="stop9201" />
4485 <stop
4486 offset="0.0684"
4487 style="stop-color:#D0DFE4"
4488 id="stop9203" />
4489 <stop
4490 offset="0.1761"
4491 style="stop-color:#B9CAD0"
4492 id="stop9205" />
4493 <stop
4494 offset="0.3096"
4495 style="stop-color:#94A7B0"
4496 id="stop9207" />
4497 <stop
4498 offset="0.4622"
4499 style="stop-color:#627784"
4500 id="stop9209" />
4501 <stop
4502 offset="0.5537"
4503 style="stop-color:#405766"
4504 id="stop9211" />
4505 <stop
4506 offset="0.6113"
4507 style="stop-color:#607682"
4508 id="stop9213" />
4509 <stop
4510 offset="0.6983"
4511 style="stop-color:#8B9EA8"
4512 id="stop9215" />
4513 <stop
4514 offset="0.7829"
4515 style="stop-color:#ADBEC5"
4516 id="stop9217" />
4517 <stop
4518 offset="0.8633"
4519 style="stop-color:#C5D5DA"
4520 id="stop9219" />
4521 <stop
4522 offset="0.9376"
4523 style="stop-color:#D3E2E7"
4524 id="stop9221" />
4525 <stop
4526 offset="1"
4527 style="stop-color:#D8E7EB"
4528 id="stop9223" />
4529 </linearGradient>
4530 <path
4531 d="M 27.2144,17.4531 L 26.7925,18.1054 L 26.3726,18.7577 L 25.9517,19.4081 L 25.5718,20.0214 L 25.2261,20.6347 L 24.8814,21.2841 L 24.5357,21.8984 L 24.231,22.5117 L 23.9273,23.164 L 23.6597,23.8144 L 23.3882,24.4277 L 23.1597,25.0781 L 22.9693,25.7676 L 22.7769,26.418 L 22.6246,27.1075 L 22.4723,27.838 L 22.0885,26.3829 L 21.8971,25.1183 L 21.8561,23.9689 L 21.9713,22.9748 L 22.1998,22.131 L 22.5455,21.3654 L 22.9654,20.7131 L 23.4234,20.176 L 23.9615,19.677 L 24.4967,19.2561 L 25.0699,18.9104 L 25.607,18.6047 L 26.1031,18.299 L 26.5631,18.0314 L 26.9049,17.7248 L 27.2144,17.4531 z "
4532 id="path1733"
4533 style="fill:url(#XMLID_79_);stroke-width:0.25400001" />
4534 </g><g
4535 id="g659">
4536 </g>
4537
4538 <clipPath
4539 id="clippath2"> <path
4540 d="M 206.512,283.82 L 206.512,283.82 L 204.824,284.09 L 203.254,284.129 L 201.801,283.977 L 200.422,283.633 L 199.16,283.098 L 198.008,282.445 L 196.859,281.68 L 195.824,280.801 L 194.789,279.883 L 193.832,278.887 L 192.871,277.816 L 191.953,276.785 L 190.996,275.75 L 190.035,274.719 L 189.078,273.762 L 188.082,272.883 L 188.082,272.883 L 188.465,272.918 L 188.887,272.918 L 189.27,272.957 L 189.691,272.992 L 190.113,273.031 L 190.535,273.109 L 190.953,273.145 L 191.375,273.184 L 191.793,273.223 L 192.258,273.262 L 192.676,273.34 L 193.137,273.375 L 193.598,273.414 L 194.055,273.449 L 194.516,273.449 L 194.973,273.488 L 194.973,273.488 L 195.32,274.023 L 195.742,274.676 L 196.277,275.441 L 196.852,276.242 L 197.543,277.125 L 198.273,278.004 L 199.039,278.922 L 199.844,279.801 L 200.688,280.645 L 201.527,281.449 L 202.41,282.176 L 203.254,282.789 L 204.098,283.285 L 204.941,283.629 L 205.742,283.82 L 206.512,283.82"
4541 id="path666" />
4542 </clipPath>
4543 <g
4544 id="g668">
4545 </g>
4546
4547 <clipPath
4548 id="clippath3"> <path
4549 d="M 209.652,283.855 L 209.652,283.855 L 208.695,283.973 L 207.773,284.047 L 206.895,284.012 L 206.012,283.855 L 205.133,283.668 L 204.289,283.324 L 203.449,282.902 L 202.566,282.328 L 201.684,281.676 L 200.801,280.914 L 199.918,279.992 L 199,279 L 198.043,277.812 L 197.043,276.512 L 196.012,275.098 L 194.938,273.488 L 194.938,273.488 L 195.59,273.488 L 196.203,273.527 L 196.816,273.602 L 197.426,273.676 L 198.039,273.758 L 198.652,273.828 L 199.34,273.828 L 200.066,273.828 L 200.066,273.828 L 200.684,274.441 L 201.258,275.09 L 201.832,275.707 L 202.406,276.395 L 202.98,277.043 L 203.52,277.734 L 204.094,278.383 L 204.633,279.074 L 205.207,279.723 L 205.781,280.371 L 206.391,281.023 L 206.969,281.637 L 207.617,282.211 L 208.273,282.785 L 208.922,283.355 L 209.652,283.855"
4550 id="path675" />
4551 </clipPath>
4552 <g
4553 id="g677">
4554 </g>
4555
4556 <clipPath
4557 id="clippath4"> <path
4558 d="M 212.98,283.699 L 212.98,283.699 L 212.254,284.086 L 211.453,284.238 L 210.645,284.121 L 209.805,283.781 L 208.961,283.281 L 208.078,282.59 L 207.199,281.828 L 206.316,280.91 L 205.473,279.953 L 204.59,278.957 L 203.785,277.961 L 202.98,277.004 L 202.215,276.047 L 201.484,275.207 L 200.797,274.48 L 200.184,273.863 L 204.281,274.246 L 204.281,274.246 L 204.742,274.859 L 205.242,275.469 L 205.699,276.086 L 206.16,276.695 L 206.617,277.309 L 207.117,277.922 L 207.578,278.535 L 208.078,279.145 L 208.613,279.719 L 209.152,280.332 L 209.688,280.91 L 210.301,281.477 L 210.91,282.051 L 211.566,282.629 L 212.254,283.164 L 212.98,283.699"
4559 id="path684" />
4560 </clipPath>
4561 <g
4562 id="g686">
4563 </g>
4564
4565 <clipPath
4566 id="clippath5"> <path
4567 d="M 219.762,282.812 L 219.762,282.812 L 218.574,283.578 L 217.465,284.078 L 216.352,284.309 L 215.277,284.309 L 214.246,284.117 L 213.25,283.777 L 212.293,283.238 L 211.336,282.551 L 210.414,281.711 L 209.492,280.832 L 208.652,279.793 L 207.77,278.762 L 206.926,277.652 L 206.082,276.508 L 205.277,275.359 L 204.473,274.246 L 204.473,274.246 L 205.16,274.363 L 205.699,274.398 L 206.082,274.434 L 206.426,274.477 L 206.77,274.516 L 207.113,274.512 L 207.613,274.551 L 208.266,274.586 L 208.266,274.586 L 208.992,275.621 L 209.723,276.578 L 210.449,277.496 L 211.254,278.301 L 212.02,279.027 L 212.785,279.719 L 213.59,280.328 L 214.359,280.863 L 215.125,281.324 L 215.891,281.742 L 216.617,282.047 L 217.309,282.316 L 217.996,282.547 L 218.609,282.699 L 219.223,282.773 L 219.762,282.812"
4568 id="path693" />
4569 </clipPath>
4570 <g
4571 id="g695">
4572 </g>
4573
4574 <clipPath
4575 id="clippath6"> <path
4576 d="M 223.324,282.578 L 223.324,282.578 L 222.094,282.73 L 220.91,282.812 L 219.762,282.734 L 218.652,282.582 L 217.613,282.355 L 216.582,282.012 L 215.621,281.59 L 214.703,281.055 L 213.785,280.48 L 212.941,279.832 L 212.098,279.105 L 211.293,278.301 L 210.527,277.461 L 209.797,276.543 L 209.07,275.582 L 208.379,274.59 L 208.379,274.59 L 209.145,274.629 L 209.988,274.703 L 210.789,274.777 L 211.598,274.816 L 212.285,274.895 L 212.859,274.926 L 213.281,274.969 L 213.473,275.004 L 213.473,275.004 L 213.551,275.082 L 213.742,275.273 L 214.047,275.578 L 214.43,275.965 L 214.852,276.422 L 215.391,276.957 L 215.965,277.531 L 216.613,278.145 L 217.348,278.754 L 218.109,279.406 L 218.879,280.02 L 219.719,280.633 L 220.598,281.203 L 221.48,281.738 L 222.402,282.199 L 223.324,282.578"
4577 id="path702" />
4578 </clipPath>
4579 <g
4580 id="g704">
4581 </g>
4582
4583 <clipPath
4584 id="clippath7"> <path
4585 d="M 228.031,282.844 L 228.031,282.844 L 226.996,283.039 L 226,283.113 L 224.969,283 L 223.934,282.809 L 222.898,282.469 L 221.906,282.043 L 220.91,281.547 L 219.91,280.934 L 218.992,280.289 L 218.074,279.598 L 217.191,278.871 L 216.348,278.105 L 215.543,277.301 L 214.777,276.539 L 214.086,275.77 L 213.434,275.004 L 213.434,275.004 L 213.895,275.004 L 214.469,275.043 L 215.082,275.082 L 215.73,275.082 L 216.348,275.152 L 216.961,275.195 L 217.457,275.234 L 217.879,275.309 L 217.879,275.309 L 217.879,275.387 L 218.031,275.574 L 218.414,275.922 L 218.949,276.344 L 219.602,276.879 L 220.367,277.488 L 221.211,278.145 L 222.094,278.789 L 223.012,279.477 L 223.93,280.168 L 224.813,280.82 L 225.656,281.395 L 226.422,281.926 L 227.113,282.348 L 227.648,282.652 L 228.031,282.844"
4586 id="path711" />
4587 </clipPath>
4588 <g
4589 id="g713">
4590 </g>
4591
4592 <clipPath
4593 id="clippath8"> <path
4594 d="M 233.238,282.801 L 233.238,282.801 L 232.629,283.414 L 231.824,283.68 L 230.828,283.684 L 229.715,283.418 L 228.488,282.957 L 227.227,282.309 L 225.887,281.543 L 224.582,280.703 L 223.281,279.785 L 222.055,278.867 L 220.906,277.988 L 219.906,277.145 L 219.027,276.457 L 218.375,275.883 L 217.953,275.504 L 217.762,275.348 L 217.762,275.348 L 218.258,275.387 L 218.797,275.422 L 219.371,275.461 L 219.945,275.5 L 220.559,275.574 L 221.172,275.613 L 221.785,275.652 L 222.434,275.691 L 222.434,275.691 L 223.125,276.109 L 223.813,276.531 L 224.504,276.988 L 225.156,277.41 L 225.844,277.867 L 226.496,278.363 L 227.184,278.824 L 227.84,279.285 L 228.527,279.781 L 229.18,280.238 L 229.867,280.699 L 230.52,281.156 L 231.207,281.578 L 231.898,282 L 232.551,282.418 L 233.238,282.801"
4595 id="path720" />
4596 </clipPath>
4597 <g
4598 id="g722">
4599 </g>
4600
4601 <clipPath
4602 id="clippath9"> <path
4603 d="M 237.07,282.875 L 237.07,282.875 L 236.418,283.105 L 235.77,283.223 L 235.117,283.258 L 234.465,283.148 L 233.813,282.953 L 233.125,282.687 L 232.398,282.305 L 231.629,281.848 L 230.785,281.309 L 229.906,280.699 L 228.949,280.012 L 227.875,279.285 L 226.688,278.48 L 225.422,277.602 L 224.043,276.68 L 222.512,275.727 L 222.512,275.727 L 223.164,275.766 L 223.891,275.801 L 224.539,275.801 L 225.191,275.84 L 225.766,275.879 L 226.223,275.914 L 226.531,275.953 L 226.648,275.988 L 226.648,275.988 L 226.723,276.105 L 226.953,276.336 L 227.301,276.645 L 227.758,277.062 L 228.332,277.559 L 228.984,278.137 L 229.676,278.707 L 230.48,279.32 L 231.285,279.93 L 232.129,280.543 L 232.969,281.078 L 233.852,281.613 L 234.695,282.074 L 235.535,282.457 L 236.34,282.723 L 237.07,282.875"
4604 id="path729" />
4605 </clipPath>
4606 <g
4607 id="g731">
4608 </g>
4609
4610 <clipPath
4611 id="clippath10"> <path
4612 d="M 239.863,282.453 L 239.863,282.453 L 238.984,282.797 L 238.027,282.949 L 237.031,282.879 L 235.996,282.609 L 234.922,282.227 L 233.891,281.727 L 232.816,281.121 L 231.82,280.43 L 230.863,279.742 L 229.941,279.016 L 229.098,278.328 L 228.371,277.637 L 227.719,277.062 L 227.223,276.562 L 226.84,276.223 L 226.609,275.988 L 226.609,275.988 L 227.223,276.027 L 227.797,276.07 L 228.371,276.105 L 228.945,276.18 L 229.48,276.223 L 230.059,276.258 L 230.668,276.332 L 231.281,276.371 L 231.281,276.371 L 231.738,276.793 L 232.242,277.211 L 232.699,277.633 L 233.199,278.094 L 233.695,278.59 L 234.195,279.051 L 234.73,279.508 L 235.27,279.965 L 235.805,280.391 L 236.34,280.809 L 236.914,281.191 L 237.449,281.535 L 238.063,281.844 L 238.637,282.105 L 239.25,282.301 L 239.863,282.453"
4613 id="path738" />
4614 </clipPath>
4615 <g
4616 id="g740">
4617 </g>
4618
4619 <clipPath
4620 id="clippath11"> <path
4621 d="M 243.23,281.379 L 243.23,281.379 L 242.469,281.914 L 241.703,282.258 L 240.938,282.414 L 240.172,282.453 L 239.402,282.301 L 238.637,282.07 L 237.871,281.727 L 237.105,281.27 L 236.379,280.77 L 235.613,280.16 L 234.883,279.547 L 234.156,278.895 L 233.426,278.246 L 232.738,277.555 L 232.047,276.906 L 231.355,276.293 L 231.355,276.293 L 231.781,276.332 L 232.238,276.41 L 232.734,276.445 L 233.273,276.484 L 233.848,276.559 L 234.461,276.598 L 235.109,276.672 L 235.723,276.711 L 235.723,276.711 L 235.914,277.172 L 236.184,277.629 L 236.453,278.051 L 236.797,278.473 L 237.18,278.855 L 237.602,279.238 L 238.023,279.617 L 238.52,279.926 L 239.02,280.23 L 239.555,280.5 L 240.129,280.77 L 240.707,280.961 L 241.32,281.113 L 241.93,281.266 L 242.582,281.34 L 243.23,281.379"
4622 id="path747" />
4623 </clipPath>
4624 <g
4625 id="g749">
4626 </g>
4627
4628 <clipPath
4629 id="clippath12"> <path
4630 d="M 246.262,280.801 L 246.262,280.801 L 245.605,281.07 L 244.918,281.262 L 244.191,281.379 L 243.387,281.379 L 242.617,281.34 L 241.777,281.227 L 240.973,280.996 L 240.168,280.77 L 239.402,280.422 L 238.672,280.039 L 238.023,279.617 L 237.41,279.125 L 236.871,278.586 L 236.414,278.016 L 236.07,277.363 L 235.84,276.711 L 235.84,276.711 L 236.297,276.75 L 236.758,276.789 L 237.254,276.863 L 237.754,276.902 L 238.25,276.941 L 238.785,276.977 L 239.359,277.016 L 240.016,277.051 L 240.016,277.051 L 240.281,277.434 L 240.59,277.82 L 240.895,278.164 L 241.199,278.469 L 241.508,278.777 L 241.855,279.004 L 242.199,279.273 L 242.543,279.504 L 242.926,279.695 L 243.348,279.883 L 243.77,280.074 L 244.227,280.227 L 244.688,280.383 L 245.184,280.535 L 245.723,280.648 L 246.262,280.801"
4631 id="path756" />
4632 </clipPath>
4633 <g
4634 id="g758">
4635 </g>
4636
4637 <clipPath
4638 id="clippath13"> <path
4639 d="M 247.828,280.266 L 247.828,280.266 L 247.598,280.492 L 247.254,280.648 L 246.832,280.727 L 246.371,280.727 L 245.836,280.687 L 245.262,280.574 L 244.648,280.383 L 244.035,280.191 L 243.422,279.922 L 242.809,279.613 L 242.199,279.273 L 241.66,278.891 L 241.121,278.469 L 240.664,278.012 L 240.32,277.551 L 240.016,277.051 L 240.016,277.051 L 240.355,277.094 L 240.664,277.129 L 241.047,277.168 L 241.395,277.207 L 241.734,277.242 L 242.043,277.281 L 242.309,277.281 L 242.578,277.281 L 242.578,277.281 L 242.578,277.434 L 242.922,277.738 L 243.5,278.16 L 244.266,278.66 L 245.184,279.156 L 246.105,279.652 L 247.023,280.035 L 247.828,280.266"
4640 id="path765" />
4641 </clipPath>
4642 <g
4643 id="g767">
4644 </g>
4645
4646 <clipPath
4647 id="clippath14"> <path
4648 d="M 248.938,279.191 L 248.938,279.191 L 248.859,279.727 L 248.668,280.07 L 248.363,280.227 L 247.945,280.301 L 247.445,280.227 L 246.871,280.035 L 246.297,279.805 L 245.684,279.5 L 245.07,279.156 L 244.496,278.773 L 243.918,278.395 L 243.461,278.047 L 243.039,277.738 L 242.73,277.512 L 242.539,277.359 L 242.504,277.281 L 245.488,277.621 L 245.488,277.621 L 245.719,277.773 L 246.102,277.969 L 246.563,278.199 L 247.063,278.426 L 247.598,278.695 L 248.094,278.883 L 248.555,279.078 L 248.938,279.191"
4649 id="path774" />
4650 </clipPath>
4651 <g
4652 id="g776">
4653 </g>
4654
4655 <clipPath
4656 id="clippath15"> <path
4657 d="M 245.488,277.621 L 245.488,277.621 L 246.141,277.66 L 246.715,277.699 L 247.25,277.738 L 247.75,277.773 L 248.211,277.812 L 248.707,277.852 L 249.203,277.926 L 249.781,278.043 L 249.781,278.043 L 249.664,278.77 L 249.32,279.113 L 248.785,279.113 L 248.094,278.926 L 247.406,278.578 L 246.676,278.195 L 246.027,277.852 L 245.488,277.621"
4658 id="path783" />
4659 </clipPath>
4660 <g
4661 id="g785">
4662 </g>
4663
4664 <clipPath
4665 id="clippath16"> <path
4666 d="M 245.602,277.242 L 245.602,277.242 L 246.258,277.355 L 246.832,277.43 L 247.328,277.547 L 247.828,277.621 L 248.285,277.734 L 248.742,277.852 L 249.281,277.965 L 249.855,278.117 L 249.855,278.117 L 249.895,277.277 L 249.547,276.738 L 249.012,276.547 L 248.285,276.512 L 247.52,276.664 L 246.754,276.859 L 246.102,277.09 L 245.602,277.242"
4667 id="path792" />
4668 </clipPath>
4669 <g
4670 id="g794">
4671 </g>
4672
4673 <clipPath
4674 id="clippath17"> <path
4675 d="M 205.766,261.684 L 205.766,261.684 L 204.082,261.035 L 202.469,260.613 L 200.977,260.504 L 199.563,260.578 L 198.184,260.887 L 196.922,261.348 L 195.695,261.996 L 194.508,262.727 L 193.359,263.57 L 192.246,264.488 L 191.141,265.449 L 190.066,266.445 L 188.996,267.402 L 187.926,268.324 L 186.852,269.207 L 185.742,270.012 L 185.742,270.012 L 186.129,270.086 L 186.547,270.121 L 186.969,270.164 L 187.352,270.238 L 187.773,270.277 L 188.195,270.316 L 188.656,270.352 L 189.074,270.391 L 189.496,270.426 L 189.957,270.469 L 190.414,270.543 L 190.84,270.582 L 191.297,270.621 L 191.758,270.695 L 192.254,270.73 L 192.715,270.809 L 192.715,270.809 L 193.098,270.273 L 193.594,269.621 L 194.168,268.855 L 194.855,268.09 L 195.582,267.246 L 196.387,266.402 L 197.266,265.559 L 198.148,264.754 L 199.105,263.988 L 200.059,263.301 L 201.016,262.684 L 202.016,262.187 L 202.973,261.801 L 203.93,261.574 L 204.887,261.535 L 205.766,261.684"
4676 id="path801" />
4677 </clipPath>
4678 <g
4679 id="g803">
4680 </g>
4681
4682 <clipPath
4683 id="clippath18"> <path
4684 d="M 208.641,262.332 L 208.641,262.332 L 207.68,261.988 L 206.762,261.762 L 205.883,261.609 L 204.961,261.531 L 204.082,261.609 L 203.16,261.801 L 202.281,262.07 L 201.363,262.492 L 200.406,263.031 L 199.449,263.723 L 198.453,264.523 L 197.418,265.484 L 196.313,266.594 L 195.203,267.859 L 194.016,269.277 L 192.754,270.848 L 192.754,270.848 L 193.441,270.887 L 194.055,270.961 L 194.629,270.996 L 195.164,271.074 L 195.703,271.152 L 196.273,271.227 L 196.926,271.34 L 197.691,271.457 L 197.691,271.457 L 198.383,270.883 L 199.031,270.27 L 199.723,269.617 L 200.336,268.965 L 200.984,268.312 L 201.633,267.664 L 202.246,267.012 L 202.898,266.359 L 203.547,265.746 L 204.234,265.172 L 204.887,264.598 L 205.578,264.023 L 206.301,263.523 L 207.07,263.102 L 207.836,262.68 L 208.641,262.332"
4685 id="path810" />
4686 </clipPath>
4687 <g
4688 id="g812">
4689 </g>
4690
4691 <clipPath
4692 id="clippath19"> <path
4693 d="M 211.855,263.176 L 211.855,263.176 L 211.129,262.562 L 210.32,262.258 L 209.441,262.18 L 208.563,262.371 L 207.605,262.758 L 206.648,263.293 L 205.652,263.984 L 204.656,264.789 L 203.66,265.668 L 202.668,266.59 L 201.711,267.512 L 200.793,268.469 L 199.953,269.348 L 199.109,270.152 L 198.383,270.844 L 197.691,271.418 L 201.867,271.914 L 201.867,271.914 L 202.402,271.301 L 202.941,270.684 L 203.438,270.074 L 203.973,269.461 L 204.508,268.848 L 205.043,268.195 L 205.613,267.582 L 206.152,267.008 L 206.766,266.43 L 207.375,265.859 L 208.027,265.32 L 208.715,264.824 L 209.445,264.324 L 210.211,263.902 L 211.012,263.52 L 211.855,263.176"
4694 id="path819" />
4695 </clipPath>
4696 <g
4697 id="g821">
4698 </g>
4699
4700 <clipPath
4701 id="clippath20"> <path
4702 d="M 218.137,265.852 L 218.137,265.852 L 216.988,264.738 L 215.879,263.937 L 214.805,263.402 L 213.73,263.098 L 212.66,263.059 L 211.625,263.215 L 210.594,263.598 L 209.598,264.098 L 208.602,264.785 L 207.648,265.59 L 206.652,266.512 L 205.73,267.508 L 204.773,268.543 L 203.855,269.652 L 202.941,270.766 L 202.02,271.875 L 202.02,271.875 L 202.75,271.914 L 203.359,271.949 L 203.859,271.988 L 204.32,272.066 L 204.777,272.141 L 205.277,272.215 L 205.848,272.293 L 206.539,272.406 L 206.539,272.406 L 207.344,271.371 L 208.109,270.453 L 208.91,269.609 L 209.676,268.883 L 210.48,268.27 L 211.246,267.73 L 211.973,267.27 L 212.738,266.891 L 213.469,266.582 L 214.156,266.352 L 214.883,266.16 L 215.574,266.008 L 216.223,265.926 L 216.875,265.852 L 217.523,265.852 L 218.137,265.852"
4703 id="path828" />
4704 </clipPath>
4705 <g
4706 id="g830">
4707 </g>
4708
4709 <clipPath
4710 id="clippath21"> <path
4711 d="M 222.621,266.883 L 222.621,266.883 L 221.352,266.422 L 220.168,266.078 L 218.98,265.887 L 217.871,265.809 L 216.762,265.809 L 215.688,265.965 L 214.652,266.199 L 213.656,266.543 L 212.66,266.965 L 211.742,267.5 L 210.785,268.113 L 209.91,268.801 L 209.027,269.57 L 208.188,270.414 L 207.383,271.336 L 206.578,272.332 L 206.578,272.332 L 207.383,272.445 L 208.301,272.555 L 209.223,272.672 L 210.141,272.789 L 210.984,272.902 L 211.633,272.98 L 212.133,273.016 L 212.324,273.016 L 212.324,273.016 L 212.398,272.937 L 212.59,272.785 L 212.859,272.52 L 213.203,272.172 L 213.625,271.754 L 214.121,271.328 L 214.695,270.832 L 215.348,270.293 L 216.074,269.797 L 216.84,269.258 L 217.68,268.758 L 218.598,268.266 L 219.52,267.84 L 220.512,267.457 L 221.547,267.109 L 222.621,266.883"
4712 id="path837" />
4713 </clipPath>
4714 <g
4715 id="g839">
4716 </g>
4717
4718 <clipPath
4719 id="clippath22"> <path
4720 d="M 228.285,267.566 L 228.285,267.566 L 227.254,267.148 L 226.219,266.84 L 225.109,266.727 L 224.035,266.727 L 222.926,266.844 L 221.816,267.074 L 220.742,267.418 L 219.633,267.84 L 218.598,268.34 L 217.566,268.875 L 216.57,269.527 L 215.578,270.176 L 214.695,270.871 L 213.816,271.598 L 213.012,272.324 L 212.285,273.055 L 212.285,273.055 L 212.781,273.094 L 213.395,273.168 L 214.086,273.242 L 214.813,273.32 L 215.539,273.395 L 216.191,273.473 L 216.766,273.508 L 217.188,273.508 L 217.188,273.508 L 217.188,273.43 L 217.379,273.238 L 217.762,272.973 L 218.336,272.59 L 219.023,272.168 L 219.828,271.668 L 220.746,271.133 L 221.703,270.598 L 222.66,270.059 L 223.656,269.52 L 224.652,269.023 L 225.57,268.566 L 226.414,268.18 L 227.176,267.871 L 227.793,267.68 L 228.285,267.566"
4721 id="path846" />
4722 </clipPath>
4723 <g
4724 id="g848">
4725 </g>
4726
4727 <clipPath
4728 id="clippath23"> <path
4729 d="M 233.648,268.672 L 233.648,268.672 L 233.113,267.832 L 232.309,267.332 L 231.313,267.145 L 230.164,267.145 L 228.902,267.414 L 227.523,267.836 L 226.066,268.41 L 224.652,269.062 L 223.195,269.793 L 221.855,270.555 L 220.594,271.289 L 219.445,272.016 L 218.488,272.629 L 217.762,273.09 L 217.266,273.43 L 217.035,273.547 L 217.035,273.547 L 217.57,273.664 L 218.18,273.738 L 218.836,273.812 L 219.523,273.852 L 220.25,273.93 L 220.98,274.004 L 221.668,274.117 L 222.355,274.234 L 222.355,274.234 L 223.086,273.891 L 223.773,273.543 L 224.504,273.16 L 225.191,272.777 L 225.879,272.43 L 226.57,272.047 L 227.297,271.664 L 227.984,271.281 L 228.672,270.93 L 229.359,270.551 L 230.09,270.207 L 230.777,269.902 L 231.469,269.551 L 232.195,269.246 L 232.922,268.937 L 233.648,268.672"
4730 id="path855" />
4731 </clipPath>
4732 <g
4733 id="g857">
4734 </g>
4735
4736 <clipPath
4737 id="clippath24"> <path
4738 d="M 240.504,270.504 L 240.504,270.504 L 239.66,269.895 L 238.707,269.547 L 237.711,269.398 L 236.641,269.398 L 235.527,269.59 L 234.379,269.934 L 233.266,270.355 L 232.16,270.855 L 231.121,271.43 L 230.09,272.008 L 229.172,272.621 L 228.367,273.156 L 227.641,273.656 L 227.066,274.113 L 226.648,274.422 L 226.418,274.613 L 226.418,274.613 L 227.031,274.687 L 227.605,274.766 L 228.098,274.844 L 228.602,274.879 L 229.137,274.918 L 229.633,274.992 L 230.211,275.07 L 230.82,275.145 L 230.82,275.145 L 231.355,274.801 L 231.895,274.453 L 232.469,274.035 L 233.039,273.652 L 233.652,273.23 L 234.266,272.809 L 234.879,272.387 L 235.488,272.004 L 236.105,271.617 L 236.754,271.309 L 237.406,271.008 L 238.02,270.773 L 238.664,270.586 L 239.281,270.469 L 239.895,270.469 L 240.504,270.504"
4739 id="path864" />
4740 </clipPath>
4741 <g
4742 id="g866">
4743 </g>
4744
4745 <clipPath
4746 id="clippath25"> <path
4747 d="M 237.977,269.398 L 237.977,269.398 L 237.324,268.977 L 236.637,268.672 L 235.984,268.516 L 235.297,268.441 L 234.605,268.48 L 233.879,268.633 L 233.074,268.902 L 232.234,269.207 L 231.316,269.633 L 230.355,270.09 L 229.285,270.668 L 228.137,271.281 L 226.871,271.93 L 225.496,272.621 L 224.004,273.387 L 222.355,274.156 L 222.355,274.156 L 222.969,274.195 L 223.66,274.27 L 224.348,274.348 L 225.039,274.461 L 225.652,274.539 L 226.109,274.613 L 226.457,274.652 L 226.609,274.613 L 226.609,274.613 L 226.684,274.535 L 226.953,274.348 L 227.297,274.039 L 227.758,273.695 L 228.332,273.273 L 229.02,272.812 L 229.746,272.312 L 230.551,271.812 L 231.43,271.316 L 232.313,270.82 L 233.266,270.395 L 234.223,270.012 L 235.184,269.707 L 236.141,269.512 L 237.059,269.398 L 237.977,269.398"
4748 id="path873" />
4749 </clipPath>
4750 <g
4751 id="g875">
4752 </g>
4753
4754 <clipPath
4755 id="clippath26"> <path
4756 d="M 243.57,272.457 L 243.57,272.457 L 242.879,271.691 L 242.156,271.113 L 241.426,270.734 L 240.66,270.504 L 239.895,270.469 L 239.09,270.543 L 238.246,270.734 L 237.441,271.008 L 236.602,271.426 L 235.797,271.848 L 234.957,272.383 L 234.113,272.922 L 233.27,273.461 L 232.469,274.035 L 231.66,274.613 L 230.859,275.109 L 230.859,275.109 L 231.32,275.184 L 231.816,275.223 L 232.391,275.34 L 233.004,275.41 L 233.652,275.488 L 234.344,275.562 L 234.992,275.641 L 235.648,275.715 L 235.648,275.715 L 235.875,275.219 L 236.145,274.797 L 236.449,274.375 L 236.832,273.992 L 237.215,273.648 L 237.637,273.379 L 238.137,273.109 L 238.633,272.879 L 239.168,272.687 L 239.738,272.535 L 240.316,272.418 L 240.93,272.34 L 241.578,272.301 L 242.23,272.305 L 242.883,272.379 L 243.57,272.457"
4757 id="path882" />
4758 </clipPath>
4759 <g
4760 id="g884">
4761 </g>
4762
4763 <clipPath
4764 id="clippath27"> <path
4765 d="M 248.285,274.977 L 248.285,274.977 L 248.09,274.672 L 247.785,274.406 L 247.363,274.176 L 246.906,274.023 L 246.367,273.91 L 245.758,273.871 L 245.145,273.871 L 244.453,273.91 L 243.801,274.023 L 243.113,274.18 L 242.461,274.41 L 241.852,274.676 L 241.273,274.984 L 240.738,275.367 L 240.32,275.789 L 239.938,276.25 L 239.938,276.25 L 240.281,276.328 L 240.703,276.402 L 241.121,276.48 L 241.543,276.551 L 241.926,276.59 L 242.234,276.633 L 242.465,276.633 L 242.539,276.633 L 242.539,276.633 L 242.695,276.551 L 243.074,276.324 L 243.727,276.016 L 244.492,275.672 L 245.371,275.363 L 246.367,275.098 L 247.324,274.941 L 248.285,274.977"
4766 id="path891" />
4767 </clipPath>
4768 <g
4769 id="g893">
4770 </g>
4771
4772 <clipPath
4773 id="clippath28"> <path
4774 d="M 249.203,276.547 L 249.203,276.547 L 249.238,275.937 L 249.09,275.477 L 248.816,275.172 L 248.398,274.98 L 247.902,274.902 L 247.324,274.941 L 246.711,275.02 L 246.023,275.172 L 245.371,275.402 L 244.723,275.633 L 244.109,275.863 L 243.574,276.094 L 243.113,276.324 L 242.73,276.48 L 242.539,276.633 L 242.461,276.668 L 245.527,277.199 L 245.527,277.199 L 245.797,277.09 L 246.219,276.973 L 246.676,276.816 L 247.215,276.664 L 247.75,276.547 L 248.285,276.473 L 248.781,276.473 L 249.203,276.547"
4775 id="path900" />
4776 </clipPath>
4777 <g
4778 id="g902">
4779 </g>
4780
4781 <clipPath
4782 id="clippath29"> <path
4783 d="M 246.672,273.984 L 246.672,273.984 L 246.063,273.488 L 245.41,273.105 L 244.68,272.762 L 243.879,272.531 L 243.07,272.379 L 242.27,272.266 L 241.426,272.27 L 240.586,272.34 L 239.781,272.496 L 238.977,272.766 L 238.246,273.074 L 237.559,273.457 L 236.91,273.918 L 236.371,274.453 L 235.953,275.066 L 235.609,275.754 L 235.609,275.754 L 236.105,275.793 L 236.566,275.871 L 237.066,275.906 L 237.602,275.984 L 238.137,276.059 L 238.711,276.098 L 239.324,276.215 L 239.973,276.285 L 239.973,276.285 L 240.203,275.945 L 240.473,275.637 L 240.777,275.367 L 241.121,275.102 L 241.504,274.867 L 241.926,274.641 L 242.348,274.445 L 242.805,274.297 L 243.266,274.141 L 243.762,274.027 L 244.223,273.949 L 244.723,273.91 L 245.219,273.871 L 245.715,273.871 L 246.215,273.91 L 246.672,273.984"
4784 id="path909" />
4785 </clipPath>
4786 <g
4787 id="g911">
4788 </g>
4789
4790 <clipPath
4791 id="clippath30"> <path
4792 d="M 110.332,255.668 L 110.332,255.668 L 110.258,255.785 L 110.063,256.094 L 109.797,256.59 L 109.453,257.203 L 109.066,257.93 L 108.684,258.734 L 108.34,259.656 L 108.035,260.574 L 107.809,261.531 L 107.73,262.453 L 107.773,263.367 L 107.965,264.211 L 108.387,264.98 L 109.035,265.664 L 109.957,266.199 L 111.145,266.586 L 111.145,266.586 L 110.953,266.008 L 110.836,265.395 L 110.723,264.785 L 110.645,264.094 L 110.605,263.406 L 110.605,262.68 L 110.645,261.988 L 110.68,261.262 L 110.758,260.531 L 110.832,259.805 L 110.949,259.117 L 111.098,258.465 L 111.215,257.812 L 111.367,257.238 L 111.555,256.664 L 111.711,256.168 L 111.711,256.168 L 111.52,256.09 L 111.363,256.051 L 111.176,255.977 L 111.02,255.937 L 110.863,255.863 L 110.715,255.824 L 110.523,255.742 L 110.332,255.668"
4793 id="path918" />
4794 </clipPath>
4795 <g
4796 id="g920">
4797 </g>
4798
4799 <clipPath
4800 id="clippath31"> <path
4801 d="M 111.633,256.203 L 111.633,256.203 L 111.098,258.312 L 110.719,260.227 L 110.527,261.871 L 110.492,263.328 L 110.57,264.594 L 110.797,265.664 L 111.105,266.586 L 111.453,267.348 L 111.91,267.961 L 112.41,268.457 L 112.906,268.805 L 113.402,269.07 L 113.941,269.262 L 114.402,269.34 L 114.82,269.375 L 115.207,269.375 L 115.207,269.375 L 114.977,268.762 L 114.746,268.148 L 114.555,267.461 L 114.398,266.773 L 114.246,266.004 L 114.129,265.242 L 114.012,264.477 L 113.938,263.672 L 113.898,262.867 L 113.895,262.023 L 113.938,261.219 L 114.012,260.379 L 114.125,259.574 L 114.281,258.73 L 114.469,257.965 L 114.699,257.16 L 114.699,257.16 L 114.238,257.008 L 113.777,256.855 L 113.355,256.703 L 112.938,256.586 L 112.59,256.473 L 112.211,256.359 L 111.902,256.281 L 111.633,256.203"
4802 id="path927" />
4803 </clipPath>
4804 <g
4805 id="g929">
4806 </g>
4807
4808 <clipPath
4809 id="clippath32"> <path
4810 d="M 114.738,257.16 L 114.738,257.16 L 114.508,257.926 L 114.281,258.883 L 114.125,259.957 L 114.051,261.145 L 114.012,262.445 L 114.012,263.746 L 114.129,265.047 L 114.32,266.312 L 114.594,267.5 L 114.938,268.609 L 115.438,269.566 L 116.008,270.332 L 116.738,270.906 L 117.582,271.211 L 118.539,271.25 L 119.648,270.945 L 119.648,270.945 L 119.227,270.254 L 118.805,269.527 L 118.461,268.762 L 118.152,268.035 L 117.922,267.23 L 117.691,266.465 L 117.539,265.66 L 117.387,264.855 L 117.309,264.051 L 117.27,263.211 L 117.27,262.406 L 117.344,261.562 L 117.418,260.723 L 117.57,259.914 L 117.723,259.07 L 117.953,258.27 L 117.953,258.27 L 117.57,258.113 L 117.113,257.965 L 116.652,257.809 L 116.191,257.656 L 115.73,257.504 L 115.348,257.352 L 115.004,257.234 L 114.738,257.16"
4811 id="path936" />
4812 </clipPath>
4813 <g
4814 id="g938">
4815 </g>
4816
4817 <clipPath
4818 id="clippath33"> <path
4819 d="M 124.477,273.043 L 124.477,273.043 L 123.441,273.121 L 122.445,272.969 L 121.566,272.59 L 120.762,272.051 L 119.992,271.324 L 119.344,270.445 L 118.805,269.449 L 118.305,268.34 L 117.922,267.152 L 117.613,265.887 L 117.422,264.586 L 117.344,263.285 L 117.344,261.984 L 117.457,260.68 L 117.688,259.418 L 118.031,258.23 L 118.031,258.23 L 118.453,258.383 L 118.875,258.535 L 119.332,258.648 L 119.793,258.801 L 120.254,258.957 L 120.75,259.109 L 121.211,259.262 L 121.707,259.414 L 121.707,259.414 L 121.707,260.219 L 121.707,261.098 L 121.746,262.098 L 121.785,263.129 L 121.828,264.164 L 121.863,265.234 L 121.98,266.344 L 122.094,267.379 L 122.25,268.414 L 122.402,269.367 L 122.637,270.289 L 122.902,271.094 L 123.211,271.781 L 123.594,272.359 L 124.016,272.777 L 124.477,273.043"
4820 id="path945" />
4821 </clipPath>
4822 <g
4823 id="g947">
4824 </g>
4825
4826 <clipPath
4827 id="clippath34"> <path
4828 d="M 121.746,259.453 L 121.746,259.453 L 122.129,259.605 L 122.512,259.719 L 122.855,259.836 L 123.238,259.91 L 123.621,260.027 L 123.969,260.141 L 124.352,260.254 L 124.73,260.41 L 124.73,260.41 L 124.809,261.176 L 124.852,262.016 L 124.891,262.934 L 124.926,263.891 L 124.93,264.852 L 124.93,265.883 L 124.969,266.918 L 125.008,267.914 L 125.086,268.91 L 125.199,269.902 L 125.395,270.824 L 125.586,271.668 L 125.891,272.469 L 126.234,273.195 L 126.656,273.809 L 127.195,274.305 L 127.195,274.305 L 126.086,274.039 L 125.164,273.656 L 124.398,273.121 L 123.746,272.508 L 123.211,271.781 L 122.789,270.941 L 122.48,270.023 L 122.25,269.062 L 122.094,267.992 L 121.98,266.883 L 121.941,265.73 L 121.902,264.508 L 121.863,263.281 L 121.824,262.02 L 121.824,260.754 L 121.746,259.453"
4829 id="path954" />
4830 </clipPath>
4831 <g
4832 id="g956">
4833 </g>
4834
4835 <clipPath
4836 id="clippath35"> <path
4837 d="M 131.906,275.797 L 131.906,275.797 L 130.375,275.723 L 129.07,275.418 L 128,274.918 L 127.117,274.27 L 126.43,273.426 L 125.926,272.469 L 125.508,271.398 L 125.238,270.211 L 125.086,268.984 L 124.973,267.723 L 124.93,266.418 L 124.93,265.156 L 124.926,263.852 L 124.926,262.629 L 124.852,261.48 L 124.773,260.406 L 124.773,260.406 L 125.195,260.523 L 125.578,260.637 L 125.918,260.754 L 126.305,260.863 L 126.688,260.98 L 127.031,261.094 L 127.414,261.172 L 127.836,261.285 L 127.836,261.285 L 127.797,261.398 L 127.723,261.668 L 127.645,262.09 L 127.531,262.629 L 127.457,263.316 L 127.379,264.082 L 127.34,264.965 L 127.379,265.961 L 127.492,266.992 L 127.688,268.141 L 128.035,269.328 L 128.457,270.551 L 129.07,271.816 L 129.836,273.156 L 130.758,274.457 L 131.906,275.797"
4838 id="path963" />
4839 </clipPath>
4840 <g
4841 id="g965">
4842 </g>
4843
4844 <clipPath
4845 id="clippath36"> <path
4846 d="M 137.117,277.437 L 137.117,277.437 L 136.082,277.629 L 135.047,277.555 L 134.051,277.211 L 133.055,276.676 L 132.098,275.949 L 131.18,275.07 L 130.336,273.996 L 129.566,272.809 L 128.918,271.547 L 128.344,270.168 L 127.879,268.711 L 127.57,267.223 L 127.379,265.727 L 127.34,264.195 L 127.488,262.742 L 127.797,261.285 L 131.434,262.43 L 131.434,262.43 L 131.398,263.047 L 131.359,263.773 L 131.398,264.574 L 131.48,265.492 L 131.594,266.453 L 131.785,267.488 L 131.977,268.559 L 132.285,269.668 L 132.633,270.777 L 133.051,271.852 L 133.512,272.922 L 134.09,273.957 L 134.699,274.953 L 135.43,275.871 L 136.234,276.711 L 137.117,277.437"
4847 id="path972" />
4848 </clipPath>
4849 <g
4850 id="g974">
4851 </g>
4852
4853 <clipPath
4854 id="clippath37"> <path
4855 d="M 131.359,262.355 L 131.359,262.355 L 131.934,262.547 L 132.66,262.738 L 133.469,263.004 L 134.309,263.234 L 135.113,263.504 L 135.918,263.73 L 136.605,263.922 L 137.145,264.035 L 137.145,264.035 L 137.223,264.918 L 137.262,265.871 L 137.34,266.832 L 137.375,267.824 L 137.492,268.82 L 137.57,269.859 L 137.684,270.852 L 137.84,271.848 L 138.031,272.805 L 138.262,273.727 L 138.57,274.641 L 138.914,275.484 L 139.336,276.289 L 139.836,276.977 L 140.371,277.629 L 141.02,278.203 L 141.02,278.203 L 140.254,278.316 L 139.453,278.281 L 138.648,278.09 L 137.844,277.746 L 137.039,277.246 L 136.234,276.633 L 135.465,275.871 L 134.699,274.914 L 134.012,273.879 L 133.398,272.652 L 132.82,271.312 L 132.328,269.785 L 131.938,268.137 L 131.633,266.379 L 131.438,264.422 L 131.359,262.355"
4856 id="path981" />
4857 </clipPath>
4858 <g
4859 id="g983">
4860 </g>
4861
4862 <clipPath
4863 id="clippath38"> <path
4864 d="M 146.043,278.961 L 146.043,278.961 L 144.316,279.043 L 142.863,278.891 L 141.598,278.473 L 140.598,277.816 L 139.758,277.016 L 139.066,276.059 L 138.57,274.984 L 138.184,273.762 L 137.879,272.539 L 137.688,271.195 L 137.57,269.895 L 137.453,268.594 L 137.414,267.289 L 137.336,266.102 L 137.262,264.992 L 137.145,263.996 L 137.145,263.996 L 137.68,264.113 L 138.145,264.227 L 138.563,264.379 L 138.98,264.492 L 139.441,264.609 L 139.863,264.723 L 140.359,264.875 L 140.938,265.027 L 140.938,265.027 L 141.051,266.066 L 141.164,267.055 L 141.281,268.051 L 141.438,269.008 L 141.555,269.969 L 141.707,270.887 L 141.898,271.766 L 142.09,272.648 L 142.359,273.488 L 142.668,274.332 L 143.047,275.137 L 143.473,275.941 L 144.008,276.742 L 144.582,277.508 L 145.273,278.238 L 146.043,278.961"
4865 id="path990" />
4866 </clipPath>
4867 <g
4868 id="g992">
4869 </g>
4870
4871 <clipPath
4872 id="clippath39"> <path
4873 d="M 150.672,279.801 L 150.672,279.801 L 149.141,279.879 L 147.801,279.687 L 146.613,279.27 L 145.578,278.617 L 144.699,277.816 L 143.934,276.859 L 143.316,275.75 L 142.781,274.562 L 142.324,273.336 L 141.977,272.035 L 141.707,270.77 L 141.512,269.473 L 141.324,268.246 L 141.207,267.098 L 141.09,266.023 L 141.012,265.105 L 141.012,265.105 L 141.551,265.223 L 142.008,265.332 L 142.43,265.449 L 142.852,265.527 L 143.23,265.641 L 143.656,265.715 L 144.152,265.863 L 144.727,265.98 L 144.727,265.98 L 144.918,267.477 L 145.148,268.895 L 145.418,270.195 L 145.688,271.422 L 145.996,272.531 L 146.344,273.602 L 146.723,274.52 L 147.109,275.402 L 147.492,276.207 L 147.914,276.93 L 148.375,277.547 L 148.797,278.117 L 149.258,278.656 L 149.719,279.074 L 150.215,279.457 L 150.672,279.801"
4874 id="path999" />
4875 </clipPath>
4876 <g
4877 id="g1001">
4878 </g>
4879
4880 <clipPath
4881 id="clippath40"> <path
4882 d="M 155.043,280.371 L 155.043,280.371 L 153.586,280.453 L 152.285,280.262 L 151.133,279.84 L 150.059,279.227 L 149.18,278.426 L 148.375,277.508 L 147.684,276.434 L 147.07,275.289 L 146.535,274.098 L 146.113,272.836 L 145.766,271.574 L 145.461,270.309 L 145.188,269.121 L 144.996,267.973 L 144.844,266.941 L 144.688,266.02 L 144.688,266.02 L 145.148,266.098 L 145.605,266.211 L 146.07,266.328 L 146.527,266.441 L 146.988,266.555 L 147.449,266.668 L 147.91,266.785 L 148.406,266.902 L 148.406,266.902 L 148.445,267.625 L 148.559,268.43 L 148.715,269.273 L 148.98,270.191 L 149.25,271.184 L 149.633,272.145 L 150.02,273.176 L 150.441,274.176 L 150.938,275.172 L 151.477,276.129 L 152.012,277.008 L 152.586,277.848 L 153.16,278.652 L 153.777,279.344 L 154.426,279.914 L 155.043,280.371"
4883 id="path1008" />
4884 </clipPath>
4885 <g
4886 id="g1010">
4887 </g>
4888
4889 <clipPath
4890 id="clippath41"> <path
4891 d="M 159.824,281.25 L 159.824,281.25 L 158.336,281.445 L 156.957,281.254 L 155.691,280.758 L 154.539,279.988 L 153.469,279.035 L 152.547,277.887 L 151.707,276.586 L 150.938,275.246 L 150.324,273.832 L 149.785,272.449 L 149.328,271.148 L 148.945,269.887 L 148.715,268.812 L 148.52,267.93 L 148.406,267.281 L 148.406,266.902 L 148.406,266.902 L 148.977,267.012 L 149.516,267.129 L 150.051,267.281 L 150.586,267.395 L 151.121,267.547 L 151.621,267.66 L 152.121,267.777 L 152.617,267.855 L 152.617,267.855 L 152.809,268.504 L 153,269.27 L 153.23,270.148 L 153.504,271.145 L 153.809,272.18 L 154.152,273.25 L 154.535,274.32 L 154.961,275.434 L 155.422,276.504 L 155.918,277.5 L 156.457,278.461 L 157.031,279.301 L 157.684,280.027 L 158.332,280.602 L 159.059,281.023 L 159.824,281.25"
4892 id="path1017" />
4893 </clipPath>
4894 <g
4895 id="g1019">
4896 </g>
4897
4898 <clipPath
4899 id="clippath42"> <path
4900 d="M 163.309,281.055 L 163.309,281.055 L 161.934,281.363 L 160.711,281.363 L 159.598,281.098 L 158.563,280.562 L 157.68,279.836 L 156.879,278.953 L 156.152,277.887 L 155.496,276.734 L 154.961,275.508 L 154.461,274.246 L 154.078,272.984 L 153.691,271.758 L 153.387,270.609 L 153.156,269.574 L 152.922,268.656 L 152.734,267.93 L 152.734,267.93 L 153.152,268.008 L 153.578,268.043 L 153.957,268.117 L 154.301,268.195 L 154.688,268.273 L 155.031,268.352 L 155.336,268.426 L 155.68,268.465 L 155.68,268.465 L 155.801,269.344 L 155.988,270.262 L 156.223,271.18 L 156.449,272.062 L 156.758,272.98 L 157.105,273.863 L 157.484,274.707 L 157.949,275.547 L 158.406,276.391 L 158.945,277.191 L 159.559,277.957 L 160.168,278.645 L 160.898,279.336 L 161.625,279.949 L 162.43,280.523 L 163.309,281.055"
4901 id="path1026" />
4902 </clipPath>
4903 <g
4904 id="g1028">
4905 </g>
4906
4907 <clipPath
4908 id="clippath43"> <path
4909 d="M 167.676,281.742 L 167.676,281.742 L 166.531,281.742 L 165.418,281.629 L 164.348,281.359 L 163.309,280.977 L 162.355,280.445 L 161.395,279.793 L 160.555,279.031 L 159.715,278.191 L 158.98,277.23 L 158.293,276.199 L 157.676,275.047 L 157.105,273.863 L 156.645,272.598 L 156.258,271.258 L 155.91,269.879 L 155.68,268.465 L 155.68,268.465 L 156.18,268.578 L 156.68,268.656 L 157.215,268.77 L 157.711,268.844 L 158.207,268.922 L 158.668,269.035 L 159.168,269.113 L 159.625,269.227 L 159.625,269.227 L 159.855,270.031 L 160.047,270.871 L 160.281,271.676 L 160.473,272.48 L 160.707,273.285 L 160.934,274.09 L 161.238,274.855 L 161.586,275.656 L 162.004,276.426 L 162.469,277.227 L 163.043,277.992 L 163.73,278.758 L 164.5,279.488 L 165.418,280.25 L 166.488,281.016 L 167.676,281.742"
4910 id="path1035" />
4911 </clipPath>
4912 <g
4913 id="g1037">
4914 </g>
4915
4916 <clipPath
4917 id="clippath44"> <path
4918 d="M 173.883,282.465 L 173.883,282.465 L 172.387,282.582 L 170.934,282.508 L 169.594,282.277 L 168.328,281.895 L 167.141,281.398 L 166.031,280.746 L 164.996,279.98 L 164.078,279.062 L 163.234,278.109 L 162.469,277.035 L 161.773,275.887 L 161.203,274.664 L 160.707,273.363 L 160.281,272.023 L 159.973,270.645 L 159.738,269.266 L 159.738,269.266 L 160.164,269.34 L 160.621,269.457 L 161.078,269.531 L 161.539,269.605 L 162,269.684 L 162.5,269.801 L 162.996,269.875 L 163.496,269.988 L 163.496,269.988 L 163.648,270.945 L 163.879,271.902 L 164.188,272.863 L 164.57,273.781 L 165.027,274.699 L 165.531,275.613 L 166.105,276.496 L 166.754,277.34 L 167.445,278.184 L 168.211,278.945 L 169.02,279.672 L 169.898,280.359 L 170.82,280.973 L 171.813,281.547 L 172.809,282.043 L 173.883,282.465"
4919 id="path1044" />
4920 </clipPath>
4921 <g
4922 id="g1046">
4923 </g>
4924
4925 <clipPath
4926 id="clippath45"> <path
4927 d="M 177.289,282.844 L 177.289,282.844 L 176.027,282.77 L 174.801,282.578 L 173.574,282.273 L 172.387,281.816 L 171.277,281.281 L 170.164,280.633 L 169.133,279.902 L 168.172,279.059 L 167.293,278.145 L 166.449,277.152 L 165.719,276.074 L 165.07,274.965 L 164.535,273.781 L 164.074,272.555 L 163.727,271.289 L 163.496,269.988 L 163.496,269.988 L 163.914,270.066 L 164.301,270.102 L 164.645,270.141 L 164.988,270.18 L 165.336,270.219 L 165.715,270.293 L 166.172,270.367 L 166.715,270.445 L 166.715,270.445 L 167.25,271.555 L 167.711,272.59 L 168.168,273.586 L 168.59,274.504 L 169.051,275.387 L 169.473,276.191 L 169.934,276.953 L 170.434,277.68 L 170.973,278.41 L 171.586,279.059 L 172.273,279.746 L 173.035,280.359 L 173.922,281.012 L 174.918,281.621 L 176.027,282.234 L 177.289,282.844"
4928 id="path1053" />
4929 </clipPath>
4930 <g
4931 id="g1055">
4932 </g>
4933
4934 <clipPath
4935 id="clippath46"> <path
4936 d="M 183.418,283.223 L 183.418,283.223 L 181.539,283.301 L 179.777,283.227 L 178.207,282.961 L 176.754,282.539 L 175.414,281.969 L 174.227,281.277 L 173.113,280.477 L 172.121,279.555 L 171.199,278.562 L 170.355,277.488 L 169.625,276.379 L 168.938,275.191 L 168.285,274.008 L 167.711,272.82 L 167.172,271.594 L 166.676,270.406 L 166.676,270.406 L 167.02,270.445 L 167.477,270.523 L 168.051,270.598 L 168.629,270.676 L 169.238,270.789 L 169.777,270.863 L 170.234,270.941 L 170.582,270.977 L 170.582,270.977 L 171.039,272.051 L 171.539,273.125 L 172.117,274.117 L 172.691,275.074 L 173.344,275.957 L 174.031,276.836 L 174.762,277.637 L 175.563,278.445 L 176.367,279.172 L 177.254,279.859 L 178.168,280.547 L 179.129,281.16 L 180.125,281.73 L 181.195,282.27 L 182.27,282.766 L 183.418,283.223"
4937 id="path1062" />
4938 </clipPath>
4939 <g
4940 id="g1064">
4941 </g>
4942
4943 <clipPath
4944 id="clippath47"> <path
4945 d="M 187.055,283.758 L 187.055,283.758 L 186.023,283.719 L 184.953,283.566 L 183.801,283.297 L 182.613,282.879 L 181.426,282.383 L 180.238,281.77 L 179.051,281.043 L 177.863,280.242 L 176.715,279.324 L 175.602,278.367 L 174.566,277.297 L 173.57,276.148 L 172.691,274.961 L 171.887,273.699 L 171.195,272.359 L 170.617,270.977 L 170.617,270.977 L 171.195,271.055 L 171.77,271.133 L 172.379,271.168 L 172.992,271.246 L 173.527,271.281 L 173.988,271.359 L 174.371,271.434 L 174.563,271.551 L 174.563,271.551 L 175.254,272.086 L 175.945,272.66 L 176.672,273.352 L 177.402,274.039 L 178.129,274.801 L 178.895,275.605 L 179.66,276.449 L 180.426,277.293 L 181.195,278.172 L 181.996,279.016 L 182.801,279.855 L 183.648,280.695 L 184.488,281.539 L 185.332,282.301 L 186.176,283.07 L 187.055,283.758"
4946 id="path1071" />
4947 </clipPath>
4948 <g
4949 id="g1073">
4950 </g>
4951
4952 <clipPath
4953 id="clippath48"> <path
4954 d="M 192.648,283.754 L 192.648,283.754 L 191.652,284.406 L 190.617,284.672 L 189.586,284.676 L 188.512,284.406 L 187.441,283.871 L 186.367,283.145 L 185.254,282.227 L 184.145,281.195 L 183.031,280.008 L 181.883,278.781 L 180.695,277.48 L 179.508,276.18 L 178.32,274.918 L 177.094,273.656 L 175.863,272.508 L 174.641,271.477 L 174.641,271.477 L 175.066,271.477 L 175.559,271.551 L 176.137,271.586 L 176.707,271.703 L 177.324,271.781 L 177.934,271.895 L 178.508,272.008 L 179.008,272.082 L 179.008,272.082 L 179.965,272.965 L 180.918,273.844 L 181.844,274.762 L 182.762,275.68 L 183.645,276.598 L 184.527,277.52 L 185.406,278.437 L 186.25,279.277 L 187.094,280.117 L 187.934,280.887 L 188.738,281.574 L 189.543,282.184 L 190.348,282.723 L 191.113,283.18 L 191.883,283.523 L 192.648,283.754"
4955 id="path1080" />
4956 </clipPath>
4957 <g
4958 id="g1082">
4959 </g>
4960
4961 <clipPath
4962 id="clippath49"> <path
4963 d="M 196.977,284.246 L 196.977,284.246 L 195.406,284.25 L 193.91,284.098 L 192.57,283.715 L 191.266,283.219 L 190.082,282.605 L 188.973,281.84 L 187.895,281 L 186.902,280.082 L 185.902,279.086 L 184.949,278.051 L 184.027,277.02 L 183.066,275.949 L 182.109,274.879 L 181.113,273.844 L 180.078,272.887 L 179.008,271.969 L 179.008,271.969 L 179.391,272.008 L 179.848,272.047 L 180.422,272.117 L 180.996,272.16 L 181.574,272.234 L 182.031,272.273 L 182.414,272.312 L 182.645,272.312 L 182.645,272.312 L 183.293,272.465 L 183.945,272.77 L 184.598,273.191 L 185.289,273.766 L 185.977,274.453 L 186.707,275.258 L 187.473,276.102 L 188.238,277.016 L 189.121,277.973 L 190.004,278.926 L 191,279.926 L 192.031,280.883 L 193.145,281.84 L 194.332,282.719 L 195.598,283.52 L 196.977,284.246"
4964 id="path1089" />
4965 </clipPath>
4966 <g
4967 id="g1091">
4968 </g>
4969
4970 <clipPath
4971 id="clippath50"> <path
4972 d="M 111.746,252.719 L 111.746,252.719 L 111.746,252.566 L 111.707,252.145 L 111.668,251.496 L 111.629,250.691 L 111.59,249.734 L 111.629,248.699 L 111.664,247.629 L 111.777,246.516 L 112.008,245.484 L 112.316,244.488 L 112.695,243.605 L 113.27,242.879 L 113.922,242.34 L 114.766,242.035 L 115.797,241.992 L 116.984,242.301 L 116.984,242.301 L 116.598,242.684 L 116.258,243.145 L 115.91,243.719 L 115.605,244.332 L 115.301,245.02 L 115.031,245.711 L 114.77,246.477 L 114.496,247.281 L 114.27,248.047 L 114.082,248.852 L 113.887,249.656 L 113.699,250.422 L 113.543,251.187 L 113.391,251.914 L 113.277,252.566 L 113.16,253.176 L 113.16,253.176 L 112.973,253.141 L 112.816,253.105 L 112.629,253.027 L 112.473,252.988 L 112.32,252.91 L 112.164,252.836 L 111.973,252.797 L 111.746,252.719"
4973 id="path1098" />
4974 </clipPath>
4975 <g
4976 id="g1100">
4977 </g>
4978
4979 <clipPath
4980 id="clippath51"> <path
4981 d="M 113.16,253.293 L 113.16,253.293 L 113.66,250.805 L 114.195,248.695 L 114.73,246.895 L 115.262,245.402 L 115.801,244.215 L 116.336,243.258 L 116.867,242.566 L 117.367,242.07 L 117.902,241.723 L 118.359,241.57 L 118.863,241.531 L 119.285,241.609 L 119.703,241.762 L 120.086,241.953 L 120.43,242.184 L 120.738,242.414 L 120.738,242.414 L 120.203,242.91 L 119.668,243.484 L 119.207,244.023 L 118.711,244.633 L 118.289,245.246 L 117.867,245.898 L 117.484,246.551 L 117.141,247.238 L 116.797,247.969 L 116.492,248.734 L 116.184,249.5 L 115.918,250.305 L 115.691,251.148 L 115.461,251.988 L 115.234,252.871 L 115.039,253.789 L 115.039,253.789 L 114.582,253.711 L 114.238,253.676 L 114.004,253.598 L 113.852,253.523 L 113.699,253.484 L 113.543,253.41 L 113.391,253.367 L 113.16,253.293"
4982 id="path1107" />
4983 </clipPath>
4984 <g
4985 id="g1109">
4986 </g>
4987
4988 <clipPath
4989 id="clippath52"> <path
4990 d="M 115.078,254.02 L 115.078,254.02 L 115.191,253.102 L 115.387,252.066 L 115.691,250.918 L 116.07,249.691 L 116.57,248.426 L 117.141,247.199 L 117.754,246.016 L 118.441,244.902 L 119.168,243.906 L 119.934,243.062 L 120.738,242.414 L 121.578,241.988 L 122.461,241.836 L 123.344,241.988 L 124.184,242.488 L 125.066,243.363 L 125.066,243.363 L 124.379,243.863 L 123.727,244.441 L 123.113,245.016 L 122.539,245.59 L 121.965,246.203 L 121.469,246.891 L 121.012,247.582 L 120.59,248.27 L 120.168,249.039 L 119.824,249.805 L 119.48,250.609 L 119.215,251.453 L 118.945,252.332 L 118.754,253.25 L 118.563,254.211 L 118.41,255.203 L 118.41,255.203 L 117.992,255.09 L 117.531,254.937 L 117.031,254.785 L 116.535,254.594 L 116.074,254.402 L 115.652,254.246 L 115.309,254.098 L 115.078,254.02"
4991 id="path1116" />
4992 </clipPath>
4993 <g
4994 id="g1118">
4995 </g>
4996
4997 <clipPath
4998 id="clippath53"> <path
4999 d="M 130.008,243.977 L 130.008,243.977 L 129.086,243.363 L 128.168,243.02 L 127.211,242.906 L 126.289,243.02 L 125.332,243.328 L 124.414,243.828 L 123.531,244.516 L 122.691,245.32 L 121.926,246.238 L 121.16,247.312 L 120.512,248.5 L 119.941,249.727 L 119.445,251.031 L 119.059,252.406 L 118.754,253.785 L 118.602,255.203 L 118.602,255.203 L 119.063,255.316 L 119.523,255.434 L 119.98,255.586 L 120.441,255.738 L 120.938,255.93 L 121.438,256.043 L 121.895,256.199 L 122.395,256.273 L 122.395,256.273 L 122.738,255.395 L 123.043,254.434 L 123.426,253.477 L 123.773,252.52 L 124.152,251.523 L 124.57,250.527 L 124.992,249.57 L 125.414,248.652 L 125.871,247.77 L 126.371,246.965 L 126.902,246.199 L 127.441,245.547 L 128.051,244.973 L 128.668,244.512 L 129.316,244.168 L 130.008,243.977"
5000 id="path1125" />
5001 </clipPath>
5002 <g
5003 id="g1127">
5004 </g>
5005
5006 <clipPath
5007 id="clippath54"> <path
5008 d="M 134.219,245.117 L 134.219,245.117 L 132.766,244.473 L 131.461,244.129 L 130.313,244.09 L 129.277,244.281 L 128.359,244.703 L 127.555,245.355 L 126.832,246.16 L 126.18,247.117 L 125.605,248.191 L 125.109,249.379 L 124.609,250.605 L 124.152,251.832 L 123.695,253.094 L 123.273,254.359 L 122.816,255.547 L 122.32,256.656 L 122.32,256.656 L 122.738,256.695 L 123.121,256.809 L 123.508,256.922 L 123.891,257.074 L 124.273,257.191 L 124.656,257.344 L 125.078,257.457 L 125.496,257.531 L 125.496,257.531 L 125.535,257.383 L 125.574,256.996 L 125.688,256.5 L 125.801,255.809 L 125.992,255.008 L 126.262,254.09 L 126.566,253.094 L 126.988,252.059 L 127.48,251.023 L 128.055,249.988 L 128.785,248.953 L 129.586,247.996 L 130.547,247.078 L 131.617,246.309 L 132.844,245.617 L 134.219,245.117"
5009 id="path1134" />
5010 </clipPath>
5011 <g
5012 id="g1136">
5013 </g>
5014
5015 <clipPath
5016 id="clippath55"> <path
5017 d="M 139.656,245.922 L 139.656,245.922 L 138.773,245.27 L 137.777,244.852 L 136.707,244.695 L 135.637,244.738 L 134.484,244.965 L 133.34,245.426 L 132.23,246.043 L 131.117,246.809 L 130.047,247.766 L 129.051,248.84 L 128.172,250.066 L 127.367,251.406 L 126.68,252.824 L 126.145,254.355 L 125.801,255.926 L 125.613,257.613 L 129.559,258.641 L 129.559,258.641 L 129.785,257.992 L 130.094,257.223 L 130.398,256.422 L 130.781,255.539 L 131.238,254.617 L 131.699,253.664 L 132.234,252.703 L 132.809,251.746 L 133.461,250.789 L 134.184,249.867 L 134.949,249.027 L 135.754,248.219 L 136.633,247.492 L 137.59,246.879 L 138.586,246.344 L 139.656,245.922"
5018 id="path1143" />
5019 </clipPath>
5020 <g
5021 id="g1145">
5022 </g>
5023
5024 <clipPath
5025 id="clippath56"> <path
5026 d="M 129.52,258.758 L 129.52,258.758 L 130.094,258.871 L 130.82,259.023 L 131.547,259.254 L 132.355,259.484 L 133.117,259.75 L 133.848,259.941 L 134.496,260.133 L 135.035,260.285 L 135.035,260.285 L 135.414,259.328 L 135.836,258.332 L 136.258,257.336 L 136.68,256.301 L 137.098,255.266 L 137.559,254.27 L 138.055,253.273 L 138.551,252.277 L 139.086,251.359 L 139.66,250.516 L 140.273,249.711 L 140.926,248.984 L 141.574,248.332 L 142.301,247.797 L 143.105,247.371 L 143.91,247.066 L 143.91,247.066 L 143.219,246.609 L 142.453,246.266 L 141.648,246.035 L 140.77,245.957 L 139.852,246.035 L 138.891,246.266 L 137.934,246.648 L 136.941,247.227 L 135.945,247.953 L 134.949,248.871 L 133.957,250.023 L 133,251.324 L 132.082,252.859 L 131.164,254.582 L 130.324,256.574 L 129.52,258.758"
5027 id="path1152" />
5028 </clipPath>
5029 <g
5030 id="g1154">
5031 </g>
5032
5033 <clipPath
5034 id="clippath57"> <path
5035 d="M 148.734,248.711 L 148.734,248.711 L 147.164,247.867 L 145.746,247.371 L 144.445,247.258 L 143.258,247.449 L 142.227,247.906 L 141.23,248.598 L 140.391,249.48 L 139.586,250.555 L 138.895,251.742 L 138.246,253.008 L 137.672,254.348 L 137.141,255.687 L 136.602,256.988 L 136.105,258.254 L 135.648,259.402 L 135.191,260.437 L 135.191,260.437 L 135.723,260.516 L 136.184,260.625 L 136.605,260.707 L 137.027,260.82 L 137.449,260.934 L 137.906,261.051 L 138.406,261.125 L 138.977,261.238 L 138.977,261.238 L 139.477,260.203 L 139.934,259.168 L 140.395,258.176 L 140.852,257.215 L 141.313,256.258 L 141.809,255.379 L 142.309,254.496 L 142.801,253.691 L 143.379,252.887 L 143.988,252.16 L 144.602,251.473 L 145.289,250.82 L 146.059,250.203 L 146.859,249.668 L 147.777,249.168 L 148.734,248.711"
5036 id="path1161" />
5037 </clipPath>
5038 <g
5039 id="g1163">
5040 </g>
5041
5042 <clipPath
5043 id="clippath58"> <path
5044 d="M 153.484,250.086 L 153.484,250.086 L 152.066,249.32 L 150.727,248.937 L 149.465,248.863 L 148.238,249.051 L 147.129,249.516 L 146.059,250.203 L 145.063,251.047 L 144.141,252.047 L 143.301,253.195 L 142.496,254.383 L 141.773,255.609 L 141.121,256.871 L 140.508,258.137 L 139.973,259.285 L 139.516,260.398 L 139.094,261.355 L 139.094,261.355 L 139.668,261.469 L 140.168,261.582 L 140.629,261.695 L 141.047,261.773 L 141.508,261.887 L 141.969,262.004 L 142.504,262.078 L 143.117,262.195 L 143.117,262.195 L 143.805,260.66 L 144.492,259.242 L 145.141,257.941 L 145.793,256.754 L 146.406,255.68 L 146.98,254.723 L 147.59,253.883 L 148.164,253.074 L 148.777,252.422 L 149.391,251.848 L 150,251.352 L 150.652,250.93 L 151.305,250.621 L 151.988,250.352 L 152.723,250.199 L 153.484,250.086"
5045 id="path1170" />
5046 </clipPath>
5047 <g
5048 id="g1172">
5049 </g>
5050
5051 <clipPath
5052 id="clippath59"> <path
5053 d="M 157.086,251.113 L 157.086,251.113 L 155.707,250.465 L 154.402,250.156 L 153.18,250.16 L 151.988,250.391 L 150.918,250.852 L 149.887,251.504 L 148.93,252.348 L 148.012,253.305 L 147.207,254.379 L 146.441,255.527 L 145.719,256.715 L 145.102,257.902 L 144.492,259.09 L 143.992,260.238 L 143.539,261.273 L 143.117,262.234 L 143.117,262.234 L 143.578,262.348 L 143.996,262.461 L 144.418,262.535 L 144.801,262.652 L 145.223,262.73 L 145.645,262.805 L 146.105,262.918 L 146.602,262.992 L 146.602,262.992 L 146.832,262.23 L 147.176,261.387 L 147.594,260.543 L 148.094,259.625 L 148.629,258.707 L 149.277,257.785 L 149.926,256.863 L 150.656,255.945 L 151.418,255.105 L 152.223,254.258 L 153.027,253.492 L 153.832,252.84 L 154.672,252.227 L 155.477,251.73 L 156.281,251.348 L 157.086,251.113"
5054 id="path1179" />
5055 </clipPath>
5056 <g
5057 id="g1181">
5058 </g>
5059
5060 <clipPath
5061 id="clippath60"> <path
5062 d="M 161.414,252.223 L 161.414,252.223 L 159.996,251.461 L 158.617,251.113 L 157.238,251.152 L 155.938,251.539 L 154.672,252.152 L 153.449,253.031 L 152.297,254.066 L 151.23,255.215 L 150.234,256.445 L 149.352,257.707 L 148.555,258.934 L 147.902,260.121 L 147.367,261.156 L 146.945,262.035 L 146.715,262.687 L 146.602,263.07 L 146.602,263.07 L 147.176,263.184 L 147.672,263.301 L 148.172,263.414 L 148.672,263.531 L 149.168,263.645 L 149.629,263.719 L 150.125,263.797 L 150.621,263.871 L 150.621,263.871 L 150.969,263.223 L 151.352,262.418 L 151.809,261.535 L 152.348,260.613 L 152.918,259.582 L 153.531,258.547 L 154.176,257.551 L 154.867,256.516 L 155.598,255.555 L 156.363,254.676 L 157.164,253.871 L 157.969,253.219 L 158.809,252.684 L 159.656,252.34 L 160.531,252.184 L 161.414,252.223"
5063 id="path1188" />
5064 </clipPath>
5065 <g
5066 id="g1190">
5067 </g>
5068
5069 <clipPath
5070 id="clippath61"> <path
5071 d="M 164.555,253.445 L 164.555,253.445 L 163.289,252.605 L 162.066,252.145 L 160.914,252.031 L 159.766,252.223 L 158.695,252.684 L 157.66,253.375 L 156.703,254.219 L 155.789,255.254 L 154.945,256.363 L 154.141,257.551 L 153.414,258.777 L 152.727,260.004 L 152.152,261.152 L 151.617,262.227 L 151.199,263.145 L 150.816,263.91 L 150.816,263.91 L 151.273,263.988 L 151.656,264.102 L 152,264.215 L 152.309,264.328 L 152.656,264.445 L 152.961,264.559 L 153.266,264.637 L 153.609,264.711 L 153.609,264.711 L 153.953,263.754 L 154.375,262.797 L 154.797,261.875 L 155.258,260.957 L 155.793,260.039 L 156.324,259.195 L 156.938,258.352 L 157.59,257.547 L 158.277,256.781 L 159.039,256.094 L 159.805,255.477 L 160.648,254.902 L 161.57,254.402 L 162.488,253.984 L 163.484,253.676 L 164.555,253.445"
5072 id="path1197" />
5073 </clipPath>
5074 <g
5075 id="g1199">
5076 </g>
5077
5078 <clipPath
5079 id="clippath62"> <path
5080 d="M 168.539,254.363 L 168.539,254.363 L 167.426,253.941 L 166.316,253.676 L 165.168,253.598 L 164.094,253.676 L 162.984,253.906 L 161.953,254.25 L 160.918,254.789 L 159.883,255.441 L 158.926,256.207 L 158.012,257.129 L 157.129,258.121 L 156.285,259.234 L 155.484,260.461 L 154.797,261.801 L 154.105,263.219 L 153.535,264.711 L 153.535,264.711 L 154.035,264.789 L 154.531,264.863 L 155.031,264.98 L 155.527,265.133 L 156.023,265.246 L 156.523,265.402 L 157.02,265.516 L 157.52,265.59 L 157.52,265.59 L 157.859,264.746 L 158.203,263.867 L 158.551,263.023 L 158.895,262.141 L 159.316,261.301 L 159.734,260.492 L 160.234,259.652 L 160.77,258.887 L 161.418,258.156 L 162.105,257.43 L 162.91,256.777 L 163.789,256.164 L 164.785,255.629 L 165.898,255.129 L 167.16,254.707 L 168.539,254.363"
5081 id="path1206" />
5082 </clipPath>
5083 <g
5084 id="g1208">
5085 </g>
5086
5087 <clipPath
5088 id="clippath63"> <path
5089 d="M 174.207,255.699 L 174.207,255.699 L 172.754,255.09 L 171.293,254.703 L 169.957,254.516 L 168.613,254.477 L 167.352,254.668 L 166.125,255.016 L 164.977,255.512 L 163.867,256.164 L 162.875,256.969 L 161.879,257.887 L 160.996,258.926 L 160.156,260.074 L 159.395,261.34 L 158.738,262.676 L 158.129,264.059 L 157.594,265.551 L 157.594,265.551 L 158.051,265.629 L 158.512,265.703 L 159.047,265.781 L 159.551,265.855 L 160.082,265.934 L 160.621,266.008 L 161.156,266.086 L 161.691,266.16 L 161.691,266.16 L 161.961,265.09 L 162.344,264.094 L 162.762,263.098 L 163.297,262.141 L 163.871,261.258 L 164.523,260.414 L 165.211,259.609 L 165.977,258.883 L 166.82,258.191 L 167.734,257.617 L 168.695,257.078 L 169.688,256.617 L 170.762,256.273 L 171.871,255.969 L 173.02,255.777 L 174.207,255.699"
5090 id="path1215" />
5091 </clipPath>
5092 <g
5093 id="g1217">
5094 </g>
5095
5096 <clipPath
5097 id="clippath64"> <path
5098 d="M 177,256 L 177,256 L 175.699,255.809 L 174.438,255.699 L 173.176,255.738 L 171.949,255.891 L 170.723,256.199 L 169.574,256.582 L 168.465,257.082 L 167.391,257.695 L 166.395,258.422 L 165.441,259.266 L 164.598,260.184 L 163.793,261.219 L 163.105,262.367 L 162.496,263.594 L 161.996,264.934 L 161.617,266.355 L 161.617,266.355 L 162.078,266.391 L 162.461,266.426 L 162.84,266.504 L 163.223,266.582 L 163.609,266.656 L 163.988,266.734 L 164.449,266.848 L 164.945,266.926 L 164.945,266.926 L 165.598,265.812 L 166.211,264.738 L 166.742,263.781 L 167.242,262.828 L 167.777,261.984 L 168.277,261.18 L 168.773,260.414 L 169.348,259.723 L 169.957,259.07 L 170.648,258.496 L 171.414,257.961 L 172.258,257.461 L 173.25,257.039 L 174.324,256.656 L 175.586,256.309 L 177,256"
5099 id="path1224" />
5100 </clipPath>
5101 <g
5102 id="g1226">
5103 </g>
5104
5105 <clipPath
5106 id="clippath65"> <path
5107 d="M 183.09,256.801 L 183.09,256.801 L 181.176,256.344 L 179.375,256.078 L 177.73,256 L 176.199,256.156 L 174.781,256.461 L 173.48,256.922 L 172.293,257.496 L 171.219,258.266 L 170.188,259.109 L 169.273,260.066 L 168.426,261.102 L 167.625,262.215 L 166.898,263.363 L 166.211,264.59 L 165.598,265.812 L 164.984,267.078 L 164.984,267.078 L 165.293,267.113 L 165.715,267.152 L 166.172,267.27 L 166.672,267.344 L 167.168,267.461 L 167.633,267.574 L 168.051,267.609 L 168.395,267.648 L 168.395,267.648 L 168.969,266.5 L 169.582,265.465 L 170.27,264.469 L 170.996,263.551 L 171.762,262.668 L 172.566,261.863 L 173.445,261.137 L 174.363,260.445 L 175.32,259.832 L 176.316,259.258 L 177.348,258.758 L 178.418,258.262 L 179.531,257.84 L 180.68,257.457 L 181.867,257.109 L 183.09,256.801"
5108 id="path1233" />
5109 </clipPath>
5110 <g
5111 id="g1235">
5112 </g>
5113
5114 <clipPath
5115 id="clippath66"> <path
5116 d="M 186.727,257.18 L 186.727,257.18 L 185.66,256.988 L 184.547,256.953 L 183.32,256.992 L 182.059,257.184 L 180.793,257.453 L 179.492,257.84 L 178.191,258.336 L 176.887,258.949 L 175.59,259.676 L 174.363,260.523 L 173.18,261.441 L 172.031,262.477 L 170.996,263.629 L 170.039,264.852 L 169.199,266.195 L 168.473,267.648 L 168.473,267.648 L 169.008,267.687 L 169.543,267.766 L 170.082,267.879 L 170.617,267.992 L 171.074,268.105 L 171.461,268.223 L 171.766,268.258 L 171.957,268.223 L 171.957,268.223 L 172.723,267.762 L 173.488,267.227 L 174.289,266.613 L 175.133,265.922 L 175.977,265.195 L 176.855,264.426 L 177.773,263.66 L 178.691,262.855 L 179.648,262.051 L 180.605,261.242 L 181.602,260.441 L 182.598,259.711 L 183.629,258.984 L 184.625,258.332 L 185.695,257.719 L 186.727,257.18"
5117 id="path1242" />
5118 </clipPath>
5119 <g
5120 id="g1244">
5121 </g>
5122
5123 <clipPath
5124 id="clippath67"> <path
5125 d="M 191.824,258.633 L 191.824,258.633 L 190.867,257.676 L 189.832,257.105 L 188.758,256.871 L 187.648,256.949 L 186.5,257.258 L 185.277,257.832 L 184.051,258.602 L 182.75,259.48 L 181.449,260.555 L 180.109,261.664 L 178.77,262.855 L 177.43,264.043 L 176.051,265.23 L 174.676,266.344 L 173.297,267.379 L 171.918,268.301 L 171.918,268.301 L 172.34,268.371 L 172.84,268.41 L 173.449,268.453 L 174.063,268.488 L 174.711,268.527 L 175.363,268.527 L 175.941,268.527 L 176.438,268.523 L 176.438,268.523 L 177.473,267.758 L 178.465,266.953 L 179.461,266.148 L 180.379,265.305 L 181.297,264.5 L 182.18,263.695 L 183.059,262.891 L 183.938,262.121 L 184.855,261.434 L 185.734,260.781 L 186.656,260.207 L 187.609,259.707 L 188.605,259.285 L 189.641,258.941 L 190.715,258.746 L 191.824,258.633"
5126 id="path1251" />
5127 </clipPath>
5128 <g
5129 id="g1253">
5130 </g>
5131
5132 <clipPath
5133 id="clippath68"> <path
5134 d="M 195.656,259.09 L 195.656,259.09 L 194.043,258.707 L 192.551,258.555 L 191.137,258.633 L 189.793,258.902 L 188.531,259.363 L 187.305,259.937 L 186.121,260.668 L 184.973,261.473 L 183.859,262.391 L 182.754,263.312 L 181.68,264.309 L 180.57,265.305 L 179.461,266.262 L 178.352,267.223 L 177.164,268.062 L 175.98,268.871 L 175.98,268.871 L 176.359,268.906 L 176.855,268.984 L 177.434,269.059 L 178.012,269.098 L 178.582,269.176 L 179.082,269.246 L 179.461,269.289 L 179.695,269.328 L 179.695,269.328 L 180.383,269.324 L 181.07,269.137 L 181.797,268.789 L 182.563,268.293 L 183.328,267.676 L 184.172,266.949 L 185.051,266.145 L 185.969,265.301 L 186.93,264.418 L 187.961,263.5 L 189.07,262.617 L 190.219,261.773 L 191.48,260.969 L 192.781,260.238 L 194.16,259.59 L 195.656,259.09"
5135 id="path1260" />
5136 </clipPath>
5137 <g
5138 id="g1262">
5139 </g>
5140
5141 <clipPath
5142 id="clippath69"> <path
5143 d="M 200.48,260.426 L 200.48,260.426 L 199.063,259.621 L 197.566,259.281 L 196.113,259.281 L 194.621,259.59 L 193.129,260.164 L 191.633,260.969 L 190.18,261.926 L 188.766,262.961 L 187.426,264.113 L 186.121,265.262 L 184.859,266.336 L 183.711,267.332 L 182.68,268.176 L 181.723,268.824 L 180.879,269.211 L 180.156,269.324 L 180.156,269.324 L 180.766,269.402 L 181.418,269.516 L 182.145,269.629 L 182.91,269.746 L 183.676,269.82 L 184.441,269.934 L 185.168,269.973 L 185.824,270.012 L 185.824,270.012 L 186.895,269.207 L 187.852,268.441 L 188.73,267.672 L 189.57,266.945 L 190.336,266.215 L 191.063,265.488 L 191.828,264.797 L 192.555,264.145 L 193.285,263.535 L 194.086,262.957 L 194.926,262.422 L 195.848,261.926 L 196.844,261.465 L 197.914,261.082 L 199.141,260.734 L 200.48,260.426"
5144 id="path1269" />
5145 </clipPath>
5146 <g
5147 id="g1271">
5148 </g>
5149
5150 <clipPath
5151 id="clippath70"> <path
5152 d="M 107.719,244.719 L 107.719,244.719 L 107.527,244.336 L 107.371,243.918 L 107.223,243.492 L 107.105,243.074 L 106.988,242.613 L 106.875,242.117 L 106.797,241.66 L 106.758,241.16 L 106.719,240.699 L 106.684,240.203 L 106.684,239.703 L 106.719,239.242 L 106.754,238.785 L 106.836,238.328 L 106.949,237.863 L 107.063,237.445 L 107.063,237.445 L 106.449,238.25 L 105.992,239.133 L 105.648,239.973 L 105.457,240.852 L 105.383,241.773 L 105.383,242.691 L 105.496,243.609 L 105.691,244.531 L 105.918,245.449 L 106.227,246.332 L 106.57,247.211 L 106.914,248.09 L 107.297,248.93 L 107.645,249.773 L 107.992,250.539 L 108.336,251.305 L 108.336,251.305 L 108.262,251.152 L 108.223,250.883 L 108.18,250.617 L 108.18,250.273 L 108.145,249.852 L 108.145,249.43 L 108.145,248.969 L 108.145,248.473 L 108.105,247.973 L 108.105,247.477 L 108.066,246.941 L 108.027,246.445 L 107.988,245.984 L 107.91,245.523 L 107.836,245.102 L 107.719,244.719"
5153 id="path1278" />
5154 </clipPath>
5155 <g
5156 id="g1280">
5157 </g>
5158
5159 <clipPath
5160 id="clippath71"> <path
5161 d="M 96.0469,260.621 L 96.0469,260.621 L 96.7773,260.199 L 97.5078,259.738 L 98.1953,259.316 L 98.8828,258.895 L 99.5313,258.512 L 100.184,258.09 L 100.801,257.707 L 101.41,257.285 L 102.063,256.902 L 102.668,256.52 L 103.285,256.098 L 103.938,255.715 L 104.547,255.328 L 105.203,254.988 L 105.887,254.602 L 106.578,254.219 L 106.578,254.219 L 105.969,254.828 L 105.352,255.445 L 104.738,256.055 L 104.129,256.633 L 103.477,257.242 L 102.828,257.781 L 102.176,258.32 L 101.488,258.816 L 100.836,259.277 L 100.148,259.66 L 99.4961,260.008 L 98.8086,260.273 L 98.1211,260.504 L 97.4297,260.617 L 96.7383,260.656 L 96.0469,260.621"
5162 id="path1287" />
5163 </clipPath>
5164 <g
5165 id="g1289">
5166 </g>
5167
5168 <clipPath
5169 id="clippath72"> <path
5170 d="M 101.746,245.297 L 101.746,245.297 L 102.281,245.605 L 102.781,245.949 L 103.277,246.293 L 103.738,246.637 L 104.199,246.98 L 104.617,247.363 L 105.039,247.785 L 105.465,248.168 L 105.848,248.59 L 106.23,249.012 L 106.613,249.43 L 106.957,249.891 L 107.34,250.309 L 107.684,250.77 L 108.031,251.23 L 108.375,251.687 L 108.375,251.687 L 108.223,251.191 L 108.031,250.691 L 107.84,250.156 L 107.609,249.621 L 107.379,249.086 L 107.113,248.586 L 106.801,248.09 L 106.457,247.594 L 106.074,247.133 L 105.613,246.754 L 105.117,246.367 L 104.578,246.023 L 103.969,245.754 L 103.316,245.527 L 102.551,245.371 L 101.746,245.297"
5171 id="path1296" />
5172 </clipPath>
5173 <g
5174 id="g1298">
5175 </g>
5176
5177 <clipPath
5178 id="clippath73"> <path
5179 d="M 107.449,240.699 L 107.449,240.699 L 107.602,241.465 L 107.715,242.23 L 107.871,242.996 L 107.984,243.684 L 108.063,244.414 L 108.176,245.102 L 108.258,245.793 L 108.371,246.48 L 108.445,247.168 L 108.563,247.859 L 108.637,248.512 L 108.758,249.199 L 108.871,249.891 L 108.988,250.578 L 109.137,251.27 L 109.293,251.992 L 109.293,251.992 L 109.371,251.152 L 109.484,250.348 L 109.563,249.504 L 109.637,248.664 L 109.676,247.859 L 109.715,247.051 L 109.672,246.246 L 109.637,245.484 L 109.559,244.719 L 109.441,244.031 L 109.246,243.34 L 109.02,242.687 L 108.75,242.113 L 108.367,241.578 L 107.945,241.117 L 107.449,240.699"
5180 id="path1305" />
5181 </clipPath>
5182 <g
5183 id="g1307">
5184 </g>
5185
5186 <clipPath
5187 id="clippath74"> <path
5188 d="M 103.348,239.441 L 103.348,239.441 L 102.656,240.09 L 102.238,240.816 L 102.051,241.582 L 102.012,242.387 L 102.164,243.191 L 102.473,244.035 L 102.895,244.879 L 103.395,245.719 L 103.969,246.523 L 104.578,247.328 L 105.23,248.09 L 105.883,248.781 L 106.496,249.469 L 107.031,250.043 L 107.492,250.582 L 107.879,250.996 L 107.879,250.996 L 107.801,249.852 L 107.645,248.855 L 107.453,248.012 L 107.148,247.246 L 106.844,246.598 L 106.457,246.023 L 106.074,245.488 L 105.652,244.988 L 105.27,244.492 L 104.844,243.996 L 104.461,243.457 L 104.156,242.844 L 103.848,242.156 L 103.617,241.391 L 103.422,240.473 L 103.348,239.441"
5189 id="path1314" />
5190 </clipPath>
5191 <g
5192 id="g1316">
5193 </g>
5194
5195 <clipPath
5196 id="clippath75"> <path
5197 d="M 108.262,251.687 L 108.262,251.687 L 108.262,250.883 L 108.297,250.117 L 108.332,249.355 L 108.371,248.586 L 108.41,247.859 L 108.484,247.133 L 108.563,246.406 L 108.637,245.715 L 108.754,245.023 L 108.867,244.297 L 109.055,243.609 L 109.211,242.918 L 109.402,242.23 L 109.668,241.539 L 109.898,240.852 L 110.203,240.16 L 110.203,240.16 L 109.094,241.234 L 108.176,242.27 L 107.527,243.227 L 107.07,244.148 L 106.762,245.027 L 106.609,245.832 L 106.57,246.637 L 106.648,247.363 L 106.844,248.051 L 107.031,248.703 L 107.301,249.316 L 107.57,249.852 L 107.801,250.387 L 108.031,250.848 L 108.18,251.305 L 108.262,251.687"
5198 id="path1323" />
5199 </clipPath>
5200 <g
5201 id="g1325">
5202 </g>
5203
5204 <clipPath
5205 id="clippath76"> <path
5206 d="M 97.0391,254.34 L 97.0391,254.34 L 97.6523,254.453 L 98.2695,254.57 L 98.8789,254.645 L 99.4922,254.719 L 100.102,254.758 L 100.719,254.797 L 101.332,254.797 L 101.906,254.797 L 102.52,254.758 L 103.094,254.758 L 103.707,254.68 L 104.281,254.641 L 104.895,254.602 L 105.469,254.527 L 106.082,254.445 L 106.656,254.371 L 106.656,254.371 L 106.156,254.602 L 105.66,254.793 L 105.121,255.023 L 104.586,255.215 L 104.012,255.406 L 103.434,255.559 L 102.824,255.715 L 102.215,255.793 L 101.602,255.832 L 100.953,255.832 L 100.301,255.793 L 99.6445,255.641 L 98.9922,255.449 L 98.3438,255.18 L 97.6953,254.801 L 97.0391,254.34"
5207 id="path1332" />
5208 </clipPath>
5209 <g
5210 id="g1334">
5211 </g>
5212
5213 <clipPath
5214 id="clippath77"> <path
5215 d="M 100.605,257.437 L 100.605,257.437 L 100.145,257.516 L 99.7266,257.594 L 99.2656,257.707 L 98.8086,257.824 L 98.3438,258.016 L 97.8906,258.168 L 97.4688,258.359 L 97.0078,258.59 L 96.5859,258.82 L 96.1641,259.086 L 95.7852,259.359 L 95.3984,259.625 L 95.0547,259.93 L 94.7461,260.238 L 94.4453,260.547 L 94.1719,260.891 L 94.1719,260.891 L 94.4414,259.973 L 94.8633,259.164 L 95.3594,258.477 L 95.9727,257.824 L 96.6602,257.289 L 97.4297,256.789 L 98.2695,256.367 L 99.1523,256.023 L 100.07,255.715 L 100.988,255.41 L 101.945,255.176 L 102.902,254.949 L 103.859,254.754 L 104.777,254.566 L 105.621,254.41 L 106.461,254.219 L 106.461,254.219 L 106.043,254.371 L 105.469,254.676 L 104.777,255.141 L 103.938,255.637 L 103.094,256.172 L 102.215,256.672 L 101.371,257.133 L 100.605,257.437"
5216 id="path1341" />
5217 </clipPath>
5218 <g
5219 id="g1343">
5220 </g>
5221
5222 <clipPath
5223 id="clippath78"> <path
5224 d="M 98.8477,264.297 L 98.8477,264.297 L 98.5039,263.414 L 98.3867,262.57 L 98.5039,261.73 L 98.8086,260.965 L 99.2656,260.234 L 99.8789,259.508 L 100.605,258.816 L 101.41,258.203 L 102.254,257.59 L 103.133,257.055 L 104.051,256.516 L 104.934,256.055 L 105.734,255.637 L 106.504,255.25 L 107.152,254.906 L 107.648,254.598 L 107.648,254.598 L 107.117,255.711 L 106.578,256.629 L 106.008,257.359 L 105.469,258.012 L 104.895,258.508 L 104.32,258.969 L 103.746,259.312 L 103.172,259.66 L 102.602,260.004 L 102.023,260.387 L 101.453,260.77 L 100.914,261.23 L 100.379,261.801 L 99.8438,262.492 L 99.3477,263.301 L 98.8477,264.297"
5225 id="path1350" />
5226 </clipPath>
5227 <g
5228 id="g1352">
5229 </g>
5230
5231 <clipPath
5232 id="clippath79"> <path
5233 d="M 108.492,254.863 L 108.492,254.863 L 108.07,255.52 L 107.652,256.172 L 107.23,256.82 L 106.852,257.434 L 106.504,258.047 L 106.16,258.695 L 105.816,259.309 L 105.512,259.926 L 105.207,260.578 L 104.938,261.227 L 104.668,261.84 L 104.438,262.488 L 104.246,263.18 L 104.055,263.832 L 103.902,264.52 L 103.75,265.25 L 103.75,265.25 L 103.367,263.797 L 103.176,262.531 L 103.137,261.383 L 103.25,260.387 L 103.477,259.543 L 103.824,258.777 L 104.242,258.125 L 104.703,257.59 L 105.238,257.09 L 105.777,256.668 L 106.348,256.324 L 106.887,256.016 L 107.383,255.711 L 107.844,255.445 L 108.184,255.137 L 108.492,254.863"
5234 id="path1359" />
5235 </clipPath>
5236 <g
5237 id="g1361">
5238 </g>
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289 <flowRoot
5290 xml:space="preserve"
5291 id="flowRoot2898"
5292 transform="matrix(0.3776217,0,0,0.3776217,94.044679,45.940478)"
5293 style="fill:url(#linearGradient3669);fill-opacity:1"><flowRegion
5294 id="flowRegion2900"
5295 style="fill:url(#linearGradient9226);fill-opacity:1"><rect
5296 id="rect2902"
5297 width="132.38824"
5298 height="126.51618"
5299 x="13.879412"
5300 y="32.076469"
5301 style="fill:url(#linearGradient9228);fill-opacity:1" /></flowRegion><flowPara
5302 id="flowPara2904"
5303 style="fill:url(#linearGradient9230);fill-opacity:1">1010100101010101011010101110101011101011101010111101011100101011101011010011000010101010101111011000110101010101100101110111010110011101</flowPara></flowRoot><path
5304 style="fill:url(#linearGradient3201);fill-opacity:1;fill-rule:nonzero;stroke:url(#linearGradient5155);stroke-width:1.27600002;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
5305 d="M 120.77778,62.052336 C 151.84292,16.081843 187.99543,38.077582 188.24589,38.743884 L 190.59972,32.295271 L 196.28191,51.75545 L 177.29362,48.14096 L 181.99895,45.132138 C 165.68448,34.231278 142.30462,47.848087 130.18353,69.108155 L 120.77778,62.052336 z "
5306 id="path2190"
5307 sodipodi:nodetypes="cccccccc" /><text
5308 xml:space="preserve"
5309 style="font-size:45.0126915px;font-style:normal;font-weight:normal;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;font-family:Bitstream Vera Sans"
5310 x="131.8634"
5311 y="92.587715"
5312 id="text4638"><tspan
5313 sodipodi:role="line"
5314 id="tspan4640"
5315 x="131.8634"
5316 y="92.587715">&lt;Apache Tika/&gt;</tspan></text>
5317 </g></svg>
0 <?xml version="1.0" encoding="UTF-8" standalone="no"?>
1 <!--
2 Licensed to the Apache Software Foundation (ASF) under one or more
3 contributor license agreements. See the NOTICE file distributed with
4 this work for additional information regarding copyright ownership.
5 The ASF licenses this file to You under the Apache License, Version 2.0
6 (the "License"); you may not use this file except in compliance with
7 the License. You may obtain a copy of the License at
8
9 http://www.apache.org/licenses/LICENSE-2.0
10
11 Unless required by applicable law or agreed to in writing, software
12 distributed under the License is distributed on an "AS IS" BASIS,
13 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 See the License for the specific language governing permissions and
15 limitations under the License.
16 -->
17 <svg
18 xmlns:dc="http://purl.org/dc/elements/1.1/"
19 xmlns:cc="http://web.resource.org/cc/"
20 xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
21 xmlns:svg="http://www.w3.org/2000/svg"
22 xmlns="http://www.w3.org/2000/svg"
23 xmlns:xlink="http://www.w3.org/1999/xlink"
24 xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
25 xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
26 width="363px"
27 height="94px"
28 viewBox="82px 238px 363px 94px"
29 id="svg2"
30 sodipodi:version="0.32"
31 inkscape:version="0.45.1"
32 sodipodi:docname="tikaNoText.svg"
33 inkscape:output_extension="org.inkscape.output.svg.inkscape"
34 sodipodi:docbase="C:\src\tika\trunk\src\site\resources"
35 inkscape:export-filename="C:\Users\Jukka Zitting\Desktop\tikaNoText16.png"
36 inkscape:export-xdpi="27.52"
37 inkscape:export-ydpi="27.52">
38 <metadata
39 id="metadata1566">
40 <rdf:RDF>
41 <cc:Work
42 rdf:about="">
43 <dc:format>image/svg+xml</dc:format>
44 <dc:type
45 rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
46 </cc:Work>
47 </rdf:RDF>
48 </metadata>
49 <defs
50 id="defs1564">
51 <linearGradient
52 id="XMLID_1_"
53 gradientUnits="userSpaceOnUse"
54 x1="-3662.4312"
55 y1="-3617.1401"
56 x2="-3663.4963"
57 y2="-3588.9297"
58 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
59 <stop
60 offset="0"
61 style="stop-color:#500C81"
62 id="stop173" />
63 <stop
64 offset="0.0374"
65 style="stop-color:#5F0B7A"
66 id="stop175" />
67 <stop
68 offset="0.1836"
69 style="stop-color:#96075F"
70 id="stop177" />
71 <stop
72 offset="0.3196"
73 style="stop-color:#C1044A"
74 id="stop179" />
75 <stop
76 offset="0.4412"
77 style="stop-color:#E0023B"
78 id="stop181" />
79 <stop
80 offset="0.5441"
81 style="stop-color:#F30032"
82 id="stop183" />
83 <stop
84 offset="0.6158"
85 style="stop-color:#FA002F"
86 id="stop185" />
87 <stop
88 offset="1"
89 style="stop-color:#F7EE5F"
90 id="stop187" />
91 </linearGradient>
92 <linearGradient
93 id="XMLID_2_"
94 gradientUnits="userSpaceOnUse"
95 x1="-3672.1465"
96 y1="-3607.502"
97 x2="-3673.0225"
98 y2="-3584.3003"
99 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
100 <stop
101 offset="0"
102 style="stop-color:#500C81"
103 id="stop192" />
104 <stop
105 offset="0.0374"
106 style="stop-color:#5F0B7A"
107 id="stop194" />
108 <stop
109 offset="0.1836"
110 style="stop-color:#96075F"
111 id="stop196" />
112 <stop
113 offset="0.3196"
114 style="stop-color:#C1044A"
115 id="stop198" />
116 <stop
117 offset="0.4412"
118 style="stop-color:#E0023B"
119 id="stop200" />
120 <stop
121 offset="0.5441"
122 style="stop-color:#F30032"
123 id="stop202" />
124 <stop
125 offset="0.6158"
126 style="stop-color:#FA002F"
127 id="stop204" />
128 <stop
129 offset="1"
130 style="stop-color:#F7EE5F"
131 id="stop206" />
132 </linearGradient>
133 <linearGradient
134 id="XMLID_3_"
135 gradientUnits="userSpaceOnUse"
136 x1="-3681.2422"
137 y1="-3614.8345"
138 x2="-3682.1765"
139 y2="-3590.0845"
140 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
141 <stop
142 offset="0"
143 style="stop-color:#500C81"
144 id="stop211" />
145 <stop
146 offset="0.0374"
147 style="stop-color:#5F0B7A"
148 id="stop213" />
149 <stop
150 offset="0.1836"
151 style="stop-color:#96075F"
152 id="stop215" />
153 <stop
154 offset="0.3196"
155 style="stop-color:#C1044A"
156 id="stop217" />
157 <stop
158 offset="0.4412"
159 style="stop-color:#E0023B"
160 id="stop219" />
161 <stop
162 offset="0.5441"
163 style="stop-color:#F30032"
164 id="stop221" />
165 <stop
166 offset="0.6158"
167 style="stop-color:#FA002F"
168 id="stop223" />
169 <stop
170 offset="1"
171 style="stop-color:#F7EE5F"
172 id="stop225" />
173 </linearGradient>
174 <linearGradient
175 id="XMLID_4_"
176 gradientUnits="userSpaceOnUse"
177 x1="-3689.5493"
178 y1="-3608.1592"
179 x2="-3690.4253"
180 y2="-3584.9575"
181 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
182 <stop
183 offset="0"
184 style="stop-color:#500C81"
185 id="stop230" />
186 <stop
187 offset="0.0374"
188 style="stop-color:#5F0B7A"
189 id="stop232" />
190 <stop
191 offset="0.1836"
192 style="stop-color:#96075F"
193 id="stop234" />
194 <stop
195 offset="0.3196"
196 style="stop-color:#C1044A"
197 id="stop236" />
198 <stop
199 offset="0.4412"
200 style="stop-color:#E0023B"
201 id="stop238" />
202 <stop
203 offset="0.5441"
204 style="stop-color:#F30032"
205 id="stop240" />
206 <stop
207 offset="0.6158"
208 style="stop-color:#FA002F"
209 id="stop242" />
210 <stop
211 offset="1"
212 style="stop-color:#F7EE5F"
213 id="stop244" />
214 </linearGradient>
215 <linearGradient
216 id="XMLID_5_"
217 gradientUnits="userSpaceOnUse"
218 x1="-3699.7769"
219 y1="-3613.4175"
220 x2="-3700.6736"
221 y2="-3589.6692"
222 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
223 <stop
224 offset="0"
225 style="stop-color:#500C81"
226 id="stop249" />
227 <stop
228 offset="0.0374"
229 style="stop-color:#5F0B7A"
230 id="stop251" />
231 <stop
232 offset="0.1836"
233 style="stop-color:#96075F"
234 id="stop253" />
235 <stop
236 offset="0.3196"
237 style="stop-color:#C1044A"
238 id="stop255" />
239 <stop
240 offset="0.4412"
241 style="stop-color:#E0023B"
242 id="stop257" />
243 <stop
244 offset="0.5441"
245 style="stop-color:#F30032"
246 id="stop259" />
247 <stop
248 offset="0.6158"
249 style="stop-color:#FA002F"
250 id="stop261" />
251 <stop
252 offset="1"
253 style="stop-color:#F7EE5F"
254 id="stop263" />
255 </linearGradient>
256 <linearGradient
257 id="XMLID_6_"
258 gradientUnits="userSpaceOnUse"
259 x1="-3706.9673"
260 y1="-3608.8169"
261 x2="-3707.8433"
262 y2="-3585.6152"
263 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
264 <stop
265 offset="0"
266 style="stop-color:#500C81"
267 id="stop268" />
268 <stop
269 offset="0.0374"
270 style="stop-color:#5F0B7A"
271 id="stop270" />
272 <stop
273 offset="0.1836"
274 style="stop-color:#96075F"
275 id="stop272" />
276 <stop
277 offset="0.3196"
278 style="stop-color:#C1044A"
279 id="stop274" />
280 <stop
281 offset="0.4412"
282 style="stop-color:#E0023B"
283 id="stop276" />
284 <stop
285 offset="0.5441"
286 style="stop-color:#F30032"
287 id="stop278" />
288 <stop
289 offset="0.6158"
290 style="stop-color:#FA002F"
291 id="stop280" />
292 <stop
293 offset="1"
294 style="stop-color:#F7EE5F"
295 id="stop282" />
296 </linearGradient>
297 <linearGradient
298 id="XMLID_7_"
299 gradientUnits="userSpaceOnUse"
300 x1="-3716.0044"
301 y1="-3611.9541"
302 x2="-3716.7644"
303 y2="-3591.8245"
304 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
305 <stop
306 offset="0"
307 style="stop-color:#500C81"
308 id="stop287" />
309 <stop
310 offset="0.0374"
311 style="stop-color:#5F0B7A"
312 id="stop289" />
313 <stop
314 offset="0.1836"
315 style="stop-color:#96075F"
316 id="stop291" />
317 <stop
318 offset="0.3196"
319 style="stop-color:#C1044A"
320 id="stop293" />
321 <stop
322 offset="0.4412"
323 style="stop-color:#E0023B"
324 id="stop295" />
325 <stop
326 offset="0.5441"
327 style="stop-color:#F30032"
328 id="stop297" />
329 <stop
330 offset="0.6158"
331 style="stop-color:#FA002F"
332 id="stop299" />
333 <stop
334 offset="1"
335 style="stop-color:#F7EE5F"
336 id="stop301" />
337 </linearGradient>
338 <linearGradient
339 id="XMLID_8_"
340 gradientUnits="userSpaceOnUse"
341 x1="-3725.0508"
342 y1="-3609.4995"
343 x2="-3725.9268"
344 y2="-3586.2979"
345 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
346 <stop
347 offset="0"
348 style="stop-color:#500C81"
349 id="stop306" />
350 <stop
351 offset="0.0374"
352 style="stop-color:#5F0B7A"
353 id="stop308" />
354 <stop
355 offset="0.1836"
356 style="stop-color:#96075F"
357 id="stop310" />
358 <stop
359 offset="0.3196"
360 style="stop-color:#C1044A"
361 id="stop312" />
362 <stop
363 offset="0.4412"
364 style="stop-color:#E0023B"
365 id="stop314" />
366 <stop
367 offset="0.5441"
368 style="stop-color:#F30032"
369 id="stop316" />
370 <stop
371 offset="0.6158"
372 style="stop-color:#FA002F"
373 id="stop318" />
374 <stop
375 offset="1"
376 style="stop-color:#F7EE5F"
377 id="stop320" />
378 </linearGradient>
379 <linearGradient
380 id="XMLID_9_"
381 gradientUnits="userSpaceOnUse"
382 x1="-3733.1665"
383 y1="-3613.2212"
384 x2="-3733.397"
385 y2="-3602.4053"
386 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
387 <stop
388 offset="0"
389 style="stop-color:#F7EE5F"
390 id="stop325" />
391 <stop
392 offset="0.1872"
393 style="stop-color:#F6D65D"
394 id="stop327" />
395 <stop
396 offset="0.3829"
397 style="stop-color:#F4C35B"
398 id="stop329" />
399 <stop
400 offset="0.5198"
401 style="stop-color:#F4BC5A"
402 id="stop331" />
403 <stop
404 offset="0.7816"
405 style="stop-color:#F6DA5D"
406 id="stop333" />
407 <stop
408 offset="1"
409 style="stop-color:#F7EE5F"
410 id="stop335" />
411 </linearGradient>
412 <linearGradient
413 id="XMLID_10_"
414 gradientUnits="userSpaceOnUse"
415 x1="-3742.7129"
416 y1="-3619.499"
417 x2="-3739.1846"
418 y2="-3596.0273"
419 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
420 <stop
421 offset="0"
422 style="stop-color:#F7EE5F"
423 id="stop340" />
424 <stop
425 offset="0.1872"
426 style="stop-color:#F6D65D"
427 id="stop342" />
428 <stop
429 offset="0.3829"
430 style="stop-color:#F4C35B"
431 id="stop344" />
432 <stop
433 offset="0.5198"
434 style="stop-color:#F4BC5A"
435 id="stop346" />
436 <stop
437 offset="0.7816"
438 style="stop-color:#F6DA5D"
439 id="stop348" />
440 <stop
441 offset="1"
442 style="stop-color:#F7EE5F"
443 id="stop350" />
444 </linearGradient>
445 <linearGradient
446 id="XMLID_11_"
447 gradientUnits="userSpaceOnUse"
448 x1="-3747.291"
449 y1="-3613.5225"
450 x2="-3747.5215"
451 y2="-3602.7065"
452 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
453 <stop
454 offset="0"
455 style="stop-color:#F7EE5F"
456 id="stop355" />
457 <stop
458 offset="0.1872"
459 style="stop-color:#F6D65D"
460 id="stop357" />
461 <stop
462 offset="0.3829"
463 style="stop-color:#F4C35B"
464 id="stop359" />
465 <stop
466 offset="0.5198"
467 style="stop-color:#F4BC5A"
468 id="stop361" />
469 <stop
470 offset="0.7816"
471 style="stop-color:#F6DA5D"
472 id="stop363" />
473 <stop
474 offset="1"
475 style="stop-color:#F7EE5F"
476 id="stop365" />
477 </linearGradient>
478 <linearGradient
479 id="XMLID_12_"
480 gradientUnits="userSpaceOnUse"
481 x1="-3756.9526"
482 y1="-3617.3584"
483 x2="-3753.4243"
484 y2="-3593.8867"
485 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
486 <stop
487 offset="0"
488 style="stop-color:#F7EE5F"
489 id="stop370" />
490 <stop
491 offset="0.1872"
492 style="stop-color:#F6D65D"
493 id="stop372" />
494 <stop
495 offset="0.3829"
496 style="stop-color:#F4C35B"
497 id="stop374" />
498 <stop
499 offset="0.5198"
500 style="stop-color:#F4BC5A"
501 id="stop376" />
502 <stop
503 offset="0.7816"
504 style="stop-color:#F6DA5D"
505 id="stop378" />
506 <stop
507 offset="1"
508 style="stop-color:#F7EE5F"
509 id="stop380" />
510 </linearGradient>
511 <linearGradient
512 id="XMLID_13_"
513 gradientUnits="userSpaceOnUse"
514 x1="-3759.7539"
515 y1="-3613.7876"
516 x2="-3759.9844"
517 y2="-3602.9717"
518 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
519 <stop
520 offset="0"
521 style="stop-color:#F7EE5F"
522 id="stop385" />
523 <stop
524 offset="0.1872"
525 style="stop-color:#F6D65D"
526 id="stop387" />
527 <stop
528 offset="0.3829"
529 style="stop-color:#F4C35B"
530 id="stop389" />
531 <stop
532 offset="0.5198"
533 style="stop-color:#F4BC5A"
534 id="stop391" />
535 <stop
536 offset="0.7816"
537 style="stop-color:#F6DA5D"
538 id="stop393" />
539 <stop
540 offset="1"
541 style="stop-color:#F7EE5F"
542 id="stop395" />
543 </linearGradient>
544 <linearGradient
545 id="XMLID_14_"
546 gradientUnits="userSpaceOnUse"
547 x1="-3763.1528"
548 y1="-3613.8604"
549 x2="-3763.3833"
550 y2="-3603.0444"
551 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
552 <stop
553 offset="0"
554 style="stop-color:#F7EE5F"
555 id="stop400" />
556 <stop
557 offset="0.1872"
558 style="stop-color:#F6D65D"
559 id="stop402" />
560 <stop
561 offset="0.3829"
562 style="stop-color:#F4C35B"
563 id="stop404" />
564 <stop
565 offset="0.5198"
566 style="stop-color:#F4BC5A"
567 id="stop406" />
568 <stop
569 offset="0.7816"
570 style="stop-color:#F6DA5D"
571 id="stop408" />
572 <stop
573 offset="1"
574 style="stop-color:#F7EE5F"
575 id="stop410" />
576 </linearGradient>
577 <linearGradient
578 id="XMLID_15_"
579 gradientUnits="userSpaceOnUse"
580 x1="-3766.9253"
581 y1="-3606.5742"
582 x2="-3767.0735"
583 y2="-3599.615"
584 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
585 <stop
586 offset="0"
587 style="stop-color:#F7EE5F"
588 id="stop415" />
589 <stop
590 offset="0.1872"
591 style="stop-color:#F6D65D"
592 id="stop417" />
593 <stop
594 offset="0.3829"
595 style="stop-color:#F4C35B"
596 id="stop419" />
597 <stop
598 offset="0.5198"
599 style="stop-color:#F4BC5A"
600 id="stop421" />
601 <stop
602 offset="0.7816"
603 style="stop-color:#F6DA5D"
604 id="stop423" />
605 <stop
606 offset="1"
607 style="stop-color:#F7EE5F"
608 id="stop425" />
609 </linearGradient>
610 <linearGradient
611 id="XMLID_16_"
612 gradientUnits="userSpaceOnUse"
613 x1="-3767.0049"
614 y1="-3613.9424"
615 x2="-3767.2354"
616 y2="-3603.1265"
617 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
618 <stop
619 offset="0"
620 style="stop-color:#F7EE5F"
621 id="stop430" />
622 <stop
623 offset="0.1872"
624 style="stop-color:#F6D65D"
625 id="stop432" />
626 <stop
627 offset="0.3829"
628 style="stop-color:#F4C35B"
629 id="stop434" />
630 <stop
631 offset="0.5198"
632 style="stop-color:#F4BC5A"
633 id="stop436" />
634 <stop
635 offset="0.7816"
636 style="stop-color:#F6DA5D"
637 id="stop438" />
638 <stop
639 offset="1"
640 style="stop-color:#F7EE5F"
641 id="stop440" />
642 </linearGradient>
643 <linearGradient
644 id="XMLID_17_"
645 gradientUnits="userSpaceOnUse"
646 x1="-3664.459"
647 y1="-3609.6206"
648 x2="-3674.479"
649 y2="-3642.6406"
650 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
651 <stop
652 offset="0"
653 style="stop-color:#500C81"
654 id="stop445" />
655 <stop
656 offset="0.0374"
657 style="stop-color:#5F0B7A"
658 id="stop447" />
659 <stop
660 offset="0.1836"
661 style="stop-color:#96075F"
662 id="stop449" />
663 <stop
664 offset="0.3196"
665 style="stop-color:#C1044A"
666 id="stop451" />
667 <stop
668 offset="0.4412"
669 style="stop-color:#E0023B"
670 id="stop453" />
671 <stop
672 offset="0.5441"
673 style="stop-color:#F30032"
674 id="stop455" />
675 <stop
676 offset="0.6158"
677 style="stop-color:#FA002F"
678 id="stop457" />
679 <stop
680 offset="1"
681 style="stop-color:#F7EE5F"
682 id="stop459" />
683 </linearGradient>
684 <linearGradient
685 id="XMLID_18_"
686 gradientUnits="userSpaceOnUse"
687 x1="-3679.1553"
688 y1="-3625.2759"
689 x2="-3697.104"
690 y2="-3651.5425"
691 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
692 <stop
693 offset="0"
694 style="stop-color:#500C81"
695 id="stop464" />
696 <stop
697 offset="0.0374"
698 style="stop-color:#5F0B7A"
699 id="stop466" />
700 <stop
701 offset="0.1836"
702 style="stop-color:#96075F"
703 id="stop468" />
704 <stop
705 offset="0.3196"
706 style="stop-color:#C1044A"
707 id="stop470" />
708 <stop
709 offset="0.4412"
710 style="stop-color:#E0023B"
711 id="stop472" />
712 <stop
713 offset="0.5441"
714 style="stop-color:#F30032"
715 id="stop474" />
716 <stop
717 offset="0.6158"
718 style="stop-color:#FA002F"
719 id="stop476" />
720 <stop
721 offset="1"
722 style="stop-color:#F7EE5F"
723 id="stop478" />
724 </linearGradient>
725 <linearGradient
726 id="XMLID_19_"
727 gradientUnits="userSpaceOnUse"
728 x1="-3680.7471"
729 y1="-3604.6782"
730 x2="-3690.7671"
731 y2="-3637.6982"
732 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
733 <stop
734 offset="0"
735 style="stop-color:#500C81"
736 id="stop483" />
737 <stop
738 offset="0.0374"
739 style="stop-color:#5F0B7A"
740 id="stop485" />
741 <stop
742 offset="0.1836"
743 style="stop-color:#96075F"
744 id="stop487" />
745 <stop
746 offset="0.3196"
747 style="stop-color:#C1044A"
748 id="stop489" />
749 <stop
750 offset="0.4412"
751 style="stop-color:#E0023B"
752 id="stop491" />
753 <stop
754 offset="0.5441"
755 style="stop-color:#F30032"
756 id="stop493" />
757 <stop
758 offset="0.6158"
759 style="stop-color:#FA002F"
760 id="stop495" />
761 <stop
762 offset="1"
763 style="stop-color:#F7EE5F"
764 id="stop497" />
765 </linearGradient>
766 <linearGradient
767 id="XMLID_20_"
768 gradientUnits="userSpaceOnUse"
769 x1="-3693.1763"
770 y1="-3615.6953"
771 x2="-3711.125"
772 y2="-3641.9619"
773 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
774 <stop
775 offset="0"
776 style="stop-color:#500C81"
777 id="stop502" />
778 <stop
779 offset="0.0374"
780 style="stop-color:#5F0B7A"
781 id="stop504" />
782 <stop
783 offset="0.1836"
784 style="stop-color:#96075F"
785 id="stop506" />
786 <stop
787 offset="0.3196"
788 style="stop-color:#C1044A"
789 id="stop508" />
790 <stop
791 offset="0.4412"
792 style="stop-color:#E0023B"
793 id="stop510" />
794 <stop
795 offset="0.5441"
796 style="stop-color:#F30032"
797 id="stop512" />
798 <stop
799 offset="0.6158"
800 style="stop-color:#FA002F"
801 id="stop514" />
802 <stop
803 offset="1"
804 style="stop-color:#F7EE5F"
805 id="stop516" />
806 </linearGradient>
807 <linearGradient
808 id="XMLID_21_"
809 gradientUnits="userSpaceOnUse"
810 x1="-3698.873"
811 y1="-3599.1777"
812 x2="-3708.8931"
813 y2="-3632.1978"
814 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
815 <stop
816 offset="0"
817 style="stop-color:#500C81"
818 id="stop521" />
819 <stop
820 offset="0.0374"
821 style="stop-color:#5F0B7A"
822 id="stop523" />
823 <stop
824 offset="0.1836"
825 style="stop-color:#96075F"
826 id="stop525" />
827 <stop
828 offset="0.3196"
829 style="stop-color:#C1044A"
830 id="stop527" />
831 <stop
832 offset="0.4412"
833 style="stop-color:#E0023B"
834 id="stop529" />
835 <stop
836 offset="0.5441"
837 style="stop-color:#F30032"
838 id="stop531" />
839 <stop
840 offset="0.6158"
841 style="stop-color:#FA002F"
842 id="stop533" />
843 <stop
844 offset="1"
845 style="stop-color:#F7EE5F"
846 id="stop535" />
847 </linearGradient>
848 <linearGradient
849 id="XMLID_22_"
850 gradientUnits="userSpaceOnUse"
851 x1="-3707.5356"
852 y1="-3605.8828"
853 x2="-3725.4844"
854 y2="-3632.1494"
855 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
856 <stop
857 offset="0"
858 style="stop-color:#500C81"
859 id="stop540" />
860 <stop
861 offset="0.0374"
862 style="stop-color:#5F0B7A"
863 id="stop542" />
864 <stop
865 offset="0.1836"
866 style="stop-color:#96075F"
867 id="stop544" />
868 <stop
869 offset="0.3196"
870 style="stop-color:#C1044A"
871 id="stop546" />
872 <stop
873 offset="0.4412"
874 style="stop-color:#E0023B"
875 id="stop548" />
876 <stop
877 offset="0.5441"
878 style="stop-color:#F30032"
879 id="stop550" />
880 <stop
881 offset="0.6158"
882 style="stop-color:#FA002F"
883 id="stop552" />
884 <stop
885 offset="1"
886 style="stop-color:#F7EE5F"
887 id="stop554" />
888 </linearGradient>
889 <linearGradient
890 id="XMLID_23_"
891 gradientUnits="userSpaceOnUse"
892 x1="-3718.1577"
893 y1="-3593.3257"
894 x2="-3728.1777"
895 y2="-3626.3457"
896 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
897 <stop
898 offset="0"
899 style="stop-color:#500C81"
900 id="stop559" />
901 <stop
902 offset="0.0374"
903 style="stop-color:#5F0B7A"
904 id="stop561" />
905 <stop
906 offset="0.1836"
907 style="stop-color:#96075F"
908 id="stop563" />
909 <stop
910 offset="0.3196"
911 style="stop-color:#C1044A"
912 id="stop565" />
913 <stop
914 offset="0.4412"
915 style="stop-color:#E0023B"
916 id="stop567" />
917 <stop
918 offset="0.5441"
919 style="stop-color:#F30032"
920 id="stop569" />
921 <stop
922 offset="0.6158"
923 style="stop-color:#FA002F"
924 id="stop571" />
925 <stop
926 offset="1"
927 style="stop-color:#F7EE5F"
928 id="stop573" />
929 </linearGradient>
930 <linearGradient
931 id="XMLID_24_"
932 gradientUnits="userSpaceOnUse"
933 x1="-3740.3467"
934 y1="-3613.374"
935 x2="-3740.5771"
936 y2="-3602.5581"
937 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
938 <stop
939 offset="0"
940 style="stop-color:#F7EE5F"
941 id="stop578" />
942 <stop
943 offset="0.1872"
944 style="stop-color:#F6D65D"
945 id="stop580" />
946 <stop
947 offset="0.3829"
948 style="stop-color:#F4C35B"
949 id="stop582" />
950 <stop
951 offset="0.5198"
952 style="stop-color:#F4BC5A"
953 id="stop584" />
954 <stop
955 offset="0.7816"
956 style="stop-color:#F6DA5D"
957 id="stop586" />
958 <stop
959 offset="1"
960 style="stop-color:#F7EE5F"
961 id="stop588" />
962 </linearGradient>
963 <linearGradient
964 id="XMLID_25_"
965 gradientUnits="userSpaceOnUse"
966 x1="-3735.0215"
967 y1="-3620.6553"
968 x2="-3731.4932"
969 y2="-3597.1836"
970 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
971 <stop
972 offset="0"
973 style="stop-color:#F7EE5F"
974 id="stop593" />
975 <stop
976 offset="0.1872"
977 style="stop-color:#F6D65D"
978 id="stop595" />
979 <stop
980 offset="0.3829"
981 style="stop-color:#F4C35B"
982 id="stop597" />
983 <stop
984 offset="0.5198"
985 style="stop-color:#F4BC5A"
986 id="stop599" />
987 <stop
988 offset="0.7816"
989 style="stop-color:#F6DA5D"
990 id="stop601" />
991 <stop
992 offset="1"
993 style="stop-color:#F7EE5F"
994 id="stop603" />
995 </linearGradient>
996 <linearGradient
997 id="XMLID_26_"
998 gradientUnits="userSpaceOnUse"
999 x1="-3748.5347"
1000 y1="-3618.624"
1001 x2="-3745.0063"
1002 y2="-3595.1523"
1003 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1004 <stop
1005 offset="0"
1006 style="stop-color:#F7EE5F"
1007 id="stop608" />
1008 <stop
1009 offset="0.1872"
1010 style="stop-color:#F6D65D"
1011 id="stop610" />
1012 <stop
1013 offset="0.3829"
1014 style="stop-color:#F4C35B"
1015 id="stop612" />
1016 <stop
1017 offset="0.5198"
1018 style="stop-color:#F4BC5A"
1019 id="stop614" />
1020 <stop
1021 offset="0.7816"
1022 style="stop-color:#F6DA5D"
1023 id="stop616" />
1024 <stop
1025 offset="1"
1026 style="stop-color:#F7EE5F"
1027 id="stop618" />
1028 </linearGradient>
1029 <linearGradient
1030 id="XMLID_27_"
1031 gradientUnits="userSpaceOnUse"
1032 x1="-3761.7197"
1033 y1="-3616.6421"
1034 x2="-3758.1914"
1035 y2="-3593.1704"
1036 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1037 <stop
1038 offset="0"
1039 style="stop-color:#F7EE5F"
1040 id="stop623" />
1041 <stop
1042 offset="0.1872"
1043 style="stop-color:#F6D65D"
1044 id="stop625" />
1045 <stop
1046 offset="0.3829"
1047 style="stop-color:#F4C35B"
1048 id="stop627" />
1049 <stop
1050 offset="0.5198"
1051 style="stop-color:#F4BC5A"
1052 id="stop629" />
1053 <stop
1054 offset="0.7816"
1055 style="stop-color:#F6DA5D"
1056 id="stop631" />
1057 <stop
1058 offset="1"
1059 style="stop-color:#F7EE5F"
1060 id="stop633" />
1061 </linearGradient>
1062 <linearGradient
1063 id="XMLID_28_"
1064 gradientUnits="userSpaceOnUse"
1065 x1="-3763.478"
1066 y1="-3613.8672"
1067 x2="-3763.7085"
1068 y2="-3603.0513"
1069 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1070 <stop
1071 offset="0"
1072 style="stop-color:#F7EE5F"
1073 id="stop638" />
1074 <stop
1075 offset="0.1872"
1076 style="stop-color:#F6D65D"
1077 id="stop640" />
1078 <stop
1079 offset="0.3829"
1080 style="stop-color:#F4C35B"
1081 id="stop642" />
1082 <stop
1083 offset="0.5198"
1084 style="stop-color:#F4BC5A"
1085 id="stop644" />
1086 <stop
1087 offset="0.7816"
1088 style="stop-color:#F6DA5D"
1089 id="stop646" />
1090 <stop
1091 offset="1"
1092 style="stop-color:#F7EE5F"
1093 id="stop648" />
1094 </linearGradient>
1095 <linearGradient
1096 id="XMLID_29_"
1097 gradientUnits="userSpaceOnUse"
1098 x1="-3754.6851"
1099 y1="-3613.6797"
1100 x2="-3754.9155"
1101 y2="-3602.8638"
1102 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1103 <stop
1104 offset="0"
1105 style="stop-color:#F7EE5F"
1106 id="stop653" />
1107 <stop
1108 offset="0.1872"
1109 style="stop-color:#F6D65D"
1110 id="stop655" />
1111 <stop
1112 offset="0.3829"
1113 style="stop-color:#F4C35B"
1114 id="stop657" />
1115 <stop
1116 offset="0.5198"
1117 style="stop-color:#F4BC5A"
1118 id="stop659" />
1119 <stop
1120 offset="0.7816"
1121 style="stop-color:#F6DA5D"
1122 id="stop661" />
1123 <stop
1124 offset="1"
1125 style="stop-color:#F7EE5F"
1126 id="stop663" />
1127 </linearGradient>
1128 <linearGradient
1129 id="XMLID_30_"
1130 gradientUnits="userSpaceOnUse"
1131 x1="-3511.8975"
1132 y1="-3643.6323"
1133 x2="-3504.0176"
1134 y2="-3620.4302"
1135 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1136 <stop
1137 offset="0"
1138 style="stop-color:#691183"
1139 id="stop668" />
1140 <stop
1141 offset="0.0485"
1142 style="stop-color:#840E73"
1143 id="stop670" />
1144 <stop
1145 offset="0.125"
1146 style="stop-color:#A80A5E"
1147 id="stop672" />
1148 <stop
1149 offset="0.2057"
1150 style="stop-color:#C6064D"
1151 id="stop674" />
1152 <stop
1153 offset="0.2906"
1154 style="stop-color:#DD0340"
1155 id="stop676" />
1156 <stop
1157 offset="0.3816"
1158 style="stop-color:#ED0136"
1159 id="stop678" />
1160 <stop
1161 offset="0.483"
1162 style="stop-color:#F70031"
1163 id="stop680" />
1164 <stop
1165 offset="0.6158"
1166 style="stop-color:#FA002F"
1167 id="stop682" />
1168 <stop
1169 offset="1"
1170 style="stop-color:#F7EE5F"
1171 id="stop684" />
1172 </linearGradient>
1173 <linearGradient
1174 id="XMLID_31_"
1175 gradientUnits="userSpaceOnUse"
1176 x1="-3517.9097"
1177 y1="-3641.5903"
1178 x2="-3510.0298"
1179 y2="-3618.3882"
1180 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1181 <stop
1182 offset="0"
1183 style="stop-color:#691183"
1184 id="stop689" />
1185 <stop
1186 offset="0.0485"
1187 style="stop-color:#840E73"
1188 id="stop691" />
1189 <stop
1190 offset="0.125"
1191 style="stop-color:#A80A5E"
1192 id="stop693" />
1193 <stop
1194 offset="0.2057"
1195 style="stop-color:#C6064D"
1196 id="stop695" />
1197 <stop
1198 offset="0.2906"
1199 style="stop-color:#DD0340"
1200 id="stop697" />
1201 <stop
1202 offset="0.3816"
1203 style="stop-color:#ED0136"
1204 id="stop699" />
1205 <stop
1206 offset="0.483"
1207 style="stop-color:#F70031"
1208 id="stop701" />
1209 <stop
1210 offset="0.6158"
1211 style="stop-color:#FA002F"
1212 id="stop703" />
1213 <stop
1214 offset="1"
1215 style="stop-color:#F7EE5F"
1216 id="stop705" />
1217 </linearGradient>
1218 <linearGradient
1219 id="XMLID_32_"
1220 gradientUnits="userSpaceOnUse"
1221 x1="-3524.9941"
1222 y1="-3639.1846"
1223 x2="-3517.1143"
1224 y2="-3615.9824"
1225 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1226 <stop
1227 offset="0"
1228 style="stop-color:#691183"
1229 id="stop710" />
1230 <stop
1231 offset="0.0485"
1232 style="stop-color:#840E73"
1233 id="stop712" />
1234 <stop
1235 offset="0.125"
1236 style="stop-color:#A80A5E"
1237 id="stop714" />
1238 <stop
1239 offset="0.2057"
1240 style="stop-color:#C6064D"
1241 id="stop716" />
1242 <stop
1243 offset="0.2906"
1244 style="stop-color:#DD0340"
1245 id="stop718" />
1246 <stop
1247 offset="0.3816"
1248 style="stop-color:#ED0136"
1249 id="stop720" />
1250 <stop
1251 offset="0.483"
1252 style="stop-color:#F70031"
1253 id="stop722" />
1254 <stop
1255 offset="0.6158"
1256 style="stop-color:#FA002F"
1257 id="stop724" />
1258 <stop
1259 offset="1"
1260 style="stop-color:#F7EE5F"
1261 id="stop726" />
1262 </linearGradient>
1263 <linearGradient
1264 id="XMLID_33_"
1265 gradientUnits="userSpaceOnUse"
1266 x1="-3532.7456"
1267 y1="-3636.5518"
1268 x2="-3524.8657"
1269 y2="-3613.3496"
1270 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1271 <stop
1272 offset="0"
1273 style="stop-color:#691183"
1274 id="stop731" />
1275 <stop
1276 offset="0.0485"
1277 style="stop-color:#840E73"
1278 id="stop733" />
1279 <stop
1280 offset="0.125"
1281 style="stop-color:#A80A5E"
1282 id="stop735" />
1283 <stop
1284 offset="0.2057"
1285 style="stop-color:#C6064D"
1286 id="stop737" />
1287 <stop
1288 offset="0.2906"
1289 style="stop-color:#DD0340"
1290 id="stop739" />
1291 <stop
1292 offset="0.3816"
1293 style="stop-color:#ED0136"
1294 id="stop741" />
1295 <stop
1296 offset="0.483"
1297 style="stop-color:#F70031"
1298 id="stop743" />
1299 <stop
1300 offset="0.6158"
1301 style="stop-color:#FA002F"
1302 id="stop745" />
1303 <stop
1304 offset="1"
1305 style="stop-color:#F7EE5F"
1306 id="stop747" />
1307 </linearGradient>
1308 <linearGradient
1309 id="XMLID_34_"
1310 gradientUnits="userSpaceOnUse"
1311 x1="-3612.71"
1312 y1="-3562.9639"
1313 x2="-3607.5464"
1314 y2="-3533.6799"
1315 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,105.42691,1026.1774)">
1316 <stop
1317 offset="0"
1318 style="stop-color:#691183"
1319 id="stop752" />
1320 <stop
1321 offset="0.082"
1322 style="stop-color:#711282"
1323 id="stop754" />
1324 <stop
1325 offset="0.211"
1326 style="stop-color:#88137F"
1327 id="stop756" />
1328 <stop
1329 offset="0.3709"
1330 style="stop-color:#AD167A"
1331 id="stop758" />
1332 <stop
1333 offset="0.5538"
1334 style="stop-color:#DF1A73"
1335 id="stop760" />
1336 <stop
1337 offset="0.6158"
1338 style="stop-color:#F21B71"
1339 id="stop762" />
1340 <stop
1341 offset="1"
1342 style="stop-color:#F7EE5F"
1343 id="stop764" />
1344 </linearGradient>
1345 <linearGradient
1346 id="XMLID_35_"
1347 gradientUnits="userSpaceOnUse"
1348 x1="-3543.3057"
1349 y1="-3626.2104"
1350 x2="-3540.6792"
1351 y2="-3608.2617"
1352 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1353 <stop
1354 offset="0"
1355 style="stop-color:#691183"
1356 id="stop769" />
1357 <stop
1358 offset="0.0485"
1359 style="stop-color:#840E73"
1360 id="stop771" />
1361 <stop
1362 offset="0.125"
1363 style="stop-color:#A80A5E"
1364 id="stop773" />
1365 <stop
1366 offset="0.2057"
1367 style="stop-color:#C6064D"
1368 id="stop775" />
1369 <stop
1370 offset="0.2906"
1371 style="stop-color:#DD0340"
1372 id="stop777" />
1373 <stop
1374 offset="0.3816"
1375 style="stop-color:#ED0136"
1376 id="stop779" />
1377 <stop
1378 offset="0.483"
1379 style="stop-color:#F70031"
1380 id="stop781" />
1381 <stop
1382 offset="0.6158"
1383 style="stop-color:#FA002F"
1384 id="stop783" />
1385 <stop
1386 offset="1"
1387 style="stop-color:#F7EE5F"
1388 id="stop785" />
1389 </linearGradient>
1390 <linearGradient
1391 id="XMLID_36_"
1392 gradientUnits="userSpaceOnUse"
1393 x1="-3551.1382"
1394 y1="-3625.064"
1395 x2="-3548.5117"
1396 y2="-3607.1152"
1397 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1398 <stop
1399 offset="0"
1400 style="stop-color:#691183"
1401 id="stop790" />
1402 <stop
1403 offset="0.0485"
1404 style="stop-color:#840E73"
1405 id="stop792" />
1406 <stop
1407 offset="0.125"
1408 style="stop-color:#A80A5E"
1409 id="stop794" />
1410 <stop
1411 offset="0.2057"
1412 style="stop-color:#C6064D"
1413 id="stop796" />
1414 <stop
1415 offset="0.2906"
1416 style="stop-color:#DD0340"
1417 id="stop798" />
1418 <stop
1419 offset="0.3816"
1420 style="stop-color:#ED0136"
1421 id="stop800" />
1422 <stop
1423 offset="0.483"
1424 style="stop-color:#F70031"
1425 id="stop802" />
1426 <stop
1427 offset="0.6158"
1428 style="stop-color:#FA002F"
1429 id="stop804" />
1430 <stop
1431 offset="1"
1432 style="stop-color:#F7EE5F"
1433 id="stop806" />
1434 </linearGradient>
1435 <linearGradient
1436 id="XMLID_37_"
1437 gradientUnits="userSpaceOnUse"
1438 x1="-3558.2891"
1439 y1="-3625.9536"
1440 x2="-3554.1504"
1441 y2="-3586.3257"
1442 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1443 <stop
1444 offset="0"
1445 style="stop-color:#691183"
1446 id="stop811" />
1447 <stop
1448 offset="0.0485"
1449 style="stop-color:#840E73"
1450 id="stop813" />
1451 <stop
1452 offset="0.125"
1453 style="stop-color:#A80A5E"
1454 id="stop815" />
1455 <stop
1456 offset="0.2057"
1457 style="stop-color:#C6064D"
1458 id="stop817" />
1459 <stop
1460 offset="0.2906"
1461 style="stop-color:#DD0340"
1462 id="stop819" />
1463 <stop
1464 offset="0.3816"
1465 style="stop-color:#ED0136"
1466 id="stop821" />
1467 <stop
1468 offset="0.483"
1469 style="stop-color:#F70031"
1470 id="stop823" />
1471 <stop
1472 offset="0.6158"
1473 style="stop-color:#FA002F"
1474 id="stop825" />
1475 <stop
1476 offset="1"
1477 style="stop-color:#F7EE5F"
1478 id="stop827" />
1479 </linearGradient>
1480 <linearGradient
1481 id="XMLID_38_"
1482 gradientUnits="userSpaceOnUse"
1483 x1="-3568.6143"
1484 y1="-3622.5068"
1485 x2="-3565.9878"
1486 y2="-3604.5581"
1487 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1488 <stop
1489 offset="0"
1490 style="stop-color:#691183"
1491 id="stop832" />
1492 <stop
1493 offset="0.0485"
1494 style="stop-color:#840E73"
1495 id="stop834" />
1496 <stop
1497 offset="0.125"
1498 style="stop-color:#A80A5E"
1499 id="stop836" />
1500 <stop
1501 offset="0.2057"
1502 style="stop-color:#C6064D"
1503 id="stop838" />
1504 <stop
1505 offset="0.2906"
1506 style="stop-color:#DD0340"
1507 id="stop840" />
1508 <stop
1509 offset="0.3816"
1510 style="stop-color:#ED0136"
1511 id="stop842" />
1512 <stop
1513 offset="0.483"
1514 style="stop-color:#F70031"
1515 id="stop844" />
1516 <stop
1517 offset="0.6158"
1518 style="stop-color:#FA002F"
1519 id="stop846" />
1520 <stop
1521 offset="1"
1522 style="stop-color:#F7EE5F"
1523 id="stop848" />
1524 </linearGradient>
1525 <linearGradient
1526 id="XMLID_39_"
1527 gradientUnits="userSpaceOnUse"
1528 x1="-3576.6631"
1529 y1="-3624.0347"
1530 x2="-3572.5244"
1531 y2="-3584.4067"
1532 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1533 <stop
1534 offset="0"
1535 style="stop-color:#691183"
1536 id="stop853" />
1537 <stop
1538 offset="0.0485"
1539 style="stop-color:#840E73"
1540 id="stop855" />
1541 <stop
1542 offset="0.125"
1543 style="stop-color:#A80A5E"
1544 id="stop857" />
1545 <stop
1546 offset="0.2057"
1547 style="stop-color:#C6064D"
1548 id="stop859" />
1549 <stop
1550 offset="0.2906"
1551 style="stop-color:#DD0340"
1552 id="stop861" />
1553 <stop
1554 offset="0.3816"
1555 style="stop-color:#ED0136"
1556 id="stop863" />
1557 <stop
1558 offset="0.483"
1559 style="stop-color:#F70031"
1560 id="stop865" />
1561 <stop
1562 offset="0.6158"
1563 style="stop-color:#FA002F"
1564 id="stop867" />
1565 <stop
1566 offset="1"
1567 style="stop-color:#F7EE5F"
1568 id="stop869" />
1569 </linearGradient>
1570 <linearGradient
1571 id="XMLID_40_"
1572 gradientUnits="userSpaceOnUse"
1573 x1="-3583.9473"
1574 y1="-3623.7251"
1575 x2="-3582.5107"
1576 y2="-3603.1548"
1577 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1578 <stop
1579 offset="0"
1580 style="stop-color:#691183"
1581 id="stop874" />
1582 <stop
1583 offset="0.0485"
1584 style="stop-color:#840E73"
1585 id="stop876" />
1586 <stop
1587 offset="0.125"
1588 style="stop-color:#A80A5E"
1589 id="stop878" />
1590 <stop
1591 offset="0.2057"
1592 style="stop-color:#C6064D"
1593 id="stop880" />
1594 <stop
1595 offset="0.2906"
1596 style="stop-color:#DD0340"
1597 id="stop882" />
1598 <stop
1599 offset="0.3816"
1600 style="stop-color:#ED0136"
1601 id="stop884" />
1602 <stop
1603 offset="0.483"
1604 style="stop-color:#F70031"
1605 id="stop886" />
1606 <stop
1607 offset="0.6158"
1608 style="stop-color:#FA002F"
1609 id="stop888" />
1610 <stop
1611 offset="1"
1612 style="stop-color:#F7EE5F"
1613 id="stop890" />
1614 </linearGradient>
1615 <linearGradient
1616 id="XMLID_41_"
1617 gradientUnits="userSpaceOnUse"
1618 x1="-3591.7266"
1619 y1="-3619.7495"
1620 x2="-3589.9756"
1621 y2="-3592.6074"
1622 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1623 <stop
1624 offset="0"
1625 style="stop-color:#691183"
1626 id="stop895" />
1627 <stop
1628 offset="0.0485"
1629 style="stop-color:#840E73"
1630 id="stop897" />
1631 <stop
1632 offset="0.125"
1633 style="stop-color:#A80A5E"
1634 id="stop899" />
1635 <stop
1636 offset="0.2057"
1637 style="stop-color:#C6064D"
1638 id="stop901" />
1639 <stop
1640 offset="0.2906"
1641 style="stop-color:#DD0340"
1642 id="stop903" />
1643 <stop
1644 offset="0.3816"
1645 style="stop-color:#ED0136"
1646 id="stop905" />
1647 <stop
1648 offset="0.483"
1649 style="stop-color:#F70031"
1650 id="stop907" />
1651 <stop
1652 offset="0.6158"
1653 style="stop-color:#FA002F"
1654 id="stop909" />
1655 <stop
1656 offset="1"
1657 style="stop-color:#F7EE5F"
1658 id="stop911" />
1659 </linearGradient>
1660 <linearGradient
1661 id="XMLID_42_"
1662 gradientUnits="userSpaceOnUse"
1663 x1="-3599.0938"
1664 y1="-3619.2744"
1665 x2="-3597.3428"
1666 y2="-3592.1323"
1667 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1668 <stop
1669 offset="0"
1670 style="stop-color:#691183"
1671 id="stop916" />
1672 <stop
1673 offset="0.0485"
1674 style="stop-color:#840E73"
1675 id="stop918" />
1676 <stop
1677 offset="0.125"
1678 style="stop-color:#A80A5E"
1679 id="stop920" />
1680 <stop
1681 offset="0.2057"
1682 style="stop-color:#C6064D"
1683 id="stop922" />
1684 <stop
1685 offset="0.2906"
1686 style="stop-color:#DD0340"
1687 id="stop924" />
1688 <stop
1689 offset="0.3816"
1690 style="stop-color:#ED0136"
1691 id="stop926" />
1692 <stop
1693 offset="0.483"
1694 style="stop-color:#F70031"
1695 id="stop928" />
1696 <stop
1697 offset="0.6158"
1698 style="stop-color:#FA002F"
1699 id="stop930" />
1700 <stop
1701 offset="1"
1702 style="stop-color:#F7EE5F"
1703 id="stop932" />
1704 </linearGradient>
1705 <linearGradient
1706 id="XMLID_43_"
1707 gradientUnits="userSpaceOnUse"
1708 x1="-3606.6357"
1709 y1="-3620.9043"
1710 x2="-3602.4971"
1711 y2="-3581.2764"
1712 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1713 <stop
1714 offset="0"
1715 style="stop-color:#691183"
1716 id="stop937" />
1717 <stop
1718 offset="0.0485"
1719 style="stop-color:#840E73"
1720 id="stop939" />
1721 <stop
1722 offset="0.125"
1723 style="stop-color:#A80A5E"
1724 id="stop941" />
1725 <stop
1726 offset="0.2057"
1727 style="stop-color:#C6064D"
1728 id="stop943" />
1729 <stop
1730 offset="0.2906"
1731 style="stop-color:#DD0340"
1732 id="stop945" />
1733 <stop
1734 offset="0.3816"
1735 style="stop-color:#ED0136"
1736 id="stop947" />
1737 <stop
1738 offset="0.483"
1739 style="stop-color:#F70031"
1740 id="stop949" />
1741 <stop
1742 offset="0.6158"
1743 style="stop-color:#FA002F"
1744 id="stop951" />
1745 <stop
1746 offset="1"
1747 style="stop-color:#F7EE5F"
1748 id="stop953" />
1749 </linearGradient>
1750 <linearGradient
1751 id="XMLID_44_"
1752 gradientUnits="userSpaceOnUse"
1753 x1="-3616.3291"
1754 y1="-3619.8916"
1755 x2="-3612.1904"
1756 y2="-3580.2637"
1757 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1758 <stop
1759 offset="0"
1760 style="stop-color:#691183"
1761 id="stop958" />
1762 <stop
1763 offset="0.0485"
1764 style="stop-color:#840E73"
1765 id="stop960" />
1766 <stop
1767 offset="0.125"
1768 style="stop-color:#A80A5E"
1769 id="stop962" />
1770 <stop
1771 offset="0.2057"
1772 style="stop-color:#C6064D"
1773 id="stop964" />
1774 <stop
1775 offset="0.2906"
1776 style="stop-color:#DD0340"
1777 id="stop966" />
1778 <stop
1779 offset="0.3816"
1780 style="stop-color:#ED0136"
1781 id="stop968" />
1782 <stop
1783 offset="0.483"
1784 style="stop-color:#F70031"
1785 id="stop970" />
1786 <stop
1787 offset="0.6158"
1788 style="stop-color:#FA002F"
1789 id="stop972" />
1790 <stop
1791 offset="1"
1792 style="stop-color:#F7EE5F"
1793 id="stop974" />
1794 </linearGradient>
1795 <linearGradient
1796 id="XMLID_45_"
1797 gradientUnits="userSpaceOnUse"
1798 x1="-3622.9634"
1799 y1="-3614.5537"
1800 x2="-3620.3369"
1801 y2="-3596.605"
1802 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1803 <stop
1804 offset="0"
1805 style="stop-color:#691183"
1806 id="stop979" />
1807 <stop
1808 offset="0.0485"
1809 style="stop-color:#840E73"
1810 id="stop981" />
1811 <stop
1812 offset="0.125"
1813 style="stop-color:#A80A5E"
1814 id="stop983" />
1815 <stop
1816 offset="0.2057"
1817 style="stop-color:#C6064D"
1818 id="stop985" />
1819 <stop
1820 offset="0.2906"
1821 style="stop-color:#DD0340"
1822 id="stop987" />
1823 <stop
1824 offset="0.3816"
1825 style="stop-color:#ED0136"
1826 id="stop989" />
1827 <stop
1828 offset="0.483"
1829 style="stop-color:#F70031"
1830 id="stop991" />
1831 <stop
1832 offset="0.6158"
1833 style="stop-color:#FA002F"
1834 id="stop993" />
1835 <stop
1836 offset="1"
1837 style="stop-color:#F7EE5F"
1838 id="stop995" />
1839 </linearGradient>
1840 <linearGradient
1841 id="XMLID_46_"
1842 gradientUnits="userSpaceOnUse"
1843 x1="-3631.832"
1844 y1="-3618.2729"
1845 x2="-3627.6934"
1846 y2="-3578.645"
1847 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1848 <stop
1849 offset="0"
1850 style="stop-color:#691183"
1851 id="stop1000" />
1852 <stop
1853 offset="0.0485"
1854 style="stop-color:#840E73"
1855 id="stop1002" />
1856 <stop
1857 offset="0.125"
1858 style="stop-color:#A80A5E"
1859 id="stop1004" />
1860 <stop
1861 offset="0.2057"
1862 style="stop-color:#C6064D"
1863 id="stop1006" />
1864 <stop
1865 offset="0.2906"
1866 style="stop-color:#DD0340"
1867 id="stop1008" />
1868 <stop
1869 offset="0.3816"
1870 style="stop-color:#ED0136"
1871 id="stop1010" />
1872 <stop
1873 offset="0.483"
1874 style="stop-color:#F70031"
1875 id="stop1012" />
1876 <stop
1877 offset="0.6158"
1878 style="stop-color:#FA002F"
1879 id="stop1014" />
1880 <stop
1881 offset="1"
1882 style="stop-color:#F7EE5F"
1883 id="stop1016" />
1884 </linearGradient>
1885 <linearGradient
1886 id="XMLID_47_"
1887 gradientUnits="userSpaceOnUse"
1888 x1="-3638.7656"
1889 y1="-3612.2417"
1890 x2="-3636.1392"
1891 y2="-3594.293"
1892 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1893 <stop
1894 offset="0"
1895 style="stop-color:#691183"
1896 id="stop1021" />
1897 <stop
1898 offset="0.0485"
1899 style="stop-color:#840E73"
1900 id="stop1023" />
1901 <stop
1902 offset="0.125"
1903 style="stop-color:#A80A5E"
1904 id="stop1025" />
1905 <stop
1906 offset="0.2057"
1907 style="stop-color:#C6064D"
1908 id="stop1027" />
1909 <stop
1910 offset="0.2906"
1911 style="stop-color:#DD0340"
1912 id="stop1029" />
1913 <stop
1914 offset="0.3816"
1915 style="stop-color:#ED0136"
1916 id="stop1031" />
1917 <stop
1918 offset="0.483"
1919 style="stop-color:#F70031"
1920 id="stop1033" />
1921 <stop
1922 offset="0.6158"
1923 style="stop-color:#FA002F"
1924 id="stop1035" />
1925 <stop
1926 offset="1"
1927 style="stop-color:#F7EE5F"
1928 id="stop1037" />
1929 </linearGradient>
1930 <linearGradient
1931 id="XMLID_48_"
1932 gradientUnits="userSpaceOnUse"
1933 x1="-3647.9785"
1934 y1="-3616.5864"
1935 x2="-3643.8398"
1936 y2="-3576.9585"
1937 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1938 <stop
1939 offset="0"
1940 style="stop-color:#691183"
1941 id="stop1042" />
1942 <stop
1943 offset="0.0485"
1944 style="stop-color:#840E73"
1945 id="stop1044" />
1946 <stop
1947 offset="0.125"
1948 style="stop-color:#A80A5E"
1949 id="stop1046" />
1950 <stop
1951 offset="0.2057"
1952 style="stop-color:#C6064D"
1953 id="stop1048" />
1954 <stop
1955 offset="0.2906"
1956 style="stop-color:#DD0340"
1957 id="stop1050" />
1958 <stop
1959 offset="0.3816"
1960 style="stop-color:#ED0136"
1961 id="stop1052" />
1962 <stop
1963 offset="0.483"
1964 style="stop-color:#F70031"
1965 id="stop1054" />
1966 <stop
1967 offset="0.6158"
1968 style="stop-color:#FA002F"
1969 id="stop1056" />
1970 <stop
1971 offset="1"
1972 style="stop-color:#F7EE5F"
1973 id="stop1058" />
1974 </linearGradient>
1975 <linearGradient
1976 id="XMLID_49_"
1977 gradientUnits="userSpaceOnUse"
1978 x1="-3655.8188"
1979 y1="-3609.7461"
1980 x2="-3653.1924"
1981 y2="-3591.7974"
1982 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
1983 <stop
1984 offset="0"
1985 style="stop-color:#691183"
1986 id="stop1063" />
1987 <stop
1988 offset="0.0485"
1989 style="stop-color:#840E73"
1990 id="stop1065" />
1991 <stop
1992 offset="0.125"
1993 style="stop-color:#A80A5E"
1994 id="stop1067" />
1995 <stop
1996 offset="0.2057"
1997 style="stop-color:#C6064D"
1998 id="stop1069" />
1999 <stop
2000 offset="0.2906"
2001 style="stop-color:#DD0340"
2002 id="stop1071" />
2003 <stop
2004 offset="0.3816"
2005 style="stop-color:#ED0136"
2006 id="stop1073" />
2007 <stop
2008 offset="0.483"
2009 style="stop-color:#F70031"
2010 id="stop1075" />
2011 <stop
2012 offset="0.6158"
2013 style="stop-color:#FA002F"
2014 id="stop1077" />
2015 <stop
2016 offset="1"
2017 style="stop-color:#F7EE5F"
2018 id="stop1079" />
2019 </linearGradient>
2020 <linearGradient
2021 id="XMLID_50_"
2022 gradientUnits="userSpaceOnUse"
2023 x1="-3513.0679"
2024 y1="-3655.561"
2025 x2="-3523.0879"
2026 y2="-3688.5811"
2027 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2028 <stop
2029 offset="0"
2030 style="stop-color:#500C81"
2031 id="stop1084" />
2032 <stop
2033 offset="0.0374"
2034 style="stop-color:#5F0B7A"
2035 id="stop1086" />
2036 <stop
2037 offset="0.1836"
2038 style="stop-color:#96075F"
2039 id="stop1088" />
2040 <stop
2041 offset="0.3196"
2042 style="stop-color:#C1044A"
2043 id="stop1090" />
2044 <stop
2045 offset="0.4412"
2046 style="stop-color:#E0023B"
2047 id="stop1092" />
2048 <stop
2049 offset="0.5441"
2050 style="stop-color:#F30032"
2051 id="stop1094" />
2052 <stop
2053 offset="0.6158"
2054 style="stop-color:#FA002F"
2055 id="stop1096" />
2056 <stop
2057 offset="1"
2058 style="stop-color:#F7EE5F"
2059 id="stop1098" />
2060 </linearGradient>
2061 <linearGradient
2062 id="XMLID_51_"
2063 gradientUnits="userSpaceOnUse"
2064 x1="-3518.1055"
2065 y1="-3653.4717"
2066 x2="-3529.9253"
2067 y2="-3685.4292"
2068 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2069 <stop
2070 offset="0"
2071 style="stop-color:#500C81"
2072 id="stop1103" />
2073 <stop
2074 offset="0.0374"
2075 style="stop-color:#5F0B7A"
2076 id="stop1105" />
2077 <stop
2078 offset="0.1836"
2079 style="stop-color:#96075F"
2080 id="stop1107" />
2081 <stop
2082 offset="0.3196"
2083 style="stop-color:#C1044A"
2084 id="stop1109" />
2085 <stop
2086 offset="0.4412"
2087 style="stop-color:#E0023B"
2088 id="stop1111" />
2089 <stop
2090 offset="0.5441"
2091 style="stop-color:#F30032"
2092 id="stop1113" />
2093 <stop
2094 offset="0.6158"
2095 style="stop-color:#FA002F"
2096 id="stop1115" />
2097 <stop
2098 offset="1"
2099 style="stop-color:#F7EE5F"
2100 id="stop1117" />
2101 </linearGradient>
2102 <linearGradient
2103 id="XMLID_52_"
2104 gradientUnits="userSpaceOnUse"
2105 x1="-3524.8926"
2106 y1="-3651.9727"
2107 x2="-3534.9126"
2108 y2="-3684.9927"
2109 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2110 <stop
2111 offset="0"
2112 style="stop-color:#500C81"
2113 id="stop1122" />
2114 <stop
2115 offset="0.0374"
2116 style="stop-color:#5F0B7A"
2117 id="stop1124" />
2118 <stop
2119 offset="0.1836"
2120 style="stop-color:#96075F"
2121 id="stop1126" />
2122 <stop
2123 offset="0.3196"
2124 style="stop-color:#C1044A"
2125 id="stop1128" />
2126 <stop
2127 offset="0.4412"
2128 style="stop-color:#E0023B"
2129 id="stop1130" />
2130 <stop
2131 offset="0.5441"
2132 style="stop-color:#F30032"
2133 id="stop1132" />
2134 <stop
2135 offset="0.6158"
2136 style="stop-color:#FA002F"
2137 id="stop1134" />
2138 <stop
2139 offset="1"
2140 style="stop-color:#F7EE5F"
2141 id="stop1136" />
2142 </linearGradient>
2143 <linearGradient
2144 id="XMLID_53_"
2145 gradientUnits="userSpaceOnUse"
2146 x1="-3531.4272"
2147 y1="-3648.5444"
2148 x2="-3543.2471"
2149 y2="-3680.502"
2150 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2151 <stop
2152 offset="0"
2153 style="stop-color:#500C81"
2154 id="stop1141" />
2155 <stop
2156 offset="0.0374"
2157 style="stop-color:#5F0B7A"
2158 id="stop1143" />
2159 <stop
2160 offset="0.1836"
2161 style="stop-color:#96075F"
2162 id="stop1145" />
2163 <stop
2164 offset="0.3196"
2165 style="stop-color:#C1044A"
2166 id="stop1147" />
2167 <stop
2168 offset="0.4412"
2169 style="stop-color:#E0023B"
2170 id="stop1149" />
2171 <stop
2172 offset="0.5441"
2173 style="stop-color:#F30032"
2174 id="stop1151" />
2175 <stop
2176 offset="0.6158"
2177 style="stop-color:#FA002F"
2178 id="stop1153" />
2179 <stop
2180 offset="1"
2181 style="stop-color:#F7EE5F"
2182 id="stop1155" />
2183 </linearGradient>
2184 <linearGradient
2185 id="XMLID_54_"
2186 gradientUnits="userSpaceOnUse"
2187 x1="-3540.4351"
2188 y1="-3647.2563"
2189 x2="-3550.4551"
2190 y2="-3680.2764"
2191 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2192 <stop
2193 offset="0"
2194 style="stop-color:#500C81"
2195 id="stop1160" />
2196 <stop
2197 offset="0.0374"
2198 style="stop-color:#5F0B7A"
2199 id="stop1162" />
2200 <stop
2201 offset="0.1836"
2202 style="stop-color:#96075F"
2203 id="stop1164" />
2204 <stop
2205 offset="0.3196"
2206 style="stop-color:#C1044A"
2207 id="stop1166" />
2208 <stop
2209 offset="0.4412"
2210 style="stop-color:#E0023B"
2211 id="stop1168" />
2212 <stop
2213 offset="0.5441"
2214 style="stop-color:#F30032"
2215 id="stop1170" />
2216 <stop
2217 offset="0.6158"
2218 style="stop-color:#FA002F"
2219 id="stop1172" />
2220 <stop
2221 offset="1"
2222 style="stop-color:#F7EE5F"
2223 id="stop1174" />
2224 </linearGradient>
2225 <linearGradient
2226 id="XMLID_55_"
2227 gradientUnits="userSpaceOnUse"
2228 x1="-3546.5405"
2229 y1="-3642.9551"
2230 x2="-3558.3604"
2231 y2="-3674.9126"
2232 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2233 <stop
2234 offset="0"
2235 style="stop-color:#500C81"
2236 id="stop1179" />
2237 <stop
2238 offset="0.0374"
2239 style="stop-color:#5F0B7A"
2240 id="stop1181" />
2241 <stop
2242 offset="0.1836"
2243 style="stop-color:#96075F"
2244 id="stop1183" />
2245 <stop
2246 offset="0.3196"
2247 style="stop-color:#C1044A"
2248 id="stop1185" />
2249 <stop
2250 offset="0.4412"
2251 style="stop-color:#E0023B"
2252 id="stop1187" />
2253 <stop
2254 offset="0.5441"
2255 style="stop-color:#F30032"
2256 id="stop1189" />
2257 <stop
2258 offset="0.6158"
2259 style="stop-color:#FA002F"
2260 id="stop1191" />
2261 <stop
2262 offset="1"
2263 style="stop-color:#F7EE5F"
2264 id="stop1193" />
2265 </linearGradient>
2266 <linearGradient
2267 id="XMLID_56_"
2268 gradientUnits="userSpaceOnUse"
2269 x1="-3556.0371"
2270 y1="-3642.522"
2271 x2="-3566.0571"
2272 y2="-3675.542"
2273 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2274 <stop
2275 offset="0"
2276 style="stop-color:#500C81"
2277 id="stop1198" />
2278 <stop
2279 offset="0.0374"
2280 style="stop-color:#5F0B7A"
2281 id="stop1200" />
2282 <stop
2283 offset="0.1836"
2284 style="stop-color:#96075F"
2285 id="stop1202" />
2286 <stop
2287 offset="0.3196"
2288 style="stop-color:#C1044A"
2289 id="stop1204" />
2290 <stop
2291 offset="0.4412"
2292 style="stop-color:#E0023B"
2293 id="stop1206" />
2294 <stop
2295 offset="0.5441"
2296 style="stop-color:#F30032"
2297 id="stop1208" />
2298 <stop
2299 offset="0.6158"
2300 style="stop-color:#FA002F"
2301 id="stop1210" />
2302 <stop
2303 offset="1"
2304 style="stop-color:#F7EE5F"
2305 id="stop1212" />
2306 </linearGradient>
2307 <linearGradient
2308 id="XMLID_57_"
2309 gradientUnits="userSpaceOnUse"
2310 x1="-3565.9399"
2311 y1="-3639.5166"
2312 x2="-3575.96"
2313 y2="-3672.5366"
2314 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2315 <stop
2316 offset="0"
2317 style="stop-color:#500C81"
2318 id="stop1217" />
2319 <stop
2320 offset="0.0374"
2321 style="stop-color:#5F0B7A"
2322 id="stop1219" />
2323 <stop
2324 offset="0.1836"
2325 style="stop-color:#96075F"
2326 id="stop1221" />
2327 <stop
2328 offset="0.3196"
2329 style="stop-color:#C1044A"
2330 id="stop1223" />
2331 <stop
2332 offset="0.4412"
2333 style="stop-color:#E0023B"
2334 id="stop1225" />
2335 <stop
2336 offset="0.5441"
2337 style="stop-color:#F30032"
2338 id="stop1227" />
2339 <stop
2340 offset="0.6158"
2341 style="stop-color:#FA002F"
2342 id="stop1229" />
2343 <stop
2344 offset="1"
2345 style="stop-color:#F7EE5F"
2346 id="stop1231" />
2347 </linearGradient>
2348 <linearGradient
2349 id="XMLID_58_"
2350 gradientUnits="userSpaceOnUse"
2351 x1="-3572.1245"
2352 y1="-3633.4922"
2353 x2="-3583.9443"
2354 y2="-3665.4497"
2355 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2356 <stop
2357 offset="0"
2358 style="stop-color:#500C81"
2359 id="stop1236" />
2360 <stop
2361 offset="0.0374"
2362 style="stop-color:#5F0B7A"
2363 id="stop1238" />
2364 <stop
2365 offset="0.1836"
2366 style="stop-color:#96075F"
2367 id="stop1240" />
2368 <stop
2369 offset="0.3196"
2370 style="stop-color:#C1044A"
2371 id="stop1242" />
2372 <stop
2373 offset="0.4412"
2374 style="stop-color:#E0023B"
2375 id="stop1244" />
2376 <stop
2377 offset="0.5441"
2378 style="stop-color:#F30032"
2379 id="stop1246" />
2380 <stop
2381 offset="0.6158"
2382 style="stop-color:#FA002F"
2383 id="stop1248" />
2384 <stop
2385 offset="1"
2386 style="stop-color:#F7EE5F"
2387 id="stop1250" />
2388 </linearGradient>
2389 <linearGradient
2390 id="XMLID_59_"
2391 gradientUnits="userSpaceOnUse"
2392 x1="-3581.0527"
2393 y1="-3634.9307"
2394 x2="-3591.0728"
2395 y2="-3667.9507"
2396 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2397 <stop
2398 offset="0"
2399 style="stop-color:#500C81"
2400 id="stop1255" />
2401 <stop
2402 offset="0.0374"
2403 style="stop-color:#5F0B7A"
2404 id="stop1257" />
2405 <stop
2406 offset="0.1836"
2407 style="stop-color:#96075F"
2408 id="stop1259" />
2409 <stop
2410 offset="0.3196"
2411 style="stop-color:#C1044A"
2412 id="stop1261" />
2413 <stop
2414 offset="0.4412"
2415 style="stop-color:#E0023B"
2416 id="stop1263" />
2417 <stop
2418 offset="0.5441"
2419 style="stop-color:#F30032"
2420 id="stop1265" />
2421 <stop
2422 offset="0.6158"
2423 style="stop-color:#FA002F"
2424 id="stop1267" />
2425 <stop
2426 offset="1"
2427 style="stop-color:#F7EE5F"
2428 id="stop1269" />
2429 </linearGradient>
2430 <linearGradient
2431 id="XMLID_60_"
2432 gradientUnits="userSpaceOnUse"
2433 x1="-3586.04"
2434 y1="-3628.3457"
2435 x2="-3597.8599"
2436 y2="-3660.3032"
2437 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2438 <stop
2439 offset="0"
2440 style="stop-color:#500C81"
2441 id="stop1274" />
2442 <stop
2443 offset="0.0374"
2444 style="stop-color:#5F0B7A"
2445 id="stop1276" />
2446 <stop
2447 offset="0.1836"
2448 style="stop-color:#96075F"
2449 id="stop1278" />
2450 <stop
2451 offset="0.3196"
2452 style="stop-color:#C1044A"
2453 id="stop1280" />
2454 <stop
2455 offset="0.4412"
2456 style="stop-color:#E0023B"
2457 id="stop1282" />
2458 <stop
2459 offset="0.5441"
2460 style="stop-color:#F30032"
2461 id="stop1284" />
2462 <stop
2463 offset="0.6158"
2464 style="stop-color:#FA002F"
2465 id="stop1286" />
2466 <stop
2467 offset="1"
2468 style="stop-color:#F7EE5F"
2469 id="stop1288" />
2470 </linearGradient>
2471 <linearGradient
2472 id="XMLID_61_"
2473 gradientUnits="userSpaceOnUse"
2474 x1="-3595.1567"
2475 y1="-3630.6509"
2476 x2="-3605.1768"
2477 y2="-3663.6709"
2478 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2479 <stop
2480 offset="0"
2481 style="stop-color:#500C81"
2482 id="stop1293" />
2483 <stop
2484 offset="0.0374"
2485 style="stop-color:#5F0B7A"
2486 id="stop1295" />
2487 <stop
2488 offset="0.1836"
2489 style="stop-color:#96075F"
2490 id="stop1297" />
2491 <stop
2492 offset="0.3196"
2493 style="stop-color:#C1044A"
2494 id="stop1299" />
2495 <stop
2496 offset="0.4412"
2497 style="stop-color:#E0023B"
2498 id="stop1301" />
2499 <stop
2500 offset="0.5441"
2501 style="stop-color:#F30032"
2502 id="stop1303" />
2503 <stop
2504 offset="0.6158"
2505 style="stop-color:#FA002F"
2506 id="stop1305" />
2507 <stop
2508 offset="1"
2509 style="stop-color:#F7EE5F"
2510 id="stop1307" />
2511 </linearGradient>
2512 <linearGradient
2513 id="XMLID_62_"
2514 gradientUnits="userSpaceOnUse"
2515 x1="-3598.8574"
2516 y1="-3623.605"
2517 x2="-3610.6772"
2518 y2="-3655.5625"
2519 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2520 <stop
2521 offset="0"
2522 style="stop-color:#500C81"
2523 id="stop1312" />
2524 <stop
2525 offset="0.0374"
2526 style="stop-color:#5F0B7A"
2527 id="stop1314" />
2528 <stop
2529 offset="0.1836"
2530 style="stop-color:#96075F"
2531 id="stop1316" />
2532 <stop
2533 offset="0.3196"
2534 style="stop-color:#C1044A"
2535 id="stop1318" />
2536 <stop
2537 offset="0.4412"
2538 style="stop-color:#E0023B"
2539 id="stop1320" />
2540 <stop
2541 offset="0.5441"
2542 style="stop-color:#F30032"
2543 id="stop1322" />
2544 <stop
2545 offset="0.6158"
2546 style="stop-color:#FA002F"
2547 id="stop1324" />
2548 <stop
2549 offset="1"
2550 style="stop-color:#F7EE5F"
2551 id="stop1326" />
2552 </linearGradient>
2553 <linearGradient
2554 id="XMLID_63_"
2555 gradientUnits="userSpaceOnUse"
2556 x1="-3610.3208"
2557 y1="-3626.0493"
2558 x2="-3620.3408"
2559 y2="-3659.0693"
2560 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2561 <stop
2562 offset="0"
2563 style="stop-color:#500C81"
2564 id="stop1331" />
2565 <stop
2566 offset="0.0374"
2567 style="stop-color:#5F0B7A"
2568 id="stop1333" />
2569 <stop
2570 offset="0.1836"
2571 style="stop-color:#96075F"
2572 id="stop1335" />
2573 <stop
2574 offset="0.3196"
2575 style="stop-color:#C1044A"
2576 id="stop1337" />
2577 <stop
2578 offset="0.4412"
2579 style="stop-color:#E0023B"
2580 id="stop1339" />
2581 <stop
2582 offset="0.5441"
2583 style="stop-color:#F30032"
2584 id="stop1341" />
2585 <stop
2586 offset="0.6158"
2587 style="stop-color:#FA002F"
2588 id="stop1343" />
2589 <stop
2590 offset="1"
2591 style="stop-color:#F7EE5F"
2592 id="stop1345" />
2593 </linearGradient>
2594 <linearGradient
2595 id="XMLID_64_"
2596 gradientUnits="userSpaceOnUse"
2597 x1="-3613.4258"
2598 y1="-3618.2168"
2599 x2="-3625.2456"
2600 y2="-3650.1743"
2601 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2602 <stop
2603 offset="0"
2604 style="stop-color:#500C81"
2605 id="stop1350" />
2606 <stop
2607 offset="0.0374"
2608 style="stop-color:#5F0B7A"
2609 id="stop1352" />
2610 <stop
2611 offset="0.1836"
2612 style="stop-color:#96075F"
2613 id="stop1354" />
2614 <stop
2615 offset="0.3196"
2616 style="stop-color:#C1044A"
2617 id="stop1356" />
2618 <stop
2619 offset="0.4412"
2620 style="stop-color:#E0023B"
2621 id="stop1358" />
2622 <stop
2623 offset="0.5441"
2624 style="stop-color:#F30032"
2625 id="stop1360" />
2626 <stop
2627 offset="0.6158"
2628 style="stop-color:#FA002F"
2629 id="stop1362" />
2630 <stop
2631 offset="1"
2632 style="stop-color:#F7EE5F"
2633 id="stop1364" />
2634 </linearGradient>
2635 <linearGradient
2636 id="XMLID_65_"
2637 gradientUnits="userSpaceOnUse"
2638 x1="-3625.0215"
2639 y1="-3621.5884"
2640 x2="-3635.0415"
2641 y2="-3654.6084"
2642 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2643 <stop
2644 offset="0"
2645 style="stop-color:#500C81"
2646 id="stop1369" />
2647 <stop
2648 offset="0.0374"
2649 style="stop-color:#5F0B7A"
2650 id="stop1371" />
2651 <stop
2652 offset="0.1836"
2653 style="stop-color:#96075F"
2654 id="stop1373" />
2655 <stop
2656 offset="0.3196"
2657 style="stop-color:#C1044A"
2658 id="stop1375" />
2659 <stop
2660 offset="0.4412"
2661 style="stop-color:#E0023B"
2662 id="stop1377" />
2663 <stop
2664 offset="0.5441"
2665 style="stop-color:#F30032"
2666 id="stop1379" />
2667 <stop
2668 offset="0.6158"
2669 style="stop-color:#FA002F"
2670 id="stop1381" />
2671 <stop
2672 offset="1"
2673 style="stop-color:#F7EE5F"
2674 id="stop1383" />
2675 </linearGradient>
2676 <linearGradient
2677 id="XMLID_66_"
2678 gradientUnits="userSpaceOnUse"
2679 x1="-3631.4028"
2680 y1="-3619.6519"
2681 x2="-3641.4229"
2682 y2="-3652.6719"
2683 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2684 <stop
2685 offset="0"
2686 style="stop-color:#500C81"
2687 id="stop1388" />
2688 <stop
2689 offset="0.0374"
2690 style="stop-color:#5F0B7A"
2691 id="stop1390" />
2692 <stop
2693 offset="0.1836"
2694 style="stop-color:#96075F"
2695 id="stop1392" />
2696 <stop
2697 offset="0.3196"
2698 style="stop-color:#C1044A"
2699 id="stop1394" />
2700 <stop
2701 offset="0.4412"
2702 style="stop-color:#E0023B"
2703 id="stop1396" />
2704 <stop
2705 offset="0.5441"
2706 style="stop-color:#F30032"
2707 id="stop1398" />
2708 <stop
2709 offset="0.6158"
2710 style="stop-color:#FA002F"
2711 id="stop1400" />
2712 <stop
2713 offset="1"
2714 style="stop-color:#F7EE5F"
2715 id="stop1402" />
2716 </linearGradient>
2717 <linearGradient
2718 id="XMLID_67_"
2719 gradientUnits="userSpaceOnUse"
2720 x1="-3639.3081"
2721 y1="-3617.2529"
2722 x2="-3649.3281"
2723 y2="-3650.2729"
2724 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2725 <stop
2726 offset="0"
2727 style="stop-color:#500C81"
2728 id="stop1407" />
2729 <stop
2730 offset="0.0374"
2731 style="stop-color:#5F0B7A"
2732 id="stop1409" />
2733 <stop
2734 offset="0.1836"
2735 style="stop-color:#96075F"
2736 id="stop1411" />
2737 <stop
2738 offset="0.3196"
2739 style="stop-color:#C1044A"
2740 id="stop1413" />
2741 <stop
2742 offset="0.4412"
2743 style="stop-color:#E0023B"
2744 id="stop1415" />
2745 <stop
2746 offset="0.5441"
2747 style="stop-color:#F30032"
2748 id="stop1417" />
2749 <stop
2750 offset="0.6158"
2751 style="stop-color:#FA002F"
2752 id="stop1419" />
2753 <stop
2754 offset="1"
2755 style="stop-color:#F7EE5F"
2756 id="stop1421" />
2757 </linearGradient>
2758 <linearGradient
2759 id="XMLID_68_"
2760 gradientUnits="userSpaceOnUse"
2761 x1="-3644.0654"
2762 y1="-3622.0806"
2763 x2="-3668.9834"
2764 y2="-3650.0459"
2765 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2766 <stop
2767 offset="0"
2768 style="stop-color:#500C81"
2769 id="stop1426" />
2770 <stop
2771 offset="0.0374"
2772 style="stop-color:#5F0B7A"
2773 id="stop1428" />
2774 <stop
2775 offset="0.1836"
2776 style="stop-color:#96075F"
2777 id="stop1430" />
2778 <stop
2779 offset="0.3196"
2780 style="stop-color:#C1044A"
2781 id="stop1432" />
2782 <stop
2783 offset="0.4412"
2784 style="stop-color:#E0023B"
2785 id="stop1434" />
2786 <stop
2787 offset="0.5441"
2788 style="stop-color:#F30032"
2789 id="stop1436" />
2790 <stop
2791 offset="0.6158"
2792 style="stop-color:#FA002F"
2793 id="stop1438" />
2794 <stop
2795 offset="1"
2796 style="stop-color:#F7EE5F"
2797 id="stop1440" />
2798 </linearGradient>
2799 <linearGradient
2800 id="XMLID_69_"
2801 gradientUnits="userSpaceOnUse"
2802 x1="-3649.6221"
2803 y1="-3617.1294"
2804 x2="-3674.54"
2805 y2="-3645.0947"
2806 gradientTransform="matrix(-0.1305911,0.1307877,0.1307877,0.1305911,125.19076,1026.4282)">
2807 <stop
2808 offset="0"
2809 style="stop-color:#500C81"
2810 id="stop1445" />
2811 <stop
2812 offset="0.0374"
2813 style="stop-color:#5F0B7A"
2814 id="stop1447" />
2815 <stop
2816 offset="0.1836"
2817 style="stop-color:#96075F"
2818 id="stop1449" />
2819 <stop
2820 offset="0.3196"
2821 style="stop-color:#C1044A"
2822 id="stop1451" />
2823 <stop
2824 offset="0.4412"
2825 style="stop-color:#E0023B"
2826 id="stop1453" />
2827 <stop
2828 offset="0.5441"
2829 style="stop-color:#F30032"
2830 id="stop1455" />
2831 <stop
2832 offset="0.6158"
2833 style="stop-color:#FA002F"
2834 id="stop1457" />
2835 <stop
2836 offset="1"
2837 style="stop-color:#F7EE5F"
2838 id="stop1459" />
2839 </linearGradient>
2840 <linearGradient
2841 id="XMLID_79_"
2842 gradientUnits="userSpaceOnUse"
2843 x1="-3586.4019"
2844 y1="-3601.3828"
2845 x2="-3570.2441"
2846 y2="-3549.8223"
2847 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
2848 <stop
2849 offset="0"
2850 style="stop-color:#D8E7EB"
2851 id="stop1709" />
2852 <stop
2853 offset="0.0684"
2854 style="stop-color:#D0DFE4"
2855 id="stop1711" />
2856 <stop
2857 offset="0.1761"
2858 style="stop-color:#B9CAD0"
2859 id="stop1713" />
2860 <stop
2861 offset="0.3096"
2862 style="stop-color:#94A7B0"
2863 id="stop1715" />
2864 <stop
2865 offset="0.4622"
2866 style="stop-color:#627784"
2867 id="stop1717" />
2868 <stop
2869 offset="0.5537"
2870 style="stop-color:#405766"
2871 id="stop1719" />
2872 <stop
2873 offset="0.6113"
2874 style="stop-color:#607682"
2875 id="stop1721" />
2876 <stop
2877 offset="0.6983"
2878 style="stop-color:#8B9EA8"
2879 id="stop1723" />
2880 <stop
2881 offset="0.7829"
2882 style="stop-color:#ADBEC5"
2883 id="stop1725" />
2884 <stop
2885 offset="0.8633"
2886 style="stop-color:#C5D5DA"
2887 id="stop1727" />
2888 <stop
2889 offset="0.9376"
2890 style="stop-color:#D3E2E7"
2891 id="stop1729" />
2892 <stop
2893 offset="1"
2894 style="stop-color:#D8E7EB"
2895 id="stop1731" />
2896 </linearGradient>
2897 <linearGradient
2898 id="XMLID_78_"
2899 gradientUnits="userSpaceOnUse"
2900 x1="-3581.9321"
2901 y1="-3602.7837"
2902 x2="-3565.7744"
2903 y2="-3551.2231"
2904 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
2905 <stop
2906 offset="0"
2907 style="stop-color:#D8E7EB"
2908 id="stop1682" />
2909 <stop
2910 offset="0.0684"
2911 style="stop-color:#D0DFE4"
2912 id="stop1684" />
2913 <stop
2914 offset="0.1761"
2915 style="stop-color:#B9CAD0"
2916 id="stop1686" />
2917 <stop
2918 offset="0.3096"
2919 style="stop-color:#94A7B0"
2920 id="stop1688" />
2921 <stop
2922 offset="0.4622"
2923 style="stop-color:#627784"
2924 id="stop1690" />
2925 <stop
2926 offset="0.5537"
2927 style="stop-color:#405766"
2928 id="stop1692" />
2929 <stop
2930 offset="0.6113"
2931 style="stop-color:#607682"
2932 id="stop1694" />
2933 <stop
2934 offset="0.6983"
2935 style="stop-color:#8B9EA8"
2936 id="stop1696" />
2937 <stop
2938 offset="0.7829"
2939 style="stop-color:#ADBEC5"
2940 id="stop1698" />
2941 <stop
2942 offset="0.8633"
2943 style="stop-color:#C5D5DA"
2944 id="stop1700" />
2945 <stop
2946 offset="0.9376"
2947 style="stop-color:#D3E2E7"
2948 id="stop1702" />
2949 <stop
2950 offset="1"
2951 style="stop-color:#D8E7EB"
2952 id="stop1704" />
2953 </linearGradient>
2954 <linearGradient
2955 id="XMLID_77_"
2956 gradientUnits="userSpaceOnUse"
2957 x1="-3576.8662"
2958 y1="-3604.3711"
2959 x2="-3560.7085"
2960 y2="-3552.8105"
2961 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
2962 <stop
2963 offset="0"
2964 style="stop-color:#D8E7EB"
2965 id="stop1655" />
2966 <stop
2967 offset="0.0684"
2968 style="stop-color:#D0DFE4"
2969 id="stop1657" />
2970 <stop
2971 offset="0.1761"
2972 style="stop-color:#B9CAD0"
2973 id="stop1659" />
2974 <stop
2975 offset="0.3096"
2976 style="stop-color:#94A7B0"
2977 id="stop1661" />
2978 <stop
2979 offset="0.4622"
2980 style="stop-color:#627784"
2981 id="stop1663" />
2982 <stop
2983 offset="0.5537"
2984 style="stop-color:#405766"
2985 id="stop1665" />
2986 <stop
2987 offset="0.6113"
2988 style="stop-color:#607682"
2989 id="stop1667" />
2990 <stop
2991 offset="0.6983"
2992 style="stop-color:#8B9EA8"
2993 id="stop1669" />
2994 <stop
2995 offset="0.7829"
2996 style="stop-color:#ADBEC5"
2997 id="stop1671" />
2998 <stop
2999 offset="0.8633"
3000 style="stop-color:#C5D5DA"
3001 id="stop1673" />
3002 <stop
3003 offset="0.9376"
3004 style="stop-color:#D3E2E7"
3005 id="stop1675" />
3006 <stop
3007 offset="1"
3008 style="stop-color:#D8E7EB"
3009 id="stop1677" />
3010 </linearGradient>
3011 <linearGradient
3012 id="XMLID_76_"
3013 gradientUnits="userSpaceOnUse"
3014 x1="-3577.7891"
3015 y1="-3604.082"
3016 x2="-3561.6313"
3017 y2="-3552.5215"
3018 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
3019 <stop
3020 offset="0"
3021 style="stop-color:#D8E7EB"
3022 id="stop1628" />
3023 <stop
3024 offset="0.0684"
3025 style="stop-color:#D0DFE4"
3026 id="stop1630" />
3027 <stop
3028 offset="0.1761"
3029 style="stop-color:#B9CAD0"
3030 id="stop1632" />
3031 <stop
3032 offset="0.3096"
3033 style="stop-color:#94A7B0"
3034 id="stop1634" />
3035 <stop
3036 offset="0.4622"
3037 style="stop-color:#627784"
3038 id="stop1636" />
3039 <stop
3040 offset="0.5537"
3041 style="stop-color:#405766"
3042 id="stop1638" />
3043 <stop
3044 offset="0.6113"
3045 style="stop-color:#607682"
3046 id="stop1640" />
3047 <stop
3048 offset="0.6983"
3049 style="stop-color:#8B9EA8"
3050 id="stop1642" />
3051 <stop
3052 offset="0.7829"
3053 style="stop-color:#ADBEC5"
3054 id="stop1644" />
3055 <stop
3056 offset="0.8633"
3057 style="stop-color:#C5D5DA"
3058 id="stop1646" />
3059 <stop
3060 offset="0.9376"
3061 style="stop-color:#D3E2E7"
3062 id="stop1648" />
3063 <stop
3064 offset="1"
3065 style="stop-color:#D8E7EB"
3066 id="stop1650" />
3067 </linearGradient>
3068 <linearGradient
3069 id="XMLID_75_"
3070 gradientUnits="userSpaceOnUse"
3071 x1="-3584.2759"
3072 y1="-3602.0488"
3073 x2="-3568.1182"
3074 y2="-3550.4883"
3075 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
3076 <stop
3077 offset="0"
3078 style="stop-color:#D8E7EB"
3079 id="stop1601" />
3080 <stop
3081 offset="0.0684"
3082 style="stop-color:#D0DFE4"
3083 id="stop1603" />
3084 <stop
3085 offset="0.1761"
3086 style="stop-color:#B9CAD0"
3087 id="stop1605" />
3088 <stop
3089 offset="0.3096"
3090 style="stop-color:#94A7B0"
3091 id="stop1607" />
3092 <stop
3093 offset="0.4622"
3094 style="stop-color:#627784"
3095 id="stop1609" />
3096 <stop
3097 offset="0.5537"
3098 style="stop-color:#405766"
3099 id="stop1611" />
3100 <stop
3101 offset="0.6113"
3102 style="stop-color:#607682"
3103 id="stop1613" />
3104 <stop
3105 offset="0.6983"
3106 style="stop-color:#8B9EA8"
3107 id="stop1615" />
3108 <stop
3109 offset="0.7829"
3110 style="stop-color:#ADBEC5"
3111 id="stop1617" />
3112 <stop
3113 offset="0.8633"
3114 style="stop-color:#C5D5DA"
3115 id="stop1619" />
3116 <stop
3117 offset="0.9376"
3118 style="stop-color:#D3E2E7"
3119 id="stop1621" />
3120 <stop
3121 offset="1"
3122 style="stop-color:#D8E7EB"
3123 id="stop1623" />
3124 </linearGradient>
3125 <linearGradient
3126 id="XMLID_74_"
3127 gradientUnits="userSpaceOnUse"
3128 x1="-3578.7671"
3129 y1="-3603.7754"
3130 x2="-3562.6094"
3131 y2="-3552.2148"
3132 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
3133 <stop
3134 offset="0"
3135 style="stop-color:#D8E7EB"
3136 id="stop1574" />
3137 <stop
3138 offset="0.0684"
3139 style="stop-color:#D0DFE4"
3140 id="stop1576" />
3141 <stop
3142 offset="0.1761"
3143 style="stop-color:#B9CAD0"
3144 id="stop1578" />
3145 <stop
3146 offset="0.3096"
3147 style="stop-color:#94A7B0"
3148 id="stop1580" />
3149 <stop
3150 offset="0.4622"
3151 style="stop-color:#627784"
3152 id="stop1582" />
3153 <stop
3154 offset="0.5537"
3155 style="stop-color:#405766"
3156 id="stop1584" />
3157 <stop
3158 offset="0.6113"
3159 style="stop-color:#607682"
3160 id="stop1586" />
3161 <stop
3162 offset="0.6983"
3163 style="stop-color:#8B9EA8"
3164 id="stop1588" />
3165 <stop
3166 offset="0.7829"
3167 style="stop-color:#ADBEC5"
3168 id="stop1590" />
3169 <stop
3170 offset="0.8633"
3171 style="stop-color:#C5D5DA"
3172 id="stop1592" />
3173 <stop
3174 offset="0.9376"
3175 style="stop-color:#D3E2E7"
3176 id="stop1594" />
3177 <stop
3178 offset="1"
3179 style="stop-color:#D8E7EB"
3180 id="stop1596" />
3181 </linearGradient>
3182 <linearGradient
3183 id="XMLID_73_"
3184 gradientUnits="userSpaceOnUse"
3185 x1="-3584.6758"
3186 y1="-3601.9238"
3187 x2="-3568.5181"
3188 y2="-3550.3633"
3189 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
3190 <stop
3191 offset="0"
3192 style="stop-color:#D8E7EB"
3193 id="stop1547" />
3194 <stop
3195 offset="0.0684"
3196 style="stop-color:#D0DFE4"
3197 id="stop1549" />
3198 <stop
3199 offset="0.1761"
3200 style="stop-color:#B9CAD0"
3201 id="stop1551" />
3202 <stop
3203 offset="0.3096"
3204 style="stop-color:#94A7B0"
3205 id="stop1553" />
3206 <stop
3207 offset="0.4622"
3208 style="stop-color:#627784"
3209 id="stop1555" />
3210 <stop
3211 offset="0.5537"
3212 style="stop-color:#405766"
3213 id="stop1557" />
3214 <stop
3215 offset="0.6113"
3216 style="stop-color:#607682"
3217 id="stop1559" />
3218 <stop
3219 offset="0.6983"
3220 style="stop-color:#8B9EA8"
3221 id="stop1561" />
3222 <stop
3223 offset="0.7829"
3224 style="stop-color:#ADBEC5"
3225 id="stop1563" />
3226 <stop
3227 offset="0.8633"
3228 style="stop-color:#C5D5DA"
3229 id="stop1565" />
3230 <stop
3231 offset="0.9376"
3232 style="stop-color:#D3E2E7"
3233 id="stop1567" />
3234 <stop
3235 offset="1"
3236 style="stop-color:#D8E7EB"
3237 id="stop1569" />
3238 </linearGradient>
3239 <linearGradient
3240 id="XMLID_72_"
3241 gradientUnits="userSpaceOnUse"
3242 x1="-3580.1533"
3243 y1="-3603.3408"
3244 x2="-3563.9956"
3245 y2="-3551.7803"
3246 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
3247 <stop
3248 offset="0"
3249 style="stop-color:#D8E7EB"
3250 id="stop1520" />
3251 <stop
3252 offset="0.0684"
3253 style="stop-color:#D0DFE4"
3254 id="stop1522" />
3255 <stop
3256 offset="0.1761"
3257 style="stop-color:#B9CAD0"
3258 id="stop1524" />
3259 <stop
3260 offset="0.3096"
3261 style="stop-color:#94A7B0"
3262 id="stop1526" />
3263 <stop
3264 offset="0.4622"
3265 style="stop-color:#627784"
3266 id="stop1528" />
3267 <stop
3268 offset="0.5537"
3269 style="stop-color:#405766"
3270 id="stop1530" />
3271 <stop
3272 offset="0.6113"
3273 style="stop-color:#607682"
3274 id="stop1532" />
3275 <stop
3276 offset="0.6983"
3277 style="stop-color:#8B9EA8"
3278 id="stop1534" />
3279 <stop
3280 offset="0.7829"
3281 style="stop-color:#ADBEC5"
3282 id="stop1536" />
3283 <stop
3284 offset="0.8633"
3285 style="stop-color:#C5D5DA"
3286 id="stop1538" />
3287 <stop
3288 offset="0.9376"
3289 style="stop-color:#D3E2E7"
3290 id="stop1540" />
3291 <stop
3292 offset="1"
3293 style="stop-color:#D8E7EB"
3294 id="stop1542" />
3295 </linearGradient>
3296 <linearGradient
3297 id="XMLID_71_"
3298 gradientUnits="userSpaceOnUse"
3299 x1="-3578.5146"
3300 y1="-3603.8545"
3301 x2="-3562.3569"
3302 y2="-3552.2939"
3303 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
3304 <stop
3305 offset="0"
3306 style="stop-color:#D8E7EB"
3307 id="stop1493" />
3308 <stop
3309 offset="0.0684"
3310 style="stop-color:#D0DFE4"
3311 id="stop1495" />
3312 <stop
3313 offset="0.1761"
3314 style="stop-color:#B9CAD0"
3315 id="stop1497" />
3316 <stop
3317 offset="0.3096"
3318 style="stop-color:#94A7B0"
3319 id="stop1499" />
3320 <stop
3321 offset="0.4622"
3322 style="stop-color:#627784"
3323 id="stop1501" />
3324 <stop
3325 offset="0.5537"
3326 style="stop-color:#405766"
3327 id="stop1503" />
3328 <stop
3329 offset="0.6113"
3330 style="stop-color:#607682"
3331 id="stop1505" />
3332 <stop
3333 offset="0.6983"
3334 style="stop-color:#8B9EA8"
3335 id="stop1507" />
3336 <stop
3337 offset="0.7829"
3338 style="stop-color:#ADBEC5"
3339 id="stop1509" />
3340 <stop
3341 offset="0.8633"
3342 style="stop-color:#C5D5DA"
3343 id="stop1511" />
3344 <stop
3345 offset="0.9376"
3346 style="stop-color:#D3E2E7"
3347 id="stop1513" />
3348 <stop
3349 offset="1"
3350 style="stop-color:#D8E7EB"
3351 id="stop1515" />
3352 </linearGradient>
3353 <linearGradient
3354 id="XMLID_70_"
3355 gradientUnits="userSpaceOnUse"
3356 x1="-3581.9316"
3357 y1="-3602.7837"
3358 x2="-3565.7739"
3359 y2="-3551.2231"
3360 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
3361 <stop
3362 offset="0"
3363 style="stop-color:#D8E7EB"
3364 id="stop1466" />
3365 <stop
3366 offset="0.0684"
3367 style="stop-color:#D0DFE4"
3368 id="stop1468" />
3369 <stop
3370 offset="0.1761"
3371 style="stop-color:#B9CAD0"
3372 id="stop1470" />
3373 <stop
3374 offset="0.3096"
3375 style="stop-color:#94A7B0"
3376 id="stop1472" />
3377 <stop
3378 offset="0.4622"
3379 style="stop-color:#627784"
3380 id="stop1474" />
3381 <stop
3382 offset="0.5537"
3383 style="stop-color:#405766"
3384 id="stop1476" />
3385 <stop
3386 offset="0.6113"
3387 style="stop-color:#607682"
3388 id="stop1478" />
3389 <stop
3390 offset="0.6983"
3391 style="stop-color:#8B9EA8"
3392 id="stop1480" />
3393 <stop
3394 offset="0.7829"
3395 style="stop-color:#ADBEC5"
3396 id="stop1482" />
3397 <stop
3398 offset="0.8633"
3399 style="stop-color:#C5D5DA"
3400 id="stop1484" />
3401 <stop
3402 offset="0.9376"
3403 style="stop-color:#D3E2E7"
3404 id="stop1486" />
3405 <stop
3406 offset="1"
3407 style="stop-color:#D8E7EB"
3408 id="stop1488" />
3409 </linearGradient>
3410 <linearGradient
3411 inkscape:collect="always"
3412 id="linearGradient2902">
3413 <stop
3414 style="stop-color:#000000;stop-opacity:1;"
3415 offset="0"
3416 id="stop2904" />
3417 <stop
3418 style="stop-color:#000000;stop-opacity:0;"
3419 offset="1"
3420 id="stop2906" />
3421 </linearGradient>
3422 <linearGradient
3423 id="linearGradient5149">
3424 <stop
3425 style="stop-color:#3cc23c;stop-opacity:1;"
3426 offset="0"
3427 id="stop5151" />
3428 <stop
3429 style="stop-color:#b4ffb4;stop-opacity:1;"
3430 offset="1"
3431 id="stop5153" />
3432 </linearGradient>
3433 <linearGradient
3434 inkscape:collect="always"
3435 xlink:href="#linearGradient5149"
3436 id="linearGradient5155"
3437 x1="209.40289"
3438 y1="196.93959"
3439 x2="402.86024"
3440 y2="201.399"
3441 gradientUnits="userSpaceOnUse"
3442 gradientTransform="matrix(0.3124104,-0.1174332,0.1174332,0.3124104,43.759948,18.537169)" />
3443 <linearGradient
3444 id="linearGradient3195">
3445 <stop
3446 style="stop-color:#0be254;stop-opacity:1;"
3447 offset="0"
3448 id="stop3197" />
3449 <stop
3450 style="stop-color:#ffffff;stop-opacity:1;"
3451 offset="1"
3452 id="stop3199" />
3453 </linearGradient>
3454 <linearGradient
3455 inkscape:collect="always"
3456 xlink:href="#linearGradient3195"
3457 id="linearGradient3201"
3458 x1="372.93762"
3459 y1="201.399"
3460 x2="191.42549"
3461 y2="205.16872"
3462 gradientUnits="userSpaceOnUse"
3463 gradientTransform="matrix(0.3124104,-0.1174332,0.1174332,0.3124104,43.759948,18.537169)" />
3464 <linearGradient
3465 inkscape:collect="always"
3466 xlink:href="#linearGradient2902"
3467 id="linearGradient2908"
3468 x1="-111.49345"
3469 y1="98.656258"
3470 x2="14.666016"
3471 y2="89.682144"
3472 gradientUnits="userSpaceOnUse" />
3473 <linearGradient
3474 inkscape:collect="always"
3475 xlink:href="#linearGradient2902"
3476 id="linearGradient2914"
3477 gradientUnits="userSpaceOnUse"
3478 x1="78.814453"
3479 y1="146.72023"
3480 x2="78.814453"
3481 y2="32.644054" />
3482 <linearGradient
3483 inkscape:collect="always"
3484 xlink:href="#linearGradient2902"
3485 id="linearGradient3669"
3486 gradientUnits="userSpaceOnUse"
3487 x1="78.814453"
3488 y1="194.78421"
3489 x2="78.814453"
3490 y2="18.507591" />
3491 <linearGradient
3492 inkscape:collect="always"
3493 xlink:href="#linearGradient2902"
3494 id="linearGradient9226"
3495 gradientUnits="userSpaceOnUse"
3496 x1="78.814453"
3497 y1="194.78421"
3498 x2="78.814453"
3499 y2="18.507591" />
3500 <linearGradient
3501 inkscape:collect="always"
3502 xlink:href="#linearGradient2902"
3503 id="linearGradient9228"
3504 gradientUnits="userSpaceOnUse"
3505 x1="78.814453"
3506 y1="194.78421"
3507 x2="78.814453"
3508 y2="18.507591" />
3509 <linearGradient
3510 inkscape:collect="always"
3511 xlink:href="#linearGradient2902"
3512 id="linearGradient9230"
3513 gradientUnits="userSpaceOnUse"
3514 x1="78.814453"
3515 y1="194.78421"
3516 x2="78.814453"
3517 y2="18.507591" />
3518 </defs>
3519 <sodipodi:namedview
3520 inkscape:window-height="764"
3521 inkscape:window-width="1264"
3522 inkscape:pageshadow="2"
3523 inkscape:pageopacity="0.0"
3524 guidetolerance="10.0"
3525 gridtolerance="10.0"
3526 objecttolerance="10.0"
3527 borderopacity="1.0"
3528 bordercolor="#666666"
3529 pagecolor="#ffffff"
3530 id="base"
3531 inkscape:zoom="5.2984309"
3532 inkscape:cx="113.24292"
3533 inkscape:cy="23.24562"
3534 inkscape:window-x="0"
3535 inkscape:window-y="0"
3536 inkscape:current-layer="g6" />
3537 <title
3538 id="title4">generated by pstoedit version:3.45 from Z:/asf_logo_1999.eps</title>
3539 <g
3540 xml:space="preserve"
3541 id="g6"
3542 transform="translate(-2.6691177,-22.954412)">
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705 <clipPath
3706 id="clippath1"> <path
3707 d="M 201.762,284.016 L 201.762,284.016 L 200.309,284.512 L 198.891,284.629 L 197.473,284.402 L 196.055,283.828 L 194.676,283.059 L 193.297,282.066 L 191.992,280.922 L 190.691,279.695 L 189.465,278.434 L 188.316,277.168 L 187.203,275.945 L 186.168,274.797 L 185.211,273.84 L 184.328,273.078 L 183.563,272.578 L 182.871,272.348 L 182.871,272.348 L 183.449,272.387 L 184.059,272.422 L 184.75,272.5 L 185.438,272.574 L 186.129,272.652 L 186.82,272.73 L 187.508,272.766 L 188.121,272.844 L 188.121,272.844 L 189.078,273.723 L 189.996,274.566 L 190.801,275.406 L 191.531,276.25 L 192.258,277.051 L 192.949,277.855 L 193.598,278.617 L 194.289,279.348 L 194.98,280.074 L 195.711,280.727 L 196.477,281.375 L 197.355,281.984 L 198.277,282.562 L 199.309,283.098 L 200.461,283.594 L 201.762,284.016"
3708 id="path657" />
3709 </clipPath>
3710 <path
3711 d="M 146.41619,62.696027 C 142.62485,65.569644 129.70379,77.249461 123.31721,81.838063 C 118.18169,85.530297 112.55653,88.526567 107.06272,91.260526 C 105.64199,91.967054 103.02634,93.1178 101.17315,93.67506 C 99.827079,93.854801 97.128165,94.825804 95.80891,94.506847 C 107.56985,90.318569 111.82247,88.150989 123.07148,80.688964 C 127.64352,77.674484 142.56869,64.986455 146.41619,62.696027 z "
3712 id="path8874"
3713 style="stroke:#000000;stroke-width:0.20699584" /><path
3714 d="M 136.08322,75.96456 L 135.84827,76.444287 L 135.52884,76.821806 L 135.12422,77.113635 L 134.63488,77.320749 L 134.10787,77.471469 L 133.52446,77.566277 L 132.92233,77.605399 L 132.30147,77.623884 L 131.69022,77.615944 L 131.0965,77.587142 L 130.52269,77.559544 L 129.98606,77.531729 L 129.51547,77.531972 L 129.11155,77.56102 L 128.80004,77.626292 L 128.57468,77.73983 L 128.72516,77.607097 L 128.88474,77.466238 L 129.07313,77.315738 L 129.26058,77.164773 L 129.44943,77.0143 L 129.6376,76.863585 L 129.81641,76.703506 L 129.9856,76.571758 L 130.43701,76.552809 L 130.8702,76.533371 L 131.27483,76.542489 L 131.66146,76.570577 L 132.03683,76.58881 L 132.40377,76.617369 L 132.75175,76.644745 L 133.10069,76.653861 L 133.44845,76.663222 L 133.78756,76.643787 L 134.13578,76.615231 L 134.5025,76.548532 L 134.86991,76.464317 L 135.2556,76.341686 L 135.65997,76.181138 L 136.08322,75.96456 z "
3715 id="path189"
3716 style="fill:url(#XMLID_1_);stroke:#000000;stroke-width:0.0882756" /><path
3717 d="M 137.20254,74.749258 L 136.85432,75.230434 L 136.47802,75.625449 L 136.08347,75.94608 L 135.66037,76.199765 L 135.21832,76.379263 L 134.77506,76.501913 L 134.30469,76.595989 L 133.83456,76.634152 L 133.35455,76.662957 L 132.87482,76.65359 L 132.37563,76.62575 L 131.89566,76.598397 L 131.4068,76.579448 L 130.91696,76.561459 L 130.44683,76.562175 L 129.9858,76.590236 L 130.08852,76.505751 L 130.19245,76.401822 L 130.29637,76.317112 L 130.40844,76.223287 L 130.52171,76.128252 L 130.64433,76.04401 L 130.75617,75.949472 L 130.86943,75.856358 L 130.982,75.762036 L 131.10438,75.657889 L 131.22725,75.5734 L 131.34917,75.46972 L 131.4711,75.366015 L 131.59327,75.262087 L 131.70631,75.149042 L 131.82872,75.045852 L 132.04545,75.092174 L 132.30872,75.149067 L 132.62862,75.204972 L 132.96773,75.261373 L 133.35386,75.308188 L 133.74914,75.344902 L 134.16287,75.382355 L 134.57661,75.400589 L 134.99134,75.400589 L 135.39548,75.391965 L 135.79123,75.352841 L 136.14857,75.296443 L 136.47785,75.210994 L 136.76992,75.088833 L 137.014,74.937894 L 137.20259,74.749307 L 137.20254,74.749258 z "
3718 id="path208"
3719 style="fill:url(#XMLID_2_);stroke:#000000;stroke-width:0.0882756" /><path
3720 d="M 137.98324,73.986796 L 137.77612,74.250536 L 137.56903,74.495078 L 137.34342,74.702441 L 137.08952,74.881494 L 136.82604,75.051407 L 136.53442,75.174036 L 136.22436,75.277225 L 135.86631,75.353061 L 135.48976,75.40993 L 135.08539,75.43895 L 134.64238,75.429344 L 134.17249,75.411082 L 133.64498,75.354439 L 133.08052,75.280049 L 132.47839,75.186224 L 131.81938,75.055192 L 131.97897,74.894617 L 132.1393,74.75351 L 132.30898,74.621273 L 132.47819,74.489525 L 132.6476,74.358491 L 132.81609,74.225543 L 132.9853,74.056343 L 133.16386,73.87778 L 133.46576,73.876824 L 133.76645,73.895795 L 134.05902,73.905159 L 134.36907,73.933 L 134.67027,73.951483 L 134.97124,73.989402 L 135.27219,74.007142 L 135.57458,74.044569 L 135.87505,74.063764 L 136.17601,74.082488 L 136.48631,74.091855 L 136.77885,74.100748 L 137.07933,74.081555 L 137.38073,74.062365 L 137.68143,74.043393 L 137.98324,73.986796 z "
3721 id="path227"
3722 style="fill:url(#XMLID_3_);stroke:#000000;stroke-width:0.0882756" /><path
3723 d="M 138.7627,73.129575 L 138.67896,73.403388 L 138.51959,73.638103 L 138.29281,73.807768 L 138.00195,73.93064 L 137.6717,74.015131 L 137.28578,74.062167 L 136.88187,74.090722 L 136.43932,74.082564 L 135.99726,74.053764 L 135.53599,74.026878 L 135.09345,73.97935 L 134.66052,73.941922 L 134.2367,73.895379 L 133.85129,73.868028 L 133.50282,73.858441 L 133.20113,73.858686 L 134.30219,72.945284 L 134.56617,72.982247 L 134.83856,73.009872 L 135.10254,73.048746 L 135.36485,73.085461 L 135.62886,73.123379 L 135.90221,73.150978 L 136.16595,73.188186 L 136.43858,73.216497 L 136.71143,73.225366 L 136.99366,73.243602 L 137.26747,73.253922 L 137.55834,73.243356 L 137.84993,73.233992 L 138.15207,73.215044 L 138.45301,73.177594 L 138.7627,73.129575 z "
3724 id="path8879"
3725 style="fill:url(#XMLID_4_);stroke:#000000;stroke-width:0.0882756" /><path
3726 d="M 140.21128,71.246126 L 140.10759,71.726591 L 139.95783,72.122101 L 139.74182,72.451845 L 139.47784,72.715829 L 139.17688,72.922702 L 138.84811,73.083006 L 138.48115,73.185481 L 138.07652,73.252693 L 137.64408,73.271887 L 137.2008,73.28221 L 136.73955,73.233721 L 136.26892,73.197472 L 135.78916,73.131956 L 135.30006,73.057568 L 134.82008,72.973104 L 134.34922,72.897758 L 134.54673,72.756869 L 134.68759,72.634243 L 134.79104,72.549041 L 134.88535,72.473941 L 134.9792,72.399306 L 135.06392,72.314103 L 135.19565,72.200593 L 135.36485,72.049628 L 135.79804,72.124752 L 136.21131,72.180909 L 136.61666,72.228191 L 137.01191,72.227945 L 137.37886,72.218116 L 137.7367,72.199168 L 138.08443,72.151886 L 138.40482,72.094994 L 138.70552,72.019403 L 138.99759,71.9342 L 139.25128,71.830763 L 139.48673,71.727329 L 139.71257,71.614287 L 139.90096,71.500747 L 140.06991,71.368758 L 140.21128,71.246126 z "
3727 id="path265"
3728 style="fill:url(#XMLID_5_);stroke:#000000;stroke-width:0.0882756" /><path
3729 d="M 141.02917,70.313512 L 140.7652,70.652841 L 140.49354,70.963863 L 140.19188,71.227133 L 139.88204,71.462558 L 139.57174,71.661492 L 139.23312,71.830692 L 138.89401,71.962436 L 138.53642,72.056978 L 138.16972,72.142398 L 137.80227,72.189681 L 137.41658,72.218237 L 137.02182,72.21799 L 136.6268,72.199267 L 136.22169,72.153187 L 135.80673,72.096789 L 135.393,72.021932 L 135.59099,71.843149 L 135.8166,71.654024 L 136.03236,71.475708 L 136.23972,71.28707 L 136.42833,71.136351 L 136.57809,71.003891 L 136.69115,70.910064 L 136.74729,70.872148 L 136.78521,70.872615 L 136.87904,70.872367 L 137.02976,70.8719 L 137.21815,70.872638 L 137.43463,70.881755 L 137.69811,70.881758 L 137.9808,70.880798 L 138.29111,70.872419 L 138.62036,70.842191 L 138.96788,70.814347 L 139.3077,70.776895 L 139.66504,70.720003 L 140.02192,70.644412 L 140.36993,70.559456 L 140.70928,70.446191 L 141.02917,70.313512 z "
3730 id="path284"
3731 style="fill:url(#XMLID_6_);stroke:#000000;stroke-width:0.0882756" /><path
3732 d="M 142.25166,69.221573 L 142.04502,69.522765 L 141.8185,69.786745 L 141.53675,70.012811 L 141.23555,70.219931 L 140.89741,70.390085 L 140.54917,70.530457 L 140.18223,70.653088 L 139.78675,70.748116 L 139.40157,70.814592 L 139.00632,70.870523 L 138.61153,70.908689 L 138.21579,70.928131 L 137.82029,70.928618 L 137.44424,70.928864 L 137.08594,70.909892 L 136.73771,70.88183 L 136.85097,70.768561 L 137.00192,70.637285 L 137.16101,70.49593 L 137.32109,70.335846 L 137.4898,70.203119 L 137.65059,70.062479 L 137.78212,69.950168 L 137.90453,69.864724 L 137.92347,69.883672 L 138.00796,69.892319 L 138.18676,69.883917 L 138.42195,69.855114 L 138.71378,69.826336 L 139.05218,69.789374 L 139.42006,69.742094 L 139.79588,69.684517 L 140.19066,69.627157 L 140.58617,69.571471 L 140.96248,69.514826 L 141.31141,69.448106 L 141.63108,69.391485 L 141.90397,69.325474 L 142.11084,69.268364 L 142.25166,69.221573 z "
3733 id="path303"
3734 style="fill:url(#XMLID_7_);stroke:#000000;stroke-width:0.0882756" /><path
3735 d="M 143.52144,67.931146 L 143.52144,68.232088 L 143.38896,68.495606 L 143.14441,68.740639 L 142.80625,68.948251 L 142.39178,69.136644 L 141.92284,69.287826 L 141.4047,69.428691 L 140.87745,69.542693 L 140.33171,69.63679 L 139.80517,69.71312 L 139.30719,69.77913 L 138.85432,69.817267 L 138.46888,69.864796 L 138.16794,69.883524 L 137.97042,69.893847 L 137.88521,69.903208 L 138.01696,69.790186 L 138.15807,69.666821 L 138.3083,69.534827 L 138.45902,69.404263 L 138.62774,69.271068 L 138.78831,69.129713 L 138.94836,68.989316 L 139.11802,68.838377 L 139.39018,68.772121 L 139.66281,68.70685 L 139.9453,68.648535 L 140.20927,68.593338 L 140.49081,68.535983 L 140.77374,68.497816 L 141.05549,68.441663 L 141.32884,68.393917 L 141.62042,68.346168 L 141.89353,68.299597 L 142.17552,68.243195 L 142.44815,68.195669 L 142.72078,68.129905 L 142.99439,68.063675 L 143.25741,68.00654 L 143.52144,67.931146 z "
3736 id="path8884"
3737 style="fill:url(#XMLID_8_);stroke:#000000;stroke-width:0.0882756" /><path
3738 d="M 144.48045,67.008161 L 144.37655,67.224864 L 144.24527,67.412759 L 144.09454,67.581713 L 143.90687,67.715132 L 143.69929,67.827215 L 143.46482,67.931636 L 143.19218,68.016127 L 142.89051,68.091716 L 142.55186,68.167308 L 142.18491,68.233316 L 141.78029,68.300036 L 141.33798,68.384772 L 140.84914,68.479089 L 140.32189,68.573386 L 139.75743,68.686649 L 139.14521,68.828497 L 139.31537,68.677556 L 139.50353,68.508607 L 139.66313,68.348524 L 139.83234,68.198051 L 139.98255,68.066062 L 140.10471,67.962626 L 140.18967,67.896889 L 140.2271,67.8772 L 140.27436,67.886565 L 140.38741,67.88681 L 140.54771,67.8772 L 140.76344,67.867839 L 141.02647,67.848645 L 141.32884,67.829922 L 141.63868,67.800872 L 141.98716,67.753834 L 142.33514,67.706307 L 142.69272,67.649661 L 143.03113,67.573823 L 143.37982,67.489115 L 143.69898,67.394553 L 143.99994,67.281754 L 144.26321,67.149517 L 144.48045,67.008161 z "
3739 id="path337"
3740 style="fill:url(#XMLID_9_);stroke:#000000;stroke-width:0.0882756" /><path
3741 d="M 145.06337,66.217387 L 144.93186,66.517867 L 144.73387,66.790721 L 144.47085,67.018211 L 144.15117,67.206359 L 143.79358,67.3758 L 143.41729,67.507326 L 143.00379,67.621797 L 142.58933,67.696921 L 142.18472,67.763152 L 141.77987,67.810432 L 141.4038,67.848596 L 141.0551,67.857468 L 140.75391,67.877374 L 140.50961,67.876418 L 140.33178,67.886739 L 140.21756,67.886736 L 140.37764,67.745383 L 140.5293,67.613389 L 140.67881,67.482114 L 140.83841,67.35899 L 140.98,67.237542 L 141.13094,67.104836 L 141.29918,66.973067 L 141.45926,66.831714 L 141.6755,66.822351 L 141.9018,66.802421 L 142.1178,66.793301 L 142.35347,66.783718 L 142.59824,66.783716 L 142.83319,66.774353 L 143.07773,66.755409 L 143.32253,66.73572 L 143.55821,66.707906 L 143.79292,66.680058 L 144.02834,66.631846 L 144.24387,66.584806 L 144.47068,66.509682 L 144.67705,66.433846 L 144.87529,66.330654 L 145.06337,66.217387 z "
3742 id="path352"
3743 style="fill:url(#XMLID_10_);stroke:#000000;stroke-width:0.0882756" /><path
3744 d="M 145.62687,65.125916 L 145.5705,65.445344 L 145.46777,65.717486 L 145.31729,65.943797 L 145.13852,66.141797 L 144.91267,66.293228 L 144.66741,66.424262 L 144.39503,66.528163 L 144.09458,66.603508 L 143.79292,66.660401 L 143.45455,66.698318 L 143.12456,66.726875 L 142.78569,66.746069 L 142.44778,66.765288 L 142.10845,66.765288 L 141.77942,66.77514 L 141.45857,66.793621 L 141.57234,66.700015 L 141.7041,66.605697 L 141.83538,66.492653 L 141.97651,66.370736 L 142.1361,66.247613 L 142.29618,66.106753 L 142.47472,65.964685 L 142.63431,65.823329 L 142.79486,65.889314 L 142.97318,65.936594 L 143.14333,65.973804 L 143.33197,65.992529 L 143.51964,65.992995 L 143.71787,65.982921 L 143.91467,65.973333 L 144.1117,65.926544 L 144.31017,65.879263 L 144.50793,65.813035 L 144.71455,65.737933 L 144.90319,65.643365 L 145.09157,65.530815 L 145.27855,65.417768 L 145.45782,65.275949 L 145.62687,65.125916 z "
3745 id="path367"
3746 style="fill:url(#XMLID_11_);stroke:#000000;stroke-width:0.0882756" /><path
3747 d="M 146.2295,64.24009 L 146.13447,64.465664 L 146.01233,64.682856 L 145.86233,64.889482 L 145.66456,65.08773 L 145.46681,65.266757 L 145.23164,65.445319 L 144.97772,65.586918 L 144.72431,65.728027 L 144.45118,65.832177 L 144.17783,65.916886 L 143.91456,65.973287 L 143.64194,66.002579 L 143.37795,66.00209 L 143.12427,65.974488 L 142.87947,65.899117 L 142.66252,65.795462 L 142.78539,65.692248 L 142.90756,65.58832 L 143.04845,65.484884 L 143.18069,65.371374 L 143.31173,65.259064 L 143.45282,65.136192 L 143.60355,65.00469 L 143.77272,64.853259 L 143.93304,64.881571 L 144.10271,64.900543 L 144.26255,64.910124 L 144.41279,64.910127 L 144.56327,64.909907 L 144.70487,64.880611 L 144.85534,64.862597 L 144.9967,64.833552 L 145.13734,64.786982 L 145.28781,64.73009 L 145.43803,64.673446 L 145.58875,64.598566 L 145.7397,64.522482 L 145.8988,64.438238 L 146.05909,64.334558 L 146.2295,64.24009 z "
3748 id="path8889"
3749 style="fill:url(#XMLID_12_);stroke:#000000;stroke-width:0.0882756" /><path
3750 d="M 146.48292,63.722199 L 146.48266,63.835243 L 146.43489,63.957873 L 146.3509,64.080255 L 146.23785,64.193299 L 146.09675,64.315685 L 145.92828,64.428952 L 145.73003,64.532139 L 145.53299,64.636043 L 145.31604,64.72053 L 145.08974,64.796146 L 144.85526,64.862623 L 144.62871,64.900049 L 144.39353,64.928853 L 144.16818,64.928605 L 143.97042,64.900272 L 143.7727,64.853234 L 143.86701,64.779066 L 143.95172,64.712595 L 144.05515,64.627886 L 144.14923,64.552047 L 144.24333,64.477165 L 144.32755,64.410691 L 144.39355,64.345171 L 144.45932,64.278919 L 144.49698,64.316592 L 144.65637,64.306986 L 144.90164,64.269063 L 145.21242,64.203056 L 145.55968,64.09962 L 145.90887,63.995695 L 146.22829,63.863479 L 146.48292,63.722199 z "
3751 id="path397"
3752 style="fill:url(#XMLID_13_);stroke:#000000;stroke-width:0.0882756" /><path
3753 d="M 146.49156,63.18558 L 146.60411,63.336523 L 146.64133,63.468292 L 146.6046,63.581338 L 146.52014,63.703252 L 146.37904,63.807398 L 146.19064,63.901228 L 145.99362,63.985938 L 145.76754,64.061776 L 145.53186,64.127535 L 145.29715,64.17504 L 145.06218,64.222793 L 144.86372,64.250855 L 144.68518,64.278698 L 144.55344,64.297644 L 144.46897,64.307255 L 144.44088,64.297892 L 145.25852,63.647762 L 145.3521,63.629036 L 145.49417,63.581533 L 145.66337,63.525626 L 145.84238,63.458902 L 146.03993,63.393386 L 146.20865,63.317794 L 146.36921,63.251811 L 146.49156,63.18558 z "
3754 id="path412"
3755 style="fill:url(#XMLID_14_);stroke:#000000;stroke-width:0.0882756" /><path
3756 d="M 145.25849,63.647834 L 145.42818,63.497364 L 145.57891,63.365861 L 145.71951,63.24299 L 145.85102,63.129722 L 145.9739,63.026506 L 146.10492,62.913217 L 146.24626,62.809785 L 146.41617,62.69605 L 146.56638,62.90388 L 146.56686,63.072833 L 146.43511,63.205068 L 146.21888,63.327724 L 145.96423,63.411966 L 145.69138,63.497167 L 145.44659,63.572536 L 145.25849,63.647834 z "
3757 id="path427"
3758 style="fill:url(#XMLID_15_);stroke:#000000;stroke-width:0.0882756" /><path
3759 d="M 145.19322,63.526385 L 145.38137,63.393437 L 145.54096,63.271299 L 145.69121,63.177227 L 145.83256,63.072834 L 145.9732,62.988366 L 146.11432,62.903878 L 146.27486,62.800445 L 146.45364,62.697007 L 146.25564,62.480281 L 146.03919,62.433245 L 145.85991,62.517957 L 145.67295,62.688111 L 145.52201,62.913462 L 145.38139,63.149138 L 145.27796,63.365863 L 145.19322,63.526385 z "
3760 id="path8894"
3761 style="fill:url(#XMLID_16_);stroke:#000000;stroke-width:0.0882756" /><path
3762 d="M 131.57981,69.493713 L 131.00598,69.747379 L 130.50706,70.040409 L 130.11275,70.379026 L 129.78325,70.74598 L 129.52023,71.16068 L 129.32296,71.583542 L 129.18183,72.045281 L 129.06928,72.515889 L 128.9944,73.005938 L 128.94734,73.504634 L 128.91111,74.012206 L 128.89192,74.520982 L 128.86383,75.019436 L 128.82711,75.508772 L 128.78008,75.989211 L 128.70568,76.460063 L 128.81797,76.383733 L 128.93078,76.28966 L 129.04382,76.195834 L 129.15662,76.120487 L 129.27016,76.026172 L 129.38295,75.932095 L 129.50462,75.828168 L 129.61768,75.734342 L 129.73,75.63931 L 129.85334,75.536122 L 129.98462,75.442297 L 130.09742,75.347732 L 130.22051,75.244293 L 130.35106,75.150223 L 130.48256,75.036955 L 130.61477,74.942639 L 130.57732,74.717528 L 130.53844,74.435314 L 130.4919,74.105568 L 130.47246,73.747731 L 130.4439,73.362051 L 130.43453,72.957179 L 130.44317,72.533829 L 130.46164,72.119348 L 130.50917,71.695531 L 130.57445,71.292354 L 130.65844,70.905986 L 130.78131,70.538788 L 130.92193,70.209042 L 131.10071,69.916971 L 131.32607,69.672398 L 131.57981,69.493713 z "
3763 id="path461"
3764 style="fill:url(#XMLID_17_);stroke:#000000;stroke-width:0.0882756" /><path
3765 d="M 132.44545,68.946309 L 132.12482,69.097518 L 131.84307,69.267898 L 131.5899,69.44621 L 131.34416,69.654065 L 131.14735,69.888758 L 130.96882,70.16186 L 130.81835,70.443856 L 130.69646,70.7736 L 130.59348,71.140556 L 130.52772,71.546138 L 130.48045,71.987947 L 130.46246,72.477771 L 130.46295,73.02304 L 130.50109,73.606942 L 130.55823,74.246748 L 130.63357,74.943201 L 130.81284,74.783148 L 130.98083,74.650688 L 131.13205,74.518693 L 131.28206,74.406141 L 131.43277,74.292878 L 131.59236,74.170739 L 131.78075,74.038033 L 131.99745,73.878938 L 132.02579,73.568382 L 132.03491,73.257832 L 132.0438,72.927837 L 132.03373,72.61682 L 132.03373,72.297641 L 132.03347,71.978215 L 132.02365,71.666947 L 132.02365,71.346786 L 132.03325,71.036235 L 132.06111,70.726174 L 132.07959,70.424488 L 132.10767,70.114185 L 132.16386,69.813238 L 132.24832,69.520919 L 132.33302,69.229339 L 132.44545,68.946309 z "
3766 id="path480"
3767 style="fill:url(#XMLID_18_);stroke:#000000;stroke-width:0.0882756" /><path
3768 d="M 133.44239,68.36263 L 133.1136,68.390964 L 132.83978,68.514086 L 132.60532,68.712084 L 132.43637,68.975112 L 132.29501,69.30461 L 132.19228,69.671808 L 132.11694,70.085552 L 132.0699,70.5281 L 132.04159,70.989369 L 132.02382,71.45976 L 132.01516,71.92103 L 132.02455,72.382519 L 132.03414,72.805383 L 132.02552,73.209515 L 132.01616,73.5587 L 131.9876,73.869475 L 133.13572,72.96519 L 133.11673,72.682729 L 133.0973,72.399797 L 133.06897,72.126698 L 133.04977,71.845436 L 133.03105,71.562728 L 133.00224,71.271365 L 132.99215,70.979537 L 132.98207,70.706411 L 132.99168,70.414587 L 133.00129,70.123249 L 133.02936,69.83071 L 133.07593,69.539374 L 133.13255,69.238431 L 133.21677,68.946357 L 133.32019,68.654778 L 133.44239,68.36263 z "
3769 id="path499"
3770 style="fill:url(#XMLID_19_);stroke:#000000;stroke-width:0.0882756" /><path
3771 d="M 135.64356,67.476607 L 135.08824,67.485724 L 134.6181,67.561783 L 134.22308,67.694241 L 133.88421,67.882414 L 133.61158,68.136812 L 133.39534,68.428886 L 133.23601,68.777354 L 133.1136,69.14453 L 133.03874,69.558274 L 133.00154,69.990496 L 132.98305,70.461598 L 133.00202,70.932209 L 133.02125,71.421788 L 133.069,71.920244 L 133.11649,72.418941 L 133.16355,72.917641 L 133.35266,72.748196 L 133.51154,72.606619 L 133.64353,72.493844 L 133.77552,72.399306 L 133.90679,72.30548 L 134.04791,72.201329 L 134.20751,72.080128 L 134.40525,71.937567 L 134.34837,71.486152 L 134.31069,71.072188 L 134.30059,70.667559 L 134.30996,70.300631 L 134.35652,69.953118 L 134.41243,69.632736 L 134.47794,69.340688 L 134.57276,69.058224 L 134.6757,68.803603 L 134.7885,68.578004 L 134.92001,68.35194 L 135.05201,68.14507 L 135.19167,67.966043 L 135.33398,67.787258 L 135.49357,67.627668 L 135.64356,67.476607 z "
3772 id="path8899"
3773 style="fill:url(#XMLID_20_);stroke:#000000;stroke-width:0.0882756" /><path
3774 d="M 136.99879,66.628254 L 136.57426,66.826258 L 136.19819,67.033349 L 135.86004,67.278361 L 135.56845,67.532048 L 135.2956,67.805395 L 135.0705,68.106833 L 134.87296,68.418097 L 134.71337,68.748086 L 134.57177,69.095576 L 134.47794,69.453389 L 134.39348,69.83929 L 134.34718,70.224474 L 134.31912,70.629816 L 134.31954,71.04309 L 134.34809,71.467648 L 134.39561,71.909699 L 134.62119,71.740749 L 134.87437,71.54228 L 135.12899,71.344282 L 135.38361,71.146278 L 135.61832,70.967251 L 135.7971,70.826854 L 135.92885,70.713343 L 135.9754,70.665814 L 135.9754,70.628855 L 135.985,70.543901 L 135.98525,70.412126 L 135.98453,70.242933 L 135.98526,70.035814 L 136.0028,69.809992 L 136.02224,69.546723 L 136.04983,69.254649 L 136.10598,68.953238 L 136.16213,68.633594 L 136.24662,68.303355 L 136.35055,67.956089 L 136.47247,67.625872 L 136.62247,67.287724 L 136.79119,66.948121 L 136.99879,66.628254 z "
3775 id="path537"
3776 style="fill:url(#XMLID_21_);stroke:#000000;stroke-width:0.0882756" /><path
3777 d="M 138.55942,65.403567 L 138.20254,65.554995 L 137.8728,65.734051 L 137.5721,65.978129 L 137.3086,66.242113 L 137.06455,66.544262 L 136.84832,66.872803 L 136.66976,67.221273 L 136.50034,67.598056 L 136.36882,67.974347 L 136.2469,68.360249 L 136.16244,68.76487 L 136.07822,69.169252 L 136.03192,69.556358 L 135.99399,69.9514 L 135.97574,70.327225 L 135.9767,70.684814 L 136.10748,70.57226 L 136.27669,70.440512 L 136.46461,70.289549 L 136.66233,70.129715 L 136.85984,69.969167 L 137.03887,69.82759 L 137.18885,69.695845 L 137.29254,69.592162 L 137.27383,69.572968 L 137.27383,69.478898 L 137.30143,69.319775 L 137.34798,69.0841 L 137.41423,68.811468 L 137.48911,68.490838 L 137.58294,68.133028 L 137.68687,67.767034 L 137.78889,67.399612 L 137.90142,67.022585 L 138.02477,66.65541 L 138.13756,66.318001 L 138.25037,66.015134 L 138.36221,65.752603 L 138.46611,65.553647 L 138.55942,65.403567 z "
3778 id="path556"
3779 style="fill:url(#XMLID_22_);stroke:#000000;stroke-width:0.0882756" /><path
3780 d="M 140.14937,64.357952 L 139.8105,64.283071 L 139.49037,64.358443 L 139.1995,64.557157 L 138.9175,64.839155 L 138.67271,65.215418 L 138.43753,65.657965 L 138.22154,66.157157 L 138.03389,66.664974 L 137.85556,67.201834 L 137.71467,67.719016 L 137.58343,68.209307 L 137.47953,68.670333 L 137.39625,69.055989 L 137.32977,69.348063 L 137.29184,69.553974 L 137.26401,69.639419 L 137.42455,69.535986 L 137.59351,69.404925 L 137.77204,69.262368 L 137.95084,69.102284 L 138.1486,68.942919 L 138.34611,68.78188 L 138.54337,68.641236 L 138.74113,68.500104 L 138.8352,68.236121 L 138.9187,67.981721 L 139.00438,67.70884 L 139.07876,67.445323 L 139.16347,67.190701 L 139.2386,66.927433 L 139.32306,66.654328 L 139.39844,66.391306 L 139.48196,66.135946 L 139.55709,65.873166 L 139.65187,65.609899 L 139.74572,65.365352 L 139.82972,65.110458 L 139.93341,64.857021 L 140.03567,64.602153 L 140.14937,64.357952 z "
3781 id="path575"
3782 style="fill:url(#XMLID_23_);stroke:#000000;stroke-width:0.0882756" /><path
3783 d="M 142.28431,63.123433 L 141.92672,63.180567 L 141.60681,63.330575 L 141.3253,63.538649 L 141.06253,63.801921 L 140.8355,64.122084 L 140.63874,64.489748 L 140.46928,64.866062 L 140.31978,65.261325 L 140.20648,65.656836 L 140.09417,66.051854 L 140.01932,66.428633 L 139.95356,66.757886 L 139.89762,67.059079 L 139.86955,67.313234 L 139.8415,67.491549 L 139.83262,67.595478 L 140.00133,67.463238 L 140.16139,67.341075 L 140.30177,67.238132 L 140.43403,67.124129 L 140.57488,67.001499 L 140.71553,66.89831 L 140.87585,66.775437 L 141.04409,66.643664 L 141.09208,66.427676 L 141.1384,66.210482 L 141.1763,65.966185 L 141.22334,65.731002 L 141.26989,65.477093 L 141.31716,65.222941 L 141.36372,64.968539 L 141.42011,64.723997 L 141.47675,64.478713 L 141.56029,64.243999 L 141.64571,64.008817 L 141.73932,63.800985 L 141.85209,63.595072 L 141.97447,63.415088 L 142.12469,63.264861 L 142.28431,63.123433 z "
3784 id="path8904"
3785 style="fill:url(#XMLID_24_);stroke:#000000;stroke-width:0.0882756" /><path
3786 d="M 141.39156,63.472395 L 141.12707,63.529019 L 140.88324,63.623087 L 140.68503,63.745469 L 140.49759,63.896435 L 140.33726,64.074999 L 140.19591,64.291209 L 140.06393,64.555658 L 139.93361,64.837162 L 139.81121,65.167404 L 139.68834,65.515873 L 139.56692,65.9205 L 139.43588,66.353461 L 139.28494,66.824069 L 139.11648,67.332844 L 138.93817,67.886958 L 138.72171,68.48069 L 138.88227,68.3398 L 139.06969,68.188347 L 139.25809,68.03834 L 139.45559,67.896025 L 139.6255,67.764502 L 139.757,67.671385 L 139.85061,67.595084 L 139.87963,67.548759 L 139.87867,67.51084 L 139.89812,67.397821 L 139.90698,67.237272 L 139.93531,67.039986 L 139.97248,66.79544 L 140.02913,66.513194 L 140.08527,66.211296 L 140.15991,65.891402 L 140.25352,65.553032 L 140.34806,65.214166 L 140.47838,64.875058 L 140.61924,64.545557 L 140.78028,64.23525 L 140.96723,63.951827 L 141.16593,63.698413 L 141.39156,63.472395 z "
3787 id="path605"
3788 style="fill:url(#XMLID_25_);stroke:#000000;stroke-width:0.0882756" /><path
3789 d="M 143.51738,62.849373 L 143.15933,62.831136 L 142.8399,62.868342 L 142.5668,62.953791 L 142.32225,63.085536 L 142.12475,63.264811 L 141.94597,63.481044 L 141.78588,63.735691 L 141.65486,63.999674 L 141.55216,64.309737 L 141.45759,64.610681 L 141.38246,64.949298 L 141.30782,65.288899 L 141.23246,65.627763 L 141.1763,65.966133 L 141.12063,66.305955 L 141.04527,66.625136 L 141.1768,66.531065 L 141.30854,66.418042 L 141.47821,66.305488 L 141.6467,66.172539 L 141.82572,66.032387 L 142.01338,65.880711 L 142.1924,65.740072 L 142.37143,65.598003 L 142.30542,65.419686 L 142.26775,65.249998 L 142.23895,65.071435 L 142.2392,64.883045 L 142.24878,64.704483 L 142.28549,64.535308 L 142.34189,64.345958 L 142.40765,64.167397 L 142.49237,63.988618 L 142.59509,63.810054 L 142.7086,63.640365 L 142.83964,63.470452 L 142.99035,63.301503 L 143.15089,63.141447 L 143.32871,62.999599 L 143.51738,62.849373 z "
3790 id="path620"
3791 style="fill:url(#XMLID_26_);stroke:#000000;stroke-width:0.0882756" /><path
3792 d="M 145.29545,62.311059 L 145.1728,62.283462 L 145.03192,62.292823 L 144.87233,62.340596 L 144.72161,62.415476 L 144.56177,62.519134 L 144.40243,62.660736 L 144.25195,62.811698 L 144.09162,62.990263 L 143.95989,63.178183 L 143.82861,63.385796 L 143.72444,63.602251 L 143.64046,63.818264 L 143.57444,64.035455 L 143.53725,64.260806 L 143.53725,64.46817 L 143.55691,64.675042 L 143.65988,64.609525 L 143.78228,64.524569 L 143.90419,64.440105 L 144.02637,64.354411 L 144.1293,64.269704 L 144.21498,64.204186 L 144.27111,64.148035 L 144.29007,64.129085 L 144.30854,64.072217 L 144.34551,63.92197 L 144.43047,63.686321 L 144.53437,63.413934 L 144.67501,63.121616 L 144.85354,62.81106 L 145.05081,62.537958 L 145.29545,62.311059 z "
3793 id="path635"
3794 style="fill:url(#XMLID_27_);stroke:#000000;stroke-width:0.0882756" /><path
3795 d="M 145.9072,62.470673 L 145.76631,62.312043 L 145.61561,62.236205 L 145.47379,62.227331 L 145.32356,62.283489 L 145.18222,62.386456 L 145.05094,62.537885 L 144.91895,62.707328 L 144.78766,62.914446 L 144.68422,63.130682 L 144.58126,63.347381 L 144.48718,63.554747 L 144.4123,63.742917 L 144.3559,63.912607 L 144.29975,64.044598 L 144.29015,64.12906 L 144.27982,64.157127 L 145.16417,63.534841 L 145.20281,63.441014 L 145.2772,63.309024 L 145.35232,63.158547 L 145.44639,62.988638 L 145.54983,62.828579 L 145.66312,62.678821 L 145.78525,62.556195 L 145.9072,62.470673 z "
3796 id="path8909"
3797 style="fill:url(#XMLID_28_);stroke:#000000;stroke-width:0.0882756" /><path
3798 d="M 144.65565,62.462758 L 144.38279,62.490848 L 144.12864,62.55686 L 143.86562,62.651886 L 143.611,62.792774 L 143.37531,62.953103 L 143.15092,63.123237 L 142.94381,63.330842 L 142.75495,63.555239 L 142.59584,63.791156 L 142.46457,64.054894 L 142.36088,64.309269 L 142.28552,64.572784 L 142.2392,64.84564 L 142.23897,65.108912 L 142.28625,65.363066 L 142.3712,65.617001 L 142.50345,65.503978 L 142.6347,65.41062 L 142.76645,65.296618 L 142.91692,65.183596 L 143.06764,65.071264 L 143.21786,64.93927 L 143.39664,64.817112 L 143.57424,64.675043 L 143.54736,64.534424 L 143.53754,64.393068 L 143.54713,64.251959 L 143.56514,64.10149 L 143.60256,63.950773 L 143.64936,63.790688 L 143.706,63.639482 L 143.78159,63.489476 L 143.85646,63.33876 L 143.95033,63.188283 L 144.04464,63.056516 L 144.15743,62.924055 L 144.27048,62.792776 L 144.39216,62.670616 L 144.52415,62.557348 L 144.65565,62.462758 z "
3799 id="path665"
3800 style="fill:url(#XMLID_29_);stroke:#000000;stroke-width:0.0882756" /><path
3801 d="M 106.64878,91.467152 L 106.65864,91.514901 L 106.68669,91.637529 L 106.74334,91.825208 L 106.80955,92.06039 L 106.89332,92.333737 L 106.9975,92.625563 L 107.13861,92.936584 L 107.28982,93.236569 L 107.46958,93.528151 L 107.6762,93.773654 L 107.91188,93.989177 L 108.16603,94.148765 L 108.45836,94.233721 L 108.78714,94.242347 L 109.14449,94.148053 L 109.53112,93.950054 L 109.34153,93.855513 L 109.1625,93.734089 L 108.98443,93.611706 L 108.79626,93.460988 L 108.61721,93.301152 L 108.43889,93.122837 L 108.27834,92.944054 L 108.10843,92.755909 L 107.94813,92.558154 L 107.78829,92.36185 L 107.64719,92.16363 L 107.52453,91.96612 L 107.39229,91.778196 L 107.28886,91.598922 L 107.19407,91.41149 L 107.10986,91.250942 L 107.04433,91.279498 L 106.99658,91.307587 L 106.93107,91.335651 L 106.88379,91.365191 L 106.82717,91.383916 L 106.7806,91.411268 L 106.71437,91.439111 L 106.64878,91.467152 z "
3802 id="path686"
3803 style="fill:url(#XMLID_30_);stroke:#000000;stroke-width:0.0882756" /><path
3804 d="M 107.1002,91.279228 L 107.48706,91.928644 L 107.86362,92.492859 L 108.22167,92.944054 L 108.57015,93.310762 L 108.89964,93.602367 L 109.21978,93.809214 L 109.5212,93.959932 L 109.79407,94.06241 L 110.05783,94.100082 L 110.30191,94.099838 L 110.50949,94.062656 L 110.69668,94.006502 L 110.87643,93.920834 L 111.00795,93.82676 L 111.12026,93.732688 L 111.21457,93.638369 L 111.00745,93.544051 L 110.80059,93.449979 L 110.58411,93.328063 L 110.37675,93.196536 L 110.15092,93.045574 L 109.9342,92.886942 L 109.71725,92.727107 L 109.50151,92.548325 L 109.29343,92.3604 L 109.08607,92.153529 L 108.89864,91.945945 L 108.70951,91.720838 L 108.54009,91.495241 L 108.37114,91.249959 L 108.22931,91.015489 L 108.08796,90.761336 L 107.93773,90.835724 L 107.78655,90.912054 L 107.64544,90.978307 L 107.51441,91.052721 L 107.40111,91.110324 L 107.2792,91.175619 L 107.18535,91.232019 L 107.1002,91.279228 z "
3805 id="path707"
3806 style="fill:url(#XMLID_31_);stroke:#000000;stroke-width:0.0882756" /><path
3807 d="M 108.09782,90.751481 L 108.22885,90.996247 L 108.40883,91.287607 L 108.63441,91.58951 L 108.90797,91.898613 L 109.21804,92.228827 L 109.5382,92.548005 L 109.88642,92.839119 L 110.24497,93.103104 L 110.60327,93.327965 L 110.96061,93.515888 L 111.31843,93.628442 L 111.64721,93.675945 L 111.96807,93.637313 L 112.24907,93.505567 L 112.49483,93.279477 L 112.69186,92.931743 L 112.41828,92.865514 L 112.13628,92.790387 L 111.86293,92.687444 L 111.60903,92.584229 L 111.35537,92.443365 L 111.10986,92.311594 L 110.87467,92.151268 L 110.639,91.991433 L 110.42255,91.812871 L 110.20656,91.615608 L 110.00881,91.417854 L 109.81992,91.192994 L 109.63178,90.967397 L 109.4712,90.731969 L 109.30106,90.486957 L 109.16019,90.233295 L 109.02869,90.28994 L 108.87871,90.365065 L 108.72777,90.440658 L 108.57705,90.516029 L 108.4268,90.591867 L 108.2953,90.648021 L 108.18154,90.704174 L 108.09782,90.751481 z "
3808 id="path728"
3809 style="fill:url(#XMLID_32_);stroke:#000000;stroke-width:0.0882756" /><path
3810 d="M 114.39445,92.261265 L 114.1595,92.535571 L 113.87655,92.742688 L 113.56696,92.865071 L 113.23768,92.931301 L 112.87002,92.939926 L 112.49373,92.88424 L 112.11768,92.771686 L 111.72242,92.621681 L 111.33626,92.424173 L 110.94941,92.188991 L 110.5827,91.916848 L 110.24359,91.616125 L 109.92368,91.2967 L 109.63163,90.948474 L 109.37725,90.5813 L 109.16943,90.204984 L 109.31054,90.138731 L 109.45285,90.072255 L 109.59303,89.988258 L 109.74421,89.911928 L 109.89468,89.837295 L 110.05452,89.752806 L 110.20499,89.677189 L 110.36457,89.592455 L 110.56209,89.789963 L 110.77832,90.005706 L 111.03365,90.24138 L 111.29716,90.485679 L 111.56069,90.729978 L 111.83357,90.984132 L 112.135,91.228455 L 112.4165,91.454274 L 112.70921,91.670411 L 112.98183,91.868165 L 113.265,92.036626 L 113.52877,92.168864 L 113.77307,92.261977 L 114.00921,92.309482 L 114.21559,92.30899 L 114.39445,92.261265 z "
3811 id="path749"
3812 style="fill:url(#XMLID_33_);stroke:#000000;stroke-width:0.0882756" /><path
3813 d="M 110.383,89.592356 L 110.515,89.535711 L 110.63716,89.469728 L 110.74971,89.413795 L 110.86345,89.337957 L 110.98537,89.272219 L 111.0984,89.214862 L 111.22128,89.149099 L 111.35232,89.093412 L 111.55941,89.262611 L 111.77587,89.458915 L 112.01154,89.67537 L 112.25633,89.901435 L 112.49201,90.136617 L 112.74569,90.389812 L 113.01038,90.634848 L 113.26432,90.869563 L 113.52808,91.095874 L 113.80069,91.311863 L 114.07403,91.490154 L 114.32844,91.65048 L 114.60035,91.772643 L 114.86387,91.866248 L 115.11898,91.913997 L 115.37265,91.904167 L 115.0345,92.110794 L 114.7146,92.242539 L 114.39519,92.299897 L 114.08414,92.309014 L 113.77312,92.261979 L 113.46303,92.159746 L 113.16208,92.009053 L 112.8693,91.830492 L 112.56737,91.605386 L 112.26689,91.360103 L 111.97458,91.087002 L 111.66332,90.79638 L 111.35325,90.504552 L 111.03308,90.203608 L 110.72229,89.893301 L 110.383,89.592356 z "
3814 id="path766"
3815 style="fill:url(#XMLID_34_);stroke:#000000;stroke-width:0.0882756" /><path
3816 d="M 116.89653,91.111725 L 116.50199,91.469806 L 116.10673,91.715309 L 115.72154,91.856172 L 115.34476,91.912793 L 114.96844,91.875366 L 114.60964,91.763304 L 114.24348,91.602977 L 113.88543,91.377379 L 113.54631,91.114108 L 113.20841,90.832382 L 112.8777,90.521829 L 112.56739,90.212013 L 112.24653,89.89163 L 111.9458,89.591398 L 111.64436,89.327414 L 111.3619,89.083336 L 111.49343,89.007989 L 111.61534,88.94176 L 111.72838,88.886319 L 111.85125,88.819131 L 111.97293,88.753148 L 112.08548,88.697215 L 112.19875,88.621844 L 112.33077,88.546005 L 112.349,88.583458 L 112.39653,88.667947 L 112.48126,88.791067 L 112.58516,88.951148 L 112.73588,89.138334 L 112.90533,89.346165 L 113.1122,89.572254 L 113.36682,89.807215 L 113.6493,90.033525 L 113.97856,90.26679 L 114.35608,90.473908 L 114.76022,90.671662 L 115.22196,90.831473 L 115.73913,90.97236 L 116.28487,91.065475 L 116.89653,91.111725 z "
3817 id="path787"
3818 style="fill:url(#XMLID_35_);stroke:#000000;stroke-width:0.0882756" /><path
3819 d="M 118.58105,90.235039 L 118.37369,90.536475 L 118.1013,90.772886 L 117.7718,90.932476 L 117.39502,91.045274 L 116.98199,91.102165 L 116.53944,91.111749 L 116.06904,91.056087 L 115.58834,90.953143 L 115.11797,90.801958 L 114.63776,90.604941 L 114.16641,90.360617 L 113.72458,90.070461 L 113.30963,89.750078 L 112.92324,89.383838 L 112.60333,88.989066 L 112.32108,88.555639 L 113.4968,87.943429 L 113.63839,88.103264 L 113.8083,88.29141 L 114.01446,88.478843 L 114.26021,88.685223 L 114.52325,88.893053 L 114.82467,89.099924 L 115.13548,89.316159 L 115.4837,89.5132 L 115.84154,89.700633 L 116.20871,89.860936 L 116.58525,90.010941 L 116.981,90.123985 L 117.37601,90.218058 L 117.78111,90.264603 L 118.18549,90.273475 L 118.58105,90.235039 z "
3820 id="path808"
3821 style="fill:url(#XMLID_36_);stroke:#000000;stroke-width:0.0882756" /><path
3822 d="M 113.45967,87.943159 L 113.6471,87.848864 L 113.87316,87.716873 L 114.13715,87.584414 L 114.39968,87.434188 L 114.66391,87.302417 L 114.91711,87.160595 L 115.13358,87.038188 L 115.29389,86.934506 L 115.5291,87.13226 L 115.77413,87.356654 L 116.02828,87.573847 L 116.28219,87.809029 L 116.55553,88.024772 L 116.82934,88.260691 L 117.10196,88.47668 L 117.38442,88.683798 L 117.66618,88.871967 L 117.9496,89.041166 L 118.25054,89.190435 L 118.5419,89.312816 L 118.84381,89.407355 L 119.13513,89.453927 L 119.42672,89.482015 L 119.72816,89.463778 L 119.5681,89.680017 L 119.36148,89.867915 L 119.11717,90.018632 L 118.83518,90.132144 L 118.51479,90.206778 L 118.1661,90.254527 L 117.79003,90.255682 L 117.36646,90.2084 L 116.94287,90.12369 L 116.4912,89.972481 L 116.0201,89.785515 L 115.52285,89.531607 L 115.02199,89.221299 L 114.51467,88.864669 L 113.98646,88.431954 L 113.45967,87.943159 z "
3823 id="path829"
3824 style="fill:url(#XMLID_37_);stroke:#000000;stroke-width:0.0882756" /><path
3825 d="M 121.14868,88.416151 L 120.7448,88.859902 L 120.3493,89.17957 L 119.93579,89.387623 L 119.52999,89.472334 L 119.12629,89.48194 L 118.72096,89.416424 L 118.33529,89.275069 L 117.9398,89.069157 L 117.56325,88.843805 L 117.18647,88.560603 L 116.83849,88.269735 L 116.48977,87.97862 L 116.16027,87.667846 L 115.84925,87.395212 L 115.55789,87.140812 L 115.28455,86.925069 L 115.44416,86.821633 L 115.58623,86.736186 L 115.72685,86.670914 L 115.85717,86.595788 L 115.99973,86.510833 L 116.13076,86.434507 L 116.29059,86.350507 L 116.4694,86.246114 L 116.75307,86.472671 L 117.02451,86.688903 L 117.29784,86.905139 L 117.57097,87.101443 L 117.83567,87.309029 L 118.09894,87.496951 L 118.3615,87.666372 L 118.62574,87.835101 L 118.89909,87.976458 L 119.18132,88.10749 L 119.4729,88.211419 L 119.77458,88.305737 L 120.10312,88.370785 L 120.43311,88.417577 L 120.78134,88.426917 L 121.14868,88.416151 z "
3826 id="path850"
3827 style="fill:url(#XMLID_38_);stroke:#000000;stroke-width:0.0882756" /><path
3828 d="M 122.49332,87.484272 L 122.13598,87.880001 L 121.75944,88.161974 L 121.3649,88.351078 L 120.95044,88.445864 L 120.53644,88.464344 L 120.11313,88.417307 L 119.68979,88.29635 L 119.26598,88.136513 L 118.85199,87.947634 L 118.44737,87.713407 L 118.07034,87.468398 L 117.70341,87.196965 L 117.35542,86.942567 L 117.04416,86.688412 L 116.75187,86.453229 L 116.5066,86.246853 L 116.66789,86.143171 L 116.80828,86.05797 L 116.93979,85.983088 L 117.06265,85.898111 L 117.18387,85.832594 L 117.3065,85.746434 L 117.4656,85.662189 L 117.63551,85.5489 L 118.05045,85.869284 L 118.45507,86.161109 L 118.84097,86.41455 L 119.20818,86.649733 L 119.5569,86.846281 L 119.90488,87.024842 L 120.22477,87.156589 L 120.53603,87.279217 L 120.82737,87.382407 L 121.11032,87.456574 L 121.37384,87.494494 L 121.61839,87.531675 L 121.86293,87.550623 L 122.07938,87.540549 L 122.29585,87.512237 L 122.49332,87.484272 z "
3829 id="path871"
3830 style="fill:url(#XMLID_39_);stroke:#000000;stroke-width:0.0882756" /><path
3831 d="M 123.70721,86.551186 L 123.36834,86.928436 L 123.00185,87.201783 L 122.61475,87.380591 L 122.20126,87.493633 L 121.78774,87.513072 L 121.36417,87.485007 L 120.93098,87.391156 L 120.49851,87.260613 L 120.07466,87.099823 L 119.66068,86.893194 L 119.26492,86.667843 L 118.87926,86.433375 L 118.52082,86.207777 L 118.19107,85.972593 L 117.9002,85.756581 L 117.63548,85.568189 L 117.76701,85.474117 L 117.90862,85.389628 L 118.05021,85.304206 L 118.19132,85.219718 L 118.33314,85.134518 L 118.47329,85.049559 L 118.61441,84.964608 L 118.76538,84.871244 L 118.95281,85.03995 L 119.17913,85.20915 L 119.42392,85.378597 L 119.71553,85.538183 L 120.02629,85.716498 L 120.35531,85.857854 L 120.70378,86.017201 L 121.05272,86.158776 L 121.42015,86.280935 L 121.78731,86.384127 L 122.13458,86.468834 L 122.48278,86.533396 L 122.8219,86.590285 L 123.14252,86.608987 L 123.44275,86.590039 L 123.70721,86.551186 z "
3832 id="path892"
3833 style="fill:url(#XMLID_40_);stroke:#000000;stroke-width:0.0882756" /><path
3834 d="M 125.09819,85.591244 L 124.77951,86.004986 L 124.39337,86.297057 L 123.96089,86.485694 L 123.49003,86.580259 L 122.99156,86.609257 L 122.48303,86.553348 L 121.95697,86.440059 L 121.43837,86.299663 L 120.93993,86.102644 L 120.46857,85.895283 L 120.03562,85.688412 L 119.63124,85.47267 L 119.31061,85.26533 L 119.04709,85.096376 L 118.85916,84.964629 L 118.76531,84.87127 L 118.93352,84.757758 L 119.09383,84.654078 L 119.26279,84.559982 L 119.42284,84.456544 L 119.59202,84.362224 L 119.74273,84.267193 L 119.89321,84.173342 L 120.03503,84.069908 L 120.24215,84.182951 L 120.47683,84.324059 L 120.74922,84.483651 L 121.06073,84.661718 L 121.39073,84.840529 L 121.73825,85.019556 L 122.09533,85.188017 L 122.47332,85.357707 L 122.84962,85.507464 L 123.21633,85.629847 L 123.58398,85.733531 L 123.93196,85.799292 L 124.27058,85.816817 L 124.57153,85.798089 L 124.85398,85.722989 L 125.09819,85.591244 z "
3835 id="path913"
3836 style="fill:url(#XMLID_41_);stroke:#000000;stroke-width:0.0882756" /><path
3837 d="M 125.90718,84.687204 L 125.64414,85.100945 L 125.3439,85.401669 L 125.00479,85.609746 L 124.6191,85.732377 L 124.22337,85.770048 L 123.80913,85.751322 L 123.36802,85.667327 L 122.925,85.54492 L 122.49134,85.376213 L 122.05914,85.188777 L 121.65404,84.972322 L 121.25804,84.765452 L 120.9007,84.558801 L 120.58943,84.361269 L 120.30719,84.192315 L 120.08206,84.060322 L 120.20373,83.976106 L 120.31702,83.88154 L 120.42886,83.806661 L 120.53279,83.740184 L 120.64584,83.664596 L 120.74999,83.598832 L 120.8431,83.542186 L 120.93788,83.467553 L 121.18292,83.654493 L 121.45531,83.833744 L 121.73777,84.001982 L 122.01063,84.163021 L 122.31207,84.312779 L 122.61374,84.444061 L 122.91494,84.557594 L 123.23507,84.651202 L 123.55499,84.745028 L 123.88449,84.810547 L 124.22357,84.84726 L 124.54278,84.866455 L 124.89171,84.856845 L 125.2205,84.828509 L 125.55961,84.771621 L 125.90718,84.687204 z "
3838 id="path934"
3839 style="fill:url(#XMLID_42_);stroke:#000000;stroke-width:0.0882756" /><path
3840 d="M 127.14889,83.782454 L 126.86739,84.064944 L 126.56574,84.309981 L 126.23694,84.507244 L 125.88775,84.667789 L 125.52177,84.771716 L 125.12625,84.847557 L 124.73195,84.866995 L 124.31846,84.866747 L 123.90302,84.81035 L 123.47968,84.726354 L 123.04602,84.594607 L 122.61382,84.444135 L 122.19001,84.246846 L 121.76618,84.012109 L 121.34211,83.758444 L 120.93795,83.467579 L 121.08819,83.373037 L 121.22953,83.269136 L 121.38959,83.165699 L 121.52974,83.062019 L 121.6711,82.959046 L 121.81149,82.873845 L 121.95332,82.769918 L 122.09443,82.68543 L 122.34905,82.826317 L 122.60295,82.986155 L 122.85758,83.127042 L 123.1026,83.276554 L 123.35674,83.417419 L 123.61016,83.55902 L 123.87365,83.671819 L 124.15611,83.784372 L 124.44769,83.869574 L 124.75871,83.95308 L 125.08846,83.999625 L 125.44555,84.018818 L 125.81322,84.009208 L 126.22671,83.971511 L 126.6786,83.895918 L 127.14889,83.782454 z "
3841 id="path955"
3842 style="fill:url(#XMLID_43_);stroke:#000000;stroke-width:0.0882756" /><path
3843 d="M 128.85116,82.434913 L 128.51278,82.83138 L 128.13671,83.169999 L 127.75106,83.442855 L 127.34621,83.660046 L 126.93246,83.829464 L 126.49953,83.943223 L 126.05698,84.008495 L 125.60604,84.00874 L 125.16396,83.981389 L 124.7118,83.906018 L 124.25919,83.794201 L 123.81809,83.634833 L 123.37578,83.436365 L 122.94214,83.211504 L 122.5284,82.948454 L 122.13169,82.66646 L 122.25433,82.581257 L 122.39618,82.497014 L 122.52651,82.403162 L 122.65824,82.308379 L 122.79024,82.214773 L 122.9412,82.119989 L 123.08209,82.016549 L 123.23328,81.921985 L 123.50542,82.119272 L 123.79724,82.29781 L 124.10925,82.457647 L 124.42868,82.588927 L 124.76658,82.702215 L 125.11554,82.804445 L 125.47336,82.88004 L 125.83982,82.927814 L 126.21708,82.965265 L 126.59267,82.964061 L 126.9697,82.944155 L 127.35513,82.897608 L 127.73167,82.821525 L 128.11757,82.718334 L 128.48477,82.595461 L 128.85116,82.434913 z "
3844 id="path976"
3845 style="fill:url(#XMLID_44_);stroke:#000000;stroke-width:0.0882756" /><path
3846 d="M 129.7818,81.690932 L 129.45278,81.983005 L 129.10506,82.237626 L 128.72827,82.464156 L 128.32439,82.642941 L 127.91929,82.784543 L 127.48732,82.897829 L 127.05414,82.972957 L 126.61183,83.001511 L 126.17001,82.992639 L 125.71881,82.955679 L 125.27482,82.870726 L 124.84234,82.757929 L 124.41973,82.598803 L 124.00549,82.410658 L 123.60878,82.184842 L 123.2332,81.92206 L 123.35437,81.837353 L 123.45806,81.751904 L 123.55263,81.676557 L 123.64648,81.601922 L 123.74056,81.526083 L 123.85333,81.451205 L 123.98436,81.357133 L 124.13532,81.24313 L 124.54066,81.384482 L 124.9069,81.525125 L 125.26498,81.657116 L 125.59449,81.779253 L 125.924,81.882693 L 126.22592,81.976295 L 126.52686,82.050709 L 126.8278,82.106886 L 127.13882,82.153924 L 127.44913,82.162798 L 127.78799,82.163263 L 128.12612,82.125591 L 128.50315,82.068229 L 128.89816,81.973667 L 129.32102,81.851751 L 129.7818,81.690932 z "
3847 id="path997"
3848 style="fill:url(#XMLID_45_);stroke:#000000;stroke-width:0.0882756" /><path
3849 d="M 131.38111,80.278343 L 130.93879,80.758559 L 130.48761,81.173755 L 130.0362,81.494137 L 129.57493,81.748047 L 129.10504,81.936656 L 128.64353,82.058329 L 128.17316,82.135123 L 127.703,82.153139 L 127.23263,82.13537 L 126.76174,82.079218 L 126.30983,81.985854 L 125.84856,81.862489 L 125.3974,81.731923 L 124.96399,81.582165 L 124.53031,81.412965 L 124.11681,81.243303 L 124.20994,81.167931 L 124.34239,81.073856 L 124.50198,80.951718 L 124.66206,80.829091 L 124.84109,80.706685 L 124.99181,80.593417 L 125.12262,80.500057 L 125.21647,80.423509 L 125.59351,80.574447 L 125.98012,80.715799 L 126.36622,80.81828 L 126.74181,80.912131 L 127.11859,80.968309 L 127.50379,81.015098 L 127.88057,81.033331 L 128.2763,81.034045 L 128.65237,81.015098 L 129.03876,80.966611 L 129.43331,80.910459 L 129.81994,80.825258 L 130.20513,80.721354 L 130.60018,80.58936 L 130.9856,80.447762 L 131.38111,80.278343 z "
3850 id="path1018"
3851 style="fill:url(#XMLID_46_);stroke:#000000;stroke-width:0.0882756" /><path
3852 d="M 132.40611,79.514972 L 132.14284,79.760011 L 131.84214,79.985851 L 131.49342,80.202553 L 131.09914,80.390947 L 130.6854,80.561345 L 130.24237,80.702948 L 129.77222,80.816459 L 129.28338,80.910776 L 128.77532,80.967642 L 128.26703,81.005561 L 127.7496,80.996939 L 127.22139,80.95973 L 126.71358,80.883894 L 126.20601,80.77134 L 125.70684,80.611998 L 125.22613,80.414489 L 125.38692,80.292569 L 125.54652,80.169451 L 125.70587,80.028341 L 125.87528,79.896815 L 126.01614,79.774189 L 126.14836,79.679868 L 126.2609,79.604768 L 126.3365,79.585798 L 126.63769,79.547657 L 126.94849,79.519565 L 127.29672,79.50974 L 127.64469,79.499664 L 128.0114,79.509247 L 128.39683,79.517897 L 128.79259,79.536868 L 129.18881,79.555814 L 129.59294,79.583659 L 129.99732,79.593022 L 130.40195,79.602139 L 130.81591,79.600936 L 131.22991,79.601178 L 131.62492,79.581028 L 132.02043,79.562795 L 132.40611,79.514972 z "
3853 id="path1039"
3854 style="fill:url(#XMLID_47_);stroke:#000000;stroke-width:0.0882756" /><path
3855 d="M 133.77909,78.14003 L 133.69485,78.545858 L 133.50695,78.865284 L 133.25304,79.119685 L 132.92353,79.31766 L 132.52851,79.449187 L 132.08575,79.533896 L 131.58755,79.581891 L 131.06076,79.60086 L 130.49558,79.582381 L 129.91242,79.563653 L 129.30042,79.536303 L 128.68917,79.509198 L 128.08679,79.490472 L 127.47505,79.481108 L 126.89119,79.500545 L 126.33655,79.548294 L 126.44095,79.4439 L 126.58059,79.340714 L 126.73156,79.208472 L 126.90099,79.096169 L 127.07067,78.963928 L 127.24899,78.841302 L 127.41795,78.728967 L 127.55929,78.624573 L 128.0105,78.605848 L 128.46191,78.586899 L 128.91453,78.585915 L 129.36617,78.585919 L 129.80775,78.594545 L 130.25101,78.603904 L 130.69332,78.613735 L 131.10728,78.612532 L 131.52102,78.611548 L 131.91653,78.593314 L 132.28323,78.564512 L 132.63146,78.517722 L 132.96074,78.451493 L 133.26193,78.3759 L 133.53432,78.271998 L 133.77909,78.14003 z "
3856 id="path1060"
3857 style="fill:url(#XMLID_48_);stroke:#000000;stroke-width:0.0882756" /><path
3858 d="M 134.96487,77.198074 L 134.57873,77.584712 L 134.17482,77.914209 L 133.7505,78.149393 L 133.30869,78.34835 L 132.86615,78.488972 L 132.40562,78.573703 L 131.93501,78.631553 L 131.46439,78.650035 L 130.97506,78.650771 L 130.4862,78.631823 L 130.00548,78.603732 L 129.50678,78.576384 L 129.00808,78.549031 L 128.50963,78.539177 L 128.02005,78.558393 L 127.53094,78.596314 L 127.63459,78.512314 L 127.75627,78.408879 L 127.91632,78.285291 L 128.06703,78.154726 L 128.22685,78.030891 L 128.34901,77.927456 L 128.45292,77.843706 L 128.50907,77.787059 L 128.70635,77.664187 L 128.94156,77.579233 L 129.20508,77.523571 L 129.5161,77.494275 L 129.85425,77.49354 L 130.23103,77.512265 L 130.62654,77.531483 L 131.04029,77.567459 L 131.49194,77.586653 L 131.94361,77.604422 L 132.43343,77.604173 L 132.92277,77.586184 L 133.4306,77.547798 L 133.9389,77.471959 L 134.4465,77.358447 L 134.96487,77.198074 z "
3859 id="path1081"
3860 style="fill:url(#XMLID_49_);stroke:#000000;stroke-width:0.0882756" /><path
3861 d="M 106.27173,90.394629 L 106.23405,90.356955 L 106.12103,90.263595 L 105.95208,90.113369 L 105.74403,89.924978 L 105.49924,89.699405 L 105.25445,89.43638 L 105.00029,89.1635 L 104.75455,88.863047 L 104.55704,88.552739 L 104.38809,88.232111 L 104.26474,87.922049 L 104.2266,87.602624 L 104.2554,87.309838 L 104.38718,87.027375 L 104.63165,86.763587 L 104.99811,86.547378 L 104.99786,86.73626 L 105.02642,86.933302 L 105.08282,87.159121 L 105.15843,87.38494 L 105.253,87.629263 L 105.35621,87.864937 L 105.4798,88.118132 L 105.61083,88.382116 L 105.74332,88.626415 L 105.89453,88.871206 L 106.04404,89.116218 L 106.18586,89.350687 L 106.33633,89.577489 L 106.47746,89.793477 L 106.60921,89.981402 L 106.73209,90.159963 L 106.67618,90.197636 L 106.62914,90.226439 L 106.56338,90.253815 L 106.5161,90.282862 L 106.45945,90.30063 L 106.40281,90.320314 L 106.34641,90.358479 L 106.27173,90.394629 z "
3862 id="path1100"
3863 style="fill:url(#XMLID_50_);stroke:#000000;stroke-width:0.0882756" /><path
3864 d="M 106.76059,90.18847 L 106.27101,89.454565 L 105.8844,88.804902 L 105.57338,88.230833 L 105.33746,87.733338 L 105.17787,87.308782 L 105.07372,86.942075 L 105.03531,86.641598 L 105.03578,86.396807 L 105.08233,86.17986 L 105.15721,86.029633 L 105.27096,85.896658 L 105.39335,85.812661 L 105.53372,85.747145 L 105.67458,85.700354 L 105.81643,85.671307 L 105.9482,85.652826 L 105.93864,85.906808 L 105.94847,86.178705 L 105.96719,86.424454 L 105.996,86.696105 L 106.04303,86.951438 L 106.09869,87.214463 L 106.16517,87.468592 L 106.25059,87.722992 L 106.34466,87.986042 L 106.45769,88.249313 L 106.57097,88.513296 L 106.70297,88.776813 L 106.85415,89.039372 L 107.00438,89.302889 L 107.16539,89.575277 L 107.34346,89.848845 L 107.21221,89.942204 L 107.11863,90.018043 L 107.04257,90.056208 L 106.98594,90.074934 L 106.93962,90.103023 L 106.88248,90.122708 L 106.8352,90.150772 L 106.76059,90.18847 z "
3865 id="path1119"
3866 style="fill:url(#XMLID_51_);stroke:#000000;stroke-width:0.0882756" /><path
3867 d="M 107.40977,89.895931 L 107.21201,89.642489 L 107.00536,89.340341 L 106.79753,88.982751 L 106.58945,88.588447 L 106.40178,88.155019 L 106.24074,87.713187 L 106.09938,87.271379 L 105.99548,86.82908 L 105.92996,86.405015 L 105.91075,86.009998 L 105.94817,85.6529 L 106.05065,85.341634 L 106.2297,85.087726 L 106.48384,84.908452 L 106.81333,84.824209 L 107.24577,84.823251 L 107.19896,85.114833 L 107.18073,85.416268 L 107.17112,85.708095 L 107.17161,85.990805 L 107.18083,86.28219 L 107.22836,86.57355 L 107.28546,86.85626 L 107.35123,87.128402 L 107.43618,87.421212 L 107.54033,87.693846 L 107.65337,87.97631 L 107.79473,88.248699 L 107.9452,88.530204 L 108.12374,88.803797 L 108.31284,89.08604 L 108.51898,89.368503 L 108.38866,89.442646 L 108.23796,89.518484 L 108.07766,89.60344 L 107.90824,89.678787 L 107.74841,89.745483 L 107.60705,89.810999 L 107.48536,89.858282 L 107.40977,89.895931 z "
3868 id="path1138"
3869 style="fill:url(#XMLID_52_);stroke:#000000;stroke-width:0.0882756" /><path
3870 d="M 108.60998,83.759157 L 108.23369,83.834504 L 107.92314,83.97608 L 107.65965,84.182951 L 107.46214,84.43708 L 107.30205,84.747855 L 107.19911,85.096548 L 107.15185,85.482694 L 107.14249,85.887566 L 107.18112,86.300572 L 107.25671,86.752455 L 107.38871,87.204117 L 107.54925,87.64595 L 107.74797,88.088495 L 107.9918,88.520964 L 108.25578,88.935199 L 108.56655,89.320386 L 108.70791,89.235651 L 108.84927,89.151409 L 108.99927,89.075325 L 109.15024,89.000199 L 109.31943,88.925074 L 109.46991,88.830288 L 109.62013,88.7559 L 109.76099,88.651997 L 109.62971,88.350807 L 109.46892,88.040254 L 109.32782,87.710999 L 109.17808,87.39037 L 109.0264,87.052466 L 108.88409,86.704264 L 108.75305,86.365376 L 108.63089,86.036342 L 108.52579,85.707063 L 108.45066,85.38668 L 108.39355,85.067501 L 108.36595,84.774937 L 108.37436,84.483822 L 108.41274,84.219839 L 108.48713,83.975539 L 108.60998,83.759157 z "
3871 id="path1157"
3872 style="fill:url(#XMLID_53_);stroke:#000000;stroke-width:0.0882756" /><path
3873 d="M 109.92628,83.00488 L 109.40935,83.203125 L 109.00495,83.438553 L 108.71312,83.711654 L 108.5065,84.013335 L 108.38414,84.342566 L 108.34598,84.700401 L 108.36637,85.076446 L 108.44171,85.471463 L 108.56387,85.876335 L 108.73403,86.289832 L 108.91306,86.71461 L 109.1017,87.127861 L 109.29994,87.550722 L 109.50706,87.964711 L 109.68609,88.369337 L 109.83702,88.7646 L 109.94958,88.670281 L 110.07221,88.604273 L 110.19413,88.538043 L 110.32611,88.480907 L 110.44878,88.416349 L 110.58029,88.359237 L 110.71182,88.28389 L 110.83373,88.198442 L 110.80589,88.151873 L 110.72118,88.048437 L 110.62689,87.897965 L 110.48554,87.700433 L 110.33484,87.4566 L 110.175,87.164306 L 110.00534,86.84439 L 109.85437,86.487046 L 109.72165,86.111467 L 109.60836,85.715959 L 109.53274,85.282777 L 109.49458,84.850087 L 109.5049,84.38857 L 109.57953,83.937153 L 109.71128,83.465586 L 109.92628,83.00488 z "
3874 id="path8936"
3875 style="fill:url(#XMLID_54_);stroke:#000000;stroke-width:0.0882756" /><path
3876 d="M 111.45958,81.865661 L 111.0828,81.922279 L 110.73457,82.064127 L 110.43413,82.289234 L 110.18044,82.56258 L 109.95413,82.901198 L 109.78517,83.296682 L 109.66397,83.720255 L 109.57951,84.181993 L 109.55144,84.679979 L 109.57064,85.189245 L 109.65584,85.705933 L 109.78783,86.233433 L 109.9671,86.75012 L 110.21189,87.258895 L 110.5138,87.728792 L 110.88146,88.190038 L 112.10396,87.473434 L 112.00028,87.257936 L 111.88653,86.993486 L 111.76439,86.721097 L 111.64174,86.41079 L 111.52893,86.07146 L 111.40627,85.724191 L 111.30213,85.35606 L 111.20804,84.980236 L 111.13316,84.584751 L 111.08517,84.180617 L 111.06643,83.7856 L 111.06523,83.38987 L 111.10219,82.995344 L 111.18715,82.608954 L 111.29995,82.232169 L 111.45958,81.865661 z "
3877 id="path1195"
3878 style="fill:url(#XMLID_55_);stroke:#000000;stroke-width:0.0882756" /><path
3879 d="M 112.12294,87.51069 L 112.29188,87.398357 L 112.50788,87.257224 L 112.74355,87.134841 L 112.99842,86.992773 L 113.25162,86.870612 L 113.47792,86.738374 L 113.68409,86.625354 L 113.85326,86.531035 L 113.7119,86.202026 L 113.57079,85.854046 L 113.42969,85.505573 L 113.27825,85.147763 L 113.12802,84.790664 L 112.99555,84.433075 L 112.87337,84.066147 L 112.7505,83.698948 L 112.65594,83.342319 L 112.58922,82.993871 L 112.54268,82.6454 L 112.5242,82.306755 L 112.52373,81.986616 L 112.57052,81.675842 L 112.6641,81.374407 L 112.78649,81.101774 L 112.50353,81.15913 L 112.23135,81.26232 L 111.97672,81.403676 L 111.742,81.60143 L 111.53513,81.845757 L 111.35632,82.138296 L 111.21521,82.467549 L 111.11298,82.852982 L 111.04626,83.277268 L 111.02835,83.747411 L 111.06626,84.274419 L 111.15218,84.829272 L 111.30289,85.431628 L 111.5009,86.080331 L 111.78336,86.77703 L 112.12294,87.51069 z "
3880 id="path1214"
3881 style="fill:url(#XMLID_56_);stroke:#000000;stroke-width:0.0882756" /><path
3882 d="M 114.37572,80.319874 L 113.78294,80.499394 L 113.31283,80.724746 L 112.96437,81.017038 L 112.7203,81.356151 L 112.57919,81.722366 L 112.50432,82.137066 L 112.51464,82.560637 L 112.58062,83.022129 L 112.70324,83.483156 L 112.85443,83.953521 L 113.04307,84.423884 L 113.24105,84.884418 L 113.42921,85.33608 L 113.6176,85.769238 L 113.78727,86.164009 L 113.92958,86.530494 L 114.07959,86.417942 L 114.21999,86.33274 L 114.34311,86.248497 L 114.47488,86.172905 L 114.6064,86.097558 L 114.74751,86.01307 L 114.88862,85.909413 L 115.05687,85.796367 L 114.92583,85.419585 L 114.78376,85.052412 L 114.6525,84.695068 L 114.52963,84.347089 L 114.407,83.998863 L 114.31243,83.660956 L 114.21883,83.321602 L 114.14251,83.001929 L 114.08564,82.662355 L 114.05708,82.33379 L 114.03859,82.014364 L 114.0475,81.684862 L 114.08519,81.344797 L 114.15025,81.016255 L 114.25393,80.667805 L 114.37572,80.319874 z "
3883 id="path8940"
3884 style="fill:url(#XMLID_57_);stroke:#000000;stroke-width:0.0882756" /><path
3885 d="M 115.88115,79.490248 L 115.34456,79.651042 L 114.92145,79.886006 L 114.5922,80.178298 L 114.3383,80.526277 L 114.17845,80.912643 L 114.08512,81.344871 L 114.04769,81.797022 L 114.06664,82.268589 L 114.14248,82.757681 L 114.23702,83.246305 L 114.36014,83.726057 L 114.51038,84.196908 L 114.67114,84.658159 L 114.82136,85.072367 L 114.98262,85.457307 L 115.11388,85.796418 L 115.28283,85.68315 L 115.43401,85.588587 L 115.5749,85.503385 L 115.69706,85.418676 L 115.83792,85.334434 L 115.9795,85.249476 L 116.12997,85.136459 L 116.309,85.014049 L 116.10142,84.468071 L 115.92239,83.950645 L 115.76135,83.471851 L 115.62935,83.019699 L 115.51705,82.605465 L 115.42177,82.228461 L 115.3654,81.872073 L 115.30828,81.532745 L 115.29868,81.22219 L 115.30804,80.930119 L 115.33589,80.657509 L 115.39229,80.394237 L 115.47723,80.158099 L 115.57946,79.92385 L 115.72104,79.705945 L 115.88115,79.490248 z "
3886 id="path1252"
3887 style="fill:url(#XMLID_58_);stroke:#000000;stroke-width:0.0882756" /><path
3888 d="M 117.01941,78.858821 L 116.52096,79.037604 L 116.12428,79.283106 L 115.82429,79.583587 L 115.5891,79.933484 L 115.43863,80.309063 L 115.34552,80.723761 L 115.31792,81.166088 L 115.32704,81.627332 L 115.39351,82.087891 L 115.48758,82.558254 L 115.6011,83.028397 L 115.74221,83.470945 L 115.88477,83.91302 L 116.04461,84.318604 L 116.18643,84.684598 L 116.31866,85.023707 L 116.4593,84.938752 L 116.59034,84.863406 L 116.71321,84.777985 L 116.83583,84.712469 L 116.95822,84.627981 L 117.0799,84.543267 L 117.22124,84.4576 L 117.36211,84.354633 L 117.23108,84.109866 L 117.10749,83.818285 L 117.0038,83.508224 L 116.90015,83.160244 L 116.80587,82.802896 L 116.73963,82.416507 L 116.67315,82.030853 L 116.62587,81.626006 L 116.60715,81.231482 L 116.59707,80.825898 L 116.60619,80.440487 L 116.64339,80.082677 L 116.69979,79.724841 L 116.77516,79.404704 L 116.87813,79.113589 L 117.01941,78.858821 z "
3889 id="path1271"
3890 style="fill:url(#XMLID_59_);stroke:#000000;stroke-width:0.0882756" /><path
3891 d="M 118.35473,78.066848 L 117.81933,78.227884 L 117.3955,78.482287 L 117.06626,78.830756 L 116.84114,79.244995 L 116.68106,79.706019 L 116.59587,80.223198 L 116.56848,80.76075 L 116.58771,81.305526 L 116.64506,81.851974 L 116.73913,82.379008 L 116.84401,82.876992 L 116.97577,83.328406 L 117.09865,83.714775 L 117.21096,84.034446 L 117.31461,84.251391 L 117.38084,84.373311 L 117.55026,84.26051 L 117.70047,84.165969 L 117.85117,84.071899 L 118.00236,83.977336 L 118.15258,83.883727 L 118.28361,83.789163 L 118.4252,83.685483 L 118.56608,83.582048 L 118.49049,83.337476 L 118.38731,83.045648 L 118.28315,82.716396 L 118.18884,82.358091 L 118.07532,81.963789 L 117.97213,81.558672 L 117.88597,81.154758 L 117.80172,80.730692 L 117.74462,80.316461 L 117.71653,79.911589 L 117.71606,79.517038 L 117.75302,79.159471 L 117.82839,78.820119 L 117.95105,78.528045 L 118.12937,78.274872 L 118.35473,78.066848 z "
3892 id="path1290"
3893 style="fill:url(#XMLID_60_);stroke:#000000;stroke-width:0.0882756" /><path
3894 d="M 119.42748,77.595989 L 118.90959,77.70014 L 118.49609,77.888039 L 118.18579,78.142659 L 117.95012,78.471913 L 117.80011,78.848451 L 117.71635,79.272517 L 117.6878,79.714814 L 117.71734,80.194075 L 117.78357,80.675031 L 117.87669,81.163654 L 118.00002,81.64412 L 118.13226,82.113771 L 118.27386,82.537587 L 118.40608,82.932849 L 118.52824,83.261882 L 118.62231,83.544105 L 118.75429,83.450007 L 118.87623,83.384266 L 118.98902,83.32809 L 119.09246,83.280341 L 119.20575,83.223227 L 119.30894,83.176656 L 119.40372,83.120261 L 119.50595,83.054006 L 119.35573,82.734801 L 119.22373,82.395938 L 119.10111,82.065945 L 118.98855,81.727302 L 118.89472,81.369493 L 118.81792,81.031565 L 118.76128,80.673481 L 118.72335,80.315895 L 118.70441,79.958057 L 118.7229,79.601179 L 118.75983,79.262317 L 118.82609,78.913844 L 118.92903,78.565149 L 119.05141,78.235898 L 119.22062,77.915513 L 119.42748,77.595989 z "
3895 id="path1309"
3896 style="fill:url(#XMLID_61_);stroke:#000000;stroke-width:0.0882756" /><path
3897 d="M 120.63176,76.841933 L 120.25447,77.011845 L 119.91657,77.21921 L 119.6154,77.482481 L 119.37062,77.765657 L 119.15485,78.095159 L 118.98589,78.433527 L 118.86349,78.819428 L 118.76964,79.233882 L 118.72331,79.657483 L 118.72378,80.109145 L 118.75209,80.570416 L 118.81857,81.050633 L 118.92272,81.548866 L 119.08252,82.047563 L 119.2618,82.565234 L 119.48786,83.072094 L 119.62919,82.968656 L 119.76959,82.86571 L 119.92101,82.7709 L 120.08086,82.686412 L 120.23108,82.592807 L 120.39115,82.508096 L 120.54162,82.414245 L 120.68272,82.309606 L 120.56009,82.01849 L 120.42764,81.71755 L 120.30572,81.42523 L 120.17277,81.124285 L 120.06958,80.813731 L 119.97477,80.513036 L 119.89101,80.183043 L 119.8339,79.86337 L 119.81447,79.524752 L 119.80487,79.176774 L 119.84234,78.818717 L 119.90784,78.451766 L 120.02065,78.074981 L 120.17114,77.67972 L 120.37681,77.265754 L 120.63176,76.841933 z "
3898 id="path1328"
3899 style="fill:url(#XMLID_62_);stroke:#000000;stroke-width:0.0882756" /><path
3900 d="M 122.35296,75.777373 L 121.8456,75.985204 L 121.39299,76.248698 L 121.01717,76.531379 L 120.67879,76.851518 L 120.41505,77.20933 L 120.19882,77.595475 L 120.0392,77.999857 L 119.92639,78.433259 L 119.88033,78.874843 L 119.86185,79.344961 L 119.89926,79.816774 L 119.9756,80.305867 L 120.09847,80.804099 L 120.26742,81.293656 L 120.45656,81.783238 L 120.69149,82.281223 L 120.82324,82.187372 L 120.95501,82.093053 L 121.10594,81.980498 L 121.2473,81.875612 L 121.39752,81.763775 L 121.548,81.649772 L 121.69826,81.53697 L 121.84851,81.424171 L 121.65099,81.094648 L 121.50078,80.756276 L 121.35917,80.408297 L 121.25453,80.041588 L 121.17941,79.683755 L 121.13237,79.316554 L 121.10381,78.949599 L 121.11268,78.582181 L 121.15012,78.205866 L 121.2341,77.839649 L 121.33729,77.471494 L 121.46905,77.114149 L 121.64736,76.765926 L 121.84536,76.417231 L 122.08007,76.088444 L 122.35296,75.777373 z "
3901 id="path8947"
3902 style="fill:url(#XMLID_63_);stroke:#000000;stroke-width:0.0882756" /><path
3903 d="M 123.1142,75.165383 L 122.7475,75.438509 L 122.4096,75.720729 L 122.1089,76.040644 L 121.84517,76.380218 L 121.61981,76.756266 L 121.43167,77.133541 L 121.28141,77.528559 L 121.16839,77.943016 L 121.10287,78.366585 L 121.07578,78.808419 L 121.09422,79.241602 L 121.15112,79.693017 L 121.26463,80.144927 L 121.41559,80.596344 L 121.62247,81.048471 L 121.8778,81.490524 L 121.9995,81.38613 L 122.10293,81.30142 L 122.2157,81.226542 L 122.32896,81.15117 L 122.44152,81.075086 L 122.55453,81.000453 L 122.69567,80.915497 L 122.83631,80.812308 L 122.7233,80.379124 L 122.61028,79.96467 L 122.50564,79.598454 L 122.39356,79.240619 L 122.31772,78.901732 L 122.24258,78.581838 L 122.17631,78.271332 L 122.14729,77.961026 L 122.13768,77.650471 L 122.16553,77.339922 L 122.22264,77.019759 L 122.30713,76.690504 L 122.44728,76.341564 L 122.61648,75.984219 L 122.84207,75.58871 L 123.1142,75.165383 z "
3904 id="path1366"
3905 style="fill:url(#XMLID_64_);stroke:#000000;stroke-width:0.0882756" /><path
3906 d="M 124.80708,73.865618 L 124.22438,74.2237 L 123.71633,74.600233 L 123.29321,74.986383 L 122.95482,75.400615 L 122.68193,75.82419 L 122.47529,76.257371 L 122.32457,76.689815 L 122.24969,77.142903 L 122.20337,77.602972 L 122.21393,78.064241 L 122.26073,78.525759 L 122.33702,78.996148 L 122.44024,79.456927 L 122.57248,79.927562 L 122.7232,80.379222 L 122.88375,80.840223 L 122.96821,80.773994 L 123.08052,80.679923 L 123.22163,80.595435 L 123.36348,80.491041 L 123.5137,80.397431 L 123.6553,80.31201 L 123.76712,80.217939 L 123.86168,80.142592 L 123.71986,79.719975 L 123.61618,79.31486 L 123.54083,78.900623 L 123.49356,78.496709 L 123.46475,78.091617 L 123.4645,77.695862 L 123.50217,77.30156 L 123.55834,76.906074 L 123.64186,76.519684 L 123.74579,76.134028 L 123.87756,75.757493 L 124.0182,75.372085 L 124.18738,74.995057 L 124.37529,74.618985 L 124.58266,74.241715 L 124.80708,73.865618 z "
3907 id="path1385"
3908 style="fill:url(#XMLID_65_);stroke:#000000;stroke-width:0.0882756" /><path
3909 d="M 125.79416,73.064772 L 125.48503,73.280758 L 125.20231,73.545235 L 124.91073,73.855541 L 124.64771,74.213134 L 124.40292,74.590874 L 124.17782,75.004615 L 123.97984,75.446917 L 123.81088,75.917796 L 123.6705,76.416246 L 123.57737,76.924532 L 123.51211,77.441929 L 123.4845,77.978327 L 123.51257,78.515189 L 123.57856,79.051098 L 123.70168,79.587468 L 123.88049,80.123839 L 124.02209,80.001459 L 124.17184,79.889148 L 124.33239,79.785223 L 124.49198,79.681318 L 124.63262,79.596363 L 124.75598,79.530603 L 124.83998,79.464348 L 124.87766,79.408438 L 124.95349,79.106513 L 125.00893,78.78709 L 125.05573,78.438859 L 125.0934,78.062297 L 125.12148,77.676156 L 125.14859,77.271774 L 125.18603,76.857071 L 125.21409,76.434926 L 125.25176,76.001249 L 125.2885,75.568068 L 125.33578,75.126264 L 125.4013,74.702196 L 125.47593,74.269013 L 125.56018,73.86417 L 125.67273,73.45018 L 125.79416,73.064772 z "
3910 id="path1404"
3911 style="fill:url(#XMLID_66_);stroke:#000000;stroke-width:0.0882756" /><path
3912 d="M 127.40354,72.168646 L 126.9334,72.169361 L 126.53861,72.28287 L 126.21798,72.49021 L 125.96429,72.781792 L 125.75767,73.139602 L 125.59807,73.5819 L 125.48551,74.072219 L 125.38208,74.607635 L 125.3264,75.191287 L 125.26978,75.793666 L 125.23306,76.415461 L 125.19563,77.036543 L 125.14931,77.667012 L 125.08426,78.278758 L 125.00051,78.871306 L 124.88722,79.436725 L 125.00889,79.351523 L 125.14063,79.238014 L 125.30119,79.097616 L 125.46101,78.956017 L 125.62948,78.805791 L 125.79003,78.645242 L 125.93113,78.504131 L 126.0528,78.381478 L 126.11881,77.939181 L 126.16538,77.497592 L 126.21171,77.054337 L 126.23043,76.621868 L 126.25849,76.198294 L 126.27723,75.783126 L 126.29521,75.369627 L 126.32306,74.965002 L 126.37968,74.569984 L 126.43558,74.193916 L 126.5203,73.826494 L 126.63237,73.468659 L 126.77349,73.120186 L 126.94294,72.781324 L 127.15938,72.470304 L 127.40354,72.168646 z "
3913 id="path1423"
3914 style="fill:url(#XMLID_67_);stroke:#000000;stroke-width:0.0882756" /><path
3915 d="M 128.45684,71.339955 L 127.96748,71.642105 L 127.56311,71.971627 L 127.23409,72.338093 L 126.97083,72.734805 L 126.77403,73.15813 L 126.61419,73.600674 L 126.50189,74.071041 L 126.41742,74.551012 L 126.37036,75.049709 L 126.32429,75.5489 L 126.30556,76.056717 L 126.27725,76.574608 L 126.24004,77.082672 L 126.20357,77.590486 L 126.11862,78.089677 L 126.02573,78.579013 L 126.1282,78.49477 L 126.26906,78.390866 L 126.42987,78.268459 L 126.58033,78.135731 L 126.74042,78.014525 L 126.88105,77.909419 L 126.98449,77.826134 L 127.05047,77.778385 L 127.21894,77.608939 L 127.34202,77.39271 L 127.43515,77.1287 L 127.50137,76.818639 L 127.53856,76.479531 L 127.56689,76.09363 L 127.58466,75.679418 L 127.60312,75.246234 L 127.62231,74.794082 L 127.65015,74.315046 L 127.70608,73.825708 L 127.78093,73.336128 L 127.89253,72.827597 L 128.03367,72.328898 L 128.21245,71.829952 L 128.45684,71.339955 z "
3916 id="path1442"
3917 style="fill:url(#XMLID_68_);stroke:#000000;stroke-width:0.0882756" /><path
3918 d="M 129.97143,70.482956 L 129.42545,70.633426 L 128.97402,70.916848 L 128.61644,71.27444 L 128.32606,71.716491 L 128.10024,72.225513 L 127.93057,72.790194 L 127.80915,73.383459 L 127.71557,73.985347 L 127.66899,74.596843 L 127.6318,75.199469 L 127.58572,75.774009 L 127.54757,76.300304 L 127.50103,76.761576 L 127.4259,77.156371 L 127.31335,77.458053 L 127.16358,77.664925 L 127.33255,77.533864 L 127.52093,77.401163 L 127.72734,77.250444 L 127.94404,77.091347 L 128.15067,76.921191 L 128.36737,76.761109 L 128.55552,76.591686 L 128.72492,76.440505 L 128.78996,75.97948 L 128.83724,75.555904 L 128.86484,75.151526 L 128.89243,74.766361 L 128.9013,74.39845 L 128.9013,74.041326 L 128.91979,73.683024 L 128.93803,73.344627 L 128.96658,73.015126 L 129.02249,72.676018 L 129.09712,72.337402 L 129.20103,71.989665 L 129.33254,71.631611 L 129.50175,71.274267 L 129.71749,70.88763 L 129.97143,70.482956 z "
3919 id="path1461"
3920 style="fill:url(#XMLID_69_);stroke:#000000;stroke-width:0.0882756" /><g
3921 style="stroke:#000000"
3922 id="g1463"
3923 transform="matrix(0.2457491,-0.2457491,0.2457491,0.2457491,95.022367,94.120824)">
3924 <linearGradient
3925 id="linearGradient8956"
3926 gradientUnits="userSpaceOnUse"
3927 x1="-3581.9316"
3928 y1="-3602.7837"
3929 x2="-3565.7739"
3930 y2="-3551.2231"
3931 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
3932 <stop
3933 offset="0"
3934 style="stop-color:#D8E7EB"
3935 id="stop8958" />
3936 <stop
3937 offset="0.0684"
3938 style="stop-color:#D0DFE4"
3939 id="stop8960" />
3940 <stop
3941 offset="0.1761"
3942 style="stop-color:#B9CAD0"
3943 id="stop8962" />
3944 <stop
3945 offset="0.3096"
3946 style="stop-color:#94A7B0"
3947 id="stop8964" />
3948 <stop
3949 offset="0.4622"
3950 style="stop-color:#627784"
3951 id="stop8966" />
3952 <stop
3953 offset="0.5537"
3954 style="stop-color:#405766"
3955 id="stop8968" />
3956 <stop
3957 offset="0.6113"
3958 style="stop-color:#607682"
3959 id="stop8970" />
3960 <stop
3961 offset="0.6983"
3962 style="stop-color:#8B9EA8"
3963 id="stop8972" />
3964 <stop
3965 offset="0.7829"
3966 style="stop-color:#ADBEC5"
3967 id="stop8974" />
3968 <stop
3969 offset="0.8633"
3970 style="stop-color:#C5D5DA"
3971 id="stop8976" />
3972 <stop
3973 offset="0.9376"
3974 style="stop-color:#D3E2E7"
3975 id="stop8978" />
3976 <stop
3977 offset="1"
3978 style="stop-color:#D8E7EB"
3979 id="stop8980" />
3980 </linearGradient>
3981 <path
3982 d="M 26.439,7.3066 L 26.2476,6.9238 L 26.0943,6.5039 L 25.942,6.081 L 25.8287,5.6601 L 25.7115,5.2011 L 25.5963,4.705 L 25.5192,4.246 L 25.4772,3.746 L 25.4411,3.287 L 25.4059,2.789 L 25.4049,2.29 L 25.4401,1.83 L 25.4762,1.371 L 25.5563,0.914 L 25.6686,0.4531 L 25.7829,0.0332 L 25.1716,0.8369 L 24.7126,1.7187 L 24.3679,2.5605 L 24.1784,3.4404 L 24.1022,4.3593 L 24.1022,5.2792 L 24.2184,6.1972 L 24.4127,7.1171 L 24.6402,8.0351 L 24.9498,8.9179 L 25.2926,9.7968 L 25.6354,10.6767 L 26.0202,11.5195 L 26.3669,12.3613 L 26.7126,13.1259 L 27.0554,13.8925 L 26.9812,13.7402 L 26.9441,13.4716 L 26.9021,13.206 L 26.9021,12.8613 L 26.865,12.4375 L 26.865,12.0166 L 26.865,11.5566 L 26.864,11.0605 L 26.8259,10.5615 L 26.8259,10.0635 L 26.7868,9.5293 L 26.7468,9.0313 L 26.7087,8.5704 L 26.6306,8.1095 L 26.5554,7.6896 L 26.439,7.3066 z "
3983 id="path1490"
3984 style="fill:url(#XMLID_70_);stroke-width:0.25400001" />
3985 <linearGradient
3986 id="linearGradient8983"
3987 gradientUnits="userSpaceOnUse"
3988 x1="-3578.5146"
3989 y1="-3603.8545"
3990 x2="-3562.3569"
3991 y2="-3552.2939"
3992 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
3993 <stop
3994 offset="0"
3995 style="stop-color:#D8E7EB"
3996 id="stop8985" />
3997 <stop
3998 offset="0.0684"
3999 style="stop-color:#D0DFE4"
4000 id="stop8987" />
4001 <stop
4002 offset="0.1761"
4003 style="stop-color:#B9CAD0"
4004 id="stop8989" />
4005 <stop
4006 offset="0.3096"
4007 style="stop-color:#94A7B0"
4008 id="stop8991" />
4009 <stop
4010 offset="0.4622"
4011 style="stop-color:#627784"
4012 id="stop8993" />
4013 <stop
4014 offset="0.5537"
4015 style="stop-color:#405766"
4016 id="stop8995" />
4017 <stop
4018 offset="0.6113"
4019 style="stop-color:#607682"
4020 id="stop8997" />
4021 <stop
4022 offset="0.6983"
4023 style="stop-color:#8B9EA8"
4024 id="stop8999" />
4025 <stop
4026 offset="0.7829"
4027 style="stop-color:#ADBEC5"
4028 id="stop9001" />
4029 <stop
4030 offset="0.8633"
4031 style="stop-color:#C5D5DA"
4032 id="stop9003" />
4033 <stop
4034 offset="0.9376"
4035 style="stop-color:#D3E2E7"
4036 id="stop9005" />
4037 <stop
4038 offset="1"
4039 style="stop-color:#D8E7EB"
4040 id="stop9007" />
4041 </linearGradient>
4042 <path
4043 d="M 14.77,23.209 L 15.4985,22.7871 L 16.228,22.3252 L 16.9145,21.9043 L 17.603,21.4824 L 18.2534,21.0996 L 18.9048,20.6777 L 19.521,20.2929 L 20.1304,19.873 L 20.7837,19.4892 L 21.3911,19.1054 L 22.0054,18.6835 L 22.6577,18.3017 L 23.269,17.916 L 23.9233,17.5742 L 24.6088,17.1885 L 25.2983,16.8067 L 24.688,17.417 L 24.0747,18.0332 L 23.4614,18.6445 L 22.8491,19.2207 L 22.1987,19.832 L 21.5473,20.3681 L 20.8969,20.9072 L 20.2094,21.4043 L 19.558,21.8652 L 18.8685,22.248 L 18.2171,22.5937 L 17.5296,22.8623 L 16.8411,23.0918 L 16.1526,23.207 L 15.4612,23.2441 L 14.77,23.209 z "
4044 id="path1517"
4045 style="fill:url(#XMLID_71_);stroke-width:0.25400001" />
4046 <linearGradient
4047 id="linearGradient9010"
4048 gradientUnits="userSpaceOnUse"
4049 x1="-3580.1533"
4050 y1="-3603.3408"
4051 x2="-3563.9956"
4052 y2="-3551.7803"
4053 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
4054 <stop
4055 offset="0"
4056 style="stop-color:#D8E7EB"
4057 id="stop9012" />
4058 <stop
4059 offset="0.0684"
4060 style="stop-color:#D0DFE4"
4061 id="stop9014" />
4062 <stop
4063 offset="0.1761"
4064 style="stop-color:#B9CAD0"
4065 id="stop9016" />
4066 <stop
4067 offset="0.3096"
4068 style="stop-color:#94A7B0"
4069 id="stop9018" />
4070 <stop
4071 offset="0.4622"
4072 style="stop-color:#627784"
4073 id="stop9020" />
4074 <stop
4075 offset="0.5537"
4076 style="stop-color:#405766"
4077 id="stop9022" />
4078 <stop
4079 offset="0.6113"
4080 style="stop-color:#607682"
4081 id="stop9024" />
4082 <stop
4083 offset="0.6983"
4084 style="stop-color:#8B9EA8"
4085 id="stop9026" />
4086 <stop
4087 offset="0.7829"
4088 style="stop-color:#ADBEC5"
4089 id="stop9028" />
4090 <stop
4091 offset="0.8633"
4092 style="stop-color:#C5D5DA"
4093 id="stop9030" />
4094 <stop
4095 offset="0.9376"
4096 style="stop-color:#D3E2E7"
4097 id="stop9032" />
4098 <stop
4099 offset="1"
4100 style="stop-color:#D8E7EB"
4101 id="stop9034" />
4102 </linearGradient>
4103 <path
4104 d="M 20.4663,7.8848 L 21.0024,8.1924 L 21.5024,8.5371 L 21.9975,8.8818 L 22.4584,9.2256 L 22.9184,9.5684 L 23.3373,9.9512 L 23.7611,10.3721 L 24.184,10.7549 L 24.5678,11.1768 L 24.9516,11.5987 L 25.3334,12.0196 L 25.6781,12.4766 L 26.0609,12.8985 L 26.4066,13.3575 L 26.7504,13.8165 L 27.0961,14.2755 L 26.9428,13.7794 L 26.7504,13.2804 L 26.559,12.7433 L 26.3295,12.2091 L 26.0981,11.673 L 25.8335,11.174 L 25.5239,10.676 L 25.1782,10.1809 L 24.7964,9.7209 L 24.3355,9.34 L 23.8375,8.9552 L 23.3004,8.6114 L 22.688,8.3428 L 22.0357,8.1133 L 21.271,7.9609 L 20.4663,7.8848 z "
4105 id="path1544"
4106 style="fill:url(#XMLID_72_);stroke-width:0.25400001" />
4107 <linearGradient
4108 id="linearGradient9037"
4109 gradientUnits="userSpaceOnUse"
4110 x1="-3584.6758"
4111 y1="-3601.9238"
4112 x2="-3568.5181"
4113 y2="-3550.3633"
4114 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
4115 <stop
4116 offset="0"
4117 style="stop-color:#D8E7EB"
4118 id="stop9039" />
4119 <stop
4120 offset="0.0684"
4121 style="stop-color:#D0DFE4"
4122 id="stop9041" />
4123 <stop
4124 offset="0.1761"
4125 style="stop-color:#B9CAD0"
4126 id="stop9043" />
4127 <stop
4128 offset="0.3096"
4129 style="stop-color:#94A7B0"
4130 id="stop9045" />
4131 <stop
4132 offset="0.4622"
4133 style="stop-color:#627784"
4134 id="stop9047" />
4135 <stop
4136 offset="0.5537"
4137 style="stop-color:#405766"
4138 id="stop9049" />
4139 <stop
4140 offset="0.6113"
4141 style="stop-color:#607682"
4142 id="stop9051" />
4143 <stop
4144 offset="0.6983"
4145 style="stop-color:#8B9EA8"
4146 id="stop9053" />
4147 <stop
4148 offset="0.7829"
4149 style="stop-color:#ADBEC5"
4150 id="stop9055" />
4151 <stop
4152 offset="0.8633"
4153 style="stop-color:#C5D5DA"
4154 id="stop9057" />
4155 <stop
4156 offset="0.9376"
4157 style="stop-color:#D3E2E7"
4158 id="stop9059" />
4159 <stop
4160 offset="1"
4161 style="stop-color:#D8E7EB"
4162 id="stop9061" />
4163 </linearGradient>
4164 <path
4165 d="M 26.1694,3.2852 L 26.3227,4.0518 L 26.437,4.8184 L 26.5913,5.584 L 26.7075,6.2735 L 26.7837,7 L 26.898,7.6895 L 26.9771,8.379 L 27.0923,9.0685 L 27.1675,9.757 L 27.2847,10.4465 L 27.3589,11.0979 L 27.4771,11.7874 L 27.5933,12.4769 L 27.7085,13.1664 L 27.8589,13.8559 L 28.0161,14.5825 L 28.0923,13.7407 L 28.2056,12.936 L 28.2827,12.0932 L 28.3589,11.2524 L 28.396,10.4458 L 28.4341,9.6411 L 28.395,8.8359 L 28.356,8.0723 L 28.2798,7.3057 L 28.1636,6.6172 L 27.9693,5.9277 L 27.7408,5.2763 L 27.4722,4.7031 L 27.0884,4.166 L 26.6665,3.706 L 26.1694,3.2852 z "
4166 id="path1571"
4167 style="fill:url(#XMLID_73_);stroke-width:0.25400001" />
4168 <linearGradient
4169 id="linearGradient9064"
4170 gradientUnits="userSpaceOnUse"
4171 x1="-3578.7671"
4172 y1="-3603.7754"
4173 x2="-3562.6094"
4174 y2="-3552.2148"
4175 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
4176 <stop
4177 offset="0"
4178 style="stop-color:#D8E7EB"
4179 id="stop9066" />
4180 <stop
4181 offset="0.0684"
4182 style="stop-color:#D0DFE4"
4183 id="stop9068" />
4184 <stop
4185 offset="0.1761"
4186 style="stop-color:#B9CAD0"
4187 id="stop9070" />
4188 <stop
4189 offset="0.3096"
4190 style="stop-color:#94A7B0"
4191 id="stop9072" />
4192 <stop
4193 offset="0.4622"
4194 style="stop-color:#627784"
4195 id="stop9074" />
4196 <stop
4197 offset="0.5537"
4198 style="stop-color:#405766"
4199 id="stop9076" />
4200 <stop
4201 offset="0.6113"
4202 style="stop-color:#607682"
4203 id="stop9078" />
4204 <stop
4205 offset="0.6983"
4206 style="stop-color:#8B9EA8"
4207 id="stop9080" />
4208 <stop
4209 offset="0.7829"
4210 style="stop-color:#ADBEC5"
4211 id="stop9082" />
4212 <stop
4213 offset="0.8633"
4214 style="stop-color:#C5D5DA"
4215 id="stop9084" />
4216 <stop
4217 offset="0.9376"
4218 style="stop-color:#D3E2E7"
4219 id="stop9086" />
4220 <stop
4221 offset="1"
4222 style="stop-color:#D8E7EB"
4223 id="stop9088" />
4224 </linearGradient>
4225 <path
4226 d="M 22.0698,2.0273 L 21.3774,2.6757 L 20.9614,3.4042 L 20.772,4.1699 L 20.732,4.9746 L 20.8853,5.7783 L 21.1919,6.6211 L 21.6157,7.4649 L 22.1137,8.3057 L 22.6889,9.1094 L 23.3002,9.9141 L 23.9535,10.6778 L 24.6039,11.3673 L 25.2182,12.0568 L 25.7524,12.6291 L 26.2153,13.1682 L 26.5981,13.5862 L 26.521,12.4387 L 26.3667,11.4436 L 26.1753,10.5998 L 25.8687,9.8361 L 25.563,9.1855 L 25.1773,8.6113 L 24.7955,8.0761 L 24.3717,7.5751 L 23.9889,7.079 L 23.5649,6.584 L 23.1831,6.0459 L 22.8765,5.4326 L 22.5699,4.7441 L 22.3394,3.9785 L 22.145,3.0596 L 22.0698,2.0273 z "
4227 id="path1598"
4228 style="fill:url(#XMLID_74_);stroke-width:0.25400001" />
4229 <linearGradient
4230 id="linearGradient9091"
4231 gradientUnits="userSpaceOnUse"
4232 x1="-3584.2759"
4233 y1="-3602.0488"
4234 x2="-3568.1182"
4235 y2="-3550.4883"
4236 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
4237 <stop
4238 offset="0"
4239 style="stop-color:#D8E7EB"
4240 id="stop9093" />
4241 <stop
4242 offset="0.0684"
4243 style="stop-color:#D0DFE4"
4244 id="stop9095" />
4245 <stop
4246 offset="0.1761"
4247 style="stop-color:#B9CAD0"
4248 id="stop9097" />
4249 <stop
4250 offset="0.3096"
4251 style="stop-color:#94A7B0"
4252 id="stop9099" />
4253 <stop
4254 offset="0.4622"
4255 style="stop-color:#627784"
4256 id="stop9101" />
4257 <stop
4258 offset="0.5537"
4259 style="stop-color:#405766"
4260 id="stop9103" />
4261 <stop
4262 offset="0.6113"
4263 style="stop-color:#607682"
4264 id="stop9105" />
4265 <stop
4266 offset="0.6983"
4267 style="stop-color:#8B9EA8"
4268 id="stop9107" />
4269 <stop
4270 offset="0.7829"
4271 style="stop-color:#ADBEC5"
4272 id="stop9109" />
4273 <stop
4274 offset="0.8633"
4275 style="stop-color:#C5D5DA"
4276 id="stop9111" />
4277 <stop
4278 offset="0.9376"
4279 style="stop-color:#D3E2E7"
4280 id="stop9113" />
4281 <stop
4282 offset="1"
4283 style="stop-color:#D8E7EB"
4284 id="stop9115" />
4285 </linearGradient>
4286 <path
4287 d="M 26.981,14.2754 L 26.981,13.4707 L 27.0201,12.7061 L 27.0543,11.9415 L 27.0943,11.1739 L 27.1304,10.4454 L 27.2075,9.7188 L 27.2837,8.9922 L 27.3569,8.3008 L 27.4741,7.6123 L 27.5884,6.8848 L 27.7779,6.1953 L 27.9302,5.5058 L 28.1236,4.8163 L 28.3912,4.1268 L 28.6197,3.4373 L 28.9254,2.7478 L 27.814,3.8213 L 26.896,4.8555 L 26.2466,5.8145 L 25.7896,6.7344 L 25.481,7.6133 L 25.3296,8.418 L 25.2925,9.2237 L 25.3696,9.9503 L 25.5649,10.6398 L 25.7514,11.2912 L 26.0209,11.9045 L 26.2914,12.4377 L 26.5219,12.9748 L 26.7504,13.4338 L 26.9027,13.8928 L 26.981,14.2754 z "
4288 id="path1625"
4289 style="fill:url(#XMLID_75_);stroke-width:0.25400001" />
4290 <linearGradient
4291 id="linearGradient9118"
4292 gradientUnits="userSpaceOnUse"
4293 x1="-3577.7891"
4294 y1="-3604.082"
4295 x2="-3561.6313"
4296 y2="-3552.5215"
4297 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
4298 <stop
4299 offset="0"
4300 style="stop-color:#D8E7EB"
4301 id="stop9120" />
4302 <stop
4303 offset="0.0684"
4304 style="stop-color:#D0DFE4"
4305 id="stop9122" />
4306 <stop
4307 offset="0.1761"
4308 style="stop-color:#B9CAD0"
4309 id="stop9124" />
4310 <stop
4311 offset="0.3096"
4312 style="stop-color:#94A7B0"
4313 id="stop9126" />
4314 <stop
4315 offset="0.4622"
4316 style="stop-color:#627784"
4317 id="stop9128" />
4318 <stop
4319 offset="0.5537"
4320 style="stop-color:#405766"
4321 id="stop9130" />
4322 <stop
4323 offset="0.6113"
4324 style="stop-color:#607682"
4325 id="stop9132" />
4326 <stop
4327 offset="0.6983"
4328 style="stop-color:#8B9EA8"
4329 id="stop9134" />
4330 <stop
4331 offset="0.7829"
4332 style="stop-color:#ADBEC5"
4333 id="stop9136" />
4334 <stop
4335 offset="0.8633"
4336 style="stop-color:#C5D5DA"
4337 id="stop9138" />
4338 <stop
4339 offset="0.9376"
4340 style="stop-color:#D3E2E7"
4341 id="stop9140" />
4342 <stop
4343 offset="1"
4344 style="stop-color:#D8E7EB"
4345 id="stop9142" />
4346 </linearGradient>
4347 <path
4348 d="M 15.7612,16.9268 L 16.3745,17.0411 L 16.9888,17.1573 L 17.6001,17.2325 L 18.2134,17.3077 L 18.8228,17.3458 L 19.439,17.3829 L 20.0523,17.3829 L 20.6275,17.3829 L 21.2398,17.3438 L 21.814,17.3438 L 22.4273,17.2667 L 23.0005,17.2286 L 23.6138,17.1886 L 24.189,17.1134 L 24.8013,17.0343 L 25.3755,16.9591 L 24.8775,17.1876 L 24.3795,17.38 L 23.8443,17.6114 L 23.3072,17.8018 L 22.733,17.9952 L 22.1568,18.1466 L 21.5464,18.3029 L 20.936,18.3791 L 20.3237,18.4182 L 19.6723,18.4182 L 19.0209,18.3791 L 18.3666,18.2287 L 17.7152,18.0373 L 17.0638,17.7678 L 16.4154,17.3869 L 15.7612,16.9268 z "
4349 id="path1652"
4350 style="fill:url(#XMLID_76_);stroke-width:0.25400001" />
4351 <linearGradient
4352 id="linearGradient9145"
4353 gradientUnits="userSpaceOnUse"
4354 x1="-3576.8662"
4355 y1="-3604.3711"
4356 x2="-3560.7085"
4357 y2="-3552.8105"
4358 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
4359 <stop
4360 offset="0"
4361 style="stop-color:#D8E7EB"
4362 id="stop9147" />
4363 <stop
4364 offset="0.0684"
4365 style="stop-color:#D0DFE4"
4366 id="stop9149" />
4367 <stop
4368 offset="0.1761"
4369 style="stop-color:#B9CAD0"
4370 id="stop9151" />
4371 <stop
4372 offset="0.3096"
4373 style="stop-color:#94A7B0"
4374 id="stop9153" />
4375 <stop
4376 offset="0.4622"
4377 style="stop-color:#627784"
4378 id="stop9155" />
4379 <stop
4380 offset="0.5537"
4381 style="stop-color:#405766"
4382 id="stop9157" />
4383 <stop
4384 offset="0.6113"
4385 style="stop-color:#607682"
4386 id="stop9159" />
4387 <stop
4388 offset="0.6983"
4389 style="stop-color:#8B9EA8"
4390 id="stop9161" />
4391 <stop
4392 offset="0.7829"
4393 style="stop-color:#ADBEC5"
4394 id="stop9163" />
4395 <stop
4396 offset="0.8633"
4397 style="stop-color:#C5D5DA"
4398 id="stop9165" />
4399 <stop
4400 offset="0.9376"
4401 style="stop-color:#D3E2E7"
4402 id="stop9167" />
4403 <stop
4404 offset="1"
4405 style="stop-color:#D8E7EB"
4406 id="stop9169" />
4407 </linearGradient>
4408 <path
4409 d="M 19.3267,20.0264 L 18.8658,20.1045 L 18.4469,20.1797 L 17.986,20.2949 L 17.529,20.4101 L 17.0661,20.6044 L 16.61,20.7558 L 16.1891,20.9472 L 15.7291,21.1777 L 15.3082,21.4082 L 14.8873,21.6748 L 14.5045,21.9453 L 14.1188,22.2129 L 13.776,22.5195 L 13.4684,22.8261 L 13.1647,23.1327 L 12.8952,23.4765 L 13.1638,22.5595 L 13.5857,21.7519 L 14.0798,21.0644 L 14.695,20.413 L 15.3825,19.8769 L 16.1501,19.3769 L 16.9899,18.956 L 17.8717,18.6093 L 18.7906,18.3036 L 19.7095,17.997 L 20.6646,17.7656 L 21.6226,17.5351 L 22.5796,17.3417 L 23.4985,17.1522 L 24.3413,16.9979 L 25.1831,16.8055 L 24.7632,16.9588 L 24.189,17.2645 L 23.4986,17.7274 L 22.6578,18.2245 L 21.815,18.7597 L 20.9361,19.2587 L 20.0943,19.7187 L 19.3267,20.0264 z "
4410 id="path1679"
4411 style="fill:url(#XMLID_77_);stroke-width:0.25400001" />
4412 <linearGradient
4413 id="linearGradient9172"
4414 gradientUnits="userSpaceOnUse"
4415 x1="-3581.9321"
4416 y1="-3602.7837"
4417 x2="-3565.7744"
4418 y2="-3551.2231"
4419 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
4420 <stop
4421 offset="0"
4422 style="stop-color:#D8E7EB"
4423 id="stop9174" />
4424 <stop
4425 offset="0.0684"
4426 style="stop-color:#D0DFE4"
4427 id="stop9176" />
4428 <stop
4429 offset="0.1761"
4430 style="stop-color:#B9CAD0"
4431 id="stop9178" />
4432 <stop
4433 offset="0.3096"
4434 style="stop-color:#94A7B0"
4435 id="stop9180" />
4436 <stop
4437 offset="0.4622"
4438 style="stop-color:#627784"
4439 id="stop9182" />
4440 <stop
4441 offset="0.5537"
4442 style="stop-color:#405766"
4443 id="stop9184" />
4444 <stop
4445 offset="0.6113"
4446 style="stop-color:#607682"
4447 id="stop9186" />
4448 <stop
4449 offset="0.6983"
4450 style="stop-color:#8B9EA8"
4451 id="stop9188" />
4452 <stop
4453 offset="0.7829"
4454 style="stop-color:#ADBEC5"
4455 id="stop9190" />
4456 <stop
4457 offset="0.8633"
4458 style="stop-color:#C5D5DA"
4459 id="stop9192" />
4460 <stop
4461 offset="0.9376"
4462 style="stop-color:#D3E2E7"
4463 id="stop9194" />
4464 <stop
4465 offset="1"
4466 style="stop-color:#D8E7EB"
4467 id="stop9196" />
4468 </linearGradient>
4469 <path
4470 d="M 17.5698,26.8828 L 17.2251,26.0019 L 17.1079,25.1581 L 17.2241,24.3163 L 17.5307,23.5507 L 17.9877,22.8222 L 18.6,22.0956 L 19.3275,21.4052 L 20.1312,20.7909 L 20.974,20.1776 L 21.8529,19.6415 L 22.7718,19.1034 L 23.6556,18.6444 L 24.4564,18.2245 L 25.223,17.8378 L 25.8744,17.494 L 26.3705,17.1854 L 25.8383,18.2987 L 25.3002,19.2176 L 24.727,19.9451 L 24.1909,20.5974 L 23.6167,21.0954 L 23.0425,21.5544 L 22.4683,21.9001 L 21.8941,22.2458 L 21.3228,22.5915 L 20.7457,22.9733 L 20.1734,23.3571 L 19.6373,23.8171 L 19.1002,24.3903 L 18.565,25.0817 L 18.0679,25.8874 L 17.5698,26.8828 z "
4471 id="path1706"
4472 style="fill:url(#XMLID_78_);stroke-width:0.25400001" />
4473 <linearGradient
4474 id="linearGradient9199"
4475 gradientUnits="userSpaceOnUse"
4476 x1="-3586.4019"
4477 y1="-3601.3828"
4478 x2="-3570.2441"
4479 y2="-3549.8223"
4480 gradientTransform="matrix(-0.5318,4e-4,4e-4,0.5318,-1875.189,1917.5271)">
4481 <stop
4482 offset="0"
4483 style="stop-color:#D8E7EB"
4484 id="stop9201" />
4485 <stop
4486 offset="0.0684"
4487 style="stop-color:#D0DFE4"
4488 id="stop9203" />
4489 <stop
4490 offset="0.1761"
4491 style="stop-color:#B9CAD0"
4492 id="stop9205" />
4493 <stop
4494 offset="0.3096"
4495 style="stop-color:#94A7B0"
4496 id="stop9207" />
4497 <stop
4498 offset="0.4622"
4499 style="stop-color:#627784"
4500 id="stop9209" />
4501 <stop
4502 offset="0.5537"
4503 style="stop-color:#405766"
4504 id="stop9211" />
4505 <stop
4506 offset="0.6113"
4507 style="stop-color:#607682"
4508 id="stop9213" />
4509 <stop
4510 offset="0.6983"
4511 style="stop-color:#8B9EA8"
4512 id="stop9215" />
4513 <stop
4514 offset="0.7829"
4515 style="stop-color:#ADBEC5"
4516 id="stop9217" />
4517 <stop
4518 offset="0.8633"
4519 style="stop-color:#C5D5DA"
4520 id="stop9219" />
4521 <stop
4522 offset="0.9376"
4523 style="stop-color:#D3E2E7"
4524 id="stop9221" />
4525 <stop
4526 offset="1"
4527 style="stop-color:#D8E7EB"
4528 id="stop9223" />
4529 </linearGradient>
4530 <path
4531 d="M 27.2144,17.4531 L 26.7925,18.1054 L 26.3726,18.7577 L 25.9517,19.4081 L 25.5718,20.0214 L 25.2261,20.6347 L 24.8814,21.2841 L 24.5357,21.8984 L 24.231,22.5117 L 23.9273,23.164 L 23.6597,23.8144 L 23.3882,24.4277 L 23.1597,25.0781 L 22.9693,25.7676 L 22.7769,26.418 L 22.6246,27.1075 L 22.4723,27.838 L 22.0885,26.3829 L 21.8971,25.1183 L 21.8561,23.9689 L 21.9713,22.9748 L 22.1998,22.131 L 22.5455,21.3654 L 22.9654,20.7131 L 23.4234,20.176 L 23.9615,19.677 L 24.4967,19.2561 L 25.0699,18.9104 L 25.607,18.6047 L 26.1031,18.299 L 26.5631,18.0314 L 26.9049,17.7248 L 27.2144,17.4531 z "
4532 id="path1733"
4533 style="fill:url(#XMLID_79_);stroke-width:0.25400001" />
4534 </g><g
4535 id="g659">
4536 </g>
4537
4538 <clipPath
4539 id="clippath2"> <path
4540 d="M 206.512,283.82 L 206.512,283.82 L 204.824,284.09 L 203.254,284.129 L 201.801,283.977 L 200.422,283.633 L 199.16,283.098 L 198.008,282.445 L 196.859,281.68 L 195.824,280.801 L 194.789,279.883 L 193.832,278.887 L 192.871,277.816 L 191.953,276.785 L 190.996,275.75 L 190.035,274.719 L 189.078,273.762 L 188.082,272.883 L 188.082,272.883 L 188.465,272.918 L 188.887,272.918 L 189.27,272.957 L 189.691,272.992 L 190.113,273.031 L 190.535,273.109 L 190.953,273.145 L 191.375,273.184 L 191.793,273.223 L 192.258,273.262 L 192.676,273.34 L 193.137,273.375 L 193.598,273.414 L 194.055,273.449 L 194.516,273.449 L 194.973,273.488 L 194.973,273.488 L 195.32,274.023 L 195.742,274.676 L 196.277,275.441 L 196.852,276.242 L 197.543,277.125 L 198.273,278.004 L 199.039,278.922 L 199.844,279.801 L 200.688,280.645 L 201.527,281.449 L 202.41,282.176 L 203.254,282.789 L 204.098,283.285 L 204.941,283.629 L 205.742,283.82 L 206.512,283.82"
4541 id="path666" />
4542 </clipPath>
4543 <g
4544 id="g668">
4545 </g>
4546
4547 <clipPath
4548 id="clippath3"> <path
4549 d="M 209.652,283.855 L 209.652,283.855 L 208.695,283.973 L 207.773,284.047 L 206.895,284.012 L 206.012,283.855 L 205.133,283.668 L 204.289,283.324 L 203.449,282.902 L 202.566,282.328 L 201.684,281.676 L 200.801,280.914 L 199.918,279.992 L 199,279 L 198.043,277.812 L 197.043,276.512 L 196.012,275.098 L 194.938,273.488 L 194.938,273.488 L 195.59,273.488 L 196.203,273.527 L 196.816,273.602 L 197.426,273.676 L 198.039,273.758 L 198.652,273.828 L 199.34,273.828 L 200.066,273.828 L 200.066,273.828 L 200.684,274.441 L 201.258,275.09 L 201.832,275.707 L 202.406,276.395 L 202.98,277.043 L 203.52,277.734 L 204.094,278.383 L 204.633,279.074 L 205.207,279.723 L 205.781,280.371 L 206.391,281.023 L 206.969,281.637 L 207.617,282.211 L 208.273,282.785 L 208.922,283.355 L 209.652,283.855"
4550 id="path675" />
4551 </clipPath>
4552 <g
4553 id="g677">
4554 </g>
4555
4556 <clipPath
4557 id="clippath4"> <path
4558 d="M 212.98,283.699 L 212.98,283.699 L 212.254,284.086 L 211.453,284.238 L 210.645,284.121 L 209.805,283.781 L 208.961,283.281 L 208.078,282.59 L 207.199,281.828 L 206.316,280.91 L 205.473,279.953 L 204.59,278.957 L 203.785,277.961 L 202.98,277.004 L 202.215,276.047 L 201.484,275.207 L 200.797,274.48 L 200.184,273.863 L 204.281,274.246 L 204.281,274.246 L 204.742,274.859 L 205.242,275.469 L 205.699,276.086 L 206.16,276.695 L 206.617,277.309 L 207.117,277.922 L 207.578,278.535 L 208.078,279.145 L 208.613,279.719 L 209.152,280.332 L 209.688,280.91 L 210.301,281.477 L 210.91,282.051 L 211.566,282.629 L 212.254,283.164 L 212.98,283.699"
4559 id="path684" />
4560 </clipPath>
4561 <g
4562 id="g686">
4563 </g>
4564
4565 <clipPath
4566 id="clippath5"> <path
4567 d="M 219.762,282.812 L 219.762,282.812 L 218.574,283.578 L 217.465,284.078 L 216.352,284.309 L 215.277,284.309 L 214.246,284.117 L 213.25,283.777 L 212.293,283.238 L 211.336,282.551 L 210.414,281.711 L 209.492,280.832 L 208.652,279.793 L 207.77,278.762 L 206.926,277.652 L 206.082,276.508 L 205.277,275.359 L 204.473,274.246 L 204.473,274.246 L 205.16,274.363 L 205.699,274.398 L 206.082,274.434 L 206.426,274.477 L 206.77,274.516 L 207.113,274.512 L 207.613,274.551 L 208.266,274.586 L 208.266,274.586 L 208.992,275.621 L 209.723,276.578 L 210.449,277.496 L 211.254,278.301 L 212.02,279.027 L 212.785,279.719 L 213.59,280.328 L 214.359,280.863 L 215.125,281.324 L 215.891,281.742 L 216.617,282.047 L 217.309,282.316 L 217.996,282.547 L 218.609,282.699 L 219.223,282.773 L 219.762,282.812"
4568 id="path693" />
4569 </clipPath>
4570 <g
4571 id="g695">
4572 </g>
4573
4574 <clipPath
4575 id="clippath6"> <path
4576 d="M 223.324,282.578 L 223.324,282.578 L 222.094,282.73 L 220.91,282.812 L 219.762,282.734 L 218.652,282.582 L 217.613,282.355 L 216.582,282.012 L 215.621,281.59 L 214.703,281.055 L 213.785,280.48 L 212.941,279.832 L 212.098,279.105 L 211.293,278.301 L 210.527,277.461 L 209.797,276.543 L 209.07,275.582 L 208.379,274.59 L 208.379,274.59 L 209.145,274.629 L 209.988,274.703 L 210.789,274.777 L 211.598,274.816 L 212.285,274.895 L 212.859,274.926 L 213.281,274.969 L 213.473,275.004 L 213.473,275.004 L 213.551,275.082 L 213.742,275.273 L 214.047,275.578 L 214.43,275.965 L 214.852,276.422 L 215.391,276.957 L 215.965,277.531 L 216.613,278.145 L 217.348,278.754 L 218.109,279.406 L 218.879,280.02 L 219.719,280.633 L 220.598,281.203 L 221.48,281.738 L 222.402,282.199 L 223.324,282.578"
4577 id="path702" />
4578 </clipPath>
4579 <g
4580 id="g704">
4581 </g>
4582
4583 <clipPath
4584 id="clippath7"> <path
4585 d="M 228.031,282.844 L 228.031,282.844 L 226.996,283.039 L 226,283.113 L 224.969,283 L 223.934,282.809 L 222.898,282.469 L 221.906,282.043 L 220.91,281.547 L 219.91,280.934 L 218.992,280.289 L 218.074,279.598 L 217.191,278.871 L 216.348,278.105 L 215.543,277.301 L 214.777,276.539 L 214.086,275.77 L 213.434,275.004 L 213.434,275.004 L 213.895,275.004 L 214.469,275.043 L 215.082,275.082 L 215.73,275.082 L 216.348,275.152 L 216.961,275.195 L 217.457,275.234 L 217.879,275.309 L 217.879,275.309 L 217.879,275.387 L 218.031,275.574 L 218.414,275.922 L 218.949,276.344 L 219.602,276.879 L 220.367,277.488 L 221.211,278.145 L 222.094,278.789 L 223.012,279.477 L 223.93,280.168 L 224.813,280.82 L 225.656,281.395 L 226.422,281.926 L 227.113,282.348 L 227.648,282.652 L 228.031,282.844"
4586 id="path711" />
4587 </clipPath>
4588 <g
4589 id="g713">
4590 </g>
4591
4592 <clipPath
4593 id="clippath8"> <path
4594 d="M 233.238,282.801 L 233.238,282.801 L 232.629,283.414 L 231.824,283.68 L 230.828,283.684 L 229.715,283.418 L 228.488,282.957 L 227.227,282.309 L 225.887,281.543 L 224.582,280.703 L 223.281,279.785 L 222.055,278.867 L 220.906,277.988 L 219.906,277.145 L 219.027,276.457 L 218.375,275.883 L 217.953,275.504 L 217.762,275.348 L 217.762,275.348 L 218.258,275.387 L 218.797,275.422 L 219.371,275.461 L 219.945,275.5 L 220.559,275.574 L 221.172,275.613 L 221.785,275.652 L 222.434,275.691 L 222.434,275.691 L 223.125,276.109 L 223.813,276.531 L 224.504,276.988 L 225.156,277.41 L 225.844,277.867 L 226.496,278.363 L 227.184,278.824 L 227.84,279.285 L 228.527,279.781 L 229.18,280.238 L 229.867,280.699 L 230.52,281.156 L 231.207,281.578 L 231.898,282 L 232.551,282.418 L 233.238,282.801"
4595 id="path720" />
4596 </clipPath>
4597 <g
4598 id="g722">
4599 </g>
4600
4601 <clipPath
4602 id="clippath9"> <path
4603 d="M 237.07,282.875 L 237.07,282.875 L 236.418,283.105 L 235.77,283.223 L 235.117,283.258 L 234.465,283.148 L 233.813,282.953 L 233.125,282.687 L 232.398,282.305 L 231.629,281.848 L 230.785,281.309 L 229.906,280.699 L 228.949,280.012 L 227.875,279.285 L 226.688,278.48 L 225.422,277.602 L 224.043,276.68 L 222.512,275.727 L 222.512,275.727 L 223.164,275.766 L 223.891,275.801 L 224.539,275.801 L 225.191,275.84 L 225.766,275.879 L 226.223,275.914 L 226.531,275.953 L 226.648,275.988 L 226.648,275.988 L 226.723,276.105 L 226.953,276.336 L 227.301,276.645 L 227.758,277.062 L 228.332,277.559 L 228.984,278.137 L 229.676,278.707 L 230.48,279.32 L 231.285,279.93 L 232.129,280.543 L 232.969,281.078 L 233.852,281.613 L 234.695,282.074 L 235.535,282.457 L 236.34,282.723 L 237.07,282.875"
4604 id="path729" />
4605 </clipPath>
4606 <g
4607 id="g731">
4608 </g>
4609
4610 <clipPath
4611 id="clippath10"> <path
4612 d="M 239.863,282.453 L 239.863,282.453 L 238.984,282.797 L 238.027,282.949 L 237.031,282.879 L 235.996,282.609 L 234.922,282.227 L 233.891,281.727 L 232.816,281.121 L 231.82,280.43 L 230.863,279.742 L 229.941,279.016 L 229.098,278.328 L 228.371,277.637 L 227.719,277.062 L 227.223,276.562 L 226.84,276.223 L 226.609,275.988 L 226.609,275.988 L 227.223,276.027 L 227.797,276.07 L 228.371,276.105 L 228.945,276.18 L 229.48,276.223 L 230.059,276.258 L 230.668,276.332 L 231.281,276.371 L 231.281,276.371 L 231.738,276.793 L 232.242,277.211 L 232.699,277.633 L 233.199,278.094 L 233.695,278.59 L 234.195,279.051 L 234.73,279.508 L 235.27,279.965 L 235.805,280.391 L 236.34,280.809 L 236.914,281.191 L 237.449,281.535 L 238.063,281.844 L 238.637,282.105 L 239.25,282.301 L 239.863,282.453"
4613 id="path738" />
4614 </clipPath>
4615 <g
4616 id="g740">
4617 </g>
4618
4619 <clipPath
4620 id="clippath11"> <path
4621 d="M 243.23,281.379 L 243.23,281.379 L 242.469,281.914 L 241.703,282.258 L 240.938,282.414 L 240.172,282.453 L 239.402,282.301 L 238.637,282.07 L 237.871,281.727 L 237.105,281.27 L 236.379,280.77 L 235.613,280.16 L 234.883,279.547 L 234.156,278.895 L 233.426,278.246 L 232.738,277.555 L 232.047,276.906 L 231.355,276.293 L 231.355,276.293 L 231.781,276.332 L 232.238,276.41 L 232.734,276.445 L 233.273,276.484 L 233.848,276.559 L 234.461,276.598 L 235.109,276.672 L 235.723,276.711 L 235.723,276.711 L 235.914,277.172 L 236.184,277.629 L 236.453,278.051 L 236.797,278.473 L 237.18,278.855 L 237.602,279.238 L 238.023,279.617 L 238.52,279.926 L 239.02,280.23 L 239.555,280.5 L 240.129,280.77 L 240.707,280.961 L 241.32,281.113 L 241.93,281.266 L 242.582,281.34 L 243.23,281.379"
4622 id="path747" />
4623 </clipPath>
4624 <g
4625 id="g749">
4626 </g>
4627
4628 <clipPath
4629 id="clippath12"> <path
4630 d="M 246.262,280.801 L 246.262,280.801 L 245.605,281.07 L 244.918,281.262 L 244.191,281.379 L 243.387,281.379 L 242.617,281.34 L 241.777,281.227 L 240.973,280.996 L 240.168,280.77 L 239.402,280.422 L 238.672,280.039 L 238.023,279.617 L 237.41,279.125 L 236.871,278.586 L 236.414,278.016 L 236.07,277.363 L 235.84,276.711 L 235.84,276.711 L 236.297,276.75 L 236.758,276.789 L 237.254,276.863 L 237.754,276.902 L 238.25,276.941 L 238.785,276.977 L 239.359,277.016 L 240.016,277.051 L 240.016,277.051 L 240.281,277.434 L 240.59,277.82 L 240.895,278.164 L 241.199,278.469 L 241.508,278.777 L 241.855,279.004 L 242.199,279.273 L 242.543,279.504 L 242.926,279.695 L 243.348,279.883 L 243.77,280.074 L 244.227,280.227 L 244.688,280.383 L 245.184,280.535 L 245.723,280.648 L 246.262,280.801"
4631 id="path756" />
4632 </clipPath>
4633 <g
4634 id="g758">
4635 </g>
4636
4637 <clipPath
4638 id="clippath13"> <path
4639 d="M 247.828,280.266 L 247.828,280.266 L 247.598,280.492 L 247.254,280.648 L 246.832,280.727 L 246.371,280.727 L 245.836,280.687 L 245.262,280.574 L 244.648,280.383 L 244.035,280.191 L 243.422,279.922 L 242.809,279.613 L 242.199,279.273 L 241.66,278.891 L 241.121,278.469 L 240.664,278.012 L 240.32,277.551 L 240.016,277.051 L 240.016,277.051 L 240.355,277.094 L 240.664,277.129 L 241.047,277.168 L 241.395,277.207 L 241.734,277.242 L 242.043,277.281 L 242.309,277.281 L 242.578,277.281 L 242.578,277.281 L 242.578,277.434 L 242.922,277.738 L 243.5,278.16 L 244.266,278.66 L 245.184,279.156 L 246.105,279.652 L 247.023,280.035 L 247.828,280.266"
4640 id="path765" />
4641 </clipPath>
4642 <g
4643 id="g767">
4644 </g>
4645
4646 <clipPath
4647 id="clippath14"> <path
4648 d="M 248.938,279.191 L 248.938,279.191 L 248.859,279.727 L 248.668,280.07 L 248.363,280.227 L 247.945,280.301 L 247.445,280.227 L 246.871,280.035 L 246.297,279.805 L 245.684,279.5 L 245.07,279.156 L 244.496,278.773 L 243.918,278.395 L 243.461,278.047 L 243.039,277.738 L 242.73,277.512 L 242.539,277.359 L 242.504,277.281 L 245.488,277.621 L 245.488,277.621 L 245.719,277.773 L 246.102,277.969 L 246.563,278.199 L 247.063,278.426 L 247.598,278.695 L 248.094,278.883 L 248.555,279.078 L 248.938,279.191"
4649 id="path774" />
4650 </clipPath>
4651 <g
4652 id="g776">
4653 </g>
4654
4655 <clipPath
4656 id="clippath15"> <path
4657 d="M 245.488,277.621 L 245.488,277.621 L 246.141,277.66 L 246.715,277.699 L 247.25,277.738 L 247.75,277.773 L 248.211,277.812 L 248.707,277.852 L 249.203,277.926 L 249.781,278.043 L 249.781,278.043 L 249.664,278.77 L 249.32,279.113 L 248.785,279.113 L 248.094,278.926 L 247.406,278.578 L 246.676,278.195 L 246.027,277.852 L 245.488,277.621"
4658 id="path783" />
4659 </clipPath>
4660 <g
4661 id="g785">
4662 </g>
4663
4664 <clipPath
4665 id="clippath16"> <path
4666 d="M 245.602,277.242 L 245.602,277.242 L 246.258,277.355 L 246.832,277.43 L 247.328,277.547 L 247.828,277.621 L 248.285,277.734 L 248.742,277.852 L 249.281,277.965 L 249.855,278.117 L 249.855,278.117 L 249.895,277.277 L 249.547,276.738 L 249.012,276.547 L 248.285,276.512 L 247.52,276.664 L 246.754,276.859 L 246.102,277.09 L 245.602,277.242"
4667 id="path792" />
4668 </clipPath>
4669 <g
4670 id="g794">
4671 </g>
4672
4673 <clipPath
4674 id="clippath17"> <path
4675 d="M 205.766,261.684 L 205.766,261.684 L 204.082,261.035 L 202.469,260.613 L 200.977,260.504 L 199.563,260.578 L 198.184,260.887 L 196.922,261.348 L 195.695,261.996 L 194.508,262.727 L 193.359,263.57 L 192.246,264.488 L 191.141,265.449 L 190.066,266.445 L 188.996,267.402 L 187.926,268.324 L 186.852,269.207 L 185.742,270.012 L 185.742,270.012 L 186.129,270.086 L 186.547,270.121 L 186.969,270.164 L 187.352,270.238 L 187.773,270.277 L 188.195,270.316 L 188.656,270.352 L 189.074,270.391 L 189.496,270.426 L 189.957,270.469 L 190.414,270.543 L 190.84,270.582 L 191.297,270.621 L 191.758,270.695 L 192.254,270.73 L 192.715,270.809 L 192.715,270.809 L 193.098,270.273 L 193.594,269.621 L 194.168,268.855 L 194.855,268.09 L 195.582,267.246 L 196.387,266.402 L 197.266,265.559 L 198.148,264.754 L 199.105,263.988 L 200.059,263.301 L 201.016,262.684 L 202.016,262.187 L 202.973,261.801 L 203.93,261.574 L 204.887,261.535 L 205.766,261.684"
4676 id="path801" />
4677 </clipPath>
4678 <g
4679 id="g803">
4680 </g>
4681
4682 <clipPath
4683 id="clippath18"> <path
4684 d="M 208.641,262.332 L 208.641,262.332 L 207.68,261.988 L 206.762,261.762 L 205.883,261.609 L 204.961,261.531 L 204.082,261.609 L 203.16,261.801 L 202.281,262.07 L 201.363,262.492 L 200.406,263.031 L 199.449,263.723 L 198.453,264.523 L 197.418,265.484 L 196.313,266.594 L 195.203,267.859 L 194.016,269.277 L 192.754,270.848 L 192.754,270.848 L 193.441,270.887 L 194.055,270.961 L 194.629,270.996 L 195.164,271.074 L 195.703,271.152 L 196.273,271.227 L 196.926,271.34 L 197.691,271.457 L 197.691,271.457 L 198.383,270.883 L 199.031,270.27 L 199.723,269.617 L 200.336,268.965 L 200.984,268.312 L 201.633,267.664 L 202.246,267.012 L 202.898,266.359 L 203.547,265.746 L 204.234,265.172 L 204.887,264.598 L 205.578,264.023 L 206.301,263.523 L 207.07,263.102 L 207.836,262.68 L 208.641,262.332"
4685 id="path810" />
4686 </clipPath>
4687 <g
4688 id="g812">
4689 </g>
4690
4691 <clipPath
4692 id="clippath19"> <path
4693 d="M 211.855,263.176 L 211.855,263.176 L 211.129,262.562 L 210.32,262.258 L 209.441,262.18 L 208.563,262.371 L 207.605,262.758 L 206.648,263.293 L 205.652,263.984 L 204.656,264.789 L 203.66,265.668 L 202.668,266.59 L 201.711,267.512 L 200.793,268.469 L 199.953,269.348 L 199.109,270.152 L 198.383,270.844 L 197.691,271.418 L 201.867,271.914 L 201.867,271.914 L 202.402,271.301 L 202.941,270.684 L 203.438,270.074 L 203.973,269.461 L 204.508,268.848 L 205.043,268.195 L 205.613,267.582 L 206.152,267.008 L 206.766,266.43 L 207.375,265.859 L 208.027,265.32 L 208.715,264.824 L 209.445,264.324 L 210.211,263.902 L 211.012,263.52 L 211.855,263.176"
4694 id="path819" />
4695 </clipPath>
4696 <g
4697 id="g821">
4698 </g>
4699
4700 <clipPath
4701 id="clippath20"> <path
4702 d="M 218.137,265.852 L 218.137,265.852 L 216.988,264.738 L 215.879,263.937 L 214.805,263.402 L 213.73,263.098 L 212.66,263.059 L 211.625,263.215 L 210.594,263.598 L 209.598,264.098 L 208.602,264.785 L 207.648,265.59 L 206.652,266.512 L 205.73,267.508 L 204.773,268.543 L 203.855,269.652 L 202.941,270.766 L 202.02,271.875 L 202.02,271.875 L 202.75,271.914 L 203.359,271.949 L 203.859,271.988 L 204.32,272.066 L 204.777,272.141 L 205.277,272.215 L 205.848,272.293 L 206.539,272.406 L 206.539,272.406 L 207.344,271.371 L 208.109,270.453 L 208.91,269.609 L 209.676,268.883 L 210.48,268.27 L 211.246,267.73 L 211.973,267.27 L 212.738,266.891 L 213.469,266.582 L 214.156,266.352 L 214.883,266.16 L 215.574,266.008 L 216.223,265.926 L 216.875,265.852 L 217.523,265.852 L 218.137,265.852"
4703 id="path828" />
4704 </clipPath>
4705 <g
4706 id="g830">
4707 </g>
4708
4709 <clipPath
4710 id="clippath21"> <path
4711 d="M 222.621,266.883 L 222.621,266.883 L 221.352,266.422 L 220.168,266.078 L 218.98,265.887 L 217.871,265.809 L 216.762,265.809 L 215.688,265.965 L 214.652,266.199 L 213.656,266.543 L 212.66,266.965 L 211.742,267.5 L 210.785,268.113 L 209.91,268.801 L 209.027,269.57 L 208.188,270.414 L 207.383,271.336 L 206.578,272.332 L 206.578,272.332 L 207.383,272.445 L 208.301,272.555 L 209.223,272.672 L 210.141,272.789 L 210.984,272.902 L 211.633,272.98 L 212.133,273.016 L 212.324,273.016 L 212.324,273.016 L 212.398,272.937 L 212.59,272.785 L 212.859,272.52 L 213.203,272.172 L 213.625,271.754 L 214.121,271.328 L 214.695,270.832 L 215.348,270.293 L 216.074,269.797 L 216.84,269.258 L 217.68,268.758 L 218.598,268.266 L 219.52,267.84 L 220.512,267.457 L 221.547,267.109 L 222.621,266.883"
4712 id="path837" />
4713 </clipPath>
4714 <g
4715 id="g839">
4716 </g>
4717
4718 <clipPath
4719 id="clippath22"> <path
4720 d="M 228.285,267.566 L 228.285,267.566 L 227.254,267.148 L 226.219,266.84 L 225.109,266.727 L 224.035,266.727 L 222.926,266.844 L 221.816,267.074 L 220.742,267.418 L 219.633,267.84 L 218.598,268.34 L 217.566,268.875 L 216.57,269.527 L 215.578,270.176 L 214.695,270.871 L 213.816,271.598 L 213.012,272.324 L 212.285,273.055 L 212.285,273.055 L 212.781,273.094 L 213.395,273.168 L 214.086,273.242 L 214.813,273.32 L 215.539,273.395 L 216.191,273.473 L 216.766,273.508 L 217.188,273.508 L 217.188,273.508 L 217.188,273.43 L 217.379,273.238 L 217.762,272.973 L 218.336,272.59 L 219.023,272.168 L 219.828,271.668 L 220.746,271.133 L 221.703,270.598 L 222.66,270.059 L 223.656,269.52 L 224.652,269.023 L 225.57,268.566 L 226.414,268.18 L 227.176,267.871 L 227.793,267.68 L 228.285,267.566"
4721 id="path846" />
4722 </clipPath>
4723 <g
4724 id="g848">
4725 </g>
4726
4727 <clipPath
4728 id="clippath23"> <path
4729 d="M 233.648,268.672 L 233.648,268.672 L 233.113,267.832 L 232.309,267.332 L 231.313,267.145 L 230.164,267.145 L 228.902,267.414 L 227.523,267.836 L 226.066,268.41 L 224.652,269.062 L 223.195,269.793 L 221.855,270.555 L 220.594,271.289 L 219.445,272.016 L 218.488,272.629 L 217.762,273.09 L 217.266,273.43 L 217.035,273.547 L 217.035,273.547 L 217.57,273.664 L 218.18,273.738 L 218.836,273.812 L 219.523,273.852 L 220.25,273.93 L 220.98,274.004 L 221.668,274.117 L 222.355,274.234 L 222.355,274.234 L 223.086,273.891 L 223.773,273.543 L 224.504,273.16 L 225.191,272.777 L 225.879,272.43 L 226.57,272.047 L 227.297,271.664 L 227.984,271.281 L 228.672,270.93 L 229.359,270.551 L 230.09,270.207 L 230.777,269.902 L 231.469,269.551 L 232.195,269.246 L 232.922,268.937 L 233.648,268.672"
4730 id="path855" />
4731 </clipPath>
4732 <g
4733 id="g857">
4734 </g>
4735
4736 <clipPath
4737 id="clippath24"> <path
4738 d="M 240.504,270.504 L 240.504,270.504 L 239.66,269.895 L 238.707,269.547 L 237.711,269.398 L 236.641,269.398 L 235.527,269.59 L 234.379,269.934 L 233.266,270.355 L 232.16,270.855 L 231.121,271.43 L 230.09,272.008 L 229.172,272.621 L 228.367,273.156 L 227.641,273.656 L 227.066,274.113 L 226.648,274.422 L 226.418,274.613 L 226.418,274.613 L 227.031,274.687 L 227.605,274.766 L 228.098,274.844 L 228.602,274.879 L 229.137,274.918 L 229.633,274.992 L 230.211,275.07 L 230.82,275.145 L 230.82,275.145 L 231.355,274.801 L 231.895,274.453 L 232.469,274.035 L 233.039,273.652 L 233.652,273.23 L 234.266,272.809 L 234.879,272.387 L 235.488,272.004 L 236.105,271.617 L 236.754,271.309 L 237.406,271.008 L 238.02,270.773 L 238.664,270.586 L 239.281,270.469 L 239.895,270.469 L 240.504,270.504"
4739 id="path864" />
4740 </clipPath>
4741 <g
4742 id="g866">
4743 </g>
4744
4745 <clipPath
4746 id="clippath25"> <path
4747 d="M 237.977,269.398 L 237.977,269.398 L 237.324,268.977 L 236.637,268.672 L 235.984,268.516 L 235.297,268.441 L 234.605,268.48 L 233.879,268.633 L 233.074,268.902 L 232.234,269.207 L 231.316,269.633 L 230.355,270.09 L 229.285,270.668 L 228.137,271.281 L 226.871,271.93 L 225.496,272.621 L 224.004,273.387 L 222.355,274.156 L 222.355,274.156 L 222.969,274.195 L 223.66,274.27 L 224.348,274.348 L 225.039,274.461 L 225.652,274.539 L 226.109,274.613 L 226.457,274.652 L 226.609,274.613 L 226.609,274.613 L 226.684,274.535 L 226.953,274.348 L 227.297,274.039 L 227.758,273.695 L 228.332,273.273 L 229.02,272.812 L 229.746,272.312 L 230.551,271.812 L 231.43,271.316 L 232.313,270.82 L 233.266,270.395 L 234.223,270.012 L 235.184,269.707 L 236.141,269.512 L 237.059,269.398 L 237.977,269.398"
4748 id="path873" />
4749 </clipPath>
4750 <g
4751 id="g875">
4752 </g>
4753
4754 <clipPath
4755 id="clippath26"> <path
4756 d="M 243.57,272.457 L 243.57,272.457 L 242.879,271.691 L 242.156,271.113 L 241.426,270.734 L 240.66,270.504 L 239.895,270.469 L 239.09,270.543 L 238.246,270.734 L 237.441,271.008 L 236.602,271.426 L 235.797,271.848 L 234.957,272.383 L 234.113,272.922 L 233.27,273.461 L 232.469,274.035 L 231.66,274.613 L 230.859,275.109 L 230.859,275.109 L 231.32,275.184 L 231.816,275.223 L 232.391,275.34 L 233.004,275.41 L 233.652,275.488 L 234.344,275.562 L 234.992,275.641 L 235.648,275.715 L 235.648,275.715 L 235.875,275.219 L 236.145,274.797 L 236.449,274.375 L 236.832,273.992 L 237.215,273.648 L 237.637,273.379 L 238.137,273.109 L 238.633,272.879 L 239.168,272.687 L 239.738,272.535 L 240.316,272.418 L 240.93,272.34 L 241.578,272.301 L 242.23,272.305 L 242.883,272.379 L 243.57,272.457"
4757 id="path882" />
4758 </clipPath>
4759 <g
4760 id="g884">
4761 </g>
4762
4763 <clipPath
4764 id="clippath27"> <path
4765 d="M 248.285,274.977 L 248.285,274.977 L 248.09,274.672 L 247.785,274.406 L 247.363,274.176 L 246.906,274.023 L 246.367,273.91 L 245.758,273.871 L 245.145,273.871 L 244.453,273.91 L 243.801,274.023 L 243.113,274.18 L 242.461,274.41 L 241.852,274.676 L 241.273,274.984 L 240.738,275.367 L 240.32,275.789 L 239.938,276.25 L 239.938,276.25 L 240.281,276.328 L 240.703,276.402 L 241.121,276.48 L 241.543,276.551 L 241.926,276.59 L 242.234,276.633 L 242.465,276.633 L 242.539,276.633 L 242.539,276.633 L 242.695,276.551 L 243.074,276.324 L 243.727,276.016 L 244.492,275.672 L 245.371,275.363 L 246.367,275.098 L 247.324,274.941 L 248.285,274.977"
4766 id="path891" />
4767 </clipPath>
4768 <g
4769 id="g893">
4770 </g>
4771
4772 <clipPath
4773 id="clippath28"> <path
4774 d="M 249.203,276.547 L 249.203,276.547 L 249.238,275.937 L 249.09,275.477 L 248.816,275.172 L 248.398,274.98 L 247.902,274.902 L 247.324,274.941 L 246.711,275.02 L 246.023,275.172 L 245.371,275.402 L 244.723,275.633 L 244.109,275.863 L 243.574,276.094 L 243.113,276.324 L 242.73,276.48 L 242.539,276.633 L 242.461,276.668 L 245.527,277.199 L 245.527,277.199 L 245.797,277.09 L 246.219,276.973 L 246.676,276.816 L 247.215,276.664 L 247.75,276.547 L 248.285,276.473 L 248.781,276.473 L 249.203,276.547"
4775 id="path900" />
4776 </clipPath>
4777 <g
4778 id="g902">
4779 </g>
4780
4781 <clipPath
4782 id="clippath29"> <path
4783 d="M 246.672,273.984 L 246.672,273.984 L 246.063,273.488 L 245.41,273.105 L 244.68,272.762 L 243.879,272.531 L 243.07,272.379 L 242.27,272.266 L 241.426,272.27 L 240.586,272.34 L 239.781,272.496 L 238.977,272.766 L 238.246,273.074 L 237.559,273.457 L 236.91,273.918 L 236.371,274.453 L 235.953,275.066 L 235.609,275.754 L 235.609,275.754 L 236.105,275.793 L 236.566,275.871 L 237.066,275.906 L 237.602,275.984 L 238.137,276.059 L 238.711,276.098 L 239.324,276.215 L 239.973,276.285 L 239.973,276.285 L 240.203,275.945 L 240.473,275.637 L 240.777,275.367 L 241.121,275.102 L 241.504,274.867 L 241.926,274.641 L 242.348,274.445 L 242.805,274.297 L 243.266,274.141 L 243.762,274.027 L 244.223,273.949 L 244.723,273.91 L 245.219,273.871 L 245.715,273.871 L 246.215,273.91 L 246.672,273.984"
4784 id="path909" />
4785 </clipPath>
4786 <g
4787 id="g911">
4788 </g>
4789
4790 <clipPath
4791 id="clippath30"> <path
4792 d="M 110.332,255.668 L 110.332,255.668 L 110.258,255.785 L 110.063,256.094 L 109.797,256.59 L 109.453,257.203 L 109.066,257.93 L 108.684,258.734 L 108.34,259.656 L 108.035,260.574 L 107.809,261.531 L 107.73,262.453 L 107.773,263.367 L 107.965,264.211 L 108.387,264.98 L 109.035,265.664 L 109.957,266.199 L 111.145,266.586 L 111.145,266.586 L 110.953,266.008 L 110.836,265.395 L 110.723,264.785 L 110.645,264.094 L 110.605,263.406 L 110.605,262.68 L 110.645,261.988 L 110.68,261.262 L 110.758,260.531 L 110.832,259.805 L 110.949,259.117 L 111.098,258.465 L 111.215,257.812 L 111.367,257.238 L 111.555,256.664 L 111.711,256.168 L 111.711,256.168 L 111.52,256.09 L 111.363,256.051 L 111.176,255.977 L 111.02,255.937 L 110.863,255.863 L 110.715,255.824 L 110.523,255.742 L 110.332,255.668"
4793 id="path918" />
4794 </clipPath>
4795 <g
4796 id="g920">
4797 </g>
4798
4799 <clipPath
4800 id="clippath31"> <path
4801 d="M 111.633,256.203 L 111.633,256.203 L 111.098,258.312 L 110.719,260.227 L 110.527,261.871 L 110.492,263.328 L 110.57,264.594 L 110.797,265.664 L 111.105,266.586 L 111.453,267.348 L 111.91,267.961 L 112.41,268.457 L 112.906,268.805 L 113.402,269.07 L 113.941,269.262 L 114.402,269.34 L 114.82,269.375 L 115.207,269.375 L 115.207,269.375 L 114.977,268.762 L 114.746,268.148 L 114.555,267.461 L 114.398,266.773 L 114.246,266.004 L 114.129,265.242 L 114.012,264.477 L 113.938,263.672 L 113.898,262.867 L 113.895,262.023 L 113.938,261.219 L 114.012,260.379 L 114.125,259.574 L 114.281,258.73 L 114.469,257.965 L 114.699,257.16 L 114.699,257.16 L 114.238,257.008 L 113.777,256.855 L 113.355,256.703 L 112.938,256.586 L 112.59,256.473 L 112.211,256.359 L 111.902,256.281 L 111.633,256.203"
4802 id="path927" />
4803 </clipPath>
4804 <g
4805 id="g929">
4806 </g>
4807
4808 <clipPath
4809 id="clippath32"> <path
4810 d="M 114.738,257.16 L 114.738,257.16 L 114.508,257.926 L 114.281,258.883 L 114.125,259.957 L 114.051,261.145 L 114.012,262.445 L 114.012,263.746 L 114.129,265.047 L 114.32,266.312 L 114.594,267.5 L 114.938,268.609 L 115.438,269.566 L 116.008,270.332 L 116.738,270.906 L 117.582,271.211 L 118.539,271.25 L 119.648,270.945 L 119.648,270.945 L 119.227,270.254 L 118.805,269.527 L 118.461,268.762 L 118.152,268.035 L 117.922,267.23 L 117.691,266.465 L 117.539,265.66 L 117.387,264.855 L 117.309,264.051 L 117.27,263.211 L 117.27,262.406 L 117.344,261.562 L 117.418,260.723 L 117.57,259.914 L 117.723,259.07 L 117.953,258.27 L 117.953,258.27 L 117.57,258.113 L 117.113,257.965 L 116.652,257.809 L 116.191,257.656 L 115.73,257.504 L 115.348,257.352 L 115.004,257.234 L 114.738,257.16"
4811 id="path936" />
4812 </clipPath>
4813 <g
4814 id="g938">
4815 </g>
4816
4817 <clipPath
4818 id="clippath33"> <path
4819 d="M 124.477,273.043 L 124.477,273.043 L 123.441,273.121 L 122.445,272.969 L 121.566,272.59 L 120.762,272.051 L 119.992,271.324 L 119.344,270.445 L 118.805,269.449 L 118.305,268.34 L 117.922,267.152 L 117.613,265.887 L 117.422,264.586 L 117.344,263.285 L 117.344,261.984 L 117.457,260.68 L 117.688,259.418 L 118.031,258.23 L 118.031,258.23 L 118.453,258.383 L 118.875,258.535 L 119.332,258.648 L 119.793,258.801 L 120.254,258.957 L 120.75,259.109 L 121.211,259.262 L 121.707,259.414 L 121.707,259.414 L 121.707,260.219 L 121.707,261.098 L 121.746,262.098 L 121.785,263.129 L 121.828,264.164 L 121.863,265.234 L 121.98,266.344 L 122.094,267.379 L 122.25,268.414 L 122.402,269.367 L 122.637,270.289 L 122.902,271.094 L 123.211,271.781 L 123.594,272.359 L 124.016,272.777 L 124.477,273.043"
4820 id="path945" />
4821 </clipPath>
4822 <g
4823 id="g947">
4824 </g>
4825
4826 <clipPath
4827 id="clippath34"> <path
4828 d="M 121.746,259.453 L 121.746,259.453 L 122.129,259.605 L 122.512,259.719 L 122.855,259.836 L 123.238,259.91 L 123.621,260.027 L 123.969,260.141 L 124.352,260.254 L 124.73,260.41 L 124.73,260.41 L 124.809,261.176 L 124.852,262.016 L 124.891,262.934 L 124.926,263.891 L 124.93,264.852 L 124.93,265.883 L 124.969,266.918 L 125.008,267.914 L 125.086,268.91 L 125.199,269.902 L 125.395,270.824 L 125.586,271.668 L 125.891,272.469 L 126.234,273.195 L 126.656,273.809 L 127.195,274.305 L 127.195,274.305 L 126.086,274.039 L 125.164,273.656 L 124.398,273.121 L 123.746,272.508 L 123.211,271.781 L 122.789,270.941 L 122.48,270.023 L 122.25,269.062 L 122.094,267.992 L 121.98,266.883 L 121.941,265.73 L 121.902,264.508 L 121.863,263.281 L 121.824,262.02 L 121.824,260.754 L 121.746,259.453"
4829 id="path954" />
4830 </clipPath>
4831 <g
4832 id="g956">
4833 </g>
4834
4835 <clipPath
4836 id="clippath35"> <path
4837 d="M 131.906,275.797 L 131.906,275.797 L 130.375,275.723 L 129.07,275.418 L 128,274.918 L 127.117,274.27 L 126.43,273.426 L 125.926,272.469 L 125.508,271.398 L 125.238,270.211 L 125.086,268.984 L 124.973,267.723 L 124.93,266.418 L 124.93,265.156 L 124.926,263.852 L 124.926,262.629 L 124.852,261.48 L 124.773,260.406 L 124.773,260.406 L 125.195,260.523 L 125.578,260.637 L 125.918,260.754 L 126.305,260.863 L 126.688,260.98 L 127.031,261.094 L 127.414,261.172 L 127.836,261.285 L 127.836,261.285 L 127.797,261.398 L 127.723,261.668 L 127.645,262.09 L 127.531,262.629 L 127.457,263.316 L 127.379,264.082 L 127.34,264.965 L 127.379,265.961 L 127.492,266.992 L 127.688,268.141 L 128.035,269.328 L 128.457,270.551 L 129.07,271.816 L 129.836,273.156 L 130.758,274.457 L 131.906,275.797"
4838 id="path963" />
4839 </clipPath>
4840 <g
4841 id="g965">
4842 </g>
4843
4844 <clipPath
4845 id="clippath36"> <path
4846 d="M 137.117,277.437 L 137.117,277.437 L 136.082,277.629 L 135.047,277.555 L 134.051,277.211 L 133.055,276.676 L 132.098,275.949 L 131.18,275.07 L 130.336,273.996 L 129.566,272.809 L 128.918,271.547 L 128.344,270.168 L 127.879,268.711 L 127.57,267.223 L 127.379,265.727 L 127.34,264.195 L 127.488,262.742 L 127.797,261.285 L 131.434,262.43 L 131.434,262.43 L 131.398,263.047 L 131.359,263.773 L 131.398,264.574 L 131.48,265.492 L 131.594,266.453 L 131.785,267.488 L 131.977,268.559 L 132.285,269.668 L 132.633,270.777 L 133.051,271.852 L 133.512,272.922 L 134.09,273.957 L 134.699,274.953 L 135.43,275.871 L 136.234,276.711 L 137.117,277.437"
4847 id="path972" />
4848 </clipPath>
4849 <g
4850 id="g974">
4851 </g>
4852
4853 <clipPath
4854 id="clippath37"> <path
4855 d="M 131.359,262.355 L 131.359,262.355 L 131.934,262.547 L 132.66,262.738 L 133.469,263.004 L 134.309,263.234 L 135.113,263.504 L 135.918,263.73 L 136.605,263.922 L 137.145,264.035 L 137.145,264.035 L 137.223,264.918 L 137.262,265.871 L 137.34,266.832 L 137.375,267.824 L 137.492,268.82 L 137.57,269.859 L 137.684,270.852 L 137.84,271.848 L 138.031,272.805 L 138.262,273.727 L 138.57,274.641 L 138.914,275.484 L 139.336,276.289 L 139.836,276.977 L 140.371,277.629 L 141.02,278.203 L 141.02,278.203 L 140.254,278.316 L 139.453,278.281 L 138.648,278.09 L 137.844,277.746 L 137.039,277.246 L 136.234,276.633 L 135.465,275.871 L 134.699,274.914 L 134.012,273.879 L 133.398,272.652 L 132.82,271.312 L 132.328,269.785 L 131.938,268.137 L 131.633,266.379 L 131.438,264.422 L 131.359,262.355"
4856 id="path981" />
4857 </clipPath>
4858 <g
4859 id="g983">
4860 </g>
4861
4862 <clipPath
4863 id="clippath38"> <path
4864 d="M 146.043,278.961 L 146.043,278.961 L 144.316,279.043 L 142.863,278.891 L 141.598,278.473 L 140.598,277.816 L 139.758,277.016 L 139.066,276.059 L 138.57,274.984 L 138.184,273.762 L 137.879,272.539 L 137.688,271.195 L 137.57,269.895 L 137.453,268.594 L 137.414,267.289 L 137.336,266.102 L 137.262,264.992 L 137.145,263.996 L 137.145,263.996 L 137.68,264.113 L 138.145,264.227 L 138.563,264.379 L 138.98,264.492 L 139.441,264.609 L 139.863,264.723 L 140.359,264.875 L 140.938,265.027 L 140.938,265.027 L 141.051,266.066 L 141.164,267.055 L 141.281,268.051 L 141.438,269.008 L 141.555,269.969 L 141.707,270.887 L 141.898,271.766 L 142.09,272.648 L 142.359,273.488 L 142.668,274.332 L 143.047,275.137 L 143.473,275.941 L 144.008,276.742 L 144.582,277.508 L 145.273,278.238 L 146.043,278.961"
4865 id="path990" />
4866 </clipPath>
4867 <g
4868 id="g992">
4869 </g>
4870
4871 <clipPath
4872 id="clippath39"> <path
4873 d="M 150.672,279.801 L 150.672,279.801 L 149.141,279.879 L 147.801,279.687 L 146.613,279.27 L 145.578,278.617 L 144.699,277.816 L 143.934,276.859 L 143.316,275.75 L 142.781,274.562 L 142.324,273.336 L 141.977,272.035 L 141.707,270.77 L 141.512,269.473 L 141.324,268.246 L 141.207,267.098 L 141.09,266.023 L 141.012,265.105 L 141.012,265.105 L 141.551,265.223 L 142.008,265.332 L 142.43,265.449 L 142.852,265.527 L 143.23,265.641 L 143.656,265.715 L 144.152,265.863 L 144.727,265.98 L 144.727,265.98 L 144.918,267.477 L 145.148,268.895 L 145.418,270.195 L 145.688,271.422 L 145.996,272.531 L 146.344,273.602 L 146.723,274.52 L 147.109,275.402 L 147.492,276.207 L 147.914,276.93 L 148.375,277.547 L 148.797,278.117 L 149.258,278.656 L 149.719,279.074 L 150.215,279.457 L 150.672,279.801"
4874 id="path999" />
4875 </clipPath>
4876 <g
4877 id="g1001">
4878 </g>
4879
4880 <clipPath
4881 id="clippath40"> <path
4882 d="M 155.043,280.371 L 155.043,280.371 L 153.586,280.453 L 152.285,280.262 L 151.133,279.84 L 150.059,279.227 L 149.18,278.426 L 148.375,277.508 L 147.684,276.434 L 147.07,275.289 L 146.535,274.098 L 146.113,272.836 L 145.766,271.574 L 145.461,270.309 L 145.188,269.121 L 144.996,267.973 L 144.844,266.941 L 144.688,266.02 L 144.688,266.02 L 145.148,266.098 L 145.605,266.211 L 146.07,266.328 L 146.527,266.441 L 146.988,266.555 L 147.449,266.668 L 147.91,266.785 L 148.406,266.902 L 148.406,266.902 L 148.445,267.625 L 148.559,268.43 L 148.715,269.273 L 148.98,270.191 L 149.25,271.184 L 149.633,272.145 L 150.02,273.176 L 150.441,274.176 L 150.938,275.172 L 151.477,276.129 L 152.012,277.008 L 152.586,277.848 L 153.16,278.652 L 153.777,279.344 L 154.426,279.914 L 155.043,280.371"
4883 id="path1008" />
4884 </clipPath>
4885 <g
4886 id="g1010">
4887 </g>
4888
4889 <clipPath
4890 id="clippath41"> <path
4891 d="M 159.824,281.25 L 159.824,281.25 L 158.336,281.445 L 156.957,281.254 L 155.691,280.758 L 154.539,279.988 L 153.469,279.035 L 152.547,277.887 L 151.707,276.586 L 150.938,275.246 L 150.324,273.832 L 149.785,272.449 L 149.328,271.148 L 148.945,269.887 L 148.715,268.812 L 148.52,267.93 L 148.406,267.281 L 148.406,266.902 L 148.406,266.902 L 148.977,267.012 L 149.516,267.129 L 150.051,267.281 L 150.586,267.395 L 151.121,267.547 L 151.621,267.66 L 152.121,267.777 L 152.617,267.855 L 152.617,267.855 L 152.809,268.504 L 153,269.27 L 153.23,270.148 L 153.504,271.145 L 153.809,272.18 L 154.152,273.25 L 154.535,274.32 L 154.961,275.434 L 155.422,276.504 L 155.918,277.5 L 156.457,278.461 L 157.031,279.301 L 157.684,280.027 L 158.332,280.602 L 159.059,281.023 L 159.824,281.25"
4892 id="path1017" />
4893 </clipPath>
4894 <g
4895 id="g1019">
4896 </g>
4897
4898 <clipPath
4899 id="clippath42"> <path
4900 d="M 163.309,281.055 L 163.309,281.055 L 161.934,281.363 L 160.711,281.363 L 159.598,281.098 L 158.563,280.562 L 157.68,279.836 L 156.879,278.953 L 156.152,277.887 L 155.496,276.734 L 154.961,275.508 L 154.461,274.246 L 154.078,272.984 L 153.691,271.758 L 153.387,270.609 L 153.156,269.574 L 152.922,268.656 L 152.734,267.93 L 152.734,267.93 L 153.152,268.008 L 153.578,268.043 L 153.957,268.117 L 154.301,268.195 L 154.688,268.273 L 155.031,268.352 L 155.336,268.426 L 155.68,268.465 L 155.68,268.465 L 155.801,269.344 L 155.988,270.262 L 156.223,271.18 L 156.449,272.062 L 156.758,272.98 L 157.105,273.863 L 157.484,274.707 L 157.949,275.547 L 158.406,276.391 L 158.945,277.191 L 159.559,277.957 L 160.168,278.645 L 160.898,279.336 L 161.625,279.949 L 162.43,280.523 L 163.309,281.055"
4901 id="path1026" />
4902 </clipPath>
4903 <g
4904 id="g1028">
4905 </g>
4906
4907 <clipPath
4908 id="clippath43"> <path
4909 d="M 167.676,281.742 L 167.676,281.742 L 166.531,281.742 L 165.418,281.629 L 164.348,281.359 L 163.309,280.977 L 162.355,280.445 L 161.395,279.793 L 160.555,279.031 L 159.715,278.191 L 158.98,277.23 L 158.293,276.199 L 157.676,275.047 L 157.105,273.863 L 156.645,272.598 L 156.258,271.258 L 155.91,269.879 L 155.68,268.465 L 155.68,268.465 L 156.18,268.578 L 156.68,268.656 L 157.215,268.77 L 157.711,268.844 L 158.207,268.922 L 158.668,269.035 L 159.168,269.113 L 159.625,269.227 L 159.625,269.227 L 159.855,270.031 L 160.047,270.871 L 160.281,271.676 L 160.473,272.48 L 160.707,273.285 L 160.934,274.09 L 161.238,274.855 L 161.586,275.656 L 162.004,276.426 L 162.469,277.227 L 163.043,277.992 L 163.73,278.758 L 164.5,279.488 L 165.418,280.25 L 166.488,281.016 L 167.676,281.742"
4910 id="path1035" />
4911 </clipPath>
4912 <g
4913 id="g1037">
4914 </g>
4915
4916 <clipPath
4917 id="clippath44"> <path
4918 d="M 173.883,282.465 L 173.883,282.465 L 172.387,282.582 L 170.934,282.508 L 169.594,282.277 L 168.328,281.895 L 167.141,281.398 L 166.031,280.746 L 164.996,279.98 L 164.078,279.062 L 163.234,278.109 L 162.469,277.035 L 161.773,275.887 L 161.203,274.664 L 160.707,273.363 L 160.281,272.023 L 159.973,270.645 L 159.738,269.266 L 159.738,269.266 L 160.164,269.34 L 160.621,269.457 L 161.078,269.531 L 161.539,269.605 L 162,269.684 L 162.5,269.801 L 162.996,269.875 L 163.496,269.988 L 163.496,269.988 L 163.648,270.945 L 163.879,271.902 L 164.188,272.863 L 164.57,273.781 L 165.027,274.699 L 165.531,275.613 L 166.105,276.496 L 166.754,277.34 L 167.445,278.184 L 168.211,278.945 L 169.02,279.672 L 169.898,280.359 L 170.82,280.973 L 171.813,281.547 L 172.809,282.043 L 173.883,282.465"
4919 id="path1044" />
4920 </clipPath>
4921 <g
4922 id="g1046">
4923 </g>
4924
4925 <clipPath
4926 id="clippath45"> <path
4927 d="M 177.289,282.844 L 177.289,282.844 L 176.027,282.77 L 174.801,282.578 L 173.574,282.273 L 172.387,281.816 L 171.277,281.281 L 170.164,280.633 L 169.133,279.902 L 168.172,279.059 L 167.293,278.145 L 166.449,277.152 L 165.719,276.074 L 165.07,274.965 L 164.535,273.781 L 164.074,272.555 L 163.727,271.289 L 163.496,269.988 L 163.496,269.988 L 163.914,270.066 L 164.301,270.102 L 164.645,270.141 L 164.988,270.18 L 165.336,270.219 L 165.715,270.293 L 166.172,270.367 L 166.715,270.445 L 166.715,270.445 L 167.25,271.555 L 167.711,272.59 L 168.168,273.586 L 168.59,274.504 L 169.051,275.387 L 169.473,276.191 L 169.934,276.953 L 170.434,277.68 L 170.973,278.41 L 171.586,279.059 L 172.273,279.746 L 173.035,280.359 L 173.922,281.012 L 174.918,281.621 L 176.027,282.234 L 177.289,282.844"
4928 id="path1053" />
4929 </clipPath>
4930 <g
4931 id="g1055">
4932 </g>
4933
4934 <clipPath
4935 id="clippath46"> <path
4936 d="M 183.418,283.223 L 183.418,283.223 L 181.539,283.301 L 179.777,283.227 L 178.207,282.961 L 176.754,282.539 L 175.414,281.969 L 174.227,281.277 L 173.113,280.477 L 172.121,279.555 L 171.199,278.562 L 170.355,277.488 L 169.625,276.379 L 168.938,275.191 L 168.285,274.008 L 167.711,272.82 L 167.172,271.594 L 166.676,270.406 L 166.676,270.406 L 167.02,270.445 L 167.477,270.523 L 168.051,270.598 L 168.629,270.676 L 169.238,270.789 L 169.777,270.863 L 170.234,270.941 L 170.582,270.977 L 170.582,270.977 L 171.039,272.051 L 171.539,273.125 L 172.117,274.117 L 172.691,275.074 L 173.344,275.957 L 174.031,276.836 L 174.762,277.637 L 175.563,278.445 L 176.367,279.172 L 177.254,279.859 L 178.168,280.547 L 179.129,281.16 L 180.125,281.73 L 181.195,282.27 L 182.27,282.766 L 183.418,283.223"
4937 id="path1062" />
4938 </clipPath>
4939 <g
4940 id="g1064">
4941 </g>
4942
4943 <clipPath
4944 id="clippath47"> <path
4945 d="M 187.055,283.758 L 187.055,283.758 L 186.023,283.719 L 184.953,283.566 L 183.801,283.297 L 182.613,282.879 L 181.426,282.383 L 180.238,281.77 L 179.051,281.043 L 177.863,280.242 L 176.715,279.324 L 175.602,278.367 L 174.566,277.297 L 173.57,276.148 L 172.691,274.961 L 171.887,273.699 L 171.195,272.359 L 170.617,270.977 L 170.617,270.977 L 171.195,271.055 L 171.77,271.133 L 172.379,271.168 L 172.992,271.246 L 173.527,271.281 L 173.988,271.359 L 174.371,271.434 L 174.563,271.551 L 174.563,271.551 L 175.254,272.086 L 175.945,272.66 L 176.672,273.352 L 177.402,274.039 L 178.129,274.801 L 178.895,275.605 L 179.66,276.449 L 180.426,277.293 L 181.195,278.172 L 181.996,279.016 L 182.801,279.855 L 183.648,280.695 L 184.488,281.539 L 185.332,282.301 L 186.176,283.07 L 187.055,283.758"
4946 id="path1071" />
4947 </clipPath>
4948 <g
4949 id="g1073">
4950 </g>
4951
4952 <clipPath
4953 id="clippath48"> <path
4954 d="M 192.648,283.754 L 192.648,283.754 L 191.652,284.406 L 190.617,284.672 L 189.586,284.676 L 188.512,284.406 L 187.441,283.871 L 186.367,283.145 L 185.254,282.227 L 184.145,281.195 L 183.031,280.008 L 181.883,278.781 L 180.695,277.48 L 179.508,276.18 L 178.32,274.918 L 177.094,273.656 L 175.863,272.508 L 174.641,271.477 L 174.641,271.477 L 175.066,271.477 L 175.559,271.551 L 176.137,271.586 L 176.707,271.703 L 177.324,271.781 L 177.934,271.895 L 178.508,272.008 L 179.008,272.082 L 179.008,272.082 L 179.965,272.965 L 180.918,273.844 L 181.844,274.762 L 182.762,275.68 L 183.645,276.598 L 184.527,277.52 L 185.406,278.437 L 186.25,279.277 L 187.094,280.117 L 187.934,280.887 L 188.738,281.574 L 189.543,282.184 L 190.348,282.723 L 191.113,283.18 L 191.883,283.523 L 192.648,283.754"
4955 id="path1080" />
4956 </clipPath>
4957 <g
4958 id="g1082">
4959 </g>
4960
4961 <clipPath
4962 id="clippath49"> <path
4963 d="M 196.977,284.246 L 196.977,284.246 L 195.406,284.25 L 193.91,284.098 L 192.57,283.715 L 191.266,283.219 L 190.082,282.605 L 188.973,281.84 L 187.895,281 L 186.902,280.082 L 185.902,279.086 L 184.949,278.051 L 184.027,277.02 L 183.066,275.949 L 182.109,274.879 L 181.113,273.844 L 180.078,272.887 L 179.008,271.969 L 179.008,271.969 L 179.391,272.008 L 179.848,272.047 L 180.422,272.117 L 180.996,272.16 L 181.574,272.234 L 182.031,272.273 L 182.414,272.312 L 182.645,272.312 L 182.645,272.312 L 183.293,272.465 L 183.945,272.77 L 184.598,273.191 L 185.289,273.766 L 185.977,274.453 L 186.707,275.258 L 187.473,276.102 L 188.238,277.016 L 189.121,277.973 L 190.004,278.926 L 191,279.926 L 192.031,280.883 L 193.145,281.84 L 194.332,282.719 L 195.598,283.52 L 196.977,284.246"
4964 id="path1089" />
4965 </clipPath>
4966 <g
4967 id="g1091">
4968 </g>
4969
4970 <clipPath
4971 id="clippath50"> <path
4972 d="M 111.746,252.719 L 111.746,252.719 L 111.746,252.566 L 111.707,252.145 L 111.668,251.496 L 111.629,250.691 L 111.59,249.734 L 111.629,248.699 L 111.664,247.629 L 111.777,246.516 L 112.008,245.484 L 112.316,244.488 L 112.695,243.605 L 113.27,242.879 L 113.922,242.34 L 114.766,242.035 L 115.797,241.992 L 116.984,242.301 L 116.984,242.301 L 116.598,242.684 L 116.258,243.145 L 115.91,243.719 L 115.605,244.332 L 115.301,245.02 L 115.031,245.711 L 114.77,246.477 L 114.496,247.281 L 114.27,248.047 L 114.082,248.852 L 113.887,249.656 L 113.699,250.422 L 113.543,251.187 L 113.391,251.914 L 113.277,252.566 L 113.16,253.176 L 113.16,253.176 L 112.973,253.141 L 112.816,253.105 L 112.629,253.027 L 112.473,252.988 L 112.32,252.91 L 112.164,252.836 L 111.973,252.797 L 111.746,252.719"
4973 id="path1098" />
4974 </clipPath>
4975 <g
4976 id="g1100">
4977 </g>
4978
4979 <clipPath
4980 id="clippath51"> <path
4981 d="M 113.16,253.293 L 113.16,253.293 L 113.66,250.805 L 114.195,248.695 L 114.73,246.895 L 115.262,245.402 L 115.801,244.215 L 116.336,243.258 L 116.867,242.566 L 117.367,242.07 L 117.902,241.723 L 118.359,241.57 L 118.863,241.531 L 119.285,241.609 L 119.703,241.762 L 120.086,241.953 L 120.43,242.184 L 120.738,242.414 L 120.738,242.414 L 120.203,242.91 L 119.668,243.484 L 119.207,244.023 L 118.711,244.633 L 118.289,245.246 L 117.867,245.898 L 117.484,246.551 L 117.141,247.238 L 116.797,247.969 L 116.492,248.734 L 116.184,249.5 L 115.918,250.305 L 115.691,251.148 L 115.461,251.988 L 115.234,252.871 L 115.039,253.789 L 115.039,253.789 L 114.582,253.711 L 114.238,253.676 L 114.004,253.598 L 113.852,253.523 L 113.699,253.484 L 113.543,253.41 L 113.391,253.367 L 113.16,253.293"
4982 id="path1107" />
4983 </clipPath>
4984 <g
4985 id="g1109">
4986 </g>
4987
4988 <clipPath
4989 id="clippath52"> <path
4990 d="M 115.078,254.02 L 115.078,254.02 L 115.191,253.102 L 115.387,252.066 L 115.691,250.918 L 116.07,249.691 L 116.57,248.426 L 117.141,247.199 L 117.754,246.016 L 118.441,244.902 L 119.168,243.906 L 119.934,243.062 L 120.738,242.414 L 121.578,241.988 L 122.461,241.836 L 123.344,241.988 L 124.184,242.488 L 125.066,243.363 L 125.066,243.363 L 124.379,243.863 L 123.727,244.441 L 123.113,245.016 L 122.539,245.59 L 121.965,246.203 L 121.469,246.891 L 121.012,247.582 L 120.59,248.27 L 120.168,249.039 L 119.824,249.805 L 119.48,250.609 L 119.215,251.453 L 118.945,252.332 L 118.754,253.25 L 118.563,254.211 L 118.41,255.203 L 118.41,255.203 L 117.992,255.09 L 117.531,254.937 L 117.031,254.785 L 116.535,254.594 L 116.074,254.402 L 115.652,254.246 L 115.309,254.098 L 115.078,254.02"
4991 id="path1116" />
4992 </clipPath>
4993 <g
4994 id="g1118">
4995 </g>
4996
4997 <clipPath
4998 id="clippath53"> <path
4999 d="M 130.008,243.977 L 130.008,243.977 L 129.086,243.363 L 128.168,243.02 L 127.211,242.906 L 126.289,243.02 L 125.332,243.328 L 124.414,243.828 L 123.531,244.516 L 122.691,245.32 L 121.926,246.238 L 121.16,247.312 L 120.512,248.5 L 119.941,249.727 L 119.445,251.031 L 119.059,252.406 L 118.754,253.785 L 118.602,255.203 L 118.602,255.203 L 119.063,255.316 L 119.523,255.434 L 119.98,255.586 L 120.441,255.738 L 120.938,255.93 L 121.438,256.043 L 121.895,256.199 L 122.395,256.273 L 122.395,256.273 L 122.738,255.395 L 123.043,254.434 L 123.426,253.477 L 123.773,252.52 L 124.152,251.523 L 124.57,250.527 L 124.992,249.57 L 125.414,248.652 L 125.871,247.77 L 126.371,246.965 L 126.902,246.199 L 127.441,245.547 L 128.051,244.973 L 128.668,244.512 L 129.316,244.168 L 130.008,243.977"
5000 id="path1125" />
5001 </clipPath>
5002 <g
5003 id="g1127">
5004 </g>
5005
5006 <clipPath
5007 id="clippath54"> <path
5008 d="M 134.219,245.117 L 134.219,245.117 L 132.766,244.473 L 131.461,244.129 L 130.313,244.09 L 129.277,244.281 L 128.359,244.703 L 127.555,245.355 L 126.832,246.16 L 126.18,247.117 L 125.605,248.191 L 125.109,249.379 L 124.609,250.605 L 124.152,251.832 L 123.695,253.094 L 123.273,254.359 L 122.816,255.547 L 122.32,256.656 L 122.32,256.656 L 122.738,256.695 L 123.121,256.809 L 123.508,256.922 L 123.891,257.074 L 124.273,257.191 L 124.656,257.344 L 125.078,257.457 L 125.496,257.531 L 125.496,257.531 L 125.535,257.383 L 125.574,256.996 L 125.688,256.5 L 125.801,255.809 L 125.992,255.008 L 126.262,254.09 L 126.566,253.094 L 126.988,252.059 L 127.48,251.023 L 128.055,249.988 L 128.785,248.953 L 129.586,247.996 L 130.547,247.078 L 131.617,246.309 L 132.844,245.617 L 134.219,245.117"
5009 id="path1134" />
5010 </clipPath>
5011 <g
5012 id="g1136">
5013 </g>
5014
5015 <clipPath
5016 id="clippath55"> <path
5017 d="M 139.656,245.922 L 139.656,245.922 L 138.773,245.27 L 137.777,244.852 L 136.707,244.695 L 135.637,244.738 L 134.484,244.965 L 133.34,245.426 L 132.23,246.043 L 131.117,246.809 L 130.047,247.766 L 129.051,248.84 L 128.172,250.066 L 127.367,251.406 L 126.68,252.824 L 126.145,254.355 L 125.801,255.926 L 125.613,257.613 L 129.559,258.641 L 129.559,258.641 L 129.785,257.992 L 130.094,257.223 L 130.398,256.422 L 130.781,255.539 L 131.238,254.617 L 131.699,253.664 L 132.234,252.703 L 132.809,251.746 L 133.461,250.789 L 134.184,249.867 L 134.949,249.027 L 135.754,248.219 L 136.633,247.492 L 137.59,246.879 L 138.586,246.344 L 139.656,245.922"
5018 id="path1143" />
5019 </clipPath>
5020 <g
5021 id="g1145">
5022 </g>
5023
5024 <clipPath
5025 id="clippath56"> <path
5026 d="M 129.52,258.758 L 129.52,258.758 L 130.094,258.871 L 130.82,259.023 L 131.547,259.254 L 132.355,259.484 L 133.117,259.75 L 133.848,259.941 L 134.496,260.133 L 135.035,260.285 L 135.035,260.285 L 135.414,259.328 L 135.836,258.332 L 136.258,257.336 L 136.68,256.301 L 137.098,255.266 L 137.559,254.27 L 138.055,253.273 L 138.551,252.277 L 139.086,251.359 L 139.66,250.516 L 140.273,249.711 L 140.926,248.984 L 141.574,248.332 L 142.301,247.797 L 143.105,247.371 L 143.91,247.066 L 143.91,247.066 L 143.219,246.609 L 142.453,246.266 L 141.648,246.035 L 140.77,245.957 L 139.852,246.035 L 138.891,246.266 L 137.934,246.648 L 136.941,247.227 L 135.945,247.953 L 134.949,248.871 L 133.957,250.023 L 133,251.324 L 132.082,252.859 L 131.164,254.582 L 130.324,256.574 L 129.52,258.758"
5027 id="path1152" />
5028 </clipPath>
5029 <g
5030 id="g1154">
5031 </g>
5032
5033 <clipPath
5034 id="clippath57"> <path
5035 d="M 148.734,248.711 L 148.734,248.711 L 147.164,247.867 L 145.746,247.371 L 144.445,247.258 L 143.258,247.449 L 142.227,247.906 L 141.23,248.598 L 140.391,249.48 L 139.586,250.555 L 138.895,251.742 L 138.246,253.008 L 137.672,254.348 L 137.141,255.687 L 136.602,256.988 L 136.105,258.254 L 135.648,259.402 L 135.191,260.437 L 135.191,260.437 L 135.723,260.516 L 136.184,260.625 L 136.605,260.707 L 137.027,260.82 L 137.449,260.934 L 137.906,261.051 L 138.406,261.125 L 138.977,261.238 L 138.977,261.238 L 139.477,260.203 L 139.934,259.168 L 140.395,258.176 L 140.852,257.215 L 141.313,256.258 L 141.809,255.379 L 142.309,254.496 L 142.801,253.691 L 143.379,252.887 L 143.988,252.16 L 144.602,251.473 L 145.289,250.82 L 146.059,250.203 L 146.859,249.668 L 147.777,249.168 L 148.734,248.711"
5036 id="path1161" />
5037 </clipPath>
5038 <g
5039 id="g1163">
5040 </g>
5041
5042 <clipPath
5043 id="clippath58"> <path
5044 d="M 153.484,250.086 L 153.484,250.086 L 152.066,249.32 L 150.727,248.937 L 149.465,248.863 L 148.238,249.051 L 147.129,249.516 L 146.059,250.203 L 145.063,251.047 L 144.141,252.047 L 143.301,253.195 L 142.496,254.383 L 141.773,255.609 L 141.121,256.871 L 140.508,258.137 L 139.973,259.285 L 139.516,260.398 L 139.094,261.355 L 139.094,261.355 L 139.668,261.469 L 140.168,261.582 L 140.629,261.695 L 141.047,261.773 L 141.508,261.887 L 141.969,262.004 L 142.504,262.078 L 143.117,262.195 L 143.117,262.195 L 143.805,260.66 L 144.492,259.242 L 145.141,257.941 L 145.793,256.754 L 146.406,255.68 L 146.98,254.723 L 147.59,253.883 L 148.164,253.074 L 148.777,252.422 L 149.391,251.848 L 150,251.352 L 150.652,250.93 L 151.305,250.621 L 151.988,250.352 L 152.723,250.199 L 153.484,250.086"
5045 id="path1170" />
5046 </clipPath>
5047 <g
5048 id="g1172">
5049 </g>
5050
5051 <clipPath
5052 id="clippath59"> <path
5053 d="M 157.086,251.113 L 157.086,251.113 L 155.707,250.465 L 154.402,250.156 L 153.18,250.16 L 151.988,250.391 L 150.918,250.852 L 149.887,251.504 L 148.93,252.348 L 148.012,253.305 L 147.207,254.379 L 146.441,255.527 L 145.719,256.715 L 145.102,257.902 L 144.492,259.09 L 143.992,260.238 L 143.539,261.273 L 143.117,262.234 L 143.117,262.234 L 143.578,262.348 L 143.996,262.461 L 144.418,262.535 L 144.801,262.652 L 145.223,262.73 L 145.645,262.805 L 146.105,262.918 L 146.602,262.992 L 146.602,262.992 L 146.832,262.23 L 147.176,261.387 L 147.594,260.543 L 148.094,259.625 L 148.629,258.707 L 149.277,257.785 L 149.926,256.863 L 150.656,255.945 L 151.418,255.105 L 152.223,254.258 L 153.027,253.492 L 153.832,252.84 L 154.672,252.227 L 155.477,251.73 L 156.281,251.348 L 157.086,251.113"
5054 id="path1179" />
5055 </clipPath>
5056 <g
5057 id="g1181">
5058 </g>
5059
5060 <clipPath
5061 id="clippath60"> <path
5062 d="M 161.414,252.223 L 161.414,252.223 L 159.996,251.461 L 158.617,251.113 L 157.238,251.152 L 155.938,251.539 L 154.672,252.152 L 153.449,253.031 L 152.297,254.066 L 151.23,255.215 L 150.234,256.445 L 149.352,257.707 L 148.555,258.934 L 147.902,260.121 L 147.367,261.156 L 146.945,262.035 L 146.715,262.687 L 146.602,263.07 L 146.602,263.07 L 147.176,263.184 L 147.672,263.301 L 148.172,263.414 L 148.672,263.531 L 149.168,263.645 L 149.629,263.719 L 150.125,263.797 L 150.621,263.871 L 150.621,263.871 L 150.969,263.223 L 151.352,262.418 L 151.809,261.535 L 152.348,260.613 L 152.918,259.582 L 153.531,258.547 L 154.176,257.551 L 154.867,256.516 L 155.598,255.555 L 156.363,254.676 L 157.164,253.871 L 157.969,253.219 L 158.809,252.684 L 159.656,252.34 L 160.531,252.184 L 161.414,252.223"
5063 id="path1188" />
5064 </clipPath>
5065 <g
5066 id="g1190">
5067 </g>
5068
5069 <clipPath
5070 id="clippath61"> <path
5071 d="M 164.555,253.445 L 164.555,253.445 L 163.289,252.605 L 162.066,252.145 L 160.914,252.031 L 159.766,252.223 L 158.695,252.684 L 157.66,253.375 L 156.703,254.219 L 155.789,255.254 L 154.945,256.363 L 154.141,257.551 L 153.414,258.777 L 152.727,260.004 L 152.152,261.152 L 151.617,262.227 L 151.199,263.145 L 150.816,263.91 L 150.816,263.91 L 151.273,263.988 L 151.656,264.102 L 152,264.215 L 152.309,264.328 L 152.656,264.445 L 152.961,264.559 L 153.266,264.637 L 153.609,264.711 L 153.609,264.711 L 153.953,263.754 L 154.375,262.797 L 154.797,261.875 L 155.258,260.957 L 155.793,260.039 L 156.324,259.195 L 156.938,258.352 L 157.59,257.547 L 158.277,256.781 L 159.039,256.094 L 159.805,255.477 L 160.648,254.902 L 161.57,254.402 L 162.488,253.984 L 163.484,253.676 L 164.555,253.445"
5072 id="path1197" />
5073 </clipPath>
5074 <g
5075 id="g1199">
5076 </g>
5077
5078 <clipPath
5079 id="clippath62"> <path
5080 d="M 168.539,254.363 L 168.539,254.363 L 167.426,253.941 L 166.316,253.676 L 165.168,253.598 L 164.094,253.676 L 162.984,253.906 L 161.953,254.25 L 160.918,254.789 L 159.883,255.441 L 158.926,256.207 L 158.012,257.129 L 157.129,258.121 L 156.285,259.234 L 155.484,260.461 L 154.797,261.801 L 154.105,263.219 L 153.535,264.711 L 153.535,264.711 L 154.035,264.789 L 154.531,264.863 L 155.031,264.98 L 155.527,265.133 L 156.023,265.246 L 156.523,265.402 L 157.02,265.516 L 157.52,265.59 L 157.52,265.59 L 157.859,264.746 L 158.203,263.867 L 158.551,263.023 L 158.895,262.141 L 159.316,261.301 L 159.734,260.492 L 160.234,259.652 L 160.77,258.887 L 161.418,258.156 L 162.105,257.43 L 162.91,256.777 L 163.789,256.164 L 164.785,255.629 L 165.898,255.129 L 167.16,254.707 L 168.539,254.363"
5081 id="path1206" />
5082 </clipPath>
5083 <g
5084 id="g1208">
5085 </g>
5086
5087 <clipPath
5088 id="clippath63"> <path
5089 d="M 174.207,255.699 L 174.207,255.699 L 172.754,255.09 L 171.293,254.703 L 169.957,254.516 L 168.613,254.477 L 167.352,254.668 L 166.125,255.016 L 164.977,255.512 L 163.867,256.164 L 162.875,256.969 L 161.879,257.887 L 160.996,258.926 L 160.156,260.074 L 159.395,261.34 L 158.738,262.676 L 158.129,264.059 L 157.594,265.551 L 157.594,265.551 L 158.051,265.629 L 158.512,265.703 L 159.047,265.781 L 159.551,265.855 L 160.082,265.934 L 160.621,266.008 L 161.156,266.086 L 161.691,266.16 L 161.691,266.16 L 161.961,265.09 L 162.344,264.094 L 162.762,263.098 L 163.297,262.141 L 163.871,261.258 L 164.523,260.414 L 165.211,259.609 L 165.977,258.883 L 166.82,258.191 L 167.734,257.617 L 168.695,257.078 L 169.688,256.617 L 170.762,256.273 L 171.871,255.969 L 173.02,255.777 L 174.207,255.699"
5090 id="path1215" />
5091 </clipPath>
5092 <g
5093 id="g1217">
5094 </g>
5095
5096 <clipPath
5097 id="clippath64"> <path
5098 d="M 177,256 L 177,256 L 175.699,255.809 L 174.438,255.699 L 173.176,255.738 L 171.949,255.891 L 170.723,256.199 L 169.574,256.582 L 168.465,257.082 L 167.391,257.695 L 166.395,258.422 L 165.441,259.266 L 164.598,260.184 L 163.793,261.219 L 163.105,262.367 L 162.496,263.594 L 161.996,264.934 L 161.617,266.355 L 161.617,266.355 L 162.078,266.391 L 162.461,266.426 L 162.84,266.504 L 163.223,266.582 L 163.609,266.656 L 163.988,266.734 L 164.449,266.848 L 164.945,266.926 L 164.945,266.926 L 165.598,265.812 L 166.211,264.738 L 166.742,263.781 L 167.242,262.828 L 167.777,261.984 L 168.277,261.18 L 168.773,260.414 L 169.348,259.723 L 169.957,259.07 L 170.648,258.496 L 171.414,257.961 L 172.258,257.461 L 173.25,257.039 L 174.324,256.656 L 175.586,256.309 L 177,256"
5099 id="path1224" />
5100 </clipPath>
5101 <g
5102 id="g1226">
5103 </g>
5104
5105 <clipPath
5106 id="clippath65"> <path
5107 d="M 183.09,256.801 L 183.09,256.801 L 181.176,256.344 L 179.375,256.078 L 177.73,256 L 176.199,256.156 L 174.781,256.461 L 173.48,256.922 L 172.293,257.496 L 171.219,258.266 L 170.188,259.109 L 169.273,260.066 L 168.426,261.102 L 167.625,262.215 L 166.898,263.363 L 166.211,264.59 L 165.598,265.812 L 164.984,267.078 L 164.984,267.078 L 165.293,267.113 L 165.715,267.152 L 166.172,267.27 L 166.672,267.344 L 167.168,267.461 L 167.633,267.574 L 168.051,267.609 L 168.395,267.648 L 168.395,267.648 L 168.969,266.5 L 169.582,265.465 L 170.27,264.469 L 170.996,263.551 L 171.762,262.668 L 172.566,261.863 L 173.445,261.137 L 174.363,260.445 L 175.32,259.832 L 176.316,259.258 L 177.348,258.758 L 178.418,258.262 L 179.531,257.84 L 180.68,257.457 L 181.867,257.109 L 183.09,256.801"
5108 id="path1233" />
5109 </clipPath>
5110 <g
5111 id="g1235">
5112 </g>
5113
5114 <clipPath
5115 id="clippath66"> <path
5116 d="M 186.727,257.18 L 186.727,257.18 L 185.66,256.988 L 184.547,256.953 L 183.32,256.992 L 182.059,257.184 L 180.793,257.453 L 179.492,257.84 L 178.191,258.336 L 176.887,258.949 L 175.59,259.676 L 174.363,260.523 L 173.18,261.441 L 172.031,262.477 L 170.996,263.629 L 170.039,264.852 L 169.199,266.195 L 168.473,267.648 L 168.473,267.648 L 169.008,267.687 L 169.543,267.766 L 170.082,267.879 L 170.617,267.992 L 171.074,268.105 L 171.461,268.223 L 171.766,268.258 L 171.957,268.223 L 171.957,268.223 L 172.723,267.762 L 173.488,267.227 L 174.289,266.613 L 175.133,265.922 L 175.977,265.195 L 176.855,264.426 L 177.773,263.66 L 178.691,262.855 L 179.648,262.051 L 180.605,261.242 L 181.602,260.441 L 182.598,259.711 L 183.629,258.984 L 184.625,258.332 L 185.695,257.719 L 186.727,257.18"
5117 id="path1242" />
5118 </clipPath>
5119 <g
5120 id="g1244">
5121 </g>
5122
5123 <clipPath
5124 id="clippath67"> <path
5125 d="M 191.824,258.633 L 191.824,258.633 L 190.867,257.676 L 189.832,257.105 L 188.758,256.871 L 187.648,256.949 L 186.5,257.258 L 185.277,257.832 L 184.051,258.602 L 182.75,259.48 L 181.449,260.555 L 180.109,261.664 L 178.77,262.855 L 177.43,264.043 L 176.051,265.23 L 174.676,266.344 L 173.297,267.379 L 171.918,268.301 L 171.918,268.301 L 172.34,268.371 L 172.84,268.41 L 173.449,268.453 L 174.063,268.488 L 174.711,268.527 L 175.363,268.527 L 175.941,268.527 L 176.438,268.523 L 176.438,268.523 L 177.473,267.758 L 178.465,266.953 L 179.461,266.148 L 180.379,265.305 L 181.297,264.5 L 182.18,263.695 L 183.059,262.891 L 183.938,262.121 L 184.855,261.434 L 185.734,260.781 L 186.656,260.207 L 187.609,259.707 L 188.605,259.285 L 189.641,258.941 L 190.715,258.746 L 191.824,258.633"
5126 id="path1251" />
5127 </clipPath>
5128 <g
5129 id="g1253">
5130 </g>
5131
5132 <clipPath
5133 id="clippath68"> <path
5134 d="M 195.656,259.09 L 195.656,259.09 L 194.043,258.707 L 192.551,258.555 L 191.137,258.633 L 189.793,258.902 L 188.531,259.363 L 187.305,259.937 L 186.121,260.668 L 184.973,261.473 L 183.859,262.391 L 182.754,263.312 L 181.68,264.309 L 180.57,265.305 L 179.461,266.262 L 178.352,267.223 L 177.164,268.062 L 175.98,268.871 L 175.98,268.871 L 176.359,268.906 L 176.855,268.984 L 177.434,269.059 L 178.012,269.098 L 178.582,269.176 L 179.082,269.246 L 179.461,269.289 L 179.695,269.328 L 179.695,269.328 L 180.383,269.324 L 181.07,269.137 L 181.797,268.789 L 182.563,268.293 L 183.328,267.676 L 184.172,266.949 L 185.051,266.145 L 185.969,265.301 L 186.93,264.418 L 187.961,263.5 L 189.07,262.617 L 190.219,261.773 L 191.48,260.969 L 192.781,260.238 L 194.16,259.59 L 195.656,259.09"
5135 id="path1260" />
5136 </clipPath>
5137 <g
5138 id="g1262">
5139 </g>
5140
5141 <clipPath
5142 id="clippath69"> <path
5143 d="M 200.48,260.426 L 200.48,260.426 L 199.063,259.621 L 197.566,259.281 L 196.113,259.281 L 194.621,259.59 L 193.129,260.164 L 191.633,260.969 L 190.18,261.926 L 188.766,262.961 L 187.426,264.113 L 186.121,265.262 L 184.859,266.336 L 183.711,267.332 L 182.68,268.176 L 181.723,268.824 L 180.879,269.211 L 180.156,269.324 L 180.156,269.324 L 180.766,269.402 L 181.418,269.516 L 182.145,269.629 L 182.91,269.746 L 183.676,269.82 L 184.441,269.934 L 185.168,269.973 L 185.824,270.012 L 185.824,270.012 L 186.895,269.207 L 187.852,268.441 L 188.73,267.672 L 189.57,266.945 L 190.336,266.215 L 191.063,265.488 L 191.828,264.797 L 192.555,264.145 L 193.285,263.535 L 194.086,262.957 L 194.926,262.422 L 195.848,261.926 L 196.844,261.465 L 197.914,261.082 L 199.141,260.734 L 200.48,260.426"
5144 id="path1269" />
5145 </clipPath>
5146 <g
5147 id="g1271">
5148 </g>
5149
5150 <clipPath
5151 id="clippath70"> <path
5152 d="M 107.719,244.719 L 107.719,244.719 L 107.527,244.336 L 107.371,243.918 L 107.223,243.492 L 107.105,243.074 L 106.988,242.613 L 106.875,242.117 L 106.797,241.66 L 106.758,241.16 L 106.719,240.699 L 106.684,240.203 L 106.684,239.703 L 106.719,239.242 L 106.754,238.785 L 106.836,238.328 L 106.949,237.863 L 107.063,237.445 L 107.063,237.445 L 106.449,238.25 L 105.992,239.133 L 105.648,239.973 L 105.457,240.852 L 105.383,241.773 L 105.383,242.691 L 105.496,243.609 L 105.691,244.531 L 105.918,245.449 L 106.227,246.332 L 106.57,247.211 L 106.914,248.09 L 107.297,248.93 L 107.645,249.773 L 107.992,250.539 L 108.336,251.305 L 108.336,251.305 L 108.262,251.152 L 108.223,250.883 L 108.18,250.617 L 108.18,250.273 L 108.145,249.852 L 108.145,249.43 L 108.145,248.969 L 108.145,248.473 L 108.105,247.973 L 108.105,247.477 L 108.066,246.941 L 108.027,246.445 L 107.988,245.984 L 107.91,245.523 L 107.836,245.102 L 107.719,244.719"
5153 id="path1278" />
5154 </clipPath>
5155 <g
5156 id="g1280">
5157 </g>
5158
5159 <clipPath
5160 id="clippath71"> <path
5161 d="M 96.0469,260.621 L 96.0469,260.621 L 96.7773,260.199 L 97.5078,259.738 L 98.1953,259.316 L 98.8828,258.895 L 99.5313,258.512 L 100.184,258.09 L 100.801,257.707 L 101.41,257.285 L 102.063,256.902 L 102.668,256.52 L 103.285,256.098 L 103.938,255.715 L 104.547,255.328 L 105.203,254.988 L 105.887,254.602 L 106.578,254.219 L 106.578,254.219 L 105.969,254.828 L 105.352,255.445 L 104.738,256.055 L 104.129,256.633 L 103.477,257.242 L 102.828,257.781 L 102.176,258.32 L 101.488,258.816 L 100.836,259.277 L 100.148,259.66 L 99.4961,260.008 L 98.8086,260.273 L 98.1211,260.504 L 97.4297,260.617 L 96.7383,260.656 L 96.0469,260.621"
5162 id="path1287" />
5163 </clipPath>
5164 <g
5165 id="g1289">
5166 </g>
5167
5168 <clipPath
5169 id="clippath72"> <path
5170 d="M 101.746,245.297 L 101.746,245.297 L 102.281,245.605 L 102.781,245.949 L 103.277,246.293 L 103.738,246.637 L 104.199,246.98 L 104.617,247.363 L 105.039,247.785 L 105.465,248.168 L 105.848,248.59 L 106.23,249.012 L 106.613,249.43 L 106.957,249.891 L 107.34,250.309 L 107.684,250.77 L 108.031,251.23 L 108.375,251.687 L 108.375,251.687 L 108.223,251.191 L 108.031,250.691 L 107.84,250.156 L 107.609,249.621 L 107.379,249.086 L 107.113,248.586 L 106.801,248.09 L 106.457,247.594 L 106.074,247.133 L 105.613,246.754 L 105.117,246.367 L 104.578,246.023 L 103.969,245.754 L 103.316,245.527 L 102.551,245.371 L 101.746,245.297"
5171 id="path1296" />
5172 </clipPath>
5173 <g
5174 id="g1298">
5175 </g>
5176
5177 <clipPath
5178 id="clippath73"> <path
5179 d="M 107.449,240.699 L 107.449,240.699 L 107.602,241.465 L 107.715,242.23 L 107.871,242.996 L 107.984,243.684 L 108.063,244.414 L 108.176,245.102 L 108.258,245.793 L 108.371,246.48 L 108.445,247.168 L 108.563,247.859 L 108.637,248.512 L 108.758,249.199 L 108.871,249.891 L 108.988,250.578 L 109.137,251.27 L 109.293,251.992 L 109.293,251.992 L 109.371,251.152 L 109.484,250.348 L 109.563,249.504 L 109.637,248.664 L 109.676,247.859 L 109.715,247.051 L 109.672,246.246 L 109.637,245.484 L 109.559,244.719 L 109.441,244.031 L 109.246,243.34 L 109.02,242.687 L 108.75,242.113 L 108.367,241.578 L 107.945,241.117 L 107.449,240.699"
5180 id="path1305" />
5181 </clipPath>
5182 <g
5183 id="g1307">
5184 </g>
5185
5186 <clipPath
5187 id="clippath74"> <path
5188 d="M 103.348,239.441 L 103.348,239.441 L 102.656,240.09 L 102.238,240.816 L 102.051,241.582 L 102.012,242.387 L 102.164,243.191 L 102.473,244.035 L 102.895,244.879 L 103.395,245.719 L 103.969,246.523 L 104.578,247.328 L 105.23,248.09 L 105.883,248.781 L 106.496,249.469 L 107.031,250.043 L 107.492,250.582 L 107.879,250.996 L 107.879,250.996 L 107.801,249.852 L 107.645,248.855 L 107.453,248.012 L 107.148,247.246 L 106.844,246.598 L 106.457,246.023 L 106.074,245.488 L 105.652,244.988 L 105.27,244.492 L 104.844,243.996 L 104.461,243.457 L 104.156,242.844 L 103.848,242.156 L 103.617,241.391 L 103.422,240.473 L 103.348,239.441"
5189 id="path1314" />
5190 </clipPath>
5191 <g
5192 id="g1316">
5193 </g>
5194
5195 <clipPath
5196 id="clippath75"> <path
5197 d="M 108.262,251.687 L 108.262,251.687 L 108.262,250.883 L 108.297,250.117 L 108.332,249.355 L 108.371,248.586 L 108.41,247.859 L 108.484,247.133 L 108.563,246.406 L 108.637,245.715 L 108.754,245.023 L 108.867,244.297 L 109.055,243.609 L 109.211,242.918 L 109.402,242.23 L 109.668,241.539 L 109.898,240.852 L 110.203,240.16 L 110.203,240.16 L 109.094,241.234 L 108.176,242.27 L 107.527,243.227 L 107.07,244.148 L 106.762,245.027 L 106.609,245.832 L 106.57,246.637 L 106.648,247.363 L 106.844,248.051 L 107.031,248.703 L 107.301,249.316 L 107.57,249.852 L 107.801,250.387 L 108.031,250.848 L 108.18,251.305 L 108.262,251.687"
5198 id="path1323" />
5199 </clipPath>
5200 <g
5201 id="g1325">
5202 </g>
5203
5204 <clipPath
5205 id="clippath76"> <path
5206 d="M 97.0391,254.34 L 97.0391,254.34 L 97.6523,254.453 L 98.2695,254.57 L 98.8789,254.645 L 99.4922,254.719 L 100.102,254.758 L 100.719,254.797 L 101.332,254.797 L 101.906,254.797 L 102.52,254.758 L 103.094,254.758 L 103.707,254.68 L 104.281,254.641 L 104.895,254.602 L 105.469,254.527 L 106.082,254.445 L 106.656,254.371 L 106.656,254.371 L 106.156,254.602 L 105.66,254.793 L 105.121,255.023 L 104.586,255.215 L 104.012,255.406 L 103.434,255.559 L 102.824,255.715 L 102.215,255.793 L 101.602,255.832 L 100.953,255.832 L 100.301,255.793 L 99.6445,255.641 L 98.9922,255.449 L 98.3438,255.18 L 97.6953,254.801 L 97.0391,254.34"
5207 id="path1332" />
5208 </clipPath>
5209 <g
5210 id="g1334">
5211 </g>
5212
5213 <clipPath
5214 id="clippath77"> <path
5215 d="M 100.605,257.437 L 100.605,257.437 L 100.145,257.516 L 99.7266,257.594 L 99.2656,257.707 L 98.8086,257.824 L 98.3438,258.016 L 97.8906,258.168 L 97.4688,258.359 L 97.0078,258.59 L 96.5859,258.82 L 96.1641,259.086 L 95.7852,259.359 L 95.3984,259.625 L 95.0547,259.93 L 94.7461,260.238 L 94.4453,260.547 L 94.1719,260.891 L 94.1719,260.891 L 94.4414,259.973 L 94.8633,259.164 L 95.3594,258.477 L 95.9727,257.824 L 96.6602,257.289 L 97.4297,256.789 L 98.2695,256.367 L 99.1523,256.023 L 100.07,255.715 L 100.988,255.41 L 101.945,255.176 L 102.902,254.949 L 103.859,254.754 L 104.777,254.566 L 105.621,254.41 L 106.461,254.219 L 106.461,254.219 L 106.043,254.371 L 105.469,254.676 L 104.777,255.141 L 103.938,255.637 L 103.094,256.172 L 102.215,256.672 L 101.371,257.133 L 100.605,257.437"
5216 id="path1341" />
5217 </clipPath>
5218 <g
5219 id="g1343">
5220 </g>
5221
5222 <clipPath
5223 id="clippath78"> <path
5224 d="M 98.8477,264.297 L 98.8477,264.297 L 98.5039,263.414 L 98.3867,262.57 L 98.5039,261.73 L 98.8086,260.965 L 99.2656,260.234 L 99.8789,259.508 L 100.605,258.816 L 101.41,258.203 L 102.254,257.59 L 103.133,257.055 L 104.051,256.516 L 104.934,256.055 L 105.734,255.637 L 106.504,255.25 L 107.152,254.906 L 107.648,254.598 L 107.648,254.598 L 107.117,255.711 L 106.578,256.629 L 106.008,257.359 L 105.469,258.012 L 104.895,258.508 L 104.32,258.969 L 103.746,259.312 L 103.172,259.66 L 102.602,260.004 L 102.023,260.387 L 101.453,260.77 L 100.914,261.23 L 100.379,261.801 L 99.8438,262.492 L 99.3477,263.301 L 98.8477,264.297"
5225 id="path1350" />
5226 </clipPath>
5227 <g
5228 id="g1352">
5229 </g>
5230
5231 <clipPath
5232 id="clippath79"> <path
5233 d="M 108.492,254.863 L 108.492,254.863 L 108.07,255.52 L 107.652,256.172 L 107.23,256.82 L 106.852,257.434 L 106.504,258.047 L 106.16,258.695 L 105.816,259.309 L 105.512,259.926 L 105.207,260.578 L 104.938,261.227 L 104.668,261.84 L 104.438,262.488 L 104.246,263.18 L 104.055,263.832 L 103.902,264.52 L 103.75,265.25 L 103.75,265.25 L 103.367,263.797 L 103.176,262.531 L 103.137,261.383 L 103.25,260.387 L 103.477,259.543 L 103.824,258.777 L 104.242,258.125 L 104.703,257.59 L 105.238,257.09 L 105.777,256.668 L 106.348,256.324 L 106.887,256.016 L 107.383,255.711 L 107.844,255.445 L 108.184,255.137 L 108.492,254.863"
5234 id="path1359" />
5235 </clipPath>
5236 <g
5237 id="g1361">
5238 </g>
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289 <flowRoot
5290 xml:space="preserve"
5291 id="flowRoot2898"
5292 transform="matrix(0.3776217,0,0,0.3776217,94.044679,45.940478)"
5293 style="fill:url(#linearGradient3669);fill-opacity:1"><flowRegion
5294 id="flowRegion2900"
5295 style="fill:url(#linearGradient9226);fill-opacity:1"><rect
5296 id="rect2902"
5297 width="132.38824"
5298 height="126.51618"
5299 x="13.879412"
5300 y="32.076469"
5301 style="fill:url(#linearGradient9228);fill-opacity:1" /></flowRegion><flowPara
5302 id="flowPara2904"
5303 style="fill:url(#linearGradient9230);fill-opacity:1">1010100101010101011010101110101011101011101010111101011100101011101011010011000010101010101111011000110101010101100101110111010110011101</flowPara></flowRoot>
5304 </g></svg>
0 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
1 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
2
3 <!--
4 Licensed to the Apache Software Foundation (ASF) under one
5 or more contributor license agreements. See the NOTICE file
6 distributed with this work for additional information
7 regarding copyright ownership. The ASF licenses this file
8 to you under the Apache License, Version 2.0 (the
9 "License"); you may not use this file except in compliance
10 with the License. You may obtain a copy of the License at
11
12 http://www.apache.org/licenses/LICENSE-2.0
13
14 Unless required by applicable law or agreed to in writing,
15 software distributed under the License is distributed on an
16 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17 KIND, either express or implied. See the License for the
18 specific language governing permissions and limitations
19 under the License.
20 -->
21
22 #macro ( link $href $name )
23 #if ( ( $href.toLowerCase().startsWith("http") || $href.toLowerCase().startsWith("https") ) )
24 <a href="$href" class="externalLink">$name</a>
25 #else
26 <a href="$href">$name</a>
27 #end
28 #end
29
30 #macro ( banner $banner $id )
31 #if ( $banner )
32 #if( $banner.href )
33 <a href="$banner.href" id="$id" #if( $banner.alt ) title="$banner.alt" #end >
34 #else
35 <div id="$id">
36 #end
37
38 #if( $banner.src )
39 #set ( $src = $banner.src )
40 #if ( ! ( $src.toLowerCase().startsWith("http") || $src.toLowerCase().startsWith("https") ) )
41 #set ( $src = $PathTool.calculateLink( $src, $relativePath ) )
42 #set ( $src = $src.replaceAll( "\\", "/" ) )
43 #end
44 #if ( $banner.alt )
45 #set ( $alt = $banner.alt )
46 #else
47 #set ( $alt = $banner.name )
48 #end
49 <img src="$src" alt="$alt" />
50 #else
51 $banner.name
52 #end
53
54 #if( $banner.href )
55 </a>
56 #else
57 </div>
58 #end
59 #end
60 #end
61
62 #macro ( links $links )
63 #set ( $counter = 0 )
64 #foreach( $item in $links )
65 #set ( $counter = $counter + 1 )
66 #set ( $currentItemHref = $PathTool.calculateLink( $item.href, $relativePath ) )
67 #set ( $currentItemHref = $currentItemHref.replaceAll( "\\", "/" ) )
68 #link( $currentItemHref $item.name )
69 #if ( $links.size() > $counter )
70 |
71 #end
72 #end
73 #end
74
75 #macro ( breadcrumbs $breadcrumbs )
76 #set ( $counter = 0 )
77 #foreach( $item in $breadcrumbs )
78 #set ( $counter = $counter + 1 )
79 #set ( $currentItemHref = $PathTool.calculateLink( $item.href, $relativePath ) )
80 #set ( $currentItemHref = $currentItemHref.replaceAll( "\\", "/" ) )
81
82 #if ( $currentItemHref == $alignedFileName || $currentItemHref == "" )
83 $item.name
84 #else
85 #link( $currentItemHref $item.name )
86 #end
87 #if ( $breadcrumbs.size() > $counter )
88 &gt;
89 #end
90 #end
91 #end
92
93 #macro ( displayTree $display $item )
94 #if ( $item && $item.items && $item.items.size() > 0 )
95 #foreach( $subitem in $item.items )
96 #set ( $subitemHref = $PathTool.calculateLink( $subitem.href, $relativePath ) )
97 #set ( $subitemHref = $subitemHref.replaceAll( "\\", "/" ) )
98 #if ( $alignedFileName == $subitemHref )
99 #set ( $display = true )
100 #end
101
102 #displayTree( $display $subitem )
103 #end
104 #end
105 #end
106
107 #macro ( menuItem $item )
108 #set ( $collapse = "none" )
109 #set ( $currentItemHref = $PathTool.calculateLink( $item.href, $relativePath ) )
110 #set ( $currentItemHref = $currentItemHref.replaceAll( "\\", "/" ) )
111
112 #if ( $item && $item.items && $item.items.size() > 0 )
113 #if ( $item.collapse == false )
114 #set ( $collapse = "expanded" )
115 #else
116 ## By default collapsed
117 #set ( $collapse = "collapsed" )
118 #end
119
120 #set ( $display = false )
121 #displayTree( $display $item )
122
123 #if ( $alignedFileName == $currentItemHref || $display )
124 #set ( $collapse = "expanded" )
125 #end
126 #end
127 <li class="$collapse">
128 #if ( $item.img )
129 #if ( ! ( $item.img.toLowerCase().startsWith("http") || $item.img.toLowerCase().startsWith("https") ) )
130 #set ( $src = $PathTool.calculateLink( $item.img, $relativePath ) )
131 #set ( $src = $src.replaceAll( "\\", "/" ) )
132 <img src="$src"/>
133 #else
134 <img src="$item.img" align="absbottom" style="border-width: 0"/>
135 #end
136 #end
137 #if ( $alignedFileName == $currentItemHref )
138 <strong>$item.name</strong>
139 #else
140 #link( $currentItemHref $item.name )
141 #end
142 #if ( $item && $item.items && $item.items.size() > 0 )
143 #if ( $collapse == "expanded" )
144 <ul>
145 #foreach( $subitem in $item.items )
146 #menuItem( $subitem )
147 #end
148 </ul>
149 #end
150 #end
151 </li>
152 #end
153
154 #macro ( mainMenu $menus )
155 #foreach( $menu in $menus )
156 #if ( $menu.name )
157 <h5>$menu.name</h5>
158 #end
159 #if ( $menu.items && $menu.items.size() > 0 )
160 <ul>
161 #foreach( $item in $menu.items )
162 #menuItem( $item )
163 #end
164 </ul>
165 #end
166 #end
167 #end
168
169 <html xmlns="http://www.w3.org/1999/xhtml">
170 <head>
171 <meta http-equiv="Content-Type" content="text/html; charset=${outputEncoding}" />
172 <title>$title</title>
173 <style type="text/css" media="all">
174 @import url("$relativePath/css/site.css");
175 </style>
176 <link rel="icon" type="image/png" href="$relativePath/tikaNoText16.png" />
177 <script type="text/javascript">
178 function getBlank(form, stdValue) {
179 if (form.value == stdValue) {
180 form.value = '';
181 }
182 return true;
183 }
184 function getPrompt(form, stdValue) {
185 if (form.value == '') {
186 form.value = stdValue;
187 }
188 return true;
189 }
190 </script>
191 </head>
192 <body class="composite">
193 <div id="banner">
194 #banner( $decoration.bannerLeft "bannerLeft" )
195 #banner( $decoration.bannerRight "bannerRight" )
196 <div class="clear">
197 <hr/>
198 </div>
199 </div>
200 <div id="search">
201 <script type="text/javascript">
202 function selectProvider(form) {
203 provider = form.elements['searchProvider'].value;
204 if (provider == "any") {
205 if (Math.random() > 0.5) {
206 provider = "lucid";
207 } else {
208 provider = "sl";
209 }
210 }
211
212 if (provider == "lucid") {
213 form.action = "http://search.lucidimagination.com/p:tika";
214 } else if (provider == "sl") {
215 form.action = "http://search-lucene.com/tika";
216 }
217
218 days = 90;
219 date = new Date();
220 date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000));
221 expires = "; expires=" + date.toGMTString();
222 document.cookie = "searchProvider=" + provider + expires + "; path=/";
223 }
224 </script>
225 <form action="http://search.lucidimagination.com/p:tika" method="get" id="searchform">
226 <input type="text" id="query" name="q" size="30" onfocus="getBlank (this, 'Search with Apache Solr');" value="Search with Apache Solr"></input>
227 <input type="submit" value="Search" name="Search" onclick="selectProvider(this.form)"/>
228 @
229 <select name="searchProvider" id="searchProvider">
230 <option value="any">select provider</option>
231 <option value="lucid">Lucid Find</option>
232 <option value="sl">Search-Lucene</option>
233 </select>
234 <script type="text/javascript">
235 if (document.cookie.length>0) {
236 cStart=document.cookie.indexOf("searchProvider=");
237 if (cStart!=-1) {
238 cStart=cStart + "searchProvider=".length;
239 cEnd=document.cookie.indexOf(";", cStart);
240 if (cEnd==-1) {
241 cEnd=document.cookie.length;
242 }
243 provider = unescape(document.cookie.substring(cStart,cEnd));
244 document.forms['searchform'].elements['searchProvider'].value = provider;
245 }
246 }
247 </script>
248 </form>
249 <div class="clear">
250 <hr/>
251 </div>
252 </div>
253 <div id="leftColumn">
254 <div id="navcolumn">
255 #mainMenu( $decoration.body.menus )
256 </div>
257 <div id="bookpromo">
258 <a href="http://manning.com/mattmann"><img src="http://www.manning.com/mattmann/mattmann_cover150.jpg" border="0"/></a>
259 </div>
260 </div>
261 <div id="bodyColumn">
262 <div id="contentBox">
263 $bodyContent
264 </div>
265 </div>
266 <div class="clear">
267 <hr/>
268 </div>
269 <div id="footer">
270 <p>
271 #set ( $currentYear = ${currentDate.year} + 1900 )
272 Copyright $currentYear
273 <a href="http://www.apache.org/">The Apache Software Foundation</a>.
274 Site powered by <a href="http://maven.apache.org/">Apache Maven</a>.
275 Search powered by <a href="http://www.lucidimagination.com">Lucid Imagination</a> &amp; <a href="http://sematext.com">Sematext</a>.
276 </p>
277 <div class="clear">
278 <hr/>
279 </div>
280 </div>
281 </body>
282 </html>
0 <?xml version="1.0" encoding="UTF-8"?>
1 <!--
2 Licensed to the Apache Software Foundation (ASF) under one
3 or more contributor license agreements. See the NOTICE file
4 distributed with this work for additional information
5 regarding copyright ownership. The ASF licenses this file
6 to you under the Apache License, Version 2.0 (the
7 "License"); you may not use this file except in compliance
8 with the License. You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12 Unless required by applicable law or agreed to in writing,
13 software distributed under the License is distributed on an
14 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 KIND, either express or implied. See the License for the
16 specific language governing permissions and limitations
17 under the License.
18 -->
19 <project name="Apache Tika">
20 <bannerLeft>
21 <alt>Apache Tika</alt>
22 <src>http://tika.apache.org/tika.png</src>
23 <href>http://tika.apache.org</href>
24 </bannerLeft>
25 <bannerRight>
26 <alt>Apache</alt>
27 <src>http://www.apache.org/images/feather-small.gif</src>
28 <href>http://www.apache.org/</href>
29 </bannerRight>
30 <body>
31 <head>
32 </head>
33 <links>
34 <item name="Apache" href="http://www.apache.org/" />
35 <item name="Lucene" href="http://lucene.apache.org/"/>
36 </links>
37 <menu name="Apache Tika">
38 <item name="Changes" href="index.html"/>
39 <item name="Getting Started" href="gettingstarted.html"/>
40 <item name="Supported Formats" href="formats.html"/>
41 <item name="Parser API" href="parser.html"/>
42 <item name="Parser 5min Quick Start Guide" href="parser_guide.html"/>
43 </menu>
44 <menu ref="reports"/>
45 </body>
46 </project>
0 <?xml version="1.0" encoding="UTF-8"?>
1
2 <!--
3 Licensed to the Apache Software Foundation (ASF) under one
4 or more contributor license agreements. See the NOTICE file
5 distributed with this work for additional information
6 regarding copyright ownership. The ASF licenses this file
7 to you under the Apache License, Version 2.0 (the
8 "License"); you may not use this file except in compliance
9 with the License. You may obtain a copy of the License at
10
11 http://www.apache.org/licenses/LICENSE-2.0
12
13 Unless required by applicable law or agreed to in writing,
14 software distributed under the License is distributed on an
15 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 KIND, either express or implied. See the License for the
17 specific language governing permissions and limitations
18 under the License.
19 -->
20
21 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
22 <modelVersion>4.0.0</modelVersion>
23
24 <parent>
25 <groupId>org.apache.tika</groupId>
26 <artifactId>tika-parent</artifactId>
27 <version>1.5</version>
28 <relativePath>../tika-parent/pom.xml</relativePath>
29 </parent>
30
31 <artifactId>tika-app</artifactId>
32 <name>Apache Tika application</name>
33 <url>http://tika.apache.org/</url>
34
35 <properties>
36 <ikvm>${env.IKVM_HOME}</ikvm>
37 </properties>
38
39 <dependencies>
40 <dependency>
41 <groupId>${project.groupId}</groupId>
42 <artifactId>tika-parsers</artifactId>
43 <version>${project.version}</version>
44 </dependency>
45 <dependency>
46 <groupId>${project.groupId}</groupId>
47 <artifactId>tika-xmp</artifactId>
48 <version>${project.version}</version>
49 </dependency>
50 <dependency>
51 <groupId>org.slf4j</groupId>
52 <artifactId>slf4j-log4j12</artifactId>
53 <version>1.5.6</version>
54 </dependency>
55 <dependency>
56 <groupId>com.google.code.gson</groupId>
57 <artifactId>gson</artifactId>
58 <version>1.7.1</version>
59 </dependency>
60 <dependency>
61 <groupId>junit</groupId>
62 <artifactId>junit</artifactId>
63 <scope>test</scope>
64 <version>4.11</version>
65 </dependency>
66 <dependency>
67 <artifactId>commons-io</artifactId>
68 <groupId>commons-io</groupId>
69 <version>2.1</version>
70 <scope>test</scope>
71 </dependency>
72 </dependencies>
73
74 <build>
75 <resources>
76 <resource>
77 <directory>src/main/resources</directory>
78 <filtering>false</filtering>
79 </resource>
80 <resource>
81 <directory>src/main/resources-filtered</directory>
82 <filtering>true</filtering>
83 </resource>
84 </resources>
85 <plugins>
86 <plugin>
87 <artifactId>maven-shade-plugin</artifactId>
88 <executions>
89 <execution>
90 <phase>package</phase>
91 <goals>
92 <goal>shade</goal>
93 </goals>
94 <configuration>
95 <createDependencyReducedPom>
96 false
97 </createDependencyReducedPom>
98 <filters>
99 <filter>
100 <artifact>*:*</artifact>
101 <excludes>
102 <exclude>META-INF/*</exclude>
103 <exclude>LICENSE.txt</exclude>
104 <exclude>NOTICE.txt</exclude>
105 <exclude>CHANGES</exclude>
106 <exclude>README</exclude>
107 <exclude>builddef.lst</exclude>
108 <!-- TIKA-763: Workaround to avoid including LGPL classes -->
109 <exclude>ucar/nc2/iosp/fysat/Fysat*.class</exclude>
110 <exclude>ucar/nc2/dataset/transform/VOceanSG1*.class</exclude>
111 <exclude>ucar/unidata/geoloc/vertical/OceanSG*.class</exclude>
112 </excludes>
113 </filter>
114 </filters>
115 <transformers>
116 <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
117 <mainClass>org.apache.tika.cli.TikaCLI</mainClass>
118 </transformer>
119 <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
120 <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
121 <resource>META-INF/LICENSE</resource>
122 <file>target/classes/META-INF/LICENSE</file>
123 </transformer>
124 <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
125 <resource>META-INF/NOTICE</resource>
126 <file>target/classes/META-INF/NOTICE</file>
127 </transformer>
128 <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
129 <resource>META-INF/DEPENDENCIES</resource>
130 <file>target/classes/META-INF/DEPENDENCIES</file>
131 </transformer>
132 </transformers>
133 </configuration>
134 </execution>
135 </executions>
136 </plugin>
137 </plugins>
138 </build>
139
140 <profiles>
141 <profile>
142 <id>ikvm</id>
143 <activation>
144 <file>
145 <exists>${ikvm}/bin/ikvmc.exe</exists>
146 </file>
147 </activation>
148 <build>
149 <plugins>
150 <plugin>
151 <artifactId>maven-antrun-plugin</artifactId>
152 <executions>
153 <execution>
154 <phase>package</phase>
155 <goals>
156 <goal>run</goal>
157 </goals>
158 <configuration>
159 <target>
160 <exec executable="${ikvm}/bin/ikvmc.exe">
161 <arg value="-nowarn:0100" />
162 <arg value="-nowarn:0105" />
163 <arg value="-nowarn:0109" />
164 <arg value="-nowarn:0111" />
165 <arg value="-nowarn:0112" />
166 <arg value="-reference:${ikvm}/bin/IKVM.OpenJDK.Util.dll" />
167 <arg value="-reference:${ikvm}/bin/IKVM.OpenJDK.Charsets.dll" />
168 <arg value="-reference:${ikvm}/bin/IKVM.OpenJDK.Text.dll" />
169 <arg value="-reference:${ikvm}/bin/IKVM.OpenJDK.Core.dll" />
170 <arg value="-reference:${ikvm}/bin/IKVM.AWT.WinForms.dll" />
171 <arg value="-reference:${ikvm}/bin/IKVM.OpenJDK.Media.dll" />
172 <arg value="-reference:${ikvm}/bin/IKVM.OpenJDK.Misc.dll" />
173 <arg value="-reference:${ikvm}/bin/IKVM.OpenJDK.Security.dll" />
174 <arg value="-reference:${ikvm}/bin/IKVM.OpenJDK.SwingAWT.dll" />
175 <arg value="-target:library" />
176 <arg value="-compressresources" />
177 <arg value="-out:${project.build.directory}/${project.build.finalName}.dll" />
178 <arg value="${project.build.directory}/${project.build.finalName}.jar" />
179 </exec>
180 </target>
181 </configuration>
182 </execution>
183 </executions>
184 </plugin>
185 <plugin>
186 <groupId>org.codehaus.mojo</groupId>
187 <artifactId>build-helper-maven-plugin</artifactId>
188 <version>1.7</version>
189 <executions>
190 <execution>
191 <phase>package</phase>
192 <goals>
193 <goal>attach-artifact</goal>
194 </goals>
195 <configuration>
196 <artifacts>
197 <artifact>
198 <file>${project.build.directory}/${project.build.finalName}.dll</file>
199 <type>dll</type>
200 </artifact>
201 </artifacts>
202 </configuration>
203 </execution>
204 </executions>
205 </plugin>
206 </plugins>
207 </build>
208 </profile>
209 </profiles>
210
211 <organization>
212 <name>The Apache Software Foundation</name>
213 <url>http://www.apache.org</url>
214 </organization>
215 <scm>
216 <url>http://svn.apache.org/viewvc/tika/tags/1.5/tika-app</url>
217 <connection>scm:svn:http://svn.apache.org/repos/asf/tika/tags/1.5/tika-app</connection>
218 <developerConnection>scm:svn:https://svn.apache.org/repos/asf/tika/tags/1.5/tika-app</developerConnection>
219 </scm>
220 <issueManagement>
221 <system>JIRA</system>
222 <url>https://issues.apache.org/jira/browse/TIKA</url>
223 </issueManagement>
224 <ciManagement>
225 <system>Jenkins</system>
226 <url>https://builds.apache.org/job/Tika-trunk/</url>
227 </ciManagement>
228 </project>
0 APACHE TIKA SUBCOMPONENTS
1
2 Apache Tika includes a number of subcomponents with separate copyright notices
3 and license terms. Your use of these subcomponents is subject to the terms and
4 conditions of the following licenses.
5
6 Bouncy Castle libraries (bcmail and bcprov)
7
8 Copyright (c) 2000-2009 The Legion Of The Bouncy Castle
9 (http://www.bouncycastle.org)
10
11 Permission is hereby granted, free of charge, to any person obtaining
12 a copy of this software and associated documentation files
13 (the "Software"), to deal in the Software without restriction,
14 including without limitation the rights to use, copy, modify, merge,
15 publish, distribute, sublicense, and/or sell copies of the Software,
16 and to permit persons to whom the Software is furnished to do so,
17 subject to the following conditions:
18
19 The above copyright notice and this permission notice shall be included
20 in all copies or substantial portions of the Software.
21
22 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
23 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
25 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
26 OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
27 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
28 OTHER DEALINGS IN THE SOFTWARE.
29
30 Contributions made to the original PDFBox, JempBox and FontBox projects:
31
32 Copyright (c) 2002-2007, www.pdfbox.org
33 Copyright (c) 2006-2007, www.jempbox.org
34 All rights reserved.
35
36 Redistribution and use in source and binary forms, with or without
37 modification, are permitted provided that the following conditions are met:
38
39 1. Redistributions of source code must retain the above copyright notice,
40 this list of conditions and the following disclaimer.
41
42 2. Redistributions in binary form must reproduce the above copyright
43 notice, this list of conditions and the following disclaimer in the
44 documentation and/or other materials provided with the distribution.
45
46 3. Neither the name of pdfbox; nor the names of its contributors may be
47 used to endorse or promote products derived from this software without
48 specific prior written permission.
49
50 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
51 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53 ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
56 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
57 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59 OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60 SUCH DAMAGE.
61
62 Adobe Font Metrics (AFM) for PDF Core 14 Fonts
63
64 This file and the 14 PostScript(R) AFM files it accompanies may be used,
65 copied, and distributed for any purpose and without charge, with or without
66 modification, provided that all copyright notices are retained; that the
67 AFM files are not distributed without this file; that all modifications
68 to this file or any of the AFM files are prominently noted in the modified
69 file(s); and that this paragraph is not modified. Adobe Systems has no
70 responsibility or obligation to support the use of the AFM files.
71
72 CMaps for PDF Fonts (http://www.adobe.com/devnet/font/#pcfi and
73 ftp://ftp.oreilly.com/pub/examples/nutshell/cjkv/adobe/)
74
75 Copyright 1990-2001 Adobe Systems Incorporated.
76 All Rights Reserved.
77
78 Patents Pending
79
80 NOTICE: All information contained herein is the property
81 of Adobe Systems Incorporated.
82
83 Permission is granted for redistribution of this file
84 provided this copyright notice is maintained intact and
85 that the contents of this file are not altered in any
86 way from its original form.
87
88 PostScript and Display PostScript are trademarks of
89 Adobe Systems Incorporated which may be registered in
90 certain jurisdictions.
91
92 Adobe Glyphlist (http://www.adobe.com/devnet/opentype/archives/glyph.html)
93
94 Copyright (c) 1997,1998,2002,2007 Adobe Systems Incorporated
95
96 Permission is hereby granted, free of charge, to any person obtaining a
97 copy of this documentation file to use, copy, publish, distribute,
98 sublicense, and/or sell copies of the documentation, and to permit
99 others to do the same, provided that:
100 - No modification, editing or other alteration of this document is
101 allowed; and
102 - The above copyright notice and this permission notice shall be
103 included in all copies of the documentation.
104
105 Permission is hereby granted, free of charge, to any person obtaining a
106 copy of this documentation file, to create their own derivative works
107 from the content of this document to use, copy, publish, distribute,
108 sublicense, and/or sell the derivative works, and to permit others to do
109 the same, provided that the derived work is not represented as being a
110 copy or version of this document.
111
112 Adobe shall not be liable to any party for any loss of revenue or profit
113 or for indirect, incidental, special, consequential, or other similar
114 damages, whether based on tort (including without limitation negligence
115 or strict liability), contract or other legal or equitable grounds even
116 if Adobe has been advised or had reason to know of the possibility of
117 such damages. The Adobe materials are provided on an "AS IS" basis.
118 Adobe specifically disclaims all express, statutory, or implied
119 warranties relating to the Adobe materials, including but not limited to
120 those concerning merchantability or fitness for a particular purpose or
121 non-infringement of any third party rights regarding the Adobe
122 materials.
123
124 Charset detection code from ICU4J (http://site.icu-project.org/)
125
126 Copyright (c) 1995-2009 International Business Machines Corporation
127 and others
128
129 All rights reserved.
130
131 Permission is hereby granted, free of charge, to any person obtaining
132 a copy of this software and associated documentation files (the
133 "Software"), to deal in the Software without restriction, including
134 without limitation the rights to use, copy, modify, merge, publish,
135 distribute, and/or sell copies of the Software, and to permit persons
136 to whom the Software is furnished to do so, provided that the above
137 copyright notice(s) and this permission notice appear in all copies
138 of the Software and that both the above copyright notice(s) and this
139 permission notice appear in supporting documentation.
140
141 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
142 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
143 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
144 IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
145 BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
146 OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
147 WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
148 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
149 SOFTWARE.
150
151 Except as contained in this notice, the name of a copyright holder shall
152 not be used in advertising or otherwise to promote the sale, use or other
153 dealings in this Software without prior written authorization of the
154 copyright holder.
155
156 ASM library (asm)
157
158 Copyright (c) 2000-2005 INRIA, France Telecom
159 All rights reserved.
160
161 Redistribution and use in source and binary forms, with or without
162 modification, are permitted provided that the following conditions
163 are met:
164
165 1. Redistributions of source code must retain the above copyright
166 notice, this list of conditions and the following disclaimer.
167
168 2. Redistributions in binary form must reproduce the above copyright
169 notice, this list of conditions and the following disclaimer in the
170 documentation and/or other materials provided with the distribution.
171
172 3. Neither the name of the copyright holders nor the names of its
173 contributors may be used to endorse or promote products derived from
174 this software without specific prior written permission.
175
176 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
177 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
178 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
179 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
180 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
181 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
182 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
183 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
184 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
185 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
186 THE POSSIBILITY OF SUCH DAMAGE.
187
188 MIME type information from file-4.26.tar.gz (http://www.darwinsys.com/file/)
189
190 Copyright (c) Ian F. Darwin 1986, 1987, 1989, 1990, 1991, 1992, 1994, 1995.
191 Software written by Ian F. Darwin and others;
192 maintained 1994- Christos Zoulas.
193
194 This software is not subject to any export provision of the United States
195 Department of Commerce, and may be exported to any country or planet.
196
197 Redistribution and use in source and binary forms, with or without
198 modification, are permitted provided that the following conditions
199 are met:
200 1. Redistributions of source code must retain the above copyright
201 notice immediately at the beginning of the file, without modification,
202 this list of conditions, and the following disclaimer.
203 2. Redistributions in binary form must reproduce the above copyright
204 notice, this list of conditions and the following disclaimer in the
205 documentation and/or other materials provided with the distribution.
206
207 THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
208 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
209 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
210 ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
211 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
212 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
213 OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
214 HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
215 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
216 OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
217 SUCH DAMAGE.
218
219 Office Open XML schemas (poi-ooxml-schemas)
220
221 The Office Open XML schema definitions used by Apache POI are
222 a part of the Office Open XML ECMA Specification (ECMA-376, [1]).
223 As defined in section 9.4 of the ECMA bylaws [2], this specification
224 is available to all interested parties without restriction:
225
226 9.4 All documents when approved shall be made available to
227 all interested parties without restriction.
228
229 Furthermore, both Microsoft and Adobe have granted patent licenses
230 to this work [3,4,5].
231
232 [1] http://www.ecma-international.org/publications/standards/Ecma-376.htm
233 [2] http://www.ecma-international.org/memento/Ecmabylaws.htm
234 [3] http://www.microsoft.com/interop/osp/
235 [4] http://www.ecma-international.org/publications/files/ECMA-ST/Ecma%20PATENT/ECMA-376%20Edition%201%20Microsoft%20Patent%20Declaration.pdf
236 [5] http://www.ecma-international.org/publications/files/ECMA-ST/Ecma%20PATENT/ga-2006-191.pdf
237
238 DOM4J library (dom4j)
239
240 Copyright 2001-2005 (C) MetaStuff, Ltd. All Rights Reserved.
241
242 Redistribution and use of this software and associated documentation
243 ("Software"), with or without modification, are permitted provided
244 that the following conditions are met:
245
246 1. Redistributions of source code must retain copyright
247 statements and notices. Redistributions must also contain a
248 copy of this document.
249
250 2. Redistributions in binary form must reproduce the
251 above copyright notice, this list of conditions and the
252 following disclaimer in the documentation and/or other
253 materials provided with the distribution.
254
255 3. The name "DOM4J" must not be used to endorse or promote
256 products derived from this Software without prior written
257 permission of MetaStuff, Ltd. For written permission,
258 please contact dom4j-info@metastuff.com.
259
260 4. Products derived from this Software may not be called "DOM4J"
261 nor may "DOM4J" appear in their names without prior written
262 permission of MetaStuff, Ltd. DOM4J is a registered
263 trademark of MetaStuff, Ltd.
264
265 5. Due credit should be given to the DOM4J Project -
266 http://www.dom4j.org
267
268 THIS SOFTWARE IS PROVIDED BY METASTUFF, LTD. AND CONTRIBUTORS
269 ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT
270 NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
271 FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
272 METASTUFF, LTD. OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
273 INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
274 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
275 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
276 HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
277 STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
278 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
279 OF THE POSSIBILITY OF SUCH DAMAGE.
280
281 SLF4J library (slf4j-api, slf4j-log4j12)
282
283 Copyright (c) 2004-2008 QOS.ch
284 All rights reserved.
285
286 Permission is hereby granted, free of charge, to any person obtaining
287 a copy of this software and associated documentation files (the
288 "Software"), to deal in the Software without restriction, including
289 without limitation the rights to use, copy, modify, merge, publish,
290 distribute, sublicense, and/or sell copies of the Software, and to
291 permit persons to whom the Software is furnished to do so, subject to
292 the following conditions:
293
294 The above copyright notice and this permission notice shall be
295 included in all copies or substantial portions of the Software.
296
297 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
298 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
299 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
300 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
301 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
302 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
303 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
304
305 NetCDF library (netcdf)
306
307 Copyright 1998-2009 University Corporation for Atmospheric Research/Unidata
308
309 Portions of this software were developed by the Unidata Program at the
310 University Corporation for Atmospheric Research.
311
312 Access and use of this software shall impose the following obligations
313 and understandings on the user. The user is granted the right, without
314 any fee or cost, to use, copy, modify, alter, enhance and distribute
315 this software, and any derivative works thereof, and its supporting
316 documentation for any purpose whatsoever, provided that this entire
317 notice appears in all copies of the software, derivative works and
318 supporting documentation. Further, UCAR requests that the user credit
319 UCAR/Unidata in any publications that result from the use of this
320 software or in any product that includes this software. The names UCAR
321 and/or Unidata, however, may not be used in any advertising or publicity
322 to endorse or promote any products or commercial entity unless specific
323 written permission is obtained from UCAR/Unidata. The user also
324 understands that UCAR/Unidata is not obligated to provide the user with
325 any support, consulting, training or assistance of any kind with regard
326 to the use, operation and performance of this software nor to provide
327 the user with any updates, revisions, new versions or "bug fixes."
328
329 THIS SOFTWARE IS PROVIDED BY UCAR/UNIDATA "AS IS" AND ANY EXPRESS OR
330 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
331 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
332 DISCLAIMED. IN NO EVENT SHALL UCAR/UNIDATA BE LIABLE FOR ANY SPECIAL,
333 INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING
334 FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
335 NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
336 WITH THE ACCESS, USE OR PERFORMANCE OF THIS SOFTWARE.
337
338 BZip classes inside the NetCDF library
339
340 The Apache Software License, Version 1.1
341
342 Copyright (c) 2001-2003 The Apache Software Foundation. All rights
343 reserved.
344
345 Redistribution and use in source and binary forms, with or without
346 modification, are permitted provided that the following conditions
347 are met:
348
349 1. Redistributions of source code must retain the above copyright
350 notice, this list of conditions and the following disclaimer.
351
352 2. Redistributions in binary form must reproduce the above copyright
353 notice, this list of conditions and the following disclaimer in
354 the documentation and/or other materials provided with the
355 distribution.
356
357 3. The end-user documentation included with the redistribution, if
358 any, must include the following acknowlegement:
359 "This product includes software developed by the
360 Apache Software Foundation (http://www.apache.org/)."
361 Alternately, this acknowlegement may appear in the software itself,
362 if and wherever such third-party acknowlegements normally appear.
363
364 4. The names "Ant" and "Apache Software
365 Foundation" must not be used to endorse or promote products derived
366 from this software without prior written permission. For written
367 permission, please contact apache@apache.org.
368
369 5. Products derived from this software may not be called "Apache"
370 nor may "Apache" appear in their names without prior written
371 permission of the Apache Group.
372
373 THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
374 WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
375 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
376 DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
377 ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
378 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
379 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
380 USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
381 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
382 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
383 OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
384 SUCH DAMAGE.
385
386 XZ compression library (xz)
387
388 All the files in this package have been written by Lasse Collin
389 and/or Igor Pavlov. All these files have been put into the
390 public domain. You can do whatever you want with these files.
391
392 This software is provided "as is", without any warranty.
393
394 XMPCore library (xmpcore)
395
396 The BSD License
397
398 Copyright (c) 2009, Adobe Systems Incorporated All rights reserved.
399
400 Redistribution and use in source and binary forms, with or without
401 modification, are permitted provided that the following conditions are met:
402
403 * Redistributions of source code must retain the above copyright notice,
404 this list of conditions and the following disclaimer.
405
406 * Redistributions in binary form must reproduce the above copyright
407 notice, this list of conditions and the following disclaimer in the
408 documentation and/or other materials provided with the distribution.
409
410 * Neither the name of Adobe Systems Incorporated, nor the names of its
411 contributors may be used to endorse or promote products derived from
412 this software without specific prior written permission.
413
414 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
415 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
416 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANT ABILITY AND FITNESS FOR
417 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
418 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
419 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
420 TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
421 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
422 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
423 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
424 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
425
426 juniversalchardet library (juniversalchardet)
427
428 MOZILLA PUBLIC LICENSE
429 Version 1.1
430
431 ---------------
432
433 1. Definitions.
434
435 1.0.1. "Commercial Use" means distribution or otherwise making the
436 Covered Code available to a third party.
437
438 1.1. "Contributor" means each entity that creates or contributes to
439 the creation of Modifications.
440
441 1.2. "Contributor Version" means the combination of the Original
442 Code, prior Modifications used by a Contributor, and the Modifications
443 made by that particular Contributor.
444
445 1.3. "Covered Code" means the Original Code or Modifications or the
446 combination of the Original Code and Modifications, in each case
447 including portions thereof.
448
449 1.4. "Electronic Distribution Mechanism" means a mechanism generally
450 accepted in the software development community for the electronic
451 transfer of data.
452
453 1.5. "Executable" means Covered Code in any form other than Source
454 Code.
455
456 1.6. "Initial Developer" means the individual or entity identified
457 as the Initial Developer in the Source Code notice required by Exhibit
458 A.
459
460 1.7. "Larger Work" means a work which combines Covered Code or
461 portions thereof with code not governed by the terms of this License.
462
463 1.8. "License" means this document.
464
465 1.8.1. "Licensable" means having the right to grant, to the maximum
466 extent possible, whether at the time of the initial grant or
467 subsequently acquired, any and all of the rights conveyed herein.
468
469 1.9. "Modifications" means any addition to or deletion from the
470 substance or structure of either the Original Code or any previous
471 Modifications. When Covered Code is released as a series of files, a
472 Modification is:
473 A. Any addition to or deletion from the contents of a file
474 containing Original Code or previous Modifications.
475
476 B. Any new file that contains any part of the Original Code or
477 previous Modifications.
478
479 1.10. "Original Code" means Source Code of computer software code
480 which is described in the Source Code notice required by Exhibit A as
481 Original Code, and which, at the time of its release under this
482 License is not already Covered Code governed by this License.
483
484 1.10.1. "Patent Claims" means any patent claim(s), now owned or
485 hereafter acquired, including without limitation, method, process,
486 and apparatus claims, in any patent Licensable by grantor.
487
488 1.11. "Source Code" means the preferred form of the Covered Code for
489 making modifications to it, including all modules it contains, plus
490 any associated interface definition files, scripts used to control
491 compilation and installation of an Executable, or source code
492 differential comparisons against either the Original Code or another
493 well known, available Covered Code of the Contributor's choice. The
494 Source Code can be in a compressed or archival form, provided the
495 appropriate decompression or de-archiving software is widely available
496 for no charge.
497
498 1.12. "You" (or "Your") means an individual or a legal entity
499 exercising rights under, and complying with all of the terms of, this
500 License or a future version of this License issued under Section 6.1.
501 For legal entities, "You" includes any entity which controls, is
502 controlled by, or is under common control with You. For purposes of
503 this definition, "control" means (a) the power, direct or indirect,
504 to cause the direction or management of such entity, whether by
505 contract or otherwise, or (b) ownership of more than fifty percent
506 (50%) of the outstanding shares or beneficial ownership of such
507 entity.
508
509 2. Source Code License.
510
511 2.1. The Initial Developer Grant.
512 The Initial Developer hereby grants You a world-wide, royalty-free,
513 non-exclusive license, subject to third party intellectual property
514 claims:
515 (a) under intellectual property rights (other than patent or
516 trademark) Licensable by Initial Developer to use, reproduce,
517 modify, display, perform, sublicense and distribute the Original
518 Code (or portions thereof) with or without Modifications, and/or
519 as part of a Larger Work; and
520
521 (b) under Patent Claims infringed by the making, using or
522 selling of Original Code, to make, have made, use, practice,
523 sell, and offer for sale, and/or otherwise dispose of the
524 Original Code (or portions thereof).
525
526 (c) the licenses granted in this Section 2.1(a) and (b) are
527 effective on the date Initial Developer first distributes
528 Original Code under the terms of this License.
529
530 (d) Notwithstanding Section 2.1(b) above, no patent license is
531 granted: 1) for code that You delete from the Original Code; 2)
532 separate from the Original Code; or 3) for infringements caused
533 by: i) the modification of the Original Code or ii) the
534 combination of the Original Code with other software or devices.
535
536 2.2. Contributor Grant.
537 Subject to third party intellectual property claims, each Contributor
538 hereby grants You a world-wide, royalty-free, non-exclusive license
539
540 (a) under intellectual property rights (other than patent or
541 trademark) Licensable by Contributor, to use, reproduce, modify,
542 display, perform, sublicense and distribute the Modifications
543 created by such Contributor (or portions thereof) either on an
544 unmodified basis, with other Modifications, as Covered Code
545 and/or as part of a Larger Work; and
546
547 (b) under Patent Claims infringed by the making, using, or
548 selling of Modifications made by that Contributor either alone
549 and/or in combination with its Contributor Version (or portions
550 of such combination), to make, use, sell, offer for sale, have
551 made, and/or otherwise dispose of: 1) Modifications made by that
552 Contributor (or portions thereof); and 2) the combination of
553 Modifications made by that Contributor with its Contributor
554 Version (or portions of such combination).
555
556 (c) the licenses granted in Sections 2.2(a) and 2.2(b) are
557 effective on the date Contributor first makes Commercial Use of
558 the Covered Code.
559
560 (d) Notwithstanding Section 2.2(b) above, no patent license is
561 granted: 1) for any code that Contributor has deleted from the
562 Contributor Version; 2) separate from the Contributor Version;
563 3) for infringements caused by: i) third party modifications of
564 Contributor Version or ii) the combination of Modifications made
565 by that Contributor with other software (except as part of the
566 Contributor Version) or other devices; or 4) under Patent Claims
567 infringed by Covered Code in the absence of Modifications made by
568 that Contributor.
569
570 3. Distribution Obligations.
571
572 3.1. Application of License.
573 The Modifications which You create or to which You contribute are
574 governed by the terms of this License, including without limitation
575 Section 2.2. The Source Code version of Covered Code may be
576 distributed only under the terms of this License or a future version
577 of this License released under Section 6.1, and You must include a
578 copy of this License with every copy of the Source Code You
579 distribute. You may not offer or impose any terms on any Source Code
580 version that alters or restricts the applicable version of this
581 License or the recipients' rights hereunder. However, You may include
582 an additional document offering the additional rights described in
583 Section 3.5.
584
585 3.2. Availability of Source Code.
586 Any Modification which You create or to which You contribute must be
587 made available in Source Code form under the terms of this License
588 either on the same media as an Executable version or via an accepted
589 Electronic Distribution Mechanism to anyone to whom you made an
590 Executable version available; and if made available via Electronic
591 Distribution Mechanism, must remain available for at least twelve (12)
592 months after the date it initially became available, or at least six
593 (6) months after a subsequent version of that particular Modification
594 has been made available to such recipients. You are responsible for
595 ensuring that the Source Code version remains available even if the
596 Electronic Distribution Mechanism is maintained by a third party.
597
598 3.3. Description of Modifications.
599 You must cause all Covered Code to which You contribute to contain a
600 file documenting the changes You made to create that Covered Code and
601 the date of any change. You must include a prominent statement that
602 the Modification is derived, directly or indirectly, from Original
603 Code provided by the Initial Developer and including the name of the
604 Initial Developer in (a) the Source Code, and (b) in any notice in an
605 Executable version or related documentation in which You describe the
606 origin or ownership of the Covered Code.
607
608 3.4. Intellectual Property Matters
609 (a) Third Party Claims.
610 If Contributor has knowledge that a license under a third party's
611 intellectual property rights is required to exercise the rights
612 granted by such Contributor under Sections 2.1 or 2.2,
613 Contributor must include a text file with the Source Code
614 distribution titled "LEGAL" which describes the claim and the
615 party making the claim in sufficient detail that a recipient will
616 know whom to contact. If Contributor obtains such knowledge after
617 the Modification is made available as described in Section 3.2,
618 Contributor shall promptly modify the LEGAL file in all copies
619 Contributor makes available thereafter and shall take other steps
620 (such as notifying appropriate mailing lists or newsgroups)
621 reasonably calculated to inform those who received the Covered
622 Code that new knowledge has been obtained.
623
624 (b) Contributor APIs.
625 If Contributor's Modifications include an application programming
626 interface and Contributor has knowledge of patent licenses which
627 are reasonably necessary to implement that API, Contributor must
628 also include this information in the LEGAL file.
629
630 (c) Representations.
631 Contributor represents that, except as disclosed pursuant to
632 Section 3.4(a) above, Contributor believes that Contributor's
633 Modifications are Contributor's original creation(s) and/or
634 Contributor has sufficient rights to grant the rights conveyed by
635 this License.
636
637 3.5. Required Notices.
638 You must duplicate the notice in Exhibit A in each file of the Source
639 Code. If it is not possible to put such notice in a particular Source
640 Code file due to its structure, then You must include such notice in a
641 location (such as a relevant directory) where a user would be likely
642 to look for such a notice. If You created one or more Modification(s)
643 You may add your name as a Contributor to the notice described in
644 Exhibit A. You must also duplicate this License in any documentation
645 for the Source Code where You describe recipients' rights or ownership
646 rights relating to Covered Code. You may choose to offer, and to
647 charge a fee for, warranty, support, indemnity or liability
648 obligations to one or more recipients of Covered Code. However, You
649 may do so only on Your own behalf, and not on behalf of the Initial
650 Developer or any Contributor. You must make it absolutely clear that
651 any such warranty, support, indemnity or liability obligation is
652 offered by You alone, and You hereby agree to indemnify the Initial
653 Developer and every Contributor for any liability incurred by the
654 Initial Developer or such Contributor as a result of warranty,
655 support, indemnity or liability terms You offer.
656
657 3.6. Distribution of Executable Versions.
658 You may distribute Covered Code in Executable form only if the
659 requirements of Section 3.1-3.5 have been met for that Covered Code,
660 and if You include a notice stating that the Source Code version of
661 the Covered Code is available under the terms of this License,
662 including a description of how and where You have fulfilled the
663 obligations of Section 3.2. The notice must be conspicuously included
664 in any notice in an Executable version, related documentation or
665 collateral in which You describe recipients' rights relating to the
666 Covered Code. You may distribute the Executable version of Covered
667 Code or ownership rights under a license of Your choice, which may
668 contain terms different from this License, provided that You are in
669 compliance with the terms of this License and that the license for the
670 Executable version does not attempt to limit or alter the recipient's
671 rights in the Source Code version from the rights set forth in this
672 License. If You distribute the Executable version under a different
673 license You must make it absolutely clear that any terms which differ
674 from this License are offered by You alone, not by the Initial
675 Developer or any Contributor. You hereby agree to indemnify the
676 Initial Developer and every Contributor for any liability incurred by
677 the Initial Developer or such Contributor as a result of any such
678 terms You offer.
679
680 3.7. Larger Works.
681 You may create a Larger Work by combining Covered Code with other code
682 not governed by the terms of this License and distribute the Larger
683 Work as a single product. In such a case, You must make sure the
684 requirements of this License are fulfilled for the Covered Code.
685
686 4. Inability to Comply Due to Statute or Regulation.
687
688 If it is impossible for You to comply with any of the terms of this
689 License with respect to some or all of the Covered Code due to
690 statute, judicial order, or regulation then You must: (a) comply with
691 the terms of this License to the maximum extent possible; and (b)
692 describe the limitations and the code they affect. Such description
693 must be included in the LEGAL file described in Section 3.4 and must
694 be included with all distributions of the Source Code. Except to the
695 extent prohibited by statute or regulation, such description must be
696 sufficiently detailed for a recipient of ordinary skill to be able to
697 understand it.
698
699 5. Application of this License.
700
701 This License applies to code to which the Initial Developer has
702 attached the notice in Exhibit A and to related Covered Code.
703
704 6. Versions of the License.
705
706 6.1. New Versions.
707 Netscape Communications Corporation ("Netscape") may publish revised
708 and/or new versions of the License from time to time. Each version
709 will be given a distinguishing version number.
710
711 6.2. Effect of New Versions.
712 Once Covered Code has been published under a particular version of the
713 License, You may always continue to use it under the terms of that
714 version. You may also choose to use such Covered Code under the terms
715 of any subsequent version of the License published by Netscape. No one
716 other than Netscape has the right to modify the terms applicable to
717 Covered Code created under this License.
718
719 6.3. Derivative Works.
720 If You create or use a modified version of this License (which you may
721 only do in order to apply it to code which is not already Covered Code
722 governed by this License), You must (a) rename Your license so that
723 the phrases "Mozilla", "MOZILLAPL", "MOZPL", "Netscape",
724 "MPL", "NPL" or any confusingly similar phrase do not appear in your
725 license (except to note that your license differs from this License)
726 and (b) otherwise make it clear that Your version of the license
727 contains terms which differ from the Mozilla Public License and
728 Netscape Public License. (Filling in the name of the Initial
729 Developer, Original Code or Contributor in the notice described in
730 Exhibit A shall not of themselves be deemed to be modifications of
731 this License.)
732
733 7. DISCLAIMER OF WARRANTY.
734
735 COVERED CODE IS PROVIDED UNDER THIS LICENSE ON AN "AS IS" BASIS,
736 WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
737 WITHOUT LIMITATION, WARRANTIES THAT THE COVERED CODE IS FREE OF
738 DEFECTS, MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE OR NON-INFRINGING.
739 THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE COVERED CODE
740 IS WITH YOU. SHOULD ANY COVERED CODE PROVE DEFECTIVE IN ANY RESPECT,
741 YOU (NOT THE INITIAL DEVELOPER OR ANY OTHER CONTRIBUTOR) ASSUME THE
742 COST OF ANY NECESSARY SERVICING, REPAIR OR CORRECTION. THIS DISCLAIMER
743 OF WARRANTY CONSTITUTES AN ESSENTIAL PART OF THIS LICENSE. NO USE OF
744 ANY COVERED CODE IS AUTHORIZED HEREUNDER EXCEPT UNDER THIS DISCLAIMER.
745
746 8. TERMINATION.
747
748 8.1. This License and the rights granted hereunder will terminate
749 automatically if You fail to comply with terms herein and fail to cure
750 such breach within 30 days of becoming aware of the breach. All
751 sublicenses to the Covered Code which are properly granted shall
752 survive any termination of this License. Provisions which, by their
753 nature, must remain in effect beyond the termination of this License
754 shall survive.
755
756 8.2. If You initiate litigation by asserting a patent infringement
757 claim (excluding declaratory judgment actions) against Initial Developer
758 or a Contributor (the Initial Developer or Contributor against whom
759 You file such action is referred to as "Participant") alleging that:
760
761 (a) such Participant's Contributor Version directly or indirectly
762 infringes any patent, then any and all rights granted by such
763 Participant to You under Sections 2.1 and/or 2.2 of this License
764 shall, upon 60 days notice from Participant terminate prospectively,
765 unless if within 60 days after receipt of notice You either: (i)
766 agree in writing to pay Participant a mutually agreeable reasonable
767 royalty for Your past and future use of Modifications made by such
768 Participant, or (ii) withdraw Your litigation claim with respect to
769 the Contributor Version against such Participant. If within 60 days
770 of notice, a reasonable royalty and payment arrangement are not
771 mutually agreed upon in writing by the parties or the litigation claim
772 is not withdrawn, the rights granted by Participant to You under
773 Sections 2.1 and/or 2.2 automatically terminate at the expiration of
774 the 60 day notice period specified above.
775
776 (b) any software, hardware, or device, other than such Participant's
777 Contributor Version, directly or indirectly infringes any patent, then
778 any rights granted to You by such Participant under Sections 2.1(b)
779 and 2.2(b) are revoked effective as of the date You first made, used,
780 sold, distributed, or had made, Modifications made by that
781 Participant.
782
783 8.3. If You assert a patent infringement claim against Participant
784 alleging that such Participant's Contributor Version directly or
785 indirectly infringes any patent where such claim is resolved (such as
786 by license or settlement) prior to the initiation of patent
787 infringement litigation, then the reasonable value of the licenses
788 granted by such Participant under Sections 2.1 or 2.2 shall be taken
789 into account in determining the amount or value of any payment or
790 license.
791
792 8.4. In the event of termination under Sections 8.1 or 8.2 above,
793 all end user license agreements (excluding distributors and resellers)
794 which have been validly granted by You or any distributor hereunder
795 prior to termination shall survive termination.
796
797 9. LIMITATION OF LIABILITY.
798
799 UNDER NO CIRCUMSTANCES AND UNDER NO LEGAL THEORY, WHETHER TORT
800 (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE, SHALL YOU, THE INITIAL
801 DEVELOPER, ANY OTHER CONTRIBUTOR, OR ANY DISTRIBUTOR OF COVERED CODE,
802 OR ANY SUPPLIER OF ANY OF SUCH PARTIES, BE LIABLE TO ANY PERSON FOR
803 ANY INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY
804 CHARACTER INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF GOODWILL,
805 WORK STOPPAGE, COMPUTER FAILURE OR MALFUNCTION, OR ANY AND ALL OTHER
806 COMMERCIAL DAMAGES OR LOSSES, EVEN IF SUCH PARTY SHALL HAVE BEEN
807 INFORMED OF THE POSSIBILITY OF SUCH DAMAGES. THIS LIMITATION OF
808 LIABILITY SHALL NOT APPLY TO LIABILITY FOR DEATH OR PERSONAL INJURY
809 RESULTING FROM SUCH PARTY'S NEGLIGENCE TO THE EXTENT APPLICABLE LAW
810 PROHIBITS SUCH LIMITATION. SOME JURISDICTIONS DO NOT ALLOW THE
811 EXCLUSION OR LIMITATION OF INCIDENTAL OR CONSEQUENTIAL DAMAGES, SO
812 THIS EXCLUSION AND LIMITATION MAY NOT APPLY TO YOU.
813
814 10. U.S. GOVERNMENT END USERS.
815
816 The Covered Code is a "commercial item," as that term is defined in
817 48 C.F.R. 2.101 (Oct. 1995), consisting of "commercial computer
818 software" and "commercial computer software documentation," as such
819 terms are used in 48 C.F.R. 12.212 (Sept. 1995). Consistent with 48
820 C.F.R. 12.212 and 48 C.F.R. 227.7202-1 through 227.7202-4 (June 1995),
821 all U.S. Government End Users acquire Covered Code with only those
822 rights set forth herein.
823
824 11. MISCELLANEOUS.
825
826 This License represents the complete agreement concerning subject
827 matter hereof. If any provision of this License is held to be
828 unenforceable, such provision shall be reformed only to the extent
829 necessary to make it enforceable. This License shall be governed by
830 California law provisions (except to the extent applicable law, if
831 any, provides otherwise), excluding its conflict-of-law provisions.
832 With respect to disputes in which at least one party is a citizen of,
833 or an entity chartered or registered to do business in the United
834 States of America, any litigation relating to this License shall be
835 subject to the jurisdiction of the Federal Courts of the Northern
836 District of California, with venue lying in Santa Clara County,
837 California, with the losing party responsible for costs, including
838 without limitation, court costs and reasonable attorneys' fees and
839 expenses. The application of the United Nations Convention on
840 Contracts for the International Sale of Goods is expressly excluded.
841 Any law or regulation which provides that the language of a contract
842 shall be construed against the drafter shall not apply to this
843 License.
844
845 12. RESPONSIBILITY FOR CLAIMS.
846
847 As between Initial Developer and the Contributors, each party is
848 responsible for claims and damages arising, directly or indirectly,
849 out of its utilization of rights under this License and You agree to
850 work with Initial Developer and Contributors to distribute such
851 responsibility on an equitable basis. Nothing herein is intended or
852 shall be deemed to constitute any admission of liability.
853
854 13. MULTIPLE-LICENSED CODE.
855
856 Initial Developer may designate portions of the Covered Code as
857 "Multiple-Licensed". "Multiple-Licensed" means that the Initial
858 Developer permits you to utilize portions of the Covered Code under
859 Your choice of the NPL or the alternative licenses, if any, specified
860 by the Initial Developer in the file described in Exhibit A.
861
862 EXHIBIT A -Mozilla Public License.
863
864 ``The contents of this file are subject to the Mozilla Public License
865 Version 1.1 (the "License"); you may not use this file except in
866 compliance with the License. You may obtain a copy of the License at
867 http://www.mozilla.org/MPL/
868
869 Software distributed under the License is distributed on an "AS IS"
870 basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
871 License for the specific language governing rights and limitations
872 under the License.
873
874 The Original Code is ______________________________________.
875
876 The Initial Developer of the Original Code is ________________________.
877 Portions created by ______________________ are Copyright (C) ______
878 _______________________. All Rights Reserved.
879
880 Contributor(s): ______________________________________.
881
882 Alternatively, the contents of this file may be used under the terms
883 of the _____ license (the "[___] License"), in which case the
884 provisions of [______] License are applicable instead of those
885 above. If you wish to allow use of your version of this file only
886 under the terms of the [____] License and not to allow others to use
887 your version of this file under the MPL, indicate your decision by
888 deleting the provisions above and replace them with the notice and
889 other provisions required by the [___] License. If you do not delete
890 the provisions above, a recipient may use your version of this file
891 under either the MPL or the [___] License."
892
893 [NOTE: The text of this Exhibit A may differ slightly from the text of
894 the notices in the Source Code files of the Original Code. You should
895 use the text of this Exhibit A rather than the text found in the
896 Original Code Source Code for Your Modifications.]
897
898 AspectJ runtime library (aspectjrt)
899
900 Eclipse Public License - v 1.0
901
902 THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE
903 PUBLIC LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF
904 THE PROGRAM CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT.
905
906 1. DEFINITIONS
907
908 "Contribution" means:
909
910 a) in the case of the initial Contributor, the initial code and
911 documentation distributed under this Agreement, and
912
913 b) in the case of each subsequent Contributor:
914
915 i) changes to the Program, and
916
917 ii) additions to the Program;
918
919 where such changes and/or additions to the Program originate from and
920 are distributed by that particular Contributor. A Contribution
921 'originates' from a Contributor if it was added to the Program by
922 such Contributor itself or anyone acting on such Contributor's behalf.
923 Contributions do not include additions to the Program which: (i) are
924 separate modules of software distributed in conjunction with the
925 Program under their own license agreement, and (ii) are not derivative
926 works of the Program.
927
928 "Contributor" means any person or entity that distributes the Program.
929
930 "Licensed Patents" mean patent claims licensable by a Contributor which
931 are necessarily infringed by the use or sale of its Contribution alone or
932 when combined with the Program.
933
934 "Program" means the Contributions distributed in accordance with this
935 Agreement.
936
937 "Recipient" means anyone who receives the Program under this Agreement,
938 including all Contributors.
939
940 2. GRANT OF RIGHTS
941
942 a) Subject to the terms of this Agreement, each Contributor hereby grants
943 Recipient a non-exclusive, worldwide, royalty-free copyright license to
944 reproduce, prepare derivative works of, publicly display, publicly
945 perform, distribute and sublicense the Contribution of such
946 Contributor, if any, and such derivative works, in source code and
947 object code form.
948
949 b) Subject to the terms of this Agreement, each Contributor hereby grants
950 Recipient a non-exclusive, worldwide, royalty-free patent license under
951 Licensed Patents to make, use, sell, offer to sell, import and
952 otherwise transfer the Contribution of such Contributor, if any, in
953 source code and object code form. This patent license shall apply to
954 the combination of the Contribution and the Program if, at the time
955 the Contribution is added by the Contributor, such addition of the
956 Contribution causes such combination to be covered by the Licensed
957 Patents. The patent license shall not apply to any other combinations
958 which include the Contribution. No hardware per se is licensed hereunder.
959
960 c) Recipient understands that although each Contributor grants the
961 licenses to its Contributions set forth herein, no assurances are
962 provided by any Contributor that the Program does not infringe the
963 patent or other intellectual property rights of any other entity. Each
964 Contributor disclaims any liability to Recipient for claims brought by
965 any other entity based on infringement of intellectual property rights
966 or otherwise. As a condition to exercising the rights and licenses
967 granted hereunder, each Recipient hereby assumes sole responsibility
968 to secure any other intellectual property rights needed, if any. For
969 example, if a third party patent license is required to allow Recipient
970 to distribute the Program, it is Recipient's responsibility to acquire
971 that license before distributing the Program.
972
973 d) Each Contributor represents that to its knowledge it has sufficient
974 copyright rights in its Contribution, if any, to grant the copyright
975 license set forth in this Agreement.
976
977 3. REQUIREMENTS
978
979 A Contributor may choose to distribute the Program in object code form
980 under its own license agreement, provided that:
981
982 a) it complies with the terms and conditions of this Agreement; and
983
984 b) its license agreement:
985
986 i) effectively disclaims on behalf of all Contributors all warranties
987 and conditions, express and implied, including warranties or
988 conditions of title and non-infringement, and implied warranties
989 or conditions of merchantability and fitness for a particular
990 purpose;
991
992 ii) effectively excludes on behalf of all Contributors all liability
993 for damages, including direct, indirect, special, incidental and
994 consequential damages, such as lost profits;
995
996 iii) states that any provisions which differ from this Agreement are
997 offered by that Contributor alone and not by any other party; and
998
999 iv) states that source code for the Program is available from such
1000 Contributor, and informs licensees how to obtain it in a
1001 reasonable manner on or through a medium customarily used for
1002 software exchange.
1003
1004 When the Program is made available in source code form:
1005
1006 a) it must be made available under this Agreement; and
1007
1008 b) a copy of this Agreement must be included with each copy of the
1009 Program.
1010
1011 Contributors may not remove or alter any copyright notices contained
1012 within the Program.
1013
1014 Each Contributor must identify itself as the originator of its
1015 Contribution, if any, in a manner that reasonably allows subsequent
1016 Recipients to identify the originator of the Contribution.
1017
1018 4. COMMERCIAL DISTRIBUTION
1019
1020 Commercial distributors of software may accept certain responsibilities
1021 with respect to end users, business partners and the like. While this
1022 license is intended to facilitate the commercial use of the Program,
1023 the Contributor who includes the Program in a commercial product offering
1024 should do so in a manner which does not create potential liability for
1025 other Contributors. Therefore, if a Contributor includes the Program in
1026 a commercial product offering, such Contributor ("Commercial Contributor")
1027 hereby agrees to defend and indemnify every other Contributor
1028 ("Indemnified Contributor") against any losses, damages and costs
1029 (collectively "Losses") arising from claims, lawsuits and other legal
1030 actions brought by a third party against the Indemnified Contributor to
1031 the extent caused by the acts or omissions of such Commercial Contributor
1032 in connection with its distribution of the Program in a commercial
1033 product offering. The obligations in this section do not apply to any
1034 claims or Losses relating to any actual or alleged intellectual property
1035 infringement. In order to qualify, an Indemnified Contributor must:
1036 a) promptly notify the Commercial Contributor in writing of such claim,
1037 and b) allow the Commercial Contributor to control, and cooperate with
1038 the Commercial Contributor in, the defense and any related settlement
1039 negotiations. The Indemnified Contributor may participate in any such
1040 claim at its own expense.
1041
1042 For example, a Contributor might include the Program in a commercial
1043 product offering, Product X. That Contributor is then a Commercial
1044 Contributor. If that Commercial Contributor then makes performance claims,
1045 or offers warranties related to Product X, those performance claims and
1046 warranties are such Commercial Contributor's responsibility alone. Under
1047 this section, the Commercial Contributor would have to defend claims
1048 against the other Contributors related to those performance claims and
1049 warranties, and if a court requires any other Contributor to pay any
1050 damages as a result, the Commercial Contributor must pay those damages.
1051
1052 5. NO WARRANTY
1053
1054 EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS PROVIDED
1055 ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER
1056 EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR
1057 CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A
1058 PARTICULAR PURPOSE. Each Recipient is solely responsible for determining
1059 the appropriateness of using and distributing the Program and assumes all
1060 risks associated with its exercise of rights under this Agreement,
1061 including but not limited to the risks and costs of program errors,
1062 compliance with applicable laws, damage to or loss of data, programs or
1063 equipment, and unavailability or interruption of operations.
1064
1065 6. DISCLAIMER OF LIABILITY
1066
1067 EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR
1068 ANY CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT,
1069 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING
1070 WITHOUT LIMITATION LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF
1071 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
1072 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OR
1073 DISTRIBUTION OF THE PROGRAM OR THE EXERCISE OF ANY RIGHTS GRANTED
1074 HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
1075
1076 7. GENERAL
1077
1078 If any provision of this Agreement is invalid or unenforceable under
1079 applicable law, it shall not affect the validity or enforceability of
1080 the remainder of the terms of this Agreement, and without further action
1081 by the parties hereto, such provision shall be reformed to the minimum
1082 extent necessary to make such provision valid and enforceable.
1083
1084 If Recipient institutes patent litigation against any entity (including
1085 a cross-claim or counterclaim in a lawsuit) alleging that the Program
1086 itself (excluding combinations of the Program with other software or
1087 hardware) infringes such Recipient's patent(s), then such Recipient's
1088 rights granted under Section 2(b) shall terminate as of the date such
1089 litigation is filed.
1090
1091 All Recipient's rights under this Agreement shall terminate if it fails
1092 to comply with any of the material terms or conditions of this Agreement
1093 and does not cure such failure in a reasonable period of time after
1094 becoming aware of such noncompliance. If all Recipient's rights under
1095 this Agreement terminate, Recipient agrees to cease use and distribution
1096 of the Program as soon as reasonably practicable. However, Recipient's
1097 obligations under this Agreement and any licenses granted by Recipient
1098 relating to the Program shall continue and survive.
1099
1100 Everyone is permitted to copy and distribute copies of this Agreement,
1101 but in order to avoid inconsistency the Agreement is copyrighted and may
1102 only be modified in the following manner. The Agreement Steward reserves
1103 the right to publish new versions (including revisions) of this Agreement
1104 from time to time. No one other than the Agreement Steward has the right
1105 to modify this Agreement. The Eclipse Foundation is the initial Agreement
1106 Steward. The Eclipse Foundation may assign the responsibility to serve as
1107 the Agreement Steward to a suitable separate entity. Each new version of
1108 the Agreement will be given a distinguishing version number. The Program
1109 (including Contributions) may always be distributed subject to the version
1110 of the Agreement under which it was received. In addition, after a new
1111 version of the Agreement is published, Contributor may elect to distribute
1112 the Program (including its Contributions) under the new version. Except as
1113 expressly stated in Sections 2(a) and 2(b) above, Recipient receives no
1114 rights or licenses to the intellectual property of any Contributor under
1115 this Agreement, whether expressly, by implication, estoppel or otherwise.
1116 All rights in the Program not expressly granted under this Agreement
1117 are reserved.
1118
1119 This Agreement is governed by the laws of the State of New York and the
1120 intellectual property laws of the United States of America. No party to
1121 this Agreement will bring a legal action under this Agreement more than
1122 one year after the cause of action arose. Each party waives its rights to
1123 a jury trial in any resulting litigation.
0 <!--
1 Licensed to the Apache Software Foundation (ASF) under one or more
2 contributor license agreements. See the NOTICE file distributed with
3 this work for additional information regarding copyright ownership.
4 The ASF licenses this file to You under the Apache License, Version 2.0
5 (the "License"); you may not use this file except in compliance with
6 the License. You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10 Unless required by applicable law or agreed to in writing, software
11 distributed under the License is distributed on an "AS IS" BASIS,
12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 See the License for the specific language governing permissions and
14 limitations under the License.
15 -->
<!-- Assembly descriptor with id "standalone": produces a single jar from the unpacked dependency set. -->
<assembly>
  <id>standalone</id>
  <formats>
    <format>jar</format>
  </formats>
  <!-- No base directory is added inside the archive. -->
  <includeBaseDirectory>false</includeBaseDirectory>
  <dependencySets>
    <dependencySet>
      <!-- Empty output directory: unpacked content is placed at the archive root. -->
      <outputDirectory></outputDirectory>
      <unpack>true</unpack>
      <unpackOptions>
        <!-- Manifest, README, NOTICE and LICENSE files of the unpacked artifacts are left out. -->
        <excludes>
          <exclude>META-INF/MANIFEST.MF</exclude>
          <exclude>META-INF/README*</exclude>
          <exclude>META-INF/NOTICE*</exclude>
          <exclude>META-INF/LICENSE*</exclude>
          <exclude>README*</exclude>
          <exclude>NOTICE*</exclude>
          <exclude>LICENSE*</exclude>
        </excludes>
      </unpackOptions>
    </dependencySet>
  </dependencySets>
</assembly>
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.cli;
17
18 import java.io.File;
19 import java.io.FileOutputStream;
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.io.OutputStream;
23 import java.io.OutputStreamWriter;
24 import java.io.PrintStream;
25 import java.io.PrintWriter;
26 import java.io.UnsupportedEncodingException;
27 import java.io.Writer;
28 import java.lang.reflect.Field;
29 import java.net.ServerSocket;
30 import java.net.Socket;
31 import java.net.URI;
32 import java.net.URL;
33 import java.text.NumberFormat;
34 import java.text.ParsePosition;
35 import java.util.Arrays;
36 import java.util.Comparator;
37 import java.util.HashMap;
38 import java.util.HashSet;
39 import java.util.List;
40 import java.util.Map.Entry;
41 import java.util.Map;
42 import java.util.Set;
43 import javax.xml.transform.OutputKeys;
44 import javax.xml.transform.TransformerConfigurationException;
45 import javax.xml.transform.sax.SAXTransformerFactory;
46 import javax.xml.transform.sax.TransformerHandler;
47 import javax.xml.transform.stream.StreamResult;
48
49 import org.apache.commons.logging.Log;
50 import org.apache.commons.logging.LogFactory;
51 import org.apache.log4j.BasicConfigurator;
52 import org.apache.log4j.Level;
53 import org.apache.log4j.Logger;
54 import org.apache.log4j.SimpleLayout;
55 import org.apache.log4j.WriterAppender;
56 import org.apache.poi.poifs.filesystem.DirectoryEntry;
57 import org.apache.poi.poifs.filesystem.DocumentEntry;
58 import org.apache.poi.poifs.filesystem.DocumentInputStream;
59 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
60 import org.apache.tika.Tika;
61 import org.apache.tika.config.TikaConfig;
62 import org.apache.tika.detect.CompositeDetector;
63 import org.apache.tika.detect.DefaultDetector;
64 import org.apache.tika.detect.Detector;
65 import org.apache.tika.exception.TikaException;
66 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
67 import org.apache.tika.fork.ForkParser;
68 import org.apache.tika.gui.TikaGUI;
69 import org.apache.tika.io.CloseShieldInputStream;
70 import org.apache.tika.io.IOUtils;
71 import org.apache.tika.io.TikaInputStream;
72 import org.apache.tika.language.LanguageProfilerBuilder;
73 import org.apache.tika.language.ProfilingHandler;
74 import org.apache.tika.metadata.Metadata;
75 import org.apache.tika.mime.MediaType;
76 import org.apache.tika.mime.MediaTypeRegistry;
77 import org.apache.tika.mime.MimeTypeException;
78 import org.apache.tika.parser.AutoDetectParser;
79 import org.apache.tika.parser.CompositeParser;
80 import org.apache.tika.parser.NetworkParser;
81 import org.apache.tika.parser.ParseContext;
82 import org.apache.tika.parser.Parser;
83 import org.apache.tika.parser.ParserDecorator;
84 import org.apache.tika.parser.PasswordProvider;
85 import org.apache.tika.parser.html.BoilerpipeContentHandler;
86 import org.apache.tika.sax.BodyContentHandler;
87 import org.apache.tika.sax.ExpandedTitleContentHandler;
88 import org.apache.tika.xmp.XMPMetadata;
89 import org.xml.sax.ContentHandler;
90 import org.xml.sax.SAXException;
91 import org.xml.sax.helpers.DefaultHandler;
92 import com.google.gson.Gson;
93 import org.apache.tika.io.FilenameUtils;
94
95 /**
96 * Simple command line interface for Apache Tika.
97 */
98 public class TikaCLI {
/**
 * Directory that embedded documents are written to by the extract option;
 * defaults to the current directory and is replaced by <code>--extract-dir=</code>.
 */
private File extractDir = new File(".");

/** Logger used for warnings raised while saving embedded files. */
private static final Log logger = LogFactory.getLog(TikaCLI.class);
102
103 public static void main(String[] args) throws Exception {
104 BasicConfigurator.configure(
105 new WriterAppender(new SimpleLayout(), System.err));
106 Logger.getRootLogger().setLevel(Level.INFO);
107
108 TikaCLI cli = new TikaCLI();
109 if (args.length > 0) {
110 for (int i = 0; i < args.length; i++) {
111 cli.process(args[i]);
112 }
113 if (cli.pipeMode) {
114 cli.process("-");
115 }
116 } else {
117 // Started with no arguments. Wait for up to 0.1s to see if
118 // we have something waiting in standard input and use the
119 // pipe mode if we have. If no input is seen, start the GUI.
120 if (System.in.available() == 0) {
121 Thread.sleep(100);
122 }
123 if (System.in.available() > 0) {
124 cli.process("-");
125 } else {
126 cli.process("--gui");
127 }
128 }
129 }
130
131 private class OutputType {
132
133 public void process(
134 InputStream input, OutputStream output, Metadata metadata)
135 throws Exception {
136 Parser p = parser;
137 if (fork) {
138 p = new ForkParser(TikaCLI.class.getClassLoader(), p);
139 }
140 ContentHandler handler = getContentHandler(output, metadata);
141 p.parse(input, handler, metadata, context);
142 // fix for TIKA-596: if a parser doesn't generate
143 // XHTML output, the lack of an output document prevents
144 // metadata from being output: this fixes that
145 if (handler instanceof NoDocumentMetHandler){
146 NoDocumentMetHandler metHandler = (NoDocumentMetHandler)handler;
147 if(!metHandler.metOutput()){
148 metHandler.endDocument();
149 }
150 }
151 }
152
153 protected ContentHandler getContentHandler(
154 OutputStream output, Metadata metadata) throws Exception {
155 throw new UnsupportedOperationException();
156 }
157
158 }
159
160 private final OutputType XML = new OutputType() {
161 @Override
162 protected ContentHandler getContentHandler(
163 OutputStream output, Metadata metadata) throws Exception {
164 return getTransformerHandler(output, "xml", encoding, prettyPrint);
165 }
166 };
167
168 private final OutputType HTML = new OutputType() {
169 @Override
170 protected ContentHandler getContentHandler(
171 OutputStream output, Metadata metadata) throws Exception {
172 return new ExpandedTitleContentHandler(getTransformerHandler(output, "html", encoding, prettyPrint));
173 }
174 };
175
176 private final OutputType TEXT = new OutputType() {
177 @Override
178 protected ContentHandler getContentHandler(
179 OutputStream output, Metadata metadata) throws Exception {
180 return new BodyContentHandler(getOutputWriter(output, encoding));
181 }
182 };
183
184 private final OutputType NO_OUTPUT = new OutputType() {
185 @Override
186 protected ContentHandler getContentHandler(
187 OutputStream output, Metadata metadata) {
188 return new DefaultHandler();
189 }
190 };
191
192 private final OutputType TEXT_MAIN = new OutputType() {
193 @Override
194 protected ContentHandler getContentHandler(
195 OutputStream output, Metadata metadata) throws Exception {
196 return new BoilerpipeContentHandler(getOutputWriter(output, encoding));
197 }
198 };
199
200 private final OutputType METADATA = new OutputType() {
201 @Override
202 protected ContentHandler getContentHandler(
203 OutputStream output, Metadata metadata) throws Exception {
204 final PrintWriter writer =
205 new PrintWriter(getOutputWriter(output, encoding));
206 return new NoDocumentMetHandler(metadata, writer);
207 }
208 };
209
210 private final OutputType JSON = new OutputType() {
211 @Override
212 protected ContentHandler getContentHandler(
213 OutputStream output, Metadata metadata) throws Exception {
214 final PrintWriter writer =
215 new PrintWriter(getOutputWriter(output, encoding));
216 return new NoDocumentJSONMetHandler(metadata, writer);
217 }
218 };
219
220 private final OutputType XMP = new OutputType() {
221 @Override
222 protected ContentHandler getContentHandler(
223 OutputStream output, final Metadata metadata) throws Exception {
224 final PrintWriter writer =
225 new PrintWriter(getOutputWriter(output, encoding));
226 return new NoDocumentXMPMetaHandler(metadata, writer);
227 }
228 };
229
230 private final OutputType LANGUAGE = new OutputType() {
231 @Override
232 protected ContentHandler getContentHandler(
233 OutputStream output, Metadata metadata) throws Exception {
234 final PrintWriter writer =
235 new PrintWriter(getOutputWriter(output, encoding));
236 return new ProfilingHandler() {
237 public void endDocument() {
238 writer.println(getLanguage().getLanguage());
239 writer.flush();
240 }
241 };
242 }
243 };
244
245 private final OutputType DETECT = new OutputType() {
246 @Override
247 public void process(
248 InputStream stream, OutputStream output, Metadata metadata)
249 throws Exception {
250 PrintWriter writer =
251 new PrintWriter(getOutputWriter(output, encoding));
252 writer.println(detector.detect(stream, metadata).toString());
253 writer.flush();
254 }
255 };
256
257
258 /* Creates ngram profile */
259 private final OutputType CREATE_PROFILE = new OutputType() {
260 @Override
261 public void process(
262 InputStream stream, OutputStream output, Metadata metadata)
263 throws Exception {
264 ngp = LanguageProfilerBuilder.create(profileName, stream, encoding);
265 FileOutputStream fos = new FileOutputStream(new File(profileName + ".ngp"));
266 ngp.save(fos);//saves ngram profile
267 fos.close();
268 PrintWriter writer = new PrintWriter(getOutputWriter(output, encoding));
269 writer.println("ngram profile location:=" + new File(ngp.getName()).getCanonicalPath());
270 writer.flush();
271 }
272 };
273
/** Parse context handed to every parse call; set up in the constructor. */
private ParseContext context;

/** Detector used for media type detection. */
private Detector detector;

/** Parser used for documents; replaced by a NetworkParser when a client URI is given. */
private Parser parser;

/** Currently selected output type; XML unless an option changes it. */
private OutputType type = XML;

/** Profile builder created by the create-profile output type, or <code>null</code>. */
private LanguageProfilerBuilder ngp = null;

/**
 * Output character encoding, or <code>null</code> for platform default
 */
private String encoding = null;

/**
 * Password for opening encrypted documents, or <code>null</code>.
 */
private String password = System.getenv("TIKA_PASSWORD");

/**
 * Whether standard input should still be processed once all arguments
 * have been handled; cleared by options and arguments that produce
 * their own output.
 */
private boolean pipeMode = true;

/** Set by the server/port options; later plain arguments are then read as port numbers. */
private boolean serverMode = false;

/** Whether parsing is delegated to a ForkParser. */
private boolean fork = false;

/** Name given with <code>--create-profile=</code>, or <code>null</code>. */
private String profileName = null;

/** Whether XML/HTML output is indented. */
private boolean prettyPrint;
303
304 public TikaCLI() throws Exception {
305 context = new ParseContext();
306 detector = new DefaultDetector();
307 parser = new AutoDetectParser(detector);
308 context.set(Parser.class, parser);
309 context.set(PasswordProvider.class, new PasswordProvider() {
310 public String getPassword(Metadata metadata) {
311 return password;
312 }
313 });
314 }
315
316 public void process(String arg) throws Exception {
317 if (arg.equals("-?") || arg.equals("--help")) {
318 pipeMode = false;
319 usage();
320 } else if (arg.equals("-V") || arg.equals("--version")) {
321 pipeMode = false;
322 version();
323 } else if (arg.equals("-v") || arg.equals("--verbose")) {
324 Logger.getRootLogger().setLevel(Level.DEBUG);
325 } else if (arg.equals("-g") || arg.equals("--gui")) {
326 pipeMode = false;
327 TikaGUI.main(new String[0]);
328 } else if (arg.equals("--list-parser") || arg.equals("--list-parsers")) {
329 pipeMode = false;
330 displayParsers(false);
331 } else if (arg.equals("--list-detector") || arg.equals("--list-detectors")) {
332 pipeMode = false;
333 displayDetectors();
334 } else if (arg.equals("--list-parser-detail") || arg.equals("--list-parser-details")) {
335 pipeMode = false;
336 displayParsers(true);
337 } else if(arg.equals("--list-met-models")){
338 pipeMode = false;
339 displayMetModels();
340 } else if(arg.equals("--list-supported-types")){
341 pipeMode = false;
342 displaySupportedTypes();
343 } else if (arg.equals("--container-aware")
344 || arg.equals("--container-aware-detector")) {
345 // ignore, as container-aware detectors are now always used
346 } else if (arg.equals("-f") || arg.equals("--fork")) {
347 fork = true;
348 } else if (arg.startsWith("-e")) {
349 encoding = arg.substring("-e".length());
350 } else if (arg.startsWith("--encoding=")) {
351 encoding = arg.substring("--encoding=".length());
352 } else if (arg.startsWith("-p") && !arg.equals("-p")) {
353 password = arg.substring("-p".length());
354 } else if (arg.startsWith("--password=")) {
355 password = arg.substring("--password=".length());
356 } else if (arg.equals("-j") || arg.equals("--json")) {
357 type = JSON;
358 } else if (arg.equals("-y") || arg.equals("--xmp")) {
359 type = XMP;
360 } else if (arg.equals("-x") || arg.equals("--xml")) {
361 type = XML;
362 } else if (arg.equals("-h") || arg.equals("--html")) {
363 type = HTML;
364 } else if (arg.equals("-t") || arg.equals("--text")) {
365 type = TEXT;
366 } else if (arg.equals("-T") || arg.equals("--text-main")) {
367 type = TEXT_MAIN;
368 } else if (arg.equals("-m") || arg.equals("--metadata")) {
369 type = METADATA;
370 } else if (arg.equals("-l") || arg.equals("--language")) {
371 type = LANGUAGE;
372 } else if (arg.equals("-d") || arg.equals("--detect")) {
373 type = DETECT;
374 } else if (arg.startsWith("--extract-dir=")) {
375 extractDir = new File(arg.substring("--extract-dir=".length()));
376 } else if (arg.equals("-z") || arg.equals("--extract")) {
377 type = NO_OUTPUT;
378 context.set(EmbeddedDocumentExtractor.class, new FileEmbeddedDocumentExtractor());
379 } else if (arg.equals("-r") || arg.equals("--pretty-print")) {
380 prettyPrint = true;
381 } else if (arg.equals("-p") || arg.equals("--port")
382 || arg.equals("-s") || arg.equals("--server")) {
383 serverMode = true;
384 pipeMode = false;
385 } else if (arg.startsWith("-c")) {
386 URI uri = new URI(arg.substring("-c".length()));
387 parser = new NetworkParser(uri);
388 } else if (arg.startsWith("--client=")) {
389 URI uri = new URI(arg.substring("--client=".length()));
390 parser = new NetworkParser(uri);
391 } else if(arg.startsWith("--create-profile=")){
392 profileName = arg.substring("--create-profile=".length());
393 type = CREATE_PROFILE;
394 } else {
395 pipeMode = false;
396 if (serverMode) {
397 new TikaServer(Integer.parseInt(arg)).start();
398 } else if (arg.equals("-")) {
399 InputStream stream =
400 TikaInputStream.get(new CloseShieldInputStream(System.in));
401 try {
402 type.process(stream, System.out, new Metadata());
403 } finally {
404 stream.close();
405 }
406 } else {
407 URL url;
408 File file = new File(arg);
409 if (file.isFile()) {
410 url = file.toURI().toURL();
411 } else {
412 url = new URL(arg);
413 }
414 Metadata metadata = new Metadata();
415 InputStream input = TikaInputStream.get(url, metadata);
416 try {
417 type.process(input, System.out, metadata);
418 } finally {
419 input.close();
420 System.out.flush();
421 }
422 }
423 }
424 }
425
426 private void usage() {
427 PrintStream out = System.out;
428 out.println("usage: java -jar tika-app.jar [option...] [file|port...]");
429 out.println();
430 out.println("Options:");
431 out.println(" -? or --help Print this usage message");
432 out.println(" -v or --verbose Print debug level messages");
433 out.println(" -V or --version Print the Apache Tika version number");
434 out.println();
435 out.println(" -g or --gui Start the Apache Tika GUI");
436 out.println(" -s or --server Start the Apache Tika server");
437 out.println(" -f or --fork Use Fork Mode for out-of-process extraction");
438 out.println();
439 out.println(" -x or --xml Output XHTML content (default)");
440 out.println(" -h or --html Output HTML content");
441 out.println(" -t or --text Output plain text content");
442 out.println(" -T or --text-main Output plain text content (main content only)");
443 out.println(" -m or --metadata Output only metadata");
444 out.println(" -j or --json Output metadata in JSON");
445 out.println(" -y or --xmp Output metadata in XMP");
446 out.println(" -l or --language Output only language");
447 out.println(" -d or --detect Detect document type");
448 out.println(" -eX or --encoding=X Use output encoding X");
449 out.println(" -pX or --password=X Use document password X");
450 out.println(" -z or --extract Extract all attachements into current directory");
451 out.println(" --extract-dir=<dir> Specify target directory for -z");
452 out.println(" -r or --pretty-print For XML and XHTML outputs, adds newlines and");
453 out.println(" whitespace, for better readability");
454 out.println();
455 out.println(" --create-profile=X");
456 out.println(" Create NGram profile, where X is a profile name");
457 out.println(" --list-parsers");
458 out.println(" List the available document parsers");
459 out.println(" --list-parser-details");
460 out.println(" List the available document parsers, and their supported mime types");
461 out.println(" --list-detectors");
462 out.println(" List the available document detectors");
463 out.println(" --list-met-models");
464 out.println(" List the available metadata models, and their supported keys");
465 out.println(" --list-supported-types");
466 out.println(" List all known media types and related information");
467 out.println();
468 out.println("Description:");
469 out.println(" Apache Tika will parse the file(s) specified on the");
470 out.println(" command line and output the extracted text content");
471 out.println(" or metadata to standard output.");
472 out.println();
473 out.println(" Instead of a file name you can also specify the URL");
474 out.println(" of a document to be parsed.");
475 out.println();
476 out.println(" If no file name or URL is specified (or the special");
477 out.println(" name \"-\" is used), then the standard input stream");
478 out.println(" is parsed. If no arguments were given and no input");
479 out.println(" data is available, the GUI is started instead.");
480 out.println();
481 out.println("- GUI mode");
482 out.println();
483 out.println(" Use the \"--gui\" (or \"-g\") option to start the");
484 out.println(" Apache Tika GUI. You can drag and drop files from");
485 out.println(" a normal file explorer to the GUI window to extract");
486 out.println(" text content and metadata from the files.");
487 out.println();
488 out.println("- Server mode");
489 out.println();
490 out.println(" Use the \"--server\" (or \"-s\") option to start the");
491 out.println(" Apache Tika server. The server will listen to the");
492 out.println(" ports you specify as one or more arguments.");
493 out.println();
494 }
495
496 private void version() {
497 System.out.println(new Tika().toString());
498 }
499
500 private void displayMetModels(){
501 Class<?>[] modelClasses = Metadata.class.getInterfaces();
502 Arrays.sort(modelClasses, new Comparator<Class<?>>() {
503 public int compare(Class<?> o1, Class<?> o2) {
504 return o1.getName().compareTo(o2.getName());
505 }
506 });
507
508 for (Class<?> modelClass: modelClasses) {
509 // we don't care about internal Tika met classes
510 // if we do, then we can take this conditional out
511 if (!modelClass.getSimpleName().contains("Tika")) {
512 System.out.println(modelClass.getSimpleName());
513 Field[] keyFields = modelClass.getFields();
514 Arrays.sort(keyFields, new Comparator<Field>() {
515 public int compare(Field o1, Field o2) {
516 return o1.getName().compareTo(o2.getName());
517 }
518 });
519 for (Field keyField: keyFields) {
520 System.out.println(" "+keyField.getName());
521 }
522 }
523 }
524 }
525
526 /*
527 * Displays loaded parsers and their mime types
528 * If a parser is a composite parser, it will list the
529 * sub parsers and their mime-types.
530 */
531 private void displayParsers(boolean includeMimeTypes) {
532 displayParser(parser, includeMimeTypes, 0);
533 }
534
535 private void displayParser(Parser p, boolean includeMimeTypes, int i) {
536 boolean isComposite = (p instanceof CompositeParser);
537 String name = (p instanceof ParserDecorator) ?
538 ((ParserDecorator) p).getWrappedParser().getClass().getName() :
539 p.getClass().getName();
540 System.out.println(indent(i) + name + (isComposite ? " (Composite Parser):" : ""));
541 if (includeMimeTypes && !isComposite) {
542 for (MediaType mt : p.getSupportedTypes(context)) {
543 System.out.println(indent(i+2) + mt);
544 }
545 }
546
547 if (isComposite) {
548 Parser[] subParsers = sortParsers(invertMediaTypeMap(((CompositeParser) p).getParsers()));
549 for(Parser sp : subParsers) {
550 displayParser(sp, includeMimeTypes, i+2);
551 }
552 }
553 }
554
555 /*
556 * Displays loaded detectors and their mime types
557 * If a detector is a composite detector, it will list the
558 * sub detectors.
559 */
560 private void displayDetectors() {
561 displayDetector(detector, 0);
562 }
563
564 private void displayDetector(Detector d, int i) {
565 boolean isComposite = (d instanceof CompositeDetector);
566 String name = d.getClass().getName();
567 System.out.println(indent(i) + name + (isComposite ? " (Composite Detector):" : ""));
568 if (isComposite) {
569 List<Detector> subDetectors = ((CompositeDetector)d).getDetectors();
570 for(Detector sd : subDetectors) {
571 displayDetector(sd, i+2);
572 }
573 }
574 }
575
576 private String indent(int indent) {
577 return " ".substring(0, indent);
578 }
579
580 private Parser[] sortParsers(Map<Parser, Set<MediaType>> parsers) {
581 // Get a nicely sorted list of the parsers
582 Parser[] sortedParsers = parsers.keySet().toArray(new Parser[parsers.size()]);
583 Arrays.sort(sortedParsers, new Comparator<Parser>() {
584 public int compare(Parser p1, Parser p2) {
585 String name1 = p1.getClass().getName();
586 String name2 = p2.getClass().getName();
587 return name1.compareTo(name2);
588 }
589 });
590 return sortedParsers;
591 }
592
593 private Map<Parser, Set<MediaType>> invertMediaTypeMap(Map<MediaType, Parser> supported) {
594 Map<Parser,Set<MediaType>> parsers = new HashMap<Parser, Set<MediaType>>();
595 for(Entry<MediaType, Parser> e : supported.entrySet()) {
596 if (!parsers.containsKey(e.getValue())) {
597 parsers.put(e.getValue(), new HashSet<MediaType>());
598 }
599 parsers.get(e.getValue()).add(e.getKey());
600 }
601 return parsers;
602 }
603
604 /**
605 * Prints all the known media types, aliases and matching parser classes.
606 */
607 private void displaySupportedTypes() {
608 AutoDetectParser parser = new AutoDetectParser();
609 MediaTypeRegistry registry = parser.getMediaTypeRegistry();
610 Map<MediaType, Parser> parsers = parser.getParsers();
611
612 for (MediaType type : registry.getTypes()) {
613 System.out.println(type);
614 for (MediaType alias : registry.getAliases(type)) {
615 System.out.println(" alias: " + alias);
616 }
617 MediaType supertype = registry.getSupertype(type);
618 if (supertype != null) {
619 System.out.println(" supertype: " + supertype);
620 }
621 Parser p = parsers.get(type);
622 if (p != null) {
623 System.out.println(" parser: " + p.getClass().getName());
624 }
625 }
626 }
627
628 /**
629 * Returns a output writer with the given encoding.
630 *
631 * @see <a href="https://issues.apache.org/jira/browse/TIKA-277">TIKA-277</a>
632 * @param output output stream
633 * @param encoding output encoding,
634 * or <code>null</code> for the platform default
635 * @return output writer
636 * @throws UnsupportedEncodingException
637 * if the given encoding is not supported
638 */
639 private static Writer getOutputWriter(OutputStream output, String encoding)
640 throws UnsupportedEncodingException {
641 if (encoding != null) {
642 return new OutputStreamWriter(output, encoding);
643 } else if (System.getProperty("os.name")
644 .toLowerCase().startsWith("mac os x")) {
645 // TIKA-324: Override the default encoding on Mac OS X
646 return new OutputStreamWriter(output, "UTF-8");
647 } else {
648 return new OutputStreamWriter(output);
649 }
650 }
651
652 /**
653 * Returns a transformer handler that serializes incoming SAX events
654 * to XHTML or HTML (depending the given method) using the given output
655 * encoding.
656 *
657 * @see <a href="https://issues.apache.org/jira/browse/TIKA-277">TIKA-277</a>
658 * @param output output stream
659 * @param method "xml" or "html"
660 * @param encoding output encoding,
661 * or <code>null</code> for the platform default
662 * @return {@link System#out} transformer handler
663 * @throws TransformerConfigurationException
664 * if the transformer can not be created
665 */
666 private static TransformerHandler getTransformerHandler(
667 OutputStream output, String method, String encoding, boolean prettyPrint)
668 throws TransformerConfigurationException {
669 SAXTransformerFactory factory = (SAXTransformerFactory)
670 SAXTransformerFactory.newInstance();
671 TransformerHandler handler = factory.newTransformerHandler();
672 handler.getTransformer().setOutputProperty(OutputKeys.METHOD, method);
673 handler.getTransformer().setOutputProperty(OutputKeys.INDENT, prettyPrint ? "yes" : "no");
674 if (encoding != null) {
675 handler.getTransformer().setOutputProperty(
676 OutputKeys.ENCODING, encoding);
677 }
678 handler.setResult(new StreamResult(output));
679 return handler;
680 }
681
682 private class FileEmbeddedDocumentExtractor
683 implements EmbeddedDocumentExtractor {
684
685 private int count = 0;
686 private final TikaConfig config = TikaConfig.getDefaultConfig();
687
688 public boolean shouldParseEmbedded(Metadata metadata) {
689 return true;
690 }
691
692 public void parseEmbedded(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, boolean outputHtml) throws SAXException, IOException {
693 String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
694
695 if (name == null) {
696 name = "file" + count++;
697 }
698
699 MediaType contentType = detector.detect(inputStream, metadata);
700
701 if (name.indexOf('.')==-1 && contentType!=null) {
702 try {
703 name += config.getMimeRepository().forName(
704 contentType.toString()).getExtension();
705 } catch (MimeTypeException e) {
706 e.printStackTrace();
707 }
708 }
709
710 String relID = metadata.get(Metadata.EMBEDDED_RELATIONSHIP_ID);
711 if (relID != null && !name.startsWith(relID)) {
712 name = relID + "_" + name;
713 }
714
715 File outputFile = new File(extractDir, FilenameUtils.normalize(name));
716 File parent = outputFile.getParentFile();
717 if (!parent.exists()) {
718 if (!parent.mkdirs()) {
719 throw new IOException("unable to create directory \"" + parent + "\"");
720 }
721 }
722 System.out.println("Extracting '"+name+"' ("+contentType+") to " + outputFile);
723
724 FileOutputStream os = null;
725
726 try {
727 os = new FileOutputStream(outputFile);
728
729 if (inputStream instanceof TikaInputStream) {
730 TikaInputStream tin = (TikaInputStream) inputStream;
731
732 if (tin.getOpenContainer() != null && tin.getOpenContainer() instanceof DirectoryEntry) {
733 POIFSFileSystem fs = new POIFSFileSystem();
734 copy((DirectoryEntry) tin.getOpenContainer(), fs.getRoot());
735 fs.writeFilesystem(os);
736 } else {
737 IOUtils.copy(inputStream, os);
738 }
739 } else {
740 IOUtils.copy(inputStream, os);
741 }
742 } catch (Exception e) {
743 //
744 // being a CLI program messages should go to the stderr too
745 //
746 String msg = String.format(
747 "Ignoring unexpected exception trying to save embedded file %s (%s)",
748 name,
749 e.getMessage()
750 );
751 System.err.println(msg);
752 logger.warn(msg, e);
753 } finally {
754 if (os != null) {
755 os.close();
756 }
757 }
758 }
759
760 protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir)
761 throws IOException {
762 for (org.apache.poi.poifs.filesystem.Entry entry : sourceDir) {
763 if (entry instanceof DirectoryEntry) {
764 // Need to recurse
765 DirectoryEntry newDir = destDir.createDirectory(entry.getName());
766 copy((DirectoryEntry) entry, newDir);
767 } else {
768 // Copy entry
769 InputStream contents = new DocumentInputStream((DocumentEntry) entry);
770 try {
771 destDir.createDocument(entry.getName(), contents);
772 } finally {
773 contents.close();
774 }
775 }
776 }
777 }
778 }
779
780 private class TikaServer extends Thread {
781
782 private final ServerSocket server;
783
784 public TikaServer(int port) throws IOException {
785 super("Tika server at port " + port);
786 server = new ServerSocket(port);
787 }
788
789 @Override
790 public void run() {
791 try {
792 try {
793 while (true) {
794 processSocketInBackground(server.accept());
795 }
796 } finally {
797 server.close();
798 }
799 } catch (IOException e) {
800 e.printStackTrace();
801 }
802 }
803
804 private void processSocketInBackground(final Socket socket) {
805 Thread thread = new Thread() {
806 @Override
807 public void run() {
808 try {
809 try {
810 InputStream rawInput = socket.getInputStream();
811 OutputStream output = socket.getOutputStream();
812 InputStream input = TikaInputStream.get(rawInput);
813 type.process(input, output, new Metadata());
814 output.flush();
815 } finally {
816 socket.close();
817 }
818 } catch (Exception e) {
819 e.printStackTrace();
820 }
821 }
822 };
823 thread.setDaemon(true);
824 thread.start();
825 }
826
827 }
828
829 private class NoDocumentMetHandler extends DefaultHandler {
830
831 protected final Metadata metadata;
832
833 protected PrintWriter writer;
834
835 private boolean metOutput;
836
837 public NoDocumentMetHandler(Metadata metadata, PrintWriter writer){
838 this.metadata = metadata;
839 this.writer = writer;
840 this.metOutput = false;
841 }
842
843 @Override
844 public void endDocument() {
845 String[] names = metadata.names();
846 Arrays.sort(names);
847 outputMetadata(names);
848 writer.flush();
849 this.metOutput = true;
850 }
851
852 public void outputMetadata(String[] names) {
853 for (String name : names) {
854 for(String value : metadata.getValues(name)) {
855 writer.println(name + ": " + value);
856 }
857 }
858 }
859
860 public boolean metOutput(){
861 return this.metOutput;
862 }
863
864 }
865
866 /**
867 * Outputs the Tika metadata as XMP using the Tika XMP module
868 */
869 private class NoDocumentXMPMetaHandler extends DefaultHandler
870 {
871 protected final Metadata metadata;
872
873 protected PrintWriter writer;
874
875 public NoDocumentXMPMetaHandler(Metadata metadata, PrintWriter writer){
876 this.metadata = metadata;
877 this.writer = writer;
878 }
879
880 @Override
881 public void endDocument() throws SAXException
882 {
883 try
884 {
885 XMPMetadata xmp = new XMPMetadata(metadata);
886 String result;
887 result = xmp.toString();
888 writer.write(result);
889 writer.flush();
890 }
891 catch (TikaException e)
892 {
893 throw new SAXException(e);
894 }
895 }
896 }
897
898 /**
899 * Uses GSON to do the JSON escaping, but does
900 * the general JSON glueing ourselves.
901 */
902 private class NoDocumentJSONMetHandler extends NoDocumentMetHandler {
903 private NumberFormat formatter;
904 private Gson gson;
905
906 public NoDocumentJSONMetHandler(Metadata metadata, PrintWriter writer){
907 super(metadata, writer);
908
909 formatter = NumberFormat.getInstance();
910 gson = new Gson();
911 }
912
913 @Override
914 public void outputMetadata(String[] names) {
915 writer.print("{ ");
916 boolean first = true;
917 for (String name : names) {
918 if(! first) {
919 writer.println(", ");
920 } else {
921 first = false;
922 }
923 gson.toJson(name, writer);
924 writer.print(":");
925 outputValues(metadata.getValues(name));
926 }
927 writer.print(" }");
928 }
929
930 public void outputValues(String[] values) {
931 if(values.length > 1) {
932 writer.print("[");
933 }
934 for(int i=0; i<values.length; i++) {
935 String value = values[i];
936 if(i > 0) {
937 writer.print(", ");
938 }
939
940 if(value == null || value.length() == 0) {
941 writer.print("null");
942 } else {
943 // Is it a number?
944 ParsePosition pos = new ParsePosition(0);
945 formatter.parse(value, pos);
946 if(value.length() == pos.getIndex()) {
947 // It's a number. Remove leading zeros and output
948 value = value.replaceFirst("^0+(\\d)", "$1");
949 writer.print(value);
950 } else {
951 // Not a number, escape it
952 gson.toJson(value, writer);
953 }
954 }
955 }
956 if(values.length > 1) {
957 writer.print("]");
958 }
959 }
960 }
961 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.gui;
17
18 import java.awt.datatransfer.Clipboard;
19 import java.awt.datatransfer.DataFlavor;
20 import java.awt.datatransfer.Transferable;
21 import java.awt.event.InputEvent;
22 import java.io.File;
23 import java.net.URI;
24 import java.net.URL;
25 import java.util.ArrayList;
26 import java.util.List;
27 import java.util.StringTokenizer;
28
29 import javax.swing.Icon;
30 import javax.swing.JComponent;
31 import javax.swing.TransferHandler;
32
33 /**
34 * Utility class that turns drag-and-drop events into Tika parse requests.
35 */
36 class ParsingTransferHandler extends TransferHandler {
37
38 /**
39 * Serial version UID.
40 */
41 private static final long serialVersionUID = -557932290014044494L;
42
43 private final TransferHandler delegate;
44
45 private final TikaGUI tika;
46
47 private static DataFlavor uriListFlavor;
48 private static DataFlavor urlListFlavor;
49 static {
50 try {
51 uriListFlavor = new DataFlavor("text/uri-list;class=java.lang.String");
52 urlListFlavor = new DataFlavor("text/plain;class=java.lang.String");
53 } catch (ClassNotFoundException e) {
54 }
55 }
56
57 public ParsingTransferHandler(TransferHandler delegate, TikaGUI tika) {
58 this.delegate = delegate;
59 this.tika = tika;
60 }
61
62 public boolean canImport(JComponent component, DataFlavor[] flavors) {
63 for (DataFlavor flavor : flavors) {
64 if (flavor.equals(DataFlavor.javaFileListFlavor) || flavor.equals(uriListFlavor) || flavor.equals(urlListFlavor)) {
65 return true;
66 }
67 }
68 return false;
69 }
70
71 @SuppressWarnings("unchecked")
72 public boolean importData(
73 JComponent component, Transferable transferable) {
74 try {
75 if (transferable.isDataFlavorSupported(DataFlavor.javaFileListFlavor)) {
76 importFiles((List<File>) transferable.getTransferData(
77 DataFlavor.javaFileListFlavor));
78 } else if (transferable.isDataFlavorSupported(urlListFlavor)) {
79 Object data = transferable.getTransferData(urlListFlavor);
80 tika.openURL(new URL(data.toString()));
81 } else if (transferable.isDataFlavorSupported(uriListFlavor)) {
82 importFiles(uriToFileList(
83 transferable.getTransferData(uriListFlavor)));
84 }
85 return true;
86 } catch (Exception e) {
87 return false;
88 }
89 }
90
91 private void importFiles(List<File> files) {
92 for (File file : files) {
93 tika.openFile(file);
94 }
95 }
96
97 public void exportAsDrag(JComponent arg0, InputEvent arg1, int arg2) {
98 delegate.exportAsDrag(arg0, arg1, arg2);
99 }
100
101 public void exportToClipboard(JComponent arg0, Clipboard arg1, int arg2)
102 throws IllegalStateException {
103 delegate.exportToClipboard(arg0, arg1, arg2);
104 }
105
106 public int getSourceActions(JComponent arg0) {
107 return delegate.getSourceActions(arg0);
108 }
109
110 public Icon getVisualRepresentation(Transferable arg0) {
111 return delegate.getVisualRepresentation(arg0);
112 }
113
114 private static List<File> uriToFileList(Object data) {
115 List<File> list = new ArrayList<File>();
116 StringTokenizer st = new StringTokenizer(data.toString(), "\r\n");
117 while (st.hasMoreTokens())
118 {
119 String s = st.nextToken();
120 if (s.startsWith("#")) {
121 continue;
122 }
123 try {
124 list.add(new File(new URI(s)));
125 } catch (Exception e) {
126 }
127 }
128 return list;
129 }
130 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.gui;
17
18 import java.awt.CardLayout;
19 import java.awt.Color;
20 import java.awt.Dimension;
21 import java.awt.Toolkit;
22 import java.awt.event.ActionEvent;
23 import java.awt.event.ActionListener;
24 import java.awt.event.KeyEvent;
25 import java.awt.event.WindowEvent;
26 import java.io.File;
27 import java.io.FileOutputStream;
28 import java.io.IOException;
29 import java.io.InputStream;
30 import java.io.PrintWriter;
31 import java.io.StringWriter;
32 import java.io.Writer;
33 import java.net.MalformedURLException;
34 import java.net.URL;
35 import java.util.Arrays;
36 import java.util.HashMap;
37 import java.util.Map;
38 import java.util.Set;
39
40 import javax.swing.Box;
41 import javax.swing.JDialog;
42 import javax.swing.JEditorPane;
43 import javax.swing.JFileChooser;
44 import javax.swing.JFrame;
45 import javax.swing.JMenu;
46 import javax.swing.JMenuBar;
47 import javax.swing.JMenuItem;
48 import javax.swing.JOptionPane;
49 import javax.swing.JPanel;
50 import javax.swing.JScrollPane;
51 import javax.swing.JTextPane;
52 import javax.swing.ProgressMonitorInputStream;
53 import javax.swing.SwingUtilities;
54 import javax.swing.UIManager;
55 import javax.swing.event.HyperlinkEvent;
56 import javax.swing.event.HyperlinkEvent.EventType;
57 import javax.swing.event.HyperlinkListener;
58 import javax.xml.transform.OutputKeys;
59 import javax.xml.transform.TransformerConfigurationException;
60 import javax.xml.transform.sax.SAXTransformerFactory;
61 import javax.xml.transform.sax.TransformerHandler;
62 import javax.xml.transform.stream.StreamResult;
63
64 import org.apache.tika.exception.TikaException;
65 import org.apache.tika.extractor.DocumentSelector;
66 import org.apache.tika.io.IOUtils;
67 import org.apache.tika.io.TikaInputStream;
68 import org.apache.tika.metadata.Metadata;
69 import org.apache.tika.mime.MediaType;
70 import org.apache.tika.parser.AbstractParser;
71 import org.apache.tika.parser.AutoDetectParser;
72 import org.apache.tika.parser.ParseContext;
73 import org.apache.tika.parser.Parser;
74 import org.apache.tika.parser.html.BoilerpipeContentHandler;
75 import org.apache.tika.sax.BodyContentHandler;
76 import org.apache.tika.sax.ContentHandlerDecorator;
77 import org.apache.tika.sax.TeeContentHandler;
78 import org.apache.tika.sax.XHTMLContentHandler;
79 import org.xml.sax.Attributes;
80 import org.xml.sax.ContentHandler;
81 import org.xml.sax.SAXException;
82 import org.xml.sax.helpers.AttributesImpl;
83
84 /**
85 * Simple Swing GUI for Apache Tika. You can drag and drop files on top
86 * of the window to have them parsed.
87 */
88 public class TikaGUI extends JFrame
89 implements ActionListener, HyperlinkListener {
90
91 /**
92 * Serial version UID.
93 */
94 private static final long serialVersionUID = 5883906936187059495L;
95
96 /**
97 * Main method. Sets the Swing look and feel to the operating system
98 * settings, and starts the Tika GUI with an {@link AutoDetectParser}
99 * instance as the default parser.
100 *
101 * @param args ignored
102 * @throws Exception if an error occurs
103 */
104 public static void main(String[] args) throws Exception {
105 UIManager.setLookAndFeel(UIManager.getSystemLookAndFeelClassName());
106 SwingUtilities.invokeLater(new Runnable() {
107 public void run() {
108 new TikaGUI(new AutoDetectParser()).setVisible(true);
109 }
110 });
111 }
112
113 /**
114 * Parsing context.
115 */
116 private final ParseContext context;
117
118 /**
119 * Configured parser instance.
120 */
121 private final Parser parser;
122
123 /**
124 * Captures requested embedded images
125 */
126 private final ImageSavingParser imageParser;
127
128 /**
129 * The card layout for switching between different views.
130 */
131 private final CardLayout layout = new CardLayout();
132
133 /**
134 * Container for the editor cards.
135 */
136 private final JPanel cards;
137
138 /**
139 * Formatted XHTML output.
140 */
141 private final JEditorPane html;
142
143 /**
144 * Plain text output.
145 */
146 private final JEditorPane text;
147
148 /**
149 * Main content output.
150 */
151 private final JEditorPane textMain;
152
153 /**
154 * Raw XHTML source.
155 */
156 private final JEditorPane xml;
157
158 /**
159 * Document metadata.
160 */
161 private final JEditorPane metadata;
162
163 /**
164 * File chooser.
165 */
166 private final JFileChooser chooser = new JFileChooser();
167
/**
 * Builds the main window: menu bar, one card per view, and a parse
 * context that routes embedded images through the image-saving parser.
 *
 * @param parser parser used for all opened documents
 */
public TikaGUI(Parser parser) {
    super("Apache Tika");
    setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);

    // Parsing side: embedded images are selected and saved on request
    this.parser = parser;
    this.imageParser = new ImageSavingParser(parser);
    this.context = new ParseContext();
    this.context.set(DocumentSelector.class, new ImageDocumentSelector());
    this.context.set(Parser.class, imageParser);

    // UI side: menu first, then the cards in their fixed order
    addMenuBar();

    this.cards = new JPanel(layout);
    addWelcomeCard(cards, "welcome");
    this.metadata = addCard(cards, "text/plain", "metadata");
    this.html = addCard(cards, "text/html", "html");
    this.text = addCard(cards, "text/plain", "text");
    this.textMain = addCard(cards, "text/plain", "main");
    this.xml = addCard(cards, "text/plain", "xhtml");
    add(cards);
    layout.show(cards, "welcome");

    setPreferredSize(new Dimension(640, 480));
    pack();
}
194
195 private void addMenuBar() {
196 JMenuBar bar = new JMenuBar();
197
198 JMenu file = new JMenu("File");
199 file.setMnemonic(KeyEvent.VK_F);
200 addMenuItem(file, "Open...", "openfile", KeyEvent.VK_O);
201 addMenuItem(file, "Open URL...", "openurl", KeyEvent.VK_U);
202 file.addSeparator();
203 addMenuItem(file, "Exit", "exit", KeyEvent.VK_X);
204 bar.add(file);
205
206 JMenu view = new JMenu("View");
207 view.setMnemonic(KeyEvent.VK_V);
208 addMenuItem(view, "Metadata", "metadata", KeyEvent.VK_M);
209 addMenuItem(view, "Formatted text", "html", KeyEvent.VK_F);
210 addMenuItem(view, "Plain text", "text", KeyEvent.VK_P);
211 addMenuItem(view, "Main content", "main", KeyEvent.VK_C);
212 addMenuItem(view, "Structured text", "xhtml", KeyEvent.VK_S);
213 bar.add(view);
214
215 bar.add(Box.createHorizontalGlue());
216 JMenu help = new JMenu("Help");
217 help.setMnemonic(KeyEvent.VK_H);
218 addMenuItem(help, "About Tika", "about", KeyEvent.VK_A);
219 bar.add(help);
220
221 setJMenuBar(bar);
222 }
223
224 private void addMenuItem(
225 JMenu menu, String title, String command, int key) {
226 JMenuItem item = new JMenuItem(title, key);
227 item.setActionCommand(command);
228 item.addActionListener(this);
229 menu.add(item);
230 }
231
232 public void actionPerformed(ActionEvent e) {
233 String command = e.getActionCommand();
234 if ("openfile".equals(command)) {
235 int rv = chooser.showOpenDialog(this);
236 if (rv == JFileChooser.APPROVE_OPTION) {
237 openFile(chooser.getSelectedFile());
238 }
239 } else if ("openurl".equals(command)) {
240 Object rv = JOptionPane.showInputDialog(
241 this, "Enter the URL of the resource to be parsed:",
242 "Open URL", JOptionPane.PLAIN_MESSAGE,
243 null, null, "");
244 if (rv != null && rv.toString().length() > 0) {
245 try {
246 openURL(new URL(rv.toString().trim()));
247 } catch (MalformedURLException exception) {
248 JOptionPane.showMessageDialog(
249 this, "The given string is not a valid URL",
250 "Invalid URL", JOptionPane.ERROR_MESSAGE);
251 }
252 }
253 } else if ("html".equals(command)) {
254 layout.show(cards, command);
255 } else if ("text".equals(command)) {
256 layout.show(cards, command);
257 } else if ("main".equals(command)) {
258 layout.show(cards, command);
259 } else if ("xhtml".equals(command)) {
260 layout.show(cards, command);
261 } else if ("metadata".equals(command)) {
262 layout.show(cards, command);
263 } else if ("about".equals(command)) {
264 textDialog(
265 "About Apache Tika",
266 TikaGUI.class.getResource("about.html"));
267 } else if ("exit".equals(command)) {
268 Toolkit.getDefaultToolkit().getSystemEventQueue().postEvent(
269 new WindowEvent(this, WindowEvent.WINDOW_CLOSING));
270 }
271 }
272
273 public void openFile(File file) {
274 try {
275 Metadata metadata = new Metadata();
276 TikaInputStream stream = TikaInputStream.get(file, metadata);
277 try {
278 handleStream(stream, metadata);
279 } finally {
280 stream.close();
281 }
282 } catch (Throwable t) {
283 handleError(file.getPath(), t);
284 }
285 }
286
287 public void openURL(URL url) {
288 try {
289 Metadata metadata = new Metadata();
290 TikaInputStream stream = TikaInputStream.get(url, metadata);
291 try {
292 handleStream(stream, metadata);
293 } finally {
294 stream.close();
295 }
296 } catch (Throwable t) {
297 handleError(url.toString(), t);
298 }
299 }
300
301 private void handleStream(InputStream input, Metadata md)
302 throws Exception {
303 StringWriter htmlBuffer = new StringWriter();
304 StringWriter textBuffer = new StringWriter();
305 StringWriter textMainBuffer = new StringWriter();
306 StringWriter xmlBuffer = new StringWriter();
307 StringBuilder metadataBuffer = new StringBuilder();
308
309 ContentHandler handler = new TeeContentHandler(
310 getHtmlHandler(htmlBuffer),
311 getTextContentHandler(textBuffer),
312 getTextMainContentHandler(textMainBuffer),
313 getXmlContentHandler(xmlBuffer));
314
315 context.set(DocumentSelector.class, new ImageDocumentSelector());
316
317 input = new ProgressMonitorInputStream(
318 this, "Parsing stream", input);
319 parser.parse(input, handler, md, context);
320
321 String[] names = md.names();
322 Arrays.sort(names);
323 for (String name : names) {
324 metadataBuffer.append(name);
325 metadataBuffer.append(": ");
326 metadataBuffer.append(md.get(name));
327 metadataBuffer.append("\n");
328 }
329
330 String name = md.get(Metadata.RESOURCE_NAME_KEY);
331 if (name != null && name.length() > 0) {
332 setTitle("Apache Tika: " + name);
333 } else {
334 setTitle("Apache Tika: unnamed document");
335 }
336
337 setText(metadata, metadataBuffer.toString());
338 setText(xml, xmlBuffer.toString());
339 setText(text, textBuffer.toString());
340 setText(textMain, textMainBuffer.toString());
341 setText(html, htmlBuffer.toString());
342 layout.show(cards, "metadata");
343 }
344
345 private void handleError(String name, Throwable t) {
346 StringWriter writer = new StringWriter();
347 writer.append("Apache Tika was unable to parse the document\n");
348 writer.append("at " + name + ".\n\n");
349 writer.append("The full exception stack trace is included below:\n\n");
350 t.printStackTrace(new PrintWriter(writer));
351
352 JEditorPane editor =
353 new JEditorPane("text/plain", writer.toString());
354 editor.setEditable(false);
355 editor.setBackground(Color.WHITE);
356 editor.setCaretPosition(0);
357 editor.setPreferredSize(new Dimension(600, 400));
358
359 JDialog dialog = new JDialog(this, "Apache Tika error");
360 dialog.add(new JScrollPane(editor));
361 dialog.pack();
362 dialog.setVisible(true);
363 }
364
365 private void addWelcomeCard(JPanel panel, String name) {
366 try {
367 JEditorPane editor =
368 new JEditorPane(TikaGUI.class.getResource("welcome.html"));
369 editor.setContentType("text/html");
370 editor.setEditable(false);
371 editor.setBackground(Color.WHITE);
372 editor.setTransferHandler(new ParsingTransferHandler(
373 editor.getTransferHandler(), this));
374 panel.add(new JScrollPane(editor), name);
375 } catch (IOException e) {
376 e.printStackTrace();
377 }
378 }
379
380 private JEditorPane addCard(JPanel panel, String type, String name) {
381 JEditorPane editor = new JTextPane();
382 editor.setBackground(Color.WHITE);
383 editor.setContentType(type);
384 editor.setTransferHandler(new ParsingTransferHandler(
385 editor.getTransferHandler(), this));
386 panel.add(new JScrollPane(editor), name);
387 return editor;
388 }
389
390 private void textDialog(String title, URL resource) {
391 try {
392 JDialog dialog = new JDialog(this, title);
393 JEditorPane editor = new JEditorPane(resource);
394 editor.setContentType("text/html");
395 editor.setEditable(false);
396 editor.setBackground(Color.WHITE);
397 editor.setPreferredSize(new Dimension(400, 250));
398 editor.addHyperlinkListener(this);
399 dialog.add(editor);
400 dialog.pack();
401 dialog.setVisible(true);
402 } catch (IOException e) {
403 e.printStackTrace();
404 }
405 }
406
407 public void hyperlinkUpdate(HyperlinkEvent e) {
408 if (e.getEventType() == EventType.ACTIVATED) {
409 try {
410 URL url = e.getURL();
411 InputStream stream = url.openStream();
412 try {
413 StringWriter writer = new StringWriter();
414 IOUtils.copy(stream, writer, "UTF-8");
415
416 JEditorPane editor =
417 new JEditorPane("text/plain", writer.toString());
418 editor.setEditable(false);
419 editor.setBackground(Color.WHITE);
420 editor.setCaretPosition(0);
421 editor.setPreferredSize(new Dimension(600, 400));
422
423 String name = url.toString();
424 name = name.substring(name.lastIndexOf('/') + 1);
425
426 JDialog dialog = new JDialog(this, "Apache Tika: " + name);
427 dialog.add(new JScrollPane(editor));
428 dialog.pack();
429 dialog.setVisible(true);
430 } finally {
431 stream.close();
432 }
433 } catch (IOException exception) {
434 exception.printStackTrace();
435 }
436 }
437 }
438
439 private void setText(JEditorPane editor, String text) {
440 editor.setText(text);
441 editor.setCaretPosition(0);
442 }
443
444 /**
445 * Creates and returns a content handler that turns XHTML input to
446 * simplified HTML output that can be correctly parsed and displayed
447 * by {@link JEditorPane}.
448 * <p>
449 * The returned content handler is set to output <code>html</code>
450 * to the given writer. The XHTML namespace is removed from the output
451 * to prevent the serializer from using the &lt;tag/&gt; empty element
452 * syntax that causes extra "&gt;" characters to be displayed.
453 * The &lt;head&gt; tags are dropped to prevent the serializer from
454 * generating a &lt;META&gt; content type tag that makes
455 * {@link JEditorPane} fail thinking that the document character set
456 * is inconsistent.
457 * <p>
458 * Additionally, it will use ImageSavingParser to re-write embedded:(image)
459 * image links to be file:///(temporary file) so that they can be loaded.
460 *
461 * @param writer output writer
462 * @return HTML content handler
463 * @throws TransformerConfigurationException if an error occurs
464 */
465 private ContentHandler getHtmlHandler(Writer writer)
466 throws TransformerConfigurationException {
467 SAXTransformerFactory factory = (SAXTransformerFactory)
468 SAXTransformerFactory.newInstance();
469 TransformerHandler handler = factory.newTransformerHandler();
470 handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
471 handler.setResult(new StreamResult(writer));
472 return new ContentHandlerDecorator(handler) {
473 @Override
474 public void startElement(
475 String uri, String localName, String name, Attributes atts)
476 throws SAXException {
477 if (XHTMLContentHandler.XHTML.equals(uri)) {
478 uri = null;
479 }
480 if (!"head".equals(localName)) {
481 if("img".equals(localName)) {
482 AttributesImpl newAttrs;
483 if(atts instanceof AttributesImpl) {
484 newAttrs = (AttributesImpl)atts;
485 } else {
486 newAttrs = new AttributesImpl(atts);
487 }
488
489 for(int i=0; i<newAttrs.getLength(); i++) {
490 if("src".equals(newAttrs.getLocalName(i))) {
491 String src = newAttrs.getValue(i);
492 if(src.startsWith("embedded:")) {
493 String filename = src.substring(src.indexOf(':')+1);
494 try {
495 File img = imageParser.requestSave(filename);
496 String newSrc = img.toURI().toString();
497 newAttrs.setValue(i, newSrc);
498 } catch(IOException e) {
499 System.err.println("Error creating temp image file " + filename);
500 // The html viewer will show a broken image too to alert them
501 }
502 }
503 }
504 }
505 super.startElement(uri, localName, name, newAttrs);
506 } else {
507 super.startElement(uri, localName, name, atts);
508 }
509 }
510 }
511 @Override
512 public void endElement(String uri, String localName, String name)
513 throws SAXException {
514 if (XHTMLContentHandler.XHTML.equals(uri)) {
515 uri = null;
516 }
517 if (!"head".equals(localName)) {
518 super.endElement(uri, localName, name);
519 }
520 }
521 @Override
522 public void startPrefixMapping(String prefix, String uri) {
523 }
524 @Override
525 public void endPrefixMapping(String prefix) {
526 }
527 };
528 }
529
530 private ContentHandler getTextContentHandler(Writer writer) {
531 return new BodyContentHandler(writer);
532 }
533 private ContentHandler getTextMainContentHandler(Writer writer) {
534 return new BoilerpipeContentHandler(writer);
535 }
536
537 private ContentHandler getXmlContentHandler(Writer writer)
538 throws TransformerConfigurationException {
539 SAXTransformerFactory factory = (SAXTransformerFactory)
540 SAXTransformerFactory.newInstance();
541 TransformerHandler handler = factory.newTransformerHandler();
542 handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
543 handler.setResult(new StreamResult(writer));
544 return handler;
545 }
546
547 /**
548 * A {@link DocumentSelector} that accepts only images.
549 */
550 private static class ImageDocumentSelector implements DocumentSelector {
551 public boolean select(Metadata metadata) {
552 String type = metadata.get(Metadata.CONTENT_TYPE);
553 return type != null && type.startsWith("image/");
554 }
555 }
556
557 /**
558 * A recursive parser that saves certain images into the temporary
559 * directory, and delegates everything else to another downstream
560 * parser.
561 */
562 private static class ImageSavingParser extends AbstractParser {
563 private Map<String,File> wanted = new HashMap<String,File>();
564 private Parser downstreamParser;
565 private File tmpDir;
566
567 private ImageSavingParser(Parser downstreamParser) {
568 this.downstreamParser = downstreamParser;
569
570 try {
571 File t = File.createTempFile("tika", ".test");
572 tmpDir = t.getParentFile();
573 } catch(IOException e) {}
574 }
575
576 public File requestSave(String embeddedName) throws IOException {
577 String suffix = ".tika";
578
579 int splitAt = embeddedName.lastIndexOf('.');
580 if (splitAt > 0) {
581 embeddedName.substring(splitAt);
582 }
583
584 File tmp = File.createTempFile("tika-embedded-", suffix);
585 wanted.put(embeddedName, tmp);
586 return tmp;
587 }
588
589 public Set<MediaType> getSupportedTypes(ParseContext context) {
590 // Never used in an auto setup
591 return null;
592 }
593
594 public void parse(InputStream stream, ContentHandler handler,
595 Metadata metadata, ParseContext context) throws IOException,
596 SAXException, TikaException {
597 String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
598 if(name != null && wanted.containsKey(name)) {
599 FileOutputStream out = new FileOutputStream(wanted.get(name));
600 IOUtils.copy(stream, out);
601 out.close();
602 } else {
603 if(downstreamParser != null) {
604 downstreamParser.parse(stream, handler, metadata, context);
605 }
606 }
607 }
608
609 }
610
611 }
0 <!--
1 Licensed to the Apache Software Foundation (ASF) under one
2 or more contributor license agreements. See the NOTICE file
3 distributed with this work for additional information
4 regarding copyright ownership. The ASF licenses this file
5 to you under the Apache License, Version 2.0 (the
6 "License"); you may not use this file except in compliance
7 with the License. You may obtain a copy of the License at
8
9 http://www.apache.org/licenses/LICENSE-2.0
10
11 Unless required by applicable law or agreed to in writing,
12 software distributed under the License is distributed on an
13 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 KIND, either express or implied. See the License for the
15 specific language governing permissions and limitations
16 under the License.
17 -->
18 <html>
19 <body>
20 <center>
21 <p><img src="tika.png" width="292" height="100"></p>
22
23 <p>Apache Tika&trade; ${project.version} is a toolkit for
24 detecting and extracting metadata and structured text content
25 from various documents using existing parser libraries.</p>
26
27 <p><a href="/META-INF/NOTICE">Copyright notice</a>
28 - <a href="/META-INF/LICENSE">License terms</a></p>
29 </center>
30 </body>
31 </html>
0 <!--
1 Licensed to the Apache Software Foundation (ASF) under one
2 or more contributor license agreements. See the NOTICE file
3 distributed with this work for additional information
4 regarding copyright ownership. The ASF licenses this file
5 to you under the Apache License, Version 2.0 (the
6 "License"); you may not use this file except in compliance
7 with the License. You may obtain a copy of the License at
8
9 http://www.apache.org/licenses/LICENSE-2.0
10
11 Unless required by applicable law or agreed to in writing,
12 software distributed under the License is distributed on an
13 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 KIND, either express or implied. See the License for the
15 specific language governing permissions and limitations
16 under the License.
17 -->
18 <html>
19 <body>
20 <center>
21 <p><img src="tika.png" width="292" height="100"></p>
22 <p>Welcome to Apache Tika version ${project.version}!</p>
23 <p>To see what Tika can do, just drop<br>
24 a file or a URL to this window.<br>
25 Use the View menu to switch views.</p>
26 </center>
27 </body>
28 </html>
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.cli;
17
18 import java.io.ByteArrayOutputStream;
19 import java.io.File;
20 import java.io.PrintStream;
21 import java.net.URI;
22
23 import org.apache.commons.io.FileUtils;
24
25 import org.junit.After;
26 import static org.junit.Assert.assertTrue;
27 import org.junit.Before;
28 import org.junit.Test;
29
30 /**
31 * Tests the Tika's cli
32 */
33 public class TikaCLITest {
34
35 /* Test members */
36 private File profile = null;
37 private ByteArrayOutputStream outContent = null;
38 private PrintStream stdout = null;
39 private URI testDataURI = new File("src/test/resources/test-data/").toURI();
40 private String resourcePrefix = testDataURI.toString();
41
42 @Before
43 public void setUp() throws Exception {
44 profile = new File("welsh.ngp");
45 outContent = new ByteArrayOutputStream();
46 stdout = System.out;
47 System.setOut(new PrintStream(outContent));
48 }
49
50 /**
51 * Creates a welsh language profile
52 *
53 * @throws Exception
54 */
55 @Test
56 public void testCreateProfile() throws Exception {
57 String[] params = {"--create-profile=welsh", "-eUTF-8", resourcePrefix + "welsh_corpus.txt"};
58 TikaCLI.main(params);
59 assertTrue(profile.exists());
60 }
61
62 /**
63 * Tests --list-parser-detail option of the cli
64 *
65 * @throws Exception
66 */
67 @Test
68 public void testListParserDetail() throws Exception{
69 String[] params = {"--list-parser-detail"};
70 TikaCLI.main(params);
71 assertTrue(outContent.toString().contains("application/vnd.oasis.opendocument.text-web"));
72 }
73
74 /**
75 * Tests --list-parser option of the cli
76 *
77 * @throws Exception
78 */
79 @Test
80 public void testListParsers() throws Exception{
81 String[] params = {"--list-parser"};
82 TikaCLI.main(params);
83 //Assert was commented temporarily for finding the problem
84 // Assert.assertTrue(outContent != null && outContent.toString().contains("org.apache.tika.parser.iwork.IWorkPackageParser"));
85 }
86
87 /**
88 * Tests -x option of the cli
89 *
90 * @throws Exception
91 */
92 @Test
93 public void testXMLOutput() throws Exception{
94 String[] params = {"-x", resourcePrefix + "alice.cli.test"};
95 TikaCLI.main(params);
96 assertTrue(outContent.toString().contains("?xml version=\"1.0\" encoding=\"UTF-8\"?"));
97 }
98
99 /**
100 * Tests a -h option of the cli
101 *
102 * @throws Exception
103 */
104 @Test
105 public void testHTMLOutput() throws Exception{
106 String[] params = {"-h", resourcePrefix + "alice.cli.test"};
107 TikaCLI.main(params);
108 assertTrue(outContent.toString().contains("html xmlns=\"http://www.w3.org/1999/xhtml"));
109 assertTrue("Expanded <title></title> element should be present",
110 outContent.toString().contains("<title></title>"));
111 }
112
113 /**
114 * Tests -t option of the cli
115 *
116 * @throws Exception
117 */
118 @Test
119 public void testTextOutput() throws Exception{
120 String[] params = {"-t", resourcePrefix + "alice.cli.test"};
121 TikaCLI.main(params);
122 assertTrue(outContent.toString().contains("finished off the cake"));
123 }
124
125 /**
126 * Tests -m option of the cli
127 * @throws Exception
128 */
129 @Test
130 public void testMetadataOutput() throws Exception{
131 String[] params = {"-m", resourcePrefix + "alice.cli.test"};
132 TikaCLI.main(params);
133 assertTrue(outContent.toString().contains("text/plain"));
134 }
135
136 /**
137 * Tests -l option of the cli
138 *
139 * @throws Exception
140 */
141 @Test
142 public void testLanguageOutput() throws Exception{
143 String[] params = {"-l", resourcePrefix + "alice.cli.test"};
144 TikaCLI.main(params);
145 assertTrue(outContent.toString().contains("en"));
146 }
147
148 /**
149 * Tests -d option of the cli
150 *
151 * @throws Exception
152 */
153 @Test
154 public void testDetectOutput() throws Exception{
155 String[] params = {"-d", resourcePrefix + "alice.cli.test"};
156 TikaCLI.main(params);
157 assertTrue(outContent.toString().contains("text/plain"));
158 }
159
160 /**
161 * Tests --list-met-models option of the cli
162 *
163 * @throws Exception
164 */
165 @Test
166 public void testListMetModels() throws Exception{
167 String[] params = {"--list-met-models", resourcePrefix + "alice.cli.test"};
168 TikaCLI.main(params);
169 assertTrue(outContent.toString().contains("text/plain"));
170 }
171
172 /**
173 * Tests --list-supported-types option of the cli
174 *
175 * @throws Exception
176 */
177 @Test
178 public void testListSupportedTypes() throws Exception{
179 String[] params = {"--list-supported-types", resourcePrefix + "alice.cli.test"};
180 TikaCLI.main(params);
181 assertTrue(outContent.toString().contains("supertype: application/octet-stream"));
182 }
183
184 /**
185 * Tears down the test. Returns the System.out
186 */
187 @After
188 public void tearDown() throws Exception {
189 if(profile != null && profile.exists())
190 profile.delete();
191 System.setOut(stdout);
192 }
193
194 @Test
195 public void testExtract() throws Exception {
196 File tempFile = File.createTempFile("tika-test-", "");
197 tempFile.delete();
198 tempFile.mkdir(); // not really good method for production usage, but ok for tests
199 // google guava library has better solution
200
201 try {
202 String[] params = {"--extract-dir="+tempFile.getAbsolutePath(),"-z", resourcePrefix + "/coffee.xls"};
203
204 TikaCLI.main(params);
205
206 // ChemDraw file
207 File expected1 = new File(tempFile, "MBD002B040A.cdx");
208 // OLE10Native
209 File expected2 = new File(tempFile, "MBD002B0FA6_file5");
210 // Image of one of the embedded resources
211 File expected3 = new File(tempFile, "file0.emf");
212
213 assertTrue(expected1.exists());
214 assertTrue(expected2.exists());
215 assertTrue(expected3.exists());
216
217 assertTrue(expected1.length()>0);
218 assertTrue(expected2.length()>0);
219 assertTrue(expected3.length()>0);
220 } finally {
221 FileUtils.deleteDirectory(tempFile);
222 }
223
224 }
225
226 // TIKA-920
227 @Test
228 public void testMultiValuedMetadata() throws Exception {
229 String[] params = {"-m", resourcePrefix + "testMultipleSheets.numbers"};
230 TikaCLI.main(params);
231 String content = outContent.toString();
232 assertTrue(content.contains("sheetNames: Checking"));
233 assertTrue(content.contains("sheetNames: Secon sheet"));
234 assertTrue(content.contains("sheetNames: Logical Sheet 3"));
235 assertTrue(content.contains("sheetNames: Sheet 4"));
236 }
237
238 // TIKA-1031
239 @Test
240 public void testZipWithSubdirs() throws Exception {
241 String[] params = {"-z", "--extract-dir=target", resourcePrefix + "testWithSubdirs.zip"};
242 new File("subdir/foo.txt").delete();
243 new File("subdir").delete();
244 TikaCLI.main(params);
245 String content = outContent.toString();
246 assertTrue(content.contains("Extracting 'subdir/foo.txt'"));
247 // clean up. TODO: These should be in target.
248 new File("target/subdir/foo.txt").delete();
249 new File("target/subdir").delete();
250 }
251 }
0 <?xml version="1.0" encoding="UTF-8"?>
1
2 <!--
3 Licensed to the Apache Software Foundation (ASF) under one
4 or more contributor license agreements. See the NOTICE file
5 distributed with this work for additional information
6 regarding copyright ownership. The ASF licenses this file
7 to you under the Apache License, Version 2.0 (the
8 "License"); you may not use this file except in compliance
9 with the License. You may obtain a copy of the License at
10
11 http://www.apache.org/licenses/LICENSE-2.0
12
13 Unless required by applicable law or agreed to in writing,
14 software distributed under the License is distributed on an
15 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 KIND, either express or implied. See the License for the
17 specific language governing permissions and limitations
18 under the License.
19 -->
20
21 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
22 <modelVersion>4.0.0</modelVersion>
23
24 <parent>
25 <groupId>org.apache.tika</groupId>
26 <artifactId>tika-parent</artifactId>
27 <version>1.5</version>
28 <relativePath>../tika-parent/pom.xml</relativePath>
29 </parent>
30
31 <artifactId>tika-bundle</artifactId>
32 <packaging>bundle</packaging>
33 <name>Apache Tika OSGi bundle</name>
34 <description>
35 OSGi bundle that contains the tika-parsers component and all its
36 upstream dependencies that aren't OSGi bundles by themselves. This
37 bundle exports no packages, only the Parser and Detector services
38 from the tika-parsers component.
39 </description>
40 <url>http://tika.apache.org/</url>
41
42 <properties>
43 <pax.exam.version>2.2.0</pax.exam.version>
44 </properties>
45
46 <dependencies>
47 <dependency>
48 <groupId>${project.groupId}</groupId>
49 <artifactId>tika-core</artifactId>
50 <version>${project.version}</version>
51 </dependency>
52 <dependency>
53 <groupId>${project.groupId}</groupId>
54 <artifactId>tika-parsers</artifactId>
55 <version>${project.version}</version>
56 <scope>provided</scope>
57 </dependency>
58
59 <!-- Test dependencies -->
60 <dependency>
61 <groupId>junit</groupId>
62 <artifactId>junit</artifactId>
63 <scope>test</scope>
64 <version>4.11</version>
65 </dependency>
66 <dependency>
67 <groupId>org.ops4j.pax.exam</groupId>
68 <artifactId>pax-exam-junit4</artifactId>
69 <version>${pax.exam.version}</version>
70 <scope>test</scope>
71 </dependency>
72 <dependency>
73 <groupId>org.ops4j.pax.exam</groupId>
74 <artifactId>pax-exam-container-native</artifactId>
75 <version>${pax.exam.version}</version>
76 <scope>test</scope>
77 </dependency>
78 <dependency>
79 <groupId>org.apache.felix</groupId>
80 <artifactId>org.apache.felix.framework</artifactId>
81 <version>4.0.1</version>
82 <scope>test</scope>
83 </dependency>
84 <dependency>
85 <groupId>org.ops4j.pax.exam</groupId>
86 <artifactId>pax-exam-link-assembly</artifactId>
87 <version>${pax.exam.version}</version>
88 <scope>test</scope>
89 </dependency>
90 <dependency>
91 <groupId>org.ops4j.pax.url</groupId>
92 <artifactId>pax-url-aether</artifactId>
93 <version>1.3.3</version>
94 <scope>test</scope>
95 </dependency>
96 <dependency>
97 <groupId>org.slf4j</groupId>
98 <artifactId>slf4j-simple</artifactId>
99 <version>1.6.1</version>
100 <scope>test</scope>
101 </dependency>
102 </dependencies>
103
104 <build>
105 <plugins>
106 <plugin>
107 <groupId>org.apache.felix</groupId>
108 <artifactId>maven-bundle-plugin</artifactId>
109 <extensions>true</extensions>
110 <configuration>
111 <instructions>
112 <Bundle-Activator>
113 org.apache.tika.parser.internal.Activator
114 </Bundle-Activator>
115 <Embed-Dependency>
116 tika-parsers;inline=true,
117 commons-compress, xz, commons-codec, commons-io,
118 pdfbox,fontbox,jempbox,bcmail-jdk15,bcprov-jdk15,
119 poi,poi-scratchpad,poi-ooxml,poi-ooxml-schemas,
120 xmlbeans, dom4j,
121 tagsoup,
122 asm,
123 juniversalchardet,
124 vorbis-java-core, vorbis-java-tika,
125 isoparser, aspectjrt,
126 metadata-extractor,
127 boilerpipe, rome,
128 apache-mime4j-core, apache-mime4j-dom
129 </Embed-Dependency>
130 <Embed-Transitive>true</Embed-Transitive>
131 <Bundle-DocURL>${project.url}</Bundle-DocURL>
132 <Export-Package>
133 !org.apache.tika.parser,
134 !org.apache.tika.parser.external,
135 org.apache.tika.parser.*
136 </Export-Package>
137 <Import-Package>
138 *,
139 com.adobe.xmp;resolution:=optional,
140 com.adobe.xmp.properties;resolution:=optional,
141 com.google.protobuf;resolution:=optional,
142 com.ibm.icu.text;resolution:=optional,
143 com.sleepycat.je;resolution:=optional,
144 com.sun.javadoc;resolution:=optional,
145 com.sun.msv.datatype;resolution:=optional,
146 com.sun.msv.datatype.xsd;resolution:=optional,
147 com.sun.tools.javadoc;resolution:=optional,
148 edu.wisc.ssec.mcidas;resolution:=optional,
149 edu.wisc.ssec.mcidas.adde;resolution:=optional,
150 javax.activation;resolution:=optional,
151 javax.mail;resolution:=optional,
152 javax.mail.internet;resolution:=optional,
153 javax.xml.bind;resolution:=optional,
154 javax.xml.stream;version="[1.0,2)";resolution:=optional,
155 javax.xml.stream.events;version="[1.0,2)";resolution:=optional,
156 javax.xml.stream.util;version="[1.0,2)";resolution:=optional,
157 junit.framework;resolution:=optional,
158 junit.textui;resolution:=optional,
159 net.sf.ehcache;resolution:=optional,
160 nu.xom;resolution:=optional,
161 opendap.dap;resolution:=optional,
162 opendap.dap.parser;resolution:=optional,
163 org.apache.commons.httpclient;resolution:=optional,
164 org.apache.commons.httpclient.auth;resolution:=optional,
165 org.apache.commons.httpclient.methods;resolution:=optional,
166 org.apache.commons.httpclient.params;resolution:=optional,
167 org.apache.commons.httpclient.protocol;resolution:=optional,
168 org.apache.crimson.jaxp;resolution:=optional,
169 org.apache.tools.ant;resolution:=optional,
170 org.apache.tools.ant.taskdefs;resolution:=optional,
171 org.apache.tools.ant.types;resolution:=optional,
172 org.apache.xerces.parsers;resolution:=optional,
173 org.apache.xerces.util;resolution:=optional,
174 org.apache.xerces.xni;resolution:=optional,
175 org.apache.xerces.xni.parser;resolution:=optional,
176 org.apache.xml.resolver;resolution:=optional,
177 org.apache.xml.resolver.tools;resolution:=optional,
178 org.apache.xmlbeans.impl.xpath.saxon;resolution:=optional,
179 org.apache.xmlbeans.impl.xquery.saxon;resolution:=optional,
180 org.cyberneko.html.xercesbridge;resolution:=optional,
181 org.gjt.xpp;resolution:=optional,
182 org.jaxen;resolution:=optional,
183 org.jaxen.dom4j;resolution:=optional,
184 org.jaxen.pattern;resolution:=optional,
185 org.jaxen.saxpath;resolution:=optional,
186 org.jdom;resolution:=optional,
187 org.jdom.input;resolution:=optional,
188 org.jdom.output;resolution:=optional,
189 org.openxmlformats.schemas.officeDocument.x2006.math;resolution:=optional,
190 org.openxmlformats.schemas.schemaLibrary.x2006.main;resolution:=optional,
191 org.osgi.framework;resolution:=optional,
192 org.w3c.dom;resolution:=optional,
193 org.relaxng.datatype;resolution:=optional,
194 org.xml.sax;resolution:=optional,
195 org.xml.sax.ext;resolution:=optional,
196 org.xml.sax.helpers;resolution:=optional,
197 org.xmlpull.v1;resolution:=optional,
198 schemasMicrosoftComOfficePowerpoint;resolution:=optional,
199 schemasMicrosoftComOfficeWord;resolution:=optional,
200 ucar.grib;resolution:=optional,
201 ucar.grib.grib1;resolution:=optional,
202 ucar.grib.grib2;resolution:=optional,
203 ucar.grid;resolution:=optional,
204 visad;resolution:=optional,
205 visad.data;resolution:=optional,
206 visad.data.vis5d;resolution:=optional,
207 visad.jmet;resolution:=optional,
208 visad.util;resolution:=optional
209 </Import-Package>
210 </instructions>
211 </configuration>
212 </plugin>
213 <!-- TIKA-763: Workaround to avoid including LGPL classes -->
214 <plugin>
215 <artifactId>maven-dependency-plugin</artifactId>
216 <executions>
217 <execution>
218 <phase>prepare-package</phase>
219 <goals>
220 <goal>unpack-dependencies</goal>
221 </goals>
222 <configuration>
223 <includeArtifactIds>netcdf</includeArtifactIds>
224 <excludes>
225 ucar/nc2/iosp/fysat/Fysat*.class,
226 ucar/nc2/dataset/transform/VOceanSG1*class,
227 ucar/unidata/geoloc/vertical/OceanSG*.class,
228 META-INF/**,CHANGES,README
229 </excludes>
230 <outputDirectory>
231 ${project.build.directory}/classes
232 </outputDirectory>
233 </configuration>
234 </execution>
235 </executions>
236 </plugin>
237 </plugins>
238 </build>
239
240 <profiles>
241 <profile>
242 <id>java6</id>
243 <activation>
244 <jdk>[1.6,)</jdk>
245 </activation>
246 <build>
247 <plugins>
248 <plugin>
249 <artifactId>maven-assembly-plugin</artifactId>
250 <executions>
251 <execution>
252 <phase>pre-integration-test</phase>
253 <goals>
254 <goal>single</goal>
255 </goals>
256 <configuration>
257 <descriptor>test-bundles.xml</descriptor>
258 <finalName>test</finalName>
259 <attach>false</attach>
260 </configuration>
261 </execution>
262 </executions>
263 </plugin>
264 <plugin>
265 <artifactId>maven-failsafe-plugin</artifactId>
266 <version>2.10</version>
267 <executions>
268 <execution>
269 <goals>
270 <goal>integration-test</goal>
271 <goal>verify</goal>
272 </goals>
273 </execution>
274 </executions>
275 <configuration>
276 <systemPropertyVariables>
277 <org.ops4j.pax.logging.DefaultServiceLog.level>
278 WARN
279 </org.ops4j.pax.logging.DefaultServiceLog.level>
280 </systemPropertyVariables>
281 </configuration>
282 </plugin>
283 </plugins>
284 </build>
285 </profile>
286 </profiles>
287
288 <organization>
289 <name>The Apache Software Foundation</name>
290 <url>http://www.apache.org</url>
291 </organization>
292 <scm>
293 <url>http://svn.apache.org/viewvc/tika/tags/1.5/tika-bundle</url>
294 <connection>scm:svn:http://svn.apache.org/repos/asf/tika/tags/1.5/tika-bundle</connection>
295 <developerConnection>scm:svn:https://svn.apache.org/repos/asf/tika/tags/1.5/tika-bundle</developerConnection>
296 </scm>
297 <issueManagement>
298 <system>JIRA</system>
299 <url>https://issues.apache.org/jira/browse/TIKA</url>
300 </issueManagement>
301 <ciManagement>
302 <system>Jenkins</system>
303 <url>https://builds.apache.org/job/Tika-trunk/</url>
304 </ciManagement>
305 </project>
0 APACHE TIKA SUBCOMPONENTS
1
2 Apache Tika includes a number of subcomponents with separate copyright notices
3 and license terms. Your use of these subcomponents is subject to the terms and
4 conditions of the following licenses.
5
6 Bouncy Castle libraries (bcmail and bcprov)
7
8 Copyright (c) 2000-2009 The Legion Of The Bouncy Castle
9 (http://www.bouncycastle.org)
10
11 Permission is hereby granted, free of charge, to any person obtaining
12 a copy of this software and associated documentation files
13 (the "Software"), to deal in the Software without restriction,
14 including without limitation the rights to use, copy, modify, merge,
15 publish, distribute, sublicense, and/or sell copies of the Software,
16 and to permit persons to whom the Software is furnished to do so,
17 subject to the following conditions:
18
19 The above copyright notice and this permission notice shall be included
20 in all copies or substantial portions of the Software.
21
22 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
23 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
25 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
26 OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
27 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
28 OTHER DEALINGS IN THE SOFTWARE.
29
30 Contributions made to the original PDFBox, JempBox and FontBox projects:
31
32 Copyright (c) 2002-2007, www.pdfbox.org
33 Copyright (c) 2006-2007, www.jempbox.org
34 All rights reserved.
35
36 Redistribution and use in source and binary forms, with or without
37 modification, are permitted provided that the following conditions are met:
38
39 1. Redistributions of source code must retain the above copyright notice,
40 this list of conditions and the following disclaimer.
41
42 2. Redistributions in binary form must reproduce the above copyright
43 notice, this list of conditions and the following disclaimer in the
44 documentation and/or other materials provided with the distribution.
45
46 3. Neither the name of pdfbox; nor the names of its contributors may be
47 used to endorse or promote products derived from this software without
48 specific prior written permission.
49
50 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
51 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53 ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
56 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
57 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59 OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60 SUCH DAMAGE.
61
62 Adobe Font Metrics (AFM) for PDF Core 14 Fonts
63
64 This file and the 14 PostScript(R) AFM files it accompanies may be used,
65 copied, and distributed for any purpose and without charge, with or without
66 modification, provided that all copyright notices are retained; that the
67 AFM files are not distributed without this file; that all modifications
68 to this file or any of the AFM files are prominently noted in the modified
69 file(s); and that this paragraph is not modified. Adobe Systems has no
70 responsibility or obligation to support the use of the AFM files.
71
72 CMaps for PDF Fonts (http://www.adobe.com/devnet/font/#pcfi and
73 ftp://ftp.oreilly.com/pub/examples/nutshell/cjkv/adobe/)
74
75 Copyright 1990-2001 Adobe Systems Incorporated.
76 All Rights Reserved.
77
78 Patents Pending
79
80 NOTICE: All information contained herein is the property
81 of Adobe Systems Incorporated.
82
83 Permission is granted for redistribution of this file
84 provided this copyright notice is maintained intact and
85 that the contents of this file are not altered in any
86 way from its original form.
87
88 PostScript and Display PostScript are trademarks of
89 Adobe Systems Incorporated which may be registered in
90 certain jurisdictions.
91
92 Adobe Glyphlist (http://www.adobe.com/devnet/opentype/archives/glyph.html)
93
94 Copyright (c) 1997,1998,2002,2007 Adobe Systems Incorporated
95
96 Permission is hereby granted, free of charge, to any person obtaining a
97 copy of this documentation file to use, copy, publish, distribute,
98 sublicense, and/or sell copies of the documentation, and to permit
99 others to do the same, provided that:
100 - No modification, editing or other alteration of this document is
101 allowed; and
102 - The above copyright notice and this permission notice shall be
103 included in all copies of the documentation.
104
105 Permission is hereby granted, free of charge, to any person obtaining a
106 copy of this documentation file, to create their own derivative works
107 from the content of this document to use, copy, publish, distribute,
108 sublicense, and/or sell the derivative works, and to permit others to do
109 the same, provided that the derived work is not represented as being a
110 copy or version of this document.
111
112 Adobe shall not be liable to any party for any loss of revenue or profit
113 or for indirect, incidental, special, consequential, or other similar
114 damages, whether based on tort (including without limitation negligence
115 or strict liability), contract or other legal or equitable grounds even
116 if Adobe has been advised or had reason to know of the possibility of
117 such damages. The Adobe materials are provided on an "AS IS" basis.
118 Adobe specifically disclaims all express, statutory, or implied
119 warranties relating to the Adobe materials, including but not limited to
120 those concerning merchantability or fitness for a particular purpose or
121 non-infringement of any third party rights regarding the Adobe
122 materials.
123
124 Charset detection code from ICU4J (http://site.icu-project.org/)
125
126 Copyright (c) 1995-2009 International Business Machines Corporation
127 and others
128
129 All rights reserved.
130
131 Permission is hereby granted, free of charge, to any person obtaining
132 a copy of this software and associated documentation files (the
133 "Software"), to deal in the Software without restriction, including
134 without limitation the rights to use, copy, modify, merge, publish,
135 distribute, and/or sell copies of the Software, and to permit persons
136 to whom the Software is furnished to do so, provided that the above
137 copyright notice(s) and this permission notice appear in all copies
138 of the Software and that both the above copyright notice(s) and this
139 permission notice appear in supporting documentation.
140
141 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
142 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
143 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
144 IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
145 BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
146 OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
147 WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
148 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
149 SOFTWARE.
150
151 Except as contained in this notice, the name of a copyright holder shall
152 not be used in advertising or otherwise to promote the sale, use or other
153 dealings in this Software without prior written authorization of the
154 copyright holder.
155
156 ASM library (asm)
157
158 Copyright (c) 2000-2005 INRIA, France Telecom
159 All rights reserved.
160
161 Redistribution and use in source and binary forms, with or without
162 modification, are permitted provided that the following conditions
163 are met:
164
165 1. Redistributions of source code must retain the above copyright
166 notice, this list of conditions and the following disclaimer.
167
168 2. Redistributions in binary form must reproduce the above copyright
169 notice, this list of conditions and the following disclaimer in the
170 documentation and/or other materials provided with the distribution.
171
172 3. Neither the name of the copyright holders nor the names of its
173 contributors may be used to endorse or promote products derived from
174 this software without specific prior written permission.
175
176 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
177 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
178 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
179 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
180 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
181 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
182 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
183 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
184 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
185 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
186 THE POSSIBILITY OF SUCH DAMAGE.
187
188 MIME type information from file-4.26.tar.gz (http://www.darwinsys.com/file/)
189
190 Copyright (c) Ian F. Darwin 1986, 1987, 1989, 1990, 1991, 1992, 1994, 1995.
191 Software written by Ian F. Darwin and others;
192 maintained 1994- Christos Zoulas.
193
194 This software is not subject to any export provision of the United States
195 Department of Commerce, and may be exported to any country or planet.
196
197 Redistribution and use in source and binary forms, with or without
198 modification, are permitted provided that the following conditions
199 are met:
200 1. Redistributions of source code must retain the above copyright
201 notice immediately at the beginning of the file, without modification,
202 this list of conditions, and the following disclaimer.
203 2. Redistributions in binary form must reproduce the above copyright
204 notice, this list of conditions and the following disclaimer in the
205 documentation and/or other materials provided with the distribution.
206
207 THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
208 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
209 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
210 ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
211 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
212 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
213 OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
214 HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
215 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
216 OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
217 SUCH DAMAGE.
218
219 Office Open XML schemas (poi-ooxml-schemas)
220
221 The Office Open XML schema definitions used by Apache POI are
222 a part of the Office Open XML ECMA Specification (ECMA-376, [1]).
223 As defined in section 9.4 of the ECMA bylaws [2], this specification
224 is available to all interested parties without restriction:
225
226 9.4 All documents when approved shall be made available to
227 all interested parties without restriction.
228
229 Furthermore, both Microsoft and Adobe have granted patent licenses
230 to this work [3,4,5].
231
232 [1] http://www.ecma-international.org/publications/standards/Ecma-376.htm
233 [2] http://www.ecma-international.org/memento/Ecmabylaws.htm
234 [3] http://www.microsoft.com/interop/osp/
235 [4] http://www.ecma-international.org/publications/files/ECMA-ST/Ecma%20PATENT/ECMA-376%20Edition%201%20Microsoft%20Patent%20Declaration.pdf
236 [5] http://www.ecma-international.org/publications/files/ECMA-ST/Ecma%20PATENT/ga-2006-191.pdf
237
238 DOM4J library (dom4j)
239
240 Copyright 2001-2005 (C) MetaStuff, Ltd. All Rights Reserved.
241
242 Redistribution and use of this software and associated documentation
243 ("Software"), with or without modification, are permitted provided
244 that the following conditions are met:
245
246 1. Redistributions of source code must retain copyright
247 statements and notices. Redistributions must also contain a
248 copy of this document.
249
250 2. Redistributions in binary form must reproduce the
251 above copyright notice, this list of conditions and the
252 following disclaimer in the documentation and/or other
253 materials provided with the distribution.
254
255 3. The name "DOM4J" must not be used to endorse or promote
256 products derived from this Software without prior written
257 permission of MetaStuff, Ltd. For written permission,
258 please contact dom4j-info@metastuff.com.
259
260 4. Products derived from this Software may not be called "DOM4J"
261 nor may "DOM4J" appear in their names without prior written
262 permission of MetaStuff, Ltd. DOM4J is a registered
263 trademark of MetaStuff, Ltd.
264
265 5. Due credit should be given to the DOM4J Project -
266 http://www.dom4j.org
267
268 THIS SOFTWARE IS PROVIDED BY METASTUFF, LTD. AND CONTRIBUTORS
269 ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT
270 NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
271 FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
272 METASTUFF, LTD. OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
273 INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
274 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
275 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
276 HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
277 STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
278 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
279 OF THE POSSIBILITY OF SUCH DAMAGE.
280
281 SLF4J library (slf4j-api, slf4j-log4j12)
282
283 Copyright (c) 2004-2008 QOS.ch
284 All rights reserved.
285
286 Permission is hereby granted, free of charge, to any person obtaining
287 a copy of this software and associated documentation files (the
288 "Software"), to deal in the Software without restriction, including
289 without limitation the rights to use, copy, modify, merge, publish,
290 distribute, sublicense, and/or sell copies of the Software, and to
291 permit persons to whom the Software is furnished to do so, subject to
292 the following conditions:
293
294 The above copyright notice and this permission notice shall be
295 included in all copies or substantial portions of the Software.
296
297 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
298 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
299 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
300 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
301 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
302 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
303 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
304
305 NetCDF library (netcdf)
306
307 Copyright 1998-2009 University Corporation for Atmospheric Research/Unidata
308
309 Portions of this software were developed by the Unidata Program at the
310 University Corporation for Atmospheric Research.
311
312 Access and use of this software shall impose the following obligations
313 and understandings on the user. The user is granted the right, without
314 any fee or cost, to use, copy, modify, alter, enhance and distribute
315 this software, and any derivative works thereof, and its supporting
316 documentation for any purpose whatsoever, provided that this entire
317 notice appears in all copies of the software, derivative works and
318 supporting documentation. Further, UCAR requests that the user credit
319 UCAR/Unidata in any publications that result from the use of this
320 software or in any product that includes this software. The names UCAR
321 and/or Unidata, however, may not be used in any advertising or publicity
322 to endorse or promote any products or commercial entity unless specific
323 written permission is obtained from UCAR/Unidata. The user also
324 understands that UCAR/Unidata is not obligated to provide the user with
325 any support, consulting, training or assistance of any kind with regard
326 to the use, operation and performance of this software nor to provide
327 the user with any updates, revisions, new versions or "bug fixes."
328
329 THIS SOFTWARE IS PROVIDED BY UCAR/UNIDATA "AS IS" AND ANY EXPRESS OR
330 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
331 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
332 DISCLAIMED. IN NO EVENT SHALL UCAR/UNIDATA BE LIABLE FOR ANY SPECIAL,
333 INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING
334 FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
335 NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
336 WITH THE ACCESS, USE OR PERFORMANCE OF THIS SOFTWARE.
337
338 BZip classes inside the NetCDF library
339
340 The Apache Software License, Version 1.1
341
342 Copyright (c) 2001-2003 The Apache Software Foundation. All rights
343 reserved.
344
345 Redistribution and use in source and binary forms, with or without
346 modification, are permitted provided that the following conditions
347 are met:
348
349 1. Redistributions of source code must retain the above copyright
350 notice, this list of conditions and the following disclaimer.
351
352 2. Redistributions in binary form must reproduce the above copyright
353 notice, this list of conditions and the following disclaimer in
354 the documentation and/or other materials provided with the
355 distribution.
356
357 3. The end-user documentation included with the redistribution, if
358 any, must include the following acknowledgment:
359 "This product includes software developed by the
360 Apache Software Foundation (http://www.apache.org/)."
361 Alternately, this acknowledgment may appear in the software itself,
362 if and wherever such third-party acknowledgments normally appear.
363
364 4. The names "Ant" and "Apache Software
365 Foundation" must not be used to endorse or promote products derived
366 from this software without prior written permission. For written
367 permission, please contact apache@apache.org.
368
369 5. Products derived from this software may not be called "Apache"
370 nor may "Apache" appear in their names without prior written
371 permission of the Apache Group.
372
373 THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
374 WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
375 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
376 DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
377 ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
378 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
379 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
380 USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
381 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
382 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
383 OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
384 SUCH DAMAGE.
385
386 XZ compression library (xz)
387
388 All the files in this package have been written by Lasse Collin
389 and/or Igor Pavlov. All these files have been put into the
390 public domain. You can do whatever you want with these files.
391
392 This software is provided "as is", without any warranty.
393
394 juniversalchardet library (juniversalchardet)
395
396 MOZILLA PUBLIC LICENSE
397 Version 1.1
398
399 ---------------
400
401 1. Definitions.
402
403 1.0.1. "Commercial Use" means distribution or otherwise making the
404 Covered Code available to a third party.
405
406 1.1. "Contributor" means each entity that creates or contributes to
407 the creation of Modifications.
408
409 1.2. "Contributor Version" means the combination of the Original
410 Code, prior Modifications used by a Contributor, and the Modifications
411 made by that particular Contributor.
412
413 1.3. "Covered Code" means the Original Code or Modifications or the
414 combination of the Original Code and Modifications, in each case
415 including portions thereof.
416
417 1.4. "Electronic Distribution Mechanism" means a mechanism generally
418 accepted in the software development community for the electronic
419 transfer of data.
420
421 1.5. "Executable" means Covered Code in any form other than Source
422 Code.
423
424 1.6. "Initial Developer" means the individual or entity identified
425 as the Initial Developer in the Source Code notice required by Exhibit
426 A.
427
428 1.7. "Larger Work" means a work which combines Covered Code or
429 portions thereof with code not governed by the terms of this License.
430
431 1.8. "License" means this document.
432
433 1.8.1. "Licensable" means having the right to grant, to the maximum
434 extent possible, whether at the time of the initial grant or
435 subsequently acquired, any and all of the rights conveyed herein.
436
437 1.9. "Modifications" means any addition to or deletion from the
438 substance or structure of either the Original Code or any previous
439 Modifications. When Covered Code is released as a series of files, a
440 Modification is:
441 A. Any addition to or deletion from the contents of a file
442 containing Original Code or previous Modifications.
443
444 B. Any new file that contains any part of the Original Code or
445 previous Modifications.
446
447 1.10. "Original Code" means Source Code of computer software code
448 which is described in the Source Code notice required by Exhibit A as
449 Original Code, and which, at the time of its release under this
450 License is not already Covered Code governed by this License.
451
452 1.10.1. "Patent Claims" means any patent claim(s), now owned or
453 hereafter acquired, including without limitation, method, process,
454 and apparatus claims, in any patent Licensable by grantor.
455
456 1.11. "Source Code" means the preferred form of the Covered Code for
457 making modifications to it, including all modules it contains, plus
458 any associated interface definition files, scripts used to control
459 compilation and installation of an Executable, or source code
460 differential comparisons against either the Original Code or another
461 well known, available Covered Code of the Contributor's choice. The
462 Source Code can be in a compressed or archival form, provided the
463 appropriate decompression or de-archiving software is widely available
464 for no charge.
465
466 1.12. "You" (or "Your") means an individual or a legal entity
467 exercising rights under, and complying with all of the terms of, this
468 License or a future version of this License issued under Section 6.1.
469 For legal entities, "You" includes any entity which controls, is
470 controlled by, or is under common control with You. For purposes of
471 this definition, "control" means (a) the power, direct or indirect,
472 to cause the direction or management of such entity, whether by
473 contract or otherwise, or (b) ownership of more than fifty percent
474 (50%) of the outstanding shares or beneficial ownership of such
475 entity.
476
477 2. Source Code License.
478
479 2.1. The Initial Developer Grant.
480 The Initial Developer hereby grants You a world-wide, royalty-free,
481 non-exclusive license, subject to third party intellectual property
482 claims:
483 (a) under intellectual property rights (other than patent or
484 trademark) Licensable by Initial Developer to use, reproduce,
485 modify, display, perform, sublicense and distribute the Original
486 Code (or portions thereof) with or without Modifications, and/or
487 as part of a Larger Work; and
488
489 (b) under Patent Claims infringed by the making, using or
490 selling of Original Code, to make, have made, use, practice,
491 sell, and offer for sale, and/or otherwise dispose of the
492 Original Code (or portions thereof).
493
494 (c) the licenses granted in this Section 2.1(a) and (b) are
495 effective on the date Initial Developer first distributes
496 Original Code under the terms of this License.
497
498 (d) Notwithstanding Section 2.1(b) above, no patent license is
499 granted: 1) for code that You delete from the Original Code; 2)
500 separate from the Original Code; or 3) for infringements caused
501 by: i) the modification of the Original Code or ii) the
502 combination of the Original Code with other software or devices.
503
504 2.2. Contributor Grant.
505 Subject to third party intellectual property claims, each Contributor
506 hereby grants You a world-wide, royalty-free, non-exclusive license
507
508 (a) under intellectual property rights (other than patent or
509 trademark) Licensable by Contributor, to use, reproduce, modify,
510 display, perform, sublicense and distribute the Modifications
511 created by such Contributor (or portions thereof) either on an
512 unmodified basis, with other Modifications, as Covered Code
513 and/or as part of a Larger Work; and
514
515 (b) under Patent Claims infringed by the making, using, or
516 selling of Modifications made by that Contributor either alone
517 and/or in combination with its Contributor Version (or portions
518 of such combination), to make, use, sell, offer for sale, have
519 made, and/or otherwise dispose of: 1) Modifications made by that
520 Contributor (or portions thereof); and 2) the combination of
521 Modifications made by that Contributor with its Contributor
522 Version (or portions of such combination).
523
524 (c) the licenses granted in Sections 2.2(a) and 2.2(b) are
525 effective on the date Contributor first makes Commercial Use of
526 the Covered Code.
527
528 (d) Notwithstanding Section 2.2(b) above, no patent license is
529 granted: 1) for any code that Contributor has deleted from the
530 Contributor Version; 2) separate from the Contributor Version;
531 3) for infringements caused by: i) third party modifications of
532 Contributor Version or ii) the combination of Modifications made
533 by that Contributor with other software (except as part of the
534 Contributor Version) or other devices; or 4) under Patent Claims
535 infringed by Covered Code in the absence of Modifications made by
536 that Contributor.
537
538 3. Distribution Obligations.
539
540 3.1. Application of License.
541 The Modifications which You create or to which You contribute are
542 governed by the terms of this License, including without limitation
543 Section 2.2. The Source Code version of Covered Code may be
544 distributed only under the terms of this License or a future version
545 of this License released under Section 6.1, and You must include a
546 copy of this License with every copy of the Source Code You
547 distribute. You may not offer or impose any terms on any Source Code
548 version that alters or restricts the applicable version of this
549 License or the recipients' rights hereunder. However, You may include
550 an additional document offering the additional rights described in
551 Section 3.5.
552
553 3.2. Availability of Source Code.
554 Any Modification which You create or to which You contribute must be
555 made available in Source Code form under the terms of this License
556 either on the same media as an Executable version or via an accepted
557 Electronic Distribution Mechanism to anyone to whom you made an
558 Executable version available; and if made available via Electronic
559 Distribution Mechanism, must remain available for at least twelve (12)
560 months after the date it initially became available, or at least six
561 (6) months after a subsequent version of that particular Modification
562 has been made available to such recipients. You are responsible for
563 ensuring that the Source Code version remains available even if the
564 Electronic Distribution Mechanism is maintained by a third party.
565
566 3.3. Description of Modifications.
567 You must cause all Covered Code to which You contribute to contain a
568 file documenting the changes You made to create that Covered Code and
569 the date of any change. You must include a prominent statement that
570 the Modification is derived, directly or indirectly, from Original
571 Code provided by the Initial Developer and including the name of the
572 Initial Developer in (a) the Source Code, and (b) in any notice in an
573 Executable version or related documentation in which You describe the
574 origin or ownership of the Covered Code.
575
576 3.4. Intellectual Property Matters
577 (a) Third Party Claims.
578 If Contributor has knowledge that a license under a third party's
579 intellectual property rights is required to exercise the rights
580 granted by such Contributor under Sections 2.1 or 2.2,
581 Contributor must include a text file with the Source Code
582 distribution titled "LEGAL" which describes the claim and the
583 party making the claim in sufficient detail that a recipient will
584 know whom to contact. If Contributor obtains such knowledge after
585 the Modification is made available as described in Section 3.2,
586 Contributor shall promptly modify the LEGAL file in all copies
587 Contributor makes available thereafter and shall take other steps
588 (such as notifying appropriate mailing lists or newsgroups)
589 reasonably calculated to inform those who received the Covered
590 Code that new knowledge has been obtained.
591
592 (b) Contributor APIs.
593 If Contributor's Modifications include an application programming
594 interface and Contributor has knowledge of patent licenses which
595 are reasonably necessary to implement that API, Contributor must
596 also include this information in the LEGAL file.
597
598 (c) Representations.
599 Contributor represents that, except as disclosed pursuant to
600 Section 3.4(a) above, Contributor believes that Contributor's
601 Modifications are Contributor's original creation(s) and/or
602 Contributor has sufficient rights to grant the rights conveyed by
603 this License.
604
605 3.5. Required Notices.
606 You must duplicate the notice in Exhibit A in each file of the Source
607 Code. If it is not possible to put such notice in a particular Source
608 Code file due to its structure, then You must include such notice in a
609 location (such as a relevant directory) where a user would be likely
610 to look for such a notice. If You created one or more Modification(s)
611 You may add your name as a Contributor to the notice described in
612 Exhibit A. You must also duplicate this License in any documentation
613 for the Source Code where You describe recipients' rights or ownership
614 rights relating to Covered Code. You may choose to offer, and to
615 charge a fee for, warranty, support, indemnity or liability
616 obligations to one or more recipients of Covered Code. However, You
617 may do so only on Your own behalf, and not on behalf of the Initial
618 Developer or any Contributor. You must make it absolutely clear that
619 any such warranty, support, indemnity or liability obligation is
620 offered by You alone, and You hereby agree to indemnify the Initial
621 Developer and every Contributor for any liability incurred by the
622 Initial Developer or such Contributor as a result of warranty,
623 support, indemnity or liability terms You offer.
624
625 3.6. Distribution of Executable Versions.
626 You may distribute Covered Code in Executable form only if the
627 requirements of Section 3.1-3.5 have been met for that Covered Code,
628 and if You include a notice stating that the Source Code version of
629 the Covered Code is available under the terms of this License,
630 including a description of how and where You have fulfilled the
631 obligations of Section 3.2. The notice must be conspicuously included
632 in any notice in an Executable version, related documentation or
633 collateral in which You describe recipients' rights relating to the
634 Covered Code. You may distribute the Executable version of Covered
635 Code or ownership rights under a license of Your choice, which may
636 contain terms different from this License, provided that You are in
637 compliance with the terms of this License and that the license for the
638 Executable version does not attempt to limit or alter the recipient's
639 rights in the Source Code version from the rights set forth in this
640 License. If You distribute the Executable version under a different
641 license You must make it absolutely clear that any terms which differ
642 from this License are offered by You alone, not by the Initial
643 Developer or any Contributor. You hereby agree to indemnify the
644 Initial Developer and every Contributor for any liability incurred by
645 the Initial Developer or such Contributor as a result of any such
646 terms You offer.
647
648 3.7. Larger Works.
649 You may create a Larger Work by combining Covered Code with other code
650 not governed by the terms of this License and distribute the Larger
651 Work as a single product. In such a case, You must make sure the
652 requirements of this License are fulfilled for the Covered Code.
653
654 4. Inability to Comply Due to Statute or Regulation.
655
656 If it is impossible for You to comply with any of the terms of this
657 License with respect to some or all of the Covered Code due to
658 statute, judicial order, or regulation then You must: (a) comply with
659 the terms of this License to the maximum extent possible; and (b)
660 describe the limitations and the code they affect. Such description
661 must be included in the LEGAL file described in Section 3.4 and must
662 be included with all distributions of the Source Code. Except to the
663 extent prohibited by statute or regulation, such description must be
664 sufficiently detailed for a recipient of ordinary skill to be able to
665 understand it.
666
667 5. Application of this License.
668
669 This License applies to code to which the Initial Developer has
670 attached the notice in Exhibit A and to related Covered Code.
671
672 6. Versions of the License.
673
674 6.1. New Versions.
675 Netscape Communications Corporation ("Netscape") may publish revised
676 and/or new versions of the License from time to time. Each version
677 will be given a distinguishing version number.
678
679 6.2. Effect of New Versions.
680 Once Covered Code has been published under a particular version of the
681 License, You may always continue to use it under the terms of that
682 version. You may also choose to use such Covered Code under the terms
683 of any subsequent version of the License published by Netscape. No one
684 other than Netscape has the right to modify the terms applicable to
685 Covered Code created under this License.
686
687 6.3. Derivative Works.
688 If You create or use a modified version of this License (which you may
689 only do in order to apply it to code which is not already Covered Code
690 governed by this License), You must (a) rename Your license so that
691 the phrases "Mozilla", "MOZILLAPL", "MOZPL", "Netscape",
692 "MPL", "NPL" or any confusingly similar phrase do not appear in your
693 license (except to note that your license differs from this License)
694 and (b) otherwise make it clear that Your version of the license
695 contains terms which differ from the Mozilla Public License and
696 Netscape Public License. (Filling in the name of the Initial
697 Developer, Original Code or Contributor in the notice described in
698 Exhibit A shall not of themselves be deemed to be modifications of
699 this License.)
700
701 7. DISCLAIMER OF WARRANTY.
702
703 COVERED CODE IS PROVIDED UNDER THIS LICENSE ON AN "AS IS" BASIS,
704 WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
705 WITHOUT LIMITATION, WARRANTIES THAT THE COVERED CODE IS FREE OF
706 DEFECTS, MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE OR NON-INFRINGING.
707 THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE COVERED CODE
708 IS WITH YOU. SHOULD ANY COVERED CODE PROVE DEFECTIVE IN ANY RESPECT,
709 YOU (NOT THE INITIAL DEVELOPER OR ANY OTHER CONTRIBUTOR) ASSUME THE
710 COST OF ANY NECESSARY SERVICING, REPAIR OR CORRECTION. THIS DISCLAIMER
711 OF WARRANTY CONSTITUTES AN ESSENTIAL PART OF THIS LICENSE. NO USE OF
712 ANY COVERED CODE IS AUTHORIZED HEREUNDER EXCEPT UNDER THIS DISCLAIMER.
713
714 8. TERMINATION.
715
716 8.1. This License and the rights granted hereunder will terminate
717 automatically if You fail to comply with terms herein and fail to cure
718 such breach within 30 days of becoming aware of the breach. All
719 sublicenses to the Covered Code which are properly granted shall
720 survive any termination of this License. Provisions which, by their
721 nature, must remain in effect beyond the termination of this License
722 shall survive.
723
724 8.2. If You initiate litigation by asserting a patent infringement
725 claim (excluding declaratory judgment actions) against Initial Developer
726 or a Contributor (the Initial Developer or Contributor against whom
727 You file such action is referred to as "Participant") alleging that:
728
729 (a) such Participant's Contributor Version directly or indirectly
730 infringes any patent, then any and all rights granted by such
731 Participant to You under Sections 2.1 and/or 2.2 of this License
732 shall, upon 60 days notice from Participant terminate prospectively,
733 unless if within 60 days after receipt of notice You either: (i)
734 agree in writing to pay Participant a mutually agreeable reasonable
735 royalty for Your past and future use of Modifications made by such
736 Participant, or (ii) withdraw Your litigation claim with respect to
737 the Contributor Version against such Participant. If within 60 days
738 of notice, a reasonable royalty and payment arrangement are not
739 mutually agreed upon in writing by the parties or the litigation claim
740 is not withdrawn, the rights granted by Participant to You under
741 Sections 2.1 and/or 2.2 automatically terminate at the expiration of
742 the 60 day notice period specified above.
743
744 (b) any software, hardware, or device, other than such Participant's
745 Contributor Version, directly or indirectly infringes any patent, then
746 any rights granted to You by such Participant under Sections 2.1(b)
747 and 2.2(b) are revoked effective as of the date You first made, used,
748 sold, distributed, or had made, Modifications made by that
749 Participant.
750
751 8.3. If You assert a patent infringement claim against Participant
752 alleging that such Participant's Contributor Version directly or
753 indirectly infringes any patent where such claim is resolved (such as
754 by license or settlement) prior to the initiation of patent
755 infringement litigation, then the reasonable value of the licenses
756 granted by such Participant under Sections 2.1 or 2.2 shall be taken
757 into account in determining the amount or value of any payment or
758 license.
759
760 8.4. In the event of termination under Sections 8.1 or 8.2 above,
761 all end user license agreements (excluding distributors and resellers)
762 which have been validly granted by You or any distributor hereunder
763 prior to termination shall survive termination.
764
765 9. LIMITATION OF LIABILITY.
766
767 UNDER NO CIRCUMSTANCES AND UNDER NO LEGAL THEORY, WHETHER TORT
768 (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE, SHALL YOU, THE INITIAL
769 DEVELOPER, ANY OTHER CONTRIBUTOR, OR ANY DISTRIBUTOR OF COVERED CODE,
770 OR ANY SUPPLIER OF ANY OF SUCH PARTIES, BE LIABLE TO ANY PERSON FOR
771 ANY INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY
772 CHARACTER INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF GOODWILL,
773 WORK STOPPAGE, COMPUTER FAILURE OR MALFUNCTION, OR ANY AND ALL OTHER
774 COMMERCIAL DAMAGES OR LOSSES, EVEN IF SUCH PARTY SHALL HAVE BEEN
775 INFORMED OF THE POSSIBILITY OF SUCH DAMAGES. THIS LIMITATION OF
776 LIABILITY SHALL NOT APPLY TO LIABILITY FOR DEATH OR PERSONAL INJURY
777 RESULTING FROM SUCH PARTY'S NEGLIGENCE TO THE EXTENT APPLICABLE LAW
778 PROHIBITS SUCH LIMITATION. SOME JURISDICTIONS DO NOT ALLOW THE
779 EXCLUSION OR LIMITATION OF INCIDENTAL OR CONSEQUENTIAL DAMAGES, SO
780 THIS EXCLUSION AND LIMITATION MAY NOT APPLY TO YOU.
781
782 10. U.S. GOVERNMENT END USERS.
783
784 The Covered Code is a "commercial item," as that term is defined in
785 48 C.F.R. 2.101 (Oct. 1995), consisting of "commercial computer
786 software" and "commercial computer software documentation," as such
787 terms are used in 48 C.F.R. 12.212 (Sept. 1995). Consistent with 48
788 C.F.R. 12.212 and 48 C.F.R. 227.7202-1 through 227.7202-4 (June 1995),
789 all U.S. Government End Users acquire Covered Code with only those
790 rights set forth herein.
791
792 11. MISCELLANEOUS.
793
794 This License represents the complete agreement concerning subject
795 matter hereof. If any provision of this License is held to be
796 unenforceable, such provision shall be reformed only to the extent
797 necessary to make it enforceable. This License shall be governed by
798 California law provisions (except to the extent applicable law, if
799 any, provides otherwise), excluding its conflict-of-law provisions.
800 With respect to disputes in which at least one party is a citizen of,
801 or an entity chartered or registered to do business in the United
802 States of America, any litigation relating to this License shall be
803 subject to the jurisdiction of the Federal Courts of the Northern
804 District of California, with venue lying in Santa Clara County,
805 California, with the losing party responsible for costs, including
806 without limitation, court costs and reasonable attorneys' fees and
807 expenses. The application of the United Nations Convention on
808 Contracts for the International Sale of Goods is expressly excluded.
809 Any law or regulation which provides that the language of a contract
810 shall be construed against the drafter shall not apply to this
811 License.
812
813 12. RESPONSIBILITY FOR CLAIMS.
814
815 As between Initial Developer and the Contributors, each party is
816 responsible for claims and damages arising, directly or indirectly,
817 out of its utilization of rights under this License and You agree to
818 work with Initial Developer and Contributors to distribute such
819 responsibility on an equitable basis. Nothing herein is intended or
820 shall be deemed to constitute any admission of liability.
821
822 13. MULTIPLE-LICENSED CODE.
823
824 Initial Developer may designate portions of the Covered Code as
825 "Multiple-Licensed". "Multiple-Licensed" means that the Initial
826 Developer permits you to utilize portions of the Covered Code under
827 Your choice of the NPL or the alternative licenses, if any, specified
828 by the Initial Developer in the file described in Exhibit A.
829
830 EXHIBIT A -Mozilla Public License.
831
832 ``The contents of this file are subject to the Mozilla Public License
833 Version 1.1 (the "License"); you may not use this file except in
834 compliance with the License. You may obtain a copy of the License at
835 http://www.mozilla.org/MPL/
836
837 Software distributed under the License is distributed on an "AS IS"
838 basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
839 License for the specific language governing rights and limitations
840 under the License.
841
842 The Original Code is ______________________________________.
843
844 The Initial Developer of the Original Code is ________________________.
845 Portions created by ______________________ are Copyright (C) ______
846 _______________________. All Rights Reserved.
847
848 Contributor(s): ______________________________________.
849
850 Alternatively, the contents of this file may be used under the terms
851 of the _____ license (the "[___] License"), in which case the
852 provisions of [______] License are applicable instead of those
853 above. If you wish to allow use of your version of this file only
854 under the terms of the [____] License and not to allow others to use
855 your version of this file under the MPL, indicate your decision by
856 deleting the provisions above and replace them with the notice and
857 other provisions required by the [___] License. If you do not delete
858 the provisions above, a recipient may use your version of this file
859 under either the MPL or the [___] License."
860
861 [NOTE: The text of this Exhibit A may differ slightly from the text of
862 the notices in the Source Code files of the Original Code. You should
863 use the text of this Exhibit A rather than the text found in the
864 Original Code Source Code for Your Modifications.]
865
866 AspectJ runtime library (aspectjrt)
867
868 Eclipse Public License - v 1.0
869
870 THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE
871 PUBLIC LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF
872 THE PROGRAM CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT.
873
874 1. DEFINITIONS
875
876 "Contribution" means:
877
878 a) in the case of the initial Contributor, the initial code and
879 documentation distributed under this Agreement, and
880
881 b) in the case of each subsequent Contributor:
882
883 i) changes to the Program, and
884
885 ii) additions to the Program;
886
887 where such changes and/or additions to the Program originate from and
888 are distributed by that particular Contributor. A Contribution
889 'originates' from a Contributor if it was added to the Program by
890 such Contributor itself or anyone acting on such Contributor's behalf.
891 Contributions do not include additions to the Program which: (i) are
892 separate modules of software distributed in conjunction with the
893 Program under their own license agreement, and (ii) are not derivative
894 works of the Program.
895
896 "Contributor" means any person or entity that distributes the Program.
897
898 "Licensed Patents" mean patent claims licensable by a Contributor which
899 are necessarily infringed by the use or sale of its Contribution alone or
900 when combined with the Program.
901
902 "Program" means the Contributions distributed in accordance with this
903 Agreement.
904
905 "Recipient" means anyone who receives the Program under this Agreement,
906 including all Contributors.
907
908 2. GRANT OF RIGHTS
909
910 a) Subject to the terms of this Agreement, each Contributor hereby grants
911 Recipient a non-exclusive, worldwide, royalty-free copyright license to
912 reproduce, prepare derivative works of, publicly display, publicly
913 perform, distribute and sublicense the Contribution of such
914 Contributor, if any, and such derivative works, in source code and
915 object code form.
916
917 b) Subject to the terms of this Agreement, each Contributor hereby grants
918 Recipient a non-exclusive, worldwide, royalty-free patent license under
919 Licensed Patents to make, use, sell, offer to sell, import and
920 otherwise transfer the Contribution of such Contributor, if any, in
921 source code and object code form. This patent license shall apply to
922 the combination of the Contribution and the Program if, at the time
923 the Contribution is added by the Contributor, such addition of the
924 Contribution causes such combination to be covered by the Licensed
925 Patents. The patent license shall not apply to any other combinations
926 which include the Contribution. No hardware per se is licensed hereunder.
927
928 c) Recipient understands that although each Contributor grants the
929 licenses to its Contributions set forth herein, no assurances are
930 provided by any Contributor that the Program does not infringe the
931 patent or other intellectual property rights of any other entity. Each
932 Contributor disclaims any liability to Recipient for claims brought by
933 any other entity based on infringement of intellectual property rights
934 or otherwise. As a condition to exercising the rights and licenses
935 granted hereunder, each Recipient hereby assumes sole responsibility
936 to secure any other intellectual property rights needed, if any. For
937 example, if a third party patent license is required to allow Recipient
938 to distribute the Program, it is Recipient's responsibility to acquire
939 that license before distributing the Program.
940
941 d) Each Contributor represents that to its knowledge it has sufficient
942 copyright rights in its Contribution, if any, to grant the copyright
943 license set forth in this Agreement.
944
945 3. REQUIREMENTS
946
947 A Contributor may choose to distribute the Program in object code form
948 under its own license agreement, provided that:
949
950 a) it complies with the terms and conditions of this Agreement; and
951
952 b) its license agreement:
953
954 i) effectively disclaims on behalf of all Contributors all warranties
955 and conditions, express and implied, including warranties or
956 conditions of title and non-infringement, and implied warranties
957 or conditions of merchantability and fitness for a particular
958 purpose;
959
960 ii) effectively excludes on behalf of all Contributors all liability
961 for damages, including direct, indirect, special, incidental and
962 consequential damages, such as lost profits;
963
964 iii) states that any provisions which differ from this Agreement are
965 offered by that Contributor alone and not by any other party; and
966
967 iv) states that source code for the Program is available from such
968 Contributor, and informs licensees how to obtain it in a
969 reasonable manner on or through a medium customarily used for
970 software exchange.
971
972 When the Program is made available in source code form:
973
974 a) it must be made available under this Agreement; and
975
976 b) a copy of this Agreement must be included with each copy of the
977 Program.
978
979 Contributors may not remove or alter any copyright notices contained
980 within the Program.
981
982 Each Contributor must identify itself as the originator of its
983 Contribution, if any, in a manner that reasonably allows subsequent
984 Recipients to identify the originator of the Contribution.
985
986 4. COMMERCIAL DISTRIBUTION
987
988 Commercial distributors of software may accept certain responsibilities
989 with respect to end users, business partners and the like. While this
990 license is intended to facilitate the commercial use of the Program,
991 the Contributor who includes the Program in a commercial product offering
992 should do so in a manner which does not create potential liability for
993 other Contributors. Therefore, if a Contributor includes the Program in
994 a commercial product offering, such Contributor ("Commercial Contributor")
995 hereby agrees to defend and indemnify every other Contributor
996 ("Indemnified Contributor") against any losses, damages and costs
997 (collectively "Losses") arising from claims, lawsuits and other legal
998 actions brought by a third party against the Indemnified Contributor to
999 the extent caused by the acts or omissions of such Commercial Contributor
1000 in connection with its distribution of the Program in a commercial
1001 product offering. The obligations in this section do not apply to any
1002 claims or Losses relating to any actual or alleged intellectual property
1003 infringement. In order to qualify, an Indemnified Contributor must:
1004 a) promptly notify the Commercial Contributor in writing of such claim,
1005 and b) allow the Commercial Contributor to control, and cooperate with
1006 the Commercial Contributor in, the defense and any related settlement
1007 negotiations. The Indemnified Contributor may participate in any such
1008 claim at its own expense.
1009
1010 For example, a Contributor might include the Program in a commercial
1011 product offering, Product X. That Contributor is then a Commercial
1012 Contributor. If that Commercial Contributor then makes performance claims,
1013 or offers warranties related to Product X, those performance claims and
1014 warranties are such Commercial Contributor's responsibility alone. Under
1015 this section, the Commercial Contributor would have to defend claims
1016 against the other Contributors related to those performance claims and
1017 warranties, and if a court requires any other Contributor to pay any
1018 damages as a result, the Commercial Contributor must pay those damages.
1019
1020 5. NO WARRANTY
1021
1022 EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS PROVIDED
1023 ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER
1024 EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR
1025 CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A
1026 PARTICULAR PURPOSE. Each Recipient is solely responsible for determining
1027 the appropriateness of using and distributing the Program and assumes all
1028 risks associated with its exercise of rights under this Agreement ,
1029 including but not limited to the risks and costs of program errors,
1030 compliance with applicable laws, damage to or loss of data, programs or
1031 equipment, and unavailability or interruption of operations.
1032
1033 6. DISCLAIMER OF LIABILITY
1034
1035 EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR
1036 ANY CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT,
1037 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING
1038 WITHOUT LIMITATION LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF
1039 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
1040 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OR
1041 DISTRIBUTION OF THE PROGRAM OR THE EXERCISE OF ANY RIGHTS GRANTED
1042 HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
1043
1044 7. GENERAL
1045
1046 If any provision of this Agreement is invalid or unenforceable under
1047 applicable law, it shall not affect the validity or enforceability of
1048 the remainder of the terms of this Agreement, and without further action
1049 by the parties hereto, such provision shall be reformed to the minimum
1050 extent necessary to make such provision valid and enforceable.
1051
1052 If Recipient institutes patent litigation against any entity (including
1053 a cross-claim or counterclaim in a lawsuit) alleging that the Program
1054 itself (excluding combinations of the Program with other software or
1055 hardware) infringes such Recipient's patent(s), then such Recipient's
1056 rights granted under Section 2(b) shall terminate as of the date such
1057 litigation is filed.
1058
1059 All Recipient's rights under this Agreement shall terminate if it fails
1060 to comply with any of the material terms or conditions of this Agreement
1061 and does not cure such failure in a reasonable period of time after
1062 becoming aware of such noncompliance. If all Recipient's rights under
1063 this Agreement terminate, Recipient agrees to cease use and distribution
1064 of the Program as soon as reasonably practicable. However, Recipient's
1065 obligations under this Agreement and any licenses granted by Recipient
1066 relating to the Program shall continue and survive.
1067
1068 Everyone is permitted to copy and distribute copies of this Agreement,
1069 but in order to avoid inconsistency the Agreement is copyrighted and may
1070 only be modified in the following manner. The Agreement Steward reserves
1071 the right to publish new versions (including revisions) of this Agreement
1072 from time to time. No one other than the Agreement Steward has the right
1073 to modify this Agreement. The Eclipse Foundation is the initial Agreement
1074 Steward. The Eclipse Foundation may assign the responsibility to serve as
1075 the Agreement Steward to a suitable separate entity. Each new version of
1076 the Agreement will be given a distinguishing version number. The Program
1077 (including Contributions) may always be distributed subject to the version
1078 of the Agreement under which it was received. In addition, after a new
1079 version of the Agreement is published, Contributor may elect to distribute
1080 the Program (including its Contributions) under the new version. Except as
1081 expressly stated in Sections 2(a) and 2(b) above, Recipient receives no
1082 rights or licenses to the intellectual property of any Contributor under
1083 this Agreement, whether expressly, by implication, estoppel or otherwise.
1084 All rights in the Program not expressly granted under this Agreement
1085 are reserved.
1086
1087 This Agreement is governed by the laws of the State of New York and the
1088 intellectual property laws of the United States of America. No party to
1089 this Agreement will bring a legal action under this Agreement more than
1090 one year after the cause of action arose. Each party waives its rights to
1091 a jury trial in any resulting litigation.
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.bundle;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertTrue;
20 import static org.ops4j.pax.exam.CoreOptions.bundle;
21 import static org.ops4j.pax.exam.CoreOptions.junitBundles;
22
23 import java.io.File;
24 import java.io.FileInputStream;
25 import java.io.IOException;
26 import java.io.InputStream;
27 import java.net.URISyntaxException;
28
29 import org.apache.tika.Tika;
30 import org.apache.tika.metadata.Metadata;
31 import org.apache.tika.parser.ParseContext;
32 import org.apache.tika.parser.Parser;
33 import org.apache.tika.sax.BodyContentHandler;
34 import org.junit.Test;
35 import org.ops4j.pax.exam.CoreOptions;
36 import org.ops4j.pax.exam.Option;
37 import org.ops4j.pax.exam.junit.Configuration;
38 import org.osgi.framework.BundleContext;
39 import org.xml.sax.ContentHandler;
40
41 public class BundleIT {
42
43 private final File TARGET = new File("target");
44
45 @Configuration
46 public Option[] configuration() throws IOException, URISyntaxException {
47 File base = new File(TARGET, "test-bundles");
48 return CoreOptions.options(
49 junitBundles(),
50 bundle(new File(base, "tika-core.jar").toURI().toURL().toString()),
51 bundle(new File(base, "tika-bundle.jar").toURI().toURL().toString()));
52 }
53
54 //@Test
55 public void testTikaBundle(BundleContext bc) throws Exception {
56 Tika tika = new Tika();
57
58 // Simple type detection
59 assertEquals("text/plain", tika.detect("test.txt"));
60 assertEquals("application/pdf", tika.detect("test.pdf"));
61
62 // Simple text extraction
63 String xml = tika.parseToString(new File("pom.xml"));
64 assertTrue(xml.contains("tika-bundle"));
65
66 // Package extraction
67 ContentHandler handler = new BodyContentHandler();
68
69 Parser parser = tika.getParser();
70 ParseContext context = new ParseContext();
71 context.set(Parser.class, parser);
72
73 InputStream stream =
74 new FileInputStream("src/test/resources/test-documents.zip");
75 try {
76 parser.parse(stream, handler, new Metadata(), context);
77 } finally {
78 stream.close();
79 }
80
81 String content = handler.toString();
82 assertTrue(content.contains("testEXCEL.xls"));
83 assertTrue(content.contains("Sample Excel Worksheet"));
84 assertTrue(content.contains("testHTML.html"));
85 assertTrue(content.contains("Test Indexation Html"));
86 assertTrue(content.contains("testOpenOffice2.odt"));
87 assertTrue(content.contains("This is a sample Open Office document"));
88 assertTrue(content.contains("testPDF.pdf"));
89 assertTrue(content.contains("Apache Tika"));
90 assertTrue(content.contains("testPPT.ppt"));
91 assertTrue(content.contains("Sample Powerpoint Slide"));
92 assertTrue(content.contains("testRTF.rtf"));
93 assertTrue(content.contains("indexation Word"));
94 assertTrue(content.contains("testTXT.txt"));
95 assertTrue(content.contains("Test d'indexation de Txt"));
96 assertTrue(content.contains("testWORD.doc"));
97 assertTrue(content.contains("This is a sample Microsoft Word Document"));
98 assertTrue(content.contains("testXML.xml"));
99 assertTrue(content.contains("Rida Benjelloun"));
100 }
101
102 }
0 <!--
1 Licensed to the Apache Software Foundation (ASF) under one or more
2 contributor license agreements. See the NOTICE file distributed with
3 this work for additional information regarding copyright ownership.
4 The ASF licenses this file to You under the Apache License, Version 2.0
5 (the "License"); you may not use this file except in compliance with
6 the License. You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10 Unless required by applicable law or agreed to in writing, software
11 distributed under the License is distributed on an "AS IS" BASIS,
12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 See the License for the specific language governing permissions and
14 limitations under the License.
15 -->
16 <assembly xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.2"
17 xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
18 xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.2 http://maven.apache.org/xsd/assembly-1.1.2.xsd">
19 <id>bundles</id>
20 <formats>
21 <format>dir</format>
22 </formats>
23 <includeBaseDirectory>false</includeBaseDirectory>
24 <dependencySets>
25 <dependencySet>
26 <outputDirectory/>
27 <outputFileNameMapping>${artifact.artifactId}.jar</outputFileNameMapping>
28 <includes>
29 <include>org.apache.tika:tika-core</include>
30 <include>org.apache.tika:tika-bundle</include>
31 </includes>
32 </dependencySet>
33 </dependencySets>
34 </assembly>
0 <?xml version="1.0" encoding="UTF-8"?>
1
2 <!--
3 Licensed to the Apache Software Foundation (ASF) under one
4 or more contributor license agreements. See the NOTICE file
5 distributed with this work for additional information
6 regarding copyright ownership. The ASF licenses this file
7 to you under the Apache License, Version 2.0 (the
8 "License"); you may not use this file except in compliance
9 with the License. You may obtain a copy of the License at
10
11 http://www.apache.org/licenses/LICENSE-2.0
12
13 Unless required by applicable law or agreed to in writing,
14 software distributed under the License is distributed on an
15 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 KIND, either express or implied. See the License for the
17 specific language governing permissions and limitations
18 under the License.
19 -->
20
21 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
22 <modelVersion>4.0.0</modelVersion>
23
24 <parent>
25 <groupId>org.apache.tika</groupId>
26 <artifactId>tika-parent</artifactId>
27 <version>1.5</version>
28 <relativePath>../tika-parent/pom.xml</relativePath>
29 </parent>
30
31 <artifactId>tika-core</artifactId>
32 <packaging>bundle</packaging>
33 <name>Apache Tika core</name>
34 <url>http://tika.apache.org/</url>
35
36 <dependencies>
37 <!-- Optional OSGi dependencies, used only when running within OSGi -->
38 <dependency>
39 <groupId>org.osgi</groupId>
40 <artifactId>org.osgi.core</artifactId>
41 <version>4.0.0</version>
42 <scope>provided</scope>
43 <optional>true</optional>
44 </dependency>
45 <dependency>
46 <groupId>org.osgi</groupId>
47 <artifactId>org.osgi.compendium</artifactId>
48 <version>4.0.0</version>
49 <scope>provided</scope>
50 <optional>true</optional>
51 </dependency>
52 <dependency>
53 <groupId>biz.aQute</groupId>
54 <artifactId>bndlib</artifactId>
55 <scope>provided</scope>
56 </dependency>
57
58 <!-- Test dependencies -->
59 <dependency>
60 <groupId>junit</groupId>
61 <artifactId>junit</artifactId>
62 <scope>test</scope>
63 <version>4.11</version>
64 </dependency>
65 </dependencies>
66
67 <build>
68 <plugins>
69 <plugin>
70 <groupId>org.apache.felix</groupId>
71 <artifactId>maven-bundle-plugin</artifactId>
72 <extensions>true</extensions>
73 <configuration>
74 <instructions>
75 <Bundle-DocURL>${project.url}</Bundle-DocURL>
76 <Bundle-Activator>
77 org.apache.tika.config.TikaActivator
78 </Bundle-Activator>
79 <Bundle-ActivationPolicy>lazy</Bundle-ActivationPolicy>
80 </instructions>
81 </configuration>
82 </plugin>
83 <plugin>
84 <groupId>org.apache.rat</groupId>
85 <artifactId>apache-rat-plugin</artifactId>
86 <configuration>
87 <excludes>
88 <exclude>src/test/resources/org/apache/tika/**</exclude>
89 </excludes>
90 </configuration>
91 </plugin>
92 <plugin>
93 <groupId>org.codehaus.mojo</groupId>
94 <artifactId>clirr-maven-plugin</artifactId>
95 <executions>
96 <execution>
97 <phase>verify</phase>
98 <goals>
99 <goal>check</goal>
100 </goals>
101 <configuration>
102 <excludes>
103 <exclude>org/apache/tika/config/TikaActivator</exclude>
104 <exclude>org/apache/tika/metadata/Property$PropertyType</exclude>
105 <exclude>org/apache/tika/metadata/Property$ValueType</exclude>
106 <exclude>org/apache/tika/metadata/DublinCore</exclude>
107 <exclude>org/apache/tika/metadata/Metadata</exclude>
108 <exclude>org/apache/tika/metadata/MSOffice</exclude>
109 <exclude>org/apache/tika/parser/EmptyParser</exclude>
110 </excludes>
111 <comparisonArtifacts>
112 <comparisonArtifact>
113 <groupId>org.apache.tika</groupId>
114 <artifactId>tika-core</artifactId>
115 <version>1.0</version>
116 <type>jar</type>
117 </comparisonArtifact>
118 </comparisonArtifacts>
119 </configuration>
120 </execution>
121 </executions>
122 </plugin>
123 <plugin>
124 <artifactId>maven-failsafe-plugin</artifactId>
125 <version>2.10</version>
126 <configuration>
127 <additionalClasspathElements>
128 <additionalClasspathElement>
129 ${project.build.directory}/${project.build.finalName}.jar
130 </additionalClasspathElement>
131 </additionalClasspathElements>
132 </configuration>
133 <executions>
134 <execution>
135 <goals>
136 <goal>integration-test</goal>
137 <goal>verify</goal>
138 </goals>
139 </execution>
140 </executions>
141 </plugin>
142 </plugins>
143 </build>
144
145 <description>This is the core Apache Tika™ toolkit library from which all other modules inherit functionality. It also includes the core facades for the Tika API. </description>
146 <organization>
147 <name>The Apache Software Foundation</name>
148 <url>http://www.apache.org</url>
149 </organization>
150 <scm>
151 <url>http://svn.apache.org/viewvc/tika/tags/1.5/core</url>
152 <connection>scm:svn:http://svn.apache.org/repos/asf/tika/tags/1.5/core</connection>
153 <developerConnection>scm:svn:https://svn.apache.org/repos/asf/tika/tags/1.5/core</developerConnection>
154 </scm>
155 <issueManagement>
156 <system>JIRA</system>
157 <url>https://issues.apache.org/jira/browse/TIKA</url>
158 </issueManagement>
159 <ciManagement>
160 <system>Jenkins</system>
161 <url>https://builds.apache.org/job/Tika-trunk/</url>
162 </ciManagement>
163 </project>
0 APACHE TIKA SUBCOMPONENTS
1
2 Apache Tika includes a number of subcomponents with separate copyright notices
3 and license terms. Your use of these subcomponents is subject to the terms and
4 conditions of the following licenses.
5
6 MIME type information from file-4.26.tar.gz (http://www.darwinsys.com/file/)
7
8 Copyright (c) Ian F. Darwin 1986, 1987, 1989, 1990, 1991, 1992, 1994, 1995.
9 Software written by Ian F. Darwin and others;
10 maintained 1994- Christos Zoulas.
11
12 This software is not subject to any export provision of the United States
13 Department of Commerce, and may be exported to any country or planet.
14
15 Redistribution and use in source and binary forms, with or without
16 modification, are permitted provided that the following conditions
17 are met:
18 1. Redistributions of source code must retain the above copyright
19 notice immediately at the beginning of the file, without modification,
20 this list of conditions, and the following disclaimer.
21 2. Redistributions in binary form must reproduce the above copyright
22 notice, this list of conditions and the following disclaimer in the
23 documentation and/or other materials provided with the distribution.
24
25 THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
26 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
29 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 SUCH DAMAGE.
36
37
38 IPTC Photo Metadata descriptions are taken from the IPTC Photo Metadata
39 Standard, July 2010, Copyright 2010 International Press Telecommunications
40 Council.
41
42 1. The Specifications and Materials are licensed for use only on the condition that you agree to be bound by the terms of this license. Subject to this and other licensing requirements contained herein, you may, on a non-exclusive basis, use the Specifications and Materials.
43 2. The IPTC openly provides the Specifications and Materials for voluntary use by individuals, partnerships, companies, corporations, organizations and any other entity for use at the entity's own risk. This disclaimer, license and release is intended to apply to the IPTC, its officers, directors, agents, representatives, members, contributors, affiliates, contractors, or co-venturers acting jointly or severally.
44 3. The Document and translations thereof may be copied and furnished to others, and derivative works that comment on or otherwise explain it or assist in its implementation may be prepared, copied, published and distributed, in whole or in part, without restriction of any kind, provided that the copyright and license notices and references to the IPTC appearing in the Document and the terms of this Specifications License Agreement are included on all such copies and derivative works. Further, upon the receipt of written permission from the IPTC, the Document may be modified for the purpose of developing applications that use IPTC Specifications or as required to translate the Document into languages other than English.
45 4. Any use, duplication, distribution, or exploitation of the Document and Specifications and Materials in any manner is at your own risk.
46 5. NO WARRANTY, EXPRESSED OR IMPLIED, IS MADE REGARDING THE ACCURACY, ADEQUACY, COMPLETENESS, LEGALITY, RELIABILITY OR USEFULNESS OF ANY INFORMATION CONTAINED IN THE DOCUMENT OR IN ANY SPECIFICATION OR OTHER PRODUCT OR SERVICE PRODUCED OR SPONSORED BY THE IPTC. THE DOCUMENT AND THE INFORMATION CONTAINED HEREIN AND INCLUDED IN ANY SPECIFICATION OR OTHER PRODUCT OR SERVICE OF THE IPTC IS PROVIDED ON AN "AS IS" BASIS. THE IPTC DISCLAIMS ALL WARRANTIES OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, ANY ACTUAL OR ASSERTED WARRANTY OF NON-INFRINGEMENT OF PROPRIETARY RIGHTS, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. NEITHER THE IPTC NOR ITS CONTRIBUTORS SHALL BE HELD LIABLE FOR ANY IMPROPER OR INCORRECT USE OF INFORMATION. NEITHER THE IPTC NOR ITS CONTRIBUTORS ASSUME ANY RESPONSIBILITY FOR ANYONE'S USE OF INFORMATION PROVIDED BY THE IPTC. IN NO EVENT SHALL THE IPTC OR ITS CONTRIBUTORS BE LIABLE TO ANYONE FOR DAMAGES OF ANY KIND, INCLUDING BUT NOT LIMITED TO, COMPENSATORY DAMAGES, LOST PROFITS, LOST DATA OR ANY FORM OF SPECIAL, INCIDENTAL, INDIRECT, CONSEQUENTIAL OR PUNITIVE DAMAGES OF ANY KIND WHETHER BASED ON BREACH OF CONTRACT OR WARRANTY, TORT, PRODUCT LIABILITY OR OTHERWISE.
47 6. The IPTC takes no position regarding the validity or scope of any Intellectual Property or other rights that might be claimed to pertain to the implementation or use of the technology described in the Document or the extent to which any license under such rights might or might not be available. The IPTC does not represent that it has made any effort to identify any such rights. Copies of claims of rights made available for publication, assurances of licenses to be made available, or the result of an attempt made to obtain a general license or permission for the use of such proprietary rights by implementers or users of the Specifications and Materials, can be obtained from the Managing Director of the IPTC.
48 7. By using the Specifications and Materials including the Document in any manner or for any purpose, you release the IPTC from all liabilities, claims, causes of action, allegations, losses, injuries, damages, or detriments of any nature arising from or relating to the use of the Specifications, Materials or any portion thereof. You further agree not to file a lawsuit, make a claim, or take any other formal or informal legal action against the IPTC, resulting from your acquisition, use, duplication, distribution, or exploitation of the Specifications, Materials or any portion thereof. Finally, you hereby agree that the IPTC is not liable for any direct, indirect, special or consequential damages arising from or relating to your acquisition, use, duplication, distribution, or exploitation of the Specifications, Materials or any portion thereof.
49 8. Specifications and Materials may be downloaded or copied provided that ALL copies retain the ownership, copyright and license notices.
50 9. Materials may not be edited, modified, or presented in a context that creates a misleading or false impression or statement as to the positions, actions, or statements of the IPTC.
51 10. The name and trademarks of the IPTC may not be used in advertising, publicity, or in relation to products or services and their names without the specific, written prior permission of the IPTC. Any permitted use of the trademarks of the IPTC, whether registered or not, shall be accompanied by an appropriate mark and attribution, as agreed with the IPTC.
52 11. Specifications may be extended by both members and non-members to provide additional functionality (Extension Specifications) provided that there is a clear recognition of the IPTC IP and its ownership in the Extension Specifications and the related documentation and provided that the extensions are clearly identified and provided that a perpetual license is granted by the creator of the Extension Specifications for other members and non-members to use the Extension Specifications and to continue extensions of the Extension Specifications. The IPTC does not waive any of its rights in the Specifications and Materials in this context. The Extension Specifications may be considered the intellectual property of their creator. The IPTC expressly disclaims any responsibility for damage caused by an extension to the Specifications.
53 12. Specifications and Materials may be included in derivative work of both members and non-members provided that there is a clear recognition of the IPTC IP and its ownership in the derivative work and its related documentation. The IPTC does not waive any of its rights in the Specifications and Materials in this context. Derivative work in its entirety may be considered the intellectual property of the creator of the work .The IPTC expressly disclaims any responsibility for damage caused when its IP is used in a derivative context.
54 13. This Specifications License Agreement is perpetual subject to your conformance to the terms of this Agreement. The IPTC may terminate this Specifications License Agreement immediately upon your breach of this Agreement and, upon such termination you will cease all use, duplication, distribution, and/or exploitation in any manner of the Specifications and Materials.
55 14. This Specifications License Agreement reflects the entire agreement of the parties regarding the subject matter hereof and supersedes all prior agreements or representations regarding such matters, whether written or oral. To the extent any portion or provision of this Specifications License Agreement is found to be illegal or unenforceable, then the remaining provisions of this Specifications License Agreement will remain in full force and effect and the illegal or unenforceable provision will be construed to give it such effect as it may properly have that is consistent with the intentions of the parties.
56 15. This Specifications License Agreement may only be modified in writing signed by an authorized representative of the IPTC.
57 16. This Specifications License Agreement is governed by the law of United Kingdom, as such law is applied to contracts made and fully performed in the United Kingdom. Any disputes arising from or relating to this Specifications License Agreement will be resolved in the courts of the United Kingdom. You consent to the jurisdiction of such courts over you and covenant not to assert before such courts any objection to proceeding in such forums.
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika;
17
18 import java.io.BufferedInputStream;
19 import java.io.File;
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.io.Reader;
23 import java.net.URL;
24 import java.util.Properties;
25
26 import org.apache.tika.config.TikaConfig;
27 import org.apache.tika.detect.Detector;
28 import org.apache.tika.exception.TikaException;
29 import org.apache.tika.io.TikaInputStream;
30 import org.apache.tika.metadata.Metadata;
31 import org.apache.tika.parser.AutoDetectParser;
32 import org.apache.tika.parser.ParseContext;
33 import org.apache.tika.parser.Parser;
34 import org.apache.tika.parser.ParsingReader;
35 import org.apache.tika.sax.BodyContentHandler;
36 import org.apache.tika.sax.WriteOutContentHandler;
37 import org.xml.sax.SAXException;
38
39 /**
40 * Facade class for accessing Tika functionality. This class hides much of
41 * the underlying complexity of the lower level Tika classes and provides
42 * simple methods for many common parsing and type detection operations.
43 *
44 * @since Apache Tika 0.5
45 * @see Parser
46 * @see Detector
47 */
48 public class Tika {
49
50 /**
51 * The detector instance used by this facade.
52 */
53 private final Detector detector;
54
55 /**
56 * The parser instance used by this facade.
57 */
58 private final Parser parser;
59
60 /**
61 * Maximum length of the strings returned by the parseToString methods.
62 * Used to prevent out of memory problems with huge input documents.
63 * The default setting is 100k characters.
64 */
65 private int maxStringLength = 100 * 1000;
66
67 /**
68 * Creates a Tika facade using the given detector and parser instances.
69 *
70 * @since Apache Tika 0.8
71 * @param detector type detector
72 * @param parser document parser
73 */
public Tika(Detector detector, Parser parser) {
    // Both collaborators are stored exactly as given; no validation happens here.
    this.parser = parser;
    this.detector = detector;
}
78
79 /**
80 * Creates a Tika facade using the given configuration.
81 *
82 * @param config Tika configuration
83 */
public Tika(TikaConfig config) {
    // Detector comes from the configuration; the parser is an
    // auto-detecting parser built from that same configuration.
    this(
            config.getDetector(),
            new AutoDetectParser(config));
}
87
88 /**
89 * Creates a Tika facade using the default configuration.
90 */
public Tika() {
    // Delegates to the configuration-based constructor with the default config.
    this(
            TikaConfig.getDefaultConfig());
}
94
95 /**
96 * Creates a Tika facade using the given detector instance and the
97 * default parser configuration.
98 *
99 * @since Apache Tika 0.8
100 * @param detector type detector
101 */
public Tika(Detector detector) {
    // The same detector is used by the facade and by the auto-detecting parser.
    this(
            detector,
            new AutoDetectParser(detector));
}
105
106
107 /**
108 * Detects the media type of the given document. The type detection is
109 * based on the content of the given document stream and any given
110 * document metadata. The document stream can be <code>null</code>,
111 * in which case only the given document metadata is used for type
112 * detection.
113 * <p>
114 * If the document stream supports the
115 * {@link InputStream#markSupported() mark feature}, then the stream is
116 * marked and reset to the original position before this method returns.
117 * Only a limited number of bytes are read from the stream.
118 * <p>
119 * The given document stream is <em>not</em> closed by this method.
120 * <p>
121 * Unlike in the {@link #parse(InputStream, Metadata)} method, the
122 * given document metadata is <em>not</em> modified by this method.
123 *
124 * @param stream the document stream, or <code>null</code>
125 * @param metadata document metadata
126 * @return detected media type
127 * @throws IOException if the stream can not be read
128 */
129 public String detect(InputStream stream, Metadata metadata)
130 throws IOException {
131 if (stream == null || stream.markSupported()) {
132 return detector.detect(stream, metadata).toString();
133 } else {
134 return detector.detect(
135 new BufferedInputStream(stream), metadata).toString();
136 }
137 }
138
139 /**
140 * Detects the media type of the given document. The type detection is
141 * based on the content of the given document stream and the name of the
142 * document.
143 * <p>
144 * If the document stream supports the
145 * {@link InputStream#markSupported() mark feature}, then the stream is
146 * marked and reset to the original position before this method returns.
147 * Only a limited number of bytes are read from the stream.
148 * <p>
149 * The given document stream is <em>not</em> closed by this method.
150 *
151 * @since Apache Tika 0.9
152 * @param stream the document stream
153 * @param name document name
154 * @return detected media type
155 * @throws IOException if the stream can not be read
156 */
157 public String detect(InputStream stream, String name) throws IOException {
158 Metadata metadata = new Metadata();
159 metadata.set(Metadata.RESOURCE_NAME_KEY, name);
160 return detect(stream, metadata);
161 }
162
163 /**
164 * Detects the media type of the given document. The type detection is
165 * based on the content of the given document stream.
166 * <p>
167 * If the document stream supports the
168 * {@link InputStream#markSupported() mark feature}, then the stream is
169 * marked and reset to the original position before this method returns.
170 * Only a limited number of bytes are read from the stream.
171 * <p>
172 * The given document stream is <em>not</em> closed by this method.
173 *
174 * @param stream the document stream
175 * @return detected media type
176 * @throws IOException if the stream can not be read
177 */
178 public String detect(InputStream stream) throws IOException {
179 return detect(stream, new Metadata());
180 }
181
182 /**
183 * Detects the media type of the given document. The type detection is
184 * based on the first few bytes of a document and the document name.
185 * <p>
186 * For best results at least a few kilobytes of the document data
187 * are needed. See also the other detect() methods for better
188 * alternatives when you have more than just the document prefix
189 * available for type detection.
190 *
191 * @since Apache Tika 0.9
192 * @param prefix first few bytes of the document
193 * @param name document name
194 * @return detected media type
195 */
196 public String detect(byte[] prefix, String name) {
197 try {
198 InputStream stream = TikaInputStream.get(prefix);
199 try {
200 return detect(stream, name);
201 } finally {
202 stream.close();
203 }
204 } catch (IOException e) {
205 throw new IllegalStateException("Unexpected IOException", e);
206 }
207 }
208
209 /**
210 * Detects the media type of the given document. The type detection is
211 * based on the first few bytes of a document.
212 * <p>
213 * For best results at least a few kilobytes of the document data
214 * are needed. See also the other detect() methods for better
215 * alternatives when you have more than just the document prefix
216 * available for type detection.
217 *
218 * @since Apache Tika 0.9
219 * @param prefix first few bytes of the document
220 * @return detected media type
221 */
222 public String detect(byte[] prefix) {
223 try {
224 InputStream stream = TikaInputStream.get(prefix);
225 try {
226 return detect(stream);
227 } finally {
228 stream.close();
229 }
230 } catch (IOException e) {
231 throw new IllegalStateException("Unexpected IOException", e);
232 }
233 }
234
235 /**
236 * Detects the media type of the given file. The type detection is
237 * based on the document content and a potential known file extension.
238 * <p>
239 * Use the {@link #detect(String)} method when you want to detect the
240 * type of the document without actually accessing the file.
241 *
242 * @param file the file
243 * @return detected media type
244 * @throws IOException if the file can not be read
245 */
246 public String detect(File file) throws IOException {
247 return detect(file.toURI().toURL());
248 }
249
250 /**
251 * Detects the media type of the resource at the given URL. The type
252 * detection is based on the document content and a potential known
253 * file extension included in the URL.
254 * <p>
255 * Use the {@link #detect(String)} method when you want to detect the
256 * type of the document without actually accessing the URL.
257 *
258 * @param url the URL of the resource
259 * @return detected media type
260 * @throws IOException if the resource can not be read
261 */
262 public String detect(URL url) throws IOException {
263 Metadata metadata = new Metadata();
264 InputStream stream = TikaInputStream.get(url, metadata);
265 try {
266 return detect(stream, metadata);
267 } finally {
268 stream.close();
269 }
270 }
271
272 /**
273 * Detects the media type of a document with the given file name.
274 * The type detection is based on known file name extensions.
275 * <p>
276 * The given name can also be a URL or a full file path. In such cases
277 * only the file name part of the string is used for type detection.
278 *
279 * @param name the file name of the document
280 * @return detected media type
281 */
282 public String detect(String name) {
283 try {
284 return detect((InputStream) null, name);
285 } catch (IOException e) {
286 throw new IllegalStateException("Unexpected IOException", e);
287 }
288 }
289
290 /**
291 * Parses the given document and returns the extracted text content.
292 * Input metadata like a file name or a content type hint can be passed
293 * in the given metadata instance. Metadata information extracted from
294 * the document is returned in that same metadata instance.
295 * <p>
296 * The returned reader will be responsible for closing the given stream.
297 * The stream and any associated resources will be closed at or before
298 * the time when the {@link Reader#close()} method is called.
299 *
300 * @param stream the document to be parsed
301 * @param metadata document metadata
302 * @return extracted text content
303 * @throws IOException if the document can not be read or parsed
304 */
305 public Reader parse(InputStream stream, Metadata metadata)
306 throws IOException {
307 ParseContext context = new ParseContext();
308 context.set(Parser.class, parser);
309 return new ParsingReader(parser, stream, metadata, context);
310 }
311
312 /**
313 * Parses the given document and returns the extracted text content.
314 * <p>
315 * The returned reader will be responsible for closing the given stream.
316 * The stream and any associated resources will be closed at or before
317 * the time when the {@link Reader#close()} method is called.
318 *
319 * @param stream the document to be parsed
320 * @return extracted text content
321 * @throws IOException if the document can not be read or parsed
322 */
323 public Reader parse(InputStream stream) throws IOException {
324 return parse(stream, new Metadata());
325 }
326
327 /**
328 * Parses the given file and returns the extracted text content.
329 *
330 * @param file the file to be parsed
331 * @return extracted text content
332 * @throws IOException if the file can not be read or parsed
333 */
334 public Reader parse(File file) throws IOException {
335 return parse(file.toURI().toURL());
336 }
337
338 /**
339 * Parses the resource at the given URL and returns the extracted
340 * text content.
341 *
342 * @param url the URL of the resource to be parsed
343 * @return extracted text content
344 * @throws IOException if the resource can not be read or parsed
345 */
346 public Reader parse(URL url) throws IOException {
347 Metadata metadata = new Metadata();
348 InputStream stream = TikaInputStream.get(url, metadata);
349 return parse(stream, metadata);
350 }
351
352 /**
353 * Parses the given document and returns the extracted text content.
354 * The given input stream is closed by this method.
355 * <p>
356 * To avoid unpredictable excess memory use, the returned string contains
357 * only up to {@link #getMaxStringLength()} first characters extracted
358 * from the input document. Use the {@link #setMaxStringLength(int)}
359 * method to adjust this limitation.
360 * <p>
361 * <strong>NOTE:</strong> Unlike most other Tika methods that take an
362 * {@link InputStream}, this method will close the given stream for
363 * you as a convenience. With other methods you are still responsible
364 * for closing the stream or a wrapper instance returned by Tika.
365 *
366 * @param stream the document to be parsed
367 * @param metadata document metadata
368 * @return extracted text content
369 * @throws IOException if the document can not be read
370 * @throws TikaException if the document can not be parsed
371 */
372 public String parseToString(InputStream stream, Metadata metadata)
373 throws IOException, TikaException {
374 WriteOutContentHandler handler =
375 new WriteOutContentHandler(maxStringLength);
376 try {
377 ParseContext context = new ParseContext();
378 context.set(Parser.class, parser);
379 parser.parse(
380 stream, new BodyContentHandler(handler), metadata, context);
381 } catch (SAXException e) {
382 if (!handler.isWriteLimitReached(e)) {
383 // This should never happen with BodyContentHandler...
384 throw new TikaException("Unexpected SAX processing failure", e);
385 }
386 } finally {
387 stream.close();
388 }
389 return handler.toString();
390 }
391
392 /**
393 * Parses the given document and returns the extracted text content.
394 * The given input stream is closed by this method. This method lets
395 * you control the maxStringLength per call.
396 * <p>
397 * To avoid unpredictable excess memory use, the returned string contains
398 * only up to maxLength (parameter) first characters extracted
399 * from the input document.
400 * <p>
401 * <strong>NOTE:</strong> Unlike most other Tika methods that take an
402 * {@link InputStream}, this method will close the given stream for
403 * you as a convenience. With other methods you are still responsible
404 * for closing the stream or a wrapper instance returned by Tika.
405 *
406 * @param stream the document to be parsed
407 * @param metadata document metadata
408 * @param maxLength maximum length of the returned string
409 * @return extracted text content
410 * @throws IOException if the document can not be read
411 * @throws TikaException if the document can not be parsed
412 */
413 public String parseToString(InputStream stream, Metadata metadata, int maxLength)
414 throws IOException, TikaException {
415 WriteOutContentHandler handler =
416 new WriteOutContentHandler(maxLength);
417 try {
418 ParseContext context = new ParseContext();
419 context.set(Parser.class, parser);
420 parser.parse(
421 stream, new BodyContentHandler(handler), metadata, context);
422 } catch (SAXException e) {
423 if (!handler.isWriteLimitReached(e)) {
424 // This should never happen with BodyContentHandler...
425 throw new TikaException("Unexpected SAX processing failure", e);
426 }
427 } finally {
428 stream.close();
429 }
430 return handler.toString();
431 }
432
433 /**
434 * Parses the given document and returns the extracted text content.
435 * The given input stream is closed by this method.
436 * <p>
437 * To avoid unpredictable excess memory use, the returned string contains
438 * only up to {@link #getMaxStringLength()} first characters extracted
439 * from the input document. Use the {@link #setMaxStringLength(int)}
440 * method to adjust this limitation.
441 * <p>
442 * <strong>NOTE:</strong> Unlike most other Tika methods that take an
443 * {@link InputStream}, this method will close the given stream for
444 * you as a convenience. With other methods you are still responsible
445 * for closing the stream or a wrapper instance returned by Tika.
446 *
447 * @param stream the document to be parsed
448 * @return extracted text content
449 * @throws IOException if the document can not be read
450 * @throws TikaException if the document can not be parsed
451 */
452 public String parseToString(InputStream stream)
453 throws IOException, TikaException {
454 return parseToString(stream, new Metadata());
455 }
456
457 /**
458 * Parses the given file and returns the extracted text content.
459 * <p>
460 * To avoid unpredictable excess memory use, the returned string contains
461 * only up to {@link #getMaxStringLength()} first characters extracted
462 * from the input document. Use the {@link #setMaxStringLength(int)}
463 * method to adjust this limitation.
464 *
465 * @param file the file to be parsed
466 * @return extracted text content
467 * @throws IOException if the file can not be read
468 * @throws TikaException if the file can not be parsed
469 */
470 public String parseToString(File file) throws IOException, TikaException {
471 return parseToString(file.toURI().toURL());
472 }
473
474 /**
475 * Parses the resource at the given URL and returns the extracted
476 * text content.
477 * <p>
478 * To avoid unpredictable excess memory use, the returned string contains
479 * only up to {@link #getMaxStringLength()} first characters extracted
480 * from the input document. Use the {@link #setMaxStringLength(int)}
481 * method to adjust this limitation.
482 *
483 * @param url the URL of the resource to be parsed
484 * @return extracted text content
485 * @throws IOException if the resource can not be read
486 * @throws TikaException if the resource can not be parsed
487 */
488 public String parseToString(URL url) throws IOException, TikaException {
489 Metadata metadata = new Metadata();
490 InputStream stream = TikaInputStream.get(url, metadata);
491 return parseToString(stream, metadata);
492 }
493
494 /**
495 * Returns the maximum length of strings returned by the
496 * parseToString methods.
497 *
498 * @since Apache Tika 0.7
499 * @return maximum string length, or -1 if the limit has been disabled
500 */
501 public int getMaxStringLength() {
502 return maxStringLength;
503 }
504
505 /**
506 * Sets the maximum length of strings returned by the parseToString
507 * methods.
508 *
509 * @since Apache Tika 0.7
510 * @param maxStringLength maximum string length,
511 * or -1 to disable this limit
512 */
513 public void setMaxStringLength(int maxStringLength) {
514 this.maxStringLength = maxStringLength;
515 }
516
517 /**
518 * Returns the parser instance used by this facade.
519 *
520 * @since Apache Tika 0.10
521 * @return parser instance
522 */
523 public Parser getParser() {
524 return parser;
525 }
526
527 /**
528 * Returns the detector instance used by this facade.
529 *
530 * @since Apache Tika 0.10
531 * @return detector instance
532 */
533 public Detector getDetector() {
534 return detector;
535 }
536
537 //--------------------------------------------------------------< Object >
538
539 public String toString() {
540 String version = null;
541
542 try {
543 InputStream stream = Tika.class.getResourceAsStream(
544 "/META-INF/maven/org.apache.tika/tika-core/pom.properties");
545 if (stream != null) {
546 try {
547 Properties properties = new Properties();
548 properties.load(stream);
549 version = properties.getProperty("version");
550 } finally {
551 stream.close();
552 }
553 }
554 } catch (Exception ignore) {
555 }
556
557 if (version != null) {
558 return "Apache Tika " + version;
559 } else {
560 return "Apache Tika";
561 }
562 }
563
564 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.config;
17
18 import java.util.logging.Level;
19 import java.util.logging.Logger;
20
21 /**
22 * Interface for error handling strategies in service class loading.
23 * You can implement this interface for a custom error handling mechanism,
24 * or use one of the predefined strategies.
25 *
26 * @since Apache Tika 0.9
27 */
public interface LoadErrorHandler {

    /**
     * Called when loading the named service class has failed. An
     * implementation may log, record or rethrow the problem. When this
     * method returns normally the service loader skips the class and
     * moves on to the next one.
     *
     * @param classname name of the service class
     * @param throwable the encountered problem
     */
    void handleLoadError(String classname, Throwable throwable);

    /**
     * Strategy that does nothing: every load problem is silently dropped.
     */
    LoadErrorHandler IGNORE = new LoadErrorHandler() {
        public void handleLoadError(String classname, Throwable throwable) {
            // intentionally empty
        }
    };

    /**
     * Strategy that reports every problem at {@link Level#WARNING} through
     * the {@link Logger} whose name is the given class name.
     */
    LoadErrorHandler WARN = new LoadErrorHandler() {
        public void handleLoadError(String classname, Throwable throwable) {
            String message = "Unable to load " + classname;
            Logger logger = Logger.getLogger(classname);
            logger.log(Level.WARNING, message, throwable);
        }
    };

    /**
     * Strategy that aborts the whole service loading operation by throwing
     * a {@link RuntimeException} whose cause is the given throwable.
     */
    LoadErrorHandler THROW = new LoadErrorHandler() {
        public void handleLoadError(String classname, Throwable throwable) {
            String message = "Unable to load " + classname;
            throw new RuntimeException(message, throwable);
        }
    };

}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.config;
17
18 import java.io.BufferedReader;
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.io.InputStreamReader;
22 import java.net.URL;
23 import java.util.ArrayList;
24 import java.util.Collection;
25 import java.util.Collections;
26 import java.util.Enumeration;
27 import java.util.HashMap;
28 import java.util.List;
29 import java.util.Map;
30 import java.util.regex.Pattern;
31
32 /**
33 * Internal utility class that Tika uses to look up service providers.
34 *
35 * @since Apache Tika 0.9
36 */
37 public class ServiceLoader {
38
39 /**
40 * The default context class loader to use for all threads, or
41 * <code>null</code> to automatically select the context class loader.
42 */
43 private static volatile ClassLoader contextClassLoader = null;
44
45 private static class RankedService implements Comparable<RankedService> {
46 private Object service;
47 private int rank;
48
49 public RankedService(Object service, int rank) {
50 this.service = service;
51 this.rank = rank;
52 }
53
54 public boolean isInstanceOf(Class<?> iface) {
55 return iface.isAssignableFrom(service.getClass());
56 }
57
58 public int compareTo(RankedService that) {
59 return that.rank - rank; // highest number first
60 }
61
62 }
63
64 /**
65 * The dynamic set of services available in an OSGi environment.
66 * Managed by the {@link TikaActivator} class and used as an additional
67 * source of service instances in the {@link #loadServiceProviders(Class)}
68 * method.
69 */
70 private static final Map<Object, RankedService> services =
71 new HashMap<Object, RankedService>();
72
73 /**
74 * Returns the default class loader configured through
75 * {@link #setContextClassLoader(ClassLoader)}. If none has been set, then
76 * the loader of this class or finally the system class loader is returned.
77 *
78 * @see <a href="https://issues.apache.org/jira/browse/TIKA-441">TIKA-441</a>
79 * @return context class loader, or <code>null</code> if no loader
80 * is available
81 */
82 static ClassLoader getContextClassLoader() {
83 ClassLoader loader = contextClassLoader;
84 if (loader == null) {
85 loader = ServiceLoader.class.getClassLoader();
86 }
87 if (loader == null) {
88 loader = ClassLoader.getSystemClassLoader();
89 }
90 return loader;
91 }
92
93 /**
94 * Sets the context class loader to use for all threads that access
95 * this class. Used for example in an OSGi environment to avoid problems
96 * with the default context class loader.
97 *
98 * @param loader default context class loader,
99 * or <code>null</code> to automatically pick the loader
100 */
101 public static void setContextClassLoader(ClassLoader loader) {
102 contextClassLoader = loader;
103 }
104
105 static void addService(Object reference, Object service, int rank) {
106 synchronized (services) {
107 services.put(reference, new RankedService(service, rank));
108 }
109 }
110
111 static Object removeService(Object reference) {
112 synchronized (services) {
113 return services.remove(reference);
114 }
115 }
116
117 private final ClassLoader loader;
118
119 private final LoadErrorHandler handler;
120
121 private final boolean dynamic;
122
public ServiceLoader(
        ClassLoader loader, LoadErrorHandler handler, boolean dynamic) {
    // All three settings are stored as given; a null loader is accepted.
    this.dynamic = dynamic;
    this.handler = handler;
    this.loader = loader;
}
129
public ServiceLoader(ClassLoader loader, LoadErrorHandler handler) {
    // Dynamic service lookup is switched off for this constructor.
    this(
            loader,
            handler,
            false);
}
133
public ServiceLoader(ClassLoader loader) {
    // Load errors are ignored by default.
    this(
            loader,
            LoadErrorHandler.IGNORE);
}
137
public ServiceLoader() {
    // Default setup: configured/own/system class loader, load errors
    // ignored, dynamic services enabled.
    this(
            getContextClassLoader(),
            LoadErrorHandler.IGNORE,
            true);
}
141
142 /**
143 * Returns the load error handler used by this loader.
144 *
145 * @return load error handler
146 * @since Apache Tika 1.3
147 */
148 public LoadErrorHandler getLoadErrorHandler() {
149 return handler;
150 }
151
152 /**
153 * Returns an input stream for reading the specified resource from the
154 * configured class loader.
155 *
156 * @param name resource name
157 * @return input stream, or <code>null</code> if the resource was not found
158 * @see ClassLoader#getResourceAsStream(String)
159 * @since Apache Tika 1.1
160 */
161 public InputStream getResourceAsStream(String name) {
162 if (loader != null) {
163 return loader.getResourceAsStream(name);
164 } else {
165 return null;
166 }
167 }
168
169 /**
170 * Loads and returns the named service class that's expected to implement
171 * the given interface.
172 *
173 * @param iface service interface
174 * @param name service class name
175 * @return service class
176 * @throws ClassNotFoundException if the service class can not be found
177 * or does not implement the given interface
178 * @see Class#forName(String, boolean, ClassLoader)
179 * @since Apache Tika 1.1
180 */
181 @SuppressWarnings("unchecked")
182 public <T> Class<? extends T> getServiceClass(Class<T> iface, String name)
183 throws ClassNotFoundException {
184 if (loader == null) {
185 throw new ClassNotFoundException(
186 "Service class " + name + " is not available");
187 }
188 Class<?> klass = Class.forName(name, true, loader);
189 if (klass.isInterface()) {
190 throw new ClassNotFoundException(
191 "Service class " + name + " is an interface");
192 } else if (!iface.isAssignableFrom(klass)) {
193 throw new ClassNotFoundException(
194 "Service class " + name
195 + " does not implement " + iface.getName());
196 } else {
197 return (Class<? extends T>) klass;
198 }
199 }
200
201 /**
202 * Returns all the available service resources matching the
203 * given pattern, such as all instances of tika-mimetypes.xml
204 * on the classpath, or all org.apache.tika.parser.Parser
205 * service files.
206 */
207 public Enumeration<URL> findServiceResources(String filePattern) {
208 try {
209 Enumeration<URL> resources = loader.getResources(filePattern);
210 return resources;
211 } catch (IOException ignore) {
212 // We couldn't get the list of service resource files
213 List<URL> empty = Collections.emptyList();
214 return Collections.enumeration( empty );
215 }
216 }
217
218 /**
219 * Returns all the available service providers of the given type.
220 *
221 * @param iface service provider interface
222 * @return available service providers
223 */
224 public <T> List<T> loadServiceProviders(Class<T> iface) {
225 List<T> providers = new ArrayList<T>();
226 providers.addAll(loadDynamicServiceProviders(iface));
227 providers.addAll(loadStaticServiceProviders(iface));
228 return providers;
229 }
230
231 /**
232 * Returns the available dynamic service providers of the given type.
233 * The returned list is newly allocated and may be freely modified
234 * by the caller.
235 *
236 * @since Apache Tika 1.2
237 * @param iface service provider interface
238 * @return dynamic service providers
239 */
240 @SuppressWarnings("unchecked")
241 public <T> List<T> loadDynamicServiceProviders(Class<T> iface) {
242 if (dynamic) {
243 synchronized (services) {
244 List<RankedService> list =
245 new ArrayList<RankedService>(services.values());
246 Collections.sort(list);
247
248 List<T> providers = new ArrayList<T>(list.size());
249 for (RankedService service : list) {
250 if (service.isInstanceOf(iface)) {
251 providers.add((T) service.service);
252 }
253 }
254 return providers;
255 }
256 } else {
257 return new ArrayList<T>(0);
258 }
259 }
260
261 /**
262 * Returns the available static service providers of the given type.
263 * The providers are loaded using the service provider mechanism using
264 * the configured class loader (if any). The returned list is newly
265 * allocated and may be freely modified by the caller.
266 *
267 * @since Apache Tika 1.2
268 * @param iface service provider interface
269 * @return static service providers
270 */
271 @SuppressWarnings("unchecked")
272 public <T> List<T> loadStaticServiceProviders(Class<T> iface) {
273 List<T> providers = new ArrayList<T>();
274
275 if (loader != null) {
276 List<String> names = new ArrayList<String>();
277
278 String serviceName = iface.getName();
279 Enumeration<URL> resources =
280 findServiceResources("META-INF/services/" + serviceName);
281 for (URL resource : Collections.list(resources)) {
282 try {
283 collectServiceClassNames(resource, names);
284 } catch (IOException e) {
285 handler.handleLoadError(serviceName, e);
286 }
287 }
288
289 for (String name : names) {
290 try {
291 Class<?> klass = loader.loadClass(name);
292 if (iface.isAssignableFrom(klass)) {
293 providers.add((T) klass.newInstance());
294 }
295 } catch (Throwable t) {
296 handler.handleLoadError(name, t);
297 }
298 }
299 }
300
301 return providers;
302 }
303
304 private static final Pattern COMMENT = Pattern.compile("#.*");
305
306 private static final Pattern WHITESPACE = Pattern.compile("\\s+");
307
308 private void collectServiceClassNames(URL resource, Collection<String> names)
309 throws IOException {
310 InputStream stream = resource.openStream();
311 try {
312 BufferedReader reader =
313 new BufferedReader(new InputStreamReader(stream, "UTF-8"));
314 String line = reader.readLine();
315 while (line != null) {
316 line = COMMENT.matcher(line).replaceFirst("");
317 line = WHITESPACE.matcher(line).replaceAll("");
318 if (line.length() > 0) {
319 names.add(line);
320 }
321 line = reader.readLine();
322 }
323 } finally {
324 stream.close();
325 }
326 }
327
328 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.config;
17
18 import org.apache.tika.detect.Detector;
19 import org.apache.tika.parser.Parser;
20 import org.osgi.framework.BundleActivator;
21 import org.osgi.framework.BundleContext;
22 import org.osgi.framework.Constants;
23 import org.osgi.framework.ServiceReference;
24 import org.osgi.util.tracker.ServiceTracker;
25 import org.osgi.util.tracker.ServiceTrackerCustomizer;
26
27 /**
28 * Bundle activator that adjust the class loading mechanism of the
29 * {@link ServiceLoader} class to work correctly in an OSGi environment.
30 * <p>
31 * Note that you should <strong>not</strong> access this class directly.
32 * Instead the OSGi environment (if present) will automatically invoke the
33 * methods of this class based on the Bundle-Activator setting in the bundle
34 * manifest.
35 *
36 * @since Apache Tika 0.9
37 */
38 public class TikaActivator implements BundleActivator, ServiceTrackerCustomizer {
39
40 private ServiceTracker detectorTracker;
41
42 private ServiceTracker parserTracker;
43
44 private BundleContext bundleContext;
45 //-----------------------------------------------------< BundleActivator >
46
47 public void start(final BundleContext context) throws Exception {
48 bundleContext = context;
49
50 detectorTracker = new ServiceTracker(context, Detector.class.getName(), this);
51 parserTracker = new ServiceTracker(context, Parser.class.getName(), this);
52
53 detectorTracker.open();
54 parserTracker.open();
55 }
56
57 public void stop(BundleContext context) throws Exception {
58 parserTracker.close();
59 detectorTracker.close();
60 }
61
62 public Object addingService(ServiceReference reference) {
63 int rank = 0;
64 Object property = reference.getProperty(Constants.SERVICE_RANKING);
65 if (property instanceof Integer) {
66 rank = (Integer) property;
67 }
68
69 Object service = bundleContext.getService(reference);
70 ServiceLoader.addService(reference, service, rank);
71 return service;
72 }
73
74 public void modifiedService(ServiceReference reference, Object service) {
75 }
76
77 public void removedService(ServiceReference reference, Object service) {
78 ServiceLoader.removeService(reference);
79 bundleContext.ungetService(reference);
80 }
81
82 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.config;
17
18 import java.io.File;
19 import java.io.FileInputStream;
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.net.URL;
23 import java.util.ArrayList;
24 import java.util.HashSet;
25 import java.util.List;
26 import java.util.Set;
27
28 import javax.imageio.spi.ServiceRegistry;
29 import javax.xml.parsers.DocumentBuilder;
30 import javax.xml.parsers.DocumentBuilderFactory;
31 import javax.xml.parsers.ParserConfigurationException;
32
33 import org.apache.tika.detect.CompositeDetector;
34 import org.apache.tika.detect.DefaultDetector;
35 import org.apache.tika.detect.Detector;
36 import org.apache.tika.exception.TikaException;
37 import org.apache.tika.mime.MediaType;
38 import org.apache.tika.mime.MediaTypeRegistry;
39 import org.apache.tika.mime.MimeTypeException;
40 import org.apache.tika.mime.MimeTypes;
41 import org.apache.tika.mime.MimeTypesFactory;
42 import org.apache.tika.parser.AutoDetectParser;
43 import org.apache.tika.parser.CompositeParser;
44 import org.apache.tika.parser.DefaultParser;
45 import org.apache.tika.parser.Parser;
46 import org.apache.tika.parser.ParserDecorator;
47 import org.w3c.dom.Document;
48 import org.w3c.dom.Element;
49 import org.w3c.dom.Node;
50 import org.w3c.dom.NodeList;
51 import org.xml.sax.SAXException;
52
53 /**
54 * Parse xml config file.
55 */
56 public class TikaConfig {
57
58 private static MimeTypes getDefaultMimeTypes(ClassLoader loader) {
59 return MimeTypes.getDefaultMimeTypes(loader);
60 }
61
62 private static Detector getDefaultDetector(
63 MimeTypes types, ServiceLoader loader) {
64 return new DefaultDetector(types, loader);
65 }
66
67 private static CompositeParser getDefaultParser(
68 MimeTypes types, ServiceLoader loader) {
69 return new DefaultParser(types.getMediaTypeRegistry(), loader);
70 }
71
72 private final CompositeParser parser;
73 private final Detector detector;
74
75 private final MimeTypes mimeTypes;
76
/**
 * Reads the configuration from the XML file at the given path.
 *
 * @param file path of the XML configuration file
 */
public TikaConfig(String file)
        throws TikaException, IOException, SAXException {
    this(new File(file));
}

/**
 * Reads the configuration from the given XML file.
 *
 * @param file XML configuration file
 */
public TikaConfig(File file)
        throws TikaException, IOException, SAXException {
    this(getBuilder().parse(file));
}

/**
 * Reads the configuration from the XML document at the given URL, using
 * the class loader returned by
 * {@link ServiceLoader#getContextClassLoader()}.
 *
 * @param url location of the XML configuration
 */
public TikaConfig(URL url)
        throws TikaException, IOException, SAXException {
    this(url, ServiceLoader.getContextClassLoader());
}

/**
 * Reads the configuration from the XML document at the given URL.
 *
 * @param url location of the XML configuration
 * @param loader class loader wrapped in the {@link ServiceLoader}
 *               that resolves the configured classes
 */
public TikaConfig(URL url, ClassLoader loader)
        throws TikaException, IOException, SAXException {
    this(getBuilder().parse(url.toString()).getDocumentElement(), loader);
}

/**
 * Reads the configuration from the XML document in the given stream.
 *
 * @param stream stream containing the XML configuration
 */
public TikaConfig(InputStream stream)
        throws TikaException, IOException, SAXException {
    this(getBuilder().parse(stream));
}

/**
 * Reads the configuration from the root element of the given document.
 *
 * @param document parsed XML configuration
 */
public TikaConfig(Document document) throws TikaException, IOException {
    this(document.getDocumentElement());
}

/**
 * Reads the configuration from the given element, using a
 * {@link ServiceLoader} created with its no-argument constructor.
 *
 * @param element root element of the configuration
 */
public TikaConfig(Element element) throws TikaException, IOException {
    this(element, new ServiceLoader());
}

/**
 * Reads the configuration from the given element.
 *
 * @param element root element of the configuration
 * @param loader class loader wrapped in the {@link ServiceLoader}
 *               that resolves the configured classes
 */
public TikaConfig(Element element, ClassLoader loader)
        throws TikaException, IOException {
    this(element, new ServiceLoader(loader));
}

/**
 * Common initialization for the element based constructors: media
 * types first, then the detector and finally the parser.
 */
private TikaConfig(Element element, ServiceLoader loader)
        throws TikaException, IOException {
    this.mimeTypes = typesFromDomElement(element);
    this.detector = detectorFromDomElement(element, mimeTypes, loader);
    this.parser = parserFromDomElement(element, mimeTypes, loader);
}
121
/**
 * Creates a Tika configuration from the built-in media type rules
 * and all the {@link Parser} implementations available through the
 * {@link ServiceRegistry service provider mechanism} in the given
 * class loader. No XML configuration is read.
 *
 * @since Apache Tika 0.8
 * @param loader the class loader through which parser implementations
 *               are loaded, or <code>null</code> for no parsers
 * @throws MimeTypeException if the built-in media type rules are broken
 * @throws IOException  if the built-in media type rules can not be read
 */
public TikaConfig(ClassLoader loader)
        throws MimeTypeException, IOException {
    ServiceLoader services = new ServiceLoader(loader);
    this.mimeTypes = getDefaultMimeTypes(loader);
    this.detector = getDefaultDetector(this.mimeTypes, services);
    this.parser = getDefaultParser(this.mimeTypes, services);
}
141
142 /**
143 * Creates a default Tika configuration.
144 * First checks whether an XML config file is specified, either in
145 * <ol>
146 * <li>System property "tika.config", or</li>
147 * <li>Environment variable TIKA_CONFIG</li>
148 * </ol>
149 * <p>If one of these have a value, try to resolve it relative to file
150 * system or classpath.</p>
151 * <p>If XML config is not specified, initialize from the built-in media
152 * type rules and all the {@link Parser} implementations available through
153 * the {@link ServiceRegistry service provider mechanism} in the context
154 * class loader of the current thread.</p>
155 *
156 * @throws IOException if the configuration can not be read
157 * @throws TikaException if problem with MimeTypes or parsing XML config
158 */
159 public TikaConfig() throws TikaException, IOException {
160 ServiceLoader loader = new ServiceLoader();
161
162 String config = System.getProperty("tika.config");
163 if (config == null) {
164 config = System.getenv("TIKA_CONFIG");
165 }
166
167 if (config == null) {
168 this.mimeTypes = getDefaultMimeTypes(ServiceLoader.getContextClassLoader());
169 this.parser = getDefaultParser(mimeTypes, loader);
170 this.detector = getDefaultDetector(mimeTypes, loader);
171 } else {
172 // Locate the given configuration file
173 InputStream stream = null;
174 File file = new File(config);
175 if (file.isFile()) {
176 stream = new FileInputStream(file);
177 }
178 if (stream == null) {
179 try {
180 stream = new URL(config).openStream();
181 } catch (IOException ignore) {
182 }
183 }
184 if (stream == null) {
185 stream = loader.getResourceAsStream(config);
186 }
187 if (stream == null) {
188 throw new TikaException(
189 "Specified Tika configuration not found: " + config);
190 }
191
192 try {
193 Element element =
194 getBuilder().parse(stream).getDocumentElement();
195 this.mimeTypes = typesFromDomElement(element);
196 this.parser =
197 parserFromDomElement(element, mimeTypes, loader);
198 this.detector =
199 detectorFromDomElement(element, mimeTypes, loader);
200 } catch (SAXException e) {
201 throw new TikaException(
202 "Specified Tika configuration has syntax errors: "
203 + config, e);
204 } finally {
205 stream.close();
206 }
207 }
208 }
209
210 private static String getText(Node node) {
211 if (node.getNodeType() == Node.TEXT_NODE) {
212 return node.getNodeValue();
213 } else if (node.getNodeType() == Node.ELEMENT_NODE) {
214 StringBuilder builder = new StringBuilder();
215 NodeList list = node.getChildNodes();
216 for (int i = 0; i < list.getLength(); i++) {
217 builder.append(getText(list.item(i)));
218 }
219 return builder.toString();
220 } else {
221 return "";
222 }
223 }
224
225 /**
226 * @deprecated Use the {@link #getParser()} method instead
227 */
228 public Parser getParser(MediaType mimeType) {
229 return parser.getParsers().get(mimeType);
230 }
231
232 /**
233 * Returns the configured parser instance.
234 *
235 * @return configured parser
236 */
237 public Parser getParser() {
238 return parser;
239 }
240
241 /**
242 * Returns the configured detector instance.
243 *
244 * @return configured detector
245 */
246 public Detector getDetector() {
247 return detector;
248 }
249
250 public MimeTypes getMimeRepository(){
251 return mimeTypes;
252 }
253
254 public MediaTypeRegistry getMediaTypeRegistry() {
255 return mimeTypes.getMediaTypeRegistry();
256 }
257
258 /**
259 * Provides a default configuration (TikaConfig). Currently creates a
260 * new instance each time it's called; we may be able to have it
261 * return a shared instance once it is completely immutable.
262 *
263 * @return default configuration
264 */
265 public static TikaConfig getDefaultConfig() {
266 try {
267 return new TikaConfig();
268 } catch (IOException e) {
269 throw new RuntimeException(
270 "Unable to read default configuration", e);
271 } catch (TikaException e) {
272 throw new RuntimeException(
273 "Unable to access default configuration", e);
274 }
275 }
276
277 private static DocumentBuilder getBuilder() throws TikaException {
278 try {
279 return DocumentBuilderFactory.newInstance().newDocumentBuilder();
280 } catch (ParserConfigurationException e) {
281 throw new TikaException("XML parser not available", e);
282 }
283 }
284
285 private static Element getChild(Element element, String name) {
286 Node child = element.getFirstChild();
287 while (child != null) {
288 if (child.getNodeType() == Node.ELEMENT_NODE
289 && name.equals(child.getNodeName())) {
290 return (Element) child;
291 }
292 child = child.getNextSibling();
293 }
294 return null;
295 }
296
297 private static MimeTypes typesFromDomElement(Element element)
298 throws TikaException, IOException {
299 Element mtr = getChild(element, "mimeTypeRepository");
300 if (mtr != null && mtr.hasAttribute("resource")) {
301 return MimeTypesFactory.create(mtr.getAttribute("resource"));
302 } else {
303 return getDefaultMimeTypes(null);
304 }
305 }
306
307 private static CompositeParser parserFromDomElement(
308 Element element, MimeTypes mimeTypes, ServiceLoader loader)
309 throws TikaException, IOException {
310 List<Parser> parsers = new ArrayList<Parser>();
311 NodeList nodes = element.getElementsByTagName("parser");
312 for (int i = 0; i < nodes.getLength(); i++) {
313 Element node = (Element) nodes.item(i);
314 String name = node.getAttribute("class");
315
316 try {
317 Class<? extends Parser> parserClass =
318 loader.getServiceClass(Parser.class, name);
319 // https://issues.apache.org/jira/browse/TIKA-866
320 if (AutoDetectParser.class.isAssignableFrom(parserClass)) {
321 throw new TikaException(
322 "AutoDetectParser not supported in a <parser>"
323 + " configuration element: " + name);
324 }
325 Parser parser = parserClass.newInstance();
326
327 NodeList mimes = node.getElementsByTagName("mime");
328 if (mimes.getLength() > 0) {
329 Set<MediaType> types = new HashSet<MediaType>();
330 for (int j = 0; j < mimes.getLength(); j++) {
331 String mime = getText(mimes.item(j));
332 MediaType type = MediaType.parse(mime);
333 if (type != null) {
334 types.add(type);
335 } else {
336 throw new TikaException(
337 "Invalid media type name: " + mime);
338 }
339 }
340 parser = ParserDecorator.withTypes(parser, types);
341 }
342
343 parsers.add(parser);
344 } catch (ClassNotFoundException e) {
345 throw new TikaException(
346 "Unable to find a parser class: " + name, e);
347 } catch (IllegalAccessException e) {
348 throw new TikaException(
349 "Unable to access a parser class: " + name, e);
350 } catch (InstantiationException e) {
351 throw new TikaException(
352 "Unable to instantiate a parser class: " + name, e);
353 }
354 }
355 if (parsers.isEmpty()) {
356 return getDefaultParser(mimeTypes, loader);
357 } else {
358 MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry();
359 return new CompositeParser(registry, parsers);
360 }
361 }
362
363 private static Detector detectorFromDomElement(
364 Element element, MimeTypes mimeTypes, ServiceLoader loader)
365 throws TikaException, IOException {
366 List<Detector> detectors = new ArrayList<Detector>();
367 NodeList nodes = element.getElementsByTagName("detector");
368 for (int i = 0; i < nodes.getLength(); i++) {
369 Element node = (Element) nodes.item(i);
370 String name = node.getAttribute("class");
371
372 try {
373 Class<? extends Detector> detectorClass =
374 loader.getServiceClass(Detector.class, name);
375 detectors.add(detectorClass.newInstance());
376 } catch (ClassNotFoundException e) {
377 throw new TikaException(
378 "Unable to find a detector class: " + name, e);
379 } catch (IllegalAccessException e) {
380 throw new TikaException(
381 "Unable to access a detector class: " + name, e);
382 } catch (InstantiationException e) {
383 throw new TikaException(
384 "Unable to instantiate a detector class: " + name, e);
385 }
386 }
387 if (detectors.isEmpty()) {
388 return getDefaultDetector(mimeTypes, loader);
389 } else {
390 MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry();
391 return new CompositeDetector(registry, detectors);
392 }
393 }
394 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 /**
18 * Tika configuration tools.
19 */
20 @aQute.bnd.annotation.Version("1.0.0")
21 package org.apache.tika.config;
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.detect;
17
18 import java.io.BufferedInputStream;
19 import java.io.BufferedReader;
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.io.InputStreamReader;
23 import java.nio.charset.Charset;
24 import java.util.List;
25
26 import org.apache.tika.config.LoadErrorHandler;
27 import org.apache.tika.config.ServiceLoader;
28 import org.apache.tika.exception.TikaException;
29 import org.apache.tika.metadata.Metadata;
30 import org.apache.tika.mime.MediaType;
31 import org.apache.tika.utils.CharsetUtils;
32 import org.xml.sax.InputSource;
33
34 /**
35 * An input stream reader that automatically detects the character encoding
36 * to be used for converting bytes to characters.
37 *
38 * @since Apache Tika 1.2
39 */
40 public class AutoDetectReader extends BufferedReader {
41
42 private static final ServiceLoader DEFAULT_LOADER =
43 new ServiceLoader(AutoDetectReader.class.getClassLoader());
44
45 private static Charset detect(
46 InputStream input, Metadata metadata,
47 List<EncodingDetector> detectors, LoadErrorHandler handler)
48 throws IOException, TikaException {
49 // Ask all given detectors for the character encoding
50 for (EncodingDetector detector : detectors) {
51 try {
52 Charset charset = detector.detect(input, metadata);
53 if (charset != null) {
54 return charset;
55 }
56 } catch (NoClassDefFoundError e) {
57 // TIKA-1041: Detector dependencies not present.
58 handler.handleLoadError(detector.getClass().getName(), e);
59 }
60 }
61
62 // Try determining the encoding based on hints in document metadata
63 MediaType type = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
64 if (type != null) {
65 String charset = type.getParameters().get("charset");
66 if (charset != null) {
67 try {
68 return CharsetUtils.forName(charset);
69 } catch (Exception e) {
70 // ignore
71 }
72 }
73 }
74
75 throw new TikaException(
76 "Failed to detect the character encoding of a document");
77 }
78
79 private final Charset charset;
80
81 private AutoDetectReader(InputStream stream, Charset charset)
82 throws IOException {
83 super(new InputStreamReader(stream, charset));
84 this.charset = charset;
85
86 // TIKA-240: Drop the BOM if present
87 mark(1);
88 if (read() != '\ufeff') { // zero-width no-break space
89 reset();
90 }
91 }
92
93 private AutoDetectReader(
94 BufferedInputStream stream, Metadata metadata,
95 List<EncodingDetector> detectors, LoadErrorHandler handler)
96 throws IOException, TikaException {
97 this(stream, detect(stream, metadata, detectors, handler));
98 }
99
100 public AutoDetectReader(
101 InputStream stream, Metadata metadata,
102 ServiceLoader loader) throws IOException, TikaException {
103 this(new BufferedInputStream(stream), metadata,
104 loader.loadServiceProviders(EncodingDetector.class),
105 loader.getLoadErrorHandler());
106 }
107
108 public AutoDetectReader(InputStream stream, Metadata metadata)
109 throws IOException, TikaException {
110 this(new BufferedInputStream(stream), metadata, DEFAULT_LOADER);
111 }
112
113 public AutoDetectReader(InputStream stream)
114 throws IOException, TikaException {
115 this(stream, new Metadata());
116 }
117
118 public Charset getCharset() {
119 return charset;
120 }
121
122 public InputSource asInputSource() {
123 InputSource source = new InputSource(this);
124 source.setEncoding(charset.name());
125 return source;
126 }
127
128 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.detect;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.util.Arrays;
21 import java.util.Collections;
22 import java.util.List;
23
24 import org.apache.tika.metadata.Metadata;
25 import org.apache.tika.mime.MediaType;
26 import org.apache.tika.mime.MediaTypeRegistry;
27
28 /**
29 * Content type detector that combines multiple different detection mechanisms.
30 */
31 public class CompositeDetector implements Detector {
32
33 /**
34 * Serial version UID
35 */
36 private static final long serialVersionUID = 5980683158436430252L;
37
38 private final MediaTypeRegistry registry;
39
40 private final List<Detector> detectors;
41
42 public CompositeDetector(
43 MediaTypeRegistry registry, List<Detector> detectors) {
44 this.registry = registry;
45 this.detectors = detectors;
46 }
47
48 public CompositeDetector(List<Detector> detectors) {
49 this(new MediaTypeRegistry(), detectors);
50 }
51
52 public CompositeDetector(Detector... detectors) {
53 this(Arrays.asList(detectors));
54 }
55
56 public MediaType detect(InputStream input, Metadata metadata)
57 throws IOException {
58 MediaType type = MediaType.OCTET_STREAM;
59 for (Detector detector : getDetectors()) {
60 MediaType detected = detector.detect(input, metadata);
61 if (registry.isSpecializationOf(detected, type)) {
62 type = detected;
63 }
64 }
65 return type;
66 }
67
68 /**
69 * Returns the component detectors.
70 */
71 public List<Detector> getDetectors() {
72 return Collections.unmodifiableList(detectors);
73 }
74 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.detect;
17
18 import java.util.Collections;
19 import java.util.Comparator;
20 import java.util.List;
21
22 import javax.imageio.spi.ServiceRegistry;
23
24 import org.apache.tika.config.ServiceLoader;
25 import org.apache.tika.mime.MimeTypes;
26
27 /**
28 * A composite detector based on all the {@link Detector} implementations
29 * available through the {@link ServiceRegistry service provider mechanism}.
30 *
31 * Detectors are loaded and returned in a specified order, of user supplied
32 * followed by non-MimeType Tika, followed by the Tika MimeType class.
33 * If you need to control the order of the Detectors, you should instead
34 * construct your own {@link CompositeDetector} and pass in the list
35 * of Detectors in the required order.
36 *
37 * @since Apache Tika 0.9
38 */
39 public class DefaultDetector extends CompositeDetector {
40
41 /** Serial version UID */
42 private static final long serialVersionUID = -8170114575326908027L;
43
44 /**
45 * Finds all statically loadable detectors and sort the list by name,
46 * rather than discovery order. Detectors are used in the given order,
47 * so put the Tika parsers last so that non-Tika (user supplied)
48 * parsers can take precedence.
49 *
50 * @param loader service loader
51 * @return ordered list of statically loadable detectors
52 */
53 private static List<Detector> getDefaultDetectors(
54 MimeTypes types, ServiceLoader loader) {
55 List<Detector> detectors =
56 loader.loadStaticServiceProviders(Detector.class);
57 Collections.sort(detectors, new Comparator<Detector>() {
58 public int compare(Detector d1, Detector d2) {
59 String n1 = d1.getClass().getName();
60 String n2 = d2.getClass().getName();
61 boolean t1 = n1.startsWith("org.apache.tika.");
62 boolean t2 = n2.startsWith("org.apache.tika.");
63 if (t1 == t2) {
64 return n1.compareTo(n2);
65 } else if (t1) {
66 return 1;
67 } else {
68 return -1;
69 }
70 }
71 });
72 // Finally the Tika MimeTypes as a fallback
73 detectors.add(types);
74 return detectors;
75 }
76
77 private transient final ServiceLoader loader;
78
79 public DefaultDetector(MimeTypes types, ServiceLoader loader) {
80 super(types.getMediaTypeRegistry(), getDefaultDetectors(types, loader));
81 this.loader = loader;
82 }
83
84 public DefaultDetector(MimeTypes types, ClassLoader loader) {
85 this(types, new ServiceLoader(loader));
86 }
87
88 public DefaultDetector(ClassLoader loader) {
89 this(MimeTypes.getDefaultMimeTypes(), loader);
90 }
91
92 public DefaultDetector(MimeTypes types) {
93 this(types, new ServiceLoader());
94 }
95
96 public DefaultDetector() {
97 this(MimeTypes.getDefaultMimeTypes());
98 }
99
100 @Override
101 public List<Detector> getDetectors() {
102 if (loader != null) {
103 List<Detector> detectors =
104 loader.loadDynamicServiceProviders(Detector.class);
105 detectors.addAll(super.getDetectors());
106 return detectors;
107 } else {
108 return super.getDetectors();
109 }
110 }
111
112 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.detect;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.io.Serializable;
21
22 import org.apache.tika.metadata.Metadata;
23 import org.apache.tika.mime.MediaType;
24
25 /**
26 * Content type detector. Implementations of this interface use various
27 * heuristics to detect the content type of a document based on given
28 * input metadata or the first few bytes of the document stream.
29 *
30 * @since Apache Tika 0.3
31 */
32 public interface Detector extends Serializable {
33
34 /**
35 * Detects the content type of the given input document. Returns
36 * <code>application/octet-stream</code> if the type of the document
37 * can not be detected.
38 * <p>
39 * If the document input stream is not available, then the first
40 * argument may be <code>null</code>. Otherwise the detector may
41 * read bytes from the start of the stream to help in type detection.
42 * The given stream is guaranteed to support the
43 * {@link InputStream#markSupported() mark feature} and the detector
44 * is expected to {@link InputStream#mark(int) mark} the stream before
45 * reading any bytes from it, and to {@link InputStream#reset() reset}
46 * the stream before returning. The stream must not be closed by the
47 * detector.
48 * <p>
49 * The given input metadata is only read, not modified, by the detector.
50 *
51 * @param input document input stream, or <code>null</code>
52 * @param metadata input metadata for the document
53 * @return detected media type, or <code>application/octet-stream</code>
54 * @throws IOException if the document input stream could not be read
55 */
56 MediaType detect(InputStream input, Metadata metadata) throws IOException;
57
58 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.detect;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20
21 import org.apache.tika.metadata.Metadata;
22 import org.apache.tika.mime.MediaType;
23
24 /**
25 * Dummy detector that returns application/octet-stream for all documents.
26 */
27 public class EmptyDetector implements Detector {
28
29 /**
30 * Singleton instance of this class.
31 */
32 public static final EmptyDetector INSTANCE = new EmptyDetector();
33
34 public MediaType detect(InputStream input, Metadata metadata)
35 throws IOException {
36 return MediaType.OCTET_STREAM;
37 }
38
39 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.detect;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.nio.charset.Charset;
21
22 import org.apache.tika.metadata.Metadata;
23
24 /**
25 * Character encoding detector. Implementations of this interface use
26 * various heuristics to detect the character encoding of a text document
27 * based on given input metadata or the first few bytes of the document stream.
28 *
29 * @since Apache Tika 0.4
30 */
31 public interface EncodingDetector {
32
33 /**
34 * Detects the character encoding of the given text document, or
35 * <code>null</code> if the encoding of the document can not be detected.
36 * <p>
37 * If the document input stream is not available, then the first
38 * argument may be <code>null</code>. Otherwise the detector may
39 * read bytes from the start of the stream to help in encoding detection.
40 * The given stream is guaranteed to support the
41 * {@link InputStream#markSupported() mark feature} and the detector
42 * is expected to {@link InputStream#mark(int) mark} the stream before
43 * reading any bytes from it, and to {@link InputStream#reset() reset}
44 * the stream before returning. The stream must not be closed by the
45 * detector.
46 * <p>
47 * The given input metadata is only read, not modified, by the detector.
48 *
49 * @param input text document input stream, or <code>null</code>
50 * @param metadata input metadata for the document
51 * @return detected character encoding, or <code>null</code>
52 * @throws IOException if the document input stream could not be read
53 */
54 Charset detect(InputStream input, Metadata metadata) throws IOException;
55
56 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.detect;
17
18 import java.io.CharArrayWriter;
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.nio.ByteBuffer;
22 import java.nio.CharBuffer;
23 import java.nio.charset.Charset;
24 import java.util.regex.Matcher;
25 import java.util.regex.Pattern;
26
27 import org.apache.tika.metadata.Metadata;
28 import org.apache.tika.mime.MediaType;
29
30 /**
31 * Content type detection based on magic bytes, i.e. type-specific patterns
32 * near the beginning of the document input stream.
33 *
34 * Because this works on bytes, not characters, by default any string
35 * matching is done as ISO_8859_1. To use an explicit different
36 * encoding, supply a type other than "string" / "stringignorecase"
37 *
38 * @since Apache Tika 0.3
39 */
40 public class MagicDetector implements Detector {
41
42 private static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1");
43
44 public static MagicDetector parse(
45 MediaType mediaType,
46 String type, String offset, String value, String mask) {
47 int start = 0;
48 int end = 0;
49 if (offset != null) {
50 int colon = offset.indexOf(':');
51 if (colon == -1) {
52 start = Integer.parseInt(offset);
53 end = start;
54 } else {
55 start = Integer.parseInt(offset.substring(0, colon));
56 end = Integer.parseInt(offset.substring(colon + 1));
57 }
58 }
59
60 byte[] patternBytes = decodeValue(value, type);
61 byte[] maskBytes = null;
62 if (mask != null) {
63 maskBytes = decodeValue(mask, type);
64 }
65
66 return new MagicDetector(
67 mediaType, patternBytes, maskBytes,
68 type.equals("regex"), type.equals("stringignorecase"),
69 start, end);
70 }
71
72 private static byte[] decodeValue(String value, String type) {
73 // Preliminary check
74 if ((value == null) || (type == null)) {
75 return null;
76 }
77
78 byte[] decoded = null;
79 String tmpVal = null;
80 int radix = 8;
81
82 // hex
83 if (value.startsWith("0x")) {
84 tmpVal = value.substring(2);
85 radix = 16;
86 } else {
87 tmpVal = value;
88 radix = 8;
89 }
90
91 if (type.equals("string")
92 || type.equals("regex")
93 || type.equals("unicodeLE")
94 || type.equals("unicodeBE")) {
95 decoded = decodeString(value, type);
96 } else if (type.equals("stringignorecase")) {
97 decoded = decodeString(value.toLowerCase(), type);
98 } else if (type.equals("byte")) {
99 decoded = tmpVal.getBytes();
100 } else if (type.equals("host16") || type.equals("little16")) {
101 int i = Integer.parseInt(tmpVal, radix);
102 decoded = new byte[] { (byte) (i & 0x00FF), (byte) (i >> 8) };
103 } else if (type.equals("big16")) {
104 int i = Integer.parseInt(tmpVal, radix);
105 decoded = new byte[] { (byte) (i >> 8), (byte) (i & 0x00FF) };
106 } else if (type.equals("host32") || type.equals("little32")) {
107 long i = Long.parseLong(tmpVal, radix);
108 decoded = new byte[] {
109 (byte) ((i & 0x000000FF)),
110 (byte) ((i & 0x0000FF00) >> 8),
111 (byte) ((i & 0x00FF0000) >> 16),
112 (byte) ((i & 0xFF000000) >> 24) };
113 } else if (type.equals("big32")) {
114 long i = Long.parseLong(tmpVal, radix);
115 decoded = new byte[] {
116 (byte) ((i & 0xFF000000) >> 24),
117 (byte) ((i & 0x00FF0000) >> 16),
118 (byte) ((i & 0x0000FF00) >> 8),
119 (byte) ((i & 0x000000FF)) };
120 }
121 return decoded;
122 }
123
124 private static byte[] decodeString(String value, String type) {
125 if (value.startsWith("0x")) {
126 byte[] vals = new byte[(value.length() - 2) / 2];
127 for (int i = 0; i < vals.length; i++) {
128 vals[i] = (byte)
129 Integer.parseInt(value.substring(2 + i * 2, 4 + i * 2), 16);
130 }
131 return vals;
132 }
133
134 CharArrayWriter decoded = new CharArrayWriter();
135
136 for (int i = 0; i < value.length(); i++) {
137 if (value.charAt(i) == '\\') {
138 if (value.charAt(i + 1) == '\\') {
139 decoded.write('\\');
140 i++;
141 } else if (value.charAt(i + 1) == 'x') {
142 decoded.write(Integer.parseInt(
143 value.substring(i + 2, i + 4), 16));
144 i += 3;
145 } else if (value.charAt(i + 1) == 'r') {
146 decoded.write((int)'\r');
147 i++;
148 } else if (value.charAt(i + 1) == 'n') {
149 decoded.write((int)'\n');
150 i++;
151 } else {
152 int j = i + 1;
153 while ((j < i + 4) && (j < value.length())
154 && (Character.isDigit(value.charAt(j)))) {
155 j++;
156 }
157 decoded.write(Short.decode(
158 "0" + value.substring(i + 1, j)).byteValue());
159 i = j - 1;
160 }
161 } else {
162 decoded.write(value.charAt(i));
163 }
164 }
165
166 // Now turn the chars into bytes
167 char[] chars = decoded.toCharArray();
168 byte[] bytes;
169 if ("unicodeLE".equals(type)) {
170 bytes = new byte[chars.length * 2];
171 for (int i = 0; i < chars.length; i++) {
172 bytes[i * 2] = (byte) (chars[i] & 0xff);
173 bytes[i * 2 + 1] = (byte) (chars[i] >> 8);
174 }
175 } else if ("unicodeBE".equals(type)) {
176 bytes = new byte[chars.length * 2];
177 for(int i = 0; i < chars.length; i++) {
178 bytes[i * 2] = (byte) (chars[i] >> 8);
179 bytes[i * 2 + 1] = (byte) (chars[i] & 0xff);
180 }
181 } else {
182 // Copy with truncation
183 bytes = new byte[chars.length];
184 for(int i = 0; i < bytes.length; i++) {
185 bytes[i] = (byte) chars[i];
186 }
187 }
188 return bytes;
189 }
190
191 /**
192 * The matching media type. Returned by the
193 * {@link #detect(InputStream, Metadata)} method if a match is found.
194 */
195 private final MediaType type;
196
197 /**
198 * Length of the comparison window.
199 */
200 private final int length;
201
202 /**
203 * The magic match pattern. If this byte pattern is equal to the
204 * possibly bit-masked bytes from the input stream, then the type
205 * detection succeeds and the configured {@link #type} is returned.
206 */
207 private final byte[] pattern;
208
209 /**
210 * Length of the pattern, which in the case of regular expressions will
211 * not be the same as the comparison window length.
212 */
213 private final int patternLength;
214
215 /**
216 * True if pattern is a regular expression, false otherwise.
217 */
218 private final boolean isRegex;
219
220 /**
221 * True if we're doing a case-insensitive string match, false otherwise.
222 */
223 private final boolean isStringIgnoreCase;
224
225 /**
226 * Bit mask that is applied to the source bytes before pattern matching.
227 */
228 private final byte[] mask;
229
230 /**
231 * First offset (inclusive) of the comparison window within the
232 * document input stream. Greater than or equal to zero.
233 */
234 private final int offsetRangeBegin;
235
236 /**
237 * Last offset (inclusive) of the comparison window within the document
238 * input stream. Greater than or equal to the
239 * {@link #offsetRangeBegin first offset}.
240 * <p>
241 * Note that this is <em>not</em> the offset of the last byte read from
242 * the document stream. Instead, the last window of bytes to be compared
243 * starts at this offset.
244 */
245 private final int offsetRangeEnd;
246
247 /**
248 * Creates a detector for input documents that have the exact given byte
249 * pattern at the beginning of the document stream.
250 *
251 * @param type matching media type
252 * @param pattern magic match pattern
253 */
254 public MagicDetector(MediaType type, byte[] pattern) {
255 this(type, pattern, 0);
256 }
257
258 /**
259 * Creates a detector for input documents that have the exact given byte
260 * pattern at the given offset of the document stream.
261 *
262 * @param type matching media type
263 * @param pattern magic match pattern
264 * @param offset offset of the pattern match
265 */
266 public MagicDetector(MediaType type, byte[] pattern, int offset) {
267 this(type, pattern, null, offset, offset);
268 }
269
270 /**
271 * Creates a detector for input documents that meet the specified magic
272 * match. {@code pattern} must NOT be a regular expression.
273 * Constructor maintained for legacy reasons.
274 */
275 public MagicDetector(
276 MediaType type, byte[] pattern, byte[] mask,
277 int offsetRangeBegin, int offsetRangeEnd) {
278 this(type, pattern, mask, false, offsetRangeBegin, offsetRangeEnd);
279 }
280
281 /**
282 * Creates a detector for input documents that meet the specified
283 * magic match.
284 */
285 public MagicDetector(
286 MediaType type, byte[] pattern, byte[] mask,
287 boolean isRegex,
288 int offsetRangeBegin, int offsetRangeEnd) {
289 this(type, pattern, mask, isRegex, false, offsetRangeBegin, offsetRangeEnd);
290 }
291 /**
292 * Creates a detector for input documents that meet the specified
293 * magic match.
294 */
295 public MagicDetector(
296 MediaType type, byte[] pattern, byte[] mask,
297 boolean isRegex, boolean isStringIgnoreCase,
298 int offsetRangeBegin, int offsetRangeEnd) {
299 if (type == null) {
300 throw new IllegalArgumentException("Matching media type is null");
301 } else if (pattern == null) {
302 throw new IllegalArgumentException("Magic match pattern is null");
303 } else if (offsetRangeBegin < 0
304 || offsetRangeEnd < offsetRangeBegin) {
305 throw new IllegalArgumentException(
306 "Invalid offset range: ["
307 + offsetRangeBegin + "," + offsetRangeEnd + "]");
308 }
309
310 this.type = type;
311
312 this.isRegex = isRegex;
313 this.isStringIgnoreCase = isStringIgnoreCase;
314
315 this.patternLength = Math.max(pattern.length, mask != null ? mask.length : 0);
316
317 if (this.isRegex) {
318 // 8K buffer should cope with most regex patterns
319 this.length = 8 * 1024;
320 } else {
321 this.length = patternLength;
322 }
323
324 this.mask = new byte[this.patternLength];
325 this.pattern = new byte[this.patternLength];
326
327 for (int i = 0; i < this.patternLength; i++) {
328 if (mask != null && i < mask.length) {
329 this.mask[i] = mask[i];
330 } else {
331 this.mask[i] = -1;
332 }
333
334 if (i < pattern.length) {
335 this.pattern[i] = (byte) (pattern[i] & this.mask[i]);
336 } else {
337 this.pattern[i] = 0;
338 }
339 }
340
341 this.offsetRangeBegin = offsetRangeBegin;
342 this.offsetRangeEnd = offsetRangeEnd;
343 }
344
345 /**
346 *
347 * @param input document input stream, or <code>null</code>
348 * @param metadata ignored
349 */
350 public MediaType detect(InputStream input, Metadata metadata)
351 throws IOException {
352 if (input == null) {
353 return MediaType.OCTET_STREAM;
354 }
355
356 input.mark(offsetRangeEnd + length);
357 try {
358 int offset = 0;
359
360 // Skip bytes at the beginning, using skip() or read()
361 while (offset < offsetRangeBegin) {
362 long n = input.skip(offsetRangeBegin - offset);
363 if (n > 0) {
364 offset += n;
365 } else if (input.read() != -1) {
366 offset += 1;
367 } else {
368 return MediaType.OCTET_STREAM;
369 }
370 }
371
372 // Fill in the comparison window
373 byte[] buffer =
374 new byte[length + (offsetRangeEnd - offsetRangeBegin)];
375 int n = input.read(buffer);
376 if (n > 0) {
377 offset += n;
378 }
379 while (n != -1 && offset < offsetRangeEnd + length) {
380 int bufferOffset = offset - offsetRangeBegin;
381 n = input.read(
382 buffer, bufferOffset, buffer.length - bufferOffset);
383 // increment offset - in case not all read (see testDetectStreamReadProblems)
384 if (n > 0) {
385 offset += n;
386 }
387 }
388
389 if (this.isRegex) {
390 int flags = 0;
391 if (this.isStringIgnoreCase) {
392 flags = Pattern.CASE_INSENSITIVE;
393 }
394
395 Pattern p = Pattern.compile(new String(this.pattern), flags);
396
397 ByteBuffer bb = ByteBuffer.wrap(buffer);
398 CharBuffer result = ISO_8859_1.decode(bb);
399 Matcher m = p.matcher(result);
400
401 boolean match = false;
402 // Loop until we've covered the entire offset range
403 for (int i = 0; i <= offsetRangeEnd - offsetRangeBegin; i++) {
404 m.region(i, length+i);
405 match = m.lookingAt(); // match regex from start of region
406 if (match) {
407 return type;
408 }
409 }
410 } else {
411 if (offset < offsetRangeBegin + length) {
412 return MediaType.OCTET_STREAM;
413 }
414 // Loop until we've covered the entire offset range
415 for (int i = 0; i <= offsetRangeEnd - offsetRangeBegin; i++) {
416 boolean match = true;
417 int masked;
418 for (int j = 0; match && j < length; j++) {
419 masked = (buffer[i + j] & mask[j]);
420 if (this.isStringIgnoreCase) {
421 masked = Character.toLowerCase(masked);
422 }
423 match = (masked == pattern[j]);
424 }
425 if (match) {
426 return type;
427 }
428 }
429 }
430
431 return MediaType.OCTET_STREAM;
432 } finally {
433 input.reset();
434 }
435 }
436
437 public int getLength() {
438 return this.patternLength;
439 }
440
441 /**
442 * Returns a string representation of the Detection Rule.
443 * Should sort nicely by type and details, as we sometimes
444 * compare these.
445 */
446 public String toString() {
447 // Needs to be unique, as these get compared.
448 return "Magic Detection for " + type +
449 " looking for " + pattern.length +
450 " bytes = " + this.pattern +
451 " mask = " + this.mask;
452 }
453 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.detect;
17
18 import java.io.InputStream;
19 import java.io.UnsupportedEncodingException;
20 import java.net.URLDecoder;
21 import java.util.Map;
22 import java.util.regex.Pattern;
23
24 import org.apache.tika.metadata.Metadata;
25 import org.apache.tika.mime.MediaType;
26
27 /**
28 * Content type detection based on the resource name. An instance of this
29 * class contains a set of regular expression patterns that are matched
30 * against the resource name potentially given as a part of the input metadata.
31 * <p>
32 * If a pattern matches the given name, then the media type associated with
33 * that pattern is returned as the likely content type of the input document.
34 * Otherwise the returned type is <code>application/octet-stream</code>.
35 * <p>
36 * See the {@link #detect(InputStream, Metadata)} method for more details
37 * of the matching algorithm.
38 *
39 * @since Apache Tika 0.3
40 */
41 public class NameDetector implements Detector {
42
43 /**
44 * The regular expression patterns used for type detection.
45 */
46 private final Map<Pattern, MediaType> patterns;
47
48 /**
49 * Creates a new content type detector based on the given name patterns.
50 * The given pattern map is not copied, so the caller may update the
51 * mappings even after this detector instance has been created. However,
52 * the map <em>must not be concurrently modified</em> while this instance
53 * is used for type detection.
54 *
55 * @param patterns map from name patterns to corresponding media types
56 */
57 public NameDetector(Map<Pattern, MediaType> patterns) {
58 this.patterns = patterns;
59 }
60
61 /**
62 * Detects the content type of an input document based on the document
63 * name given in the input metadata. The RESOURCE_NAME_KEY attribute of
64 * the given input metadata is expected to contain the name (normally
65 * a file name or a URL) of the input document.
66 * <p>
67 * If a resource name is given, then it is first processed as follows.
68 * <ol>
69 * <li>
70 * Potential URL query (?...) and fragment identifier (#...)
71 * parts are removed from the end of the resource name.
72 * </li>
73 * <li>
74 * Potential leading path elements (up to the last slash or backslash)
75 * are removed from the beginning of the resource name.
76 * </li>
77 * <li>
78 * Potential URL encodings (%nn, in UTF-8) are decoded.
79 * </li>
80 * <li>
81 * Any leading and trailing whitespace is removed.
82 * </li>
83 * </ol>
84 * <p>
85 * The resulting name string (if any) is then matched in sequence against
86 * all the configured name patterns. If a match is found, then the (first)
87 * matching media type is returned.
88 *
89 * @param input ignored
90 * @param metadata input metadata, possibly with a RESOURCE_NAME_KEY value
91 * @return detected media type, or <code>application/octet-stream</code>
92 */
93 public MediaType detect(InputStream input, Metadata metadata) {
94 // Look for a resource name in the input metadata
95 String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
96 if (name != null) {
97 // If the name is a URL, skip the trailing query and fragment parts
98 int question = name.indexOf('?');
99 if (question != -1) {
100 name = name.substring(0, question);
101 }
102 int hash = name.indexOf('#');
103 if (hash != -1) {
104 name = name.substring(0, hash);
105 }
106
107 // If the name is a URL or a path, skip all but the last component
108 int slash = name.lastIndexOf('/');
109 if (slash != -1) {
110 name = name.substring(slash + 1);
111 }
112 int backslash = name.lastIndexOf('\\');
113 if (backslash != -1) {
114 name = name.substring(backslash + 1);
115 }
116
117 // Decode any potential URL encoding
118 int percent = name.indexOf('%');
119 if (percent != -1) {
120 try {
121 name = URLDecoder.decode(name, "UTF-8");
122 } catch (UnsupportedEncodingException e) {
123 throw new IllegalStateException("UTF-8 not supported", e);
124 }
125 }
126
127 // Skip any leading or trailing whitespace
128 name = name.trim();
129 if (name.length() > 0) {
130 // Match the name against the registered patterns
131 for (Pattern pattern : patterns.keySet()) {
132 if (pattern.matcher(name).matches()) {
133 return patterns.get(pattern);
134 }
135 }
136 }
137 }
138
139 return MediaType.OCTET_STREAM;
140 }
141
142 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.detect;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.util.Arrays;
21
22 import org.apache.tika.metadata.Metadata;
23 import org.apache.tika.mime.MediaType;
24
25 /**
26 * Content type detection of plain text documents. This detector looks at the
27 * beginning of the document input stream and considers the document to be
28 * a text document if no ASCII (ISO-Latin-1, UTF-8, etc.) control bytes are
29 * found. As a special case some control bytes (up to 2% of all characters)
30 * are also allowed in a text document if it also contains no or just a few
31 * (less than 10%) characters above the 7-bit ASCII range.
32 * <p>
33 * Note that text documents with a character encoding like UTF-16 are better
34 * detected with {@link MagicDetector} and an appropriate magic byte pattern.
35 *
36 * @since Apache Tika 0.3
37 */
38 public class TextDetector implements Detector {
39
40 /** Serial version UID */
41 private static final long serialVersionUID = 4774601079503507765L;
42
43 /**
44 * The number of bytes from the beginning of the document stream
45 * to test for control bytes.
46 */
47 private static final int DEFAULT_NUMBER_OF_BYTES_TO_TEST = 512;
48
49 /**
50 * Lookup table for all the ASCII/ISO-Latin/UTF-8/etc. control bytes
51 * in the range below 0x20 (the space character). If an entry in this
52 * table is <code>true</code> then that byte is very unlikely to occur
53 * in a plain text document.
54 * <p>
55 * The contents of this lookup table are based on the following definition
56 * from section 4 of the "Content-Type Processing Model" Internet-draft
57 * (<a href="http://webblaze.cs.berkeley.edu/2009/mime-sniff/mime-sniff.txt"
58 * >draft-abarth-mime-sniff-01</a>).
59 * <pre>
60 * +-------------------------+
61 * | Binary data byte ranges |
62 * +-------------------------+
63 * | 0x00 -- 0x08 |
64 * | 0x0B |
65 * | 0x0E -- 0x1A |
66 * | 0x1C -- 0x1F |
67 * +-------------------------+
68 * </pre>
69 *
70 * @see <a href="https://issues.apache.org/jira/browse/TIKA-154">TIKA-154</a>
71 */
72 private static final boolean[] IS_CONTROL_BYTE = new boolean[0x20];
73
74 static {
75 Arrays.fill(IS_CONTROL_BYTE, true);
76 IS_CONTROL_BYTE[0x09] = false; // tabulator
77 IS_CONTROL_BYTE[0x0A] = false; // new line
78 IS_CONTROL_BYTE[0x0C] = false; // new page
79 IS_CONTROL_BYTE[0x0D] = false; // carriage return
80 IS_CONTROL_BYTE[0x1B] = false; // escape
81 }
82
83 private final int bytesToTest;
84
85 /**
86 * Constructs a {@link TextDetector} which will look at the default number
87 * of bytes from the beginning of the document.
88 */
89 public TextDetector() {
90 this(DEFAULT_NUMBER_OF_BYTES_TO_TEST);
91 }
92
93 /**
94 * Constructs a {@link TextDetector} which will look at a given number of
95 * bytes from the beginning of the document.
96 */
97 public TextDetector(int bytesToTest) {
98 this.bytesToTest = bytesToTest;
99 }
100
101 /**
102 * Looks at the beginning of the document input stream to determine
103 * whether the document is text or not.
104 *
105 * @param input document input stream, or <code>null</code>
106 * @param metadata ignored
107 * @return "text/plain" if the input stream suggest a text document,
108 * "application/octet-stream" otherwise
109 */
110 public MediaType detect(InputStream input, Metadata metadata)
111 throws IOException {
112 if (input == null) {
113 return MediaType.OCTET_STREAM;
114 }
115
116 input.mark(bytesToTest);
117 try {
118 TextStatistics stats = new TextStatistics();
119
120 byte[] buffer = new byte[1024];
121 int n = 0;
122 int m = input.read(buffer, 0, Math.min(bytesToTest, buffer.length));
123 while (m != -1 && n < bytesToTest) {
124 stats.addData(buffer, 0, m);
125 n += m;
126 m = input.read(buffer, 0, Math.min(bytesToTest - n, buffer.length));
127 }
128
129 if (stats.isMostlyAscii() || stats.looksLikeUTF8()) {
130 return MediaType.TEXT_PLAIN;
131 } else {
132 return MediaType.OCTET_STREAM;
133 }
134 } finally {
135 input.reset();
136 }
137 }
138
139 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.detect;
17
/**
 * Utility class for computing a histogram of the bytes seen in a stream.
 * Bytes are fed in with {@link #addData(byte[], int, int)}; the remaining
 * methods answer questions about the bytes seen so far.
 *
 * @since Apache Tika 1.2
 */
public class TextStatistics {

    /** Number of occurrences of each unsigned byte value (0..255). */
    private final int[] counts = new int[256];

    /** Total number of bytes added so far. */
    private int total = 0;

    /**
     * Adds a run of bytes to the histogram.
     *
     * @param buffer array holding the bytes
     * @param offset index of the first byte to add
     * @param length number of bytes to add
     */
    public void addData(byte[] buffer, int offset, int length) {
        for (int i = 0; i < length; i++) {
            // Mask to treat the signed byte as an unsigned table index
            counts[buffer[offset + i] & 0xff]++;
            total++;
        }
    }

    /**
     * Checks whether at least one byte was seen and that the bytes that
     * were seen were mostly plain text (i.e. less than 2% control, more
     * than 90% ASCII range).
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-483">TIKA-483</a>
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-688">TIKA-688</a>
     * @return <code>true</code> if the seen bytes were mostly safe ASCII,
     *         <code>false</code> otherwise
     */
    public boolean isMostlyAscii() {
        int control = count(0, 0x20);
        int ascii = count(0x20, 128);
        int safe = countSafeControl();
        return total > 0
                && (control - safe) * 100 < total * 2
                && (ascii + safe) * 100 > total * 90;
    }

    /**
     * Checks whether the observed byte stream looks like UTF-8 encoded text.
     * The check is statistical: the number of continuation bytes must be
     * consistent with the number of lead bytes seen (allowing for up to
     * three continuation bytes missing, e.g. a sequence cut off at the end
     * of the sample), no byte in the range 0xF8..0xFF may be present, and
     * unsafe control bytes must be rare.
     *
     * @since Apache Tika 1.3
     * @return <code>true</code> if the seen bytes look like UTF-8,
     *         <code>false</code> otherwise
     */
    public boolean looksLikeUTF8() {
        int control = count(0, 0x20);
        int utf8 = count(0x20, 0x80);
        int safe = countSafeControl();

        // Lead bytes of two-, three- and four-byte sequences; a lead byte
        // at index i announces i + 1 continuation bytes.
        int expectedContinuation = 0;
        int[] leading = new int[] {
                count(0xc0, 0xe0), count(0xe0, 0xf0), count(0xf0, 0xf8) };
        for (int i = 0; i < leading.length; i++) {
            utf8 += leading[i];
            expectedContinuation += (i + 1) * leading[i];
        }

        int continuation = count(0x80, 0xc0);
        return utf8 > 0
                && continuation <= expectedContinuation
                && continuation >= expectedContinuation - 3
                // Bytes 0xF8..0xFF are never valid in UTF-8. The lower bound
                // was previously mistyped as 0xf80, which made this range
                // empty and the check always pass.
                && count(0xf8, 0x100) == 0
                && (control - safe) * 100 < utf8 * 2;
    }

    /**
     * Returns the total number of bytes seen so far.
     *
     * @return count of all bytes
     */
    public int count() {
        return total;
    }

    /**
     * Returns the number of occurrences of the given byte.
     *
     * @param b byte (only the low eight bits are used)
     * @return count of the given byte
     */
    public int count(int b) {
        return counts[b & 0xff];
    }

    /**
     * Counts control characters (i.e. below 0x20, excluding tab, CR, LF,
     * page feed and escape).
     * <p>
     * This definition of control characters is based on section 4 of the
     * "Content-Type Processing Model" Internet-draft
     * (<a href="http://webblaze.cs.berkeley.edu/2009/mime-sniff/mime-sniff.txt"
     * >draft-abarth-mime-sniff-01</a>).
     * <pre>
     *   +-------------------------+
     *   | Binary data byte ranges |
     *   +-------------------------+
     *   | 0x00 -- 0x08            |
     *   | 0x0B                    |
     *   | 0x0E -- 0x1A            |
     *   | 0x1C -- 0x1F            |
     *   +-------------------------+
     * </pre>
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-154">TIKA-154</a>
     * @return count of control characters
     */
    public int countControl() {
        return count(0, 0x20) - countSafeControl();
    }

    /**
     * Counts "safe" (i.e. seven-bit non-control) ASCII characters.
     *
     * @see #countControl()
     * @return count of safe ASCII characters
     */
    public int countSafeAscii() {
        return count(0x20, 128) + countSafeControl();
    }

    /**
     * Counts eight bit characters, i.e. bytes with their highest bit set.
     *
     * @return count of eight bit characters
     */
    public int countEightBit() {
        return count(128, 256);
    }

    /**
     * Sums the counts of all byte values in the half-open range
     * <code>[from, to)</code>.
     */
    private int count(int from, int to) {
        // from <= to guards against mistyped (and therefore empty) ranges
        assert 0 <= from && from <= to && to <= counts.length;
        int count = 0;
        for (int i = from; i < to; i++) {
            count += counts[i];
        }
        return count;
    }

    /** Counts the control bytes that are considered normal in text. */
    private int countSafeControl() {
        return count('\t') + count('\n') + count('\r') // tab, LF, CR
                + count(0x0c) + count(0x1b); // new page, escape
    }

}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.detect;
17
18 import java.io.InputStream;
19
20 import org.apache.tika.metadata.Metadata;
21 import org.apache.tika.mime.MediaType;
22
23 /**
24 * Content type detection based on a content type hint. This detector simply
25 * trusts any valid content type hint given in the input metadata, and returns
26 * that as the likely type of the input document.
27 *
28 * @since Apache Tika 0.3
29 */
30 public class TypeDetector implements Detector {
31
32 /**
33 * Detects the content type of an input document based on a type hint
34 * given in the input metadata. The CONTENT_TYPE attribute of the given
35 * input metadata is expected to contain the type of the input document.
36 * If that attribute exists and contains a valid type name, then that
37 * type is returned.
38 *
39 * @param input ignored
40 * @param metadata input metadata, possibly with a CONTENT_TYPE value
41 * @return detected media type, or <code>application/octet-stream</code>
42 */
43 public MediaType detect(InputStream input, Metadata metadata) {
44 // Look for a type hint in the input metadata
45 String hint = metadata.get(Metadata.CONTENT_TYPE);
46 if (hint != null) {
47 MediaType type = MediaType.parse(hint);
48 if (type != null) {
49 return type;
50 }
51 }
52 return MediaType.OCTET_STREAM;
53 }
54
55 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.detect;
17
18 import java.io.ByteArrayInputStream;
19 import java.io.InputStream;
20
21 import javax.xml.XMLConstants;
22 import javax.xml.namespace.QName;
23 import javax.xml.parsers.SAXParserFactory;
24
25 import org.apache.tika.io.CloseShieldInputStream;
26 import org.apache.tika.sax.OfflineContentHandler;
27 import org.xml.sax.Attributes;
28 import org.xml.sax.SAXException;
29 import org.xml.sax.helpers.DefaultHandler;
30
31 /**
32 * Utility class that uses a {@link javax.xml.parsers.SAXParser} to determine
33 * the namespace URI and local name of the root element of an XML file.
34 *
35 * @since Apache Tika 0.4
36 */
37 public class XmlRootExtractor {
38
39 public QName extractRootElement(byte[] data) {
40 return extractRootElement(new ByteArrayInputStream(data));
41 }
42
43 /**
44 * @since Apache Tika 0.9
45 */
46 public QName extractRootElement(InputStream stream) {
47 ExtractorHandler handler = new ExtractorHandler();
48 try {
49 SAXParserFactory factory = SAXParserFactory.newInstance();
50 factory.setNamespaceAware(true);
51 factory.setValidating(false);
52 factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
53 factory.newSAXParser().parse(
54 new CloseShieldInputStream(stream),
55 new OfflineContentHandler(handler));
56 } catch (Exception ignore) {
57 }
58 return handler.rootElement;
59 }
60
61 private static class ExtractorHandler extends DefaultHandler {
62
63 private QName rootElement = null;
64
65 @Override
66 public void startElement(
67 String uri, String local, String name, Attributes attributes)
68 throws SAXException {
69 this.rootElement = new QName(uri, local);
70 throw new SAXException("Aborting: root element received");
71 }
72
73 }
74
75 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 /**
18 * Media type detection.
19 */
20 @aQute.bnd.annotation.Version("1.0.0")
21 package org.apache.tika.detect;
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.embedder;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.io.OutputStream;
21 import java.io.Serializable;
22 import java.util.Set;
23
24 import org.apache.tika.exception.TikaException;
25 import org.apache.tika.metadata.Metadata;
26 import org.apache.tika.mime.MediaType;
27 import org.apache.tika.parser.ParseContext;
28 import org.apache.tika.parser.Parser;
29
30 /**
31 * Tika embedder interface
32 *
33 * @since Apache Tika 1.3
34 */
35 public interface Embedder extends Serializable {
36
37 /**
38 * Returns the set of media types supported by this embedder when used with
39 * the given parse context.
40 * <p>
41 * The name differs from the precedence of {@link Parser#getSupportedTypes(ParseContext)}
42 * so that parser implementations may also choose to implement this interface.
43 *
44 * @param context parse context
45 * @return immutable set of media types
46 */
47 Set<MediaType> getSupportedEmbedTypes(ParseContext context);
48
49 /**
50 * Embeds related document metadata from the given metadata object into the
51 * given output stream.
52 * <p>
53 * The given document stream is consumed but not closed by this method. The
54 * responsibility to close the stream remains on the caller.
55 * <p>
56 * Information about the parsing context can be passed in the context
57 * parameter. See the parser implementations for the kinds of context
58 * information they expect.
59 * <p>
60 * In general implementations should favor preserving the source file's metadata
61 * unless an update to a field is explicitly defined in the Metadata object.
62 * More specifically:
63 * <ul>
64 * <li>Embedder implementations should only attempt to update metadata fields
65 * present in the given Metadata object. Other fields should be left untouched.</li>
66 * <li>Embedder implementations should set properties as empty when the
67 * corresponding field in the Metadata object is an empty string, i.e. ""</li>
68 * <li>Embedder implementations should nullify or delete properties
69 * corresponding to fields with a null value in the given Metadata object.</li>
70 * <li>Embedder implementations should set the property
71 * corresponding to a particular field in the given Metadata object in all
72 * metadata containers whenever possible and appropriate for the file format at the time.
73 * If a particular metadata container falls out of use and/or is superseded by another
74 * (such as IIC vs XMP for IPTC) it is up to the implementation to decide if and when
75 * to cease embedding in the alternate container.</li>
76 * <li>Embedder implementations should attempt to embed as much of the metadata
77 * as accurately as possible. An implementation may choose a strict approach
78 * and throw an exception if a value to be embedded exceeds the length allowed
79 * or may choose to truncate the value.</li>
80 * </ul>
81 *
82 * @param metadata document metadata (input and output)
83 * @param originalStream the document stream (input)
84 * @param outputStream the output stream to write the metadata embedded data to
85 * @param context parse context
86 * @throws IOException if the document stream could not be read
87 * @throws TikaException if the document could not be parsed
88 */
89 void embed(Metadata metadata, InputStream originalStream,
90 OutputStream outputStream, ParseContext context)
91 throws IOException, TikaException;
92
93 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.embedder;
17
18 import java.io.ByteArrayOutputStream;
19 import java.io.File;
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.io.OutputStream;
23 import java.util.ArrayList;
24 import java.util.Arrays;
25 import java.util.Collections;
26 import java.util.HashSet;
27 import java.util.List;
28 import java.util.Map;
29 import java.util.Set;
30
31 import org.apache.tika.exception.TikaException;
32 import org.apache.tika.io.IOUtils;
33 import org.apache.tika.io.TemporaryResources;
34 import org.apache.tika.io.TikaInputStream;
35 import org.apache.tika.metadata.Metadata;
36 import org.apache.tika.metadata.Property;
37 import org.apache.tika.mime.MediaType;
38 import org.apache.tika.parser.ParseContext;
39 import org.apache.tika.parser.external.ExternalParser;
40
41 /**
42 * Embedder that uses an external program (like sed or exiftool) to embed text
43 * content and metadata into a given document.
44 *
45 * @since Apache Tika 1.3
46 */
47 public class ExternalEmbedder implements Embedder {
48
49 private static final long serialVersionUID = -2828829275642475697L;
50
51 /**
52 * Token to be replaced with a String array of metadata assignment command
53 * arguments
54 */
55 public static final String METADATA_COMMAND_ARGUMENTS_TOKEN = "${METADATA}";
56
57 /**
58 * Token to be replaced with a String array of metadata assignment command
59 * arguments
60 */
61 public static final String METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN = "${METADATA_SERIALIZED}";
62
63 /**
64 * Media types supported by the external program.
65 */
66 private Set<MediaType> supportedEmbedTypes = Collections.emptySet();
67
68 /**
69 * Mapping of Tika metadata to command line parameters.
70 */
71 private Map<Property, String[]> metadataCommandArguments = null;
72
73 /**
74 * The external command to invoke.
75 *
76 * @see Runtime#exec(String[])
77 */
78 private String[] command = new String[] {
79 "sed", "-e",
80 "$a\\\n" + METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN,
81 ExternalParser.INPUT_FILE_TOKEN
82 };
83
84 private String commandAssignmentOperator = "=";
85 private String commandAssignmentDelimeter = ", ";
86 private String commandAppendOperator = "=";
87
88 private boolean quoteAssignmentValues = false;
89
90 private TemporaryResources tmp = new TemporaryResources();
91
92 public Set<MediaType> getSupportedEmbedTypes(ParseContext context) {
93 return getSupportedEmbedTypes();
94 }
95
96 public Set<MediaType> getSupportedEmbedTypes() {
97 return supportedEmbedTypes;
98 }
99
100 public void setSupportedEmbedTypes(Set<MediaType> supportedEmbedTypes) {
101 this.supportedEmbedTypes = Collections
102 .unmodifiableSet(new HashSet<MediaType>(supportedEmbedTypes));
103 }
104
105 /**
106 * Gets the command to be run. This can include either of
107 * {@link #INPUT_FILE_TOKEN} or {@link #OUTPUT_FILE_TOKEN} if the command
108 * needs filenames.
109 *
110 * @return
111 */
112 public String[] getCommand() {
113 return command;
114 }
115
116 /**
117 * Sets the command to be run. This can include either of
118 * {@link #INPUT_FILE_TOKEN} or {@link #OUTPUT_FILE_TOKEN} if the command
119 * needs filenames.
120 *
121 * @see Runtime#exec(String[])
122 */
123 public void setCommand(String... command) {
124 this.command = command;
125 }
126
127 /**
128 * Gets the assignment operator for the command line tool, i.e. "=".
129 *
130 * @return the assignment operator
131 */
132 public String getCommandAssignmentOperator() {
133 return commandAssignmentOperator;
134 }
135
136 /**
137 * Sets the assignment operator for the command line tool, i.e. "=".
138 *
139 * @param commandAssignmentOperator
140 */
141 public void setCommandAssignmentOperator(String commandAssignmentOperator) {
142 this.commandAssignmentOperator = commandAssignmentOperator;
143 }
144
145 /**
146 * Gets the delimiter for multiple assignments for the command line tool,
147 * i.e. ", ".
148 *
149 * @return the assignment delimiter
150 */
151 public String getCommandAssignmentDelimeter() {
152 return commandAssignmentDelimeter;
153 }
154
155 /**
156 * Sets the delimiter for multiple assignments for the command line tool,
157 * i.e. ", ".
158 *
159 * @param commandAssignmentDelimeter
160 */
161 public void setCommandAssignmentDelimeter(String commandAssignmentDelimeter) {
162 this.commandAssignmentDelimeter = commandAssignmentDelimeter;
163 }
164
165 /**
166 * Gets the operator to append rather than replace a value for the command
167 * line tool, i.e. "+=".
168 *
169 * @return the append operator
170 */
171 public String getCommandAppendOperator() {
172 return commandAppendOperator;
173 }
174
175 /**
176 * Sets the operator to append rather than replace a value for the command
177 * line tool, i.e. "+=".
178 *
179 * @param commandAppendOperator
180 */
181 public void setCommandAppendOperator(String commandAppendOperator) {
182 this.commandAppendOperator = commandAppendOperator;
183 }
184
185 /**
186 * Gets whether or not to quote assignment values, i.e. tag='value'. The
187 * default is false.
188 *
189 * @return whether or not to quote assignment values
190 */
191 public boolean isQuoteAssignmentValues() {
192 return quoteAssignmentValues;
193 }
194
195 /**
196 * Sets whether or not to quote assignment values, i.e. tag='value'.
197 *
198 * @param quoteAssignmentValues
199 */
200 public void setQuoteAssignmentValues(boolean quoteAssignmentValues) {
201 this.quoteAssignmentValues = quoteAssignmentValues;
202 }
203
204 /**
205 * Gets the map of Metadata keys to command line parameters.
206 *
207 * @return the metadata to CLI param map
208 */
209 public Map<Property, String[]> getMetadataCommandArguments() {
210 return metadataCommandArguments;
211 }
212
213 /**
214 * Sets the map of Metadata keys to command line parameters. Set this to
215 * null to disable Metadata embedding.
216 *
217 * @param arguments
218 */
219 public void setMetadataCommandArguments(Map<Property, String[]> arguments) {
220 this.metadataCommandArguments = arguments;
221 }
222
223 /**
224 * Constructs a collection of command line arguments responsible for setting
225 * individual metadata fields based on the given <code>metadata</code>.
226 *
227 * @param metadata the metadata to embed
228 * @return the metadata-related command line arguments
229 */
230 protected List<String> getCommandMetadataSegments(Metadata metadata) {
231 List<String> commandMetadataSegments = new ArrayList<String>();
232 if (metadata == null || metadata.names() == null) {
233 return commandMetadataSegments;
234 }
235 for (String metadataName : metadata.names()) {
236 for (Property property : getMetadataCommandArguments().keySet()) {
237 if (metadataName.equals(property.getName())) {
238 String[] metadataCommandArguments = getMetadataCommandArguments().get(property);
239 if (metadataCommandArguments != null) {
240 for (String metadataCommandArgument : metadataCommandArguments) {
241 if (metadata.isMultiValued(metadataName)) {
242 for (String metadataValue : metadata.getValues(metadataName)) {
243 String assignmentValue = metadataValue;
244 if (quoteAssignmentValues) {
245 assignmentValue = "'" + assignmentValue + "'";
246 }
247 commandMetadataSegments.add(metadataCommandArgument
248 + commandAppendOperator
249 + assignmentValue);
250 }
251 } else {
252 String assignmentValue = metadata.get(metadataName);
253 if (quoteAssignmentValues) {
254 assignmentValue = "'" + assignmentValue + "'";
255 }
256 commandMetadataSegments.add(metadataCommandArgument
257 + commandAssignmentOperator
258 + assignmentValue);
259 }
260 }
261 }
262 }
263 }
264 }
265 return commandMetadataSegments;
266 }
267
268 /**
269 * Serializes a collection of metadata command line arguments into a single
270 * string.
271 *
272 * @param metadataCommandArguments
273 * @return the serialized metadata arguments string
274 */
275 protected static String serializeMetadata(
276 List<String> metadataCommandArguments) {
277 if (metadataCommandArguments != null) {
278 return Arrays.toString(metadataCommandArguments.toArray());
279 }
280 return "";
281 }
282
283 /**
284 * Executes the configured external command and passes the given document
285 * stream as a simple XHTML document to the given SAX content handler.
286 * Metadata is only extracted if {@link #setMetadataCommandArguments(Map)}
287 * has been called to set arguments.
288 */
289 public void embed(final Metadata metadata, final InputStream inputStream,
290 final OutputStream outputStream, final ParseContext context)
291 throws IOException, TikaException {
292
293 boolean inputToStdIn = true;
294 boolean outputFromStdOut = true;
295 boolean hasMetadataCommandArguments =
296 (metadataCommandArguments != null && !metadataCommandArguments.isEmpty());
297 boolean serializeMetadataCommandArgumentsToken = false;
298 boolean replacedMetadataCommandArgumentsToken = false;
299
300 TikaInputStream tikaInputStream = TikaInputStream.get(inputStream);
301 File tempOutputFile = null;
302
303 List<String> commandMetadataSegments = null;
304 if (hasMetadataCommandArguments) {
305 commandMetadataSegments = getCommandMetadataSegments(metadata);
306 }
307
308 // Build our command
309 List<String> origCmd = Arrays.asList(command);
310 List<String> cmd = new ArrayList<String>();
311 for (String commandSegment : origCmd) {
312 if (commandSegment.indexOf(ExternalParser.INPUT_FILE_TOKEN) != -1) {
313 commandSegment = commandSegment.replace(
314 ExternalParser.INPUT_FILE_TOKEN,
315 tikaInputStream.getFile().toString());
316 inputToStdIn = false;
317 }
318 if (commandSegment.indexOf(ExternalParser.OUTPUT_FILE_TOKEN) != -1) {
319 tempOutputFile = tmp.createTemporaryFile();
320 commandSegment = commandSegment.replace(
321 ExternalParser.OUTPUT_FILE_TOKEN,
322 tempOutputFile.toString());
323 outputFromStdOut = false;
324 }
325 if (commandSegment
326 .indexOf(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN) != -1) {
327 serializeMetadataCommandArgumentsToken = true;
328 }
329 if (commandSegment.indexOf(METADATA_COMMAND_ARGUMENTS_TOKEN) != -1) {
330 if (hasMetadataCommandArguments) {
331 for (String commandMetadataSegment : commandMetadataSegments) {
332 cmd.add(commandMetadataSegment);
333 }
334 }
335 replacedMetadataCommandArgumentsToken = true;
336 } else {
337 cmd.add(commandSegment);
338 }
339 }
340 if (hasMetadataCommandArguments) {
341 if (serializeMetadataCommandArgumentsToken) {
342 // Find all metadata tokens and replace with encapsulated metadata
343 int i = 0;
344 for (String commandSegment : cmd) {
345 if (commandSegment
346 .indexOf(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN) != -1) {
347 commandSegment = commandSegment.replace(
348 METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN,
349 serializeMetadata(commandMetadataSegments));
350 cmd.set(i, commandSegment);
351 }
352 i++;
353 }
354 } else if (!replacedMetadataCommandArgumentsToken
355 && !serializeMetadataCommandArgumentsToken) {
356 // Tack metadata onto the end of the cmd as arguments
357 cmd.addAll(commandMetadataSegments);
358 }
359 }
360
361 // Execute
362 Process process;
363 if (cmd.toArray().length == 1) {
364 process = Runtime.getRuntime().exec(cmd.toArray(new String[] {})[0]);
365 } else {
366 process = Runtime.getRuntime().exec(cmd.toArray(new String[] {}));
367 }
368
369 ByteArrayOutputStream stdErrOutputStream = new ByteArrayOutputStream();
370
371 try {
372 sendStdErrToOutputStream(process, stdErrOutputStream);
373
374 if (inputToStdIn) {
375 sendInputStreamToStdIn(inputStream, process);
376 } else {
377 // We're not writing to std in this case so close
378 process.getOutputStream().close();
379 }
380
381 if (outputFromStdOut) {
382 sendStdOutToOutputStream(process, outputStream);
383 } else {
384 tmp.dispose();
385 try {
386 process.waitFor();
387 } catch (InterruptedException ignore) {
388 }
389 // The command is finished, read the output file into the given output stream
390 InputStream tempOutputFileInputStream = TikaInputStream.get(tempOutputFile);
391 IOUtils.copy(tempOutputFileInputStream, outputStream);
392 }
393 } finally {
394 if (outputFromStdOut) {
395 try {
396 process.waitFor();
397 } catch (InterruptedException ignore) {
398 }
399 } else {
400 try {
401 // Clean up temp output files
402 tempOutputFile.delete();
403 } catch (Exception e) {
404 }
405 }
406 if (!inputToStdIn) {
407 // Close input file (and delete if created by up TemporaryResources.createTemporaryFile)
408 IOUtils.closeQuietly(tikaInputStream);
409 }
410 IOUtils.closeQuietly(outputStream);
411 IOUtils.closeQuietly(stdErrOutputStream);
412 if (process.exitValue() != 0) {
413 throw new TikaException("There was an error executing the command line" +
414 "\nExecutable Command:\n\n" + cmd +
415 "\nExecutable Error:\n\n" + stdErrOutputStream.toString("UTF-8"));
416 }
417 }
418 }
419
420 /**
421 * Creates a new thread for copying a given input stream to a given output stream.
422 *
423 * @param inputStream the source input stream
424 * @param outputStream the target output stream
425 */
426 private void multiThreadedStreamCopy(
427 final InputStream inputStream,
428 final OutputStream outputStream) {
429 new Thread(new Runnable() {
430 public void run() {
431 try {
432 IOUtils.copy(inputStream, outputStream);
433 } catch (IOException e) {
434 System.out.println("ERROR: " + e.getMessage());
435 }
436 }
437 }).start();
438 }
439
440 /**
441 * Sends the contents of the given input stream to the
442 * standard input of the given process. Potential exceptions are
443 * ignored.
444 * <p>
445 * Note that the given input stream is <em>not</em> closed by this method.
446 *
447 * @param process the process
448 * @param inputStream the input stream to send to standard input of the process
449 */
450 private void sendInputStreamToStdIn(
451 final InputStream inputStream,
452 final Process process) {
453 multiThreadedStreamCopy(inputStream, process.getOutputStream());
454 }
455
456 /**
457 * Sends the standard output of the given
458 * process to the given output stream. Potential exceptions are
459 * ignored.
460 * <p>
461 * Note that the given output stream is <em>not</em> closed by this method.
462 *
463 * @param process the process
464 * @param outputStream the putput stream to send to standard input of the process
465 */
466 private void sendStdOutToOutputStream(
467 final Process process,
468 final OutputStream outputStream) {
469 try {
470 IOUtils.copy(process.getInputStream(), outputStream);
471 } catch (IOException e) {
472 System.out.println("ERROR: " + e.getMessage());
473 }
474 }
475
476 /**
477 * Starts a thread that reads and discards the contents of the standard
478 * stream of the given process. Potential exceptions are ignored, and the
479 * stream is closed once fully processed.
480 *
481 * @param process the process
482 * param outputStream the output stream to send to standard error of the process
483 */
484 private void sendStdErrToOutputStream(
485 final Process process,
486 final OutputStream outputStream) {
487 multiThreadedStreamCopy(process.getErrorStream(), outputStream);
488 }
489
490 /**
491 * Checks to see if the command can be run. Typically used with something
492 * like "myapp --version" to check to see if "myapp" is installed and on the
493 * path.
494 *
495 * @param checkCmd the check command to run
496 * @param errorValue what is considered an error value?
497 * @return whether or not the check completed without error
498 */
499 public static boolean check(String checkCmd, int... errorValue) {
500 return check(new String[] { checkCmd }, errorValue);
501 }
502
503 /**
504 * Checks to see if the command can be run. Typically used with something
505 * like "myapp --version" to check to see if "myapp" is installed and on the
506 * path.
507 *
508 * @param checkCmd the check command to run
509 * @param errorValue what is considered an error value?
510 * @return whether or not the check completed without error
511 */
512 public static boolean check(String[] checkCmd, int... errorValue) {
513 if (errorValue.length == 0) {
514 errorValue = new int[] { 127 };
515 }
516
517 try {
518 Process process;
519 if (checkCmd.length == 1) {
520 process = Runtime.getRuntime().exec(checkCmd[0]);
521 } else {
522 process = Runtime.getRuntime().exec(checkCmd);
523 }
524 int result = process.waitFor();
525
526 for (int err : errorValue) {
527 if (result == err)
528 return false;
529 }
530 return true;
531 } catch (IOException e) {
532 // Some problem, command is there or is broken
533 return false;
534 } catch (InterruptedException ie) {
535 // Some problem, command is there or is broken
536 return false;
537 }
538 }
539 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.exception;
17
18 public class EncryptedDocumentException extends TikaException {
19 public EncryptedDocumentException() {
20 super("Unable to process: document is encrypted");
21 }
22
23 public EncryptedDocumentException(Throwable th) {
24 super("Unable to process: document is encrypted", th);
25 }
26
27 public EncryptedDocumentException(String info) {
28 super(info);
29 }
30
31 public EncryptedDocumentException(String info, Throwable th) {
32 super(info, th);
33 }
34 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.exception;
17
/**
 * Checked exception type of Tika. The other exception class in the
 * <code>org.apache.tika.exception</code> package extends it.
 */
public class TikaException extends Exception {

    /**
     * Creates an exception carrying only a message.
     *
     * @param msg the exception message
     */
    public TikaException(String msg) {
        super(msg);
    }

    /**
     * Creates an exception carrying a message and the underlying cause.
     *
     * @param msg the exception message
     * @param cause the underlying cause
     */
    public TikaException(String msg, Throwable cause) {
        super(msg, cause);
    }

}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 /**
18 * Tika exception.
19 */
20 @aQute.bnd.annotation.Version("1.0.0")
21 package org.apache.tika.exception;
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.extractor;
17
18 import java.io.IOException;
19 import java.io.Serializable;
20
21 import org.apache.tika.exception.TikaException;
22 import org.apache.tika.io.TikaInputStream;
23
24 /**
25 * Tika container extractor interface.
26 * Container Extractors provide access to the embedded
27 * resources within container formats such as .zip and .doc
28 */
29 public interface ContainerExtractor extends Serializable {
30 /**
31 * Is this Container Extractor able to process the
32 * supplied container?
33 * @since Apache Tika 0.8
34 */
35 boolean isSupported(TikaInputStream input) throws IOException;
36
37 /**
38 * Processes a container file, and extracts all the embedded
39 * resources from within it.
40 * <p>
41 * The {@link EmbeddedResourceHandler} you supply will
42 * be called for each embedded resource in the container. It is
43 * up to you whether you process the contents of the resource or not.
44 * <p>
45 * The given document stream is consumed but not closed by this method.
46 * The responsibility to close the stream remains on the caller.
47 * <p>
48 * If required, nested containers (such as a .docx within a .zip)
49 * can automatically be recursed into, and processed inline. If
50 * no recurseExtractor is given, the nested containers will be
51 * treated as with any other embedded resources.
52 *
53 * @since Apache Tika 0.8
54 * @param stream the document stream (input)
55 * @param recurseExtractor the extractor to use on any embedded containers
56 * @param handler handler for the embedded files (output)
57 * @throws IOException if the document stream could not be read
58 * @throws TikaException if the container could not be parsed
59 */
60 void extract(
61 TikaInputStream stream, ContainerExtractor recurseExtractor,
62 EmbeddedResourceHandler handler)
63 throws IOException, TikaException;
64 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.extractor;
17
18 import org.apache.tika.metadata.Metadata;
19
20 /**
21 * Interface for different document selection strategies for purposes like
22 * embedded document extraction by a {@link ContainerExtractor} instance.
23 * An implementation of this interface defines some specific selection
24 * criteria to be applied against the document metadata passed to the
25 * {@link #select(Metadata)} method.
26 *
27 * @since Apache Tika 0.8
28 */
29 public interface DocumentSelector {
30
31 /**
32 * Checks if a document with the given metadata matches the specified
33 * selection criteria.
34 *
35 * @param metadata document metadata
36 * @return <code>true</code> if the document matches the selection criteria,
37 * <code>false</code> otherwise
38 */
39 boolean select(Metadata metadata);
40
41 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.extractor;
18
19 import org.apache.tika.metadata.Metadata;
20 import org.xml.sax.ContentHandler;
21 import org.xml.sax.SAXException;
22
23 import java.io.IOException;
24 import java.io.InputStream;
25
26 public interface EmbeddedDocumentExtractor {
27 boolean shouldParseEmbedded(Metadata metadata);
28
29 /**
30 * Processes the supplied embedded resource, calling the delegating
31 * parser with the appropriate details.
32 * @param stream The embedded resource
33 * @param handler The handler to use
34 * @param metadata The metadata for the embedded resource
35 * @param outputHtml Should we output HTML for this resource, or has the parser already done so?
36 * @throws org.xml.sax.SAXException
37 * @throws java.io.IOException
38 */
39 void parseEmbedded(
40 InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml)
41 throws SAXException, IOException;
42 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.extractor;
17
18 import java.io.InputStream;
19
20 import org.apache.tika.mime.MediaType;
21
22 /**
23 * Tika container extractor callback interface.
24 * To work with a {@link ContainerExtractor}, your code needs
25 * to implement this interface.
26 */
27 public interface EmbeddedResourceHandler {
28 /**
29 * Called to process an embedded resource within the container.
30 * This will be called once per embedded resource within the
31 * container, along with whatever details are available on
32 * the embedded resource.
33 *
34 * @since Apache Tika 0.8
35 * @param filename The filename of the embedded resource, if known
36 * @param mediaType The media type of the embedded resource, if known
37 * @param stream The contents of the embedded resource
38 */
39 void handle(String filename, MediaType mediaType, InputStream stream);
40 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.extractor;
17
18 import java.io.File;
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.util.Set;
22
23 import org.apache.tika.config.TikaConfig;
24 import org.apache.tika.detect.DefaultDetector;
25 import org.apache.tika.detect.Detector;
26 import org.apache.tika.exception.TikaException;
27 import org.apache.tika.io.TemporaryResources;
28 import org.apache.tika.io.TikaInputStream;
29 import org.apache.tika.metadata.Metadata;
30 import org.apache.tika.mime.MediaType;
31 import org.apache.tika.parser.AbstractParser;
32 import org.apache.tika.parser.AutoDetectParser;
33 import org.apache.tika.parser.ParseContext;
34 import org.apache.tika.parser.Parser;
35 import org.xml.sax.ContentHandler;
36 import org.xml.sax.SAXException;
37 import org.xml.sax.helpers.DefaultHandler;
38
39 /**
40 * An implementation of {@link ContainerExtractor} powered by the regular
41 * {@link Parser} API. This allows you to easily extract out all the
42 * embedded resources from within container files supported by normal Tika
43 * parsers. By default the {@link AutoDetectParser} will be used, to allow
44 * extraction from the widest range of containers.
45 */
46 public class ParserContainerExtractor implements ContainerExtractor {
47
48 /** Serial version UID */
49 private static final long serialVersionUID = 2261131045580861514L;
50
51 private final Parser parser;
52
53 private final Detector detector;
54
55 public ParserContainerExtractor() {
56 this(TikaConfig.getDefaultConfig());
57 }
58
59 public ParserContainerExtractor(TikaConfig config) {
60 this(new AutoDetectParser(config),
61 new DefaultDetector(config.getMimeRepository()));
62 }
63
64 public ParserContainerExtractor(Parser parser, Detector detector) {
65 this.parser = parser;
66 this.detector = detector;
67 }
68
69 public boolean isSupported(TikaInputStream input) throws IOException {
70 MediaType type = detector.detect(input, new Metadata());
71 return parser.getSupportedTypes(new ParseContext()).contains(type);
72 }
73
74 public void extract(
75 TikaInputStream stream, ContainerExtractor recurseExtractor,
76 EmbeddedResourceHandler handler)
77 throws IOException, TikaException {
78 ParseContext context = new ParseContext();
79 context.set(Parser.class, new RecursiveParser(recurseExtractor, handler));
80 try {
81 parser.parse(stream, new DefaultHandler(), new Metadata(), context);
82 } catch (SAXException e) {
83 throw new TikaException("Unexpected SAX exception", e);
84 }
85 }
86
87 private class RecursiveParser extends AbstractParser {
88
89 private final ContainerExtractor extractor;
90
91 private final EmbeddedResourceHandler handler;
92
93 private RecursiveParser(
94 ContainerExtractor extractor,
95 EmbeddedResourceHandler handler) {
96 this.extractor = extractor;
97 this.handler = handler;
98 }
99
100 public Set<MediaType> getSupportedTypes(ParseContext context) {
101 return parser.getSupportedTypes(context);
102 }
103
104 public void parse(
105 InputStream stream, ContentHandler ignored,
106 Metadata metadata, ParseContext context)
107 throws IOException, SAXException, TikaException {
108 TemporaryResources tmp = new TemporaryResources();
109 try {
110 TikaInputStream tis = TikaInputStream.get(stream, tmp);
111
112 // Figure out what we have to process
113 String filename = metadata.get(Metadata.RESOURCE_NAME_KEY);
114 MediaType type = detector.detect(tis, metadata);
115
116 if (extractor == null) {
117 // Let the handler process the embedded resource
118 handler.handle(filename, type, tis);
119 } else {
120 // Use a temporary file to process the stream twice
121 File file = tis.getFile();
122
123 // Let the handler process the embedded resource
124 InputStream input = TikaInputStream.get(file);
125 try {
126 handler.handle(filename, type, input);
127 } finally {
128 input.close();
129 }
130
131 // Recurse
132 extractor.extract(tis, extractor, handler);
133 }
134 } finally {
135 tmp.dispose();
136 }
137 }
138
139 }
140
141 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.extractor;
17
18 import java.io.File;
19 import java.io.FilenameFilter;
20 import java.io.IOException;
21 import java.io.InputStream;
22
23 import org.apache.tika.exception.TikaException;
24 import org.apache.tika.io.CloseShieldInputStream;
25 import org.apache.tika.io.TemporaryResources;
26 import org.apache.tika.io.TikaInputStream;
27 import org.apache.tika.metadata.Metadata;
28 import org.apache.tika.parser.DelegatingParser;
29 import org.apache.tika.parser.ParseContext;
30 import org.apache.tika.parser.Parser;
31 import org.apache.tika.sax.BodyContentHandler;
32 import org.apache.tika.sax.EmbeddedContentHandler;
33 import org.xml.sax.ContentHandler;
34 import org.xml.sax.SAXException;
35 import org.xml.sax.helpers.AttributesImpl;
36
37 import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
38
39 /**
40 * Helper class for parsers of package archives or other compound document
41 * formats that support embedded or attached component documents.
42 *
43 * @since Apache Tika 0.8
44 */
45 public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtractor {
46
47 private static final File ABSTRACT_PATH = new File("");
48
49 private static final Parser DELEGATING_PARSER = new DelegatingParser();
50
51 private final ParseContext context;
52
53 public ParsingEmbeddedDocumentExtractor(ParseContext context) {
54 this.context = context;
55 }
56
57 public boolean shouldParseEmbedded(Metadata metadata) {
58 DocumentSelector selector = context.get(DocumentSelector.class);
59 if (selector != null) {
60 return selector.select(metadata);
61 }
62
63 FilenameFilter filter = context.get(FilenameFilter.class);
64 if (filter != null) {
65 String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
66 if (name != null) {
67 return filter.accept(ABSTRACT_PATH, name);
68 }
69 }
70
71 return true;
72 }
73
74 public void parseEmbedded(
75 InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml)
76 throws SAXException, IOException {
77 if(outputHtml) {
78 AttributesImpl attributes = new AttributesImpl();
79 attributes.addAttribute("", "class", "class", "CDATA", "package-entry");
80 handler.startElement(XHTML, "div", "div", attributes);
81 }
82
83 String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
84 if (name != null && name.length() > 0 && outputHtml) {
85 handler.startElement(XHTML, "h1", "h1", new AttributesImpl());
86 char[] chars = name.toCharArray();
87 handler.characters(chars, 0, chars.length);
88 handler.endElement(XHTML, "h1", "h1");
89 }
90
91 // Use the delegate parser to parse this entry
92 TemporaryResources tmp = new TemporaryResources();
93 try {
94 final TikaInputStream newStream = TikaInputStream.get(new CloseShieldInputStream(stream), tmp);
95 if (stream instanceof TikaInputStream) {
96 final Object container = ((TikaInputStream) stream).getOpenContainer();
97 if (container != null) {
98 newStream.setOpenContainer(container);
99 }
100 }
101 DELEGATING_PARSER.parse(
102 newStream,
103 new EmbeddedContentHandler(new BodyContentHandler(handler)),
104 metadata, context);
105 } catch (TikaException e) {
106 // TODO: can we log a warning somehow?
107 // Could not parse the entry, just skip the content
108 } finally {
109 tmp.close();
110 }
111
112 if(outputHtml) {
113 handler.endElement(XHTML, "div", "div");
114 }
115 }
116
117 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 /**
18 * Extraction of component documents.
19 */
20 @aQute.bnd.annotation.Version("1.0.0")
21 package org.apache.tika.extractor;
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.fork;
17
18 import java.io.ByteArrayOutputStream;
19 import java.io.DataInputStream;
20 import java.io.DataOutputStream;
21 import java.io.IOException;
22 import java.net.URL;
23 import java.util.ArrayList;
24 import java.util.Collections;
25 import java.util.Enumeration;
26 import java.util.HashSet;
27 import java.util.List;
28 import java.util.Set;
29
30 class ClassLoaderProxy extends ClassLoader implements ForkProxy {
31
32 /** Serial version UID */
33 private static final long serialVersionUID = -7303109260448540420L;
34
35 /**
36 * Names of resources that could not be found. Used to avoid repeated
37 * lookup of commonly accessed, but often not present, resources like
38 * <code>META-INF/services/javax.xml.parsers.SAXParserFactory</code>.
39 */
40 private final Set<String> notFound = new HashSet<String>();
41
42 private final int resource;
43
44 private transient DataInputStream input;
45
46 private transient DataOutputStream output;
47
48 public ClassLoaderProxy(int resource) {
49 this.resource = resource;
50 }
51
52 public void init(DataInputStream input, DataOutputStream output) {
53 this.input = input;
54 this.output = output;
55 }
56
57 @Override
58 protected synchronized URL findResource(String name) {
59 if (notFound.contains(name)) {
60 return null;
61 }
62 try {
63 // Send a request to load the resource data
64 output.write(ForkServer.RESOURCE);
65 output.write(resource);
66 output.write(1);
67 output.writeUTF(name);
68 output.flush();
69
70 // Receive the response
71 if (input.readBoolean()) {
72 return MemoryURLStreamHandler.createURL(readStream());
73 } else {
74 notFound.add(name);
75 return null;
76 }
77 } catch (IOException e) {
78 return null;
79 }
80 }
81
82 @Override
83 protected synchronized Enumeration<URL> findResources(String name)
84 throws IOException {
85 // Send a request to load the resources
86 output.write(ForkServer.RESOURCE);
87 output.write(resource);
88 output.write(2);
89 output.writeUTF(name);
90 output.flush();
91
92 // Receive the response
93 List<URL> resources = new ArrayList<URL>();
94 while (input.readBoolean()) {
95 resources.add(MemoryURLStreamHandler.createURL(readStream()));
96 }
97 return Collections.enumeration(resources);
98 }
99
100 @Override
101 protected synchronized Class<?> findClass(String name)
102 throws ClassNotFoundException {
103 try {
104 // Send a request to load the class data
105 output.write(ForkServer.RESOURCE);
106 output.write(resource);
107 output.write(1);
108 output.writeUTF(name.replace('.', '/') + ".class");
109 output.flush();
110
111 // Receive the response
112 if (input.readBoolean()) {
113 byte[] data = readStream();
114 return defineClass(name, data, 0, data.length);
115 } else {
116 throw new ClassNotFoundException("Unable to find class " + name);
117 }
118 } catch (IOException e) {
119 throw new ClassNotFoundException("Unable to load class " + name, e);
120 }
121 }
122
123 private byte[] readStream() throws IOException {
124 ByteArrayOutputStream stream = new ByteArrayOutputStream();
125 byte[] buffer = new byte[0xffff];
126 int n;
127 while ((n = input.readUnsignedShort()) > 0) {
128 input.readFully(buffer, 0, n);
129 stream.write(buffer, 0, n);
130 }
131 return stream.toByteArray();
132 }
133
134 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.fork;
17
18 import java.io.DataInputStream;
19 import java.io.DataOutputStream;
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.net.URL;
23 import java.util.Enumeration;
24
25 class ClassLoaderResource implements ForkResource {
26
27 private final ClassLoader loader;
28
29 public ClassLoaderResource(ClassLoader loader) {
30 this.loader = loader;
31 }
32
33 /**
34 * Processes a request for one (code 1) or many (code 2) class loader
35 * resources. The requested resources are sent preceded with a boolean
36 * <code>true</code> value. If the resource was not found (code 1) or
37 * when the last resource has been sent (code 2), a boolean
38 * <code>false</code> value is sent instead.
39 *
40 * @param name resource name
41 * @throws IOException if the resource could not be sent
42 */
43 public Throwable process(DataInputStream input, DataOutputStream output)
44 throws IOException {
45 byte type = input.readByte();
46 String name = input.readUTF();
47 if (type == 1) {
48 InputStream stream = loader.getResourceAsStream(name);
49 if (stream != null) {
50 output.writeBoolean(true);
51 writeAndCloseStream(output, stream);
52 } else {
53 output.writeBoolean(false);
54 }
55 } else if (type == 2) {
56 Enumeration<URL> resources = loader.getResources(name);
57 while (resources.hasMoreElements()) {
58 output.writeBoolean(true);
59 InputStream stream = resources.nextElement().openStream();
60 writeAndCloseStream(output, stream);
61 }
62 output.writeBoolean(false);
63 }
64 output.flush();
65 return null;
66 }
67
68 /**
69 * Sends the contents of the given input stream to the given output.
70 * The stream is sent in chunks of less than 64kB, each preceded by
71 * a 16-bit integer value that indicates the length of the following
72 * chunk. A zero short value is sent at the end to signify the end of
73 * the stream.
74 * <p>
75 * The stream is guaranteed to be closed by this method, regardless of
76 * the way it returns.
77 *
78 * @param stream the stream to be sent
79 * @throws IOException if the stream could not be sent
80 */
81 private void writeAndCloseStream(
82 DataOutputStream output, InputStream stream) throws IOException {
83 try {
84 byte[] buffer = new byte[0x10000 - 1];
85 int n;
86 while ((n = stream.read(buffer)) != -1) {
87 output.writeShort(n);
88 output.write(buffer, 0, n);
89 }
90 output.writeShort(0);
91 } finally {
92 stream.close();
93 }
94 }
95
96 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.fork;
17
18 import java.io.DataInputStream;
19 import java.io.DataOutputStream;
20 import java.io.IOException;
21
22 import org.xml.sax.Attributes;
23 import org.xml.sax.ContentHandler;
24 import org.xml.sax.Locator;
25 import org.xml.sax.SAXException;
26
27 class ContentHandlerProxy implements ContentHandler, ForkProxy {
28
29 public static final int START_DOCUMENT = 1;
30 public static final int END_DOCUMENT = 2;
31 public static final int START_PREFIX_MAPPING = 3;
32 public static final int END_PREFIX_MAPPING = 4;
33 public static final int START_ELEMENT = 5;
34 public static final int END_ELEMENT = 6;
35 public static final int CHARACTERS = 7;
36 public static final int IGNORABLE_WHITESPACE = 8;
37 public static final int PROCESSING_INSTRUCTION = 9;
38 public static final int SKIPPED_ENTITY = 10;
39
40 /** Serial version UID */
41 private static final long serialVersionUID = 737511106054617524L;
42
43 private final int resource;
44
45 private transient DataOutputStream output;
46
47 public ContentHandlerProxy(int resource) {
48 this.resource = resource;
49 }
50
51 public void init(DataInputStream input, DataOutputStream output) {
52 this.output = output;
53 }
54
55 private void sendRequest(int type) throws SAXException {
56 try {
57 output.writeByte(ForkServer.RESOURCE);
58 output.writeByte(resource);
59 output.writeByte(type);
60 } catch (IOException e) {
61 throw new SAXException("Unexpected fork proxy problem", e);
62 }
63 }
64
65 private void sendString(String string) throws SAXException {
66 try {
67 if (string != null) {
68 output.writeBoolean(true);
69 output.writeUTF(string);
70 } else {
71 output.writeBoolean(false);
72 }
73 } catch (IOException e) {
74 throw new SAXException("Unexpected fork proxy problem", e);
75 }
76 }
77
78 private void sendCharacters(char[] ch, int start, int length)
79 throws SAXException {
80 try {
81 output.writeInt(length);
82 for (int i = 0; i < length; i++) {
83 output.writeChar(ch[start + i]);
84 }
85 } catch (IOException e) {
86 throw new SAXException("Unexpected fork proxy problem", e);
87 }
88 }
89
90 private void doneSending() throws SAXException {
91 try {
92 output.flush();
93 } catch (IOException e) {
94 throw new SAXException("Unexpected fork proxy problem", e);
95 }
96 }
97
98 public void setDocumentLocator(Locator locator) {
99 // skip
100 }
101
102 public void startDocument() throws SAXException {
103 sendRequest(START_DOCUMENT);
104 doneSending();
105 }
106
107 public void endDocument() throws SAXException {
108 sendRequest(END_DOCUMENT);
109 doneSending();
110 }
111
112 public void startPrefixMapping(String prefix, String uri)
113 throws SAXException {
114 sendRequest(START_PREFIX_MAPPING);
115 sendString(prefix);
116 sendString(uri);
117 doneSending();
118 }
119
120 public void endPrefixMapping(String prefix) throws SAXException {
121 sendRequest(END_PREFIX_MAPPING);
122 sendString(prefix);
123 doneSending();
124 }
125
126 public void startElement(
127 String uri, String localName, String qName, Attributes atts)
128 throws SAXException {
129 sendRequest(START_ELEMENT);
130 sendString(uri);
131 sendString(localName);
132 sendString(qName);
133 int n = -1;
134 if (atts != null) {
135 n = atts.getLength();
136 }
137 try {
138 output.writeInt(n);
139 } catch (IOException e) {
140 throw new SAXException("Unexpected fork proxy problem", e);
141 }
142 for (int i = 0; i < n; i++) {
143 sendString(atts.getURI(i));
144 sendString(atts.getLocalName(i));
145 sendString(atts.getQName(i));
146 sendString(atts.getType(i));
147 sendString(atts.getValue(i));
148 }
149 doneSending();
150 }
151
152 public void endElement(String uri, String localName, String qName)
153 throws SAXException {
154 sendRequest(END_ELEMENT);
155 sendString(uri);
156 sendString(localName);
157 sendString(qName);
158 doneSending();
159 }
160
161 public void characters(char[] ch, int start, int length)
162 throws SAXException {
163 sendRequest(CHARACTERS);
164 sendCharacters(ch, start, length);
165 doneSending();
166 }
167
168 public void ignorableWhitespace(char[] ch, int start, int length)
169 throws SAXException {
170 sendRequest(IGNORABLE_WHITESPACE);
171 sendCharacters(ch, start, length);
172 doneSending();
173 }
174
175 public void processingInstruction(String target, String data)
176 throws SAXException {
177 sendRequest(PROCESSING_INSTRUCTION);
178 sendString(target);
179 sendString(data);
180 doneSending();
181 }
182
183 public void skippedEntity(String name) throws SAXException {
184 sendRequest(SKIPPED_ENTITY);
185 sendString(name);
186 doneSending();
187 }
188
189 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.fork;
17
18 import java.io.DataInputStream;
19 import java.io.DataOutputStream;
20 import java.io.IOException;
21
22 import org.xml.sax.ContentHandler;
23 import org.xml.sax.SAXException;
24 import org.xml.sax.helpers.AttributesImpl;
25
26 class ContentHandlerResource implements ForkResource {
27
28 private final ContentHandler handler;
29
30 public ContentHandlerResource(ContentHandler handler) {
31 this.handler = handler;
32 }
33
34 public Throwable process(DataInputStream input, DataOutputStream output)
35 throws IOException {
36 try {
37 internalProcess(input);
38 return null;
39 } catch (SAXException e) {
40 return e;
41 }
42 }
43
44 private void internalProcess(DataInputStream input)
45 throws IOException, SAXException {
46 int type = input.readUnsignedByte();
47 if (type == ContentHandlerProxy.START_DOCUMENT) {
48 handler.startDocument();
49 } else if (type == ContentHandlerProxy.END_DOCUMENT) {
50 handler.endDocument();
51 } else if (type == ContentHandlerProxy.START_PREFIX_MAPPING) {
52 handler.startPrefixMapping(readString(input), readString(input));
53 } else if (type == ContentHandlerProxy.END_PREFIX_MAPPING) {
54 handler.endPrefixMapping(readString(input));
55 } else if (type == ContentHandlerProxy.START_ELEMENT) {
56 String uri = readString(input);
57 String localName = readString(input);
58 String qName = readString(input);
59 AttributesImpl atts = null;
60 int n = input.readInt();
61 if (n >= 0) {
62 atts = new AttributesImpl();
63 for (int i = 0; i < n; i++) {
64 atts.addAttribute(
65 readString(input), readString(input),
66 readString(input), readString(input),
67 readString(input));
68 }
69 }
70 handler.startElement(uri, localName, qName, atts);
71 } else if (type == ContentHandlerProxy.END_ELEMENT) {
72 String uri = readString(input);
73 String localName = readString(input);
74 String qName = readString(input);
75 handler.endElement(uri, localName, qName);
76 } else if (type == ContentHandlerProxy.CHARACTERS) {
77 char[] ch = readCharacters(input);
78 handler.characters(ch, 0, ch.length);
79 } else if (type == ContentHandlerProxy.IGNORABLE_WHITESPACE) {
80 char[] ch = readCharacters(input);
81 handler.characters(ch, 0, ch.length);
82 } else if (type == ContentHandlerProxy.PROCESSING_INSTRUCTION) {
83 handler.processingInstruction(readString(input), readString(input));
84 } else if (type == ContentHandlerProxy.SKIPPED_ENTITY) {
85 handler.skippedEntity(readString(input));
86 }
87 }
88
89 private String readString(DataInputStream input) throws IOException {
90 if (input.readBoolean()) {
91 return input.readUTF();
92 } else {
93 return null;
94 }
95 }
96
97 private char[] readCharacters(DataInputStream input) throws IOException {
98 int n = input.readInt();
99 char[] ch = new char[n];
100 for (int i = 0; i < n; i++) {
101 ch[i] = input.readChar();
102 }
103 return ch;
104 }
105
106 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.fork;
17
18 import java.io.DataInputStream;
19 import java.io.DataOutputStream;
20 import java.io.File;
21 import java.io.FileOutputStream;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.io.NotSerializableException;
25 import java.util.ArrayList;
26 import java.util.Arrays;
27 import java.util.List;
28 import java.util.jar.JarEntry;
29 import java.util.jar.JarOutputStream;
30 import java.util.zip.ZipEntry;
31
32 import org.apache.tika.exception.TikaException;
33 import org.apache.tika.io.IOExceptionWithCause;
34 import org.apache.tika.io.IOUtils;
35 import org.xml.sax.ContentHandler;
36
37 class ForkClient {
38
39 private final List<ForkResource> resources = new ArrayList<ForkResource>();
40
41 private final ClassLoader loader;
42
43 private final File jar;
44
45 private final Process process;
46
47 private final DataOutputStream output;
48
49 private final DataInputStream input;
50
51 private final InputStream error;
52
53 public ForkClient(ClassLoader loader, Object object, String java)
54 throws IOException, TikaException {
55 boolean ok = false;
56 try {
57 this.loader = loader;
58 this.jar = createBootstrapJar();
59
60 ProcessBuilder builder = new ProcessBuilder();
61 List<String> command = new ArrayList<String>();
62 command.addAll(Arrays.asList(java.split("\\s+")));
63 command.add("-jar");
64 command.add(jar.getPath());
65 builder.command(command);
66 this.process = builder.start();
67
68 this.output = new DataOutputStream(process.getOutputStream());
69 this.input = new DataInputStream(process.getInputStream());
70 this.error = process.getErrorStream();
71
72 waitForStartBeacon();
73
74 sendObject(loader, resources);
75 sendObject(object, resources);
76
77 ok = true;
78 } finally {
79 if (!ok) {
80 close();
81 }
82 }
83 }
84
85 private void waitForStartBeacon() throws IOException {
86 while (true) {
87 consumeErrorStream();
88 int type = input.read();
89 if ((byte) type == ForkServer.READY) {
90 consumeErrorStream();
91 return;
92 }
93 }
94 }
95
96 public synchronized boolean ping() {
97 try {
98 output.writeByte(ForkServer.PING);
99 output.flush();
100 while (true) {
101 consumeErrorStream();
102 int type = input.read();
103 if (type == ForkServer.PING) {
104 consumeErrorStream();
105 return true;
106 } else {
107 return false;
108 }
109 }
110 } catch (IOException e) {
111 return false;
112 }
113 }
114
115
116 public synchronized Throwable call(String method, Object... args)
117 throws IOException, TikaException {
118 List<ForkResource> r = new ArrayList<ForkResource>(resources);
119 output.writeByte(ForkServer.CALL);
120 output.writeUTF(method);
121 for (int i = 0; i < args.length; i++) {
122 sendObject(args[i], r);
123 }
124 return waitForResponse(r);
125 }
126
127 /**
128 * Serializes the object first into an in-memory buffer and then
129 * writes it to the output stream with a preceding size integer.
130 *
131 * @param object object to be serialized
132 * @param resources list of fork resources, used when adding proxies
133 * @throws IOException if the object could not be serialized
134 */
135 private void sendObject(Object object, List<ForkResource> resources)
136 throws IOException, TikaException {
137 int n = resources.size();
138 if (object instanceof InputStream) {
139 resources.add(new InputStreamResource((InputStream) object));
140 object = new InputStreamProxy(n);
141 } else if (object instanceof ContentHandler) {
142 resources.add(new ContentHandlerResource((ContentHandler) object));
143 object = new ContentHandlerProxy(n);
144 } else if (object instanceof ClassLoader) {
145 resources.add(new ClassLoaderResource((ClassLoader) object));
146 object = new ClassLoaderProxy(n);
147 }
148
149 try {
150 ForkObjectInputStream.sendObject(object, output);
151 } catch(NotSerializableException nse) {
152 // Build a more friendly error message for this
153 throw new TikaException(
154 "Unable to serialize " + object.getClass().getSimpleName() +
155 " to pass to the Forked Parser", nse);
156 }
157
158 waitForResponse(resources);
159 }
160
161 public synchronized void close() {
162 try {
163 if (output != null) {
164 output.close();
165 }
166 if (input != null) {
167 input.close();
168 }
169 if (error != null) {
170 error.close();
171 }
172 } catch (IOException ignore) {
173 }
174 if (process != null) {
175 process.destroy();
176 }
177 if (jar != null) {
178 jar.delete();
179 }
180 }
181
182 private Throwable waitForResponse(List<ForkResource> resources)
183 throws IOException {
184 output.flush();
185 while (true) {
186 consumeErrorStream();
187 int type = input.read();
188 if (type == -1) {
189 consumeErrorStream();
190 throw new IOException(
191 "Lost connection to a forked server process");
192 } else if (type == ForkServer.RESOURCE) {
193 ForkResource resource =
194 resources.get(input.readUnsignedByte());
195 resource.process(input, output);
196 } else if ((byte) type == ForkServer.ERROR) {
197 try {
198 return (Throwable) ForkObjectInputStream.readObject(
199 input, loader);
200 } catch (ClassNotFoundException e) {
201 throw new IOExceptionWithCause(
202 "Unable to deserialize an exception", e);
203 }
204 } else {
205 return null;
206 }
207 }
208 }
209
210 /**
211 * Consumes all pending bytes from the standard error stream of the
212 * forked server process, and prints them out to the standard error
213 * stream of this process. This method should be called always before
214 * expecting some output from the server, to prevent the server from
215 * blocking due to a filled up pipe buffer of the error stream.
216 *
217 * @throws IOException if the error stream could not be read
218 */
219 private void consumeErrorStream() throws IOException {
220 int n;
221 while ((n = error.available()) > 0) {
222 byte[] b = new byte[n];
223 n = error.read(b);
224 if (n > 0) {
225 System.err.write(b, 0, n);
226 }
227 }
228 }
229
230 /**
231 * Creates a temporary jar file that can be used to bootstrap the forked
232 * server process. Remember to remove the file when no longer used.
233 *
234 * @return the created jar file
235 * @throws IOException if the bootstrap archive could not be created
236 */
237 private static File createBootstrapJar() throws IOException {
238 File file = File.createTempFile("apache-tika-fork-", ".jar");
239 boolean ok = false;
240 try {
241 fillBootstrapJar(file);
242 ok = true;
243 } finally {
244 if (!ok) {
245 file.delete();
246 }
247 }
248 return file;
249 }
250
251 /**
252 * Fills in the jar file used to bootstrap the forked server process.
253 * All the required <code>.class</code> files and a manifest with a
254 * <code>Main-Class</code> entry are written into the archive.
255 *
256 * @param file file to hold the bootstrap archive
257 * @throws IOException if the bootstrap archive could not be created
258 */
259 private static void fillBootstrapJar(File file) throws IOException {
260 JarOutputStream jar = new JarOutputStream(new FileOutputStream(file));
261 try {
262 String manifest =
263 "Main-Class: " + ForkServer.class.getName() + "\n";
264 jar.putNextEntry(new ZipEntry("META-INF/MANIFEST.MF"));
265 jar.write(manifest.getBytes("UTF-8"));
266
267 Class<?>[] bootstrap = {
268 ForkServer.class, ForkObjectInputStream.class,
269 ForkProxy.class, ClassLoaderProxy.class,
270 MemoryURLConnection.class,
271 MemoryURLStreamHandler.class,
272 MemoryURLStreamHandlerFactory.class,
273 MemoryURLStreamRecord.class
274 };
275 ClassLoader loader = ForkServer.class.getClassLoader();
276 for (Class<?> klass : bootstrap) {
277 String path = klass.getName().replace('.', '/') + ".class";
278 InputStream input = loader.getResourceAsStream(path);
279 try {
280 jar.putNextEntry(new JarEntry(path));
281 IOUtils.copy(input, jar);
282 } finally {
283 input.close();
284 }
285 }
286 } finally {
287 jar.close();
288 }
289 }
290
291 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.fork;
17
18 import java.io.ByteArrayInputStream;
19 import java.io.ByteArrayOutputStream;
20 import java.io.DataInputStream;
21 import java.io.DataOutputStream;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.io.ObjectInputStream;
25 import java.io.ObjectOutputStream;
26 import java.io.ObjectStreamClass;
27
28 /**
29 * An object input stream that uses a given class loader when deserializing
30 * objects.
31 * <p>
32 * Note that this functionality could easily be implemented as a simple
33 * anonymous {@link ObjectInputStream} subclass, but since the
34 * functionality is needed during the somewhat complicated bootstrapping
35 * of the stdin/out communication channel of a forked server process,
36 * it's better if class has a stable name that can be referenced at
37 * compile-time by the {@link ForkClient} class.
38 */
class ForkObjectInputStream extends ObjectInputStream {

    /** The class loader used when deserializing objects. */
    private final ClassLoader loader;

    /**
     * Creates a new object input stream that uses the given class loader
     * when deserializing objects.
     *
     * @param input underlying input stream
     * @param loader class loader used when deserializing objects
     * @throws IOException if this stream could not be initiated
     */
    public ForkObjectInputStream(InputStream input, ClassLoader loader)
            throws IOException {
        super(input);
        this.loader = loader;
    }

    /**
     * Loads the identified class from the specified class loader. Names of
     * primitive types (and <code>void</code>), which
     * <code>Class.forName</code> cannot resolve, are mapped to their class
     * objects so that serialized primitive <code>Class</code> instances
     * can be read back.
     *
     * @param desc class description
     * @return class loaded class
     * @throws ClassNotFoundException if the class can not be found
     */
    @Override
    protected Class<?> resolveClass(ObjectStreamClass desc)
            throws ClassNotFoundException {
        String name = desc.getName();
        try {
            return Class.forName(name, false, loader);
        } catch (ClassNotFoundException e) {
            Class<?> primitive = primitiveType(name);
            if (primitive == null) {
                throw e;
            }
            return primitive;
        }
    }

    /**
     * Returns the class object of the named primitive type.
     *
     * @param name type name as found in a class descriptor
     * @return the primitive class, or <code>null</code> if the name does
     *         not denote a primitive type or <code>void</code>
     */
    private static Class<?> primitiveType(String name) {
        if ("boolean".equals(name)) {
            return boolean.class;
        } else if ("byte".equals(name)) {
            return byte.class;
        } else if ("char".equals(name)) {
            return char.class;
        } else if ("short".equals(name)) {
            return short.class;
        } else if ("int".equals(name)) {
            return int.class;
        } else if ("long".equals(name)) {
            return long.class;
        } else if ("float".equals(name)) {
            return float.class;
        } else if ("double".equals(name)) {
            return double.class;
        } else if ("void".equals(name)) {
            return void.class;
        }
        return null;
    }

    /**
     * Serializes the object first into an in-memory buffer and then
     * writes it to the output stream with a preceding size integer.
     * Nothing is written to the output stream if serialization fails.
     *
     * @param object object to be serialized
     * @param output output stream
     * @throws IOException if the object could not be serialized
     */
    public static void sendObject(Object object, DataOutputStream output)
            throws IOException {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        ObjectOutputStream serializer = new ObjectOutputStream(buffer);
        serializer.writeObject(object);
        serializer.close();

        byte[] data = buffer.toByteArray();
        output.writeInt(data.length);
        output.write(data);
    }

    /**
     * Deserializes an object from the given stream. The serialized object
     * is expected to be preceded by a size integer, that is used for reading
     * the entire serialization into a memory before deserializing it.
     *
     * @param input input stream from which the serialized object is read
     * @param loader class loader to be used for loading referenced classes
     * @return the deserialized object
     * @throws IOException if the object could not be deserialized, or the
     *                     size integer is negative
     * @throws ClassNotFoundException if a referenced class is not found
     */
    public static Object readObject(DataInputStream input, ClassLoader loader)
            throws IOException, ClassNotFoundException {
        int n = input.readInt();
        if (n < 0) {
            // A negative size would otherwise surface as an unchecked
            // NegativeArraySizeException from the allocation below.
            throw new IOException("Invalid serialized object size: " + n);
        }
        byte[] data = new byte[n];
        input.readFully(data);

        ObjectInputStream deserializer =
                new ForkObjectInputStream(new ByteArrayInputStream(data), loader);
        return deserializer.readObject();
    }

}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.fork;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.util.LinkedList;
21 import java.util.Queue;
22 import java.util.Set;
23
24 import org.apache.tika.exception.TikaException;
25 import org.apache.tika.metadata.Metadata;
26 import org.apache.tika.mime.MediaType;
27 import org.apache.tika.parser.AbstractParser;
28 import org.apache.tika.parser.AutoDetectParser;
29 import org.apache.tika.parser.ParseContext;
30 import org.apache.tika.parser.Parser;
31 import org.apache.tika.sax.TeeContentHandler;
32 import org.xml.sax.ContentHandler;
33 import org.xml.sax.SAXException;
34
35 public class ForkParser extends AbstractParser {
36
37 /** Serial version UID */
38 private static final long serialVersionUID = -4962742892274663950L;
39
40 private final ClassLoader loader;
41
42 private final Parser parser;
43
44 /** Java command line */
45 private String java = "java -Xmx32m";
46
47 /** Process pool size */
48 private int poolSize = 5;
49
50 private int currentlyInUse = 0;
51
52 private final Queue<ForkClient> pool =
53 new LinkedList<ForkClient>();
54
55 /**
56 * @param loader The ClassLoader to use
57 * @param parser the parser to delegate to. This one cannot be another ForkParser
58 */
59 public ForkParser(ClassLoader loader, Parser parser) {
60 if (parser instanceof ForkParser) {
61 throw new IllegalArgumentException("The underlying parser of a ForkParser should not be a ForkParser, but a specific implementation.");
62 }
63 this.loader = loader;
64 this.parser = parser;
65 }
66
67 public ForkParser(ClassLoader loader) {
68 this(loader, new AutoDetectParser());
69 }
70
71 public ForkParser() {
72 this(ForkParser.class.getClassLoader());
73 }
74
75 /**
76 * Returns the size of the process pool.
77 *
78 * @return process pool size
79 */
80 public synchronized int getPoolSize() {
81 return poolSize;
82 }
83
84 /**
85 * Sets the size of the process pool.
86 *
87 * @param poolSize process pool size
88 */
89 public synchronized void setPoolSize(int poolSize) {
90 this.poolSize = poolSize;
91 }
92
93 /**
94 * Returns the command used to start the forked server process.
95 *
96 * @return java command line
97 */
98 public String getJavaCommand() {
99 return java;
100 }
101
102 /**
103 * Sets the command used to start the forked server process.
104 * The given command line is split on whitespace and the arguments
105 * "-jar" and "/path/to/bootstrap.jar" are appended to it when starting
106 * the process. The default setting is "java -Xmx32m".
107 *
108 * @param java java command line
109 */
110 public void setJavaCommand(String java) {
111 this.java = java;
112 }
113
114 public Set<MediaType> getSupportedTypes(ParseContext context) {
115 return parser.getSupportedTypes(context);
116 }
117
118 public void parse(
119 InputStream stream, ContentHandler handler,
120 Metadata metadata, ParseContext context)
121 throws IOException, SAXException, TikaException {
122 if (stream == null) {
123 throw new NullPointerException("null stream");
124 }
125
126 Throwable t;
127
128 boolean alive = false;
129 ForkClient client = acquireClient();
130 try {
131 ContentHandler tee = new TeeContentHandler(
132 handler, new MetadataContentHandler(metadata));
133 t = client.call("parse", stream, tee, metadata, context);
134 alive = true;
135 } catch (TikaException te) {
136 // Problem occurred on our side
137 alive = true;
138 throw te;
139 } catch (IOException e) {
140 // Problem occurred on the other side
141 throw new TikaException(
142 "Failed to communicate with a forked parser process."
143 + " The process has most likely crashed due to some error"
144 + " like running out of memory. A new process will be"
145 + " started for the next parsing request.", e);
146 } finally {
147 releaseClient(client, alive);
148 }
149
150 if (t instanceof IOException) {
151 throw (IOException) t;
152 } else if (t instanceof SAXException) {
153 throw (SAXException) t;
154 } else if (t instanceof TikaException) {
155 throw (TikaException) t;
156 } else if (t != null) {
157 throw new TikaException(
158 "Unexpected error in forked server process", t);
159 }
160 }
161
162 public synchronized void close() {
163 for (ForkClient client : pool) {
164 client.close();
165 }
166 pool.clear();
167 poolSize = 0;
168 }
169
170 private synchronized ForkClient acquireClient()
171 throws IOException, TikaException {
172 while (true) {
173 ForkClient client = pool.poll();
174
175 // Create a new process if there's room in the pool
176 if (client == null && currentlyInUse < poolSize) {
177 client = new ForkClient(loader, parser, java);
178 }
179
180 // Ping the process, and get rid of it if it's inactive
181 if (client != null && !client.ping()) {
182 client.close();
183 client = null;
184 }
185
186 if (client != null) {
187 currentlyInUse++;
188 return client;
189 } else if (currentlyInUse >= poolSize) {
190 try {
191 wait();
192 } catch (InterruptedException e) {
193 throw new TikaException(
194 "Interrupted while waiting for a fork parser", e);
195 }
196 }
197 }
198 }
199
200 private synchronized void releaseClient(ForkClient client, boolean alive) {
201 currentlyInUse--;
202 if (currentlyInUse + pool.size() < poolSize && alive) {
203 pool.offer(client);
204 notifyAll();
205 } else {
206 client.close();
207 }
208 }
209
210 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.fork;
17
18 import java.io.DataInputStream;
19 import java.io.DataOutputStream;
20 import java.io.Serializable;
21
/**
 * Serializable stand-in for an object that cannot itself be sent to the
 * other side of a fork connection. After a proxy has been deserialized,
 * the receiver hands it the communication streams through {@link #init}.
 */
public interface ForkProxy extends Serializable {

    /**
     * Gives this proxy the streams of the fork connection.
     *
     * @param input stream for reading from the other process
     * @param output stream for writing to the other process
     */
    void init(DataInputStream input, DataOutputStream output);

}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.fork;
17
18 import java.io.DataInputStream;
19 import java.io.DataOutputStream;
20
21 import java.io.IOException;
22
/**
 * A local object that the other side of a fork connection can address by
 * index and send requests to.
 */
public interface ForkResource {

    /**
     * Handles one request addressed to this resource.
     *
     * @param input stream from which the request is read
     * @param output stream to which the answer is written
     * @return a throwable describing a failure, or <code>null</code>
     * @throws IOException if the streams could not be read or written
     */
    Throwable process(DataInputStream input, DataOutputStream output)
            throws IOException;

}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.fork;
17
18 import java.io.ByteArrayInputStream;
19 import java.io.DataInputStream;
20 import java.io.DataOutputStream;
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.io.NotSerializableException;
24 import java.io.OutputStream;
25 import java.lang.reflect.InvocationTargetException;
26 import java.lang.reflect.Method;
27 import java.net.URL;
28 import java.util.zip.CheckedInputStream;
29 import java.util.zip.CheckedOutputStream;
30 import java.util.zip.Checksum;
31
32 import org.apache.tika.exception.TikaException;
33
34 class ForkServer implements Runnable, Checksum {
35
36 public static final byte ERROR = -1;
37
38 public static final byte DONE = 0;
39
40 public static final byte CALL = 1;
41
42 public static final byte PING = 2;
43
44 public static final byte RESOURCE = 3;
45
46 public static final byte READY = 4;
47
48 /**
49 * Starts a forked server process using the standard input and output
50 * streams for communication with the parent process. Any attempts by
51 * stray code to read from standard input or write to standard output
52 * is redirected to avoid interfering with the communication channel.
53 *
54 * @param args command line arguments, ignored
55 * @throws Exception if the server could not be started
56 */
57 public static void main(String[] args) throws Exception {
58 URL.setURLStreamHandlerFactory(new MemoryURLStreamHandlerFactory());
59
60 ForkServer server = new ForkServer(System.in, System.out);
61 System.setIn(new ByteArrayInputStream(new byte[0]));
62 System.setOut(System.err);
63
64 Thread watchdog = new Thread(server, "Tika Watchdog");
65 watchdog.setDaemon(true);
66 watchdog.start();
67
68 server.processRequests();
69 }
70
71 /** Input stream for reading from the parent process */
72 private final DataInputStream input;
73
74 /** Output stream for writing to the parent process */
75 private final DataOutputStream output;
76
77 private volatile boolean active = true;
78
79 /**
80 * Sets up a forked server instance using the given stdin/out
81 * communication channel.
82 *
83 * @param input input stream for reading from the parent process
84 * @param output output stream for writing to the parent process
85 * @throws IOException if the server instance could not be created
86 */
87 public ForkServer(InputStream input, OutputStream output)
88 throws IOException {
89 this.input =
90 new DataInputStream(new CheckedInputStream(input, this));
91 this.output =
92 new DataOutputStream(new CheckedOutputStream(output, this));
93 }
94
95 public void run() {
96 try {
97 while (active) {
98 active = false;
99 Thread.sleep(5000);
100 }
101 System.exit(0);
102 } catch (InterruptedException e) {
103 }
104 }
105
106 public void processRequests() {
107 try {
108 output.writeByte(READY);
109 output.flush();
110
111 ClassLoader loader = (ClassLoader) readObject(
112 ForkServer.class.getClassLoader());
113 Thread.currentThread().setContextClassLoader(loader);
114
115 Object object = readObject(loader);
116 while (true) {
117 int request = input.read();
118 if (request == -1) {
119 break;
120 } else if (request == PING) {
121 output.writeByte(PING);
122 } else if (request == CALL) {
123 call(loader, object);
124 } else {
125 throw new IllegalStateException("Unexpected request");
126 }
127 output.flush();
128 }
129 } catch (Throwable t) {
130 t.printStackTrace();
131 }
132 System.err.flush();
133 }
134
135 private void call(ClassLoader loader, Object object) throws Exception {
136 Method method = getMethod(object, input.readUTF());
137 Object[] args =
138 new Object[method.getParameterTypes().length];
139 for (int i = 0; i < args.length; i++) {
140 args[i] = readObject(loader);
141 }
142 try {
143 method.invoke(object, args);
144 output.write(DONE);
145 } catch (InvocationTargetException e) {
146 output.write(ERROR);
147
148 // Try to send the underlying Exception itself
149 Throwable toSend = e.getCause();
150 try {
151 ForkObjectInputStream.sendObject(toSend, output);
152 } catch (NotSerializableException nse) {
153 // Need to build a serializable version of it
154 TikaException te = new TikaException( toSend.getMessage() );
155 te.setStackTrace( toSend.getStackTrace() );
156 ForkObjectInputStream.sendObject(te, output);
157 }
158 }
159 }
160
161 private Method getMethod(Object object, String name) {
162 Class<?> klass = object.getClass();
163 while (klass != null) {
164 for (Class<?> iface : klass.getInterfaces()) {
165 for (Method method : iface.getMethods()) {
166 if (name.equals(method.getName())) {
167 return method;
168 }
169 }
170 }
171 klass = klass.getSuperclass();
172 }
173 return null;
174 }
175
176 /**
177 * Deserializes an object from the given stream. The serialized object
178 * is expected to be preceded by a size integer, that is used for reading
179 * the entire serialization into a memory before deserializing it.
180 *
181 * @param input input stream from which the serialized object is read
182 * @param loader class loader to be used for loading referenced classes
183 * @throws IOException if the object could not be deserialized
184 * @throws ClassNotFoundException if a referenced class is not found
185 */
186 private Object readObject(ClassLoader loader)
187 throws IOException, ClassNotFoundException {
188 Object object = ForkObjectInputStream.readObject(input, loader);
189 if (object instanceof ForkProxy) {
190 ((ForkProxy) object).init(input, output);
191 }
192
193 // Tell the parent process that we successfully received this object
194 output.writeByte(ForkServer.DONE);
195 output.flush();
196
197 return object;
198 }
199
200 //------------------------------------------------------------< Checksum >
201
202 public void update(int b) {
203 active = true;
204 }
205
206 public void update(byte[] b, int off, int len) {
207 active = true;
208 }
209
210 public long getValue() {
211 return 0;
212 }
213
214 public void reset() {
215 }
216
217 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.fork;
17
18 import java.io.DataInputStream;
19 import java.io.DataOutputStream;
20 import java.io.IOException;
21 import java.io.InputStream;
22
23 class InputStreamProxy extends InputStream implements ForkProxy {
24
25 /** Serial version UID */
26 private static final long serialVersionUID = 4350939227765568438L;
27
28 private final int resource;
29
30 private transient DataInputStream input;
31
32 private transient DataOutputStream output;
33
34 public InputStreamProxy(int resource) {
35 this.resource = resource;
36 }
37
38 public void init(DataInputStream input, DataOutputStream output) {
39 this.input = input;
40 this.output = output;
41 }
42
43 @Override
44 public int read() throws IOException {
45 output.writeByte(ForkServer.RESOURCE);
46 output.writeByte(resource);
47 output.writeInt(1);
48 output.flush();
49 int n = input.readInt();
50 if (n == 1) {
51 return input.readUnsignedByte();
52 } else {
53 return n;
54 }
55 }
56
57 @Override
58 public int read(byte[] b, int off, int len) throws IOException {
59 output.writeByte(ForkServer.RESOURCE);
60 output.writeByte(resource);
61 output.writeInt(len);
62 output.flush();
63 int n = input.readInt();
64 if (n > 0) {
65 input.readFully(b, off, n);
66 }
67 return n;
68 }
69
70 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.fork;
17
18 import java.io.DataInputStream;
19 import java.io.DataOutputStream;
20 import java.io.IOException;
21 import java.io.InputStream;
22
23 class InputStreamResource implements ForkResource {
24
25 private final InputStream stream;
26
27 public InputStreamResource(InputStream stream) {
28 this.stream = stream;
29 }
30
31 public Throwable process(DataInputStream input, DataOutputStream output)
32 throws IOException {
33 int n = input.readInt();
34 byte[] buffer = new byte[n];
35 int m;
36 try {
37 m = stream.read(buffer);
38 } catch (IOException e) {
39 return e;
40 }
41 output.writeInt(m);
42 if (m > 0) {
43 output.write(buffer, 0, m);
44 }
45 output.flush();
46 return null;
47 }
48
49 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.fork;
17
18 import java.io.ByteArrayInputStream;
19 import java.io.InputStream;
20 import java.net.URL;
21 import java.net.URLConnection;
22
/**
 * URL connection whose content is a byte array held in memory.
 */
class MemoryURLConnection extends URLConnection {

    /** Content served by {@link #getInputStream()}; not copied. */
    private final byte[] data;

    MemoryURLConnection(URL url, byte[] data) {
        super(url);
        this.data = data;
    }

    /** Does nothing: there is nothing to connect to. */
    @Override
    public void connect() {
        // intentionally empty
    }

    /**
     * @return a new stream over the in-memory content on every call
     */
    @Override
    public InputStream getInputStream() {
        InputStream content = new ByteArrayInputStream(data);
        return content;
    }

}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.fork;
17
18 import java.io.IOException;
19 import java.lang.ref.WeakReference;
20 import java.net.MalformedURLException;
21 import java.net.URL;
22 import java.net.URLConnection;
23 import java.net.URLStreamHandler;
24 import java.util.Iterator;
25 import java.util.LinkedList;
26 import java.util.List;
27 import java.util.concurrent.atomic.AtomicInteger;
28
29 class MemoryURLStreamHandler extends URLStreamHandler {
30
31 private static final AtomicInteger counter = new AtomicInteger();
32
33 private static final List<MemoryURLStreamRecord> records =
34 new LinkedList<MemoryURLStreamRecord>();
35
36 public static URL createURL(byte[] data) {
37 try {
38 int i = counter.incrementAndGet();
39 URL url = new URL("tika-in-memory", "localhost", "/" + i);
40
41 MemoryURLStreamRecord record = new MemoryURLStreamRecord();
42 record.url = new WeakReference<URL>(url);
43 record.data = data;
44 records.add(record);
45
46 return url;
47 } catch (MalformedURLException e) {
48 throw new RuntimeException(e);
49 }
50 }
51
52 @Override
53 protected URLConnection openConnection(URL u) throws IOException {
54 Iterator<MemoryURLStreamRecord> iterator = records.iterator();
55 while (iterator.hasNext()) {
56 MemoryURLStreamRecord record = iterator.next();
57 URL url = record.url.get();
58 if (url == null) {
59 iterator.remove();
60 } else if (url == u) {
61 return new MemoryURLConnection(u, record.data);
62 }
63 }
64 throw new IOException("Unknown URL: " + u);
65 }
66
67 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.fork;
17
18 import java.net.URLStreamHandler;
19 import java.net.URLStreamHandlerFactory;
20
21 class MemoryURLStreamHandlerFactory implements URLStreamHandlerFactory {
22
23 public URLStreamHandler createURLStreamHandler(String protocol) {
24 if ("tika-in-memory".equals(protocol)) {
25 return new MemoryURLStreamHandler();
26 } else {
27 return null;
28 }
29 }
30
31 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.fork;
17
18 import java.lang.ref.WeakReference;
19 import java.net.URL;
20
/**
 * Plain holder pairing a weakly referenced URL with a byte array.
 */
class MemoryURLStreamRecord {

    /** The bytes associated with {@link #url}. */
    public byte[] data;

    /** Weak reference to the URL this record belongs to. */
    public WeakReference<URL> url;

}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.fork;
17
18 import org.apache.tika.metadata.Metadata;
19 import org.xml.sax.Attributes;
20 import org.xml.sax.SAXException;
21 import org.xml.sax.helpers.DefaultHandler;
22
23 class MetadataContentHandler extends DefaultHandler {
24
25 private final Metadata metadata;
26
27 public MetadataContentHandler(Metadata metadata) {
28 this.metadata = metadata;
29 }
30
31 public void startElement(
32 String uri, String local, String name, Attributes attributes)
33 throws SAXException {
34 if ("meta".equals(local)) {
35 String aname = attributes.getValue("name");
36 String content = attributes.getValue("content");
37 metadata.add(aname, content);
38 }
39 }
40
41 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 /**
18 * Forked parser.
19 */
20 @aQute.bnd.annotation.Version("1.0.0")
21 package org.apache.tika.fork;
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.io;
17
18 import java.io.InputStream;
19
20 /**
21 * Proxy stream that prevents the underlying input stream from being closed.
22 * <p>
23 * This class is typically used in cases where an input stream needs to be
24 * passed to a component that wants to explicitly close the stream even if
25 * more input would still be available to other components.
26 *
27 * @since Apache Tika 0.4, copied from Commons IO 1.4
28 */
29 public class CloseShieldInputStream extends ProxyInputStream {
30
31 /**
32 * Creates a proxy that shields the given input stream from being
33 * closed.
34 *
35 * @param in underlying input stream
36 */
37 public CloseShieldInputStream(InputStream in) {
38 super(in);
39 }
40
41 /**
42 * Replaces the underlying input stream with a {@link ClosedInputStream}
43 * sentinel. The original input stream will remain open, but this proxy
44 * will appear closed.
45 */
46 @Override
47 public void close() {
48 in = new ClosedInputStream();
49 }
50
51 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.io;
17
18 import java.io.InputStream;
19
20 /**
21 * Closed input stream. This stream returns -1 to all attempts to read
22 * something from the stream.
23 * <p>
24 * Typical uses of this class include testing for corner cases in methods
25 * that accept input streams and acting as a sentinel value instead of a
26 * <code>null</code> input stream.
27 *
28 * @since Apache Tika 0.4, copied from Commons IO 1.4
29 */
public class ClosedInputStream extends InputStream {

    /** Value reported for every single-byte read. */
    private static final int END_OF_STREAM = -1;

    /**
     * Reports end of stream; no data is ever produced.
     *
     * @return always -1
     */
    @Override
    public int read() {
        return END_OF_STREAM;
    }

}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.io;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20
21 /**
22 * A decorating input stream that counts the number of bytes that have passed
23 * through the stream so far.
24 * <p>
25 * A typical use case would be during debugging, to ensure that data is being
26 * read as expected.
27 *
28 * @author Marcelo Liberato
29 * @since Apache Tika 0.4, copied from Commons IO 1.4
30 */
31 public class CountingInputStream extends ProxyInputStream {
32
33 /** The count of bytes that have passed. */
34 private long count;
35
36 /**
37 * Constructs a new CountingInputStream.
38 *
39 * @param in the InputStream to delegate to
40 */
41 public CountingInputStream(InputStream in) {
42 super(in);
43 }
44
45 //-----------------------------------------------------------------------
46 /**
47 * Reads a number of bytes into the byte array, keeping count of the
48 * number read.
49 *
50 * @param b the buffer into which the data is read, not null
51 * @return the total number of bytes read into the buffer, -1 if end of stream
52 * @throws IOException if an I/O error occurs
53 * @see java.io.InputStream#read(byte[])
54 */
55 @Override
56 public int read(byte[] b) throws IOException {
57 int found = super.read(b);
58 this.count += (found >= 0) ? found : 0;
59 return found;
60 }
61
62 /**
63 * Reads a number of bytes into the byte array at a specific offset,
64 * keeping count of the number read.
65 *
66 * @param b the buffer into which the data is read, not null
67 * @param off the start offset in the buffer
68 * @param len the maximum number of bytes to read
69 * @return the total number of bytes read into the buffer, -1 if end of stream
70 * @throws IOException if an I/O error occurs
71 * @see java.io.InputStream#read(byte[], int, int)
72 */
73 @Override
74 public int read(byte[] b, int off, int len) throws IOException {
75 int found = super.read(b, off, len);
76 this.count += (found >= 0) ? found : 0;
77 return found;
78 }
79
80 /**
81 * Reads the next byte of data adding to the count of bytes received
82 * if a byte is successfully read.
83 *
84 * @return the byte read, -1 if end of stream
85 * @throws IOException if an I/O error occurs
86 * @see java.io.InputStream#read()
87 */
88 @Override
89 public int read() throws IOException {
90 int found = super.read();
91 this.count += (found >= 0) ? 1 : 0;
92 return found;
93 }
94
95 /**
96 * Skips the stream over the specified number of bytes, adding the skipped
97 * amount to the count.
98 *
99 * @param length the number of bytes to skip
100 * @return the actual number of bytes skipped
101 * @throws IOException if an I/O error occurs
102 * @see java.io.InputStream#skip(long)
103 */
104 @Override
105 public long skip(final long length) throws IOException {
106 final long skip = super.skip(length);
107 this.count += skip;
108 return skip;
109 }
110
111 //-----------------------------------------------------------------------
112 /**
113 * The number of bytes that have passed through this stream.
114 * <p>
115 * NOTE: From v1.3 this method throws an ArithmeticException if the
116 * count is greater than can be expressed by an <code>int</code>.
117 * See {@link #getByteCount()} for a method using a <code>long</code>.
118 *
119 * @return the number of bytes accumulated
120 * @throws ArithmeticException if the byte count is too large
121 */
122 public synchronized int getCount() {
123 long result = getByteCount();
124 if (result > Integer.MAX_VALUE) {
125 throw new ArithmeticException("The byte count " + result + " is too large to be converted to an int");
126 }
127 return (int) result;
128 }
129
130 /**
131 * Set the byte count back to 0.
132 * <p>
133 * NOTE: From v1.3 this method throws an ArithmeticException if the
134 * count is greater than can be expressed by an <code>int</code>.
135 * See {@link #resetByteCount()} for a method using a <code>long</code>.
136 *
137 * @return the count previous to resetting
138 * @throws ArithmeticException if the byte count is too large
139 */
140 public synchronized int resetCount() {
141 long result = resetByteCount();
142 if (result > Integer.MAX_VALUE) {
143 throw new ArithmeticException("The byte count " + result + " is too large to be converted to an int");
144 }
145 return (int) result;
146 }
147
148 /**
149 * The number of bytes that have passed through this stream.
150 * <p>
151 * NOTE: This method is an alternative for <code>getCount()</code>
152 * and was added because that method returns an integer which will
153 * result in incorrect count for files over 2GB.
154 *
155 * @return the number of bytes accumulated
156 * @since Commons IO 1.3
157 */
158 public synchronized long getByteCount() {
159 return this.count;
160 }
161
162 /**
163 * Set the byte count back to 0.
164 * <p>
165 * NOTE: This method is an alternative for <code>resetCount()</code>
166 * and was added because that method returns an integer which will
167 * result in incorrect count for files over 2GB.
168 *
169 * @return the count previous to resetting
170 * @since Commons IO 1.3
171 */
172 public synchronized long resetByteCount() {
173 long tmp = this.count;
174 this.count = 0;
175 return tmp;
176 }
177
178 public String toString() {
179 return "Tika Counting InputStream wrapping " + in.toString();
180 }
181 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.io;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20
21 import org.apache.tika.exception.TikaException;
22
23 /**
24 * General Endian Related Utilities.
25 * <p>
26 * This class provides static utility methods for input/output operations
27 * on numbers in Big and Little Endian formats.
28 * <p>
29 * Origin of code: Based on the version in POI
30 */
31 public class EndianUtils {
32 /**
33 * Get a LE short value from an InputStream
34 *
35 * @param stream the InputStream from which the short is to be read
36 * @return the short (16-bit) value
37 * @exception IOException will be propagated back to the caller
38 * @exception BufferUnderrunException if the stream cannot provide enough bytes
39 */
40 public static short readShortLE(InputStream stream) throws IOException, BufferUnderrunException {
41 return (short) readUShortLE(stream);
42 }
43 /**
44 * Get a BE short value from an InputStream
45 *
46 * @param stream the InputStream from which the short is to be read
47 * @return the short (16-bit) value
48 * @exception IOException will be propagated back to the caller
49 * @exception BufferUnderrunException if the stream cannot provide enough bytes
50 */
51 public static short readShortBE(InputStream stream) throws IOException, BufferUnderrunException {
52 return (short) readUShortBE(stream);
53 }
54
55 public static int readUShortLE(InputStream stream) throws IOException, BufferUnderrunException {
56 int ch1 = stream.read();
57 int ch2 = stream.read();
58 if ((ch1 | ch2) < 0) {
59 throw new BufferUnderrunException();
60 }
61 return (ch2 << 8) + (ch1 << 0);
62 }
63 public static int readUShortBE(InputStream stream) throws IOException, BufferUnderrunException {
64 int ch1 = stream.read();
65 int ch2 = stream.read();
66 if ((ch1 | ch2) < 0) {
67 throw new BufferUnderrunException();
68 }
69 return (ch1 << 8) + (ch2 << 0);
70 }
71
72 /**
73 * Get a LE int value from an InputStream
74 *
75 * @param stream the InputStream from which the int is to be read
76 * @return the int (32-bit) value
77 * @exception IOException will be propagated back to the caller
78 * @exception BufferUnderrunException if the stream cannot provide enough bytes
79 */
80 public static int readIntLE(InputStream stream) throws IOException, BufferUnderrunException {
81 int ch1 = stream.read();
82 int ch2 = stream.read();
83 int ch3 = stream.read();
84 int ch4 = stream.read();
85 if ((ch1 | ch2 | ch3 | ch4) < 0) {
86 throw new BufferUnderrunException();
87 }
88 return (ch4 << 24) + (ch3<<16) + (ch2 << 8) + (ch1 << 0);
89 }
90 /**
91 * Get a BE int value from an InputStream
92 *
93 * @param stream the InputStream from which the int is to be read
94 * @return the int (32-bit) value
95 * @exception IOException will be propagated back to the caller
96 * @exception BufferUnderrunException if the stream cannot provide enough bytes
97 */
98 public static int readIntBE(InputStream stream) throws IOException, BufferUnderrunException {
99 int ch1 = stream.read();
100 int ch2 = stream.read();
101 int ch3 = stream.read();
102 int ch4 = stream.read();
103 if ((ch1 | ch2 | ch3 | ch4) < 0) {
104 throw new BufferUnderrunException();
105 }
106 return (ch1 << 24) + (ch2<<16) + (ch3 << 8) + (ch4 << 0);
107 }
108
109 /**
110 * Get a LE long value from an InputStream
111 *
112 * @param stream the InputStream from which the long is to be read
113 * @return the long (64-bit) value
114 * @exception IOException will be propagated back to the caller
115 * @exception BufferUnderrunException if the stream cannot provide enough bytes
116 */
117 public static long readLongLE(InputStream stream) throws IOException, BufferUnderrunException {
118 int ch1 = stream.read();
119 int ch2 = stream.read();
120 int ch3 = stream.read();
121 int ch4 = stream.read();
122 int ch5 = stream.read();
123 int ch6 = stream.read();
124 int ch7 = stream.read();
125 int ch8 = stream.read();
126 if ((ch1 | ch2 | ch3 | ch4 | ch5 | ch6 | ch7 | ch8) < 0) {
127 throw new BufferUnderrunException();
128 }
129
130 return
131 ((long)ch8 << 56) +
132 ((long)ch7 << 48) +
133 ((long)ch6 << 40) +
134 ((long)ch5 << 32) +
135 ((long)ch4 << 24) + // cast to long to preserve bit 31 (sign bit for ints)
136 (ch3 << 16) +
137 (ch2 << 8) +
138 (ch1 << 0);
139 }
140 /**
141 * Get a NE long value from an InputStream
142 *
143 * @param stream the InputStream from which the long is to be read
144 * @return the long (64-bit) value
145 * @exception IOException will be propagated back to the caller
146 * @exception BufferUnderrunException if the stream cannot provide enough bytes
147 */
148 public static long readLongBE(InputStream stream) throws IOException, BufferUnderrunException {
149 int ch1 = stream.read();
150 int ch2 = stream.read();
151 int ch3 = stream.read();
152 int ch4 = stream.read();
153 int ch5 = stream.read();
154 int ch6 = stream.read();
155 int ch7 = stream.read();
156 int ch8 = stream.read();
157 if ((ch1 | ch2 | ch3 | ch4 | ch5 | ch6 | ch7 | ch8) < 0) {
158 throw new BufferUnderrunException();
159 }
160
161 return
162 ((long)ch1 << 56) +
163 ((long)ch2 << 48) +
164 ((long)ch3 << 40) +
165 ((long)ch4 << 32) +
166 ((long)ch5 << 24) + // cast to long to preserve bit 31 (sign bit for ints)
167 (ch6 << 16) +
168 (ch7 << 8) +
169 (ch8 << 0);
170 }
171
172
173 /**
174 * Get a LE short value from the beginning of a byte array
175 *
176 *@param data the byte array
177 *@return the short (16-bit) value
178 */
179 public static short getShortLE(byte[] data) {
180 return getShortLE(data, 0);
181 }
182 /**
183 * Get a LE short value from a byte array
184 *
185 *@param data the byte array
186 *@param offset a starting offset into the byte array
187 *@return the short (16-bit) value
188 */
189 public static short getShortLE(byte[] data, int offset) {
190 return (short)getUShortLE(data, offset);
191 }
192
193 /**
194 * Get a LE unsigned short value from the beginning of a byte array
195 *
196 *@param data the byte array
197 *@return the unsigned short (16-bit) value in an int
198 */
199 public static int getUShortLE(byte[] data) {
200 return getUShortLE(data, 0);
201 }
202 /**
203 * Get a LE unsigned short value from a byte array
204 *
205 *@param data the byte array
206 *@param offset a starting offset into the byte array
207 *@return the unsigned short (16-bit) value in an integer
208 */
209 public static int getUShortLE(byte[] data, int offset) {
210 int b0 = data[offset] & 0xFF;
211 int b1 = data[offset+1] & 0xFF;
212 return (b1 << 8) + (b0 << 0);
213 }
214
215 /**
216 * Get a BE short value from the beginning of a byte array
217 *
218 *@param data the byte array
219 *@return the short (16-bit) value
220 */
221 public static short getShortBE(byte[] data) {
222 return getShortBE(data, 0);
223 }
224 /**
225 * Get a BE short value from a byte array
226 *
227 *@param data the byte array
228 *@param offset a starting offset into the byte array
229 *@return the short (16-bit) value
230 */
231 public static short getShortBE(byte[] data, int offset) {
232 return (short)getUShortBE(data, offset);
233 }
234
235 /**
236 * Get a BE unsigned short value from the beginning of a byte array
237 *
238 *@param data the byte array
239 *@return the unsigned short (16-bit) value in an int
240 */
241 public static int getUShortBE(byte[] data) {
242 return getUShortBE(data, 0);
243 }
244 /**
245 * Get a BE unsigned short value from a byte array
246 *
247 *@param data the byte array
248 *@param offset a starting offset into the byte array
249 *@return the unsigned short (16-bit) value in an integer
250 */
251 public static int getUShortBE(byte[] data, int offset) {
252 int b0 = data[offset] & 0xFF;
253 int b1 = data[offset+1] & 0xFF;
254 return (b0 << 8) + (b1 << 0);
255 }
256
257 /**
258 * Get a LE int value from the beginning of a byte array
259 *
260 *@param data the byte array
261 *@return the int (32-bit) value
262 */
263 public static int getIntLE(byte[] data) {
264 return getIntLE(data, 0);
265 }
266 /**
267 * Get a LE int value from a byte array
268 *
269 *@param data the byte array
270 *@param offset a starting offset into the byte array
271 *@return the int (32-bit) value
272 */
273 public static int getIntLE(byte[] data, int offset) {
274 int i=offset;
275 int b0 = data[i++] & 0xFF;
276 int b1 = data[i++] & 0xFF;
277 int b2 = data[i++] & 0xFF;
278 int b3 = data[i++] & 0xFF;
279 return (b3 << 24) + (b2 << 16) + (b1 << 8) + (b0 << 0);
280 }
281
282 /**
283 * Get a BE int value from the beginning of a byte array
284 *
285 *@param data the byte array
286 *@return the int (32-bit) value
287 */
288 public static int getIntBE(byte[] data) {
289 return getIntBE(data, 0);
290 }
291 /**
292 * Get a BE int value from a byte array
293 *
294 *@param data the byte array
295 *@param offset a starting offset into the byte array
296 *@return the int (32-bit) value
297 */
298 public static int getIntBE(byte[] data, int offset) {
299 int i=offset;
300 int b0 = data[i++] & 0xFF;
301 int b1 = data[i++] & 0xFF;
302 int b2 = data[i++] & 0xFF;
303 int b3 = data[i++] & 0xFF;
304 return (b0 << 24) + (b1 << 16) + (b2 << 8) + (b3 << 0);
305 }
306
307 /**
308 * Get a LE unsigned int value from a byte array
309 *
310 *@param data the byte array
311 *@return the unsigned int (32-bit) value in a long
312 */
313 public static long getUIntLE(byte[] data) {
314 return getUIntLE(data,0);
315 }
316 /**
317 * Get a LE unsigned int value from a byte array
318 *
319 *@param data the byte array
320 *@param offset a starting offset into the byte array
321 *@return the unsigned int (32-bit) value in a long
322 */
323 public static long getUIntLE(byte[] data, int offset) {
324 long retNum = getIntLE(data, offset);
325 return retNum & 0x00FFFFFFFFl;
326 }
327
328 /**
329 * Get a BE unsigned int value from a byte array
330 *
331 *@param data the byte array
332 *@return the unsigned int (32-bit) value in a long
333 */
334 public static long getUIntBE(byte[] data) {
335 return getUIntBE(data,0);
336 }
337 /**
338 * Get a BE unsigned int value from a byte array
339 *
340 *@param data the byte array
341 *@param offset a starting offset into the byte array
342 *@return the unsigned int (32-bit) value in a long
343 */
344 public static long getUIntBE(byte[] data, int offset) {
345 long retNum = getIntBE(data, offset);
346 return retNum & 0x00FFFFFFFFl;
347 }
348
349 /**
350 * Get a LE long value from a byte array
351 *
352 *@param data the byte array
353 *@param offset a starting offset into the byte array
354 *@return the long (64-bit) value
355 */
356 public static long getLongLE(byte[] data, int offset) {
357 long result = 0;
358
359 for (int j = offset + LONG_SIZE - 1; j >= offset; j--) {
360 result <<= 8;
361 result |= 0xff & data[j];
362 }
363 return result;
364 }
365 private static final int LONG_SIZE = 8;
366
367
368 /**
369 * Convert an 'unsigned' byte to an integer. ie, don't carry across the
370 * sign.
371 *
372 * @param b Description of the Parameter
373 * @return Description of the Return Value
374 */
375 public static int ubyteToInt(byte b) {
376 return b & 0xFF;
377 }
378
379 /**
380 * get the unsigned value of a byte.
381 *
382 * @param data
383 * the byte array.
384 * @param offset
385 * a starting offset into the byte array.
386 * @return the unsigned value of the byte as a 16 bit short
387 */
388 public static short getUByte( byte[] data, int offset )
389 {
390 return (short) ( data[offset] & 0xFF );
391 }
392
393
394 public static class BufferUnderrunException extends TikaException {
395 private static final long serialVersionUID = 8358288231138076276L;
396 public BufferUnderrunException() {
397 super("Insufficient data left in stream for required read");
398 }
399 }
400 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.io;
17
18 import java.util.HashSet;
19
20
/**
 * Helpers for turning arbitrary names into safe file names.
 */
public class FilenameUtils {

    /**
     * Reserved characters
     */
    public final static char[] RESERVED_FILENAME_CHARACTERS = {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
        0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
        '?', ':', '*', '<', '>', '|'
    };

    /** Lookup set filled once from {@link #RESERVED_FILENAME_CHARACTERS}. */
    private final static HashSet<Character> RESERVED = new HashSet<Character>(38);

    static {
        for (char reserved : RESERVED_FILENAME_CHARACTERS) {
            RESERVED.add(reserved);
        }
    }

    /**
     * Returns a copy of the given file name in which every reserved
     * character is replaced by a percent sign followed by the character's
     * upper-case hexadecimal code, padded to two digits.
     *
     * For example <code>why?.zip</code> will be converted into <code>why%3F.zip</code>
     *
     * @param name the file name to be normalized - NOT NULL
     *
     * @return the normalized file name
     *
     * @throws IllegalArgumentException if name is null
     */
    public static String normalize(final String name) {
        if (name == null) {
            throw new IllegalArgumentException("name cannot be null");
        }

        final int length = name.length();
        final StringBuilder escaped = new StringBuilder();

        for (int i = 0; i < length; i++) {
            final char c = name.charAt(i);
            if (!RESERVED.contains(c)) {
                escaped.append(c);
                continue;
            }
            escaped.append('%');
            if (c < 16) {
                // single hex digit: pad to two
                escaped.append("0");
            }
            escaped.append(Integer.toHexString(c).toUpperCase());
        }

        return escaped.toString();
    }
}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.io;
17
18 import java.io.IOException;
19
20 /**
21 * Subclasses IOException with the {@link Throwable} constructors missing before Java 6. If you are using Java 6,
22 * consider this class deprecated and use {@link IOException}.
23 *
24 * @author <a href="http://commons.apache.org/io/">Apache Commons IO</a>
25 * @since Apache Tika 0.4, copied from Commons IO 1.4
26 */
public class IOExceptionWithCause extends IOException {

    /**
     * Defines the serial version UID.
     */
    private static final long serialVersionUID = 1L;

    /**
     * Creates an exception carrying both a message and a cause.
     * <p>
     * Only the given message is used; the message of <code>cause</code>
     * does not become part of this instance's message.
     * </p>
     *
     * @param message
     *            the message (see {@link #getMessage()})
     * @param cause
     *            the cause (see {@link #getCause()}). A <code>null</code> value is allowed.
     */
    public IOExceptionWithCause(String message, Throwable cause) {
        super(message);
        initCause(cause);
    }

    /**
     * Creates an exception that wraps the given cause.
     * <p>
     * The message becomes <code>cause.toString()</code>, or <code>null</code>
     * when no cause is given.
     * </p>
     *
     * @param cause
     *            the cause (see {@link #getCause()}). A <code>null</code> value is allowed.
     */
    public IOExceptionWithCause(Throwable cause) {
        super((cause != null) ? cause.toString() : null);
        initCause(cause);
    }

}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.io;
17
18 import java.io.BufferedInputStream;
19 import java.io.BufferedReader;
20 import java.io.ByteArrayInputStream;
21 import java.io.ByteArrayOutputStream;
22 import java.io.CharArrayWriter;
23 import java.io.IOException;
24 import java.io.InputStream;
25 import java.io.InputStreamReader;
26 import java.io.OutputStream;
27 import java.io.OutputStreamWriter;
28 import java.io.Reader;
29 import java.io.StringWriter;
30 import java.io.Writer;
31 import java.nio.channels.Channel;
32 import java.util.ArrayList;
33 import java.util.List;
34
35 /**
36 * General IO stream manipulation utilities.
37 * <p>
38 * This class provides static utility methods for input/output operations.
39 * <ul>
40 * <li>closeQuietly - these methods close a stream ignoring nulls and exceptions
41 * <li>toXxx/read - these methods read data from a stream
42 * <li>write - these methods write data to a stream
43 * <li>copy - these methods copy all the data from one stream to another
44 * <li>contentEquals - these methods compare the content of two streams
45 * </ul>
46 * <p>
47 * The byte-to-char methods and char-to-byte methods involve a conversion step.
48 * Two methods are provided in each case, one that uses the platform default
49 * encoding and the other which allows you to specify an encoding. You are
50 * encouraged to always specify an encoding because relying on the platform
51 * default can lead to unexpected results, for example when moving from
52 * development to production.
53 * <p>
54 * All the methods in this class that read a stream are buffered internally.
55 * This means that there is no cause to use a <code>BufferedInputStream</code>
56 * or <code>BufferedReader</code>. The default buffer size of 4K has been shown
57 * to be efficient in tests.
58 * <p>
59 * Wherever possible, the methods in this class do <em>not</em> flush or close
60 * the stream. This is to avoid making non-portable assumptions about the
61 * streams' origin and further use. Thus the caller is still responsible for
62 * closing streams after use.
63 * <p>
64 * Origin of code: Excalibur.
65 *
66 * @author Peter Donald
67 * @author Jeff Turner
68 * @author Matthew Hawthorne
69 * @author Stephen Colebourne
70 * @author Gareth Davis
71 * @author Ian Springer
72 * @author Niall Pemberton
73 * @author Sandy McArthur
74 * @since Apache Tika 0.4, copied (partially) from Commons IO 1.4
75 */
76 public class IOUtils {
77
78 /**
79 * The default buffer size to use.
80 */
81 private static final int DEFAULT_BUFFER_SIZE = 1024 * 4;
82
83 /**
84 * Instances should NOT be constructed in standard programming.
85 */
86 public IOUtils() {
87 super();
88 }
89
90 //-----------------------------------------------------------------------
91 /**
92 * Unconditionally close a <code>Reader</code>.
93 * <p>
94 * Equivalent to {@link Reader#close()}, except any exceptions will be ignored.
95 * This is typically used in finally blocks.
96 *
97 * @param input the Reader to close, may be null or already closed
98 */
99 public static void closeQuietly(Reader input) {
100 try {
101 if (input != null) {
102 input.close();
103 }
104 } catch (IOException ioe) {
105 // ignore
106 }
107 }
108
109 /**
110 * Unconditionally close a <code>Channel</code>.
111 * <p>
112 * Equivalent to {@link Channel#close()}, except any exceptions will be ignored.
113 * This is typically used in finally blocks.
114 *
115 * @param channel the Channel to close, may be null or already closed
116 */
117 public static void closeQuietly(Channel channel) {
118 try {
119 if (channel != null) {
120 channel.close();
121 }
122 } catch (IOException ioe) {
123 // ignore
124 }
125 }
126
127 /**
128 * Unconditionally close a <code>Writer</code>.
129 * <p>
130 * Equivalent to {@link Writer#close()}, except any exceptions will be ignored.
131 * This is typically used in finally blocks.
132 *
133 * @param output the Writer to close, may be null or already closed
134 */
135 public static void closeQuietly(Writer output) {
136 try {
137 if (output != null) {
138 output.close();
139 }
140 } catch (IOException ioe) {
141 // ignore
142 }
143 }
144
145 /**
146 * Unconditionally close an <code>InputStream</code>.
147 * <p>
148 * Equivalent to {@link InputStream#close()}, except any exceptions will be ignored.
149 * This is typically used in finally blocks.
150 *
151 * @param input the InputStream to close, may be null or already closed
152 */
153 public static void closeQuietly(InputStream input) {
154 try {
155 if (input != null) {
156 input.close();
157 }
158 } catch (IOException ioe) {
159 // ignore
160 }
161 }
162
163 /**
164 * Unconditionally close an <code>OutputStream</code>.
165 * <p>
166 * Equivalent to {@link OutputStream#close()}, except any exceptions will be ignored.
167 * This is typically used in finally blocks.
168 *
169 * @param output the OutputStream to close, may be null or already closed
170 */
171 public static void closeQuietly(OutputStream output) {
172 try {
173 if (output != null) {
174 output.close();
175 }
176 } catch (IOException ioe) {
177 // ignore
178 }
179 }
180
181 // read toByteArray
182 //-----------------------------------------------------------------------
183 /**
184 * Get the contents of an <code>InputStream</code> as a <code>byte[]</code>.
185 * <p>
186 * This method buffers the input internally, so there is no need to use a
187 * <code>BufferedInputStream</code>.
188 *
189 * @param input the <code>InputStream</code> to read from
190 * @return the requested byte array
191 * @throws NullPointerException if the input is null
192 * @throws IOException if an I/O error occurs
193 */
194 public static byte[] toByteArray(InputStream input) throws IOException {
195 ByteArrayOutputStream output = new ByteArrayOutputStream();
196 copy(input, output);
197 return output.toByteArray();
198 }
199
200 /**
201 * Get the contents of a <code>Reader</code> as a <code>byte[]</code>
202 * using the default character encoding of the platform.
203 * <p>
204 * This method buffers the input internally, so there is no need to use a
205 * <code>BufferedReader</code>.
206 *
207 * @param input the <code>Reader</code> to read from
208 * @return the requested byte array
209 * @throws NullPointerException if the input is null
210 * @throws IOException if an I/O error occurs
211 */
212 public static byte[] toByteArray(Reader input) throws IOException {
213 ByteArrayOutputStream output = new ByteArrayOutputStream();
214 copy(input, output);
215 return output.toByteArray();
216 }
217
218 /**
219 * Get the contents of a <code>Reader</code> as a <code>byte[]</code>
220 * using the specified character encoding.
221 * <p>
222 * Character encoding names can be found at
223 * <a href="http://www.iana.org/assignments/character-sets">IANA</a>.
224 * <p>
225 * This method buffers the input internally, so there is no need to use a
226 * <code>BufferedReader</code>.
227 *
228 * @param input the <code>Reader</code> to read from
229 * @param encoding the encoding to use, null means platform default
230 * @return the requested byte array
231 * @throws NullPointerException if the input is null
232 * @throws IOException if an I/O error occurs
233 * @since Commons IO 1.1
234 */
235 public static byte[] toByteArray(Reader input, String encoding)
236 throws IOException {
237 ByteArrayOutputStream output = new ByteArrayOutputStream();
238 copy(input, output, encoding);
239 return output.toByteArray();
240 }
241
242 /**
243 * Get the contents of a <code>String</code> as a <code>byte[]</code>
244 * using the default character encoding of the platform.
245 * <p>
246 * This is the same as {@link String#getBytes()}.
247 *
248 * @param input the <code>String</code> to convert
249 * @return the requested byte array
250 * @throws NullPointerException if the input is null
251 * @throws IOException if an I/O error occurs (never occurs)
252 * @deprecated Use {@link String#getBytes()}
253 */
254 @Deprecated
255 public static byte[] toByteArray(String input) throws IOException {
256 return input.getBytes();
257 }
258
259 // read char[]
260 //-----------------------------------------------------------------------
261 /**
262 * Get the contents of an <code>InputStream</code> as a character array
263 * using the default character encoding of the platform.
264 * <p>
265 * This method buffers the input internally, so there is no need to use a
266 * <code>BufferedInputStream</code>.
267 *
268 * @param is the <code>InputStream</code> to read from
269 * @return the requested character array
270 * @throws NullPointerException if the input is null
271 * @throws IOException if an I/O error occurs
272 * @since Commons IO 1.1
273 */
274 public static char[] toCharArray(InputStream is) throws IOException {
275 CharArrayWriter output = new CharArrayWriter();
276 copy(is, output);
277 return output.toCharArray();
278 }
279
280 /**
281 * Get the contents of an <code>InputStream</code> as a character array
282 * using the specified character encoding.
283 * <p>
284 * Character encoding names can be found at
285 * <a href="http://www.iana.org/assignments/character-sets">IANA</a>.
286 * <p>
287 * This method buffers the input internally, so there is no need to use a
288 * <code>BufferedInputStream</code>.
289 *
290 * @param is the <code>InputStream</code> to read from
291 * @param encoding the encoding to use, null means platform default
292 * @return the requested character array
293 * @throws NullPointerException if the input is null
294 * @throws IOException if an I/O error occurs
295 * @since Commons IO 1.1
296 */
297 public static char[] toCharArray(InputStream is, String encoding)
298 throws IOException {
299 CharArrayWriter output = new CharArrayWriter();
300 copy(is, output, encoding);
301 return output.toCharArray();
302 }
303
304 /**
305 * Get the contents of a <code>Reader</code> as a character array.
306 * <p>
307 * This method buffers the input internally, so there is no need to use a
308 * <code>BufferedReader</code>.
309 *
310 * @param input the <code>Reader</code> to read from
311 * @return the requested character array
312 * @throws NullPointerException if the input is null
313 * @throws IOException if an I/O error occurs
314 * @since Commons IO 1.1
315 */
316 public static char[] toCharArray(Reader input) throws IOException {
317 CharArrayWriter sw = new CharArrayWriter();
318 copy(input, sw);
319 return sw.toCharArray();
320 }
321
322 // read toString
323 //-----------------------------------------------------------------------
324 /**
325 * Get the contents of an <code>InputStream</code> as a String
326 * using the default character encoding of the platform.
327 * <p>
328 * This method buffers the input internally, so there is no need to use a
329 * <code>BufferedInputStream</code>.
330 *
331 * @param input the <code>InputStream</code> to read from
332 * @return the requested String
333 * @throws NullPointerException if the input is null
334 * @throws IOException if an I/O error occurs
335 */
336 public static String toString(InputStream input) throws IOException {
337 StringWriter sw = new StringWriter();
338 copy(input, sw);
339 return sw.toString();
340 }
341
342 /**
343 * Get the contents of an <code>InputStream</code> as a String
344 * using the specified character encoding.
345 * <p>
346 * Character encoding names can be found at
347 * <a href="http://www.iana.org/assignments/character-sets">IANA</a>.
348 * <p>
349 * This method buffers the input internally, so there is no need to use a
350 * <code>BufferedInputStream</code>.
351 *
352 * @param input the <code>InputStream</code> to read from
353 * @param encoding the encoding to use, null means platform default
354 * @return the requested String
355 * @throws NullPointerException if the input is null
356 * @throws IOException if an I/O error occurs
357 */
358 public static String toString(InputStream input, String encoding)
359 throws IOException {
360 StringWriter sw = new StringWriter();
361 copy(input, sw, encoding);
362 return sw.toString();
363 }
364
365 /**
366 * Get the contents of a <code>Reader</code> as a String.
367 * <p>
368 * This method buffers the input internally, so there is no need to use a
369 * <code>BufferedReader</code>.
370 *
371 * @param input the <code>Reader</code> to read from
372 * @return the requested String
373 * @throws NullPointerException if the input is null
374 * @throws IOException if an I/O error occurs
375 */
376 public static String toString(Reader input) throws IOException {
377 StringWriter sw = new StringWriter();
378 copy(input, sw);
379 return sw.toString();
380 }
381
382 /**
383 * Get the contents of a <code>byte[]</code> as a String
384 * using the default character encoding of the platform.
385 *
386 * @param input the byte array to read from
387 * @return the requested String
388 * @throws NullPointerException if the input is null
389 * @throws IOException if an I/O error occurs (never occurs)
390 * @deprecated Use {@link String#String(byte[])}
391 */
392 @Deprecated
393 public static String toString(byte[] input) throws IOException {
394 return new String(input);
395 }
396
397 /**
398 * Get the contents of a <code>byte[]</code> as a String
399 * using the specified character encoding.
400 * <p>
401 * Character encoding names can be found at
402 * <a href="http://www.iana.org/assignments/character-sets">IANA</a>.
403 *
404 * @param input the byte array to read from
405 * @param encoding the encoding to use, null means platform default
406 * @return the requested String
407 * @throws NullPointerException if the input is null
408 * @throws IOException if an I/O error occurs (never occurs)
409 * @deprecated Use {@link String#String(byte[],String)}
410 */
411 @Deprecated
412 public static String toString(byte[] input, String encoding)
413 throws IOException {
414 if (encoding == null) {
415 return new String(input);
416 } else {
417 return new String(input, encoding);
418 }
419 }
420
421 // readLines
422 //-----------------------------------------------------------------------
423 /**
424 * Get the contents of an <code>InputStream</code> as a list of Strings,
425 * one entry per line, using the default character encoding of the platform.
426 * <p>
427 * This method buffers the input internally, so there is no need to use a
428 * <code>BufferedInputStream</code>.
429 *
430 * @param input the <code>InputStream</code> to read from, not null
431 * @return the list of Strings, never null
432 * @throws NullPointerException if the input is null
433 * @throws IOException if an I/O error occurs
434 * @since Commons IO 1.1
435 */
436 public static List<String> readLines(InputStream input) throws IOException {
437 InputStreamReader reader = new InputStreamReader(input);
438 return readLines(reader);
439 }
440
441 /**
442 * Get the contents of an <code>InputStream</code> as a list of Strings,
443 * one entry per line, using the specified character encoding.
444 * <p>
445 * Character encoding names can be found at
446 * <a href="http://www.iana.org/assignments/character-sets">IANA</a>.
447 * <p>
448 * This method buffers the input internally, so there is no need to use a
449 * <code>BufferedInputStream</code>.
450 *
451 * @param input the <code>InputStream</code> to read from, not null
452 * @param encoding the encoding to use, null means platform default
453 * @return the list of Strings, never null
454 * @throws NullPointerException if the input is null
455 * @throws IOException if an I/O error occurs
456 * @since Commons IO 1.1
457 */
458 public static List<String> readLines(InputStream input, String encoding) throws IOException {
459 if (encoding == null) {
460 return readLines(input);
461 } else {
462 InputStreamReader reader = new InputStreamReader(input, encoding);
463 return readLines(reader);
464 }
465 }
466
467 /**
468 * Get the contents of a <code>Reader</code> as a list of Strings,
469 * one entry per line.
470 * <p>
471 * This method buffers the input internally, so there is no need to use a
472 * <code>BufferedReader</code>.
473 *
474 * @param input the <code>Reader</code> to read from, not null
475 * @return the list of Strings, never null
476 * @throws NullPointerException if the input is null
477 * @throws IOException if an I/O error occurs
478 * @since Commons IO 1.1
479 */
480 public static List<String> readLines(Reader input) throws IOException {
481 BufferedReader reader = new BufferedReader(input);
482 List<String> list = new ArrayList<String>();
483 String line = reader.readLine();
484 while (line != null) {
485 list.add(line);
486 line = reader.readLine();
487 }
488 return list;
489 }
490
491 //-----------------------------------------------------------------------
492 /**
493 * Convert the specified CharSequence to an input stream, encoded as bytes
494 * using the default character encoding of the platform.
495 *
496 * @param input the CharSequence to convert
497 * @return an input stream
498 * @since IO 2.0
499 */
500 public static InputStream toInputStream(CharSequence input) {
501 return toInputStream(input.toString());
502 }
503
504 /**
505 * Convert the specified CharSequence to an input stream, encoded as bytes
506 * using the specified character encoding.
507 * <p>
508 * Character encoding names can be found at
509 * <a href="http://www.iana.org/assignments/character-sets">IANA</a>.
510 *
511 * @param input the CharSequence to convert
512 * @param encoding the encoding to use, null means platform default
513 * @throws IOException if the encoding is invalid
514 * @return an input stream
515 * @since IO 2.0
516 */
517 public static InputStream toInputStream(CharSequence input, String encoding) throws IOException {
518 return toInputStream(input.toString(), encoding);
519 }
520
521 //-----------------------------------------------------------------------
522 /**
523 * Convert the specified string to an input stream, encoded as bytes
524 * using the default character encoding of the platform.
525 *
526 * @param input the string to convert
527 * @return an input stream
528 * @since Commons IO 1.1
529 */
530 public static InputStream toInputStream(String input) {
531 byte[] bytes = input.getBytes();
532 return new ByteArrayInputStream(bytes);
533 }
534
535 /**
536 * Convert the specified string to an input stream, encoded as bytes
537 * using the specified character encoding.
538 * <p>
539 * Character encoding names can be found at
540 * <a href="http://www.iana.org/assignments/character-sets">IANA</a>.
541 *
542 * @param input the string to convert
543 * @param encoding the encoding to use, null means platform default
544 * @throws IOException if the encoding is invalid
545 * @return an input stream
546 * @since Commons IO 1.1
547 */
548 public static InputStream toInputStream(String input, String encoding) throws IOException {
549 byte[] bytes = encoding != null ? input.getBytes(encoding) : input.getBytes();
550 return new ByteArrayInputStream(bytes);
551 }
552
553 // write byte[]
554 //-----------------------------------------------------------------------
555 /**
556 * Writes bytes from a <code>byte[]</code> to an <code>OutputStream</code>.
557 *
558 * @param data the byte array to write, do not modify during output,
559 * null ignored
560 * @param output the <code>OutputStream</code> to write to
561 * @throws NullPointerException if output is null
562 * @throws IOException if an I/O error occurs
563 * @since Commons IO 1.1
564 */
565 public static void write(byte[] data, OutputStream output)
566 throws IOException {
567 if (data != null) {
568 output.write(data);
569 }
570 }
571
572 /**
573 * Writes bytes from a <code>byte[]</code> to chars on a <code>Writer</code>
574 * using the default character encoding of the platform.
575 * <p>
576 * This method uses {@link String#String(byte[])}.
577 *
578 * @param data the byte array to write, do not modify during output,
579 * null ignored
580 * @param output the <code>Writer</code> to write to
581 * @throws NullPointerException if output is null
582 * @throws IOException if an I/O error occurs
583 * @since Commons IO 1.1
584 */
585 public static void write(byte[] data, Writer output) throws IOException {
586 if (data != null) {
587 output.write(new String(data));
588 }
589 }
590
591 /**
592 * Writes bytes from a <code>byte[]</code> to chars on a <code>Writer</code>
593 * using the specified character encoding.
594 * <p>
595 * Character encoding names can be found at
596 * <a href="http://www.iana.org/assignments/character-sets">IANA</a>.
597 * <p>
598 * This method uses {@link String#String(byte[], String)}.
599 *
600 * @param data the byte array to write, do not modify during output,
601 * null ignored
602 * @param output the <code>Writer</code> to write to
603 * @param encoding the encoding to use, null means platform default
604 * @throws NullPointerException if output is null
605 * @throws IOException if an I/O error occurs
606 * @since Commons IO 1.1
607 */
608 public static void write(byte[] data, Writer output, String encoding)
609 throws IOException {
610 if (data != null) {
611 if (encoding == null) {
612 write(data, output);
613 } else {
614 output.write(new String(data, encoding));
615 }
616 }
617 }
618
619 // write char[]
620 //-----------------------------------------------------------------------
621 /**
622 * Writes chars from a <code>char[]</code> to a <code>Writer</code>
623 * using the default character encoding of the platform.
624 *
625 * @param data the char array to write, do not modify during output,
626 * null ignored
627 * @param output the <code>Writer</code> to write to
628 * @throws NullPointerException if output is null
629 * @throws IOException if an I/O error occurs
630 * @since Commons IO 1.1
631 */
632 public static void write(char[] data, Writer output) throws IOException {
633 if (data != null) {
634 output.write(data);
635 }
636 }
637
638 /**
639 * Writes chars from a <code>char[]</code> to bytes on an
640 * <code>OutputStream</code>.
641 * <p>
642 * This method uses {@link String#String(char[])} and
643 * {@link String#getBytes()}.
644 *
645 * @param data the char array to write, do not modify during output,
646 * null ignored
647 * @param output the <code>OutputStream</code> to write to
648 * @throws NullPointerException if output is null
649 * @throws IOException if an I/O error occurs
650 * @since Commons IO 1.1
651 */
652 public static void write(char[] data, OutputStream output)
653 throws IOException {
654 if (data != null) {
655 output.write(new String(data).getBytes());
656 }
657 }
658
659 /**
660 * Writes chars from a <code>char[]</code> to bytes on an
661 * <code>OutputStream</code> using the specified character encoding.
662 * <p>
663 * Character encoding names can be found at
664 * <a href="http://www.iana.org/assignments/character-sets">IANA</a>.
665 * <p>
666 * This method uses {@link String#String(char[])} and
667 * {@link String#getBytes(String)}.
668 *
669 * @param data the char array to write, do not modify during output,
670 * null ignored
671 * @param output the <code>OutputStream</code> to write to
672 * @param encoding the encoding to use, null means platform default
673 * @throws NullPointerException if output is null
674 * @throws IOException if an I/O error occurs
675 * @since Commons IO 1.1
676 */
677 public static void write(char[] data, OutputStream output, String encoding)
678 throws IOException {
679 if (data != null) {
680 if (encoding == null) {
681 write(data, output);
682 } else {
683 output.write(new String(data).getBytes(encoding));
684 }
685 }
686 }
687
688 // write CharSequence
689 //-----------------------------------------------------------------------
690 /**
691 * Writes chars from a <code>CharSequence</code> to a <code>Writer</code>.
692 *
693 * @param data the <code>CharSequence</code> to write, null ignored
694 * @param output the <code>Writer</code> to write to
695 * @throws NullPointerException if output is null
696 * @throws IOException if an I/O error occurs
697 * @since Commons IO 2.0
698 */
699 public static void write(CharSequence data, Writer output) throws IOException {
700 if (data != null) {
701 write(data.toString(), output);
702 }
703 }
704
705 /**
706 * Writes chars from a <code>CharSequence</code> to bytes on an
707 * <code>OutputStream</code> using the default character encoding of the
708 * platform.
709 * <p>
710 * This method uses {@link String#getBytes()}.
711 *
712 * @param data the <code>CharSequence</code> to write, null ignored
713 * @param output the <code>OutputStream</code> to write to
714 * @throws NullPointerException if output is null
715 * @throws IOException if an I/O error occurs
716 * @since Commons IO 2.0
717 */
718 public static void write(CharSequence data, OutputStream output)
719 throws IOException {
720 if (data != null) {
721 write(data.toString(), output);
722 }
723 }
724
725 /**
726 * Writes chars from a <code>CharSequence</code> to bytes on an
727 * <code>OutputStream</code> using the specified character encoding.
728 * <p>
729 * Character encoding names can be found at
730 * <a href="http://www.iana.org/assignments/character-sets">IANA</a>.
731 * <p>
732 * This method uses {@link String#getBytes(String)}.
733 *
734 * @param data the <code>CharSequence</code> to write, null ignored
735 * @param output the <code>OutputStream</code> to write to
736 * @param encoding the encoding to use, null means platform default
737 * @throws NullPointerException if output is null
738 * @throws IOException if an I/O error occurs
739 * @since Commons IO 2.0
740 */
741 public static void write(CharSequence data, OutputStream output, String encoding)
742 throws IOException {
743 if (data != null) {
744 write(data.toString(), output, encoding);
745 }
746 }
747
748 // write String
749 //-----------------------------------------------------------------------
750 /**
751 * Writes chars from a <code>String</code> to a <code>Writer</code>.
752 *
753 * @param data the <code>String</code> to write, null ignored
754 * @param output the <code>Writer</code> to write to
755 * @throws NullPointerException if output is null
756 * @throws IOException if an I/O error occurs
757 * @since Commons IO 1.1
758 */
759 public static void write(String data, Writer output) throws IOException {
760 if (data != null) {
761 output.write(data);
762 }
763 }
764
765 /**
766 * Writes chars from a <code>String</code> to bytes on an
767 * <code>OutputStream</code> using the default character encoding of the
768 * platform.
769 * <p>
770 * This method uses {@link String#getBytes()}.
771 *
772 * @param data the <code>String</code> to write, null ignored
773 * @param output the <code>OutputStream</code> to write to
774 * @throws NullPointerException if output is null
775 * @throws IOException if an I/O error occurs
776 * @since Commons IO 1.1
777 */
778 public static void write(String data, OutputStream output)
779 throws IOException {
780 if (data != null) {
781 output.write(data.getBytes());
782 }
783 }
784
785 /**
786 * Writes chars from a <code>String</code> to bytes on an
787 * <code>OutputStream</code> using the specified character encoding.
788 * <p>
789 * Character encoding names can be found at
790 * <a href="http://www.iana.org/assignments/character-sets">IANA</a>.
791 * <p>
792 * This method uses {@link String#getBytes(String)}.
793 *
794 * @param data the <code>String</code> to write, null ignored
795 * @param output the <code>OutputStream</code> to write to
796 * @param encoding the encoding to use, null means platform default
797 * @throws NullPointerException if output is null
798 * @throws IOException if an I/O error occurs
799 * @since Commons IO 1.1
800 */
801 public static void write(String data, OutputStream output, String encoding)
802 throws IOException {
803 if (data != null) {
804 if (encoding == null) {
805 write(data, output);
806 } else {
807 output.write(data.getBytes(encoding));
808 }
809 }
810 }
811
812 // write StringBuffer
813 //-----------------------------------------------------------------------
814 /**
815 * Writes chars from a <code>StringBuffer</code> to a <code>Writer</code>.
816 *
817 * @param data the <code>StringBuffer</code> to write, null ignored
818 * @param output the <code>Writer</code> to write to
819 * @throws NullPointerException if output is null
820 * @throws IOException if an I/O error occurs
821 * @since Commons IO 1.1
822 * @deprecated replaced by write(CharSequence, Writer)
823 */
824 @Deprecated
825 public static void write(StringBuffer data, Writer output)
826 throws IOException {
827 if (data != null) {
828 output.write(data.toString());
829 }
830 }
831
832 /**
833 * Writes chars from a <code>StringBuffer</code> to bytes on an
834 * <code>OutputStream</code> using the default character encoding of the
835 * platform.
836 * <p>
837 * This method uses {@link String#getBytes()}.
838 *
839 * @param data the <code>StringBuffer</code> to write, null ignored
840 * @param output the <code>OutputStream</code> to write to
841 * @throws NullPointerException if output is null
842 * @throws IOException if an I/O error occurs
843 * @since Commons IO 1.1
844 * @deprecated replaced by write(CharSequence, OutputStream)
845 */
846 @Deprecated
847 public static void write(StringBuffer data, OutputStream output)
848 throws IOException {
849 if (data != null) {
850 output.write(data.toString().getBytes());
851 }
852 }
853
854 /**
855 * Writes chars from a <code>StringBuffer</code> to bytes on an
856 * <code>OutputStream</code> using the specified character encoding.
857 * <p>
858 * Character encoding names can be found at
859 * <a href="http://www.iana.org/assignments/character-sets">IANA</a>.
860 * <p>
861 * This method uses {@link String#getBytes(String)}.
862 *
863 * @param data the <code>StringBuffer</code> to write, null ignored
864 * @param output the <code>OutputStream</code> to write to
865 * @param encoding the encoding to use, null means platform default
866 * @throws NullPointerException if output is null
867 * @throws IOException if an I/O error occurs
868 * @since Commons IO 1.1
869 * @deprecated replaced by write(CharSequence, OutputStream, String)
870 */
871 @Deprecated
872 public static void write(StringBuffer data, OutputStream output,
873 String encoding) throws IOException {
874 if (data != null) {
875 if (encoding == null) {
876 write(data, output);
877 } else {
878 output.write(data.toString().getBytes(encoding));
879 }
880 }
881 }
882
883 // copy from InputStream
884 //-----------------------------------------------------------------------
885 /**
886 * Copy bytes from an <code>InputStream</code> to an
887 * <code>OutputStream</code>.
888 * <p>
889 * This method buffers the input internally, so there is no need to use a
890 * <code>BufferedInputStream</code>.
891 * <p>
892 * Large streams (over 2GB) will return a bytes copied value of
893 * <code>-1</code> after the copy has completed since the correct
894 * number of bytes cannot be returned as an int. For large streams
895 * use the <code>copyLarge(InputStream, OutputStream)</code> method.
896 *
897 * @param input the <code>InputStream</code> to read from
898 * @param output the <code>OutputStream</code> to write to
899 * @return the number of bytes copied
900 * @throws NullPointerException if the input or output is null
901 * @throws IOException if an I/O error occurs
902 * @throws ArithmeticException if the byte count is too large
903 * @since Commons IO 1.1
904 */
905 public static int copy(InputStream input, OutputStream output) throws IOException {
906 long count = copyLarge(input, output);
907 if (count > Integer.MAX_VALUE) {
908 return -1;
909 }
910 return (int) count;
911 }
912
913 /**
914 * Copy bytes from a large (over 2GB) <code>InputStream</code> to an
915 * <code>OutputStream</code>.
916 * <p>
917 * This method buffers the input internally, so there is no need to use a
918 * <code>BufferedInputStream</code>.
919 *
920 * @param input the <code>InputStream</code> to read from
921 * @param output the <code>OutputStream</code> to write to
922 * @return the number of bytes copied
923 * @throws NullPointerException if the input or output is null
924 * @throws IOException if an I/O error occurs
925 * @since Commons IO 1.3
926 */
927 public static long copyLarge(InputStream input, OutputStream output)
928 throws IOException {
929 byte[] buffer = new byte[DEFAULT_BUFFER_SIZE];
930 long count = 0;
931 int n = 0;
932 while (-1 != (n = input.read(buffer))) {
933 output.write(buffer, 0, n);
934 count += n;
935 }
936 return count;
937 }
938
939 /**
940 * Copy bytes from an <code>InputStream</code> to chars on a
941 * <code>Writer</code> using the default character encoding of the platform.
942 * <p>
943 * This method buffers the input internally, so there is no need to use a
944 * <code>BufferedInputStream</code>.
945 * <p>
946 * This method uses {@link InputStreamReader}.
947 *
948 * @param input the <code>InputStream</code> to read from
949 * @param output the <code>Writer</code> to write to
950 * @throws NullPointerException if the input or output is null
951 * @throws IOException if an I/O error occurs
952 * @since Commons IO 1.1
953 */
954 public static void copy(InputStream input, Writer output)
955 throws IOException {
956 InputStreamReader in = new InputStreamReader(input);
957 copy(in, output);
958 }
959
960 /**
961 * Copy bytes from an <code>InputStream</code> to chars on a
962 * <code>Writer</code> using the specified character encoding.
963 * <p>
964 * This method buffers the input internally, so there is no need to use a
965 * <code>BufferedInputStream</code>.
966 * <p>
967 * Character encoding names can be found at
968 * <a href="http://www.iana.org/assignments/character-sets">IANA</a>.
969 * <p>
970 * This method uses {@link InputStreamReader}.
971 *
972 * @param input the <code>InputStream</code> to read from
973 * @param output the <code>Writer</code> to write to
974 * @param encoding the encoding to use, null means platform default
975 * @throws NullPointerException if the input or output is null
976 * @throws IOException if an I/O error occurs
977 * @since Commons IO 1.1
978 */
979 public static void copy(InputStream input, Writer output, String encoding)
980 throws IOException {
981 if (encoding == null) {
982 copy(input, output);
983 } else {
984 InputStreamReader in = new InputStreamReader(input, encoding);
985 copy(in, output);
986 }
987 }
988
989 // copy from Reader
990 //-----------------------------------------------------------------------
991 /**
992 * Copy chars from a <code>Reader</code> to a <code>Writer</code>.
993 * <p>
994 * This method buffers the input internally, so there is no need to use a
995 * <code>BufferedReader</code>.
996 * <p>
997 * Large streams (over 2GB) will return a chars copied value of
998 * <code>-1</code> after the copy has completed since the correct
999 * number of chars cannot be returned as an int. For large streams
1000 * use the <code>copyLarge(Reader, Writer)</code> method.
1001 *
1002 * @param input the <code>Reader</code> to read from
1003 * @param output the <code>Writer</code> to write to
1004 * @return the number of characters copied
1005 * @throws NullPointerException if the input or output is null
1006 * @throws IOException if an I/O error occurs
1007 * @throws ArithmeticException if the character count is too large
1008 * @since Commons IO 1.1
1009 */
1010 public static int copy(Reader input, Writer output) throws IOException {
1011 long count = copyLarge(input, output);
1012 if (count > Integer.MAX_VALUE) {
1013 return -1;
1014 }
1015 return (int) count;
1016 }
1017
1018 /**
1019 * Copy chars from a large (over 2GB) <code>Reader</code> to a <code>Writer</code>.
1020 * <p>
1021 * This method buffers the input internally, so there is no need to use a
1022 * <code>BufferedReader</code>.
1023 *
1024 * @param input the <code>Reader</code> to read from
1025 * @param output the <code>Writer</code> to write to
1026 * @return the number of characters copied
1027 * @throws NullPointerException if the input or output is null
1028 * @throws IOException if an I/O error occurs
1029 * @since Commons IO 1.3
1030 */
1031 public static long copyLarge(Reader input, Writer output) throws IOException {
1032 char[] buffer = new char[DEFAULT_BUFFER_SIZE];
1033 long count = 0;
1034 int n = 0;
1035 while (-1 != (n = input.read(buffer))) {
1036 output.write(buffer, 0, n);
1037 count += n;
1038 }
1039 return count;
1040 }
1041
1042 /**
1043 * Copy chars from a <code>Reader</code> to bytes on an
1044 * <code>OutputStream</code> using the default character encoding of the
1045 * platform, and calling flush.
1046 * <p>
1047 * This method buffers the input internally, so there is no need to use a
1048 * <code>BufferedReader</code>.
1049 * <p>
1050 * Due to the implementation of OutputStreamWriter, this method performs a
1051 * flush.
1052 * <p>
1053 * This method uses {@link OutputStreamWriter}.
1054 *
1055 * @param input the <code>Reader</code> to read from
1056 * @param output the <code>OutputStream</code> to write to
1057 * @throws NullPointerException if the input or output is null
1058 * @throws IOException if an I/O error occurs
1059 * @since Commons IO 1.1
1060 */
1061 public static void copy(Reader input, OutputStream output)
1062 throws IOException {
1063 OutputStreamWriter out = new OutputStreamWriter(output);
1064 copy(input, out);
1065 // XXX Unless anyone is planning on rewriting OutputStreamWriter, we
1066 // have to flush here.
1067 out.flush();
1068 }
1069
1070 /**
1071 * Copy chars from a <code>Reader</code> to bytes on an
1072 * <code>OutputStream</code> using the specified character encoding, and
1073 * calling flush.
1074 * <p>
1075 * This method buffers the input internally, so there is no need to use a
1076 * <code>BufferedReader</code>.
1077 * <p>
1078 * Character encoding names can be found at
1079 * <a href="http://www.iana.org/assignments/character-sets">IANA</a>.
1080 * <p>
1081 * Due to the implementation of OutputStreamWriter, this method performs a
1082 * flush.
1083 * <p>
1084 * This method uses {@link OutputStreamWriter}.
1085 *
1086 * @param input the <code>Reader</code> to read from
1087 * @param output the <code>OutputStream</code> to write to
1088 * @param encoding the encoding to use, null means platform default
1089 * @throws NullPointerException if the input or output is null
1090 * @throws IOException if an I/O error occurs
1091 * @since Commons IO 1.1
1092 */
1093 public static void copy(Reader input, OutputStream output, String encoding)
1094 throws IOException {
1095 if (encoding == null) {
1096 copy(input, output);
1097 } else {
1098 OutputStreamWriter out = new OutputStreamWriter(output, encoding);
1099 copy(input, out);
1100 // XXX Unless anyone is planning on rewriting OutputStreamWriter,
1101 // we have to flush here.
1102 out.flush();
1103 }
1104 }
1105
1106 // content equals
1107 //-----------------------------------------------------------------------
1108 /**
1109 * Compare the contents of two Streams to determine if they are equal or
1110 * not.
1111 * <p>
1112 * This method buffers the input internally using
1113 * <code>BufferedInputStream</code> if they are not already buffered.
1114 *
1115 * @param input1 the first stream
1116 * @param input2 the second stream
1117 * @return true if the content of the streams are equal or they both don't
1118 * exist, false otherwise
1119 * @throws NullPointerException if either input is null
1120 * @throws IOException if an I/O error occurs
1121 */
1122 public static boolean contentEquals(InputStream input1, InputStream input2)
1123 throws IOException {
1124 if (!(input1 instanceof BufferedInputStream)) {
1125 input1 = new BufferedInputStream(input1);
1126 }
1127 if (!(input2 instanceof BufferedInputStream)) {
1128 input2 = new BufferedInputStream(input2);
1129 }
1130
1131 int ch = input1.read();
1132 while (-1 != ch) {
1133 int ch2 = input2.read();
1134 if (ch != ch2) {
1135 return false;
1136 }
1137 ch = input1.read();
1138 }
1139
1140 int ch2 = input2.read();
1141 return (ch2 == -1);
1142 }
1143
1144 /**
1145 * Compare the contents of two Readers to determine if they are equal or
1146 * not.
1147 * <p>
1148 * This method buffers the input internally using
1149 * <code>BufferedReader</code> if they are not already buffered.
1150 *
1151 * @param input1 the first reader
1152 * @param input2 the second reader
1153 * @return true if the content of the readers are equal or they both don't
1154 * exist, false otherwise
1155 * @throws NullPointerException if either input is null
1156 * @throws IOException if an I/O error occurs
1157 * @since Commons IO 1.1
1158 */
1159 public static boolean contentEquals(Reader input1, Reader input2)
1160 throws IOException {
1161 if (!(input1 instanceof BufferedReader)) {
1162 input1 = new BufferedReader(input1);
1163 }
1164 if (!(input2 instanceof BufferedReader)) {
1165 input2 = new BufferedReader(input2);
1166 }
1167
1168 int ch = input1.read();
1169 while (-1 != ch) {
1170 int ch2 = input2.read();
1171 if (ch != ch2) {
1172 return false;
1173 }
1174 ch = input1.read();
1175 }
1176
1177 int ch2 = input2.read();
1178 return (ch2 == -1);
1179 }
1180
1181 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.io;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20
/**
 * Stream wrapper that makes it easy to read up to n bytes ahead from
 * a stream that supports the mark feature. This class insulates the
 * underlying stream from things like possible mark(), reset() and close()
 * calls by external components that might otherwise invalidate the marked
 * state of a stream.
 * <p>
 * The recommended usage pattern of this class is:
 * <pre>
 *     InputStream lookahead = new LookaheadInputStream(stream, n);
 *     try {
 *         processStream(lookahead);
 *     } finally {
 *         lookahead.close();
 *     }
 * </pre>
 * <p>
 * This usage pattern guarantees that only up to n bytes from the original
 * stream can ever be read, and that the stream will have been marked and
 * then reset to its original state once the above code block exits. No
 * code in the fictional processStream() method can affect the state of
 * the original stream.
 *
 * @since Apache Tika 0.10
 */
public class LookaheadInputStream extends InputStream {

    /** Underlying stream; <code>null</code> once closed or exhausted. */
    private InputStream stream;

    /** Holds every byte read from the underlying stream so far. */
    private final byte[] buffer;

    /** Number of valid bytes in {@link #buffer}. */
    private int buffered = 0;

    /** Index in {@link #buffer} of the next byte handed to the caller. */
    private int position = 0;

    /** Value of {@link #position} at the time of the last mark() call. */
    private int mark = 0;

    /**
     * Creates a lookahead wrapper for the given input stream.
     * The given input stream should support the mark feature,
     * as otherwise the state of that stream will be undefined
     * after the lookahead wrapper has been closed. As a special
     * case a <code>null</code> stream is treated as an empty stream.
     *
     * @param stream input stream, can be <code>null</code>
     * @param n maximum number of bytes to look ahead
     */
    public LookaheadInputStream(InputStream stream, int n) {
        this.stream = stream;
        this.buffer = new byte[n];
        if (stream != null) {
            stream.mark(n);
        }
    }

    /**
     * Resets the underlying stream to the position it had when this
     * wrapper was created and detaches from it. The underlying stream
     * itself is not closed. Calling this method again has no effect.
     *
     * @throws IOException if the underlying stream cannot be reset
     */
    @Override
    public void close() throws IOException {
        if (stream != null) {
            stream.reset();
            stream = null;
        }
    }

    /**
     * Reads more bytes from the underlying stream into the buffer, but
     * only when all buffered bytes have been consumed, the buffer still
     * has room and the underlying stream is still attached. Reaching the
     * end of the underlying stream closes this wrapper.
     *
     * @throws IOException if reading or resetting the underlying stream fails
     */
    private void fill() throws IOException {
        if (available() == 0 && buffered < buffer.length && stream != null) {
            int n = stream.read(buffer, buffered, buffer.length - buffered);
            if (n != -1) {
                buffered += n;
            } else {
                close();
            }
        }
    }

    /**
     * Reads the next byte of the lookahead window.
     *
     * @return the byte as a value from 0 to 255, or -1 when the lookahead
     *         limit or the end of the underlying stream has been reached
     * @throws IOException if the underlying stream fails
     */
    @Override
    public int read() throws IOException {
        fill();
        if (buffered > position) {
            return 0xff & buffer[position++];
        } else {
            return -1;
        }
    }

    /**
     * Reads up to <code>len</code> bytes of the lookahead window.
     *
     * @param b destination array
     * @param off offset in <code>b</code> of the first byte stored
     * @param len maximum number of bytes to read
     * @return the number of bytes stored, 0 when <code>len</code> is 0,
     *         or -1 when the lookahead limit or the end of the underlying
     *         stream has been reached
     * @throws IOException if the underlying stream fails
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        if (len == 0) {
            // InputStream contract: a zero-length request reads nothing
            // and returns 0, even at the end of the stream
            return 0;
        }
        fill();
        if (buffered > position) {
            len = Math.min(len, buffered - position);
            System.arraycopy(buffer, position, b, off, len);
            position += len;
            return len;
        } else {
            return -1;
        }
    }

    /**
     * Skips over bytes of the lookahead window. At most the bytes that
     * are currently buffered are skipped, so fewer than <code>n</code>
     * bytes may be skipped.
     *
     * @param n number of bytes to skip; zero or negative values skip nothing
     * @return the number of bytes actually skipped, never negative
     * @throws IOException if the underlying stream fails
     */
    @Override
    public long skip(long n) throws IOException {
        if (n <= 0) {
            // a negative count must not move the read position backwards
            return 0;
        }
        fill();
        int skipped = (int) Math.min(n, (long) available());
        position += skipped;
        return skipped;
    }

    /**
     * Returns the number of buffered bytes that have not been consumed yet.
     *
     * @return number of bytes readable without touching the underlying stream
     */
    @Override
    public int available() {
        return buffered - position;
    }

    /**
     * Marking is always supported, it is handled inside the buffer.
     *
     * @return always <code>true</code>
     */
    @Override
    public boolean markSupported() {
        return true;
    }

    /**
     * Remembers the current read position of this wrapper. The mark of
     * the underlying stream is not touched.
     *
     * @param readlimit ignored
     */
    @Override
    public synchronized void mark(int readlimit) {
        mark = position;
    }

    /**
     * Moves the read position back to the last marked position, or to the
     * start of the lookahead window when mark() was never called.
     */
    @Override
    public synchronized void reset() {
        position = mark;
    }

}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.io;
17
18 import java.io.EOFException;
19 import java.io.IOException;
20 import java.io.InputStream;
21
/**
 * A functional, light weight {@link InputStream} that emulates
 * a stream of a specified size.
 * <p>
 * This implementation provides a light weight
 * object for testing with an {@link InputStream}
 * where the contents don't matter.
 * <p>
 * One use case would be for testing the handling of
 * large {@link InputStream} as it can emulate that
 * scenario without the overhead of actually processing
 * large numbers of bytes - significantly speeding up
 * test execution times.
 * <p>
 * This implementation returns zero from the method that
 * reads a byte and leaves the array unchanged in the read
 * methods that are passed a byte array.
 * If alternative data is required the <code>processByte()</code> and
 * <code>processBytes()</code> methods can be implemented to generate
 * data, for example:
 *
 * <pre>
 *  public class TestInputStream extends NullInputStream {
 *      public TestInputStream(int size) {
 *          super(size);
 *      }
 *      protected int processByte() {
 *          return ... // return required value here
 *      }
 *      protected void processBytes(byte[] bytes, int offset, int length) {
 *          for (int i = offset; i &lt; offset + length; i++) {
 *              bytes[i] = ... // set array value here
 *          }
 *      }
 *  }
 * </pre>
 *
 * @since Apache Tika 0.4, copied from Commons IO 1.4
 */
public class NullInputStream extends InputStream {

    /** Emulated length of the stream. */
    private final long size;
    /** Number of emulated bytes consumed so far. */
    private long position;
    /** Marked position, or -1 when no position is marked. */
    private long mark = -1;
    /** Read limit given to the last mark() call. */
    private long readlimit;
    /** Set once the end of the emulated stream has been reported. */
    private boolean eof;
    private final boolean throwEofException;
    private final boolean markSupported;

    /**
     * Create an {@link InputStream} that emulates a specified size
     * which supports marking and does not throw EOFException.
     *
     * @param size The size of the input stream to emulate.
     */
    public NullInputStream(long size) {
       this(size, true, false);
    }

    /**
     * Create an {@link InputStream} that emulates a specified
     * size with option settings.
     *
     * @param size The size of the input stream to emulate.
     * @param markSupported Whether this instance will support
     * the <code>mark()</code> functionality.
     * @param throwEofException Whether this implementation
     * will throw an {@link EOFException} or return -1 when the
     * end of file is reached.
     */
    public NullInputStream(long size, boolean markSupported, boolean throwEofException) {
       this.size = size;
       this.markSupported = markSupported;
       this.throwEofException = throwEofException;
    }

    /**
     * Return the current position.
     *
     * @return the current position.
     */
    public long getPosition() {
        return position;
    }

    /**
     * Return the size this {@link InputStream} emulates.
     *
     * @return The size of the input stream to emulate.
     */
    public long getSize() {
        return size;
    }

    /**
     * Return the number of bytes that can be read.
     *
     * @return The number of bytes that can be read, capped at
     * <code>Integer.MAX_VALUE</code>.
     */
    @Override
    public int available() {
        long avail = size - position;
        if (avail <= 0) {
            return 0;
        } else if (avail > Integer.MAX_VALUE) {
            return Integer.MAX_VALUE;
        } else {
            return (int)avail;
        }
    }

    /**
     * Close this input stream - resets the internal state to
     * the initial values.
     *
     * @throws IOException If an error occurs.
     */
    @Override
    public void close() throws IOException {
        eof = false;
        position = 0;
        mark = -1;
    }

    /**
     * Mark the current position.
     *
     * @param readlimit The number of bytes before this marked position
     * is invalid.
     * @throws UnsupportedOperationException if mark is not supported.
     */
    @Override
    public synchronized void mark(int readlimit) {
        if (!markSupported) {
            throw new UnsupportedOperationException("Mark not supported");
        }
        mark = position;
        this.readlimit = readlimit;
    }

    /**
     * Indicates whether <i>mark</i> is supported.
     *
     * @return Whether <i>mark</i> is supported or not.
     */
    @Override
    public boolean markSupported() {
        return markSupported;
    }

    /**
     * Read a byte.
     *
     * @return Either The byte value returned by <code>processByte()</code>
     * or <code>-1</code> if the end of file has been reached and
     * <code>throwEofException</code> is set to <code>false</code>.
     * @throws EOFException if the end of file is reached and
     * <code>throwEofException</code> is set to <code>true</code>.
     * @throws IOException if trying to read past the end of file.
     */
    @Override
    public int read() throws IOException {
        if (eof) {
            throw new IOException("Read after end of file");
        }
        if (position == size) {
            return doEndOfFile();
        }
        position++;
        return processByte();
    }

    /**
     * Read some bytes into the specified array.
     *
     * @param bytes The byte array to read into
     * @return The number of bytes read or <code>-1</code>
     * if the end of file has been reached and
     * <code>throwEofException</code> is set to <code>false</code>.
     * @throws EOFException if the end of file is reached and
     * <code>throwEofException</code> is set to <code>true</code>.
     * @throws IOException if trying to read past the end of file.
     */
    @Override
    public int read(byte[] bytes) throws IOException {
        return read(bytes, 0, bytes.length);
    }

    /**
     * Read the specified number bytes into an array.
     *
     * @param bytes The byte array to read into.
     * @param offset The offset to start reading bytes into.
     * @param length The number of bytes to read.
     * @return The number of bytes read or <code>-1</code>
     * if the end of file has been reached and
     * <code>throwEofException</code> is set to <code>false</code>.
     * @throws EOFException if the end of file is reached and
     * <code>throwEofException</code> is set to <code>true</code>.
     * @throws IOException if trying to read past the end of file.
     */
    @Override
    public int read(byte[] bytes, int offset, int length) throws IOException {
        if (eof) {
            throw new IOException("Read after end of file");
        }
        if (position == size) {
            return doEndOfFile();
        }
        // Clamp against the remaining bytes instead of advancing first and
        // correcting afterwards, so the position can never overflow.
        long remaining = size - position;
        int returnLength = (int) Math.min((long) length, remaining);
        position += returnLength;
        processBytes(bytes, offset, returnLength);
        return returnLength;
    }

    /**
     * Reset the stream to the point when mark was last called.
     *
     * @throws UnsupportedOperationException if mark is not supported.
     * @throws IOException If no position has been marked
     * or the read limit has been exceed since the last position was
     * marked.
     */
    @Override
    public synchronized void reset() throws IOException {
        if (!markSupported) {
            throw new UnsupportedOperationException("Mark not supported");
        }
        if (mark < 0) {
            throw new IOException("No position has been marked");
        }
        if (position > (mark + readlimit)) {
            throw new IOException("Marked position [" + mark +
                    "] is no longer valid - passed the read limit [" +
                    readlimit + "]");
        }
        position = mark;
        eof = false;
    }

    /**
     * Skip a specified number of bytes.
     * <p>
     * A zero or negative count skips nothing and returns 0. A count
     * larger than the number of remaining bytes (including
     * <code>Long.MAX_VALUE</code>) skips to the end of the stream.
     *
     * @param numberOfBytes The number of bytes to skip.
     * @return The number of bytes skipped or <code>-1</code>
     * if the end of file has been reached and
     * <code>throwEofException</code> is set to <code>false</code>.
     * @throws EOFException if the end of file is reached and
     * <code>throwEofException</code> is set to <code>true</code>.
     * @throws IOException if trying to read past the end of file.
     */
    @Override
    public long skip(long numberOfBytes) throws IOException {
        if (eof) {
            throw new IOException("Skip after end of file");
        }
        if (position == size) {
            return doEndOfFile();
        }
        if (numberOfBytes <= 0) {
            // never move the position backwards
            return 0;
        }
        // Clamp before advancing: position + numberOfBytes may overflow.
        long remaining = size - position;
        long returnLength = Math.min(numberOfBytes, remaining);
        position += returnLength;
        return returnLength;
    }

    /**
     * Return a byte value for the  <code>read()</code> method.
     * <p>
     * This implementation returns zero.
     *
     * @return This implementation always returns zero.
     */
    protected int processByte() {
        // do nothing - overridable by subclass
        return 0;
    }

    /**
     * Process the bytes for the <code>read(byte[], offset, length)</code>
     * method.
     * <p>
     * This implementation leaves the byte array unchanged.
     *
     * @param bytes The byte array
     * @param offset The offset to start at.
     * @param length The number of bytes.
     */
    protected void processBytes(byte[] bytes, int offset, int length) {
        // do nothing - overridable by subclass
    }

    /**
     * Handle End of File.
     *
     * @return <code>-1</code> if <code>throwEofException</code> is
     * set to <code>false</code>
     * @throws EOFException if <code>throwEofException</code> is set
     * to <code>true</code>.
     */
    private int doEndOfFile() throws EOFException {
        eof = true;
        if (throwEofException) {
            throw new EOFException();
        }
        return -1;
    }

}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.io;
17
18 import java.io.IOException;
19 import java.io.OutputStream;
20
/**
 * An <code>OutputStream</code> without a destination, comparable to the
 * famous <b>/dev/null</b>.
 * <p>
 * Nothing is stored or forwarded (no file, socket etc.): every byte
 * handed to any of the write methods is ignored and lost.
 *
 * @author Jeremias Maerki
 * @since Apache Tika 0.4, copied from Commons IO 1.4
 */
public class NullOutputStream extends OutputStream {

    /**
     * Shared instance for callers that do not need their own stream.
     */
    public static final NullOutputStream NULL_OUTPUT_STREAM = new NullOutputStream();

    /**
     * Discards a single byte.
     *
     * @param b The byte to write
     */
    @Override
    public void write(int b) {
        // intentionally empty: the byte is dropped
    }

    /**
     * Discards a whole byte array.
     *
     * @param b The bytes to write
     * @throws IOException never
     */
    @Override
    public void write(byte[] b) throws IOException {
        // intentionally empty: the bytes are dropped
    }

    /**
     * Discards a section of a byte array. The arguments are not inspected.
     *
     * @param b The bytes to write
     * @param off The start offset
     * @param len The number of bytes to write
     */
    @Override
    public void write(byte[] b, int off, int len) {
        // intentionally empty: the bytes are dropped
    }

}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.io;
17
18 import java.io.FilterInputStream;
19 import java.io.IOException;
20 import java.io.InputStream;
21
22 /**
23 * A Proxy stream which acts as expected, that is it passes the method
24 * calls on to the proxied stream and doesn't change which methods are
25 * being called.
26 * <p>
27 * It is an alternative base class to FilterInputStream
28 * to increase reusability, because FilterInputStream changes the
29 * methods being called, such as read(byte[]) to read(byte[], int, int).
30 * <p>
31 * See the protected methods for ways in which a subclass can easily decorate
32 * a stream with custom pre-, post- or error processing functionality.
33 *
34 * @author Stephen Colebourne
35 * @version $Id: ProxyInputStream.java 934061 2010-04-14 17:56:37Z jukka $
36 */
37 public abstract class ProxyInputStream extends FilterInputStream {
38
/**
 * Creates a proxy that forwards its calls to the given stream.
 *
 * @param proxy  the InputStream to delegate to
 */
public ProxyInputStream(InputStream proxy) {
    // the superclass keeps the delegate in its protected field 'in'
    super(proxy);
}
48
49 /**
50 * Invokes the delegate's <code>read()</code> method.
51 * @return the byte read or -1 if the end of stream
52 * @throws IOException if an I/O error occurs
53 */
54 @Override
55 public int read() throws IOException {
56 try {
57 beforeRead(1);
58 int b = in.read();
59 afterRead(b != -1 ? 1 : -1);
60 return b;
61 } catch (IOException e) {
62 handleIOException(e);
63 return -1;
64 }
65 }
66
67 /**
68 * Invokes the delegate's <code>read(byte[])</code> method.
69 * @param bts the buffer to read the bytes into
70 * @return the number of bytes read or -1 if the end of stream
71 * @throws IOException if an I/O error occurs
72 */
73 @Override
74 public int read(byte[] bts) throws IOException {
75 try {
76 beforeRead(bts.length);
77 int n = in.read(bts);
78 afterRead(n);
79 return n;
80 } catch (IOException e) {
81 handleIOException(e);
82 return -1;
83 }
84 }
85
86 /**
87 * Invokes the delegate's <code>read(byte[], int, int)</code> method.
88 * @param bts the buffer to read the bytes into
89 * @param off The start offset
90 * @param len The number of bytes to read
91 * @return the number of bytes read or -1 if the end of stream
92 * @throws IOException if an I/O error occurs
93 */
94 @Override
95 public int read(byte[] bts, int off, int len) throws IOException {
96 try {
97 beforeRead(len);
98 int n = in.read(bts, off, len);
99 afterRead(n);
100 return n;
101 } catch (IOException e) {
102 handleIOException(e);
103 return -1;
104 }
105 }
106
107 /**
108 * Invokes the delegate's <code>skip(long)</code> method.
109 * @param ln the number of bytes to skip
110 * @return the actual number of bytes skipped
111 * @throws IOException if an I/O error occurs
112 */
113 @Override
114 public long skip(long ln) throws IOException {
115 try {
116 return in.skip(ln);
117 } catch (IOException e) {
118 handleIOException(e);
119 return 0;
120 }
121 }
122
123 /**
124 * Invokes the delegate's <code>available()</code> method.
125 * @return the number of available bytes
126 * @throws IOException if an I/O error occurs
127 */
128 @Override
129 public int available() throws IOException {
130 try {
131 return super.available();
132 } catch (IOException e) {
133 handleIOException(e);
134 return 0;
135 }
136 }
137
138 /**
139 * Invokes the delegate's <code>close()</code> method.
140 * @throws IOException if an I/O error occurs
141 */
142 @Override
143 public void close() throws IOException {
144 try {
145 in.close();
146 } catch (IOException e) {
147 handleIOException(e);
148 }
149 }
150
151 /**
152 * Invokes the delegate's <code>mark(int)</code> method.
153 * @param readlimit read ahead limit
154 */
155 @Override
156 public synchronized void mark(int readlimit) {
157 in.mark(readlimit);
158 }
159
160 /**
161 * Invokes the delegate's <code>reset()</code> method.
162 * @throws IOException if an I/O error occurs
163 */
164 @Override
165 public synchronized void reset() throws IOException {
166 try {
167 in.reset();
168 } catch (IOException e) {
169 handleIOException(e);
170 }
171 }
172
173 /**
174 * Invokes the delegate's <code>markSupported()</code> method.
175 * @return true if mark is supported, otherwise false
176 */
177 @Override
178 public boolean markSupported() {
179 return in.markSupported();
180 }
181
182 /**
183 * Invoked by the read methods before the call is proxied. The number
184 * of bytes that the caller wanted to read (1 for the {@link #read()}
185 * method, buffer length for {@link #read(byte[])}, etc.) is given as
186 * an argument.
187 * <p>
188 * Subclasses can override this method to add common pre-processing
189 * functionality without having to override all the read methods.
190 * The default implementation does nothing.
191 * <p>
192 * Note this method is <em>not</em> called from {@link #skip(long)} or
193 * {@link #reset()}. You need to explicitly override those methods if
194 * you want to add pre-processing steps also to them.
195 *
196 * @since Commons IO 2.0
197 * @param n number of bytes that the caller asked to be read
198 * @throws IOException if the pre-processing fails
199 */
200 protected void beforeRead(int n) throws IOException {
201 }
202
203 /**
204 * Invoked by the read methods after the proxied call has returned
205 * successfully. The number of bytes returned to the caller (or -1 if
206 * the end of stream was reached) is given as an argument.
207 * <p>
208 * Subclasses can override this method to add common post-processing
209 * functionality without having to override all the read methods.
210 * The default implementation does nothing.
211 * <p>
212 * Note this method is <em>not</em> called from {@link #skip(long)} or
213 * {@link #reset()}. You need to explicitly override those methods if
214 * you want to add post-processing steps also to them.
215 *
216 * @since Commons IO 2.0
217 * @param n number of bytes read, or -1 if the end of stream was reached
218 * @throws IOException if the post-processing fails
219 */
220 protected void afterRead(int n) throws IOException {
221 }
222
223 /**
224 * Handle any IOExceptions thrown.
225 * <p>
226 * This method provides a point to implement custom exception
227 * handling. The default behaviour is to re-throw the exception.
228 * @param e The IOException thrown
229 * @throws IOException if an I/O error occurs
230 * @since Commons IO 2.0
231 */
232 protected void handleIOException(IOException e) throws IOException {
233 throw e;
234 }
235
236 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.io;
17
18 import java.io.IOException;
19
20 /**
21 * An {@link IOException} wrapper that tags the wrapped exception with
22 * a given object reference. Both the tag and the wrapped original exception
23 * can be used to determine further processing when this exception is caught.
24 */
25 public class TaggedIOException extends IOExceptionWithCause {
26
27 /**
28 * The object reference used to tag the exception.
29 */
30 private final Object tag;
31
32 /**
33 * Creates a tagged wrapper for the given exception.
34 *
35 * @param original the exception to be tagged
36 * @param tag tag object
37 */
38 public TaggedIOException(IOException original, Object tag) {
39 super(original.getMessage(), original);
40 this.tag = tag;
41 }
42
43 /**
44 * Returns the object reference used as the tag this exception.
45 *
46 * @return tag object
47 */
48 public Object getTag() {
49 return tag;
50 }
51
52 /**
53 * Returns the wrapped exception. The only difference to the overridden
54 * {@link Throwable#getCause()} method is the narrower return type.
55 *
56 * @return wrapped exception
57 */
58 @Override
59 public IOException getCause() {
60 return (IOException) super.getCause();
61 }
62
63 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.io;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.io.Serializable;
21 import java.util.UUID;
22
23 /**
24 * An input stream decorator that tags potential exceptions so that the
25 * stream that caused the exception can easily be identified. This is
26 * done by using the {@link TaggedIOException} class to wrap all thrown
27 * {@link IOException}s. See below for an example of using this class.
28 * <pre>
29 * TaggedInputStream stream = new TaggedInputStream(...);
30 * try {
31 * // Processing that may throw an IOException either from this stream
32 * // or from some other IO activity like temporary files, etc.
33 * processStream(stream);
34 * } catch (IOException e) {
35 * if (stream.isCauseOf(e)) {
36 * // The exception was caused by this stream.
37 * // Use e.getCause() to get the original exception.
38 * } else {
39 * // The exception was caused by something else.
40 * }
41 * }
42 * </pre>
43 * <p>
44 * Alternatively, the {@link #throwIfCauseOf(Exception)} method can be
45 * used to let higher levels of code handle the exception caused by this
46 * stream while other processing errors are being taken care of at this
47 * lower level.
48 * <pre>
49 * TaggedInputStream stream = new TaggedInputStream(...);
50 * try {
51 * processStream(stream);
52 * } catch (IOException e) {
53 * stream.throwIfCauseOf(e);
54 * // ... or process the exception that was caused by something else
55 * }
56 * </pre>
57 *
58 * @see TaggedIOException
59 */
60 public class TaggedInputStream extends ProxyInputStream {
61
62 /**
63 * The unique (serializable) tag of this stream.
64 */
65 private final Serializable tag = UUID.randomUUID();
66
67 /**
68 * Creates a tagging decorator for the given input stream.
69 *
70 * @param proxy input stream to be decorated
71 */
72 public TaggedInputStream(InputStream proxy) {
73 super(proxy);
74 }
75
76 /**
77 * Casts or wraps the given stream to a TaggedInputStream instance.
78 *
79 * @param stream normal input stream
80 * @return a TaggedInputStream instance
81 */
82 public static TaggedInputStream get(InputStream proxy) {
83 if(proxy instanceof TaggedInputStream) {
84 return (TaggedInputStream)proxy;
85 }
86 return new TaggedInputStream(proxy);
87 }
88
89 /**
90 * Tests if the given exception was caused by this stream.
91 *
92 * @param exception an exception
93 * @return <code>true</code> if the exception was thrown by this stream,
94 * <code>false</code> otherwise
95 */
96 public boolean isCauseOf(IOException exception) {
97 if (exception instanceof TaggedIOException) {
98 TaggedIOException tagged = (TaggedIOException) exception;
99 return tag.equals(tagged.getTag());
100 } else {
101 return false;
102 }
103 }
104
105 /**
106 * Re-throws the original exception thrown by this stream. This method
107 * first checks whether the given exception is a {@link TaggedIOException}
108 * wrapper created by this decorator, and then unwraps and throws the
109 * original wrapped exception. Returns normally if the exception was
110 * not thrown by this stream.
111 *
112 * @param exception an exception
113 * @throws IOException original exception, if any, thrown by this stream
114 */
115 public void throwIfCauseOf(Exception exception) throws IOException {
116 if (exception instanceof TaggedIOException) {
117 TaggedIOException tagged = (TaggedIOException) exception;
118 if (tag.equals(tagged.getTag())) {
119 throw tagged.getCause();
120 }
121 }
122 }
123
124 /**
125 * Tags any IOExceptions thrown, wrapping and re-throwing.
126 *
127 * @param e The IOException thrown
128 * @throws IOException if an I/O error occurs
129 */
130 @Override
131 protected void handleIOException(IOException e) throws IOException {
132 throw new TaggedIOException(e, tag);
133 }
134
135 public String toString() {
136 return "Tika Tagged InputStream wrapping " + in;
137 }
138 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.io;
17
18 import java.io.FilterInputStream;
19 import java.io.IOException;
20 import java.io.InputStream;
21
/**
 * <p>
 * A specialized input stream implementation which records the last portion read
 * from an underlying stream.
 * </p>
 * <p>
 * This stream implementation is useful to deal with information which is known
 * to be located at the end of a stream (e.g. ID3 v1 tags). While reading bytes
 * from the underlying stream, a given number of bytes is kept in an internal
 * buffer. This buffer can then be queried after the whole stream was read. It
 * contains the last bytes read from the original input stream.
 * </p>
 * <p>
 * The tail is kept in a fixed-size circular buffer: {@code currentIndex} is
 * the position at which the next byte will be written, which is also the
 * position of the oldest byte once the buffer has wrapped around.
 * </p>
 */
public class TailStream extends FilterInputStream
{
    /** Constant for the default skip buffer size. */
    private static final int SKIP_SIZE = 4096;

    /** The buffer in which the tail data is stored. */
    private final byte[] tailBuffer;

    /** The size of the internal tail buffer. */
    private final int tailSize;

    /** A copy of the internal tail buffer used for mark() operations. */
    private byte[] markBuffer;

    /** The number of bytes that have been read so far. */
    private long bytesRead;

    /** The number of bytes read at the last mark() operation. */
    private long markBytesRead;

    /** The current index into the tail buffer. */
    private int currentIndex;

    /** A copy of the current index used for mark() operations. */
    private int markIndex;

    /**
     * Creates a new instance of {@code TailStream}.
     *
     * @param in the underlying input stream
     * @param size the size of the tail buffer
     */
    public TailStream(InputStream in, int size)
    {
        super(in);
        tailSize = size;
        tailBuffer = new byte[size];
    }

    /**
     * {@inheritDoc} This implementation adds the read byte to the internal tail
     * buffer.
     */
    @Override
    public int read() throws IOException
    {
        int c = super.read();
        if (c != -1)
        {
            appendByte((byte) c);
        }
        return c;
    }

    /**
     * {@inheritDoc} This implementation forwards to
     * {@link #read(byte[], int, int)}, which records the data in the tail
     * buffer.
     */
    @Override
    public int read(byte[] buf) throws IOException
    {
        // FilterInputStream.read(byte[]) dispatches to the overridden
        // read(byte[], int, int), which already appends to the tail buffer.
        // Appending here as well would record every chunk twice.
        return read(buf, 0, buf.length);
    }

    /**
     * {@inheritDoc} This implementation delegates to the underlying stream and
     * then adds the correct portion of the read buffer to the internal tail
     * buffer.
     */
    @Override
    public int read(byte[] buf, int ofs, int length) throws IOException
    {
        int read = super.read(buf, ofs, length);
        if (read > 0)
        {
            appendBuf(buf, ofs, read);
        }
        return read;
    }

    /**
     * {@inheritDoc} This implementation delegates to the {@code read()} method
     * to ensure that the tail buffer is also filled if data is skipped. A
     * non-positive argument skips nothing and yields 0. Unlike the general
     * contract, -1 is returned if the end of the stream was hit before any
     * byte could be skipped.
     */
    @Override
    public long skip(long n) throws IOException
    {
        if (n <= 0)
        {
            // Nothing to skip; also avoids allocating a negative-size array.
            return 0;
        }

        int bufSize = (int) Math.min(n, SKIP_SIZE);
        byte[] buf = new byte[bufSize];
        long bytesSkipped = 0;
        int lastRead = 0;

        while (bytesSkipped < n && lastRead != -1)
        {
            int len = (int) Math.min(bufSize, n - bytesSkipped);
            lastRead = read(buf, 0, len);
            if (lastRead != -1)
            {
                bytesSkipped += lastRead;
            }
        }

        return (lastRead < 0 && bytesSkipped == 0) ? -1 : bytesSkipped;
    }

    /**
     * {@inheritDoc} This implementation saves the internal state including the
     * content of the tail buffer so that it can be restored when ''reset()'' is
     * called later.
     * <p>
     * NOTE(review): only the tail bookkeeping is saved here; this method
     * does not call {@code mark} on the underlying stream.
     */
    @Override
    public void mark(int limit)
    {
        markBuffer = new byte[tailSize];
        System.arraycopy(tailBuffer, 0, markBuffer, 0, tailSize);
        markIndex = currentIndex;
        markBytesRead = bytesRead;
    }

    /**
     * {@inheritDoc} This implementation restores this stream's state to the
     * state when ''mark()'' was called the last time. If ''mark()'' has not
     * been called before, this method has no effect.
     * <p>
     * NOTE(review): only the tail bookkeeping is restored here; this method
     * does not call {@code reset} on the underlying stream.
     */
    @Override
    public void reset()
    {
        if (markBuffer != null)
        {
            System.arraycopy(markBuffer, 0, tailBuffer, 0, tailSize);
            currentIndex = markIndex;
            bytesRead = markBytesRead;
        }
    }

    /**
     * Returns an array with the last data read from the underlying stream. If
     * the underlying stream contained more data than the ''tailSize''
     * constructor argument, the returned array has a length of ''tailSize''.
     * Otherwise, its length equals the number of bytes read.
     *
     * @return an array with the last data read from the underlying stream
     */
    public byte[] getTail()
    {
        int size = (int) Math.min(tailSize, bytesRead);
        byte[] result = new byte[size];
        // Oldest bytes sit between currentIndex and the end of the buffer
        // (none if the buffer has not wrapped yet) ...
        System.arraycopy(tailBuffer, currentIndex, result, 0, size
                - currentIndex);
        // ... followed by the newest bytes at the start of the buffer.
        System.arraycopy(tailBuffer, 0, result, size - currentIndex,
                currentIndex);
        return result;
    }

    /**
     * Adds the given byte to the internal tail buffer.
     *
     * @param b the byte to be added
     */
    private void appendByte(byte b)
    {
        // A zero-sized tail buffer cannot hold anything; only count the byte.
        if (tailSize > 0)
        {
            tailBuffer[currentIndex++] = b;
            if (currentIndex >= tailSize)
            {
                currentIndex = 0;
            }
        }
        bytesRead++;
    }

    /**
     * Adds the content of the given buffer to the internal tail buffer.
     *
     * @param buf the buffer
     * @param ofs the start offset in the buffer
     * @param length the number of bytes to be copied
     */
    private void appendBuf(byte[] buf, int ofs, int length)
    {
        if (length >= tailSize)
        {
            replaceTailBuffer(buf, ofs, length);
        }
        else
        {
            copyToTailBuffer(buf, ofs, length);
        }

        bytesRead += length;
    }

    /**
     * Replaces the content of the internal tail buffer by the last portion of
     * the given buffer. This method is called if a buffer was read from the
     * underlying stream whose length is at least the size of the tail buffer.
     *
     * @param buf the buffer
     * @param ofs the start offset in the buffer
     * @param length the number of bytes to be copied
     */
    private void replaceTailBuffer(byte[] buf, int ofs, int length)
    {
        System.arraycopy(buf, ofs + length - tailSize, tailBuffer, 0, tailSize);
        currentIndex = 0;
    }

    /**
     * Copies the given buffer into the internal tail buffer at the current
     * position. This method is called if a buffer is read from the underlying
     * stream whose length is smaller than the tail buffer. In this case the
     * tail buffer is only partly overwritten.
     *
     * @param buf the buffer
     * @param ofs the start offset in the buffer
     * @param length the number of bytes to be copied
     */
    private void copyToTailBuffer(byte[] buf, int ofs, int length)
    {
        int remaining = tailSize - currentIndex;
        // First part fills the buffer up to its end, the rest wraps around.
        int size1 = Math.min(remaining, length);
        System.arraycopy(buf, ofs, tailBuffer, currentIndex, size1);
        System.arraycopy(buf, ofs + size1, tailBuffer, 0, length - size1);
        currentIndex = (currentIndex + length) % tailSize;
    }
}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.io;
17
18 import java.io.Closeable;
19 import java.io.File;
20 import java.io.IOException;
21 import java.util.LinkedList;
22 import java.util.List;
23
24 import org.apache.tika.exception.TikaException;
25
26 /**
27 * Utility class for tracking and ultimately closing or otherwise disposing
28 * a collection of temporary resources.
29 * <p>
30 * Note that this class is not thread-safe.
31 *
32 * @since Apache Tika 0.10
33 */
34 public class TemporaryResources implements Closeable {
35
36 /**
37 * Tracked resources in LIFO order.
38 */
39 private final LinkedList<Closeable> resources = new LinkedList<Closeable>();
40
41 /**
42 * Directory for temporary files, <code>null</code> for the system default.
43 */
44 private File tmp = null;
45
46 /**
47 * Sets the directory to be used for the temporary files created by
48 * the {@link #createTemporaryFile()} method.
49 *
50 * @param tmp temporary file directory,
51 * or <code>null</code> for the system default
52 */
53 public void setTemporaryFileDirectory(File tmp) {
54 this.tmp = tmp;
55 }
56
57 /**
58 * Creates and returns a temporary file that will automatically be
59 * deleted when the {@link #close()} method is called.
60 *
61 * @return
62 * @throws IOException
63 */
64 public File createTemporaryFile() throws IOException {
65 final File file = File.createTempFile("apache-tika-", ".tmp", tmp);
66 addResource(new Closeable() {
67 public void close() throws IOException {
68 if (!file.delete()) {
69 throw new IOException(
70 "Could not delete temporary file "
71 + file.getPath());
72 }
73 }
74 });
75 return file;
76 }
77
78 /**
79 * Adds a new resource to the set of tracked resources that will all be
80 * closed when the {@link #close()} method is called.
81 *
82 * @param resource resource to be tracked
83 */
84 public void addResource(Closeable resource) {
85 resources.addFirst(resource);
86 }
87
88 /**
89 * Returns the latest of the tracked resources that implements or
90 * extends the given interface or class.
91 *
92 * @param klass interface or class
93 * @return matching resource, or <code>null</code> if not found
94 */
95 @SuppressWarnings("unchecked")
96 public <T extends Closeable> T getResource(Class<T> klass) {
97 for (Closeable resource : resources) {
98 if (klass.isAssignableFrom(resource.getClass())) {
99 return (T) resource;
100 }
101 }
102 return null;
103 }
104
105 /**
106 * Closes all tracked resources. The resources are closed in reverse order
107 * from how they were added.
108 * <p>
109 * Any thrown exceptions from managed resources are collected and
110 * then re-thrown only once all the resources have been closed.
111 *
112 * @throws IOException if one or more of the tracked resources
113 * could not be closed
114 */
115 public void close() throws IOException {
116 // Release all resources and keep track of any exceptions
117 List<IOException> exceptions = new LinkedList<IOException>();
118 for (Closeable resource : resources) {
119 try {
120 resource.close();
121 } catch (IOException e) {
122 exceptions.add(e);
123 }
124 }
125 resources.clear();
126
127 // Throw any exceptions that were captured from above
128 if (!exceptions.isEmpty()) {
129 if (exceptions.size() == 1) {
130 throw exceptions.get(0);
131 } else {
132 throw new IOExceptionWithCause(
133 "Multiple IOExceptions" + exceptions,
134 exceptions.get(0));
135 }
136 }
137 }
138
139 /**
140 * Calls the {@link #close()} method and wraps the potential
141 * {@link IOException} into a {@link TikaException} for convenience
142 * when used within Tika.
143 *
144 * @throws TikaException if one or more of the tracked resources
145 * could not be closed
146 */
147 public void dispose() throws TikaException {
148 try {
149 close();
150 } catch (IOException e) {
151 throw new TikaException("Failed to close temporary resources", e);
152 }
153 }
154
155 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.io;
17
18 import java.io.BufferedInputStream;
19 import java.io.ByteArrayInputStream;
20 import java.io.Closeable;
21 import java.io.File;
22 import java.io.FileInputStream;
23 import java.io.FileNotFoundException;
24 import java.io.FileOutputStream;
25 import java.io.IOException;
26 import java.io.InputStream;
27 import java.io.OutputStream;
28 import java.net.URI;
29 import java.net.URISyntaxException;
30 import java.net.URL;
31 import java.net.URLConnection;
32 import java.nio.channels.FileChannel;
33 import java.sql.Blob;
34 import java.sql.SQLException;
35
36 import org.apache.tika.metadata.Metadata;
37
38 /**
39 * Input stream with extended capabilities. The purpose of this class is
40 * to allow files and other resources and information to be associated with
41 * the {@link InputStream} instance passed through the
42 * {@link org.apache.tika.parser.Parser} interface and other similar APIs.
43 * <p>
44 * TikaInputStream instances can be created using the various static
45 * <code>get()</code> factory methods. Most of these methods take an optional
46 * {@link Metadata} argument that is then filled with the available input
47 * metadata from the given resource. The created TikaInputStream instance
48 * keeps track of the original resource used to create it, while behaving
49 * otherwise just like a normal, buffered {@link InputStream}.
50 * A TikaInputStream instance is also guaranteed to support the
51 * {@link #mark(int)} feature.
52 * <p>
53 * Code that wants to access the underlying file or other resources
54 * associated with a TikaInputStream should first use the
55 * {@link #get(InputStream)} factory method to cast or wrap a given
56 * {@link InputStream} into a TikaInputStream instance.
57 *
58 * @since Apache Tika 0.8
59 */
60 public class TikaInputStream extends TaggedInputStream {
61
62 /**
63 * Checks whether the given stream is a TikaInputStream instance.
64 * The given stream can be <code>null</code>, in which case the return
65 * value is <code>false</code>.
66 *
67 * @param stream input stream, possibly <code>null</code>
68 * @return <code>true</code> if the stream is a TikaInputStream instance,
69 * <code>false</code> otherwise
70 */
71 public static boolean isTikaInputStream(InputStream stream) {
72 return stream instanceof TikaInputStream;
73 }
74
75 /**
76 * Casts or wraps the given stream to a TikaInputStream instance.
77 * This method can be used to access the functionality of this class
78 * even when given just a normal input stream instance.
79 * <p>
80 * The given temporary file provider is used for any temporary files,
81 * and should be disposed when the returned stream is no longer used.
82 * <p>
83 * Use this method instead of the {@link #get(InputStream)} alternative
84 * when you <em>don't</em> explicitly close the returned stream. The
85 * recommended access pattern is:
86 * <pre>
87 * TemporaryResources tmp = new TemporaryResources();
88 * try {
89 * TikaInputStream stream = TikaInputStream.get(..., tmp);
90 * // process stream but don't close it
91 * } finally {
92 * tmp.close();
93 * }
94 * </pre>
95 * <p>
96 * The given stream instance will <em>not</em> be closed when the
97 * {@link TemporaryResources#close()} method is called. The caller
98 * is expected to explicitly close the original stream when it's no
99 * longer used.
100 *
101 * @since Apache Tika 0.10
102 * @param stream normal input stream
103 * @return a TikaInputStream instance
104 */
105 public static TikaInputStream get(
106 InputStream stream, TemporaryResources tmp) {
107 if (stream == null) {
108 throw new NullPointerException("The Stream must not be null");
109 }
110 if (stream instanceof TikaInputStream) {
111 return (TikaInputStream) stream;
112 } else {
113 // Make sure that the stream is buffered and that it
114 // (properly) supports the mark feature
115 if (!(stream instanceof BufferedInputStream)
116 && !(stream instanceof ByteArrayInputStream)) {
117 stream = new BufferedInputStream(stream);
118 }
119 return new TikaInputStream(stream, tmp, -1);
120 }
121 }
122
123 /**
124 * Casts or wraps the given stream to a TikaInputStream instance.
125 * This method can be used to access the functionality of this class
126 * even when given just a normal input stream instance.
127 * <p>
128 * Use this method instead of the
129 * {@link #get(InputStream, TemporaryResources)} alternative when you
130 * <em>do</em> explicitly close the returned stream. The recommended
131 * access pattern is:
132 * <pre>
133 * TikaInputStream stream = TikaInputStream.get(...);
134 * try {
135 * // process stream
136 * } finally {
137 * stream.close();
138 * }
139 * </pre>
140 * <p>
141 * The given stream instance will be closed along with any other resources
142 * associated with the returned TikaInputStream instance when the
143 * {@link #close()} method is called.
144 *
145 * @param stream normal input stream
146 * @return a TikaInputStream instance
147 */
148 public static TikaInputStream get(InputStream stream) {
149 return get(stream, new TemporaryResources());
150 }
151
152 /**
153 * Returns the given stream casts to a TikaInputStream, or
154 * <code>null</code> if the stream is not a TikaInputStream.
155 *
156 * @since Apache Tika 0.10
157 * @param stream normal input stream
158 * @return a TikaInputStream instance
159 */
160 public static TikaInputStream cast(InputStream stream) {
161 if (stream instanceof TikaInputStream) {
162 return (TikaInputStream) stream;
163 } else {
164 return null;
165 }
166 }
167
168 /**
169 * Creates a TikaInputStream from the given array of bytes.
170 * <p>
171 * Note that you must always explicitly close the returned stream as in
172 * some cases it may end up writing the given data to a temporary file.
173 *
174 * @param data input data
175 * @return a TikaInputStream instance
176 */
177 public static TikaInputStream get(byte[] data) {
178 return get(data, new Metadata());
179 }
180
181 /**
182 * Creates a TikaInputStream from the given array of bytes. The length of
183 * the array is stored as input metadata in the given metadata instance.
184 * <p>
185 * Note that you must always explicitly close the returned stream as in
186 * some cases it may end up writing the given data to a temporary file.
187 *
188 * @param data input data
189 * @param metadata metadata instance
190 * @return a TikaInputStream instance
191 * @throws IOException
192 */
193 public static TikaInputStream get(byte[] data, Metadata metadata) {
194 metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(data.length));
195 return new TikaInputStream(
196 new ByteArrayInputStream(data),
197 new TemporaryResources(), data.length);
198 }
199
200 /**
201 * Creates a TikaInputStream from the given file.
202 * <p>
203 * Note that you must always explicitly close the returned stream to
204 * prevent leaking open file handles.
205 *
206 * @param file input file
207 * @return a TikaInputStream instance
208 * @throws FileNotFoundException if the file does not exist
209 */
210 public static TikaInputStream get(File file) throws FileNotFoundException {
211 return get(file, new Metadata());
212 }
213
214 /**
215 * Creates a TikaInputStream from the given file. The file name and
216 * length are stored as input metadata in the given metadata instance.
217 * <p>
218 * Note that you must always explicitly close the returned stream to
219 * prevent leaking open file handles.
220 *
221 * @param file input file
222 * @param metadata metadata instance
223 * @return a TikaInputStream instance
224 * @throws FileNotFoundException if the file does not exist
225 */
226 public static TikaInputStream get(File file, Metadata metadata)
227 throws FileNotFoundException {
228 metadata.set(Metadata.RESOURCE_NAME_KEY, file.getName());
229 metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.length()));
230 return new TikaInputStream(file);
231 }
232
233 /**
234 * Creates a TikaInputStream from the given database BLOB.
235 * <p>
236 * Note that the result set containing the BLOB may need to be kept open
237 * until the returned TikaInputStream has been processed and closed.
238 * You must also always explicitly close the returned stream as in
239 * some cases it may end up writing the blob data to a temporary file.
240 *
241 * @param blob database BLOB
242 * @return a TikaInputStream instance
243 * @throws SQLException if BLOB data can not be accessed
244 */
245 public static TikaInputStream get(Blob blob) throws SQLException {
246 return get(blob, new Metadata());
247 }
248
249 /**
250 * Blob size threshold that limits the largest BLOB size to be
251 * buffered fully in memory by the {@link #get(Blob, Metadata)}
252 * method.
253 */
254 private static final int BLOB_SIZE_THRESHOLD = 1024 * 1024;
255
256 /**
257 * Creates a TikaInputStream from the given database BLOB. The BLOB
258 * length (if available) is stored as input metadata in the given
259 * metadata instance.
260 * <p>
261 * Note that the result set containing the BLOB may need to be kept open
262 * until the returned TikaInputStream has been processed and closed.
263 * You must also always explicitly close the returned stream as in
264 * some cases it may end up writing the blob data to a temporary file.
265 *
266 * @param blob database BLOB
267 * @param metadata metadata instance
268 * @return a TikaInputStream instance
269 * @throws SQLException if BLOB data can not be accessed
270 */
271 public static TikaInputStream get(Blob blob, Metadata metadata)
272 throws SQLException {
273 long length = -1;
274 try {
275 length = blob.length();
276 metadata.set(Metadata.CONTENT_LENGTH, Long.toString(length));
277 } catch (SQLException ignore) {
278 }
279
280 // Prefer an in-memory buffer for reasonably sized blobs to reduce
281 // the likelihood of problems caused by long-lived database accesses
282 if (0 <= length && length <= BLOB_SIZE_THRESHOLD) {
283 // the offset in Blob.getBytes() starts at 1
284 return get(blob.getBytes(1, (int) length), metadata);
285 } else {
286 return new TikaInputStream(
287 new BufferedInputStream(blob.getBinaryStream()),
288 new TemporaryResources(), length);
289 }
290 }
291
292 /**
293 * Creates a TikaInputStream from the resource at the given URI.
294 * <p>
295 * Note that you must always explicitly close the returned stream as in
296 * some cases it may end up writing the resource to a temporary file.
297 *
298 * @param uri resource URI
299 * @return a TikaInputStream instance
300 * @throws IOException if the resource can not be accessed
301 */
302 public static TikaInputStream get(URI uri) throws IOException {
303 return get(uri, new Metadata());
304 }
305
306 /**
307 * Creates a TikaInputStream from the resource at the given URI. The
308 * available input metadata is stored in the given metadata instance.
309 * <p>
310 * Note that you must always explicitly close the returned stream as in
311 * some cases it may end up writing the resource to a temporary file.
312 *
313 * @param uri resource URI
314 * @param metadata metadata instance
315 * @return a TikaInputStream instance
316 * @throws IOException if the resource can not be accessed
317 */
318 public static TikaInputStream get(URI uri, Metadata metadata)
319 throws IOException {
320 // Special handling for file:// URIs
321 if ("file".equalsIgnoreCase(uri.getScheme())) {
322 File file = new File(uri);
323 if (file.isFile()) {
324 return get(file, metadata);
325 }
326 }
327
328 return get(uri.toURL(), metadata);
329 }
330
331 /**
332 * Creates a TikaInputStream from the resource at the given URL.
333 * <p>
334 * Note that you must always explicitly close the returned stream as in
335 * some cases it may end up writing the resource to a temporary file.
336 *
337 * @param url resource URL
338 * @return a TikaInputStream instance
339 * @throws IOException if the resource can not be accessed
340 */
341 public static TikaInputStream get(URL url) throws IOException {
342 return get(url, new Metadata());
343 }
344
345 /**
346 * Creates a TikaInputStream from the resource at the given URL. The
347 * available input metadata is stored in the given metadata instance.
348 * <p>
349 * Note that you must always explicitly close the returned stream as in
350 * some cases it may end up writing the resource to a temporary file.
351 *
352 * @param url resource URL
353 * @param metadata metadata instance
354 * @return a TikaInputStream instance
355 * @throws IOException if the resource can not be accessed
356 */
357 public static TikaInputStream get(URL url, Metadata metadata)
358 throws IOException {
359 // Special handling for file:// URLs
360 if ("file".equalsIgnoreCase(url.getProtocol())) {
361 try {
362 File file = new File(url.toURI());
363 if (file.isFile()) {
364 return get(file, metadata);
365 }
366 } catch (URISyntaxException e) {
367 // fall through
368 }
369 }
370
371 URLConnection connection = url.openConnection();
372
373 String path = url.getPath();
374 int slash = path.lastIndexOf('/');
375 if (slash + 1 < path.length()) { // works even with -1!
376 metadata.set(Metadata.RESOURCE_NAME_KEY, path.substring(slash + 1));
377 }
378
379 String type = connection.getContentType();
380 if (type != null) {
381 metadata.set(Metadata.CONTENT_TYPE, type);
382 }
383
384 String encoding = connection.getContentEncoding();
385 if (encoding != null) {
386 metadata.set(Metadata.CONTENT_ENCODING, encoding);
387 }
388
389 int length = connection.getContentLength();
390 if (length >= 0) {
391 metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(length));
392 }
393
394 return new TikaInputStream(
395 new BufferedInputStream(connection.getInputStream()),
396 new TemporaryResources(), length);
397 }
398
399 /**
400 * The file that contains the contents of this stream. This is either
401 * the original file passed to the {@link #TikaInputStream(File)}
402 * constructor or a temporary file created by a call to the
403 * {@link #getFile()} method. If neither has been called, then
404 * the value is <code>null</code>.
405 */
406 private File file;
407
408 /**
409 * Tracker of temporary resources.
410 */
411 private final TemporaryResources tmp;
412
413 /**
414 * Total length of the stream, or -1 if unknown.
415 */
416 private long length;
417
418 /**
419 * Current read position within this stream.
420 */
421 private long position = 0;
422
423 /**
424 * Marked position, or -1 if there is no current mark.
425 */
426 private long mark = -1;
427
428 /**
429 * An opened container, such as a POIFS FileSystem
430 * for an OLE2 document, or a Zip file for a
431 * zip based (eg ooxml, odf) document.
432 */
433 private Object openContainer;
434
435 /**
436 * Creates a TikaInputStream instance. This private constructor is used
437 * by the static factory methods based on the available information.
438 *
439 * @param file the file that contains the stream
440 * @throws FileNotFoundException if the file does not exist
441 */
private TikaInputStream(File file) throws FileNotFoundException {
    super(new BufferedInputStream(new FileInputStream(file)));
    // The stream length is known up front: it is the size of the file
    this.length = file.length();
    this.tmp = new TemporaryResources();
    this.file = file;
}
448
449 /**
450 * Creates a TikaInputStream instance. This private constructor is used
451 * by the static factory methods based on the available information.
452 * <p>
453 * The given stream needs to be included in the given temporary resource
454 * collection if the caller wants it also to get closed when the
455 * {@link #close()} method is invoked.
456 *
457 * @param stream <em>buffered</em> stream (must support the mark feature)
458 * @param tmp tracker for temporary resources associated with this stream
459 * @param length total length of the stream, or -1 if unknown
460 */
private TikaInputStream(
        InputStream stream, TemporaryResources tmp, long length) {
    super(stream);
    this.length = length;
    this.tmp = tmp;
    // No backing file yet; getFile() creates one on demand
    this.file = null;
}
468
469 /**
470 * Fills the given buffer with upcoming bytes from this stream without
471 * advancing the current stream position. The buffer is filled up unless
472 * the end of stream is encountered before that. This method will block
473 * if not enough bytes are immediately available.
474 *
475 * @param buffer byte buffer
476 * @return number of bytes written to the buffer
477 * @throws IOException if the stream can not be read
478 */
479 public int peek(byte[] buffer) throws IOException {
480 int n = 0;
481
482 mark(buffer.length);
483
484 int m = read(buffer);
485 while (m != -1) {
486 n += m;
487 if (n < buffer.length) {
488 m = read(buffer, n, buffer.length - n);
489 } else {
490 m = -1;
491 }
492 }
493
494 reset();
495
496 return n;
497 }
498
499 /**
500 * Returns the open container object, such as a
501 * POIFS FileSystem in the event of an OLE2
502 * document being detected and processed by
503 * the OLE2 detector.
504 */
505 public Object getOpenContainer() {
506 return openContainer;
507 }
508
509 /**
510 * Stores the open container object against
511 * the stream, eg after a Zip contents
512 * detector has loaded the file to decide
513 * what it contains.
514 */
515 public void setOpenContainer(Object container) {
516 openContainer = container;
517 if (container instanceof Closeable) {
518 tmp.addResource((Closeable) container);
519 }
520 }
521
522 public boolean hasFile() {
523 return file != null;
524 }
525
526 public File getFile() throws IOException {
527 if (file == null) {
528 if (position > 0) {
529 throw new IOException("Stream is already being read");
530 } else {
531 // Spool the entire stream into a temporary file
532 file = tmp.createTemporaryFile();
533 OutputStream out = new FileOutputStream(file);
534 try {
535 IOUtils.copy(in, out);
536 } finally {
537 out.close();
538 }
539
540 // Create a new input stream and make sure it'll get closed
541 FileInputStream newStream = new FileInputStream(file);
542 tmp.addResource(newStream);
543
544 // Replace the spooled stream with the new stream in a way
545 // that still ends up closing the old stream if or when the
546 // close() method is called. The closing of the new stream
547 // is already being handled as noted above.
548 final InputStream oldStream = in;
549 in = new BufferedInputStream(newStream) {
550 @Override
551 public void close() throws IOException {
552 oldStream.close();
553 }
554 };
555
556 length = file.length();
557 }
558 }
559 return file;
560 }
561
562 public FileChannel getFileChannel() throws IOException {
563 FileInputStream fis = new FileInputStream(getFile());
564 tmp.addResource(fis);
565 FileChannel channel = fis.getChannel();
566 tmp.addResource(channel);
567 return channel;
568 }
569
570 public boolean hasLength() {
571 return length != -1;
572 }
573
574 /**
575 * Returns the length (in bytes) of this stream. Note that if the length
576 * was not available when this stream was instantiated, then this method
577 * will use the {@link #getFile()} method to buffer the entire stream to
578 * a temporary file in order to calculate the stream length. This case
579 * will only work if the stream has not yet been consumed.
580 *
581 * @return stream length
582 * @throws IOException if the length can not be determined
583 */
584 public long getLength() throws IOException {
585 if (length == -1) {
586 length = getFile().length();
587 }
588 return length;
589 }
590
591 /**
592 * Returns the current position within the stream.
593 *
594 * @return stream position
595 */
596 public long getPosition() {
597 return position;
598 }
599
600 @Override
601 public long skip(long ln) throws IOException {
602 long n = super.skip(ln);
603 position += n;
604 return n;
605 }
606
607 @Override
608 public void mark(int readlimit) {
609 super.mark(readlimit);
610 mark = position;
611 }
612
613 @Override
614 public boolean markSupported() {
615 return true;
616 }
617
618 @Override
619 public void reset() throws IOException {
620 super.reset();
621 position = mark;
622 mark = -1;
623 }
624
625 @Override
626 public void close() throws IOException {
627 file = null;
628 mark = -1;
629
630 // The close method was explicitly called, so we indeed
631 // are expected to close the input stream. Handle that
632 // by adding that stream as a resource to be tracked before
633 // closing all of them. This way also possible exceptions from
634 // the close() calls get managed properly.
635 tmp.addResource(in);
636 tmp.close();
637 }
638
639 @Override
640 protected void afterRead(int n) {
641 if (n != -1) {
642 position += n;
643 }
644 }
645
646 public String toString() {
647 String str = "TikaInputStream of ";
648 if (hasFile()) {
649 str += file.toString();
650 } else {
651 str += in.toString();
652 }
653 if (openContainer != null) {
654 str += " (in " + openContainer + ")";
655 }
656 return str;
657 }
658 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 /**
18 * IO utilities.
19 */
20 @aQute.bnd.annotation.Version("1.0.0")
21 package org.apache.tika.io;
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.language;
17
18 import java.io.BufferedReader;
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.io.InputStreamReader;
22 import java.util.HashMap;
23 import java.util.Map;
24 import java.util.Properties;
25 import java.util.Set;
26
27 /**
28 * Identifier of the language that best matches a given content profile.
29 * The content profile is compared to generic language profiles based on
30 * material from various sources.
31 *
32 * @since Apache Tika 0.5
33 * @see <a href="http://www.iccs.inf.ed.ac.uk/~pkoehn/publications/europarl/">
34 * Europarl: A Parallel Corpus for Statistical Machine Translation</a>
35 * @see <a href="http://www.loc.gov/standards/iso639-2/php/code_list.php">
36 * ISO 639 Language Codes</a>
37 */
38 public class LanguageIdentifier {
39
40 /**
41 * The available language profiles.
42 */
43 private static final Map<String, LanguageProfile> PROFILES =
44 new HashMap<String, LanguageProfile>();
45 private static final String PROFILE_SUFFIX = ".ngp";
46 private static final String PROFILE_ENCODING = "UTF-8";
47
48 private static Properties props = new Properties();
49 private static String errors = "";
50
51 private static final String PROPERTIES_OVERRIDE_FILE = "tika.language.override.properties";
52 private static final String PROPERTIES_FILE = "tika.language.properties";
53 private static final String LANGUAGES_KEY = "languages";
54 private static final double CERTAINTY_LIMIT = 0.022;
55
56 private final String language;
57
58 private final double distance;
59
60 /*
61 * Always attempt initializing language profiles when class is loaded first time
62 */
63 static {
64 initProfiles();
65 }
66
67 /*
68 * Add one language profile based on config in property file
69 */
70 private static void addProfile(String language) throws Exception {
71 try {
72 LanguageProfile profile = new LanguageProfile();
73
74 InputStream stream =
75 LanguageIdentifier.class.getResourceAsStream(language + PROFILE_SUFFIX);
76 try {
77 BufferedReader reader =
78 new BufferedReader(new InputStreamReader(stream, PROFILE_ENCODING));
79 String line = reader.readLine();
80 while (line != null) {
81 if (line.length() > 0 && !line.startsWith("#")) {
82 int space = line.indexOf(' ');
83 profile.add(
84 line.substring(0, space),
85 Long.parseLong(line.substring(space + 1)));
86 }
87 line = reader.readLine();
88 }
89 } finally {
90 stream.close();
91 }
92
93 addProfile(language, profile);
94 } catch (Throwable t) {
95 throw new Exception("Failed trying to load language profile for language \""+language+"\". Error: "+t.getMessage());
96 }
97 }
98
99 /**
100 * Adds a single language profile
101 * @param language an ISO 639 code representing language
102 * @param profile the language profile
103 */
104 public static void addProfile(String language, LanguageProfile profile) {
105 PROFILES.put(language, profile);
106 }
107
108 /**
109 * Constructs a language identifier based on a LanguageProfile
110 * @param profile the language profile
111 */
112 public LanguageIdentifier(LanguageProfile profile) {
113 String minLanguage = "unknown";
114 double minDistance = 1.0;
115 for (Map.Entry<String, LanguageProfile> entry : PROFILES.entrySet()) {
116 double distance = profile.distance(entry.getValue());
117 if (distance < minDistance) {
118 minDistance = distance;
119 minLanguage = entry.getKey();
120 }
121 }
122
123 this.language = minLanguage;
124 this.distance = minDistance;
125 }
126
127 /**
128 * Constructs a language identifier based on a String of text content
129 * @param content the text
130 */
131 public LanguageIdentifier(String content) {
132 this(new LanguageProfile(content));
133 }
134
135 /**
136 * Gets the identified language
137 * @return an ISO 639 code representing the detected language
138 */
139 public String getLanguage() {
140 return language;
141 }
142
143 /**
144 * Tries to judge whether the identification is certain enough
145 * to be trusted.
146 * WARNING: Will never return true for small amount of input texts.
147 * @return <code>true</code> if the distance is smaller then {@value #CERTAINTY_LIMIT}, <code>false</code> otherwise
148 */
149 public boolean isReasonablyCertain() {
150 return distance < CERTAINTY_LIMIT;
151 }
152
153 /**
154 * Builds the language profiles.
155 * The list of languages are fetched from a property file named "tika.language.properties"
156 * If a file called "tika.language.override.properties" is found on classpath, this is used instead
157 * The property file contains a key "languages" with values being comma-separated language codes
158 */
159 public static void initProfiles() {
160 clearProfiles();
161
162 errors = "";
163 InputStream stream;
164 stream = LanguageIdentifier.class.getResourceAsStream(PROPERTIES_OVERRIDE_FILE);
165 if(stream == null) {
166 stream = LanguageIdentifier.class.getResourceAsStream(PROPERTIES_FILE);
167 }
168
169 if(stream != null){
170 try {
171 props = new Properties();
172 props.load(stream);
173 } catch (IOException e) {
174 errors += "IOException while trying to load property file. Message: " + e.getMessage() + "\n";
175 }
176 }
177
178 String[] languages = props.getProperty(LANGUAGES_KEY).split(",");
179 for(String language : languages) {
180 language = language.trim();
181 String name = props.getProperty("name."+language, "Unknown");
182 try {
183 addProfile(language);
184 } catch (Exception e) {
185 errors += "Language " + language + " (" + name + ") not initialized. Message: " + e.getMessage() + "\n";
186 }
187 }
188 }
189
190 /**
191 * Initializes the language profiles from a user supplied initialized Map.
192 * This overrides the default set of profiles initialized at startup,
193 * and provides an alternative to configuring profiles through property file
194 *
195 * @param profilesMap map of language profiles
196 */
197 public static void initProfiles(Map<String, LanguageProfile> profilesMap) {
198 clearProfiles();
199 for(Map.Entry<String, LanguageProfile> entry : profilesMap.entrySet()) {
200 addProfile(entry.getKey(), entry.getValue());
201 }
202 }
203
204 /**
205 * Clears the current map of language profiles
206 */
207 public static void clearProfiles() {
208 PROFILES.clear();
209 }
210
211 /**
212 * Tests whether there were errors initializing language config
213 * @return true if there are errors. Use getErrors() to retrieve.
214 */
215 public static boolean hasErrors() {
216 return errors != "";
217 }
218
219 /**
220 * Returns a string of error messages related to initializing langauge profiles
221 * @return the String containing the error messages
222 */
223 public static String getErrors() {
224 return errors;
225 }
226
227 /**
228 * Returns what languages are supported for language identification
229 * @return A set of Strings being the ISO 639 language codes
230 */
231 public static Set<String> getSupportedLanguages() {
232 return PROFILES.keySet();
233 }
234
235 @Override
236 public String toString() {
237 return language + " (" + distance + ")";
238 }
239
240 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.language;
17
18 import java.util.HashMap;
19 import java.util.HashSet;
20 import java.util.Map;
21 import java.util.Set;
22
23 /**
24 * Language profile based on ngram counts.
25 *
26 * @since Apache Tika 0.5
27 */
28 public class LanguageProfile {
29
30 public static final int DEFAULT_NGRAM_LENGTH = 3;
31
32 private final int length;
33
34 /**
35 * The ngrams that make up this profile.
36 */
37 private final Map<String, Counter> ngrams =
38 new HashMap<String, Counter>();
39
40 /**
41 * The sum of all ngram counts in this profile.
42 * Used to calculate relative ngram frequency.
43 */
44 private long count = 0;
45
46 private static class Counter {
47 private long count = 0;
48 public String toString() {
49 return Long.toString(count);
50 }
51 }
52
53 public LanguageProfile(int length) {
54 this.length = length;
55 }
56
57 public LanguageProfile() {
58 this(DEFAULT_NGRAM_LENGTH);
59 }
60
61 public LanguageProfile(String content, int length) {
62 this(length);
63
64 ProfilingWriter writer = new ProfilingWriter(this);
65 char[] ch = content.toCharArray();
66 writer.write(ch, 0, ch.length);
67 }
68
69 public LanguageProfile(String content) {
70 this(content, DEFAULT_NGRAM_LENGTH);
71 }
72
73 public long getCount() {
74 return count;
75 }
76
77 public long getCount(String ngram) {
78 Counter counter = ngrams.get(ngram);
79 if (counter != null) {
80 return counter.count;
81 } else {
82 return 0;
83 }
84 }
85
86 /**
87 * Adds a single occurrence of the given ngram to this profile.
88 *
89 * @param ngram the ngram
90 */
91 public void add(String ngram) {
92 add(ngram, 1);
93 }
94
95 /**
96 * Adds multiple occurrences of the given ngram to this profile.
97 *
98 * @param ngram the ngram
99 * @param count number of occurrences to add
100 */
101 public void add(String ngram, long count) {
102 if (length != ngram.length()) {
103 throw new IllegalArgumentException(
104 "Unable to add an ngram of incorrect length: "
105 + ngram.length() + " != " + length);
106 }
107
108 Counter counter = ngrams.get(ngram);
109 if (counter == null) {
110 counter = new Counter();
111 ngrams.put(ngram, counter);
112 }
113 counter.count += count;
114 this.count += count;
115 }
116
117 /**
118 * Calculates the geometric distance between this and the given
119 * other language profile.
120 *
121 * @param that the other language profile
122 * @return distance between the profiles
123 */
124 public double distance(LanguageProfile that) {
125 if (length != that.length) {
126 throw new IllegalArgumentException(
127 "Unable to calculage distance of language profiles"
128 + " with different ngram lengths: "
129 + that.length + " != " + length);
130 }
131
132 double sumOfSquares = 0.0;
133 double thisCount = Math.max(this.count, 1.0);
134 double thatCount = Math.max(that.count, 1.0);
135
136 Set<String> ngrams = new HashSet<String>();
137 ngrams.addAll(this.ngrams.keySet());
138 ngrams.addAll(that.ngrams.keySet());
139 for (String ngram : ngrams) {
140 double thisFrequency = this.getCount(ngram) / thisCount;
141 double thatFrequency = that.getCount(ngram) / thatCount;
142 double difference = thisFrequency - thatFrequency;
143 sumOfSquares += difference * difference;
144 }
145
146 return Math.sqrt(sumOfSquares);
147 }
148
149 @Override
150 public String toString() {
151 return ngrams.toString();
152 }
153
154 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.language;
17
18 // JDK imports
19 import java.io.BufferedInputStream;
20 import java.io.BufferedReader;
21 import java.io.File;
22 import java.io.FileInputStream;
23 import java.io.FileOutputStream;
24 import java.io.IOException;
25 import java.io.InputStream;
26 import java.io.InputStreamReader;
27 import java.io.OutputStream;
28 import java.util.ArrayList;
29 import java.util.Collections;
30 import java.util.Date;
31 import java.util.HashMap;
32 import java.util.Iterator;
33 import java.util.List;
34 import java.util.Map;
35 import org.apache.tika.exception.TikaException;
36
37 /**
38 * This class runs a ngram analysis over submitted text, results might be used
39 * for automatic language identification.
40 *
41 * The similarity calculation is at experimental level. You have been warned.
42 *
43 * Methods are provided to build new NGramProfiles profiles.
44 *
45 * @author Sami Siren
46 * @author Jerome Charron - http://frutch.free.fr/
47 */
48 public class LanguageProfilerBuilder {
49
50 // public static final Log LOG =
51 // LogFactory.getLog(LanguageProfilerBuilder.class);
52
53 /** The minimum length allowed for a ngram. */
54 final static int ABSOLUTE_MIN_NGRAM_LENGTH = 3; /* was 1 */
55
56 /** The maximum length allowed for a ngram. */
57 final static int ABSOLUTE_MAX_NGRAM_LENGTH = 3; /* was 4 */
58
59 /** The default min length of ngram */
60 final static int DEFAULT_MIN_NGRAM_LENGTH = 3;
61
62 /** The default max length of ngram */
63 final static int DEFAULT_MAX_NGRAM_LENGTH = 3;
64
65 /** The ngram profile file extension */
66 final static String FILE_EXTENSION = "ngp";
67
68 /** The profile max size (number of ngrams of the same size) */
69 final static int MAX_SIZE = 1000;
70
71 /** separator char */
72 final static char SEPARATOR = '_';
73 /** The String form of the separator char */
74 private final static String SEP_CHARSEQ = new String(
75 new char[] { SEPARATOR });
76
77 /** The profile's name */
78 private String name = null;
79
80 /** The NGrams of this profile sorted on the number of occurrences */
81 private List<NGramEntry> sorted = null;
82
83 /** The min length of ngram */
84 private int minLength = DEFAULT_MIN_NGRAM_LENGTH;
85
86 /** The max length of ngram */
87 private int maxLength = DEFAULT_MAX_NGRAM_LENGTH;
88
89 /** The total number of ngram occurrences */
90 private int[] ngramcounts = null;
91
92 /** An index of the ngrams of the profile */
93 private Map<CharSequence, NGramEntry> ngrams = null;
94
95 /** A StringBuffer used during analysis */
96 private QuickStringBuffer word = new QuickStringBuffer();
97
98 /**
99 * Constructs a new ngram profile
100 *
101 * @param name is the name of the profile
102 * @param minlen is the min length of ngram sequences
103 * @param maxlen is the max length of ngram sequences
104 */
public LanguageProfilerBuilder(String name, int minlen, int maxlen) {
    this.name = name;
    this.minLength = minlen;
    this.maxLength = maxlen;
    // TODO: Compute the initial capacity using minlen and maxlen.
    this.ngrams = new HashMap<CharSequence, NGramEntry>(4000);
}
112
113 /**
114 * Constructs a new ngram profile where minlen=3, maxlen=3
115 *
116 * @param name is a name of profile, usually two length string
117 * @since Tika 1.0
118 */
public LanguageProfilerBuilder(String name) {
    // Same setup as the three-argument constructor, with the absolute
    // minimum and maximum ngram lengths as the limits
    this(name, ABSOLUTE_MIN_NGRAM_LENGTH, ABSOLUTE_MAX_NGRAM_LENGTH);
}
125
126 /**
127 * @return Returns the name.
128 */
129 public String getName() {
130 return name;
131 }
132
133 // This method was commented because it depends on org.apache.lucene.analysis.Token
134 // that is not a part of the Tika
135 // /**
136 // * Adds ngrams from a token to this profile
137 // *
138 // * @param t is the Token to be added
139 // */
140 // public void add(Token t) {
141 // add(new StringBuffer().append(SEPARATOR)
142 // .append(t.term())
143 // .append(SEPARATOR));
144 // }
145
146 /**
147 * Adds ngrams from a single word to this profile
148 *
149 * @param word is the word to add
150 */
151 public void add(StringBuffer word) {
152 for (int i = minLength; (i <= maxLength) && (i < word.length()); i++) {
153 add(word, i);
154 }
155 }
156
157 /**
158 * Adds the last NGrams from the specified word.
159 */
160 private void add(QuickStringBuffer word) {
161 int wlen = word.length();
162 if (wlen >= minLength) {
163 int max = Math.min(maxLength, wlen);
164 for (int i = minLength; i <= max; i++) {
165 add(word.subSequence(wlen - i, wlen));
166 }
167 }
168 }
169
170 /**
171 * Counts one occurrence of a single ngram in this profile
172 *
173 * @param cs is the ngram sequence to count
174 * (a sequence equal to the separator string is ignored)
175 */
176 private void add(CharSequence cs) {
177
178 if (cs.equals(SEP_CHARSEQ)) {
179 return;
180 }
181 NGramEntry nge = ngrams.get(cs);
182 if (nge == null) {
183 nge = new NGramEntry(cs);
184 ngrams.put(cs, nge);
185 }
186 nge.inc();
187 }
188
189 /**
190 * Analyzes a piece of text
191 *
192 * @param text
193 * the text to be analyzed
194 */
195 public void analyze(StringBuilder text) {
196
197 if (ngrams != null) {
198 ngrams.clear();
199 sorted = null;
200 ngramcounts = null;
201 }
202
203 word.clear().append(SEPARATOR);
204 for (int i = 0; i < text.length(); i++) {
205 char c = Character.toLowerCase(text.charAt(i));
206
207 if (Character.isLetter(c)) {
208 add(word.append(c));
209 } else {
210 // found word boundary
211 if (word.length() > 1) {
212 // we have a word!
213 add(word.append(SEPARATOR));
214 word.clear().append(SEPARATOR);
215 }
216 }
217 }
218
219 if (word.length() > 1) {
220 // we have a word!
221 add(word.append(SEPARATOR));
222 }
223 normalize();
224 }
225
226 /**
227 * @param word
228 * @param n sequence length
229 */
230 private void add(StringBuffer word, int n) {
231 for (int i = 0; i <= word.length() - n; i++) {
232 add(word.subSequence(i, i + n));
233 }
234 }
235
236 /**
237 * Normalizes the profile (calculates the ngrams frequencies)
238 */
239 protected void normalize() {
240 NGramEntry e = null;
241 Iterator<NGramEntry> i = ngrams.values().iterator();
242
243 // Calculates ngram count if not already done
244 if (ngramcounts == null) {
245 ngramcounts = new int[maxLength + 1];
246 while (i.hasNext()) {
247 e = i.next();
248 ngramcounts[e.size()] += e.count;
249 }
250 }
251
252 i = ngrams.values().iterator();
253 while (i.hasNext()) {
254 e = i.next();
255 e.frequency = (float) e.count / (float) ngramcounts[e.size()];
256 }
257 }
258
259 /**
260 * Returns a sorted list of ngrams (sort done by 1. frequency 2. sequence)
261 *
262 * @return sorted vector of ngrams
263 */
264 public List<NGramEntry> getSorted() {
265 // make sure sorting is done only once
266 if (sorted == null) {
267 sorted = new ArrayList<NGramEntry>(ngrams.values());
268 Collections.sort(sorted);
269
270 // trim at NGRAM_LENGTH entries
271 if (sorted.size() > MAX_SIZE) {
272 sorted = sorted.subList(0, MAX_SIZE);
273 }
274 }
275 return sorted;
276 }
277
278 // Inherited JavaDoc
279 public String toString() {
280
281 StringBuffer s = new StringBuffer().append("NGramProfile: ")
282 .append(name).append("\n");
283
284 Iterator<NGramEntry> i = getSorted().iterator();
285
286 while (i.hasNext()) {
287 NGramEntry entry = i.next();
288 s.append("[").append(entry.seq).append("/").append(entry.count)
289 .append("/").append(entry.frequency).append("]\n");
290 }
291 return s.toString();
292 }
293
294 /**
295 * Calculates a score how well NGramProfiles match each other
296 *
297 * @param another
298 * ngram profile to compare against
299 * @return similarity 0=exact match
300 * @throws TikaException
301 * if could not calculate a score
302 */
303 public float getSimilarity(LanguageProfilerBuilder another)
304 throws TikaException {
305
306 float sum = 0;
307
308 try {
309 Iterator<NGramEntry> i = another.getSorted().iterator();
310 while (i.hasNext()) {
311 NGramEntry other = i.next();
312 if (ngrams.containsKey(other.seq)) {
313 sum += Math.abs((other.frequency - ngrams.get(other.seq).frequency)) / 2;
314 } else {
315 sum += other.frequency;
316 }
317 }
318 i = getSorted().iterator();
319 while (i.hasNext()) {
320 NGramEntry other = i.next();
321 if (another.ngrams.containsKey(other.seq)) {
322 sum += Math.abs((other.frequency - another.ngrams
323 .get(other.seq).frequency)) / 2;
324 } else {
325 sum += other.frequency;
326 }
327 }
328 } catch (Exception e) {
329 throw new TikaException("Could not calculate a score how well NGramProfiles match each other");
330 }
331 return sum;
332 }
333
334 /**
335 * Loads a ngram profile from an InputStream (assumes UTF-8 encoded content)
336 *
337 * @param is the InputStream to read
338 */
339 public void load(InputStream is) throws IOException {
340
341 ngrams.clear();
342 ngramcounts = new int[maxLength + 1];
343 BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
344 String line = null;
345
346 while ((line = reader.readLine()) != null) {
347
348 // # starts a comment line
349 if (line.charAt(0) != '#') {
350 int spacepos = line.indexOf(' ');
351 String ngramsequence = line.substring(0, spacepos).trim();
352 int len = ngramsequence.length();
353 if ((len >= minLength) && (len <= maxLength)) {
354 int ngramcount = Integer.parseInt(line.substring(spacepos + 1));
355 NGramEntry en = new NGramEntry(ngramsequence, ngramcount);
356 ngrams.put(en.getSeq(), en);
357 ngramcounts[len] += ngramcount;
358 }
359 }
360 }
361 normalize();
362 }
363
364 /**
365 * Creates a new Language profile from (preferably quite large - 5-10k of
366 * lines) text file
367 *
368 * @param name to be given for the profile
369 * @param is a stream to be read
370 * @param encoding is the encoding of stream
371 *
372 * @throws TikaException if could not create a language profile
373 *
374 */
375 public static LanguageProfilerBuilder create(String name, InputStream is, String encoding) throws TikaException {
376
377 LanguageProfilerBuilder newProfile = new LanguageProfilerBuilder(name,
378 ABSOLUTE_MIN_NGRAM_LENGTH, ABSOLUTE_MAX_NGRAM_LENGTH);
379 BufferedInputStream bis = new BufferedInputStream(is);
380
381 byte buffer[] = new byte[4096];
382 StringBuilder text = new StringBuilder();
383 int len;
384
385 try {
386 while ((len = bis.read(buffer)) != -1) {
387 text.append(new String(buffer, 0, len, encoding));
388 }
389 } catch (IOException e) {
390 throw new TikaException("Could not create profile, " + e.getMessage());
391 }
392
393 newProfile.analyze(text);
394 return newProfile;
395 }
396
397 /**
398 * Writes NGramProfile content into OutputStream, content is outputted with
399 * UTF-8 encoding
400 *
401 * @param os the Stream to output to
402 *
403 * @throws IOException
404 */
405 public void save(OutputStream os) throws IOException {
406 os.write(("# NgramProfile generated at " + new Date() +
407 " for Apache Tika Language Identification\n").getBytes());
408
409 // And then each ngram
410
411 // First dispatch ngrams in many lists depending on their size
412 // (one list for each size, in order to store MAX_SIZE ngrams for each
413 // size of ngram)
414 List<NGramEntry> list = new ArrayList<NGramEntry>();
415 List<NGramEntry> sublist = new ArrayList<NGramEntry>();
416 NGramEntry[] entries = ngrams.values().toArray(
417 new NGramEntry[ngrams.size()]);
418 for (int i = minLength; i <= maxLength; i++) {
419 for (int j = 0; j < entries.length; j++) {
420 if (entries[j].getSeq().length() == i) {
421 sublist.add(entries[j]);
422 }
423 }
424 Collections.sort(sublist);
425 if (sublist.size() > MAX_SIZE) {
426 sublist = sublist.subList(0, MAX_SIZE);
427 }
428 list.addAll(sublist);
429 sublist.clear();
430 }
431 for (int i = 0; i < list.size(); i++) {
432 NGramEntry e = list.get(i);
433 String line = e.toString() + " " + e.getCount() + "\n";
434 os.write(line.getBytes("UTF-8"));
435 }
436 os.flush();
437 }
438
439 /**
440 * main method used for testing only
441 *
442 * @param args
443 */
444 public static void main(String args[]) {
445
446 // -create he sample_he.txt utf-8
447
448 String usage = "Usage: NGramProfile "
449 + "[-create profilename filename encoding] "
450 + "[-similarity file1 file2] "
451 + "[-score profile-name filename encoding]";
452 int command = 0;
453
454 final int CREATE = 1;
455 final int SIMILARITY = 2;
456 final int SCORE = 3;
457
458 String profilename = "";
459 String filename = "";
460 String filename2 = "";
461 String encoding = "";
462
463 if (args.length == 0) {
464 System.err.println(usage);
465 System.exit(-1);
466 }
467
468 for (int i = 0; i < args.length; i++) { // parse command line
469 if (args[i].equals("-create")) { // found -create option
470 command = CREATE;
471 profilename = args[++i];
472 filename = args[++i];
473 encoding = args[++i];
474 }
475
476 if (args[i].equals("-similarity")) { // found -similarity option
477 command = SIMILARITY;
478 filename = args[++i];
479 filename2 = args[++i];
480 encoding = args[++i];
481 }
482
483 if (args[i].equals("-score")) { // found -Score option
484 command = SCORE;
485 profilename = args[++i];
486 filename = args[++i];
487 encoding = args[++i];
488 }
489 }
490
491 try {
492
493 switch (command) {
494
495 case CREATE:
496
497 File f = new File(filename);
498 FileInputStream fis = new FileInputStream(f);
499 LanguageProfilerBuilder newProfile = LanguageProfilerBuilder
500 .create(profilename, fis, encoding);
501 fis.close();
502 f = new File(profilename + "." + FILE_EXTENSION);
503 FileOutputStream fos = new FileOutputStream(f);
504 newProfile.save(fos);
505 System.out.println("new profile " + profilename + "."
506 + FILE_EXTENSION + " was created.");
507 break;
508
509 case SIMILARITY:
510
511 f = new File(filename);
512 fis = new FileInputStream(f);
513 newProfile = LanguageProfilerBuilder.create(filename, fis,
514 encoding);
515 newProfile.normalize();
516
517 f = new File(filename2);
518 fis = new FileInputStream(f);
519 LanguageProfilerBuilder newProfile2 = LanguageProfilerBuilder
520 .create(filename2, fis, encoding);
521 newProfile2.normalize();
522 System.out.println("Similarity is "
523 + newProfile.getSimilarity(newProfile2));
524 break;
525
526 case SCORE:
527 f = new File(filename);
528 fis = new FileInputStream(f);
529 newProfile = LanguageProfilerBuilder.create(filename, fis,
530 encoding);
531
532 f = new File(profilename + "." + FILE_EXTENSION);
533 fis = new FileInputStream(f);
534 LanguageProfilerBuilder compare = new LanguageProfilerBuilder(
535 profilename, DEFAULT_MIN_NGRAM_LENGTH,
536 DEFAULT_MAX_NGRAM_LENGTH);
537 compare.load(fis);
538 System.out.println("Score is "
539 + compare.getSimilarity(newProfile));
540 break;
541
542 }
543
544 } catch (Exception e) {
545 e.printStackTrace();
546 // throw new TikaException("");
547 }
548 }
549
550
551 /**
552 * Inner class that describes a NGram
553 */
554 static class NGramEntry implements Comparable<NGramEntry> {
555
556 /** The NGRamProfile this NGram is related to */
557 private LanguageProfilerBuilder profile = null;
558
559 /** The sequence of characters of the ngram */
560 CharSequence seq = null;
561
562 /** The number of occurences of this ngram in its profile */
563 private int count = 0;
564
565 /** The frequency of this ngram in its profile */
566 private float frequency = 0.0F;
567
568 /**
569 * Constructs a new NGramEntry
570 *
571 * @param seq is the sequence of characters of the ngram
572 */
573 public NGramEntry(CharSequence seq) {
574 this.seq = seq;
575 }
576
577 /**
578 * Constructs a new NGramEntry
579 *
580 * @param seq is the sequence of characters of the ngram
581 * @param count is the number of occurrences of this ngram
582 */
583 public NGramEntry(String seq, int count) {
584 this.seq = new StringBuffer(seq).subSequence(0, seq.length());
585 this.count = count;
586 }
587
588 /**
589 * Returns the number of occurrences of this ngram in its profile
590 *
591 * @return the number of occurrences of this ngram in its profile
592 */
593 public int getCount() {
594 return count;
595 }
596
597 /**
598 * Returns the frequency of this ngram in its profile
599 *
600 * @return the frequency of this ngram in its profile
601 */
602 public float getFrequency() {
603 return frequency;
604 }
605
606 /**
607 * Returns the sequence of characters of this ngram
608 *
609 * @return the sequence of characters of this ngram
610 */
611 public CharSequence getSeq() {
612 return seq;
613 }
614
615 /**
616 * Returns the size of this ngram
617 *
618 * @return the size of this ngram
619 */
620 public int size() {
621 return seq.length();
622 }
623
624 // Inherited JavaDoc
625 public int compareTo(NGramEntry ngram) {
626 int diff = Float.compare(ngram.getFrequency(), frequency);
627 if (diff != 0) {
628 return diff;
629 } else {
630 return (toString().compareTo(ngram.toString()));
631 }
632 }
633
634 /**
635 * Increments the number of occurrences of this ngram.
636 */
637 public void inc() {
638 count++;
639 }
640
641 /**
642 * Associated a profile to this ngram
643 *
644 * @param profile
645 * is the profile associated to this ngram
646 */
647 public void setProfile(LanguageProfilerBuilder profile) {
648 this.profile = profile;
649 }
650
651 /**
652 * Returns the profile associated to this ngram
653 *
654 * @return the profile associated to this ngram
655 */
656 public LanguageProfilerBuilder getProfile() {
657 return profile;
658 }
659
660 // Inherited JavaDoc
661 public String toString() {
662 return seq.toString();
663 }
664
665 // Inherited JavaDoc
666 public int hashCode() {
667 return seq.hashCode();
668 }
669
670 // Inherited JavaDoc
671 public boolean equals(Object obj) {
672
673 NGramEntry ngram = null;
674 try {
675 ngram = (NGramEntry) obj;
676 return ngram.seq.equals(seq);
677 } catch (Exception e) {
678 return false;
679 }
680 }
681
682 }
683
684 private static class QuickStringBuffer implements CharSequence {
685
686 private char value[];
687
688 private int count;
689
690 QuickStringBuffer() {
691 this(16);
692 }
693
694 QuickStringBuffer(char[] value) {
695 this.value = value;
696 count = value.length;
697 }
698
699 QuickStringBuffer(int length) {
700 value = new char[length];
701 }
702
703 QuickStringBuffer(String str) {
704 this(str.length() + 16);
705 append(str);
706 }
707
708 public int length() {
709 return count;
710 }
711
712 private void expandCapacity(int minimumCapacity) {
713 int newCapacity = (value.length + 1) * 2;
714 if (newCapacity < 0) {
715 newCapacity = Integer.MAX_VALUE;
716 } else if (minimumCapacity > newCapacity) {
717 newCapacity = minimumCapacity;
718 }
719
720 char newValue[] = new char[newCapacity];
721 System.arraycopy(value, 0, newValue, 0, count);
722 value = newValue;
723 }
724
725 QuickStringBuffer clear() {
726 count = 0;
727 return this;
728 }
729
730 public char charAt(int index) {
731 return value[index];
732 }
733
734 QuickStringBuffer append(String str) {
735 if (str == null) {
736 str = String.valueOf(str);
737 }
738
739 int len = str.length();
740 int newcount = count + len;
741 if (newcount > value.length) {
742 expandCapacity(newcount);
743 }
744 str.getChars(0, len, value, count);
745 count = newcount;
746 return this;
747 }
748
749 QuickStringBuffer append(char c) {
750 int newcount = count + 1;
751 if (newcount > value.length) {
752 expandCapacity(newcount);
753 }
754 value[count++] = c;
755 return this;
756 }
757
758 public CharSequence subSequence(int start, int end) {
759 return new String(value, start, end - start);
760 }
761
762 public String toString() {
763 return new String(this.value);
764 }
765 }
766 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.language;
17
18 import org.apache.tika.sax.WriteOutContentHandler;
19
20 /**
21 * SAX content handler that builds a language profile based on all the
22 * received character content.
23 *
24 * @since Apache Tika 0.5
25 */
26 public class ProfilingHandler extends WriteOutContentHandler {
27
28 private final ProfilingWriter writer;
29
30 public ProfilingHandler(ProfilingWriter writer) {
31 super(writer);
32 this.writer = writer;
33 }
34
35 public ProfilingHandler(LanguageProfile profile) {
36 this(new ProfilingWriter(profile));
37 }
38
39 public ProfilingHandler() {
40 this(new ProfilingWriter());
41 }
42
43 /**
44 * Returns the language profile being built by this content handler.
45 * Note that the returned profile gets updated whenever new SAX events
46 * are received by this content handler. Use the {@link #getLanguage()}
47 * method to get the language that best matches the current state of
48 * the profile.
49 *
50 * @return language profile
51 */
52 public LanguageProfile getProfile() {
53 return writer.getProfile();
54 }
55
56 /**
57 * Returns the language that best matches the current state of the
58 * language profile.
59 *
60 * @return language that best matches the current profile
61 */
62 public LanguageIdentifier getLanguage() {
63 return writer.getLanguage();
64 }
65
66 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.language;
17
18 import java.io.IOException;
19 import java.io.Writer;
20
21 /**
22 * Writer that builds a language profile based on all the written content.
23 *
24 * @since Apache Tika 0.5
25 */
26 public class ProfilingWriter extends Writer {
27
28 private final LanguageProfile profile;
29
30 private char[] buffer = new char[] { 0, 0, '_' };
31
32 private int n = 1;
33
34 public ProfilingWriter(LanguageProfile profile) {
35 this.profile = profile;
36 }
37
38 public ProfilingWriter() {
39 this(new LanguageProfile());
40 }
41
42 /**
43 * Returns the language profile being built by this writer. Note that
44 * the returned profile gets updated whenever new characters are written.
45 * Use the {@link #getLanguage()} method to get the language that best
46 * matches the current state of the profile.
47 *
48 * @return language profile
49 */
50 public LanguageProfile getProfile() {
51 return profile;
52 }
53
54 /**
55 * Returns the language that best matches the current state of the
56 * language profile.
57 *
58 * @return language that best matches the current profile
59 */
60 public LanguageIdentifier getLanguage() {
61 return new LanguageIdentifier(profile);
62 }
63
64 @Override
65 public void write(char[] cbuf, int off, int len) {
66 for (int i = 0; i < len; i++) {
67 char c = Character.toLowerCase(cbuf[off + i]);
68 if (Character.isLetter(c)) {
69 addLetter(c);
70 } else {
71 addSeparator();
72 }
73 }
74 }
75
76 private void addLetter(char c) {
77 System.arraycopy(buffer, 1, buffer, 0, buffer.length - 1);
78 buffer[buffer.length - 1] = c;
79 n++;
80 if (n >= buffer.length) {
81 profile.add(new String(buffer));
82 }
83 }
84
85 private void addSeparator() {
86 addLetter('_');
87 n = 1;
88 }
89
90 @Override
91 public void close() throws IOException {
92 addSeparator();
93 }
94
95 /**
96 * Ignored.
97 */
98 @Override
99 public void flush() {
100 }
101
102 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 /**
18 * Language detection.
19 */
20 @aQute.bnd.annotation.Version("1.0.0")
21 package org.apache.tika.language;
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.metadata;
17
/**
 * Metadata keys from NCAR CCSM files in the <a
 * href="http://cf-pcmdi.llnl.gov/">Climate Forecast Convention</a>.
 * <p>
 * Interface fields are implicitly {@code public static final}, so the
 * modifiers are not spelled out.
 */
public interface ClimateForcast {

    String PROGRAM_ID = "prg_ID";

    String COMMAND_LINE = "cmd_ln";

    String HISTORY = "history";

    String TABLE_ID = "table_id";

    String INSTITUTION = "institution";

    String SOURCE = "source";

    String CONTACT = "contact";

    String PROJECT_ID = "project_id";

    String CONVENTIONS = "Conventions";

    String REFERENCES = "references";

    String ACKNOWLEDGEMENT = "acknowledgement";

    String REALIZATION = "realization";

    String EXPERIMENT_ID = "experiment_id";

    String COMMENT = "comment";

    String MODEL_NAME_ENGLISH = "model_name_english";

}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.metadata;
17
/**
 * Names of the Creative Commons metadata properties.
 *
 * @see <a href="http://www.creativecommons.org/">creativecommons.org</a>
 */
public interface CreativeCommons {

    /** Property name {@code "License-Location"}. */
    String LICENSE_LOCATION = "License-Location";

    /** Property name {@code "License-Url"}. */
    String LICENSE_URL = "License-Url";

    /** Property name {@code "Work-Type"}. */
    String WORK_TYPE = "Work-Type";

}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.metadata;
17
18 /**
19 * A collection of Dublin Core metadata names.
20 *
21 * @see <a href="http://dublincore.org">dublincore.org</a>
22 */
23 public interface DublinCore {
24
25 public static final String NAMESPACE_URI_DC = "http://purl.org/dc/elements/1.1/";
26 public static final String NAMESPACE_URI_DC_TERMS = "http://purl.org/dc/terms/";
27 public static final String PREFIX_DC = "dc";
28 public static final String PREFIX_DC_TERMS = "dcterms";
29
30 /**
31 * Typically, Format may include the media-type or dimensions of the
32 * resource. Format may be used to determine the software, hardware or
33 * other equipment needed to display or operate the resource. Examples
34 * of dimensions include size and duration. Recommended best practice is
35 * to select a value from a controlled vocabulary (for example, the list
36 * of Internet Media Types [MIME] defining computer media formats).
37 */
38 Property FORMAT = Property.internalText(
39 PREFIX_DC + Metadata.NAMESPACE_PREFIX_DELIMITER + "format");
40
41 /**
42 * Recommended best practice is to identify the resource by means of
43 * a string or number conforming to a formal identification system.
44 * Example formal identification systems include the Uniform Resource
45 * Identifier (URI) (including the Uniform Resource Locator (URL)),
46 * the Digital Object Identifier (DOI) and the International Standard
47 * Book Number (ISBN).
48 */
49 Property IDENTIFIER = Property.internalText(
50 PREFIX_DC + Metadata.NAMESPACE_PREFIX_DELIMITER + "identifier");
51
52 /**
53 * Date on which the resource was changed.
54 */
55 Property MODIFIED = Property.internalDate(
56 PREFIX_DC_TERMS + Metadata.NAMESPACE_PREFIX_DELIMITER + "modified");
57
58 /**
59 * An entity responsible for making contributions to the content of the
60 * resource. Examples of a Contributor include a person, an organisation,
61 * or a service. Typically, the name of a Contributor should be used to
62 * indicate the entity.
63 */
64 Property CONTRIBUTOR = Property.internalTextBag(
65 PREFIX_DC + Metadata.NAMESPACE_PREFIX_DELIMITER + "contributor");
66
67 /**
68 * The extent or scope of the content of the resource. Coverage will
69 * typically include spatial location (a place name or geographic
70 * coordinates), temporal period (a period label, date, or date range)
71 * or jurisdiction (such as a named administrative entity). Recommended
72 * best practice is to select a value from a controlled vocabulary (for
73 * example, the Thesaurus of Geographic Names [TGN]) and that, where
74 * appropriate, named places or time periods be used in preference to
75 * numeric identifiers such as sets of coordinates or date ranges.
76 */
77 Property COVERAGE = Property.internalText(
78 PREFIX_DC + Metadata.NAMESPACE_PREFIX_DELIMITER + "coverage");
79
80 /**
81 * An entity primarily responsible for making the content of the resource.
82 * Examples of a Creator include a person, an organisation, or a service.
83 * Typically, the name of a Creator should be used to indicate the entity.
84 */
85 Property CREATOR = Property.internalTextBag(
86 PREFIX_DC + Metadata.NAMESPACE_PREFIX_DELIMITER + "creator");
87
88 /**
89 * Date of creation of the resource.
90 */
91 Property CREATED = Property.internalDate(
92 PREFIX_DC_TERMS + Metadata.NAMESPACE_PREFIX_DELIMITER + "created");
93
94 /**
95 * A date associated with an event in the life cycle of the resource.
96 * Typically, Date will be associated with the creation or availability of
97 * the resource. Recommended best practice for encoding the date value is
98 * defined in a profile of ISO 8601 [W3CDTF] and follows the YYYY-MM-DD
99 * format.
100 */
101 Property DATE = Property.internalDate(
102 PREFIX_DC + Metadata.NAMESPACE_PREFIX_DELIMITER + "date");
103
104 /**
105 * An account of the content of the resource. Description may include
106 * but is not limited to: an abstract, table of contents, reference to
107 * a graphical representation of content or a free-text account of
108 * the content.
109 */
110 Property DESCRIPTION = Property.internalText(
111 PREFIX_DC + Metadata.NAMESPACE_PREFIX_DELIMITER + "description");
112
113 /**
114 * A language of the intellectual content of the resource. Recommended
115 * best practice is to use RFC 3066 [RFC3066], which, in conjunction
116 * with ISO 639 [ISO639], defines two- and three-letter primary language
117 * tags with optional subtags. Examples include "en" or "eng" for English,
118 * "akk" for Akkadian, and "en-GB" for English used in the United Kingdom.
119 */
120 Property LANGUAGE = Property.internalText(
121 PREFIX_DC + Metadata.NAMESPACE_PREFIX_DELIMITER + "language");
122
123 /**
124 * An entity responsible for making the resource available. Examples of
125 * a Publisher include a person, an organisation, or a service. Typically,
126 * the name of a Publisher should be used to indicate the entity.
127 */
128 Property PUBLISHER = Property.internalText(
129 PREFIX_DC + Metadata.NAMESPACE_PREFIX_DELIMITER + "publisher");
130
131 /**
132 * A reference to a related resource. Recommended best practice is to
133 * reference the resource by means of a string or number conforming to
134 * a formal identification system.
135 */
136 Property RELATION = Property.internalText(
137 PREFIX_DC + Metadata.NAMESPACE_PREFIX_DELIMITER + "relation");
138
139 /**
140 * Information about rights held in and over the resource. Typically,
141 * a Rights element will contain a rights management statement for
142 * the resource, or reference a service providing such information.
143 * Rights information often encompasses Intellectual Property Rights
144 * (IPR), Copyright, and various Property Rights. If the Rights element
145 * is absent, no assumptions can be made about the status of these and
146 * other rights with respect to the resource.
147 */
148 Property RIGHTS = Property.internalText(
149 PREFIX_DC + Metadata.NAMESPACE_PREFIX_DELIMITER + "rights");
150
151 /**
152 * A reference to a resource from which the present resource is derived.
153 * The present resource may be derived from the Source resource in whole
154 * or in part. Recommended best practice is to reference the resource by
155 * means of a string or number conforming to a formal identification
156 * system.
157 */
158 Property SOURCE = Property.internalText(
159 PREFIX_DC + Metadata.NAMESPACE_PREFIX_DELIMITER + "source");
160
161 /**
162 * The topic of the content of the resource. Typically, a Subject will
163 * be expressed as keywords, key phrases or classification codes that
164 * describe a topic of the resource. Recommended best practice is to
165 * select a value from a controlled vocabulary or formal classification
166 * scheme.
167 */
168 Property SUBJECT = Property.internalTextBag(
169 PREFIX_DC + Metadata.NAMESPACE_PREFIX_DELIMITER + "subject");
170
171 /**
172 * A name given to the resource. Typically, a Title will be a name by
173 * which the resource is formally known.
174 */
175 Property TITLE = Property.internalText(
176 PREFIX_DC + Metadata.NAMESPACE_PREFIX_DELIMITER + "title");
177
178 /**
179 * The nature or genre of the content of the resource. Type includes terms
180 * describing general categories, functions, genres, or aggregation levels
181 * for content. Recommended best practice is to select a value from a
182 * controlled vocabulary (for example, the DCMI Type Vocabulary
183 * [DCMITYPE]). To describe the physical or digital manifestation of
184 * the resource, use the Format element.
185 */
186 Property TYPE = Property.internalText(
187 PREFIX_DC + Metadata.NAMESPACE_PREFIX_DELIMITER + "type");
188
189 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.metadata;
17
18 /**
19 * Geographic schema. This is a collection of
20 * {@link Property property definition} constants for geographic
21 * information, as defined in the W3C Geo Vocabularies.
22 *
23 * @since Apache Tika 0.8
24 * @see <a href="http://www.w3.org/2003/01/geo/"
25 * >W3C Basic Geo Vocabulary</a>
26 */
27 public interface Geographic {
28
29 /**
30 * The WGS84 Latitude of the Point
31 */
32 Property LATITUDE =
33 Property.internalReal("geo:lat");
34
35 /**
36 * The WGS84 Longitude of the Point
37 */
38 Property LONGITUDE =
39 Property.internalReal("geo:long");
40
41 /**
42 * The WGS84 Altitude of the Point
43 */
44 Property ALTITUDE =
45 Property.internalReal("geo:alt");
46
47 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.metadata;
17
18 /**
19 * A collection of HTTP header names.
20 *
21 * @see <a href="http://rfc-ref.org/RFC-TEXTS/2616/">Hypertext Transfer Protocol --
22 * HTTP/1.1 (RFC 2616)</a>
23 */
24 public interface HttpHeaders {
25
26 String CONTENT_ENCODING = "Content-Encoding";
27
28 String CONTENT_LANGUAGE = "Content-Language";
29
30 String CONTENT_LENGTH = "Content-Length";
31
32 String CONTENT_LOCATION = "Content-Location";
33
34 String CONTENT_DISPOSITION = "Content-Disposition";
35
36 String CONTENT_MD5 = "Content-MD5";
37
38 String CONTENT_TYPE = "Content-Type";
39
40 Property LAST_MODIFIED =
41 Property.internalDate("Last-Modified");
42
43 String LOCATION = "Location";
44
45 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *
16 * IPTC Metadata Descriptions taken from the IPTC Photo Metadata (July 2010)
17 * standard. These parts Copyright 2010 International Press Telecommunications
18 * Council.
19 */
20 package org.apache.tika.metadata;
21
22 /**
23 * IPTC photo metadata schema.
24 *
25 * A collection of
26 * {@link Property property definition} constants for the photo metadata
27 * properties defined in the IPTC standard.
28 *
29 * @since Apache Tika 1.1
30 * @see <a href="http://www.iptc.org/std/photometadata/specification/IPTC-PhotoMetadata-201007_1.pdf">IPTC Photo Metadata</a>
31 */
32 public interface IPTC {
33
34 String NAMESPACE_URI_IPTC_CORE = "http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/";
35 String NAMESPACE_URI_IPTC_EXT = "http://iptc.org/std/Iptc4xmpExt/2008-02-29/";
36 String NAMESPACE_URI_PLUS = "http://ns.useplus.org/ldf/xmp/1.0/";
37
38 String PREFIX_IPTC_CORE = "Iptc4xmpCore";
39 String PREFIX_IPTC_EXT = "Iptc4xmpExt";
40 String PREFIX_PLUS = "plus";
41
42 /**
43 * Name of the city the content is focussing on -- either the place shown
44 * in visual media or referenced by text or audio media. This element is at
45 * the third level of a top-down geographical hierarchy.
46 * <p>
47 * This is a detail of a location with blurred semantics as it does not
48 * clearly indicate whether it is the location in the image or the location
49 * the photo was taken - which can be different. Two more concise properties
50 * are available in IPTC Extension with Location Created and Location Shown
51 * in the Image.
52 * <p>
53 * Maps to this IIM property: 2:90 City
54 *
55 * @see Photoshop#CITY
56 */
57 Property CITY = Photoshop.CITY;
58
59 /**
60 * Full name of the country the content is focussing on -- either the
61 * country shown in visual media or referenced in text or audio media. This
62 * element is at the top/first level of a top- down geographical hierarchy.
63 * The full name should be expressed as a verbal name and not as a code, a
64 * code should go to the element "CountryCode"
65 * <p>
66 * This is a detail of a location with blurred semantics as it does not
67 * clearly indicate whether it is the location in the image or the location
68 * the photo was taken - which can be different. Two more concise properties
69 * are available in IPTC Extension with Location Created and Location Shown
70 * in the Image.
71 * <p>
72 * Maps to this IIM property: 2:101 Country/Primary Location Name
73 *
74 * @see Photoshop#COUNTRY
75 */
76 Property COUNTRY = Photoshop.COUNTRY;
77
78 /**
79 * Code of the country the content is focussing on -- either the country
80 * shown in visual media or referenced in text or audio media. This element
81 * is at the top/first level of a top-down geographical hierarchy. The code
82 * should be taken from ISO 3166 two or three letter code. The full name of
83 * a country should go to the "Country" element.
84 * <p>
85 * This is a detail of a location with blurred semantics as it does not
86 * clearly indicate whether it is the location in the image or the location
87 * the photo was taken - which can be different. Two more concise properties
88 * are available in IPTC Extension with Location Created and Location Shown
89 * in the Image.
90 * <p>
91 * Maps to this IIM property: 2:100 Country/Primary Location Code
92 */
93 Property COUNTRY_CODE = Property.internalText(
94 PREFIX_IPTC_CORE + Metadata.NAMESPACE_PREFIX_DELIMITER + "CountryCode");
95
96 /**
97 * A textual description, including captions, of the item's content,
98 * particularly used where the object is not text.
99 * <p>
100 * Note: the XMP property (dc:description) which stores the value of this
101 * IPTC Core property is of type Lang Alt. Hence any software agent dealing
102 * with this property must abide to the processing rules for
103 * Lang Alt value type as specified by the XMP specifications.
104 * <p>
105 * Maps to this IIM property: 2:120 Caption/Abstract
106 *
107 * @see DublinCore#DESCRIPTION
108 */
109 Property DESCRIPTION = DublinCore.DESCRIPTION;
110
111 /**
112 * A brief synopsis of the caption. Headline is not the same as Title.
113 * <p>
114 * Maps to this IIM property: 2:105 Headline
115 *
116 * @see Photoshop#HEADLINE
117 */
118 Property HEADLINE = Photoshop.HEADLINE;
119
120 /**
121 * Describes the nature, intellectual, artistic or journalistic
122 * characteristic of an item, not specifically its content.
123 * <p>
124 * The IPTC recognizes that the corresponding IPTC Genre NewsCodes needs
125 * photo specific extension to be better usable with this field (as of the
126 * release of this standard in the year 2008).
127 * <p>
128 * Maps to this IIM property: 2:04 Object Attribute Reference
129 */
130 Property INTELLECTUAL_GENRE = Property.internalText(
131 PREFIX_IPTC_CORE + Metadata.NAMESPACE_PREFIX_DELIMITER + "IntellectualGenre");
132
133 /**
134 * Keywords to express the subject of the content. Keywords may be free
135 * text and don't have to be taken from a controlled vocabulary. Codes from
136 * the controlled vocabulary IPTC Subject NewsCodes must go to the
137 * "Subject Code" field.
138 * <p>
139 * Single values of this field should not be restricted to single words
140 * but must allow for phrases as well.
141 * <p>
142 * Maps to this IIM property: 2:25 Keywords
143 *
144 * @see DublinCore#SUBJECT
145 */
146 Property KEYWORDS = DublinCore.SUBJECT;
147
148 /**
149 * Name of the subregion of a country -- either called province or state or
150 * anything else -- the content is focussing on -- either the subregion
151 * shown in visual media or referenced by text or audio media. This element
152 * is at the second level of a top-down geographical hierarchy.
153 * <p>
154 * This is a detail of a location with blurred semantics as it does not
155 * clearly indicate whether it is the location in the image or the location
156 * the photo was taken - which can be different. Two more concise properties
157 * are available in IPTC Extension with Location Created and Location Shown
158 * in the Image.
159 * <p>
160 * Maps to this IIM property: 2:95 Province/State
161 *
162 * @see Photoshop#STATE
163 */
164 Property PROVINCE_OR_STATE = Photoshop.STATE;
165
166 /**
167 * Describes the scene of a news content. Specifies one or more terms
168 * from the IPTC "Scene-NewsCodes". Each Scene is represented as a string of
169 * 6 digits in an unordered list.
170 * <p>
171 * Note: Only Scene values from this IPTC taxonomy should be used here. More
172 * about the IPTC Scene-NewsCodes at www.newscodes.org.
173 */
174 Property SCENE_CODE = Property.internalTextBag(
175 PREFIX_IPTC_CORE + Metadata.NAMESPACE_PREFIX_DELIMITER + "Scene");
176
177 /**
178 * Specifies one or more Subjects from the IPTC Subject-NewsCodes taxonomy
179 * to categorise the content. Each Subject is represented as a string of 8
180 * digits in an unordered list.
181 * <p>
182 * Note: Only Subjects from a controlled vocabulary should be used here,
183 * free text has to be put into the Keyword element. More about
184 * IPTC Subject-NewsCodes at www.newscodes.org.
185 */
186 Property SUBJECT_CODE = Property.internalTextBag(
187 PREFIX_IPTC_CORE + Metadata.NAMESPACE_PREFIX_DELIMITER + "SubjectCode");
188
189 /**
190 * Name of a sublocation the content is focussing on -- either the
191 * location shown in visual media or referenced by text or audio media. This
192 * location name could either be the name of a sublocation to a city or the
193 * name of a well known location or (natural) monument outside a city. In
194 * the sense of a sublocation to a city this element is at the fourth level
195 * of a top-down geographical hierarchy.
196 * <p>
197 * This is a detail of a location with blurred semantics as it does not
198 * clearly indicate whether it is the location in the image or the location
199 * the photo was taken - which can be different. Two more concise properties
200 * are available in IPTC Extension with Location Created and Location Shown
201 * in the Image.
202 * <p>
203 * Maps to this IIM property: 2:92 Sublocation
204 */
205 Property SUBLOCATION = Property.internalText(
206 PREFIX_IPTC_CORE + Metadata.NAMESPACE_PREFIX_DELIMITER + "Location");
207
208 /**
209 * Designates the date and optionally the time the intellectual content was
210 * created rather than the date of the creation of the physical
211 * representation.
212 * <p>
213 * If a software system requires explicit time values and no time is given
214 * by the Date Created property the software system should default the time
215 * to 00:00:00. If the software system does not require an explicit time
216 * value the time part should be left empty as it is.
217 * <p>
218 * Note 1: Any content of the IIM dataset 2:60, Time Created, should be
219 * merged to this element.
220 * Note 2: Implementers are encouraged to provide
221 * the creation date and time from the EXIF data of a digital
222 * camera to the user for entering this date for the first time.
223 * <p>
224 * Maps to this IIM property: 2:55 Date Created
225 *
226 * @see Photoshop#DATE_CREATED
227 */
228 Property DATE_CREATED = Photoshop.DATE_CREATED;
229
230 /**
231 * Identifier or the name of the person involved in writing, editing or
232 * correcting the description of the content.
233 * <p>
234 * Maps to this IIM property: 2:122 Writer/Editor
235 *
236 * @see Photoshop#CAPTION_WRITER
237 */
238 Property DESCRIPTION_WRITER = Photoshop.CAPTION_WRITER;
239
240 /**
241 * Any of a number of instructions from the provider or creator to the
242 * receiver of the item.
243 * <p>
244 * Maps to this IIM property: 2:40 Special Instruction
245 *
246 * @see Photoshop#INSTRUCTIONS
247 */
248 Property INSTRUCTIONS = Photoshop.INSTRUCTIONS;
249
250 /**
251 * Number or identifier for the purpose of improved workflow handling. This
252 * is a user created identifier related to the job for which the item is
253 * supplied.
254 * <p>
255 * Note: As this identifier references a job of the receiver's workflow it
256 * must first be issued by the receiver, then transmitted to the creator or
257 * provider of the news object and finally added by the creator
258 * to this field.
259 * <p>
260 * Maps to this IIM property: 2:103 Original Transmission Reference
261 *
262 * @see Photoshop#TRANSMISSION_REFERENCE
263 */
264 Property JOB_ID = Photoshop.TRANSMISSION_REFERENCE;
265
266 /**
267 * A shorthand reference for the item. Title provides a short human readable
268 * name which can be a text and/or numeric reference. It is not the same as
269 * Headline.
270 * <p>
271 * Many use the Title field to store the filename of the image, though the
272 * field may be used in many ways. Formal identifiers are provided by the
273 * Digital Image Id, or the Registry Entry property of the IPTC Extension.
274 * <p>
275 * Note 1: This element aligns with the use of Dublin Core's "Title"
276 * element.
277 * Note 2: the XMP property (dc:title) which stores the value of
278 * this IPTC Core property is of type Lang Alt. Hence any software agent
279 * dealing with this property must abide to the processing rules for Lang
280 * Alt value type as specified by the XMP specifications.
281 * <p>
282 * Maps to this IIM property: 2:05 Object Name
283 *
284 * @see DublinCore#TITLE
285 */
286 Property TITLE = DublinCore.TITLE;
287
288 /**
289 * Contains any necessary copyright notice for claiming the intellectual
290 * property for this item and should identify the current owner of the
291 * copyright for the item. Other entities like the creator of the item may
292 * be added in the corresponding field. Notes on usage rights should be
293 * provided in "Rights usage terms".
294 * <p>
295 * Copyright ownership can be expressed in a more controlled way using the
296 * PLUS fields "Copyright Owner", "Copyright Owner ID",
297 * "Copyright Owner Name" of the IPTC Extension. It is the user's
298 * responsibility to keep the values of the four fields in sync.
299 * <p>
300 * Note: the XMP property (dc:rights) which stores the value of this IPTC
301 * Core property is of type Lang Alt. Hence any software agent dealing with
302 * this property must abide to the processing rules for Lang Alt
303 * value type as specified by the XMP specifications.
304 * <p>
305 * Maps to this IIM property: 2:116 Copyright Notice
306 *
307 * @see DublinCore#RIGHTS
308 */
309 Property COPYRIGHT_NOTICE = DublinCore.RIGHTS;
310
311 /**
312 * Contains the name of the person who created the content of this item, a
313 * photographer for photos, a graphic artist for graphics, or a writer for
314 * textual news, but in cases where the photographer should not be
315 * identified the name of a company or organisation may be appropriate.
316 * <p>
317 * The creator can be expressed in a more controlled way using the
318 * "Image Creator" of PLUS in the IPTC Extension additionally. It is the
319 * user's responsibility to keep the values of the IPTC Core and the PLUS
320 * fields in sync.
321 * <p>
322 * Maps to this IIM property: 2:80 By-line
323 *
324 * @see DublinCore#CREATOR
325 */
326 Property CREATOR = DublinCore.CREATOR;
327
328 /**
329 * The creator's contact information provides all necessary information to
330 * get in contact with the creator of this item and comprises a set of
331 * sub-properties for proper addressing.
332 * <p>
333 * The IPTC Extension Licensor fields should be used instead of these
334 * Creator's Contact Info fields if you are using IPTC Extension fields. If
335 * the creator is also the licensor his or her contact information should be
336 * provided in the Licensor fields.
337 * <p>
338 * Note 1 to user interface implementers: All sub-properties of "Creator's
339 * contact information" should be shown as group on the form.
340 * Note 2: the
341 * CreatorContactInfo sub-properties' naming aligns with the vCard
342 * specification RFC 2426.
343 */
344 Property CREATORS_CONTACT_INFO = Property.internalText(
345 PREFIX_IPTC_CORE + Metadata.NAMESPACE_PREFIX_DELIMITER + "CreatorContactInfo");
346
347 /**
348 * Contains the job title of the person who created the content of this
349 * item. As this is sort of a qualifier the Creator element has to be filled
350 * in as mandatory prerequisite for using Creator's Jobtitle.
351 * <p>
352 * Maps to this IIM property: 2:85 By-line Title
353 *
354 * @see Photoshop#AUTHORS_POSITION
355 */
356 Property CREATORS_JOB_TITLE = Photoshop.AUTHORS_POSITION;
357
358 /**
359 * The credit to person(s) and/or organisation(s) required by the supplier
360 * of the item to be used when published. This is a free-text field.
361 * <p>
362 * Note 1: For more formal identifications of the creator or the owner of
363 * the copyrights of this image other rights properties may be used.
364 * Note 2:
365 * This property was named "Credit" by the IIM metadata, then it was renamed
366 * to "Provider" in IPTC Core 1.0. In IPTC Core 1.1. it has been renamed to
367 * "Credit Line" as the field is used for this purpose by many users.
368 * <p>
369 * Maps to this IIM property: 2:110 Credit
370 *
371 * @see Photoshop#CREDIT
372 */
373 Property CREDIT_LINE = Photoshop.CREDIT;
374
375 /**
376 * The licensing parameters of the item expressed in free-text.
377 * <p>
378 * The PLUS fields of the IPTC Extension can be used in parallel to express
379 * the licensed usage in more controlled terms.
380 */
381 Property RIGHTS_USAGE_TERMS = XMPRights.USAGE_TERMS;
382
383 /**
384 * Identifies the original owner of the copyright for the intellectual
385 * content of the item. This could be an agency, a member of an agency or an
386 * individual. Source could be different from Creator and from the entities
387 * in the CopyrightNotice.
388 * <p>
389 * The original owner can never change. For that reason the content of this
390 * property should never be changed or deleted after the information is
391 * entered following the news object's initial creation.
392 * <p>
393 * Maps to this IIM property: 2:115 Source
394 *
395 * @see Photoshop#SOURCE
396 */
397 Property SOURCE = Photoshop.SOURCE;
398
399 /**
400 * The contact information address part. Comprises an optional company name
401 * and all required information to locate the building or postbox to which
402 * mail should be sent. To that end, the address is a multiline field.
403 * <p>
404 * Note 1: to user interface implementers: This field should be part of a
405 * "Contact information" group on the form.
406 * Note 2: the ContactInfo naming aligns with the vCard specification RFC 2426.
407 */
408 Property CONTACT_INFO_ADDRESS = Property.internalTextBag(
409 PREFIX_IPTC_CORE + Metadata.NAMESPACE_PREFIX_DELIMITER + "CiAdrExtadr");
410
411 /**
412 * The contact information city part.
413 * <p>
414 * Note 1: to user interface implementers: This field should be part of a
415 * "Contact information" group on the form.
416 * Note 2: the ContactInfo naming aligns with the vCard specification RFC 2426.
417 */
418 Property CONTACT_INFO_CITY = Property.internalText(
419 PREFIX_IPTC_CORE + Metadata.NAMESPACE_PREFIX_DELIMITER + "CiAdrCity");
420
421 /**
422 * The contact information country part.
423 * <p>
424 * Note 1: to user interface implementers: This field should be part of a
425 * "Contact information" group on the form.
426 * Note 2: the ContactInfo naming aligns with the vCard specification RFC 2426.
427 */
428 Property CONTACT_INFO_COUNTRY = Property.internalText(
429 PREFIX_IPTC_CORE + Metadata.NAMESPACE_PREFIX_DELIMITER + "CiAdrCtry");
430
431 /**
432 * The contact information email address part.
433 * <p>
434 * Multiple email addresses can be given. May have to be separated by a
435 * comma in the user interface.
436 * <p>
437 * Note 1: to user interface implementers: This field should be part of a
438 * "Contact information" group on the form.
439 * Note 2 to user interface
440 * implementers: provide sufficient space to fill in multiple e-mail
441 * addresses.
442 * Note 3: the ContactInfo naming aligns with the vCard
443 * specification RFC 2426.
444 */
445 Property CONTACT_INFO_EMAIL = Property.internalTextBag(
446 PREFIX_IPTC_CORE + Metadata.NAMESPACE_PREFIX_DELIMITER + "CiEmailWork");
447
448 /**
449 * The contact information phone number part.
450 * <p>
451 * Multiple numbers can be given. May have to be separated by a
452 * comma in the user interface.
453 * <p>
454 * Note 1: to user interface implementers: This field should be part of a
455 * "Contact information" group on the form.
456 * Note 2 to user interface
457 * implementers: provide sufficient space to fill in multiple international
458 * numbers.
459 * Note 3: the ContactInfo naming aligns with the vCard
460 * specification RFC 2426.
461 */
462 Property CONTACT_INFO_PHONE = Property.internalTextBag(
463 PREFIX_IPTC_CORE + Metadata.NAMESPACE_PREFIX_DELIMITER + "CiTelWork");
464
465 /**
466 * The contact information part denoting the local postal code.
467 * <p>
468 * Note 1: to user interface implementers: This field should be part of a
469 * "Contact information" group on the form.
470 * Note 2: the ContactInfo naming aligns with the vCard specification RFC 2426.
471 */
472 Property CONTACT_INFO_POSTAL_CODE = Property.internalText(
473 PREFIX_IPTC_CORE + Metadata.NAMESPACE_PREFIX_DELIMITER + "CiAdrPcode");
474
475 /**
476 * The contact information part denoting regional information such as state or province.
477 * <p>
478 * Note 1: to user interface implementers: This field should be part of a
479 * "Contact information" group on the form.
480 * Note 2: the ContactInfo naming aligns with the vCard specification RFC 2426.
481 */
482 Property CONTACT_INFO_STATE_PROVINCE = Property.internalText(
483 PREFIX_IPTC_CORE + Metadata.NAMESPACE_PREFIX_DELIMITER + "CiAdrRegion");
484
485 /**
486 * The contact information web address part. Multiple addresses can be given, separated by a comma.
487 * <p>
488 * Note 1: to user interface implementers: This field should be part of a
489 * "Contact information" group on the form.
490 * Note 2 to user interface
491 * implementers: provide sufficient space to fill in multiple URLs.
492 * Note 3: the ContactInfo naming aligns with the vCard
493 * specification RFC 2426.
494 */
495 Property CONTACT_INFO_WEB_URL = Property.internalTextBag(
496 PREFIX_IPTC_CORE + Metadata.NAMESPACE_PREFIX_DELIMITER + "CiUrlWork");
497
498 /**
499 * As this metadata element pertains to distribution management, it was not
500 * adopted. However, this data is still synchronised with the XMP property
501 * [photoshop:Urgency], and hence, available for future use, but outside the
502 * IPTC Core.
503 *
504 * @deprecated
505 */
506 Property URGENCY = Photoshop.URGENCY;
507
508 /**
509 * As this metadata element was earmarked as deprecated already for IIM 4.1,
510 * it was not adopted. However, this data is still synchronised with the XMP
511 * property [photoshop:Category], and hence available for future use - but
512 * outside the IPTC Core. For migrating from Category codes to Subject Codes
513 * please read the Guideline for mapping Category Codes to Subject NewsCodes
514 * section below.
515 *
516 * @deprecated
517 */
518 Property CATEGORY = Photoshop.CATEGORY;
519
520 /**
521 * As this metadata element was earmarked as deprecated already for IIM 4.1,
522 * it was not adopted. However, this data is still synchronised with the XMP
523 * property [photoshop:SupplementalCategories], and hence available for
524 * future use - but outside the IPTC Core.
525 *
526 * @deprecated
527 */
528 Property SUPPLEMENTAL_CATEGORIES = Photoshop.SUPPLEMENTAL_CATEGORIES;
529
530 /**
531 * Information about the ethnicity and other facets of the model(s) in a
532 * model-released image.
533 * <p>
534 * Use the Model Age field for the age of model(s).
535 */
536 Property ADDITIONAL_MODEL_INFO = Property.internalText(
537 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "AddlModelInfo");
538
539 /**
540 * A set of metadata about artwork or an object in the item
541 */
542 Property ARTWORK_OR_OBJECT = Property.internalTextBag(
543 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "ArtworkOrObject");
544
545 /**
546 * Code from a controlled vocabulary identifying the organisation or company which is featured in the content.
547 */
548 Property ORGANISATION_CODE = Property.internalTextBag(
549 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "OrganisationInImageCode");
550
551 /**
552 * A term to describe the content of the image by a value from a Controlled
553 * Vocabulary.
554 * <p>
555 * This property is part of the Photo Metadata 2008 specifications, but
556 * should not be released to the public on the standard Adobe Custom Panels for
557 * IPTC metadata or other user interfaces unless agreed by the IPTC.
558 */
559 Property CONTROLLED_VOCABULARY_TERM = Property.internalTextBag(
560 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "CVterm");
561
562 /**
563 * A location the content of the item is about. For photos that is a
564 * location shown in the image.
565 * <p>
566 * If the location the image was taken in is different from this location
567 * the property Location Created should be used too.
568 */
569 Property LOCATION_SHOWN = Property.internalTextBag(
570 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "LocationShown");
571
572 /**
573 * Age of the human model(s) at the time this image was taken in a model
574 * released image.
575 * <p>
576 * The user should be aware of any legal implications of providing ages for
577 * young models. Ages below 18 years should not be included.
578 */
579 Property MODEL_AGE = Property.internalTextBag(
580 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "ModelAge");
581
582 /**
583 * Name of the organisation or company which is featured in the content.
584 * <p>
585 * May be supplemented by values from a controlled vocabulary in the
586 * Organisation Code field.
587 */
588 Property ORGANISATION_NAME = Property.internalTextBag(
589 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "OrganisationInImageName");
590
591 /**
592 * Name of a person the content of the item is about. For photos that is a
593 * person shown in the image.
594 */
595 Property PERSON = Property.internalTextBag(
596 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "PersonInImage");
597
598 /**
599 * Globally unique identifier for the item. It is created and applied by the
600 * creator of the item at the time of its creation. This value shall not be
601 * changed after that time.
602 * <p>
603 * The identifier will probably be generated by the technical means of an
604 * imaging device or software and should be applied to the digital image
605 * file as early as possible in its life cycle. This identifier does not
606 * identify any pictured content, particularly in case of a scan of non-
607 * digital images, only this digital representation.
608 * <p>
609 * Any algorithm to create this identifier has to comply with the technical
610 * requirements to create a globally unique id. Any device creating digital
611 * images - e.g. still image cameras, video cameras, scanners - should
612 * create such an identifier right at the time of the creation of the digital
613 * data and add the id to the set of metadata without compromising
614 * performance. It is recommended that this image identifier allows
615 * identifying the device by which the image data and the GUID were created.
616 * IPTC's basic requirements for unique ids are:
617 * - It must be globally unique. Algorithms for this purpose exist.
618 * - It should identify the camera body.
619 * - It should identify each individual photo from this camera body.
620 * - It should identify the date and time of the creation of the picture.
621 * - It should be secured against tampering.
622 * This field should be implemented in a way to prove it has not been changed since its value has
623 * been applied. If the identifier has been created by the imaging device
624 * its type and brand can be found in the Exif/technical metadata.
625 */
626 Property DIGITAL_IMAGE_GUID = Property.internalText(
627 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "DigImageGUID");
628
629 /**
630 * The type of the source digital file.
631 * <p>
632 * The IPTC recommends not to implement this property any longer.
633 *
634 * @deprecated
635 */
636 Property DIGITAL_SOURCE_FILE_TYPE = Property.internalText(
637 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "DigitalSourcefileType");
638
639 /**
640 * The type of the source of this digital image
641 */
642 Property DIGITAL_SOURCE_TYPE = Property.internalText(
643 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "DigitalSourceType");
644
645 /**
646 * Names or describes the specific event the content relates to.
647 * <p>
648 * Examples are: a press conference, dedication ceremony, etc. If this is a
649 * sub-event of a larger event both can be provided by the field: e.g. XXXIX
650 * Olympic Summer Games (Beijing): opening ceremony. Unplanned events could
651 * be named by this property too.
652 */
653 Property EVENT = Property.internalText(
654 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "Event");
655
656 /**
657 * Both a Registry Item Id and a Registry Organisation Id to record any
658 * registration of this item with a registry.
659 * <p>
660 * Typically an id from a registry is negotiated and applied after the
661 * creation of the digital image.
662 * <p>
663 * Any user interface implementation must show both sub-properties - Item Id
664 * and Organisation Id - as corresponding values. Further an input to both
665 * fields should be made mandatory.
666 */
667 Property IMAGE_REGISTRY_ENTRY = Property.internalTextBag(
668 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "RegistryId");
669
670 /**
671 * Identifies the most recent supplier of the item, who is not necessarily
672 * its owner or creator.
673 * <p>
674 * For identifying the supplier either a well known and/or registered
675 * company name or a URL of the company's web site may be used. This
676 * property succeeds the Provider property of IPTC Core 1.0 by its semantics
677 * as that Provider was renamed to Credit Line.
678 * <p>
679 * This is a PLUS version 1.2 property included in the IPTC Extension
680 * schema.
681 */
682 Property IMAGE_SUPPLIER = Property.internalText(
683 PREFIX_PLUS + Metadata.NAMESPACE_PREFIX_DELIMITER + "ImageSupplier");
684
685 /**
686 * Identifier of the most recent supplier of the item, who is not necessarily
687 * its owner or creator.
688 * <p>
689 * For identifying the supplier either a well known and/or registered
690 * company name or a URL of the company's web site may be used. This
691 * property succeeds the Provider property of IPTC Core 1.0 by its semantics
692 * as that Provider was renamed to Credit Line.
693 * <p>
694 * This is a PLUS version 1.2 property included in the IPTC Extension
695 * schema.
696 */
697 Property IMAGE_SUPPLIER_ID = Property.composite(
698 Property.internalText(
699 PREFIX_PLUS + Metadata.NAMESPACE_PREFIX_DELIMITER + "ImageSupplierID"),
700 new Property[] { Property.internalText(IPTC.IMAGE_SUPPLIER_ID_WRONG_CASE) });
701
702 /** @deprecated use {@link IPTC#IMAGE_SUPPLIER_ID} */
703 public static final String IMAGE_SUPPLIER_ID_WRONG_CASE =
704 PREFIX_PLUS + Metadata.NAMESPACE_PREFIX_DELIMITER + "ImageSupplierId";
705
706 /**
707 * Name of the most recent supplier of the item, who is not necessarily
708 * its owner or creator.
709 * <p>
710 * For identifying the supplier either a well known and/or registered
711 * company name or a URL of the company's web site may be used. This
712 * property succeeds the Provider property of IPTC Core 1.0 by its semantics
713 * as that Provider was renamed to Credit Line.
714 * <p>
715 * This is a PLUS version 1.2 property included in the IPTC Extension
716 * schema.
717 */
718 Property IMAGE_SUPPLIER_NAME = Property.internalText(
719 PREFIX_PLUS + Metadata.NAMESPACE_PREFIX_DELIMITER + "ImageSupplierName");
720
721 /**
722 * Optional identifier assigned by the Image Supplier to the image.
723 * <p>
724 * This is a PLUS version 1.2 property included in the IPTC Extension
725 * schema.
726 */
727 Property IMAGE_SUPPLIER_IMAGE_ID = Property.internalText(
728 PREFIX_PLUS + Metadata.NAMESPACE_PREFIX_DELIMITER + "ImageSupplierImageID");
729
730 /**
731 * The date and optionally time when any of the IPTC photo metadata fields
732 * has been last edited.
733 * <p>
734 * The public use of this property is deprecated by IPTC Extension version
735 * 1.1. It may only still be used by a private user interface for a use
736 * scoped to a company. If used this field should be a timestamp of the
737 * latest change applied to any of the fields.
738 * <p>
739 * The value of this property should never be set by software. XMP-aware
740 * software should reflect any changes to metadata by the xmp:MetadataDate
741 * property of the XMP Basic scheme.
742 */
743 Property IPTC_LAST_EDITED = Property.internalDate(
744 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "IptcLastEdited");
745
746 /**
747 * The location the content of the item was created.
748 * <p>
749 * If the location in the image is different from the location the photo was
750 * taken the IPTC Extension property Location Shown in the Image should be
751 * used.
752 */
753 Property LOCATION_CREATED = Property.internalTextBag(
754 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "LocationCreated");
755
756 /**
757 * The maximum available height in pixels of the original photo from which
758 * this photo has been derived by downsizing.
759 */
760 Property MAX_AVAIL_HEIGHT = Property.internalInteger(
761 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "MaxAvailHeight");
762
763 /**
764 * The maximum available width in pixels of the original photo from which
765 * this photo has been derived by downsizing.
766 */
767 Property MAX_AVAIL_WIDTH = Property.internalInteger(
768 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "MaxAvailWidth");
769
770 /**
771 * The version number of the PLUS standards in place at the time of the
772 * transaction.
773 * <p>
774 * This property was included into the IPTC Extension schema from PLUS
775 * version 1.2 as all other PLUS properties. To reflect this the value of
776 * "PLUS Version" should be set to the string "1.2.0"
777 */
778 Property PLUS_VERSION = Property.internalText(
779 PREFIX_PLUS + Metadata.NAMESPACE_PREFIX_DELIMITER + "Version");
780
781 /**
782 * Owner or owners of the copyright in the licensed image.
783 * <p>
784 * Serves to identify the rights holder/s for the image. The Copyright
785 * Owner, Image Creator and Licensor may be the same or different entities.
786 * <p>
787 * This is a PLUS version 1.2 property included in the IPTC Extension
788 * schema.
789 */
790 Property COPYRIGHT_OWNER = Property.internalTextBag(
791 PREFIX_PLUS + Metadata.NAMESPACE_PREFIX_DELIMITER + "CopyrightOwner");
792
793 /**
794 * The ID of the owner or owners of the copyright in the licensed image.
795 * <p>
796 * Serves to identify the rights holder/s for the image. The Copyright
797 * Owner, Image Creator and Licensor may be the same or different entities.
798 * <p>
799 * This is a PLUS version 1.2 property included in the IPTC Extension
800 * schema.
801 */
802 Property COPYRIGHT_OWNER_ID = Property.composite(
803 Property.internalTextBag(
804 PREFIX_PLUS + Metadata.NAMESPACE_PREFIX_DELIMITER + "CopyrightOwnerID"),
805 new Property[] { Property.internalTextBag(IPTC.COPYRIGHT_OWNER_ID_WRONG_CASE) });
806
807 /** @deprecated use {@link IPTC#COPYRIGHT_OWNER_ID} */
808 public static final String COPYRIGHT_OWNER_ID_WRONG_CASE =
809 PREFIX_PLUS + Metadata.NAMESPACE_PREFIX_DELIMITER + "CopyrightOwnerId";
810
811 /**
812 * The name of the owner or owners of the copyright in the licensed image.
813 * <p>
814 * Serves to identify the rights holder/s for the image. The Copyright
815 * Owner, Image Creator and Licensor may be the same or different entities.
816 * <p>
817 * This is a PLUS version 1.2 property included in the IPTC Extension
818 * schema.
819 */
820 Property COPYRIGHT_OWNER_NAME = Property.internalTextBag(
821 PREFIX_PLUS + Metadata.NAMESPACE_PREFIX_DELIMITER + "CopyrightOwnerName");
822
823 /**
824 * Creator or creators of the image.
825 * <p>
826 * The creator can be additionally expressed in free-text using the IPTC
827 * Core Creator field. In many countries, the Image Creator must be
828 * attributed in association with any use of the image. The Image Creator,
829 * Copyright Owner, Image Supplier and Licensor may be the same or different
830 * entities.
831 * <p>
832 * This is a PLUS version 1.2 property included in the IPTC Extension
833 * schema.
834 */
835 Property IMAGE_CREATOR = Property.internalTextBag(
836 PREFIX_PLUS + Metadata.NAMESPACE_PREFIX_DELIMITER + "ImageCreator");
837
838 /**
839 * The ID of the creator or creators of the image.
840 * <p>
841 * The creator can be additionally expressed in free-text using the IPTC
842 * Core Creator field. In many countries, the Image Creator must be
843 * attributed in association with any use of the image. The Image Creator,
844 * Copyright Owner, Image Supplier and Licensor may be the same or different
845 * entities.
846 * <p>
847 * This is a PLUS version 1.2 property included in the IPTC Extension
848 * schema.
849 */
850 Property IMAGE_CREATOR_ID = Property.composite(
851 Property.internalTextBag(
852 PREFIX_PLUS + Metadata.NAMESPACE_PREFIX_DELIMITER + "ImageCreatorID"),
853 new Property[] { Property.internalTextBag(IPTC.IMAGE_CREATOR_ID_WRONG_CASE) });
854
855 /** @deprecated use {@link IPTC#IMAGE_CREATOR_ID} */
856 public static final String IMAGE_CREATOR_ID_WRONG_CASE =
857 PREFIX_PLUS + Metadata.NAMESPACE_PREFIX_DELIMITER + "ImageCreatorId";
858
859 /**
860 * The name of the creator or creators of the image.
861 * <p>
862 * The creator can be additionally expressed in free-text using the IPTC
863 * Core Creator field. In many countries, the Image Creator must be
864 * attributed in association with any use of the image. The Image Creator,
865 * Copyright Owner, Image Supplier and Licensor may be the same or different
866 * entities.
867 * <p>
868 * This is a PLUS version 1.2 property included in the IPTC Extension
869 * schema.
870 */
871 Property IMAGE_CREATOR_NAME = Property.internalTextBag(
872 PREFIX_PLUS + Metadata.NAMESPACE_PREFIX_DELIMITER + "ImageCreatorName");
873
874 /**
875 * A person or company that should be contacted to obtain a licence for
876 * using the item or who has licensed the item.
877 * <p>
878 * This is a PLUS version 1.2 property included in the IPTC Extension
879 * schema.
880 */
881 Property LICENSOR = Property.internalTextBag(
882 PREFIX_PLUS + Metadata.NAMESPACE_PREFIX_DELIMITER + "Licensor");
883
884 /**
885 * The ID of the person or company that should be contacted to obtain a licence for
886 * using the item or who has licensed the item.
887 * <p>
888 * This is a PLUS version 1.2 property included in the IPTC Extension
889 * schema.
890 */
891 Property LICENSOR_ID = Property.composite(
892 Property.internalTextBag(
893 PREFIX_PLUS + Metadata.NAMESPACE_PREFIX_DELIMITER + "LicensorID"),
894 new Property[] { Property.internalTextBag(IPTC.LICENSOR_ID_WRONG_CASE) });
895
896 /** @deprecated use {@link IPTC#LICENSOR_ID} */
897 public static final String LICENSOR_ID_WRONG_CASE =
898 PREFIX_PLUS + Metadata.NAMESPACE_PREFIX_DELIMITER + "LicensorId";
899
900 /**
901 * The name of the person or company that should be contacted to obtain a licence for
902 * using the item or who has licensed the item.
903 * <p>
904 * This is a PLUS version 1.2 property included in the IPTC Extension
905 * schema.
906 */
907 Property LICENSOR_NAME = Property.internalTextBag(
908 PREFIX_PLUS + Metadata.NAMESPACE_PREFIX_DELIMITER + "LicensorName");
909
910 /**
911 * The city of a person or company that should be contacted to obtain a licence for
912 * using the item or who has licensed the item.
913 * <p>
914 * This is a PLUS version 1.2 property included in the IPTC Extension
915 * schema.
916 */
917 Property LICENSOR_CITY = Property.internalTextBag(
918 PREFIX_PLUS + Metadata.NAMESPACE_PREFIX_DELIMITER + "LicensorCity");
919
920 /**
921 * The country of a person or company that should be contacted to obtain a licence for
922 * using the item or who has licensed the item.
923 * <p>
924 * This is a PLUS version 1.2 property included in the IPTC Extension
925 * schema.
926 */
927 Property LICENSOR_COUNTRY = Property.internalTextBag(
928 PREFIX_PLUS + Metadata.NAMESPACE_PREFIX_DELIMITER + "LicensorCountry");
929
930 /**
931 * The email of a person or company that should be contacted to obtain a licence for
932 * using the item or who has licensed the item.
933 * <p>
934 * This is a PLUS version 1.2 property included in the IPTC Extension
935 * schema.
936 */
937 Property LICENSOR_EMAIL = Property.internalTextBag(
938 PREFIX_PLUS + Metadata.NAMESPACE_PREFIX_DELIMITER + "LicensorEmail");
939
940 /**
941 * The extended address of a person or company that should be contacted to obtain a licence for
942 * using the item or who has licensed the item.
943 * <p>
944 * This is a PLUS version 1.2 property included in the IPTC Extension
945 * schema.
946 */
947 Property LICENSOR_EXTENDED_ADDRESS = Property.internalTextBag(
948 PREFIX_PLUS + Metadata.NAMESPACE_PREFIX_DELIMITER + "LicensorExtendedAddress");
949
950 /**
951 * The postal code of a person or company that should be contacted to obtain a licence for
952 * using the item or who has licensed the item.
953 * <p>
954 * This is a PLUS version 1.2 property included in the IPTC Extension
955 * schema.
956 */
957 Property LICENSOR_POSTAL_CODE = Property.internalTextBag(
958 PREFIX_PLUS + Metadata.NAMESPACE_PREFIX_DELIMITER + "LicensorPostalCode");
959
960 /**
961 * The region of a person or company that should be contacted to obtain a licence for
962 * using the item or who has licensed the item.
963 * <p>
964 * This is a PLUS version 1.2 property included in the IPTC Extension
965 * schema.
966 */
967 Property LICENSOR_REGION = Property.internalTextBag(
968 PREFIX_PLUS + Metadata.NAMESPACE_PREFIX_DELIMITER + "LicensorRegion");
969
970 /**
971 * The street address of a person or company that should be contacted to obtain a licence for
972 * using the item or who has licensed the item.
973 * <p>
974 * This is a PLUS version 1.2 property included in the IPTC Extension
975 * schema.
976 */
977 Property LICENSOR_STREET_ADDRESS = Property.internalTextBag(
978 PREFIX_PLUS + Metadata.NAMESPACE_PREFIX_DELIMITER + "LicensorStreetAddress");
979
980 /**
981 * The first phone number of a person or company that should be contacted to obtain a licence for
982 * using the item or who has licensed the item.
983 * <p>
984 * This is a PLUS version 1.2 property included in the IPTC Extension
985 * schema.
986 */
987 Property LICENSOR_TELEPHONE_1 = Property.internalTextBag(
988 PREFIX_PLUS + Metadata.NAMESPACE_PREFIX_DELIMITER + "LicensorTelephone1");
989
990 /**
991 * The second phone number of a person or company that should be contacted to obtain a licence for
992 * using the item or who has licensed the item.
993 * <p>
994 * This is a PLUS version 1.2 property included in the IPTC Extension
995 * schema.
996 */
997 Property LICENSOR_TELEPHONE_2 = Property.internalTextBag(
998 PREFIX_PLUS + Metadata.NAMESPACE_PREFIX_DELIMITER + "LicensorTelephone2");
999
1000 /**
1001 * The URL of a person or company that should be contacted to obtain a licence for
1002 * using the item or who has licensed the item.
1003 * <p>
1004 * This is a PLUS version 1.2 property included in the IPTC Extension
1005 * schema.
1006 */
1007 Property LICENSOR_URL = Property.internalTextBag(
1008 PREFIX_PLUS + Metadata.NAMESPACE_PREFIX_DELIMITER + "LicensorURL");
1009
1010 /**
1011 * Age of the youngest model pictured in the image, at the time that the
1012 * image was made.
1013 * <p>
1014 * This age should not be displayed to the public on open web portals and
1015 * the like. But it may be used by image repositories in a
1016 * B2B environment.
1017 * <p>
1018 * This is a PLUS version 1.2 property included in the IPTC Extension
1019 * schema.
1020 */
1021 Property MINOR_MODEL_AGE_DISCLOSURE = Property.internalText(
1022 PREFIX_PLUS + Metadata.NAMESPACE_PREFIX_DELIMITER + "MinorModelAgeDisclosure");
1023
1024 /**
1025 * Optional identifier associated with each Model Release.
1026 * <p>
1027 * This is a PLUS version 1.2 property included in the IPTC Extension
1028 * schema.
1029 */
1030 Property MODEL_RELEASE_ID = Property.internalTextBag(
1031 PREFIX_PLUS + Metadata.NAMESPACE_PREFIX_DELIMITER + "ModelReleaseID");
1032
1033 /**
1034 * Summarizes the availability and scope of model releases authorizing usage
1035 * of the likenesses of persons appearing in the photograph.
1036 * <p>
1037 * It is recommended to apply the PLUS controlled value Unlimited Model
1038 * Releases (MR-UMR) very carefully and to check the wording of the model
1039 * release thoroughly before applying it.
1040 * <p>
1041 * This is a PLUS version 1.2 property included in the IPTC Extension
1042 * schema.
1043 */
1044 Property MODEL_RELEASE_STATUS = Property.internalText(
1045 PREFIX_PLUS + Metadata.NAMESPACE_PREFIX_DELIMITER + "ModelReleaseStatus");
1046
1047 /**
1048 * Optional identifier associated with each Property Release.
1049 * <p>
1050 * This is a PLUS version 1.2 property included in the IPTC Extension
1051 * schema.
1052 */
1053 Property PROPERTY_RELEASE_ID = Property.internalTextBag(
1054 PREFIX_PLUS + Metadata.NAMESPACE_PREFIX_DELIMITER + "PropertyReleaseID");
1055
1056 /**
1057 * Summarises the availability and scope of property releases authorizing
1058 * usage of the properties appearing in the photograph.
1059 * <p>
1060 * It is recommended to apply the value PR-UPR very carefully and to check
1061 * the wording of the property release thoroughly before applying it.
1062 * <p>
1063 * This is a PLUS version 1.2 property included in the IPTC Extension
1064 * schema.
1065 */
1066 Property PROPERTY_RELEASE_STATUS = Property.internalText(
1067 PREFIX_PLUS + Metadata.NAMESPACE_PREFIX_DELIMITER + "PropertyReleaseStatus");
1068
1069 /**
1070 * Contains any necessary copyright notice for claiming the intellectual
1071 * property for artwork or an object in the image and should identify the
1072 * current owner of the copyright of this work with associated intellectual
1073 * property rights.
1074 */
1075 Property ARTWORK_OR_OBJECT_DETAIL_COPYRIGHT_NOTICE = Property.internalTextBag(
1076 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "AOCopyrightNotice");
1077
1078 /**
1079 * Contains the name of the artist who has created artwork or an object in the image.
1080 */
1081 Property ARTWORK_OR_OBJECT_DETAIL_CREATOR = Property.internalTextBag(
1082 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "AOCreator");
1083
1084 /**
1085 * Designates the date and optionally the time the artwork or object in the
1086 * image was created. This relates to artwork or objects with associated
1087 * intellectual property rights.
1088 */
1089 Property ARTWORK_OR_OBJECT_DETAIL_DATE_CREATED = Property.internalTextBag(
1090 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "AODateCreated");
1091
1092 /**
1093 * The organisation or body holding and registering the artwork or object in
1094 * the image for inventory purposes.
1095 */
1096 Property ARTWORK_OR_OBJECT_DETAIL_SOURCE = Property.internalTextBag(
1097 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "AOSource");
1098
1099 /**
1100 * The inventory number issued by the organisation or body holding and
1101 * registering the artwork or object in the image.
1102 */
1103 Property ARTWORK_OR_OBJECT_DETAIL_SOURCE_INVENTORY_NUMBER = Property.internalTextBag(
1104 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "AOSourceInvNo");
1105
1106 /**
1107 * A reference for the artwork or object in the image.
1108 */
1109 Property ARTWORK_OR_OBJECT_DETAIL_TITLE = Property.internalTextBag(
1110 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "AOTitle");
1111
1112 /**
1113 * Name of the city of a location. This element is at the fourth level of a
1114 * top-down geographical hierarchy.
1115 */
1116 Property LOCATION_SHOWN_CITY = Property.internalTextBag(
1117 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "LocationShownCity");
1118
1119 /**
1120 * The ISO code of a country of a location. This element is at the second
1121 * level of a top-down geographical hierarchy.
1122 * <p>
1123 * Note 1: an implementer would have to derive from the length of the value
1124 * string whether this is the country code from the two or three letter
1125 * scheme as no explicit indication can be provided.
1126 */
1127 Property LOCATION_SHOWN_COUNTRY_CODE = Property.internalTextBag(
1128 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "LocationShownCountryCode");
1129
1130 /**
1131 * The name of a country of a location. This element is at the second level
1132 * of a top-down geographical hierarchy.
1133 */
1134 Property LOCATION_SHOWN_COUNTRY_NAME = Property.internalTextBag(
1135 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "LocationShownCountryName");
1136
1137 /**
1138 * The name of a subregion of a country - a province or state - of a
1139 * location. This element is at the third level of a top-down geographical
1140 * hierarchy.
1141 */
1142 Property LOCATION_SHOWN_PROVINCE_OR_STATE = Property.internalTextBag(
1143 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "LocationShownProvinceState");
1144
1145 /**
1146 * Name of a sublocation. This sublocation name could either be the name of
1147 * a sublocation to a city or the name of a well known location or (natural)
1148 * monument outside a city. In the sense of a sublocation to a city this
1149 * element is at the fifth level of a top-down geographical hierarchy.
1150 */
1151 Property LOCATION_SHOWN_SUBLOCATION = Property.internalTextBag(
1152 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "LocationShownSublocation");
1153
1154 /**
1155 * The name of a world region of a location. This element is at the first
1156 * (top) level of a top-down geographical hierarchy.
1157 */
1158 Property LOCATION_SHOWN_WORLD_REGION = Property.internalTextBag(
1159 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "LocationShownWorldRegion");
1160
1161 /**
1162 * Name of the city of a location. This element is at the fourth level of a
1163 * top-down geographical hierarchy.
1164 */
1165 Property LOCATION_CREATED_CITY = Property.internalText(
1166 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "LocationCreatedCity");
1167
1168 /**
1169 * The ISO code of a country of a location. This element is at the second
1170 * level of a top-down geographical hierarchy.
1171 * <p>
1172 * Note 1: an implementer would have to derive from the length of the value
1173 * string whether this is the country code from the two or three letter
1174 * scheme as no explicit indication can be provided.
1175 */
1176 Property LOCATION_CREATED_COUNTRY_CODE = Property.internalText(
1177 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "LocationCreatedCountryCode");
1178
1179 /**
1180 * The name of a country of a location. This element is at the second level
1181 * of a top-down geographical hierarchy.
1182 */
1183 Property LOCATION_CREATED_COUNTRY_NAME = Property.internalText(
1184 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "LocationCreatedCountryName");
1185
1186 /**
1187 * The name of a subregion of a country - a province or state - of a
1188 * location. This element is at the third level of a top-down geographical
1189 * hierarchy.
1190 */
1191 Property LOCATION_CREATED_PROVINCE_OR_STATE = Property.internalText(
1192 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "LocationCreatedProvinceState");
1193
1194 /**
1195 * Name of a sublocation. This sublocation name could either be the name of
1196 * a sublocation to a city or the name of a well known location or (natural)
1197 * monument outside a city. In the sense of a sublocation to a city this
1198 * element is at the fifth level of a top-down geographical hierarchy.
1199 */
1200 Property LOCATION_CREATED_SUBLOCATION = Property.internalText(
1201 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "LocationCreatedSublocation");
1202
1203 /**
1204 * The name of a world region of a location. This element is at the first
1205 * (top) level of a top-down geographical hierarchy.
1206 */
1207 Property LOCATION_CREATED_WORLD_REGION = Property.internalText(
1208 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "LocationCreatedWorldRegion");
1209
1210 /**
1211 * A unique identifier created by a registry and applied by the creator of
1212 * the item. This value shall not be changed after being applied. This
1213 * identifier is linked to a corresponding Registry Organisation Identifier.
1214 */
1215 Property REGISTRY_ENTRY_CREATED_ITEM_ID = Property.internalTextBag(
1216 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "RegItemId");
1217
1218 /**
1219 * An identifier for the registry which issued the corresponding Registry Image Id.
1220 */
1221 Property REGISTRY_ENTRY_CREATED_ORGANISATION_ID = Property.internalTextBag(
1222 PREFIX_IPTC_EXT + Metadata.NAMESPACE_PREFIX_DELIMITER + "RegOrgId");
1223
1224
1225 Property[] PROPERTY_GROUP_IPTC_CORE = new Property[] {
1226 CITY,
1227 COUNTRY,
1228 COUNTRY_CODE,
1229 DESCRIPTION,
1230 HEADLINE,
1231 INTELLECTUAL_GENRE,
1232 KEYWORDS,
1233 PROVINCE_OR_STATE,
1234 SCENE_CODE,
1235 SUBJECT_CODE,
1236 SUBLOCATION,
1237 DATE_CREATED,
1238 DESCRIPTION_WRITER,
1239 INSTRUCTIONS,
1240 JOB_ID,
1241 TITLE,
1242 COPYRIGHT_NOTICE,
1243 CREATOR,
1244 CREATORS_JOB_TITLE,
1245 CREDIT_LINE,
1246 RIGHTS_USAGE_TERMS,
1247 SOURCE,
1248 CONTACT_INFO_ADDRESS,
1249 CONTACT_INFO_CITY,
1250 CONTACT_INFO_COUNTRY,
1251 CONTACT_INFO_EMAIL,
1252 CONTACT_INFO_PHONE,
1253 CONTACT_INFO_POSTAL_CODE,
1254 CONTACT_INFO_STATE_PROVINCE,
1255 CONTACT_INFO_WEB_URL
1256 };
1257
1258 Property[] PROPERTY_GROUP_IPTC_EXT = new Property[] {
1259 ADDITIONAL_MODEL_INFO,
1260 ORGANISATION_CODE,
1261 CONTROLLED_VOCABULARY_TERM,
1262 MODEL_AGE,
1263 ORGANISATION_NAME,
1264 PERSON,
1265 DIGITAL_IMAGE_GUID,
1266 DIGITAL_SOURCE_TYPE,
1267 EVENT,
1268 IMAGE_SUPPLIER_ID,
1269 IMAGE_SUPPLIER_NAME,
1270 IMAGE_SUPPLIER_IMAGE_ID,
1271 IPTC_LAST_EDITED,
1272 MAX_AVAIL_HEIGHT,
1273 MAX_AVAIL_WIDTH,
1274 PLUS_VERSION,
1275 COPYRIGHT_OWNER_ID,
1276 COPYRIGHT_OWNER_NAME,
1277 IMAGE_CREATOR_ID,
1278 IMAGE_CREATOR_NAME,
1279 LICENSOR_ID,
1280 LICENSOR_NAME,
1281 LICENSOR_CITY,
1282 LICENSOR_COUNTRY,
1283 LICENSOR_EMAIL,
1284 LICENSOR_EXTENDED_ADDRESS,
1285 LICENSOR_POSTAL_CODE,
1286 LICENSOR_REGION,
1287 LICENSOR_STREET_ADDRESS,
1288 LICENSOR_TELEPHONE_1,
1289 LICENSOR_TELEPHONE_2,
1290 LICENSOR_URL,
1291 MINOR_MODEL_AGE_DISCLOSURE,
1292 MODEL_RELEASE_ID,
1293 MODEL_RELEASE_STATUS,
1294 PROPERTY_RELEASE_ID,
1295 PROPERTY_RELEASE_STATUS,
1296 ARTWORK_OR_OBJECT_DETAIL_COPYRIGHT_NOTICE,
1297 ARTWORK_OR_OBJECT_DETAIL_CREATOR,
1298 ARTWORK_OR_OBJECT_DETAIL_DATE_CREATED,
1299 ARTWORK_OR_OBJECT_DETAIL_SOURCE,
1300 ARTWORK_OR_OBJECT_DETAIL_SOURCE_INVENTORY_NUMBER,
1301 ARTWORK_OR_OBJECT_DETAIL_TITLE,
1302 LOCATION_SHOWN_CITY,
1303 LOCATION_SHOWN_COUNTRY_CODE,
1304 LOCATION_SHOWN_COUNTRY_NAME,
1305 LOCATION_SHOWN_PROVINCE_OR_STATE,
1306 LOCATION_SHOWN_SUBLOCATION,
1307 LOCATION_SHOWN_WORLD_REGION,
1308 LOCATION_CREATED_CITY,
1309 LOCATION_CREATED_COUNTRY_CODE,
1310 LOCATION_CREATED_COUNTRY_NAME,
1311 LOCATION_CREATED_PROVINCE_OR_STATE,
1312 LOCATION_CREATED_SUBLOCATION,
1313 LOCATION_CREATED_WORLD_REGION,
1314 REGISTRY_ENTRY_CREATED_ITEM_ID,
1315 REGISTRY_ENTRY_CREATED_ORGANISATION_ID
1316 };
1317 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.metadata;
17
18 /**
19 * A collection of Microsoft Office and Open Document property names,
20 * declared as plain string keys and typed {@link Property} constants.
21 * <p>
22 * This is being replaced with cleaner, better defined properties in {@link Office}.
23 */
24 public interface MSOffice {
25 // Deprecated plain-string metadata keys.
26 @Deprecated String KEYWORDS = "Keywords";
27
28 @Deprecated String COMMENTS = "Comments";
29
30 @Deprecated String LAST_AUTHOR = "Last-Author";
31
32 @Deprecated String AUTHOR = "Author";
33
34 @Deprecated String APPLICATION_NAME = "Application-Name";
35
36 @Deprecated String REVISION_NUMBER = "Revision-Number";
37
38 @Deprecated String TEMPLATE = "Template";
39
40 @Deprecated String TOTAL_TIME = "Total-Time";
41
42 @Deprecated String PRESENTATION_FORMAT = "Presentation-Format";
43
44 @Deprecated String NOTES = "Notes";
45
46 @Deprecated String MANAGER = "Manager";
47
48 @Deprecated String APPLICATION_VERSION = "Application-Version";
49
50 @Deprecated String VERSION = "Version";
51
52 @Deprecated String CONTENT_STATUS = "Content-Status";
53
54 @Deprecated String CATEGORY = "Category";
55
56 @Deprecated String COMPANY = "Company";
57
58 @Deprecated String SECURITY = "Security";
59 // Deprecated integer-typed count properties.
60
61 /** The number of slides in the (presentation) document */
62 @Deprecated Property SLIDE_COUNT =
63 Property.internalInteger("Slide-Count");
64
65 /** The number of pages in the (paged) document */
66 @Deprecated Property PAGE_COUNT =
67 Property.internalInteger("Page-Count");
68
69 /** The number of individual paragraphs in the document */
70 @Deprecated Property PARAGRAPH_COUNT =
71 Property.internalInteger("Paragraph-Count");
72
73 /** The number of lines in the document */
74 @Deprecated Property LINE_COUNT =
75 Property.internalInteger("Line-Count");
76
77 /** The number of words in the document */
78 @Deprecated Property WORD_COUNT =
79 Property.internalInteger("Word-Count");
80
81 /** The number of characters in the document. NOTE: the key contains a space, unlike its hyphenated siblings. */
82 @Deprecated Property CHARACTER_COUNT =
83 Property.internalInteger("Character Count");
84
85 /** The number of characters in the document, including spaces */
86 @Deprecated Property CHARACTER_COUNT_WITH_SPACES =
87 Property.internalInteger("Character-Count-With-Spaces");
88
89 /** The number of tables in the document */
90 @Deprecated Property TABLE_COUNT =
91 Property.internalInteger("Table-Count");
92
93 /** The number of images in the document */
94 @Deprecated Property IMAGE_COUNT =
95 Property.internalInteger("Image-Count");
96
97 /**
98 * The number of objects in the document.
99 * These are typically non-image resources embedded in the
100 * document, such as other documents or non-image media.
101 */
102 @Deprecated Property OBJECT_COUNT =
103 Property.internalInteger("Object-Count");
104
105 // Editing time (plain string key) and date-typed properties.
106 /** How long has been spent editing the document? Not annotated as deprecated, unlike most other members. */
107 String EDIT_TIME = "Edit-Time";
108
109 /** When was the document created? */
110 @Deprecated Property CREATION_DATE =
111 Property.internalDate("Creation-Date");
112
113 /** When was the document last saved? */
114 @Deprecated Property LAST_SAVED =
115 Property.internalDate("Last-Save-Date");
116
117 /** When was the document last printed? */
118 @Deprecated Property LAST_PRINTED =
119 Property.internalDate("Last-Printed");
120
121 /**
122 * Prefix attached to the key names of user defined metadata entries
123 * found in the document.
124 * e.g. {@code <meta:user-defined meta:name="Info1">Text1</meta:user-defined>} becomes {@code custom:Info1=Text1}
125 */
126 String USER_DEFINED_METADATA_NAME_PREFIX = "custom:";
127 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.metadata;
17
18 /**
19 * A collection of Message related property names, declared as plain string metadata keys.
20 */
21 public interface Message {
22 String MESSAGE_RECIPIENT_ADDRESS = "Message-Recipient-Address"; // presumably a recipient address; confirm against the code that sets it
23
24 String MESSAGE_FROM = "Message-From"; // presumably the sender; confirm against the code that sets it
25
26 String MESSAGE_TO = "Message-To"; // presumably the primary recipients; confirm against the code that sets it
27
28 String MESSAGE_CC = "Message-Cc"; // presumably the carbon-copy recipients; confirm against the code that sets it
29
30 String MESSAGE_BCC = "Message-Bcc"; // presumably the blind-carbon-copy recipients; confirm against the code that sets it
31 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.metadata;
17
18 import static org.apache.tika.utils.DateUtils.MIDDAY;
19 import static org.apache.tika.utils.DateUtils.UTC;
20 import static org.apache.tika.utils.DateUtils.formatDate;
21
22 import java.io.Serializable;
23 import java.text.DateFormat;
24 import java.text.DateFormatSymbols;
25 import java.text.ParseException;
26 import java.text.SimpleDateFormat;
27 import java.util.Date;
28 import java.util.Enumeration;
29 import java.util.HashMap;
30 import java.util.Locale;
31 import java.util.Map;
32 import java.util.Properties;
33 import java.util.TimeZone;
34
35 import org.apache.tika.metadata.Property.PropertyType;
36
37 /**
38 * A multi-valued metadata container.
39 */
40 public class Metadata implements CreativeCommons, Geographic, HttpHeaders,
41 Message, MSOffice, ClimateForcast, TIFF, TikaMetadataKeys, TikaMimeKeys,
42 Serializable {
43
44 /** Serial version UID */
45 private static final long serialVersionUID = 5623926545693153182L;
46
47 /**
48 * A map of all metadata attributes.
49 */
50 private Map<String, String[]> metadata = null;
51
52 /**
53 * The common delimiter used between the namespace abbreviation and the property name
54 */
55 public static final String NAMESPACE_PREFIX_DELIMITER = ":";
56
57 /** @deprecated use TikaCoreProperties#FORMAT */
58 public static final String FORMAT = "format";
59 /** @deprecated use TikaCoreProperties#IDENTIFIER */
60 public static final String IDENTIFIER = "identifier";
61 /** @deprecated use TikaCoreProperties#MODIFIED */
62 public static final String MODIFIED = "modified";
63 /** @deprecated use TikaCoreProperties#CONTRIBUTOR */
64 public static final String CONTRIBUTOR = "contributor";
65 /** @deprecated use TikaCoreProperties#COVERAGE */
66 public static final String COVERAGE = "coverage";
67 /** @deprecated use TikaCoreProperties#CREATOR */
68 public static final String CREATOR = "creator";
69 /** @deprecated use TikaCoreProperties#CREATED */
70 public static final Property DATE = Property.internalDate("date");
71 /** @deprecated use TikaCoreProperties#DESCRIPTION */
72 public static final String DESCRIPTION = "description";
73 /** @deprecated use TikaCoreProperties#LANGUAGE */
74 public static final String LANGUAGE = "language";
75 /** @deprecated use TikaCoreProperties#PUBLISHER */
76 public static final String PUBLISHER = "publisher";
77 /** @deprecated use TikaCoreProperties#RELATION */
78 public static final String RELATION = "relation";
79 /** @deprecated use TikaCoreProperties#RIGHTS */
80 public static final String RIGHTS = "rights";
81 /** @deprecated use TikaCoreProperties#SOURCE */
82 public static final String SOURCE = "source";
83 /** @deprecated use TikaCoreProperties#KEYWORDS */
84 public static final String SUBJECT = "subject";
85 /** @deprecated use TikaCoreProperties#TITLE */
86 public static final String TITLE = "title";
87 /** @deprecated use TikaCoreProperties#TYPE */
88 public static final String TYPE = "type";
89
90 /**
91 * Some parsers will have the date as a ISO-8601 string
92 * already, and will set that into the Metadata object.
93 * So we can return Date objects for these, this is the
94 * list (in preference order) of the various ISO-8601
95 * variants that we try when processing a date based
96 * property.
97 */
98 private static final DateFormat[] iso8601InputFormats = new DateFormat[] {
99 // yyyy-mm-ddThh...
100 createDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", UTC), // UTC/Zulu
101 createDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", null), // With timezone
102 createDateFormat("yyyy-MM-dd'T'HH:mm:ss", null), // Without timezone
103 // yyyy-mm-dd hh...
104 createDateFormat("yyyy-MM-dd' 'HH:mm:ss'Z'", UTC), // UTC/Zulu
105 createDateFormat("yyyy-MM-dd' 'HH:mm:ssZ", null), // With timezone
106 createDateFormat("yyyy-MM-dd' 'HH:mm:ss", null), // Without timezone
107 // Date without time, set to Midday UTC
108 createDateFormat("yyyy-MM-dd", MIDDAY), // Normal date format
109 createDateFormat("yyyy:MM:dd", MIDDAY), // Image (IPTC/EXIF) format
110 };
111
112 private static DateFormat createDateFormat(String format, TimeZone timezone) {
113 SimpleDateFormat sdf =
114 new SimpleDateFormat(format, new DateFormatSymbols(Locale.US));
115 if (timezone != null) {
116 sdf.setTimeZone(timezone);
117 }
118 return sdf;
119 }
120
121 /**
122 * Parses the given date string. This method is synchronized to prevent
123 * concurrent access to the thread-unsafe date formats.
124 *
125 * @see <a href="https://issues.apache.org/jira/browse/TIKA-495">TIKA-495</a>
126 * @param date date string
127 * @return parsed date, or <code>null</code> if the date can't be parsed
128 */
129 private static synchronized Date parseDate(String date) {
130 // Java doesn't like timezones in the form ss+hh:mm
131 // It only likes the hhmm form, without the colon
132 int n = date.length();
133 if (date.charAt(n - 3) == ':'
134 && (date.charAt(n - 6) == '+' || date.charAt(n - 6) == '-')) {
135 date = date.substring(0, n - 3) + date.substring(n - 2);
136 }
137
138 // Try several different ISO-8601 variants
139 for (DateFormat format : iso8601InputFormats) {
140 try {
141 return format.parse(date);
142 } catch (ParseException ignore) {
143 }
144 }
145 return null;
146 }
147
/**
 * Creates a metadata container that holds no entries.
 */
public Metadata() {
    this.metadata = new HashMap<String, String[]>();
}
154
155 /**
156 * Returns true if named value is multivalued.
157 *
158 * @param property
159 * metadata property
160 * @return true is named value is multivalued, false if single value or null
161 */
162 public boolean isMultiValued(final Property property) {
163 return metadata.get(property.getName()) != null && metadata.get(property.getName()).length > 1;
164 }
165
166 /**
167 * Returns true if named value is multivalued.
168 *
169 * @param name
170 * name of metadata
171 * @return true is named value is multivalued, false if single value or null
172 */
173 public boolean isMultiValued(final String name) {
174 return metadata.get(name) != null && metadata.get(name).length > 1;
175 }
176
177 /**
178 * Returns an array of the names contained in the metadata.
179 *
180 * @return Metadata names
181 */
182 public String[] names() {
183 return metadata.keySet().toArray(new String[metadata.keySet().size()]);
184 }
185
186 /**
187 * Get the value associated to a metadata name. If many values are assiociated
188 * to the specified name, then the first one is returned.
189 *
190 * @param name
191 * of the metadata.
192 * @return the value associated to the specified metadata name.
193 */
194 public String get(final String name) {
195 String[] values = metadata.get(name);
196 if (values == null) {
197 return null;
198 } else {
199 return values[0];
200 }
201 }
202
203 /**
204 * Returns the value (if any) of the identified metadata property.
205 *
206 * @since Apache Tika 0.7
207 * @param property property definition
208 * @return property value, or <code>null</code> if the property is not set
209 */
210 public String get(Property property) {
211 return get(property.getName());
212 }
213
214 /**
215 * Returns the value of the identified Integer based metadata property.
216 *
217 * @since Apache Tika 0.8
218 * @param property simple integer property definition
219 * @return property value as a Integer, or <code>null</code> if the property is not set, or not a valid Integer
220 */
221 public Integer getInt(Property property) {
222 if(property.getPrimaryProperty().getPropertyType() != Property.PropertyType.SIMPLE) {
223 return null;
224 }
225 if(property.getPrimaryProperty().getValueType() != Property.ValueType.INTEGER) {
226 return null;
227 }
228
229 String v = get(property);
230 if(v == null) {
231 return null;
232 }
233 try {
234 return Integer.valueOf(v);
235 } catch(NumberFormatException e) {
236 return null;
237 }
238 }
239
240 /**
241 * Returns the value of the identified Date based metadata property.
242 *
243 * @since Apache Tika 0.8
244 * @param property simple date property definition
245 * @return property value as a Date, or <code>null</code> if the property is not set, or not a valid Date
246 */
247 public Date getDate(Property property) {
248 if(property.getPrimaryProperty().getPropertyType() != Property.PropertyType.SIMPLE) {
249 return null;
250 }
251 if(property.getPrimaryProperty().getValueType() != Property.ValueType.DATE) {
252 return null;
253 }
254
255 String v = get(property);
256 if (v != null) {
257 return parseDate(v);
258 } else {
259 return null;
260 }
261 }
262
263 /**
264 * Get the values associated to a metadata name.
265 *
266 * @param property
267 * of the metadata.
268 * @return the values associated to a metadata name.
269 */
270 public String[] getValues(final Property property) {
271 return _getValues(property.getName());
272 }
273
274 /**
275 * Get the values associated to a metadata name.
276 *
277 * @param name
278 * of the metadata.
279 * @return the values associated to a metadata name.
280 */
281 public String[] getValues(final String name) {
282 return _getValues(name);
283 }
284
285 private String[] _getValues(final String name) {
286 String[] values = metadata.get(name);
287 if (values == null) {
288 values = new String[0];
289 }
290 return values;
291 }
292
293 private String[] appendedValues(String[] values, final String value) {
294 String[] newValues = new String[values.length + 1];
295 System.arraycopy(values, 0, newValues, 0, values.length);
296 newValues[newValues.length - 1] = value;
297 return newValues;
298 }
299
300 /**
301 * Add a metadata name/value mapping. Add the specified value to the list of
302 * values associated to the specified metadata name.
303 *
304 * @param name
305 * the metadata name.
306 * @param value
307 * the metadata value.
308 */
309 public void add(final String name, final String value) {
310 String[] values = metadata.get(name);
311 if (values == null) {
312 set(name, value);
313 } else {
314 metadata.put(name, appendedValues(values, value));
315 }
316 }
317
318 /**
319 * Add a metadata property/value mapping. Add the specified value to the list of
320 * values associated to the specified metadata property.
321 *
322 * @param property
323 * the metadata property.
324 * @param value
325 * the metadata value.
326 */
327 public void add(final Property property, final String value) {
328 String[] values = metadata.get(property.getName());
329 if (values == null) {
330 set(property, value);
331 } else {
332 if (property.isMultiValuePermitted()) {
333 set(property, appendedValues(values, value));
334 } else {
335 throw new PropertyTypeException(property.getPropertyType());
336 }
337 }
338 }
339
340 /**
341 * Copy All key-value pairs from properties.
342 *
343 * @param properties
344 * properties to copy from
345 */
346 @SuppressWarnings("unchecked")
347 public void setAll(Properties properties) {
348 Enumeration<String> names =
349 (Enumeration<String>) properties.propertyNames();
350 while (names.hasMoreElements()) {
351 String name = names.nextElement();
352 metadata.put(name, new String[] { properties.getProperty(name) });
353 }
354 }
355
356 /**
357 * Set metadata name/value. Associate the specified value to the specified
358 * metadata name. If some previous values were associated to this name,
359 * they are removed. If the given value is <code>null</code>, then the
360 * metadata entry is removed.
361 *
362 * @param name the metadata name.
363 * @param value the metadata value, or <code>null</code>
364 */
365 public void set(String name, String value) {
366 if (value != null) {
367 metadata.put(name, new String[] { value });
368 } else {
369 metadata.remove(name);
370 }
371 }
372
373 /**
374 * Sets the value of the identified metadata property.
375 *
376 * @since Apache Tika 0.7
377 * @param property property definition
378 * @param value property value
379 */
380 public void set(Property property, String value) {
381 if (property == null) {
382 throw new NullPointerException("property must not be null");
383 }
384 if (property.getPropertyType() == PropertyType.COMPOSITE) {
385 set(property.getPrimaryProperty(), value);
386 if (property.getSecondaryExtractProperties() != null) {
387 for (Property secondaryExtractProperty : property.getSecondaryExtractProperties()) {
388 set(secondaryExtractProperty, value);
389 }
390 }
391 } else {
392 set(property.getName(), value);
393 }
394 }
395
396 /**
397 * Sets the values of the identified metadata property.
398 *
399 * @since Apache Tika 1.2
400 * @param property property definition
401 * @param values property values
402 */
403 public void set(Property property, String[] values) {
404 if (property == null) {
405 throw new NullPointerException("property must not be null");
406 }
407 if (property.getPropertyType() == PropertyType.COMPOSITE) {
408 set(property.getPrimaryProperty(), values);
409 if (property.getSecondaryExtractProperties() != null) {
410 for (Property secondaryExtractProperty : property.getSecondaryExtractProperties()) {
411 set(secondaryExtractProperty, values);
412 }
413 }
414 } else {
415 metadata.put(property.getName(), values);
416 }
417 }
418
419 /**
420 * Sets the integer value of the identified metadata property.
421 *
422 * @since Apache Tika 0.8
423 * @param property simple integer property definition
424 * @param value property value
425 */
426 public void set(Property property, int value) {
427 if(property.getPrimaryProperty().getPropertyType() != Property.PropertyType.SIMPLE) {
428 throw new PropertyTypeException(Property.PropertyType.SIMPLE, property.getPrimaryProperty().getPropertyType());
429 }
430 if(property.getPrimaryProperty().getValueType() != Property.ValueType.INTEGER) {
431 throw new PropertyTypeException(Property.ValueType.INTEGER, property.getPrimaryProperty().getValueType());
432 }
433 set(property, Integer.toString(value));
434 }
435
436 /**
437 * Sets the real or rational value of the identified metadata property.
438 *
439 * @since Apache Tika 0.8
440 * @param property simple real or simple rational property definition
441 * @param value property value
442 */
443 public void set(Property property, double value) {
444 if(property.getPrimaryProperty().getPropertyType() != Property.PropertyType.SIMPLE) {
445 throw new PropertyTypeException(Property.PropertyType.SIMPLE, property.getPrimaryProperty().getPropertyType());
446 }
447 if(property.getPrimaryProperty().getValueType() != Property.ValueType.REAL &&
448 property.getPrimaryProperty().getValueType() != Property.ValueType.RATIONAL) {
449 throw new PropertyTypeException(Property.ValueType.REAL, property.getPrimaryProperty().getValueType());
450 }
451 set(property, Double.toString(value));
452 }
453
454 /**
455 * Sets the date value of the identified metadata property.
456 *
457 * @since Apache Tika 0.8
458 * @param property simple integer property definition
459 * @param date property value
460 */
461 public void set(Property property, Date date) {
462 if(property.getPrimaryProperty().getPropertyType() != Property.PropertyType.SIMPLE) {
463 throw new PropertyTypeException(Property.PropertyType.SIMPLE, property.getPrimaryProperty().getPropertyType());
464 }
465 if(property.getPrimaryProperty().getValueType() != Property.ValueType.DATE) {
466 throw new PropertyTypeException(Property.ValueType.DATE, property.getPrimaryProperty().getValueType());
467 }
468 String dateString = null;
469 if (date != null) {
470 dateString = formatDate(date);
471 }
472 set(property, dateString);
473 }
474
475 /**
476 * Remove a metadata and all its associated values.
477 *
478 * @param name
479 * metadata name to remove
480 */
481 public void remove(String name) {
482 metadata.remove(name);
483 }
484
485 /**
486 * Returns the number of metadata names in this metadata.
487 *
488 * @return number of metadata names
489 */
490 public int size() {
491 return metadata.size();
492 }
493
494 public boolean equals(Object o) {
495
496 if (o == null) {
497 return false;
498 }
499
500 Metadata other = null;
501 try {
502 other = (Metadata) o;
503 } catch (ClassCastException cce) {
504 return false;
505 }
506
507 if (other.size() != size()) {
508 return false;
509 }
510
511 String[] names = names();
512 for (int i = 0; i < names.length; i++) {
513 String[] otherValues = other._getValues(names[i]);
514 String[] thisValues = _getValues(names[i]);
515 if (otherValues.length != thisValues.length) {
516 return false;
517 }
518 for (int j = 0; j < otherValues.length; j++) {
519 if (!otherValues[j].equals(thisValues[j])) {
520 return false;
521 }
522 }
523 }
524 return true;
525 }
526
527 public String toString() {
528 StringBuffer buf = new StringBuffer();
529 String[] names = names();
530 for (int i = 0; i < names.length; i++) {
531 String[] values = _getValues(names[i]);
532 for (int j = 0; j < values.length; j++) {
533 buf.append(names[i]).append("=").append(values[j]).append(" ");
534 }
535 }
536 return buf.toString();
537 }
538
539 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.metadata;
17
18 /**
19 * Office Document properties collection. These properties apply to
20 * Office / Productivity Documents of all forms, including (but not limited
21 * to) MS Office and OpenDocument formats.
22 * This is a logical collection of properties, which may be drawn from a
23 * few different external definitions.
24 *
25 * Note that some of the legacy properties from the {@link MSOffice}
26 * collection still need to be migrated over
27 *
28 * @since Apache Tika 1.2
29 */
30 public interface Office {
31 // These are taken from the OpenDocumentFormat specification
32 public static final String NAMESPACE_URI_DOC_META = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0";
33 public static final String PREFIX_DOC_META = "meta";
34
35 /**
36 * For user defined metadata entries in the document,
37 * what prefix should be attached to the key names.
38 * eg <meta:user-defined meta:name="Info1">Text1</meta:user-defined> becomes custom:Info1=Text1
39 */
40 public static final String USER_DEFINED_METADATA_NAME_PREFIX = "custom:";
41
42
43 /**
44 * Keywords pertaining to a document.
45 */
46 Property KEYWORDS = Property.internalTextBag(
47 PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER + "keyword");
48
49 /**
50 * Name of the initial creator/author of a document
51 */
52 Property INITIAL_AUTHOR = Property.internalText(
53 PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER + "initial-author");
54
55 /**
56 * Name of the last (most recent) author of a document
57 */
58 Property LAST_AUTHOR = Property.internalText(
59 PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER + "last-author");
60
61 /**
62 * Name of the principal author(s) of a document
63 */
64 Property AUTHOR = Property.internalTextBag(
65 PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER + "author");
66
67
68 /** When was the document created? */
69 Property CREATION_DATE = Property.internalDate(
70 PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER + "creation-date");
71
72 /** When was the document last saved? */
73 Property SAVE_DATE = Property.internalDate(
74 PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER + "save-date");
75
76 /** When was the document last printed? */
77 Property PRINT_DATE = Property.internalDate(
78 PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER + "print-date");
79
80
81
82 /** The number of Slides are there in the (presentation) document */
83 Property SLIDE_COUNT = Property.internalInteger(
84 PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER + "slide-count");
85
86 /** The number of Pages are there in the (paged) document */
87 Property PAGE_COUNT = Property.internalInteger(
88 PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER + "page-count");
89
90 /** The number of individual Paragraphs in the document */
91 Property PARAGRAPH_COUNT = Property.internalInteger(
92 PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER + "paragraph-count");
93
94 /** The number of lines in the document */
95 Property LINE_COUNT = Property.internalInteger(
96 PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER + "line-count");
97
98 /** The number of Words in the document */
99 Property WORD_COUNT = Property.internalInteger(
100 PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER + "word-count");
101
102 /** The number of Characters in the document */
103 Property CHARACTER_COUNT = Property.internalInteger(
104 PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER + "character-count");
105
106 /** The number of Characters in the document, including spaces */
107 Property CHARACTER_COUNT_WITH_SPACES = Property.internalInteger(
108 PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER + "character-count-with-spaces");
109
110 /** The number of Tables in the document */
111 Property TABLE_COUNT = Property.internalInteger(
112 PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER + "table-count");
113
114 /** The number of Images in the document */
115 Property IMAGE_COUNT = Property.internalInteger(
116 PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER + "image-count");
117
118 /**
119 * The number of Objects in the document. These are typically non-Image resources
120 * embedded in the document, such as other documents or non-Image media.
121 */
122 Property OBJECT_COUNT = Property.internalInteger(
123 PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER + "object-count");
124 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.metadata;
17
18 /**
19 * Core properties as defined in the Office Open XML specification part Two that are not
20 * in the DublinCore namespace.
21 * There is also a keyword property definition in the specification which is omitted here,
22 * because Tika should stick to the DublinCore/IPTC definition.
23 *
24 * @see <a href="http://www.iso.org/iso/iso_catalogue/catalogue_tc/catalogue_detail.htm?csnumber=59575"
25 * >ISO document of Office Open XML specification</a>
26 * @see <a href="http://www.ecma-international.org/publications/standards/Ecma-376.htm
27 * >ECMA document of Office Open XML specification</a>
28 */
29 public interface OfficeOpenXMLCore
30 {
31 String NAMESPACE_URI = "http://schemas.openxmlformats.org/package/2006/metadata/core-properties/";
32 String PREFIX = "cp";
33
34 /**
35 * A categorization of the content of this package.
36 */
37 Property CATEGORY = Property.externalText(
38 PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "category");
39
40 /**
41 * The status of the content.
42 */
43 Property CONTENT_STATUS = Property.externalText(
44 PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "contentStatus");
45
46 /**
47 * The user who performed the last modification. The identification is environment-specific.
48 */
49 Property LAST_MODIFIED_BY = Property.externalText(
50 PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "lastModifiedBy");
51
52 /**
53 * The date and time of the last printing.
54 */
55 Property LAST_PRINTED = Property.externalDate(
56 PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "lastPrinted");
57
58 /**
59 * The revision number.
60 */
61 Property REVISION = Property.externalText(
62 PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "revision");
63
64 /**
65 * The version number. This value is set by the user or by the application.
66 */
67 Property VERSION = Property.externalText(
68 PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "version");
69
70 /**
71 * The document's subject.
72 */
73 Property SUBJECT = Property.externalText(
74 PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "subject");
75 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.metadata;
17
18 /**
19 * Extended properties as defined in the Office Open XML specification part Four.
20 * Those properties are omitted which have equivalent properties defined in the ODF
21 * namespace like "word count".
22 * Also not all properties from the specification are defined here, yet. Only those which have been in
23 * use by the parsers so far.
24 *
25 * @see <a href="http://www.iso.org/iso/iso_catalogue/catalogue_tc/catalogue_detail.htm?csnumber=59575"
26 * >ISO document of Office Open XML specification</a>
27 * @see <a href="http://www.ecma-international.org/publications/standards/Ecma-376.htm
28 * >ECMA document of Office Open XML specification</a>
29 */
30 public interface OfficeOpenXMLExtended
31 {
32 String NAMESPACE_URI = "http://schemas.openxmlformats.org/officeDocument/2006/extended-properties/";
33 String WORD_PROCESSING_NAMESPACE_URI = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
34 String PREFIX = "extended-properties";
35 String WORD_PROCESSING_PREFIX = "w";
36
37 Property TEMPLATE = Property.externalText(
38 PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "Template");
39
40 Property MANAGER = Property.externalText(
41 PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "Manager");
42
43 Property COMPANY = Property.externalText(
44 PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "Company");
45
46 Property PRESENTATION_FORMAT = Property.externalText(
47 PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "PresentationFormat");
48
49 Property NOTES = Property.externalInteger(
50 PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "Notes");
51
52 Property TOTAL_TIME = Property.externalInteger(
53 PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "TotalTime");
54
55 Property HIDDEN_SLIDES = Property.externalInteger(
56 PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "HiddedSlides");
57
58 Property APPLICATION = Property.externalText(
59 PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "Application");
60
61 Property APP_VERSION = Property.externalText(
62 PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "AppVersion");
63
64 Property DOC_SECURITY = Property.externalInteger(
65 PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "DocSecurity");
66
67 Property COMMENTS = Property.externalTextBag(
68 WORD_PROCESSING_PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "comments");
69 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.metadata;
17
18 /**
19 * XMP Paged-text schema. This is a collection of
20 * {@link Property property definition} constants for the paged text
21 * properties defined in the XMP standard.
22 *
23 * @since Apache Tika 0.8
24 * @see <a href="http://www.adobe.com/devnet/xmp/pdfs/XMPSpecificationPart2.pdf"
25 * >XMP Specification, Part 2: Standard Schemas</a>
26 */
27 public interface PagedText {
28
29 /**
30 * "The number of pages in the document (including any in contained
31 * documents)."
32 */
33 Property N_PAGES = Property.internalInteger("xmpTPg:NPages");
34
35 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *
16 * IPTC Metadata Descriptions taken from the IPTC Photo Metadata (July 2010)
17 * standard. These parts Copyright 2010 International Press Telecommunications
18 * Council.
19 */
20 package org.apache.tika.metadata;
21
22 /**
23 * XMP Photoshop metadata schema.
24 *
25 * A collection of property constants for the
26 * Photo Metadata properties defined in the XMP Photoshop
27 * standard.
28 *
29 * @since Apache Tika 1.2
30 * @see <a href="http://partners.adobe.com/public/developer/en/xmp/sdk/XMPspecification.pdf">XMP Photoshop</a>
31 */
32 public interface Photoshop {
33
34 String NAMESPACE_URI_PHOTOSHOP = "http://ns.adobe.com/photoshop/1.0/";
35 String PREFIX_PHOTOSHOP = "photoshop";
36
37 Property AUTHORS_POSITION = Property.internalText(
38 PREFIX_PHOTOSHOP + Metadata.NAMESPACE_PREFIX_DELIMITER + "AuthorsPosition");
39
40 Property CAPTION_WRITER = Property.internalText(
41 PREFIX_PHOTOSHOP + Metadata.NAMESPACE_PREFIX_DELIMITER + "CaptionWriter");
42
43 Property CATEGORY = Property.internalText(
44 PREFIX_PHOTOSHOP + Metadata.NAMESPACE_PREFIX_DELIMITER + "Category");
45
46 Property CITY = Property.internalText(
47 PREFIX_PHOTOSHOP + Metadata.NAMESPACE_PREFIX_DELIMITER + "City");
48
49 Property COUNTRY = Property.internalText(
50 PREFIX_PHOTOSHOP + Metadata.NAMESPACE_PREFIX_DELIMITER + "Country");
51
52 Property CREDIT = Property.internalText(
53 PREFIX_PHOTOSHOP + Metadata.NAMESPACE_PREFIX_DELIMITER + "Credit");
54
55 Property DATE_CREATED = Property.internalDate(
56 PREFIX_PHOTOSHOP + Metadata.NAMESPACE_PREFIX_DELIMITER + "DateCreated");
57
58 Property HEADLINE = Property.internalText(
59 PREFIX_PHOTOSHOP + Metadata.NAMESPACE_PREFIX_DELIMITER + "Headline");
60
61 Property INSTRUCTIONS = Property.internalText(
62 PREFIX_PHOTOSHOP + Metadata.NAMESPACE_PREFIX_DELIMITER + "Instructions");
63
64 Property SOURCE = Property.internalText(
65 PREFIX_PHOTOSHOP + Metadata.NAMESPACE_PREFIX_DELIMITER + "Source");
66
67 Property STATE = Property.internalText(
68 PREFIX_PHOTOSHOP + Metadata.NAMESPACE_PREFIX_DELIMITER + "State");
69
70 Property SUPPLEMENTAL_CATEGORIES = Property.internalTextBag(
71 PREFIX_PHOTOSHOP + Metadata.NAMESPACE_PREFIX_DELIMITER + "SupplementalCategories");
72
73 Property TRANSMISSION_REFERENCE = Property.internalText(
74 PREFIX_PHOTOSHOP + Metadata.NAMESPACE_PREFIX_DELIMITER + "TransmissionReference");
75
76 Property URGENCY = Property.internalText(
77 PREFIX_PHOTOSHOP + Metadata.NAMESPACE_PREFIX_DELIMITER + "Urgency");
78
79 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.metadata;
17
18 import java.util.Arrays;
19 import java.util.Collections;
20 import java.util.HashMap;
21 import java.util.HashSet;
22 import java.util.Map;
23 import java.util.Set;
24 import java.util.SortedSet;
25 import java.util.TreeSet;
26
27 /**
28 * XMP property definition. Each instance of this class defines a single
29 * metadata property like "dc:format". In addition to the property name,
30 * the {@link ValueType value type} and category (internal or external)
31 * of the property are included in the property definition. The available
32 * choice values are also stored for open and closed choice value types.
33 *
34 * @since Apache Tika 0.7
35 */
36 public final class Property implements Comparable<Property> {
37
38 public static enum PropertyType {
39 /** A single value */
40 SIMPLE,
41 STRUCTURE,
42 /** An un-ordered array */
43 BAG,
44 /** An ordered array */
45 SEQ,
46 /** An ordered array with some sort of criteria */
47 ALT,
48 /** Multiple child properties */
49 COMPOSITE
50 }
51
52 public static enum ValueType {
53 BOOLEAN, OPEN_CHOICE, CLOSED_CHOICE, DATE, INTEGER, LOCALE,
54 MIME_TYPE, PROPER_NAME, RATIONAL, REAL, TEXT, URI, URL, XPATH, PROPERTY
55 }
56
57 private static final Map<String, Property> properties =
58 new HashMap<String, Property>();
59
60 private final String name;
61
62 private final boolean internal;
63
64 private final PropertyType propertyType;
65
66 private final ValueType valueType;
67
68 private final Property primaryProperty;
69
70 private final Property[] secondaryExtractProperties;
71
72 /**
73 * The available choices for the open and closed choice value types.
74 */
75 private final Set<String> choices;
76
77 private Property(
78 String name, boolean internal, PropertyType propertyType,
79 ValueType valueType, String[] choices, Property primaryProperty, Property[] secondaryExtractProperties) {
80 this.name = name;
81 this.internal = internal;
82 this.propertyType = propertyType;
83 this.valueType = valueType;
84 if (choices != null) {
85 this.choices = Collections.unmodifiableSet(
86 new HashSet<String>(Arrays.asList(choices.clone())));
87 } else {
88 this.choices = null;
89 }
90
91 if (primaryProperty != null) {
92 this.primaryProperty = primaryProperty;
93 this.secondaryExtractProperties = secondaryExtractProperties;
94 } else {
95 this.primaryProperty = this;
96 this.secondaryExtractProperties = null;
97
98 // Only store primary properties for lookup, not composites
99 synchronized (properties) {
100 properties.put(name, this);
101 }
102 }
103 }
104
105 private Property(
106 String name, boolean internal, PropertyType propertyType,
107 ValueType valueType, String[] choices) {
108 this(name, internal, propertyType, valueType, choices, null, null);
109 }
110
111 private Property(
112 String name, boolean internal,
113 ValueType valueType, String[] choices) {
114 this(name, internal, PropertyType.SIMPLE, valueType, choices);
115 }
116
117 private Property(String name, boolean internal, ValueType valueType) {
118 this(name, internal, PropertyType.SIMPLE, valueType, null);
119 }
120
121 private Property(
122 String name, boolean internal,
123 PropertyType propertyType, ValueType valueType) {
124 this(name, internal, propertyType, valueType, null);
125 }
126
127 public String getName() {
128 return name;
129 }
130
131 public boolean isInternal() {
132 return internal;
133 }
134
135 public boolean isExternal() {
136 return !internal;
137 }
138
139 /**
140 * Is the PropertyType one which accepts multiple values?
141 */
142 public boolean isMultiValuePermitted() {
143 if (propertyType == PropertyType.BAG || propertyType == PropertyType.SEQ ||
144 propertyType == PropertyType.ALT) {
145 return true;
146 } else if (propertyType == PropertyType.COMPOSITE) {
147 // Base it on the primary property's behaviour
148 return primaryProperty.isMultiValuePermitted();
149 }
150 return false;
151 }
152
153 /**
154 * Get the type of a property
155 * @param key name of the property
156 * @return the type of the property
157 */
158 public static PropertyType getPropertyType(String key) {
159 PropertyType type = null;
160 Property prop = properties.get(key);
161 if (prop != null) {
162 type = prop.getPropertyType();
163 }
164 return type;
165 }
166
167 /**
168 * Retrieve the property object that corresponds to the given key
169 * @param key the property key or name
170 * @return the Property object
171 */
172 public static Property get(String key) {
173 return properties.get(key);
174 }
175
176 public PropertyType getPropertyType() {
177 return propertyType;
178 }
179
180 public ValueType getValueType() {
181 return valueType;
182 }
183
184 /**
185 * Returns the (immutable) set of choices for the values of this property.
186 * Only defined for {@link ValueType#OPEN_CHOICE open} and
187 * {@link ValueType#CLOSED_CHOICE closed choice} value types.
188 *
189 * @return available choices, or <code>null</code>
190 */
191 public Set<String> getChoices() {
192 return choices;
193 }
194
195 /**
196 * Gets the primary property for a composite property
197 *
198 * @return the primary property
199 */
200 public Property getPrimaryProperty() {
201 return primaryProperty;
202 }
203
204 /**
205 * Gets the secondary properties for a composite property
206 *
207 * @return the secondary properties
208 */
209 public Property[] getSecondaryExtractProperties() {
210 return secondaryExtractProperties;
211 }
212
213 public static SortedSet<Property> getProperties(String prefix) {
214 SortedSet<Property> set = new TreeSet<Property>();
215 String p = prefix + ":";
216 synchronized (properties) {
217 for (String name : properties.keySet()) {
218 if (name.startsWith(p)) {
219 set.add(properties.get(name));
220 }
221 }
222 }
223 return set;
224 }
225
226 public static Property internalBoolean(String name) {
227 return new Property(name, true, ValueType.BOOLEAN);
228 }
229
230 public static Property internalClosedChoise(
231 String name, String... choices) {
232 return new Property(name, true, ValueType.CLOSED_CHOICE, choices);
233 }
234
235 public static Property internalDate(String name) {
236 return new Property(name, true, ValueType.DATE);
237 }
238
239 public static Property internalInteger(String name) {
240 return new Property(name, true, ValueType.INTEGER);
241 }
242
243 public static Property internalIntegerSequence(String name) {
244 return new Property(name, true, PropertyType.SEQ, ValueType.INTEGER);
245 }
246
247 public static Property internalRational(String name) {
248 return new Property(name, true, ValueType.RATIONAL);
249 }
250
251 public static Property internalOpenChoise(
252 String name, String... choices) {
253 return new Property(name, true, ValueType.OPEN_CHOICE, choices);
254 }
255 public static Property internalReal(String name) {
256 return new Property(name, true, ValueType.REAL);
257 }
258
259 public static Property internalText(String name) {
260 return new Property(name, true, ValueType.TEXT);
261 }
262
263 public static Property internalTextBag(String name) {
264 return new Property(name, true, PropertyType.BAG, ValueType.TEXT);
265 }
266
267 public static Property internalURI(String name) {
268 return new Property(name, true, ValueType.URI);
269 }
270
271 public static Property externalClosedChoise(
272 String name, String... choices) {
273 return new Property(name, false, ValueType.CLOSED_CHOICE, choices);
274 }
275
276 public static Property externalOpenChoise(
277 String name, String... choices) {
278 return new Property(name, false, ValueType.OPEN_CHOICE, choices);
279 }
280
281 public static Property externalDate(String name) {
282 return new Property(name, false, ValueType.DATE);
283 }
284
285 public static Property externalReal(String name) {
286 return new Property(name, false, ValueType.REAL);
287 }
288
289 public static Property externalInteger(String name) {
290 return new Property(name, false, ValueType.INTEGER);
291 }
292
293 public static Property externalBoolean(String name) {
294 return new Property(name, false, ValueType.BOOLEAN);
295 }
296
297 public static Property externalText(String name) {
298 return new Property(name, false, ValueType.TEXT);
299 }
300
301 public static Property externalTextBag(String name) {
302 return new Property(name, false, PropertyType.BAG, ValueType.TEXT);
303 }
304
305 /**
306 * Constructs a new composite property from the given primary and array of secondary properties.
307 * <p>
308 * Note that name of the composite property is taken from its primary property,
309 * and primary and secondary properties must not be composite properties themselves.
310 *
311 * @param primaryProperty
312 * @param secondaryExtractProperties
313 * @return the composite property
314 */
315 public static Property composite(Property primaryProperty, Property[] secondaryExtractProperties) {
316 if (primaryProperty == null) {
317 throw new NullPointerException("primaryProperty must not be null");
318 }
319 if (primaryProperty.getPropertyType() == PropertyType.COMPOSITE) {
320 throw new PropertyTypeException(primaryProperty.getPropertyType());
321 }
322 if (secondaryExtractProperties != null) {
323 for (Property secondaryExtractProperty : secondaryExtractProperties) {
324 if (secondaryExtractProperty.getPropertyType() == PropertyType.COMPOSITE) {
325 throw new PropertyTypeException(secondaryExtractProperty.getPropertyType());
326 }
327 }
328 }
329 String[] choices = null;
330 if (primaryProperty.getChoices() != null) {
331 choices = primaryProperty.getChoices().toArray(
332 new String[primaryProperty.getChoices().size()]);
333 }
334 return new Property(primaryProperty.getName(),
335 primaryProperty.isInternal(), PropertyType.COMPOSITE,
336 ValueType.PROPERTY, choices, primaryProperty,
337 secondaryExtractProperties);
338 }
339
340 //----------------------------------------------------------< Comparable >
341
342 public int compareTo(Property o) {
343 return name.compareTo(o.name);
344 }
345
346 //--------------------------------------------------------------< Object >
347
348 public boolean equals(Object o) {
349 return o instanceof Property && name.equals(((Property) o).name);
350 }
351
352 public int hashCode() {
353 return name.hashCode();
354 }
355
356 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.metadata;
17
18 import org.apache.tika.metadata.Property.PropertyType;
19 import org.apache.tika.metadata.Property.ValueType;
20
21
22 /**
23 * XMP property definition violation exception. This is thrown when
24 * you try to set a {@link Property} value with an incorrect type,
25 * such as storing an Integer when the property is of type Date.
26 *
27 * @since Apache Tika 0.8
28 */
29 public final class PropertyTypeException extends IllegalArgumentException {
30
31 public PropertyTypeException(String msg) {
32 super(msg);
33 }
34
35 public PropertyTypeException(PropertyType expected, PropertyType found) {
36 super("Expected a property of type " + expected + ", but received " + found);
37 }
38
39 public PropertyTypeException(ValueType expected, ValueType found) {
40 super("Expected a property with a " + expected + " value, but received a " + found);
41 }
42
43 public PropertyTypeException(PropertyType unsupportedPropertyType) {
44 super((unsupportedPropertyType != PropertyType.COMPOSITE)
45 ? unsupportedPropertyType + " is not supported"
46 : "Composite Properties must not include other Composite"
47 + " Properties as either Primary or Secondary");
48 }
49 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.metadata;
17
18 /**
19 * XMP Exif TIFF schema. This is a collection of
20 * {@link Property property definition} constants for the Exif TIFF
21 * properties defined in the XMP standard.
22 *
23 * @since Apache Tika 0.8
24 * @see <a href="http://www.adobe.com/devnet/xmp/pdfs/XMPSpecificationPart2.pdf"
25 * >XMP Specification, Part 2: Standard Schemas</a>
26 */
27 public interface TIFF {
28
29 /**
30 * "Number of bits per component in each channel."
31 */
32 Property BITS_PER_SAMPLE =
33 Property.internalIntegerSequence("tiff:BitsPerSample");
34
35 /**
36 * "Image height in pixels."
37 */
38 Property IMAGE_LENGTH =
39 Property.internalInteger("tiff:ImageLength");
40
41 /**
42 * "Image width in pixels."
43 */
44 Property IMAGE_WIDTH =
45 Property.internalInteger("tiff:ImageWidth");
46
47 /**
48 * "Number of components per pixel."
49 */
50 Property SAMPLES_PER_PIXEL =
51 Property.internalInteger("tiff:SamplesPerPixel");
52
53 /**
54 * Did the Flash fire when taking this image?
55 */
56 Property FLASH_FIRED =
57 Property.internalBoolean("exif:Flash");
58
59 /**
60 * "Exposure time in seconds."
61 */
62 Property EXPOSURE_TIME =
63 Property.internalRational("exif:ExposureTime");
64
65 /**
66 * "F-Number."
67 * The f-number is the focal length divided by the "effective" aperture
68 * diameter. It is a dimensionless number that is a measure of lens speed.
69 */
70 Property F_NUMBER =
71 Property.internalRational("exif:FNumber");
72
73 /**
74 * "Focal length of the lens, in millimeters."
75 */
76 Property FOCAL_LENGTH =
77 Property.internalRational("exif:FocalLength");
78
79 /**
80 * "ISO Speed and ISO Latitude of the input device as specified in ISO 12232"
81 */
82 Property ISO_SPEED_RATINGS =
83 Property.internalIntegerSequence("exif:IsoSpeedRatings");
84
85 /**
86 * "Manufacturer of the recording equipment."
87 */
88 Property EQUIPMENT_MAKE =
89 Property.internalText("tiff:Make");
90
91 /**
92 * "Model name or number of the recording equipment."
93 */
94 Property EQUIPMENT_MODEL =
95 Property.internalText("tiff:Model");
96
97 /**
98 * "Software or firmware used to generate the image."
99 */
100 Property SOFTWARE =
101 Property.internalText("tiff:Software");
102
103 /**
104 * "The Orientation of the image."
105 * 1 = 0th row at top, 0th column at left
106 * 2 = 0th row at top, 0th column at right
107 * 3 = 0th row at bottom, 0th column at right
108 * 4 = 0th row at bottom, 0th column at left
109 * 5 = 0th row at left, 0th column at top
110 * 6 = 0th row at right, 0th column at top
111 * 7 = 0th row at right, 0th column at bottom
112 * 8 = 0th row at left, 0th column at bottom
113 */
114 Property ORIENTATION =
115 Property.internalClosedChoise("tiff:Orientation", "1", "2", "3", "4", "5", "6", "7", "8");
116
117 /**
118 * "Horizontal resolution in pixels per unit."
119 */
120 Property RESOLUTION_HORIZONTAL =
121 Property.internalRational("tiff:XResolution");
122
123 /**
124 * "Vertical resolution in pixels per unit."
125 */
126 Property RESOLUTION_VERTICAL =
127 Property.internalRational("tiff:YResolution");
128
129 /**
130 * "Units used for Horizontal and Vertical Resolutions."
131 * One of "Inch" or "cm"
132 */
133 Property RESOLUTION_UNIT =
134 Property.internalClosedChoise("tiff:ResolutionUnit", "Inch", "cm");
135
136 /**
137 * "Date and time when original image was generated"
138 */
139 Property ORIGINAL_DATE =
140 Property.internalDate("exif:DateTimeOriginal");
141 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.metadata;
17
18 /**
19 * Contains a core set of basic Tika metadata properties, which all parsers
20 * will attempt to supply (where the file format permits). These are all
21 * defined in terms of other standard namespaces.
22 *
23 * Users of Tika who wish to have consistent metadata across file formats
24 * can make use of these Properties, knowing that where present they will
25 * have consistent semantic meaning between different file formats. (No
26 * matter if one file format calls it Title, another Long-Title and another
27 * Long-Name, if they all mean the same thing as defined by
28 * {@link DublinCore#TITLE} then they will all be present as such)
29 *
30 * For now, most of these properties are composite ones including the deprecated
31 * non-prefixed String properties from the Metadata class. In Tika 2.0, most
32 * of these will revert back to simple assignments.
33 *
34 * @since Apache Tika 1.2
35 */
36 @SuppressWarnings("deprecation")
37 public interface TikaCoreProperties {
38 /**
39 * @see DublinCore#FORMAT
40 */
41 public static final Property FORMAT = Property.composite(DublinCore.FORMAT,
42 new Property[] { Property.internalText(Metadata.FORMAT) });
43
44 /**
45 * @see DublinCore#IDENTIFIER
46 */
47 public static final Property IDENTIFIER = Property.composite(DublinCore.IDENTIFIER,
48 new Property[] { Property.internalText(Metadata.IDENTIFIER) });
49
50 /**
51 * @see DublinCore#CONTRIBUTOR
52 */
53 public static final Property CONTRIBUTOR = Property.composite(DublinCore.CONTRIBUTOR,
54 new Property[] { Property.internalText(Metadata.CONTRIBUTOR) });
55
56 /**
57 * @see DublinCore#COVERAGE
58 */
59 public static final Property COVERAGE = Property.composite(DublinCore.COVERAGE,
60 new Property[] { Property.internalText(Metadata.COVERAGE) });
61
62 /**
63 * @see DublinCore#CREATOR
64 */
65 public static final Property CREATOR = Property.composite(DublinCore.CREATOR,
66 new Property[] {
67 Office.AUTHOR,
68 Property.internalTextBag(Metadata.CREATOR),
69 Property.internalTextBag(Metadata.AUTHOR)
70 });
71
72 /**
73 * @see Office#LAST_AUTHOR
74 */
75 public static final Property MODIFIER = Property.composite(Office.LAST_AUTHOR,
76 new Property[] { Property.internalText(Metadata.LAST_AUTHOR) });
77
78 /**
79 * @see XMP#CREATOR_TOOL
80 */
81 public static final Property CREATOR_TOOL = XMP.CREATOR_TOOL;
82
83 /**
84 * @see DublinCore#LANGUAGE
85 */
86 public static final Property LANGUAGE = Property.composite(DublinCore.LANGUAGE,
87 new Property[] { Property.internalText(Metadata.LANGUAGE) });
88
89 /**
90 * @see DublinCore#PUBLISHER
91 */
92 public static final Property PUBLISHER = Property.composite(DublinCore.PUBLISHER,
93 new Property[] { Property.internalText(Metadata.PUBLISHER) });
94
95 /**
96 * @see DublinCore#RELATION
97 */
98 public static final Property RELATION = Property.composite(DublinCore.RELATION,
99 new Property[] { Property.internalText(Metadata.RELATION) });
100
101 /**
102 * @see DublinCore#RIGHTS
103 */
104 public static final Property RIGHTS = Property.composite(DublinCore.RIGHTS,
105 new Property[] { Property.internalText(Metadata.RIGHTS) });
106
107 /**
108 * @see DublinCore#SOURCE
109 */
110 public static final Property SOURCE = Property.composite(DublinCore.SOURCE,
111 new Property[] { Property.internalText(Metadata.SOURCE) });
112
113 /**
114 * @see DublinCore#TYPE
115 */
116 public static final Property TYPE = Property.composite(DublinCore.TYPE,
117 new Property[] { Property.internalText(Metadata.TYPE) });
118
119
120 // Descriptive properties
121
122 /**
123 * @see DublinCore#TITLE
124 */
125 public static final Property TITLE = Property.composite(DublinCore.TITLE,
126 new Property[] { Property.internalText(Metadata.TITLE) });
127
128 /**
129 * @see DublinCore#DESCRIPTION
130 */
131 public static final Property DESCRIPTION = Property.composite(DublinCore.DESCRIPTION,
132 new Property[] { Property.internalText(Metadata.DESCRIPTION) });
133
134 /**
135 * @see DublinCore#SUBJECT
136 * @see Office#KEYWORDS
137 */
138 public static final Property KEYWORDS = Property.composite(DublinCore.SUBJECT,
139 new Property[] {
140 Office.KEYWORDS,
141 Property.internalTextBag(MSOffice.KEYWORDS),
142 Property.internalTextBag(Metadata.SUBJECT)
143 });
144
145 // Date related properties
146
147 /**
148 * @see DublinCore#DATE
149 * @see Office#CREATION_DATE
150 */
151 public static final Property CREATED = Property.composite(DublinCore.CREATED,
152 new Property[] {
153 Office.CREATION_DATE,
154 MSOffice.CREATION_DATE
155 });
156
157 /**
158 * @see DublinCore#MODIFIED
159 * @see Metadata#DATE
160 * @see Office#SAVE_DATE
161 */
162 public static final Property MODIFIED = Property.composite(DublinCore.MODIFIED,
163 new Property[] {
164 Metadata.DATE,
165 Office.SAVE_DATE,
166 MSOffice.LAST_SAVED,
167 Property.internalText(Metadata.MODIFIED),
168 Property.internalText("Last-Modified")
169 });
170
171 /** @see Office#PRINT_DATE */
172 public static final Property PRINT_DATE = Property.composite(Office.PRINT_DATE,
173 new Property[] { MSOffice.LAST_PRINTED });
174
175 /**
176 * @see XMP#METADATA_DATE
177 */
178 public static final Property METADATA_DATE = XMP.METADATA_DATE;
179
180
181 // Geographic related properties
182
183 /**
184 * @see Geographic#LATITUDE
185 */
186 public static final Property LATITUDE = Geographic.LATITUDE;
187
188 /**
189 * @see Geographic#LONGITUDE
190 */
191 public static final Property LONGITUDE = Geographic.LONGITUDE;
192
193 /**
194 * @see Geographic#ALTITUDE
195 */
196 public static final Property ALTITUDE = Geographic.ALTITUDE;
197
198
199 // Comment and rating properties
200
201 /**
202 * @see XMP#RATING
203 */
204 public static final Property RATING = XMP.RATING;
205
206 /**
207 * @see OfficeOpenXMLExtended#COMMENTS
208 */
209 public static final Property COMMENTS = Property.composite(OfficeOpenXMLExtended.COMMENTS,
210 new Property[] {
211 Property.internalTextBag(ClimateForcast.COMMENT),
212 Property.internalTextBag(MSOffice.COMMENTS)
213 });
214
215 // TODO: Remove transition properties in Tika 2.0
216
217 /**
218 * @see DublinCore#SUBJECT
219 * @deprecated use TikaCoreProperties#KEYWORDS
220 */
221 @Deprecated
222 public static final Property TRANSITION_KEYWORDS_TO_DC_SUBJECT = Property.composite(DublinCore.SUBJECT,
223 new Property[] { Property.internalTextBag(MSOffice.KEYWORDS) });
224
225 /**
226 * @see OfficeOpenXMLExtended#COMMENTS
227 * @deprecated use TikaCoreProperties#DESCRIPTION
228 */
229 @Deprecated
230 public static final Property TRANSITION_SUBJECT_TO_DC_DESCRIPTION = Property.composite(DublinCore.DESCRIPTION,
231 new Property[] { Property.internalText(Metadata.SUBJECT) });
232
233 /**
234 * @see DublinCore#TITLE
235 * @deprecated use TikaCoreProperties#TITLE
236 */
237 @Deprecated
238 public static final Property TRANSITION_SUBJECT_TO_DC_TITLE = Property.composite(DublinCore.TITLE,
239 new Property[] { Property.internalText(Metadata.SUBJECT) });
240
241 /**
242 * @see OfficeOpenXMLCore#SUBJECT
243 * @deprecated use OfficeOpenXMLCore#SUBJECT
244 */
245 @Deprecated
246 public static final Property TRANSITION_SUBJECT_TO_OO_SUBJECT = Property.composite(OfficeOpenXMLCore.SUBJECT,
247 new Property[] { Property.internalText(Metadata.SUBJECT) });
248
249 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.metadata;
17
/**
 * Names of keys under which values are stored in Metadata instances.
 */
public interface TikaMetadataKeys {

    /** The key literal {@code "resourceName"}. */
    String RESOURCE_NAME_KEY = "resourceName";

    /** The key literal {@code "protected"}. */
    String PROTECTED = "protected";

    /** The key literal {@code "embeddedRelationshipId"}. */
    String EMBEDDED_RELATIONSHIP_ID = "embeddedRelationshipId";
}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.metadata;
17
/**
 * Tika metadata key names used in Mime Type resolution.
 */
public interface TikaMimeKeys {

    /** The key literal {@code "tika.mime.file"}. */
    String TIKA_MIME_FILE = "tika.mime.file";

    /** The key literal {@code "mime.type.magic"}. */
    String MIME_TYPE_MAGIC = "mime.type.magic";

}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.metadata;
17
18 public interface XMP {
19
20 String NAMESPACE_URI = "http://ns.adobe.com/xap/1.0/";
21
22 String PREFIX = "xmp";
23
24 /** The xmp prefix followed by the colon delimiter */
25 String PREFIX_ = PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER;
26
27 /**
28 * The date and time the resource was created. For a digital file, this need not
29 * match a file-system creation time. For a freshly created resource, it should
30 * be close to that time, modulo the time taken to write the file. Later file
31 * transfer, copying, and so on, can make the file-system time arbitrarily different.
32 */
33 Property CREATE_DATE = Property.externalDate(PREFIX_ + "CreateDate");
34
35 /**
36 * The name of the first known tool used to create the resource.
37 */
38 Property CREATOR_TOOL = Property.externalText(PREFIX_ + "CreatorTool");
39
40 /**
41 * An unordered array of text strings that unambiguously identify the resource
42 * within a given context. An array item may be qualified with xmpidq:Scheme
43 * (see 8.7, “xmpidq namespace”) to denote the formal identification system to
44 * which that identifier conforms.
45 */
46 Property IDENTIFIER = Property.externalTextBag(PREFIX_ + "Identifier");
47
48 /**
49 * A word or short phrase that identifies a resource as a member of a userdefined collection.
50 */
51 Property LABEL = Property.externalDate(PREFIX_ + "Label");
52
53 /**
54 * The date and time that any metadata for this resource was last changed. It
55 * should be the same as or more recent than xmp:ModifyDate
56 */
57 Property METADATA_DATE = Property.externalDate(PREFIX_ + "MetadataDate");
58
59 /**
60 * The date and time the resource was last modified.
61 */
62 Property MODIFY_DATE = Property.externalDate(PREFIX_ + "ModifyDate");
63
64 /**
65 * A user-assigned rating for this file. The value shall be -1 or in the range
66 * [0..5], where -1 indicates “rejected” and 0 indicates “unrated”. If xmp:Rating
67 * is not present, a value of 0 should be assumed.
68 */
69 Property RATING = Property.externalReal(PREFIX_ + "Rating");
70
71 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.metadata;
17
18 import java.util.Date;
19
20 /**
21 * XMP Dynamic Media schema. This is a collection of
22 * {@link Property property definition} constants for the dynamic media
23 * properties defined in the XMP standard.
24 *
25 * @since Apache Tika 0.7
26 * @see <a href="http://www.adobe.com/devnet/xmp/pdfs/XMPSpecificationPart2.pdf"
27 * >XMP Specification, Part 2: Standard Schemas</a>
28 */
29 public interface XMPDM {
30
31 /**
32 * "The absolute path to the file's peak audio file. If empty, no peak
33 * file exists."
34 */
35 Property ABS_PEAK_AUDIO_FILE_PATH =
36 Property.internalURI("xmpDM:absPeakAudioFilePath");
37
38 /**
39 * "The name of the album."
40 */
41 Property ALBUM = Property.externalText("xmpDM:album");
42
43 /**
44 * "An alternative tape name, set via the project window or timecode
45 * dialog in Premiere. If an alternative name has been set and has not
46 * been reverted, that name is displayed."
47 */
48 Property ALT_TAPE_NAME = Property.externalText("xmpDM:altTapeName");
49
50 // /**
51 // * "A timecode set by the user. When specified, it is used instead
52 // * of the startTimecode."
53 // */
54 // Property ALT_TIMECODE = "xmpDM:altTimecode";
55
56 /**
57 * "The name of the artist or artists."
58 */
59 Property ARTIST = Property.externalText("xmpDM:artist");
60
61 /**
62 * "The date and time when the audio was last modified."
63 */
64 Property AUDIO_MOD_DATE = Property.internalDate("xmpDM:audioModDate");
65
66 /**
67 * "The audio sample rate. Can be any value, but commonly 32000, 41100,
68 * or 48000."
69 */
70 Property AUDIO_SAMPLE_RATE =
71 Property.internalInteger("xmpDM:audioSampleRate");
72
73 /**
74 * "The audio sample type."
75 */
76 Property AUDIO_SAMPLE_TYPE = Property.internalClosedChoise(
77 "xmpDM:audioSampleType", "8Int", "16Int", "32Int", "32Float");
78
79 /**
80 * "The audio channel type."
81 */
82 Property AUDIO_CHANNEL_TYPE = Property.internalClosedChoise(
83 "xmpDM:audioChannelType", "Mono", "Stereo", "5.1", "7.1");
84 /**
85 * Converter for {@link XMPDM#AUDIO_CHANNEL_TYPE}
86 * @deprecated Experimental method, will change shortly
87 */
88 @Deprecated
89 static class ChannelTypePropertyConverter {
90 private static Property property = AUDIO_CHANNEL_TYPE;
91
92 /**
93 * How a standalone converter might work
94 */
95 public static String convert(Object value) {
96 if (value instanceof String) {
97 // Assume already done
98 return (String)value;
99 }
100 if (value instanceof Integer) {
101 int channelCount = (Integer)value;
102 if(channelCount == 1) {
103 return "Mono";
104 } else if(channelCount == 2) {
105 return "Stereo";
106 } else if(channelCount == 5) {
107 return "5.1";
108 } else if(channelCount == 7) {
109 return "7.1";
110 }
111 }
112 return null;
113 }
114 /**
115 * How convert+set might work
116 */
117 public static void convertAndSet(Metadata metadata, Object value) {
118 if (value instanceof Integer || value instanceof Long) {
119 metadata.set(property, convert(value));
120 }
121 if (value instanceof Date) {
122 // Won't happen in this case, just an example of already
123 // converted to a type metadata.set(property) handles
124 metadata.set(property, (Date)value);
125 }
126 if (value instanceof String) {
127 // Already converted, or so we hope!
128 metadata.set(property, (String)value);
129 }
130 }
131 }
132
133 /**
134 * "The audio compression used. For example, MP3."
135 */
136 Property AUDIO_COMPRESSOR = Property.internalText("xmpDM:audioCompressor");
137
138 // /**
139 // * "Additional parameters for Beat Splice stretch mode."
140 // */
141 // Property BEAT_SPLICE_PARAMS = "xmpDM:beatSpliceParams";
142
143 /**
144 * "The composer's name."
145 */
146 Property COMPOSER = Property.externalText("xmpDM:composer");
147
148 // /**
149 // * "An unordered list of all media used to create this media."
150 // */
151 // Property CONTRIBUTED_MEDIA = "xmpDM:contributedMedia";
152
153 /**
154 * "The copyright information."
155 */
156 Property COPYRIGHT = Property.externalText("xmpDM:copyright");
157
158 /**
159 * "The duration of the media file."
160 */
161 Property DURATION = Property.externalReal("xmpDM:duration");
162
163 /**
164 * "The engineer's name."
165 */
166 Property ENGINEER = Property.externalText("xmpDM:engineer");
167
168 /**
169 * "The file data rate in megabytes per second. For example:
170 * '36/10' = 3.6 MB/sec"
171 */
172 Property FILE_DATA_RATE = Property.internalRational("xmpDM:fileDataRate");
173
174 /**
175 * "The name of the genre."
176 */
177 Property GENRE = Property.externalText("xmpDM:genre");
178
179 /**
180 * "The musical instrument."
181 */
182 Property INSTRUMENT = Property.externalText("xmpDM:instrument");
183
184 // /**
185 // * "The duration of lead time for queuing music."
186 // */
187 // Property INTRO_TIME = "xmpDM:introTime";
188
189 /**
190 * "The audio's musical key."
191 */
192 Property KEY = Property.internalClosedChoise(
193 "xmpDM:key", "C", "C#", "D", "D#", "E", "F", "F#",
194 "G", "G#", "A", "A#", "B");
195
196 /**
197 * "User's log comments."
198 */
199 Property LOG_COMMENT = Property.externalText("xmpDM:logComment");
200
201 /**
202 * "When true, the clip can be looped seamlessly."
203 */
204 Property LOOP = Property.internalBoolean("xmpDM:loop");
205
206 /**
207 * "The number of beats."
208 */
209 Property NUMBER_OF_BEATS = Property.internalReal("xmpDM:numberOfBeats");
210
211 // /**
212 // * An ordered list of markers. See also {@link #TRACKS xmpDM:Tracks}.
213 // */
214 // Property MARKERS = "xmpDM:markers";
215
216 /**
217 * "The date and time when the metadata was last modified."
218 */
219 Property METADATA_MOD_DATE = Property.internalDate("xmpDM:metadataModDate");
220
221 // /**
222 // * "The time at which to fade out."
223 // */
224 // Property OUT_CUE = "xmpDM:outCue";
225
226 // /**
227 // * "A reference to the project that created this file."
228 // */
229 // Property PROJECT_REF = "xmpDM:projectRef";
230
231 /**
232 * "The sampling phase of film to be converted to video (pull-down)."
233 */
234 Property PULL_DOWN = Property.internalClosedChoise(
235 "xmpDM:pullDown", "WSSWW", "SSWWW", "SWWWS", "WWWSS", "WWSSW",
236 "WSSWW_24p", "SSWWW_24p", "SWWWS_24p", "WWWSS_24p", "WWSSW_24p");
237
238 /**
239 * "The relative path to the file's peak audio file. If empty, no peak
240 * file exists."
241 */
242 Property RELATIVE_PEAK_AUDIO_FILE_PATH =
243 Property.internalURI("xmpDM:relativePeakAudioFilePath");
244
245 // /**
246 // * "The start time of the media inside the audio project."
247 // */
248 // Property RELATIVE_TIMESTAMP = "xmpDM:relativeTimestamp";
249
250 /**
251 * "The date the title was released."
252 */
253 Property RELEASE_DATE = Property.externalDate("xmpDM:releaseDate");
254
255 // /**
256 // * "Additional parameters for Resample stretch mode."
257 // */
258 // Property RESAMPLE_PARAMS = "xmpDM:resampleParams";
259
260 /**
261 * "The musical scale used in the music. 'Neither' is most often used
262 * for instruments with no associated scale, such as drums."
263 */
264 Property SCALE_TYPE = Property.internalClosedChoise(
265 "xmpDM:scaleType", "Major", "Minor", "Both", "Neither");
266
267 /**
268 * "The name of the scene."
269 */
270 Property SCENE = Property.externalText("xmpDM:scene");
271
272 /**
273 * "The date and time when the video was shot."
274 */
275 Property SHOT_DATE = Property.externalDate("xmpDM:shotDate");
276
277 /**
278 * "The name of the location where the video was shot. For example:
279 * 'Oktoberfest, Munich, Germany'. For more accurate positioning,
280 * use the EXIF GPS values."
281 */
282 Property SHOT_LOCATION = Property.externalText("xmpDM:shotLocation");
283
284 /**
285 * "The name of the shot or take."
286 */
287 Property SHOT_NAME = Property.externalText("xmpDM:shotName");
288
289 /**
290 * "A description of the speaker angles from center front in degrees.
291 * For example: 'Left = -30, Right = 30, Center = 0, LFE = 45,
292 * Left Surround = -110, Right Surround = 110'"
293 */
294 Property SPEAKER_PLACEMENT =
295 Property.externalText("xmpDM:speakerPlacement");
296
297 // /**
298 // * "The timecode of the first frame of video in the file, as obtained
299 // * from the device control."
300 // */
301 // Property START_TIMECODE = "xmpDM:startTimecode";
302
303 /**
304 * "The audio stretch mode."
305 */
306 Property STRETCH_MODE = Property.internalClosedChoise(
307 "xmpDM:stretchMode", "Fixed length", "Time-Scale", "Resample",
308 "Beat Splice", "Hybrid");
309
310 /**
311 * "The name of the tape from which the clip was captured, as set during
312 * the capture process."
313 */
314 Property TAPE_NAME = Property.externalText("xmpDM:tapeName");
315
316 /**
317 * "The audio's tempo."
318 */
319 Property TEMPO = Property.internalReal("xmpDM:tempo");
320
321 // /**
322 // * "Additional parameters for Time-Scale stretch mode."
323 // */
324 // Property TIME_SCALE_PARAMS = "xmpDM:timeScaleParams";
325
326 /**
327 * "The time signature of the music."
328 */
329 Property TIME_SIGNATURE = Property.internalClosedChoise(
330 "xmpDM:timeSignature", "2/4", "3/4", "4/4", "5/4", "7/4",
331 "6/8", "9/8", "12/8", "other");
332
333 /**
334 * "A numeric value indicating the order of the audio file within its
335 * original recording."
336 */
337 Property TRACK_NUMBER = Property.externalInteger("xmpDM:trackNumber");
338
339 // /**
340 // * "An unordered list of tracks. A track is a named set of markers,
341 // * which can specify a frame rate for all markers in the set.
342 // * See also {@link #MARKERS xmpDM:markers}."
343 // */
344 // Property TRACKS = "xmpDM:Tracks";
345
346 /**
347 * "The alpha mode."
348 */
349 Property VIDEO_ALPHA_MODE = Property.externalClosedChoise(
350 "xmpDM:videoAlphaMode", "straight", "pre-multiplied");
351
352 // /**
353 // * "A color in CMYK or RGB to be used as the pre-multiple color when
354 // * alpha mode is pre-multiplied."
355 // */
356 // Property VIDEO_ALPHA_PREMULTIPLE_COLOR = "xmpDM:videoAlphaPremultipleColor";
357
358 /**
359 * "When true, unity is clear, when false, it is opaque."
360 */
361 Property VIDEO_ALPHA_UNITY_IS_TRANSPARENT =
362 Property.internalBoolean("xmpDM:videoAlphaUnityIsTransparent");
363
364 /**
365 * "The color space."
366 */
367 Property VIDEO_COLOR_SPACE = Property.internalClosedChoise(
368 "xmpDM:videoColorSpace", "sRGB", "CCIR-601", "CCIR-709");
369
370 /**
371 * "Video compression used. For example, jpeg."
372 */
373 Property VIDEO_COMPRESSOR = Property.internalText("xmpDM:videoCompressor");
374
375 /**
376 * "The field order for video."
377 */
378 Property VIDEO_FIELD_ORDER = Property.internalClosedChoise(
379 "xmpDM:videoFieldOrder", "Upper", "Lower", "Progressive");
380
381 /**
382 * "The video frame rate."
383 */
384 Property VIDEO_FRAME_RATE = Property.internalOpenChoise(
385 "xmpDM:videoFrameRate", "24", "NTSC", "PAL");
386
387 // /**
388 // * "The frame size. For example: w:720, h: 480, unit:pixels"
389 // */
390 // Property VIDEO_FRAME_SIZE = "xmpDM:videoFrameSize";
391
392 /**
393 * "The date and time when the video was last modified."
394 */
395 Property VIDEO_MOD_DATE = Property.internalDate("xmpDM:videoModDate");
396
397 /**
398 * "The size in bits of each color component of a pixel. Standard
399 * Windows 32-bit pixels have 8 bits per component."
400 */
401 Property VIDEO_PIXEL_DEPTH = Property.internalClosedChoise(
402 "xmpDM:videoPixelDepth", "8Int", "16Int", "32Int", "32Float");
403
404 /**
405 * "The aspect ratio, expressed as wd/ht. For example: '648/720' = 0.9"
406 */
407 Property VIDEO_PIXEL_ASPECT_RATIO =
408 Property.internalRational("xmpDM:videoPixelAspectRatio");
409
410 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.metadata;
17
18 public interface XMPIdq {
19
20 String NAMESPACE_URI = "http://ns.adobe.com/xmp/identifier/qual/1.0/";
21
22 String PREFIX = "xmpidq";
23
24 /** The xmpidq prefix followed by the colon delimiter */
25 String PREFIX_ = PREFIX + ":";
26
27 /**
28 * A qualifier providing the name of the formal identification
29 * scheme used for an item in the xmp:Identifier array.
30 */
31 Property SCHEME = Property.externalText(PREFIX_ + "Scheme");
32
33 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.metadata;
17
18 public interface XMPMM {
19
20 String NAMESPACE_URI = "http://ns.adobe.com/xap/1.0/mm/";
21
22 String PREFIX = "xmpMM";
23
24 /** The xmpMM prefix followed by the colon delimiter */
25 String PREFIX_ = PREFIX + ":";
26
27 /**
28 * A reference to the resource from which this one is derived.
29 * This should be a minimal reference, in which missing
30 * components can be assumed to be unchanged.
31 *
32 * TODO This property is of type RessourceRef which is a struct
33 */
34 // Property DERIVED_FROM = Property.externalText(PREFIX_ + "DerivedFrom");
35
36 /**
37 * The common identifier for all versions and renditions of a resource.
38 */
39 Property DOCUMENTID = Property.externalText(PREFIX_ + "DocumentID");
40
41 /**
42 * An identifier for a specific incarnation of a resource, updated
43 * each time a file is saved.
44 */
45 Property INSTANCEID = Property.externalText(PREFIX_ + "InstanceID");
46
47 /**
48 * The common identifier for the original resource from which
49 * the current resource is derived. For example, if you save a
50 * resource to a different format, then save that one to another
51 * format, each save operation should generate a new
52 * xmpMM:DocumentID that uniquely identifies the resource in
53 * that format, but should retain the ID of the source file here.
54 */
55 Property ORIGINAL_DOCUMENTID = Property.externalText(
56 PREFIX_ + "OriginalDocumentID");
57
58 /**
59 * The rendition class name for this resource. This property
60 * should be absent or set to default for a resource that is not
61 * a derived rendition
62 */
63 Property RENDITION_CLASS = Property.externalOpenChoise(
64 PREFIX_ + "RenditionClass",
65 "default", "draft", "low-res", "proof", "screen", "thumbnail");
66
67 /**
68 * Can be used to provide additional rendition parameters that
69 * are too complex or verbose to encode in xmpMM:RenditionClass
70 */
71 Property RENDITION_PARAMS = Property.externalText(
72 PREFIX_ + "RenditionParams");
73
74 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *
16 * IPTC Metadata Descriptions taken from the IPTC Photo Metadata (July 2010)
17 * standard. These parts Copyright 2010 International Press Telecommunications
18 * Council.
19 */
20 package org.apache.tika.metadata;
21
22 /**
23 * XMP Rights management schema.
24 *
25 * A collection of property constants for the
26 * rights management properties defined in the XMP
27 * standard.
28 *
29 * @since Apache Tika 1.2
30 * @see <a href="http://partners.adobe.com/public/developer/en/xmp/sdk/XMPspecification.pdf">XMP Photoshop</a>
31 */
32 public interface XMPRights {
33
34 String NAMESPACE_URI_XMP_RIGHTS = "http://ns.adobe.com/xap/1.0/rights/";
35 String PREFIX_XMP_RIGHTS = "xmpRights";
36
37 /** The xmpRights prefix followed by the colon delimiter */
38 String PREFIX_ = PREFIX_XMP_RIGHTS + ":";
39
40 /**
41 * A Web URL for a rights management certificate.
42 */
43 Property CERTIFICATE = Property.internalText(PREFIX_ + "Certificate");
44
45 /**
46 * When true, indicates that this is a rights-managed resource. When
47 * false, indicates that this is a public-domain resource. Omit if the
48 * state is unknown.
49 */
50 Property MARKED = Property.internalBoolean(PREFIX_ + "Marked");
51
52 /**
53 * A list of legal owners of the resource.
54 */
55 Property OWNER = Property.internalTextBag(PREFIX_ + "Owner");
56
57 /**
58 * A word or short phrase that identifies a resource as a member of a userdefined collection.
59 * TODO This is actually a language alternative property
60 */
61 Property USAGE_TERMS = Property.internalText(PREFIX_ + "UsageTerms");
62
63 /**
64 * A Web URL for a statement of the ownership and usage rights for this resource.
65 */
66 Property WEB_STATEMENT = Property.internalText(PREFIX_ + "WebStatement");
67
68 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 /**
18 * Multi-valued metadata container, and set of constant metadata fields.
19 */
20 @aQute.bnd.annotation.Version("1.0.0")
21 package org.apache.tika.metadata;
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.mime;
17
18 import java.util.Arrays;
19
20 class AndClause implements Clause {
21
22 private final Clause[] clauses;
23
24 AndClause(Clause... clauses) {
25 this.clauses = clauses;
26 }
27
28 public boolean eval(byte[] data) {
29 for (Clause clause : clauses) {
30 if (!clause.eval(data)) {
31 return false;
32 }
33 }
34 return true;
35 }
36
37 public int size() {
38 int size = 0;
39 for (Clause clause : clauses) {
40 size += clause.size();
41 }
42 return size;
43 }
44
45 public String toString() {
46 return "and" + Arrays.toString(clauses);
47 }
48
49 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.mime;
17
18 import java.io.Serializable;
19
/**
 * A serializable condition that can be evaluated against a chunk of bytes.
 */
interface Clause extends Serializable {

    /**
     * Evaluates this clause against the given chunk of data.
     *
     * @param data the bytes to test
     * @return the outcome of the evaluation
     */
    boolean eval(byte[] data);

    /**
     * Returns the size of this clause, i.e. the number of chars it is
     * composed of.
     *
     * @return the clause size
     */
    int size();

}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.mime;
17
/**
 * Static helpers that translate between binary data and its hexadecimal
 * text form. Encoding produces lowercase digits; decoding accepts both
 * lowercase and uppercase digits.
 */
public class HexCoDec {

    private static final char[] HEX_CHARS = "0123456789abcdef".toCharArray();

    /**
     * Decodes a string of hex digits.
     *
     * @param hexValue
     *            the hex digits to decode
     * @return the bytes described by the digits
     */
    public static byte[] decode(String hexValue) {
        char[] digits = hexValue.toCharArray();
        return decode(digits, 0, digits.length);
    }

    /**
     * Decodes a complete array of hex digits.
     *
     * @param hexChars
     *            the hex digits to decode
     * @return the bytes described by the digits
     */
    public static byte[] decode(char[] hexChars) {
        return decode(hexChars, 0, hexChars.length);
    }

    /**
     * Decodes a slice of an array of hex digits. Every pair of digits
     * yields one byte, the first digit being the high nibble.
     *
     * @param hexChars
     *            the array holding the hex digits
     * @param startIndex
     *            index of the first digit to decode
     * @param length
     *            number of digits to decode
     * @return the bytes described by the digits
     * @throws IllegalArgumentException
     *             if <code>length</code> is odd or a character is not a
     *             hex digit
     */
    public static byte[] decode(char[] hexChars, int startIndex, int length) {
        if (length % 2 != 0) {
            throw new IllegalArgumentException("Length must be even");
        }

        byte[] decoded = new byte[length / 2];
        int cursor = startIndex;
        for (int i = 0; i < decoded.length; i++) {
            int high = hexCharToNibble(hexChars[cursor]);
            int low = hexCharToNibble(hexChars[cursor + 1]);
            decoded[i] = (byte) ((high << 4) | low);
            cursor += 2;
        }
        return decoded;
    }

    /**
     * Hex encodes a complete array of bytes.
     *
     * @param bites
     *            the bytes to encode
     * @return the lowercase hex digits, two per byte
     */
    public static char[] encode(byte[] bites) {
        return encode(bites, 0, bites.length);
    }

    /**
     * Hex encodes a slice of an array of bytes.
     *
     * @param bites
     *            the array holding the bytes to encode
     * @param startIndex
     *            index of the first byte to encode
     * @param length
     *            number of bytes to encode
     * @return the lowercase hex digits, two per byte
     */
    public static char[] encode(byte[] bites, int startIndex, int length) {
        char[] hex = new char[length * 2];
        int source = startIndex;
        int target = 0;
        while (target < hex.length) {
            // Mask so that negative bytes map onto 0..255
            int unsigned = bites[source++] & 0xff;
            hex[target] = HEX_CHARS[unsigned >> 4];
            hex[target + 1] = HEX_CHARS[unsigned & 0xf];
            target += 2;
        }
        return hex;
    }

    /**
     * Maps a single hex digit onto its numeric value (0-15).
     */
    private static int hexCharToNibble(char ch) {
        if (ch >= '0' && ch <= '9') {
            return ch - '0';
        }
        if (ch >= 'a' && ch <= 'f') {
            return 10 + (ch - 'a');
        }
        if (ch >= 'A' && ch <= 'F') {
            return 10 + (ch - 'A');
        }
        throw new IllegalArgumentException("Not a hex char - '" + ch + "'");
    }

}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.mime;
17
18 /**
19 * Defines a magic for a MimeType. A magic is made of one or several
20 * MagicClause.
21 *
22 *
23 */
24 class Magic implements Clause, Comparable<Magic> {
25
26 private final MimeType type;
27
28 private final int priority;
29
30 private final Clause clause;
31
32 private final String string;
33
34 Magic(MimeType type, int priority, Clause clause) {
35 this.type = type;
36 this.priority = priority;
37 this.clause = clause;
38 this.string = "[" + priority + "/" + clause + "]";
39 }
40
41 MimeType getType() {
42 return type;
43 }
44
45 int getPriority() {
46 return priority;
47 }
48
49 public boolean eval(byte[] data) {
50 return clause.eval(data);
51 }
52
53 public int size() {
54 return clause.size();
55 }
56
57 public String toString() {
58 return string;
59 }
60
61 public int compareTo(Magic o) {
62 int diff = o.priority - priority;
63 if (diff == 0) {
64 diff = o.size() - size();
65 }
66 if (diff == 0) {
67 diff = o.type.compareTo(type);
68 }
69 if (diff == 0) {
70 diff = o.string.compareTo(string);
71 }
72 return diff;
73 }
74
75 public boolean equals(Object o) {
76 if (o instanceof Magic) {
77 Magic that = (Magic) o;
78 return type.equals(that.type) && string.equals(that.string);
79 }
80 return false;
81 }
82
83 public int hashCode() {
84 return type.hashCode() ^ string.hashCode();
85 }
86
87 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.mime;
17
18 import java.io.ByteArrayInputStream;
19 import java.io.IOException;
20
21 import org.apache.tika.detect.MagicDetector;
22 import org.apache.tika.metadata.Metadata;
23
24 /**
25 * Defines a magic match.
26 */
27 class MagicMatch implements Clause {
28
29 private final MediaType mediaType;
30
31 private final String type;
32
33 private final String offset;
34
35 private final String value;
36
37 private final String mask;
38
39 private MagicDetector detector = null;
40
41 MagicMatch(
42 MediaType mediaType,
43 String type, String offset, String value, String mask) {
44 this.mediaType = mediaType;
45 this.type = type;
46 this.offset = offset;
47 this.value = value;
48 this.mask = mask;
49 }
50
51 private synchronized MagicDetector getDetector() {
52 if (detector == null) {
53 detector = MagicDetector.parse(mediaType, type, offset, value, mask);
54 }
55 return detector;
56 }
57
58 public boolean eval(byte[] data) {
59 try {
60 return getDetector().detect(
61 new ByteArrayInputStream(data), new Metadata())
62 != MediaType.OCTET_STREAM;
63 } catch (IOException e) {
64 // Should never happen with a ByteArrayInputStream
65 return false;
66 }
67 }
68
69 public int size() {
70 return getDetector().getLength();
71 }
72
73 public String toString() {
74 return mediaType.toString()
75 + " " + type + " " + offset + " " + value + " " + mask;
76 }
77
78 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.mime;
17
18 import java.io.Serializable;
19 import java.nio.charset.Charset;
20 import java.util.Collections;
21 import java.util.HashMap;
22 import java.util.HashSet;
23 import java.util.Locale;
24 import java.util.Map;
25 import java.util.Set;
26 import java.util.SortedMap;
27 import java.util.TreeMap;
28 import java.util.regex.Matcher;
29 import java.util.regex.Pattern;
30
31 /**
32 * Internet media type.
33 */
34 public final class MediaType implements Comparable<MediaType>, Serializable {
35
36 /**
37 * Serial version UID.
38 */
39 private static final long serialVersionUID = -3831000556189036392L;
40
41 private static final Pattern SPECIAL =
42 Pattern.compile("[\\(\\)<>@,;:\\\\\"/\\[\\]\\?=]");
43
44 private static final Pattern SPECIAL_OR_WHITESPACE =
45 Pattern.compile("[\\(\\)<>@,;:\\\\\"/\\[\\]\\?=\\s]");
46
47 /**
48 * See http://www.ietf.org/rfc/rfc2045.txt for valid mime-type characters.
49 */
50 private static final String VALID_CHARS =
51 "([^\\c\\(\\)<>@,;:\\\\\"/\\[\\]\\?=\\s]+)";
52
53 private static final Pattern TYPE_PATTERN = Pattern.compile(
54 "(?s)\\s*" + VALID_CHARS + "\\s*/\\s*" + VALID_CHARS
55 + "\\s*($|;.*)");
56
57 // TIKA-350: handle charset as first element in content-type
58 private static final Pattern CHARSET_FIRST_PATTERN = Pattern.compile(
59 "(?is)\\s*(charset\\s*=\\s*[^\\c;\\s]+)\\s*;\\s*"
60 + VALID_CHARS + "\\s*/\\s*" + VALID_CHARS + "\\s*");
61
62 /**
63 * Set of basic types with normalized "type/subtype" names.
64 * Used to optimize type lookup and to avoid having too many
65 * {@link MediaType} instances in memory.
66 */
67 private static final Map<String, MediaType> SIMPLE_TYPES =
68 new HashMap<String, MediaType>();
69
70 public static final MediaType OCTET_STREAM =
71 parse("application/octet-stream");
72
73 public static final MediaType TEXT_PLAIN = parse("text/plain");
74
75 public static final MediaType TEXT_HTML = parse("text/html");
76
77 public static final MediaType APPLICATION_XML = parse("application/xml");
78
79 public static final MediaType APPLICATION_ZIP = parse("application/zip");
80
81 public static MediaType application(String type) {
82 return MediaType.parse("application/" + type);
83 }
84
85 public static MediaType audio(String type) {
86 return MediaType.parse("audio/" + type);
87 }
88
89 public static MediaType image(String type) {
90 return MediaType.parse("image/" + type);
91 }
92
93 public static MediaType text(String type) {
94 return MediaType.parse("text/" + type);
95 }
96
97 public static MediaType video(String type) {
98 return MediaType.parse("video/" + type);
99 }
100
101 /**
102 * Convenience method that returns an unmodifiable set that contains
103 * all the given media types.
104 *
105 * @since Apache Tika 1.2
106 * @param types media types
107 * @return unmodifiable set of the given types
108 */
109 public static Set<MediaType> set(MediaType... types) {
110 Set<MediaType> set = new HashSet<MediaType>();
111 for (MediaType type : types) {
112 if (type != null) {
113 set.add(type);
114 }
115 }
116 return Collections.unmodifiableSet(set);
117 }
118
119 /**
120 * Convenience method that parses the given media type strings and
121 * returns an unmodifiable set that contains all the parsed types.
122 *
123 * @since Apache Tika 1.2
124 * @param types media type strings
125 * @return unmodifiable set of the parsed types
126 */
127 public static Set<MediaType> set(String... types) {
128 Set<MediaType> set = new HashSet<MediaType>();
129 for (String type : types) {
130 MediaType mt = parse(type);
131 if (mt != null) {
132 set.add(mt);
133 }
134 }
135 return Collections.unmodifiableSet(set);
136 }
137
138 /**
139 * Parses the given string to a media type. The string is expected
140 * to be of the form "type/subtype(; parameter=...)*" as defined in
141 * RFC 2045, though we also handle "charset=xxx; type/subtype" for
142 * broken web servers.
143 *
144 * @param string media type string to be parsed
145 * @return parsed media type, or <code>null</code> if parsing fails
146 */
147 public static MediaType parse(String string) {
148 if (string == null) {
149 return null;
150 }
151
152 // Optimization for the common cases
153 synchronized (SIMPLE_TYPES) {
154 MediaType type = SIMPLE_TYPES.get(string);
155 if (type == null) {
156 int slash = string.indexOf('/');
157 if (slash == -1) {
158 return null;
159 } else if (SIMPLE_TYPES.size() < 10000
160 && isSimpleName(string.substring(0, slash))
161 && isSimpleName(string.substring(slash + 1))) {
162 type = new MediaType(string, slash);
163 SIMPLE_TYPES.put(string, type);
164 }
165 }
166 if (type != null) {
167 return type;
168 }
169 }
170
171 Matcher matcher;
172 matcher = TYPE_PATTERN.matcher(string);
173 if (matcher.matches()) {
174 return new MediaType(
175 matcher.group(1), matcher.group(2),
176 parseParameters(matcher.group(3)));
177 }
178 matcher = CHARSET_FIRST_PATTERN.matcher(string);
179 if (matcher.matches()) {
180 return new MediaType(
181 matcher.group(2), matcher.group(3),
182 parseParameters(matcher.group(1)));
183 }
184
185 return null;
186 }
187
188 private static boolean isSimpleName(String name) {
189 for (int i = 0; i < name.length(); i++) {
190 char c = name.charAt(i);
191 if (c != '-' && c != '+' && c != '.' && c != '_'
192 && !('0' <= c && c <= '9')
193 && !('a' <= c && c <= 'z')) {
194 return false;
195 }
196 }
197 return name.length() > 0;
198 }
199
200 private static Map<String, String> parseParameters(String string) {
201 if (string.length() == 0) {
202 return Collections.<String, String>emptyMap();
203 }
204
205 // Extracts k1=v1, k2=v2 from mime/type; k1=v1; k2=v2
206 // Note - this logic isn't fully RFC2045 compliant yet, as it
207 // doesn't fully handle quoted keys or values (eg containing ; or =)
208 Map<String, String> parameters = new HashMap<String, String>();
209 while (string.length() > 0) {
210 String key = string;
211 String value = "";
212
213 int semicolon = string.indexOf(';');
214 if (semicolon != -1) {
215 key = string.substring(0, semicolon);
216 string = string.substring(semicolon + 1);
217 } else {
218 string = "";
219 }
220
221 int equals = key.indexOf('=');
222 if (equals != -1) {
223 value = key.substring(equals + 1);
224 key = key.substring(0, equals);
225 }
226
227 key = key.trim();
228 if (key.length() > 0) {
229 parameters.put(key, unquote(value.trim()));
230 }
231 }
232 return parameters;
233 }
234
235 /**
236 * Fuzzy unquoting mechanism that works also with somewhat malformed
237 * quotes.
238 *
239 * @param s string to unquote
240 * @return unquoted string
241 */
242 private static String unquote(String s) {
243 while (s.startsWith("\"") || s.startsWith("'")) {
244 s = s.substring(1);
245 }
246 while (s.endsWith("\"") || s.endsWith("'")) {
247 s = s.substring(0, s.length() - 1);
248 }
249 return s;
250 }
251
252 /**
253 * Canonical string representation of this media type.
254 */
255 private final String string;
256
257 /**
258 * Location of the "/" character separating the type and the subtype
259 * tokens in {@link #string}.
260 */
261 private final int slash;
262
263 /**
264 * Location of the first ";" character separating the type part of
265 * {@link #string} from possible parameters. Length of {@link #string}
266 * in case there are no parameters.
267 */
268 private final int semicolon;
269
270 /**
271 * Immutable sorted map of media type parameters.
272 */
273 private final Map<String, String> parameters;
274
275 public MediaType(
276 String type, String subtype, Map<String, String> parameters) {
277 type = type.trim().toLowerCase(Locale.ENGLISH);
278 subtype = subtype.trim().toLowerCase(Locale.ENGLISH);
279
280 this.slash = type.length();
281 this.semicolon = slash + 1 + subtype.length();
282
283 if (parameters.isEmpty()) {
284 this.parameters = Collections.emptyMap();
285 this.string = type + '/' + subtype;
286 } else {
287 StringBuilder builder = new StringBuilder();
288 builder.append(type);
289 builder.append('/');
290 builder.append(subtype);
291
292 SortedMap<String, String> map = new TreeMap<String, String>();
293 for (Map.Entry<String, String> entry : parameters.entrySet()) {
294 String key = entry.getKey().trim().toLowerCase(Locale.ENGLISH);
295 map.put(key, entry.getValue());
296 }
297 for (Map.Entry<String, String> entry : map.entrySet()) {
298 builder.append("; ");
299 builder.append(entry.getKey());
300 builder.append("=");
301 String value = entry.getValue();
302 if (SPECIAL_OR_WHITESPACE.matcher(value).find()) {
303 builder.append('"');
304 builder.append(SPECIAL.matcher(value).replaceAll("\\\\$0"));
305 builder.append('"');
306 } else {
307 builder.append(value);
308 }
309 }
310
311 this.string = builder.toString();
312 this.parameters = Collections.unmodifiableSortedMap(map);
313 }
314 }
315
/**
 * Creates a media type without parameters.
 *
 * @param type top-level type, such as "text"
 * @param subtype subtype, such as "plain"
 */
public MediaType(String type, String subtype) {
    this(type, subtype, Collections.<String, String>emptyMap());
}
319
320 private MediaType(String string, int slash) {
321 assert slash != -1;
322 assert string.charAt(slash) == '/';
323 assert isSimpleName(string.substring(0, slash));
324 assert isSimpleName(string.substring(slash + 1));
325 this.string = string;
326 this.slash = slash;
327 this.semicolon = string.length();
328 this.parameters = Collections.emptyMap();
329 }
330
331 private static Map<String, String> union(
332 Map<String, String> a, Map<String, String> b) {
333 if (a.isEmpty()) {
334 return b;
335 } else if (b.isEmpty()) {
336 return a;
337 } else {
338 Map<String, String> union = new HashMap<String, String>();
339 union.putAll(a);
340 union.putAll(b);
341 return union;
342 }
343 }
344
/**
 * Creates a media type by adding the given parameters to a base type.
 * Parameters already present on the base type are kept; a given
 * parameter replaces an existing one with the same name.
 *
 * @param type base type
 * @param parameters parameters to add
 */
public MediaType(MediaType type, Map<String, String> parameters) {
    this(
            type.getType(),
            type.getSubtype(),
            union(type.parameters, parameters));
}
349
/**
 * Creates a media type by adding a single parameter to a base type.
 *
 * @param type base type
 * @param name parameter name
 * @param value parameter value
 * @since Apache Tika 1.2
 */
public MediaType(MediaType type, String name, String value) {
    this(type, Collections.singletonMap(name, value));
}
361
/**
 * Creates a media type by adding a "charset" parameter, set to the name
 * of the given charset, to a base type.
 *
 * @param type base type
 * @param charset charset whose {@link Charset#name() name} becomes the
 *                parameter value
 * @since Apache Tika 1.2
 */
public MediaType(MediaType type, Charset charset) {
    this(type, "charset", charset.name());
}
372 /**
373 * Returns the base form of the MediaType, excluding
374 * any parameters, such as "text/plain" for
375 * "text/plain; charset=utf-8"
376 */
377 public MediaType getBaseType() {
378 if (parameters.isEmpty()) {
379 return this;
380 } else {
381 return MediaType.parse(string.substring(0, semicolon));
382 }
383 }
384
385 /**
386 * Return the Type of the MediaType, such as
387 * "text" for "text/plain"
388 */
389 public String getType() {
390 return string.substring(0, slash);
391 }
392
393 /**
394 * Return the Sub-Type of the MediaType,
395 * such as "plain" for "text/plain"
396 */
397 public String getSubtype() {
398 return string.substring(slash + 1, semicolon);
399 }
400
401 /**
402 * Checks whether this media type contains parameters.
403 *
404 * @since Apache Tika 0.8
405 * @return <code>true</code> if this type has one or more parameters,
406 * <code>false</code> otherwise
407 */
408 public boolean hasParameters() {
409 return !parameters.isEmpty();
410 }
411
412 /**
413 * Returns an immutable sorted map of the parameters of this media type.
414 * The parameter names are guaranteed to be trimmed and in lower case.
415 *
416 * @return sorted map of parameters
417 */
418 public Map<String, String> getParameters() {
419 return parameters;
420 }
421
422 public String toString() {
423 return string;
424 }
425
426 public boolean equals(Object object) {
427 if (object instanceof MediaType) {
428 MediaType that = (MediaType) object;
429 return string.equals(that.string);
430 } else {
431 return false;
432 }
433 }
434
435 public int hashCode() {
436 return string.hashCode();
437 }
438
439 public int compareTo(MediaType that) {
440 return string.compareTo(that.string);
441 }
442
443 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.mime;
17
18 import java.io.Serializable;
19 import java.util.HashMap;
20 import java.util.Map;
21 import java.util.SortedSet;
22 import java.util.TreeSet;
23
24 /**
25 * Registry of known Internet media types.
26 */
27 public class MediaTypeRegistry implements Serializable {
28
29 /** Serial version UID */
30 private static final long serialVersionUID = 4710974869988895410L;
31
32 /**
33 * Returns the built-in media type registry included in Tika.
34 *
35 * @since Apache Tika 0.8
36 * @return default media type registry
37 */
38 public static MediaTypeRegistry getDefaultRegistry() {
39 return MimeTypes.getDefaultMimeTypes().getMediaTypeRegistry();
40 }
41
42 /**
43 * Registry of known media types, including type aliases. A canonical
44 * media type is handled as an identity mapping, while an alias is stored
45 * as a mapping from the alias to the corresponding canonical type.
46 */
47 private final Map<MediaType, MediaType> registry =
48 new HashMap<MediaType, MediaType>();
49
50 /**
51 * Known type inheritance relationships. The mapping is from a media type
52 * to the closest supertype.
53 */
54 private final Map<MediaType, MediaType> inheritance =
55 new HashMap<MediaType, MediaType>();
56
57 /**
58 * Returns the set of all known canonical media types. Type aliases are
59 * not included in the returned set.
60 *
61 * @since Apache Tika 0.8
62 * @return canonical media types
63 */
64 public SortedSet<MediaType> getTypes() {
65 return new TreeSet<MediaType>(registry.values());
66 }
67
68 /**
69 * Returns the set of known aliases of the given canonical media type.
70 *
71 * @since Apache Tika 0.8
72 * @param type canonical media type
73 * @return known aliases
74 */
75 public SortedSet<MediaType> getAliases(MediaType type) {
76 SortedSet<MediaType> aliases = new TreeSet<MediaType>();
77 for (Map.Entry<MediaType, MediaType> entry : registry.entrySet()) {
78 if (entry.getValue().equals(type) && !entry.getKey().equals(type)) {
79 aliases.add(entry.getKey());
80 }
81 }
82 return aliases;
83 }
84
85 public void addType(MediaType type) {
86 registry.put(type, type);
87 }
88
89 public void addAlias(MediaType type, MediaType alias) {
90 registry.put(alias, type);
91 }
92
93 public void addSuperType(MediaType type, MediaType supertype) {
94 inheritance.put(type, supertype);
95 }
96
97 public MediaType normalize(MediaType type) {
98 if (type == null) {
99 return null;
100 }
101 MediaType canonical = registry.get(type.getBaseType());
102 if (canonical == null) {
103 return type;
104 } else if (type.hasParameters()) {
105 return new MediaType(canonical, type.getParameters());
106 } else {
107 return canonical;
108 }
109 }
110
111 /**
112 * Checks whether the given media type a is a specialization of a more
113 * generic type b. Both types should be already normalised.
114 *
115 * @since Apache Tika 0.8
116 * @param a media type, normalised
117 * @param b suspected supertype, normalised
118 * @return <code>true</code> if b is a supertype of a,
119 * <code>false</code> otherwise
120 */
121 public boolean isSpecializationOf(MediaType a, MediaType b) {
122 return isInstanceOf(getSupertype(a), b);
123 }
124
125 /**
126 * Checks whether the given media type equals the given base type or
127 * is a specialization of it. Both types should be already normalised.
128 *
129 * @since Apache Tika 1.2
130 * @param a media type, normalised
131 * @param b base type, normalised
132 * @return <code>true</code> if b equals a or is a specialization of it,
133 * <code>false</code> otherwise
134 */
135 public boolean isInstanceOf(MediaType a, MediaType b) {
136 return a != null && (a.equals(b) || isSpecializationOf(a, b));
137 }
138
139 /**
140 * Parses and normalises the given media type string and checks whether
141 * the result equals the given base type or is a specialization of it.
142 * The given base type should already be normalised.
143 *
144 * @since Apache Tika 1.2
145 * @param a media type
146 * @param b base type, normalised
147 * @return <code>true</code> if b equals a or is a specialization of it,
148 * <code>false</code> otherwise
149 */
150 public boolean isInstanceOf(String a, MediaType b) {
151 return isInstanceOf(normalize(MediaType.parse(a)), b);
152 }
153
154 /**
155 * Returns the supertype of the given type. If the given type has any
156 * parameters, then the respective base type is returned. Otherwise
157 * built-in heuristics like text/... -&gt; text/plain and
158 * .../...+xml -&gt; application/xml are used in addition to explicit
159 * type inheritance rules read from the media type database. Finally
160 * application/octet-stream is returned for all types for which no other
161 * supertype is known, and the return value for application/octet-stream
162 * is <code>null</code>.
163 *
164 * @since Apache Tika 0.8
165 * @param type media type
166 * @return supertype, or <code>null</code> for application/octet-stream
167 */
168 public MediaType getSupertype(MediaType type) {
169 if (type == null) {
170 return null;
171 } else if (type.hasParameters()) {
172 return type.getBaseType();
173 } else if (inheritance.containsKey(type)) {
174 return inheritance.get(type);
175 } else if (type.getSubtype().endsWith("+xml")) {
176 return MediaType.APPLICATION_XML;
177 } else if (type.getSubtype().endsWith("+zip")) {
178 return MediaType.APPLICATION_ZIP;
179 } else if ("text".equals(type.getType())
180 && !MediaType.TEXT_PLAIN.equals(type)) {
181 return MediaType.TEXT_PLAIN;
182 } else if (!MediaType.OCTET_STREAM.equals(type)) {
183 return MediaType.OCTET_STREAM;
184 } else {
185 return null;
186 }
187 }
188
189 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.mime;
17
18 import java.io.Serializable;
19 import java.net.URI;
20 import java.util.ArrayList;
21 import java.util.Arrays;
22 import java.util.Collections;
23 import java.util.List;
24
25 /**
26 * Internet media type.
27 */
28 public final class MimeType implements Comparable<MimeType>, Serializable {
29
30 /**
31 * Serial version UID.
32 */
33 private static final long serialVersionUID = 4357830439860729201L;
34
35 /**
36 * Checks that the given string is a valid Internet media type name
37 * based on rules from RFC 2054 section 5.3. For validation purposes the
38 * rules can be simplified to the following:
39 * <pre>
40 * name := token "/" token
41 * token := 1*&lt;any (US-ASCII) CHAR except SPACE, CTLs, or tspecials&gt;
42 * tspecials := "(" / ")" / "&lt;" / "&gt;" / "@" / "," / ";" / ":" /
43 * "\" / <"> / "/" / "[" / "]" / "?" / "="
44 * </pre>
45 *
46 * @param name name string
47 * @return <code>true</code> if the string is a valid media type name,
48 * <code>false</code> otherwise
49 */
50 public static boolean isValid(String name) {
51 if (name == null) {
52 throw new IllegalArgumentException("Name is missing");
53 }
54
55 boolean slash = false;
56 for (int i = 0; i < name.length(); i++) {
57 char ch = name.charAt(i);
58 if (ch <= ' ' || ch >= 127 || ch == '(' || ch == ')' ||
59 ch == '<' || ch == '>' || ch == '@' || ch == ',' ||
60 ch == ';' || ch == ':' || ch == '\\' || ch == '"' ||
61 ch == '[' || ch == ']' || ch == '?' || ch == '=') {
62 return false;
63 } else if (ch == '/') {
64 if (slash || i == 0 || i + 1 == name.length()) {
65 return false;
66 }
67 slash = true;
68 }
69 }
70 return slash;
71 }
72
73 /**
74 * The normalized media type name.
75 */
76 private final MediaType type;
77
78 /**
79 * The MimeType acronym
80 */
81 private String acronym = "";
82
83 /**
84 * The http://en.wikipedia.org/wiki/Uniform_Type_Identifier
85 */
86 private String uti = "";
87
88 /**
89 * Documentation Links
90 */
91 private List<URI> links = Collections.emptyList();
92
93 /**
94 * Description of this media type.
95 */
96 private String description = "";
97
98 /** The magics associated to this Mime-Type */
99 private List<Magic> magics = null;
100
101 /** The root-XML associated to this Mime-Type */
102 private List<RootXML> rootXML = null;
103
104 /** The minimum length of data to provides for magic analyzis */
105 private int minLength = 0;
106
107 /**
108 * All known file extensions of this type, in order of preference
109 * (best first).
110 */
111 private List<String> extensions = null;
112
113 /**
114 * Creates a media type with the give name and containing media type
115 * registry. The name is expected to be valid and normalized to lower
116 * case. This constructor should only be called by
117 * {@link MimeTypes#forName(String)} to keep the media type registry
118 * up to date.
119 *
120 * @param type normalized media type name
121 */
122 MimeType(MediaType type) {
123 if (type == null) {
124 throw new IllegalArgumentException("Media type name is missing");
125 }
126 this.type = type;
127 }
128
129 /**
130 * Returns the normalized media type name.
131 *
132 * @return media type
133 */
134 public MediaType getType() {
135 return type;
136 }
137
138 /**
139 * Returns the name of this media type.
140 *
141 * @return media type name (lower case)
142 */
143 public String getName() {
144 return type.toString();
145 }
146
147 /**
148 * Returns the description of this media type.
149 *
150 * @return media type description
151 */
152 public String getDescription() {
153 return description;
154 }
155
156 /**
157 * Set the description of this media type.
158 *
159 * @param description media type description
160 */
161 public void setDescription(String description) {
162 if (description == null) {
163 throw new IllegalArgumentException("Description is missing");
164 }
165 this.description = description;
166 }
167
168
169 /**
170 * Returns an acronym for this mime type.
171 *
172 * @return mime type acronym
173 */
174 public String getAcronym() {
175 return acronym;
176 }
177
178 /**
179 * Set an acronym for the mime type
180 *
181 * @param acronym
182 */
183 void setAcronym(String v) {
184 if (v == null) {
185 throw new IllegalArgumentException("Acronym is missing");
186 }
187 acronym = v;
188 }
189
190 /**
191 * Get the UTI for this mime type.
192 *
193 * @see http://en.wikipedia.org/wiki/Uniform_Type_Identifier
194 *
195 * @return The Uniform Type Identifier
196 */
197 public String getUniformTypeIdentifier() {
198 return uti;
199 }
200
201 /**
202 * Set The Uniform Type Identifier
203 *
204 * @param uti
205 */
206 void setUniformTypeIdentifier(String v) {
207 if (v == null) {
208 throw new IllegalArgumentException("Uniform Type Identifier is missing");
209 }
210 uti = v;
211 }
212
213 /**
214 * Get a list of links to help document this mime type
215 *
216 * @return an array of links (will never be null)
217 */
218 public List<URI> getLinks() {
219 return links; // this is already unmodifiable
220 }
221
222 /**
223 * Add a link to this mime type
224 * @param link
225 */
226 void addLink(URI link) {
227 if(link==null) {
228 throw new IllegalArgumentException("Missing Link");
229 }
230 List<URI> copy = new ArrayList<URI>(links.size()+1);
231 copy.addAll(links);
232 copy.add(link);
233 links = Collections.unmodifiableList(copy);
234 }
235
236
237 /**
238 * Add some rootXML info to this mime-type
239 *
240 * @param namespaceURI
241 * @param localName
242 */
243 void addRootXML(String namespaceURI, String localName) {
244 if (rootXML == null) {
245 rootXML = new ArrayList<RootXML>();
246 }
247 rootXML.add(new RootXML(this, namespaceURI, localName));
248 }
249
250 boolean matchesXML(String namespaceURI, String localName) {
251 if (rootXML != null) {
252 for (RootXML xml : rootXML) {
253 if (xml.matches(namespaceURI, localName)) {
254 return true;
255 }
256 }
257 }
258 return false;
259 }
260
261 boolean hasRootXML() {
262 return rootXML != null;
263 }
264
265 List<Magic> getMagics() {
266 if (magics != null) {
267 return magics;
268 } else {
269 return Collections.emptyList();
270 }
271 }
272
273 void addMagic(Magic magic) {
274 if (magic == null) {
275 return;
276 }
277 if (magics == null) {
278 magics = new ArrayList<Magic>();
279 }
280 magics.add(magic);
281 }
282
283 int getMinLength() {
284 return minLength;
285 }
286
287 public boolean hasMagic() {
288 return magics != null;
289 }
290
291 public boolean matchesMagic(byte[] data) {
292 for (int i = 0; magics != null && i < magics.size(); i++) {
293 Magic magic = magics.get(i);
294 if (magic.eval(data)) {
295 return true;
296 }
297 }
298 return false;
299 }
300
301 public boolean matches(byte[] data) {
302 return matchesMagic(data);
303 }
304
305 /**
306 * Defines a RootXML description. RootXML is made of a localName and/or a
307 * namespaceURI.
308 */
309 static class RootXML implements Serializable {
310
311 /**
312 * Serial version UID.
313 */
314 private static final long serialVersionUID = 5140496601491000730L;
315
316 private MimeType type = null;
317
318 private String namespaceURI = null;
319
320 private String localName = null;
321
322 RootXML(MimeType type, String namespaceURI, String localName) {
323 if (isEmpty(namespaceURI) && isEmpty(localName)) {
324 throw new IllegalArgumentException(
325 "Both namespaceURI and localName cannot be empty");
326 }
327 this.type = type;
328 this.namespaceURI = namespaceURI;
329 this.localName = localName;
330 }
331
332 boolean matches(String namespaceURI, String localName) {
333 //Compare namespaces
334 if (!isEmpty(this.namespaceURI)) {
335 if (!this.namespaceURI.equals(namespaceURI)) {
336 return false;
337 }
338 }
339 else{
340 // else if it was empty then check to see if the provided namespaceURI
341 // is empty. If it is not, then these two aren't equal and return false
342 if(!isEmpty(namespaceURI)){
343 return false;
344 }
345 }
346
347 //Compare root element's local name
348 if (!isEmpty(this.localName)) {
349 if (!this.localName.equals(localName)) {
350 return false;
351 }
352 }
353 else{
354 // else if it was empty then check to see if the provided localName
355 // is empty. If it is not, then these two aren't equal and return false
356 if(!isEmpty(localName)){
357 return false;
358 }
359 }
360 return true;
361 }
362
363 /**
364 * Checks if a string is null or empty.
365 */
366 private boolean isEmpty(String str) {
367 return (str == null) || (str.equals(""));
368 }
369
370 MimeType getType() {
371 return type;
372 }
373
374 String getNameSpaceURI() {
375 return namespaceURI;
376 }
377
378 String getLocalName() {
379 return localName;
380 }
381
382 public String toString() {
383 return type + ", " + namespaceURI + ", " + localName;
384 }
385 }
386
387 //----------------------------------------------------------< Comparable >
388
389 public int compareTo(MimeType mime) {
390 return type.compareTo(mime.type);
391 }
392
393 //--------------------------------------------------------------< Object >
394
395 public boolean equals(Object o) {
396 if (o instanceof MimeType) {
397 MimeType that = (MimeType) o;
398 return this.type.equals(that.type);
399 }
400
401 return false;
402 }
403
404 public int hashCode() {
405 return type.hashCode();
406 }
407
408 /**
409 * Returns the name of this media type.
410 *
411 * @return media type name
412 */
413 public String toString() {
414 return type.toString();
415 }
416
417 /**
418 * Returns the preferred file extension of this type, or an empty string
419 * if no extensions are known. Use the {@link #getExtensions()} method to
420 * get the full list of known extensions of this type.
421 *
422 * @since Apache Tika 0.9
423 * @return preferred file extension or empty string
424 */
425 public String getExtension() {
426 if (extensions == null) {
427 return "";
428 } else {
429 return extensions.get(0);
430 }
431 }
432
433 /**
434 * Returns the list of all known file extensions of this media type.
435 *
436 * @since Apache Tika 0.10
437 * @return known extensions in order of preference (best first)
438 */
439 public List<String> getExtensions() {
440 if (extensions != null) {
441 return Collections.unmodifiableList(extensions);
442 } else {
443 return Collections.emptyList();
444 }
445 }
446
447 /**
448 * Adds a known file extension to this type.
449 *
450 * @param extension file extension
451 */
452 void addExtension(String extension) {
453 if (extensions == null) {
454 extensions = Collections.singletonList(extension);
455 } else if (extensions.size() == 1) {
456 extensions = new ArrayList<String>(extensions);
457 }
458 if (!extensions.contains(extension)) {
459 extensions.add(extension);
460 }
461 }
462
463 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.mime;
17
18 import org.apache.tika.exception.TikaException;
19
20 /**
21 * A class to encapsulate MimeType related exceptions.
22 */
23 public class MimeTypeException extends TikaException {
24
25 /**
26 * Constructs a MimeTypeException with the specified detail message.
27 *
28 * @param message the detail message.
29 */
30 public MimeTypeException(String message) {
31 super(message);
32 }
33
34 /**
35 * Constructs a MimeTypeException with the specified detail message
36 * and root cause.
37 *
38 * @param message the detail message.
39 * @param cause root cause
40 */
41 public MimeTypeException(String message, Throwable cause) {
42 super(message, cause);
43 }
44
45 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.mime;
17
18 // JDK imports
19 import java.io.ByteArrayInputStream;
20 import java.io.File;
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.io.Serializable;
24 import java.net.URI;
25 import java.net.URISyntaxException;
26 import java.util.ArrayList;
27 import java.util.Collections;
28 import java.util.HashMap;
29 import java.util.List;
30 import java.util.Locale;
31 import java.util.Map;
32
33 import javax.xml.namespace.QName;
34
35 import org.apache.tika.Tika;
36 import org.apache.tika.detect.Detector;
37 import org.apache.tika.detect.TextDetector;
38 import org.apache.tika.detect.XmlRootExtractor;
39 import org.apache.tika.metadata.Metadata;
40
41 /**
42 * This class is a MimeType repository. It gathers a set of MimeTypes and
43 * makes it possible to retrieve a content type from its name, from a file
44 * name, or from a magic character sequence.
45 * <p>
46 * The MIME type detection methods that take an {@link InputStream} as
47 * an argument will never read more than {@link #getMinLength()} bytes
48 * from the stream. Also the given stream is never
49 * {@link InputStream#close() closed}, {@link InputStream#mark(int) marked},
50 * or {@link InputStream#reset() reset} by the methods. Thus a client can
51 * use the {@link InputStream#markSupported() mark feature} of the stream
52 * (if available) to restore the stream back to the state it was before type
53 * detection if it wants to process the stream based on the detected type.
54 */
55 public final class MimeTypes implements Detector, Serializable {
56
57 /**
58 * Serial version UID.
59 */
60 private static final long serialVersionUID = -1350863170146349036L;
61
62 /**
63 * Name of the {@link #rootMimeType root} type, application/octet-stream.
64 */
65 public static final String OCTET_STREAM = "application/octet-stream";
66
67 /**
68 * Name of the {@link #textMimeType text} type, text/plain.
69 */
70 public static final String PLAIN_TEXT = "text/plain";
71
72 /**
73 * Name of the {@link #xml xml} type, application/xml.
74 */
75 public static final String XML = "application/xml";
76
77 /**
78 * Root type, application/octet-stream.
79 */
80 private final MimeType rootMimeType;
81
82 /**
83 * Text type, text/plain.
84 */
85 private final MimeType textMimeType;
86
87 /*
88 * xml type, application/xml
89 */
90 private final MimeType xmlMimeType;
91
92 /**
93 * Registered media types and their aliases.
94 */
95 private final MediaTypeRegistry registry = new MediaTypeRegistry();
96
97 /** All the registered MimeTypes indexed on their canonical names */
98 private final Map<MediaType, MimeType> types =
99 new HashMap<MediaType, MimeType>();
100
101 /** The patterns matcher */
102 private Patterns patterns = new Patterns(registry);
103
104 /** Sorted list of all registered magics */
105 private final List<Magic> magics = new ArrayList<Magic>();
106
107 /** Sorted list of all registered rootXML */
108 private final List<MimeType> xmls = new ArrayList<MimeType>();
109
/**
 * Creates a registry that initially contains only the three base types
 * (root, text and XML), each of which is registered via
 * {@link #add(MimeType)}.
 */
public MimeTypes() {
    rootMimeType = new MimeType(MediaType.OCTET_STREAM);
    add(rootMimeType);

    textMimeType = new MimeType(MediaType.TEXT_PLAIN);
    add(textMimeType);

    xmlMimeType = new MimeType(MediaType.APPLICATION_XML);
    add(xmlMimeType);
}
119
120 /**
121 * Find the Mime Content Type of a document from its name.
122 * Returns application/octet-stream if no better match is found.
123 *
124 * @deprecated Use {@link Tika#detect(String)} instead
125 * @param name of the document to analyze.
126 * @return the Mime Content Type of the specified document name
127 */
128 public MimeType getMimeType(String name) {
129 MimeType type = patterns.matches(name);
130 if (type != null) {
131 return type;
132 }
133 type = patterns.matches(name.toLowerCase(Locale.ENGLISH));
134 if (type != null) {
135 return type;
136 } else {
137 return rootMimeType;
138 }
139 }
140
141 /**
142 * Find the Mime Content Type of a document stored in the given file.
143 * Returns application/octet-stream if no better match is found.
144 *
145 * @deprecated Use {@link Tika#detect(File)} instead
146 * @param file file to analyze
147 * @return the Mime Content Type of the specified document
148 * @throws MimeTypeException if the type can't be detected
149 * @throws IOException if the file can't be read
150 */
151 public MimeType getMimeType(File file)
152 throws MimeTypeException, IOException {
153 return forName(new Tika(this).detect(file));
154 }
155
156 /**
157 * Returns the MIME type that best matches the given first few bytes
158 * of a document stream. Returns application/octet-stream if no better
159 * match is found.
160 * <p>
161 * The given byte array is expected to be at least {@link #getMinLength()}
162 * long, or shorter only if the document stream itself is shorter.
163 *
164 * @param data first few bytes of a document stream
165 * @return matching MIME type
166 */
167 private MimeType getMimeType(byte[] data) {
168 if (data == null) {
169 throw new IllegalArgumentException("Data is missing");
170 } else if (data.length == 0) {
171 // See https://issues.apache.org/jira/browse/TIKA-483
172 return rootMimeType;
173 }
174
175 // Then, check for magic bytes
176 MimeType result = null;
177 for (Magic magic : magics) {
178 if (magic.eval(data)) {
179 result = magic.getType();
180 break;
181 }
182 }
183
184 if (result != null) {
185 // When detecting generic XML (or possibly XHTML),
186 // extract the root element and match it against known types
187 if ("application/xml".equals(result.getName())
188 || "text/html".equals(result.getName())) {
189 XmlRootExtractor extractor = new XmlRootExtractor();
190
191 QName rootElement = extractor.extractRootElement(data);
192 if (rootElement != null) {
193 for (MimeType type : xmls) {
194 if (type.matchesXML(
195 rootElement.getNamespaceURI(),
196 rootElement.getLocalPart())) {
197 result = type;
198 break;
199 }
200 }
201 } else if ("application/xml".equals(result.getName())) {
202 // Downgrade from application/xml to text/plain since
203 // the document seems not to be well-formed.
204 result = textMimeType;
205 }
206 }
207 return result;
208 }
209
210 // Finally, assume plain text if no control bytes are found
211 try {
212 TextDetector detector = new TextDetector(getMinLength());
213 ByteArrayInputStream stream = new ByteArrayInputStream(data);
214 return forName(detector.detect(stream, new Metadata()).toString());
215 } catch (Exception e) {
216 return rootMimeType;
217 }
218 }
219
220 /**
221 * Reads the first {@link #getMinLength()} bytes from the given stream.
222 * If the stream is shorter, then the entire content of the stream is
223 * returned.
224 * <p>
225 * The given stream is never {@link InputStream#close() closed},
226 * {@link InputStream#mark(int) marked}, or
227 * {@link InputStream#reset() reset} by this method.
228 *
229 * @param stream stream to be read
230 * @return first {@link #getMinLength()} (or fewer) bytes of the stream
231 * @throws IOException if the stream can not be read
232 */
233 private byte[] readMagicHeader(InputStream stream) throws IOException {
234 if (stream == null) {
235 throw new IllegalArgumentException("InputStream is missing");
236 }
237
238 byte[] bytes = new byte[getMinLength()];
239 int totalRead = 0;
240
241 int lastRead = stream.read(bytes);
242 while (lastRead != -1) {
243 totalRead += lastRead;
244 if (totalRead == bytes.length) {
245 return bytes;
246 }
247 lastRead = stream.read(bytes, totalRead, bytes.length - totalRead);
248 }
249
250 byte[] shorter = new byte[totalRead];
251 System.arraycopy(bytes, 0, shorter, 0, totalRead);
252 return shorter;
253 }
254
255 /**
256 * Returns the registered media type with the given name (or alias).
257 * The named media type is automatically registered (and returned) if
258 * it doesn't already exist.
259 *
260 * @param name media type name (case-insensitive)
261 * @return the registered media type with the given name or alias
262 * @throws MimeTypeException if the given media type name is invalid
263 */
264 public MimeType forName(String name) throws MimeTypeException {
265 MediaType type = MediaType.parse(name);
266 if (type != null) {
267 MediaType normalisedType = registry.normalize(type);
268 MimeType mime = types.get(normalisedType);
269
270 if (mime == null) {
271 synchronized (this) {
272 // Double check it didn't already get added while
273 // we were waiting for the lock
274 mime = types.get(normalisedType);
275 if (mime == null) {
276 mime = new MimeType(type);
277 add(mime);
278 types.put(type, mime);
279 }
280 }
281 }
282 return mime;
283 } else {
284 throw new MimeTypeException("Invalid media type name: " + name);
285 }
286 }
287
288 /**
289 * Returns the registered media type with the given name (or alias).
290 *
291 * Unlike {@link #forName(String)}, this function will *not* create a new
292 * MimeType and register it
293 *
294 * @param name media type name (case-insensitive)
295 * @return the registered media type with the given name or alias
296 * @throws MimeTypeException if the given media type name is invalid
297 */
298 public MimeType getRegisteredMimeType(String name) throws MimeTypeException {
299 MediaType type = MediaType.parse(name);
300 if (type != null) {
301 MediaType normalisedType = registry.normalize(type);
302 return types.get(normalisedType);
303 } else {
304 throw new MimeTypeException("Invalid media type name: " + name);
305 }
306 }
307
308 public synchronized void setSuperType(MimeType type, MediaType parent) {
309 registry.addSuperType(type.getType(), parent);
310 }
311
312 /**
313 * Adds an alias for the given media type. This method should only
314 * be called from {@link MimeType#addAlias(String)}.
315 *
316 * @param type media type
317 * @param alias media type alias (normalized to lower case)
318 */
319 synchronized void addAlias(MimeType type, MediaType alias) {
320 registry.addAlias(type.getType(), alias);
321 }
322
323 /**
324 * Adds a file name pattern for the given media type. Assumes that the
325 * pattern being added is <b>not</b> a JDK standard regular expression.
326 *
327 * @param type
328 * media type
329 * @param pattern
330 * file name pattern
331 * @throws MimeTypeException
332 * if the pattern conflicts with existing ones
333 */
334 public void addPattern(MimeType type, String pattern)
335 throws MimeTypeException {
336 this.addPattern(type, pattern, false);
337 }
338
339 /**
340 * Adds a file name pattern for the given media type. The caller can specify
341 * whether the pattern being added <b>is</b> or <b>is not</b> a JDK standard
342 * regular expression via the <code>isRegex</code> parameter. If the value
343 * is set to true, then a JDK standard regex is assumed, otherwise the
344 * freedesktop glob type is assumed.
345 *
346 * @param type
347 * media type
348 * @param pattern
349 * file name pattern
350 * @param isRegex
351 * set to true if JDK std regexs are desired, otherwise set to
352 * false.
353 * @throws MimeTypeException
354 * if the pattern conflicts with existing ones.
355 *
356 */
357 public void addPattern(MimeType type, String pattern, boolean isRegex)
358 throws MimeTypeException {
359 patterns.add(pattern, isRegex, type);
360 }
361
362 public MediaTypeRegistry getMediaTypeRegistry() {
363 return registry;
364 }
365
366 /**
367 * Return the minimum length of data to provide to analyzing methods based
368 * on the document's content in order to check all the known MimeTypes.
369 *
370 * @return the minimum length of data to provide.
371 * @see #getMimeType(byte[])
372 * @see #getMimeType(String, byte[])
373 */
374 public int getMinLength() {
375 // This needs to be reasonably large to be able to correctly detect
376 // things like XML root elements after initial comment and DTDs
377 return 64 * 1024;
378 }
379
380 /**
381 * Add the specified mime-type in the repository.
382 *
383 * @param type
384 * is the mime-type to add.
385 */
386 void add(MimeType type) {
387 registry.addType(type.getType());
388 types.put(type.getType(), type);
389
390 // Update the magics index...
391 if (type.hasMagic()) {
392 magics.addAll(type.getMagics());
393 }
394
395 // Update the xml (xmlRoot) index...
396 if (type.hasRootXML()) {
397 xmls.add(type);
398 }
399 }
400
401 /**
402 * Called after all configured types have been loaded.
403 * Initializes the magics and xmls sets.
404 */
405 void init() {
406 for (MimeType type : types.values()) {
407 magics.addAll(type.getMagics());
408 if (type.hasRootXML()) {
409 xmls.add(type);
410 }
411 }
412 Collections.sort(magics);
413 Collections.sort(xmls);
414 }
415
416 /**
417 * Automatically detects the MIME type of a document based on magic
418 * markers in the stream prefix and any given metadata hints.
419 * <p>
420 * The given stream is expected to support marks, so that this method
421 * can reset the stream to the position it was in before this method
422 * was called.
423 *
424 * @param input document stream, or <code>null</code>
425 * @param metadata metadata hints
426 * @return MIME type of the document
427 * @throws IOException if the document stream could not be read
428 */
429 public MediaType detect(InputStream input, Metadata metadata)
430 throws IOException {
431 MediaType type = MediaType.OCTET_STREAM;
432
433 // Get type based on magic prefix
434 if (input != null) {
435 input.mark(getMinLength());
436 try {
437 byte[] prefix = readMagicHeader(input);
438 type = getMimeType(prefix).getType();
439 } finally {
440 input.reset();
441 }
442 }
443
444 // Get type based on resourceName hint (if available)
445 String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
446 if (resourceName != null) {
447 String name = null;
448
449 // Deal with a URI or a path name in as the resource name
450 try {
451 URI uri = new URI(resourceName);
452 String path = uri.getPath();
453 if (path != null) {
454 int slash = path.lastIndexOf('/');
455 if (slash + 1 < path.length()) {
456 name = path.substring(slash + 1);
457 }
458 }
459 } catch (URISyntaxException e) {
460 name = resourceName;
461 }
462
463 if (name != null) {
464 MediaType hint = getMimeType(name).getType();
465 if (registry.isSpecializationOf(hint, type)) {
466 type = hint;
467 }
468 }
469 }
470
471 // Get type based on metadata hint (if available)
472 String typeName = metadata.get(Metadata.CONTENT_TYPE);
473 if (typeName != null) {
474 try {
475 MediaType hint = forName(typeName).getType();
476 if (registry.isSpecializationOf(hint, type)) {
477 type = hint;
478 }
479 } catch (MimeTypeException e) {
480 // Malformed type name, ignore
481 }
482 }
483
484 return type;
485 }
486
487 private static MimeTypes DEFAULT_TYPES = null;
488 private static Map<ClassLoader,MimeTypes> CLASSLOADER_SPECIFIC_DEFAULT_TYPES =
489 new HashMap<ClassLoader, MimeTypes>();
490
491 /**
492 * Get the default MimeTypes. This includes all the build in
493 * media types, and any custom override ones present.
494 *
495 * @return MimeTypes default type registry
496 */
497 public static synchronized MimeTypes getDefaultMimeTypes() {
498 return getDefaultMimeTypes(null);
499 }
500 /**
501 * Get the default MimeTypes. This includes all the built-in
502 * media types, and any custom override ones present.
503 *
504 * @param ClassLoader to use, if not the default
505 * @return MimeTypes default type registry
506 */
507 public static synchronized MimeTypes getDefaultMimeTypes(ClassLoader classLoader) {
508 MimeTypes types = DEFAULT_TYPES;
509 if (classLoader != null) {
510 types = CLASSLOADER_SPECIFIC_DEFAULT_TYPES.get(classLoader);
511 }
512
513 if (types == null) {
514 try {
515 types = MimeTypesFactory.create(
516 "tika-mimetypes.xml", "custom-mimetypes.xml", classLoader);
517 } catch (MimeTypeException e) {
518 throw new RuntimeException(
519 "Unable to parse the default media type registry", e);
520 } catch (IOException e) {
521 throw new RuntimeException(
522 "Unable to read the default media type registry", e);
523 }
524
525 if (classLoader == null) {
526 DEFAULT_TYPES = types;
527 } else {
528 CLASSLOADER_SPECIFIC_DEFAULT_TYPES.put(classLoader, types);
529 }
530 }
531 return types;
532 }
533 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.mime;
17
18 import java.io.InputStream;
19 import java.io.IOException;
20 import java.net.URL;
21 import java.util.ArrayList;
22 import java.util.Collections;
23 import java.util.List;
24
25 import org.w3c.dom.Document;
26
27 /**
28 * Creates instances of MimeTypes.
29 */
30 public class MimeTypesFactory {
31
32 /**
33 * Creates an empty instance; same as calling new MimeTypes().
34 *
35 * @return an empty instance
36 */
37 public static MimeTypes create() {
38 return new MimeTypes();
39 }
40
41 /**
42 * Creates and returns a MimeTypes instance from the specified document.
43 * @throws MimeTypeException if the type configuration is invalid
44 */
45 public static MimeTypes create(Document document) throws MimeTypeException {
46 MimeTypes mimeTypes = new MimeTypes();
47 new MimeTypesReader(mimeTypes).read(document);
48 mimeTypes.init();
49 return mimeTypes;
50 }
51
52 /**
53 * Creates and returns a MimeTypes instance from the specified input stream.
54 * Does not close the input stream(s).
55 * @throws IOException if the stream can not be read
56 * @throws MimeTypeException if the type configuration is invalid
57 */
58 public static MimeTypes create(InputStream... inputStreams)
59 throws IOException, MimeTypeException {
60 MimeTypes mimeTypes = new MimeTypes();
61 MimeTypesReader reader = new MimeTypesReader(mimeTypes);
62 for(InputStream inputStream : inputStreams) {
63 reader.read(inputStream);
64 }
65 mimeTypes.init();
66 return mimeTypes;
67 }
68
69 /** @see #create(InputStream...) */
70 public static MimeTypes create(InputStream stream)
71 throws IOException, MimeTypeException {
72 return create(new InputStream[] { stream });
73 }
74
75 /**
76 * Creates and returns a MimeTypes instance from the resource
77 * at the location specified by the URL. Opens and closes the
78 * InputStream from the URL.
79 * If multiple URLs are supplied, then they are loaded in turn.
80 *
81 * @throws IOException if the URL can not be accessed
82 * @throws MimeTypeException if the type configuration is invalid
83 */
84 public static MimeTypes create(URL... urls)
85 throws IOException, MimeTypeException {
86 InputStream[] streams = new InputStream[urls.length];
87 for(int i=0; i<streams.length; i++) {
88 streams[i] = urls[i].openStream();
89 }
90
91 try {
92 return create(streams);
93 } finally {
94 for(InputStream stream : streams) {
95 stream.close();
96 }
97 }
98 }
99
100 /** @see #create(URL...) */
101 public static MimeTypes create(URL url)
102 throws IOException, MimeTypeException {
103 return create(new URL[] { url });
104 }
105
106 /**
107 * Creates and returns a MimeTypes instance from the specified file path,
108 * as interpreted by the class loader in getResource().
109 *
110 * @throws IOException if the file can not be accessed
111 * @throws MimeTypeException if the type configuration is invalid
112 */
113 public static MimeTypes create(String filePath)
114 throws IOException, MimeTypeException {
115 return create(MimeTypesReader.class.getResource(filePath));
116 }
117
118 /**
119 * Creates and returns a MimeTypes instance. The core mimetypes
120 * will be loaded from the specified file path, and any custom
121 * override mimetypes found will loaded afterwards.
122 * The file paths will be interpreted by the default class loader in
123 * getResource().
124 *
125 * @param coreFilePath The main MimeTypes file to load
126 * @param extensionFilePath The name of extension MimeType files to load afterwards
127 *
128 * @throws IOException if the file can not be accessed
129 * @throws MimeTypeException if the type configuration is invalid
130 */
131 public static MimeTypes create(String coreFilePath, String extensionFilePath)
132 throws IOException, MimeTypeException {
133 return create(coreFilePath, extensionFilePath, null);
134 }
135 /**
136 * Creates and returns a MimeTypes instance. The core mimetypes
137 * will be loaded from the specified file path, and any custom
138 * override mimetypes found will loaded afterwards.
139 * The file paths will be interpreted by the specified class
140 * loader in getResource().
141 *
142 * @param coreFilePath The main MimeTypes file to load
143 * @param extensionFilePath The name of extension MimeType files to load afterwards
144 *
145 * @throws IOException if the file can not be accessed
146 * @throws MimeTypeException if the type configuration is invalid
147 */
148 public static MimeTypes create(String coreFilePath, String extensionFilePath,
149 ClassLoader classLoader) throws IOException, MimeTypeException {
150 // If no specific classloader was requested, use our own class's one
151 if (classLoader == null) {
152 classLoader = MimeTypesReader.class.getClassLoader();
153 }
154
155 // This allows us to replicate class.getResource() when using
156 // the classloader directly
157 String classPrefix = MimeTypesReader.class.getPackage().getName().replace('.', '/') + "/";
158
159 // Get the core URL, and all the extensions URLs
160 URL coreURL = classLoader.getResource(classPrefix+coreFilePath);
161 List<URL> extensionURLs = Collections.list(
162 classLoader.getResources(classPrefix+extensionFilePath));
163
164 // Swap that into an Array, and process
165 List<URL> urls = new ArrayList<URL>();
166 urls.add(coreURL);
167 urls.addAll(extensionURLs);
168
169 return create( urls.toArray(new URL[urls.size()]) );
170 }
171 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.mime;
17
18 import java.io.ByteArrayInputStream;
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.net.URI;
22 import java.net.URISyntaxException;
23 import java.util.ArrayList;
24 import java.util.Collections;
25 import java.util.List;
26
27 import javax.xml.parsers.ParserConfigurationException;
28 import javax.xml.parsers.SAXParser;
29 import javax.xml.parsers.SAXParserFactory;
30 import javax.xml.transform.Transformer;
31 import javax.xml.transform.TransformerException;
32 import javax.xml.transform.TransformerFactory;
33 import javax.xml.transform.dom.DOMSource;
34 import javax.xml.transform.sax.SAXResult;
35
36 import org.w3c.dom.Document;
37 import org.xml.sax.Attributes;
38 import org.xml.sax.InputSource;
39 import org.xml.sax.SAXException;
40 import org.xml.sax.helpers.DefaultHandler;
41
42 /**
43 * A reader for XML files compliant with the freedesktop MIME-info DTD.
44 *
45 * <pre>
46 * &lt;!DOCTYPE mime-info [
47 * &lt;!ELEMENT mime-info (mime-type)+&gt;
48 * &lt;!ATTLIST mime-info xmlns CDATA #FIXED &quot;http://www.freedesktop.org/standards/shared-mime-info&quot;&gt;
49 *
50 * &lt;!ELEMENT mime-type (comment|acronym|expanded-acronym|glob|magic|root-XML|alias|sub-class-of)*&gt;
51 * &lt;!ATTLIST mime-type type CDATA #REQUIRED&gt;
52 *
53 * &lt;!-- a comment describing a document with the respective MIME type. Example: &quot;WMV video&quot; --&gt;
54 * &lt;!ELEMENT _comment (#PCDATA)&gt;
55 * &lt;!ATTLIST _comment xml:lang CDATA #IMPLIED&gt;
56 *
57 * &lt;!-- a comment describing the respective unexpanded MIME type acronym. Example: &quot;WMV&quot; --&gt;
58 * &lt;!ELEMENT acronym (#PCDATA)&gt;
59 * &lt;!ATTLIST acronym xml:lang CDATA #IMPLIED&gt;
60 *
61 * &lt;!-- a comment describing the respective expanded MIME type acronym. Example: &quot;Windows Media Video&quot; --&gt;
62 * &lt;!ELEMENT expanded-acronym (#PCDATA)&gt;
63 * &lt;!ATTLIST expanded-acronym xml:lang CDATA #IMPLIED&gt;
64 *
65 * &lt;!ELEMENT glob EMPTY&gt;
66 * &lt;!ATTLIST glob pattern CDATA #REQUIRED&gt;
67 * &lt;!ATTLIST glob isregex CDATA #IMPLIED&gt;
68 *
69 * &lt;!ELEMENT magic (match)+&gt;
70 * &lt;!ATTLIST magic priority CDATA #IMPLIED&gt;
71 *
72 * &lt;!ELEMENT match (match)*&gt;
73 * &lt;!ATTLIST match offset CDATA #REQUIRED&gt;
74 * &lt;!ATTLIST match type (string|big16|big32|little16|little32|host16|host32|byte) #REQUIRED&gt;
75 * &lt;!ATTLIST match value CDATA #REQUIRED&gt;
76 * &lt;!ATTLIST match mask CDATA #IMPLIED&gt;
77 *
78 * &lt;!ELEMENT root-XML EMPTY&gt;
79 * &lt;!ATTLIST root-XML
80 * namespaceURI CDATA #REQUIRED
81 * localName CDATA #REQUIRED&gt;
82 *
83 * &lt;!ELEMENT alias EMPTY&gt;
84 * &lt;!ATTLIST alias
85 * type CDATA #REQUIRED&gt;
86 *
87 * &lt;!ELEMENT sub-class-of EMPTY&gt;
88 * &lt;!ATTLIST sub-class-of
89 * type CDATA #REQUIRED&gt;
90 * ]&gt;
91 * </pre>
92 *
93 * In addition to the standard fields, this will also read two Tika specific fields:
94 * - link
95 * - uti
96 *
97 *
98 * @see http://freedesktop.org/wiki/Standards_2fshared_2dmime_2dinfo_2dspec
99 */
100 public class MimeTypesReader extends DefaultHandler implements MimeTypesReaderMetKeys {
/** Registry that receives everything this reader parses */
protected final MimeTypes types;

/** Type whose element is currently open, or null when outside one */
protected MimeType type = null;

/** Priority taken from the most recently opened magic element */
protected int priority;

/** Buffer for element text; null while no text is being collected */
protected StringBuilder characters = null;

/**
 * Creates a reader that registers what it reads with the given registry.
 *
 * @param types registry to populate
 */
protected MimeTypesReader(MimeTypes types) {
    this.types = types;
}
113
114 public void read(InputStream stream) throws IOException, MimeTypeException {
115 try {
116 SAXParserFactory factory = SAXParserFactory.newInstance();
117 factory.setNamespaceAware(false);
118 SAXParser parser = factory.newSAXParser();
119 parser.parse(stream, this);
120 } catch (ParserConfigurationException e) {
121 throw new MimeTypeException("Unable to create an XML parser", e);
122 } catch (SAXException e) {
123 throw new MimeTypeException("Invalid type configuration", e);
124 }
125 }
126
127 public void read(Document document) throws MimeTypeException {
128 try {
129 TransformerFactory factory = TransformerFactory.newInstance();
130 Transformer transformer = factory.newTransformer();
131 transformer.transform(new DOMSource(document), new SAXResult(this));
132 } catch (TransformerException e) {
133 throw new MimeTypeException("Failed to parse type registry", e);
134 }
135 }
136
137 @Override
138 public InputSource resolveEntity(String publicId, String systemId) {
139 return new InputSource(new ByteArrayInputStream(new byte[0]));
140 }
141
142 @Override
143 public void startElement(
144 String uri, String localName, String qName,
145 Attributes attributes) throws SAXException {
146 if (type == null) {
147 if (MIME_TYPE_TAG.equals(qName)) {
148 String name = attributes.getValue(MIME_TYPE_TYPE_ATTR);
149 try {
150 type = types.forName(name);
151 } catch (MimeTypeException e) {
152 handleMimeError(name, e, qName, attributes);
153 }
154 }
155 } else if (ALIAS_TAG.equals(qName)) {
156 String alias = attributes.getValue(ALIAS_TYPE_ATTR);
157 types.addAlias(type, MediaType.parse(alias));
158 } else if (SUB_CLASS_OF_TAG.equals(qName)) {
159 String parent = attributes.getValue(SUB_CLASS_TYPE_ATTR);
160 types.setSuperType(type, MediaType.parse(parent));
161 } else if (ACRONYM_TAG.equals(qName)||
162 COMMENT_TAG.equals(qName)||
163 TIKA_LINK_TAG.equals(qName)||
164 TIKA_UTI_TAG.equals(qName)) {
165 characters = new StringBuilder();
166 } else if (GLOB_TAG.equals(qName)) {
167 String pattern = attributes.getValue(PATTERN_ATTR);
168 String isRegex = attributes.getValue(ISREGEX_ATTR);
169 if (pattern != null) {
170 try {
171 types.addPattern(type, pattern, Boolean.valueOf(isRegex));
172 } catch (MimeTypeException e) {
173 handleGlobError(type, pattern, e, qName, attributes);
174 }
175 }
176 } else if (ROOT_XML_TAG.equals(qName)) {
177 String namespace = attributes.getValue(NS_URI_ATTR);
178 String name = attributes.getValue(LOCAL_NAME_ATTR);
179 type.addRootXML(namespace, name);
180 } else if (MATCH_TAG.equals(qName)) {
181 String kind = attributes.getValue(MATCH_TYPE_ATTR);
182 String offset = attributes.getValue(MATCH_OFFSET_ATTR);
183 String value = attributes.getValue(MATCH_VALUE_ATTR);
184 String mask = attributes.getValue(MATCH_MASK_ATTR);
185 if (kind == null) {
186 kind = "string";
187 }
188 current = new ClauseRecord(
189 new MagicMatch(type.getType(), kind, offset, value, mask));
190 } else if (MAGIC_TAG.equals(qName)) {
191 String value = attributes.getValue(MAGIC_PRIORITY_ATTR);
192 if (value != null && value.length() > 0) {
193 priority = Integer.parseInt(value);
194 } else {
195 priority = 50;
196 }
197 current = new ClauseRecord(null);
198 }
199 }
200
201 @Override
202 public void endElement(String uri, String localName, String qName) {
203 if (type != null) {
204 if (MIME_TYPE_TAG.equals(qName)) {
205 type = null;
206 } else if (COMMENT_TAG.equals(qName)) {
207 type.setDescription(characters.toString().trim());
208 characters = null;
209 } else if (ACRONYM_TAG.equals(qName)) {
210 type.setAcronym(characters.toString().trim());
211 characters = null;
212 } else if (TIKA_UTI_TAG.equals(qName)) {
213 type.setUniformTypeIdentifier(characters.toString().trim());
214 characters = null;
215 } else if (TIKA_LINK_TAG.equals(qName)) {
216 try {
217 type.addLink(new URI(characters.toString().trim()));
218 }
219 catch (URISyntaxException e) {
220 throw new IllegalArgumentException("unable to parse link: "+characters, e);
221 }
222 characters = null;
223 } else if (MATCH_TAG.equals(qName)) {
224 current.stop();
225 } else if (MAGIC_TAG.equals(qName)) {
226 for (Clause clause : current.getClauses()) {
227 type.addMagic(new Magic(type, priority, clause));
228 }
229 current = null;
230 }
231 }
232 }
233
234 @Override
235 public void characters(char[] ch, int start, int length) {
236 if (characters != null) {
237 characters.append(ch, start, length);
238 }
239 }
240
241 protected void handleMimeError(String input, MimeTypeException ex, String qName, Attributes attributes) throws SAXException {
242 throw new SAXException(ex);
243 }
244
245 protected void handleGlobError(MimeType type, String pattern, MimeTypeException ex, String qName, Attributes attributes) throws SAXException {
246 throw new SAXException(ex);
247 }
248
249 private ClauseRecord current = new ClauseRecord(null);
250
251 private class ClauseRecord {
252
253 private ClauseRecord parent;
254
255 private Clause clause;
256
257 private List<Clause> subclauses = null;
258
259 public ClauseRecord(Clause clause) {
260 this.parent = current;
261 this.clause = clause;
262 }
263
264 public void stop() {
265 if (subclauses != null) {
266 Clause subclause;
267 if (subclauses.size() == 1) {
268 subclause = subclauses.get(0);
269 } else {
270 subclause = new OrClause(subclauses);
271 }
272 clause = new AndClause(clause, subclause);
273 }
274 if (parent.subclauses == null) {
275 parent.subclauses = Collections.singletonList(clause);
276 } else {
277 if (parent.subclauses.size() == 1) {
278 parent.subclauses = new ArrayList<Clause>(parent.subclauses);
279 }
280 parent.subclauses.add(clause);
281 }
282
283 current = current.parent;
284 }
285
286 public List<Clause> getClauses() {
287 return subclauses;
288 }
289
290 }
291
292 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.mime;
17
/**
 * Element and attribute names used by the {@link MimeTypesReader} when
 * reading a type configuration, grouped by the element they belong to.
 */
public interface MimeTypesReaderMetKeys {

    // --- document root ---

    String MIME_INFO_TAG = "mime-info";

    // --- mime-type element ---

    String MIME_TYPE_TAG = "mime-type";
    String MIME_TYPE_TYPE_ATTR = "type";

    // --- elements whose value is their text content ---

    String ACRONYM_TAG = "acronym";
    String COMMENT_TAG = "_comment";
    String TIKA_LINK_TAG = "tika:link";
    String TIKA_UTI_TAG = "tika:uti";

    // --- glob element ---

    String GLOB_TAG = "glob";
    String PATTERN_ATTR = "pattern";
    String ISREGEX_ATTR = "isregex";

    // --- alias element ---

    String ALIAS_TAG = "alias";
    String ALIAS_TYPE_ATTR = "type";

    // --- sub-class-of element ---

    String SUB_CLASS_OF_TAG = "sub-class-of";
    String SUB_CLASS_TYPE_ATTR = "type";

    // --- root-XML element ---

    String ROOT_XML_TAG = "root-XML";
    String NS_URI_ATTR = "namespaceURI";
    String LOCAL_NAME_ATTR = "localName";

    // --- magic element ---

    String MAGIC_TAG = "magic";
    String MAGIC_PRIORITY_ATTR = "priority";

    // --- match element ---

    String MATCH_TAG = "match";
    String MATCH_OFFSET_ATTR = "offset";
    String MATCH_TYPE_ATTR = "type";
    String MATCH_VALUE_ATTR = "value";
    String MATCH_MASK_ATTR = "mask";
}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.mime;
17
18 import java.util.List;
19
20 class OrClause implements Clause {
21
22 private final List<Clause> clauses;
23
24 OrClause(List<Clause> clauses) {
25 this.clauses = clauses;
26 }
27
28 public boolean eval(byte[] data) {
29 for (Clause clause : clauses) {
30 if (clause.eval(data)) {
31 return true;
32 }
33 }
34 return false;
35 }
36
37 public int size() {
38 int size = 0;
39 for (Clause clause : clauses) {
40 size = Math.max(size, clause.size());
41 }
42 return size;
43 }
44
45 public String toString() {
46 return "or" + clauses;
47 }
48
49 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.mime;
17
18 import java.io.Serializable;
19 import java.util.Comparator;
20 import java.util.HashMap;
21 import java.util.Map;
22 import java.util.SortedMap;
23 import java.util.TreeMap;
24
25 /**
26 * Defines a MimeType pattern.
27 */
28 class Patterns implements Serializable {
29
30 /**
31 * Serial version UID.
32 */
33 private static final long serialVersionUID = -5778015347278111140L;
34
35 private final MediaTypeRegistry registry;
36
37 /**
38 * Index of exact name patterns.
39 */
40 private final Map<String, MimeType> names = new HashMap<String, MimeType>();
41
42 /**
43 * Index of extension patterns of the form "*extension".
44 */
45 private final Map<String, MimeType> extensions =
46 new HashMap<String, MimeType>();
47
48 private int minExtensionLength = Integer.MAX_VALUE;
49
50 private int maxExtensionLength = 0;
51
52 /**
53 * Index of generic glob patterns, sorted by length.
54 */
55 private final SortedMap<String, MimeType> globs =
56 new TreeMap<String, MimeType>(new LengthComparator());
57
58 private static final class LengthComparator
59 implements Comparator<String>, Serializable {
60
61 /**
62 * Serial version UID.
63 */
64 private static final long serialVersionUID = 8468289702915532359L;
65
66 public int compare(String a, String b) {
67 int diff = b.length() - a.length();
68 if (diff == 0) {
69 diff = a.compareTo(b);
70 }
71 return diff;
72 }
73
74 }
75
76 public Patterns(MediaTypeRegistry registry) {
77 this.registry = registry;
78 }
79
80 public void add(String pattern, MimeType type) throws MimeTypeException {
81 this.add(pattern, false, type);
82 }
83
84 public void add(String pattern, boolean isJavaRegex, MimeType type)
85 throws MimeTypeException {
86 if (pattern == null || type == null) {
87 throw new IllegalArgumentException(
88 "Pattern and/or mime type is missing");
89 }
90
91 if (isJavaRegex) {
92 // in this case, we don't need to build a regex pattern
93 // it's already there for us, so just add the pattern as is
94 addGlob(pattern, type);
95 } else {
96
97 if (pattern.indexOf('*') == -1 && pattern.indexOf('?') == -1
98 && pattern.indexOf('[') == -1) {
99 addName(pattern, type);
100 } else if (pattern.startsWith("*") && pattern.indexOf('*', 1) == -1
101 && pattern.indexOf('?') == -1 && pattern.indexOf('[') == -1) {
102 String extension = pattern.substring(1);
103 addExtension(extension, type);
104 type.addExtension(extension);
105 } else {
106 addGlob(compile(pattern), type);
107 }
108 }
109 }
110
111 private void addName(String name, MimeType type) throws MimeTypeException {
112 MimeType previous = names.get(name);
113 if (previous == null
114 || registry.isSpecializationOf(previous.getType(), type.getType())) {
115 names.put(name, type);
116 } else if (previous == type
117 || registry.isSpecializationOf(type.getType(), previous.getType())) {
118 // do nothing
119 } else {
120 throw new MimeTypeException("Conflicting name pattern: " + name);
121 }
122 }
123
124 private void addExtension(String extension, MimeType type)
125 throws MimeTypeException {
126 MimeType previous = extensions.get(extension);
127 if (previous == null
128 || registry.isSpecializationOf(previous.getType(), type.getType())) {
129 extensions.put(extension, type);
130 int length = extension.length();
131 minExtensionLength = Math.min(minExtensionLength, length);
132 maxExtensionLength = Math.max(maxExtensionLength, length);
133 } else if (previous == type
134 || registry.isSpecializationOf(type.getType(), previous.getType())) {
135 // do nothing
136 } else {
137 throw new MimeTypeException(
138 "Conflicting extension pattern: " + extension);
139 }
140 }
141
142 private void addGlob(String glob, MimeType type)
143 throws MimeTypeException {
144 MimeType previous = globs.get(glob);
145 if (previous == null
146 || registry.isSpecializationOf(previous.getType(), type.getType())) {
147 globs.put(glob, type);
148 } else if (previous == type
149 || registry.isSpecializationOf(type.getType(), previous.getType())) {
150 // do nothing
151 } else {
152 throw new MimeTypeException("Conflicting glob pattern: " + glob);
153 }
154 }
155
156 /**
157 * Find the MimeType corresponding to a resource name.
158 *
159 * It applies the recommendations detailed in FreeDesktop Shared MIME-info
160 * Database for guessing MimeType from a resource name: It first tries a
161 * case-sensitive match, then try again with the resource name converted to
162 * lower-case if that fails. If several patterns match then the longest
163 * pattern is used. In particular, files with multiple extensions (such as
164 * Data.tar.gz) match the longest sequence of extensions (eg '*.tar.gz' in
165 * preference to '*.gz'). Literal patterns (eg, 'Makefile') are matched
166 * before all others. Patterns beginning with `*.' and containing no other
167 * special characters (`*?[') are matched before other wildcarded patterns
168 * (since this covers the majority of the patterns).
169 */
170 public MimeType matches(String name) {
171 if (name == null) {
172 throw new IllegalArgumentException("Name is missing");
173 }
174
175 // First, try exact match of the provided resource name
176 if (names.containsKey(name)) {
177 return names.get(name);
178 }
179
180 // Then try "extension" (*.xxx) matching
181 int maxLength = Math.min(maxExtensionLength, name.length());
182 for (int n = maxLength; n >= minExtensionLength; n--) {
183 String extension = name.substring(name.length() - n);
184 if (extensions.containsKey(extension)) {
185 return extensions.get(extension);
186 }
187 }
188
189 // And finally, try complex regexp matching
190 for (Map.Entry<String, MimeType> entry : globs.entrySet()) {
191 if (name.matches(entry.getKey())) {
192 return entry.getValue();
193 }
194 }
195
196 return null;
197 }
198
199 private String compile(String glob) {
200 StringBuilder pattern = new StringBuilder();
201 pattern.append("\\A");
202 for (int i = 0; i < glob.length(); i++) {
203 char ch = glob.charAt(i);
204 if (ch == '?') {
205 pattern.append('.');
206 } else if (ch == '*') {
207 pattern.append(".*");
208 } else if ("\\[]^.-$+(){}|".indexOf(ch) != -1) {
209 pattern.append('\\');
210 pattern.append(ch);
211 } else {
212 pattern.append(ch);
213 }
214 }
215 pattern.append("\\z");
216 return pattern.toString();
217 }
218
219 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 /**
18 * Media type information.
19 */
20 @aQute.bnd.annotation.Version("1.2.0")
21 package org.apache.tika.mime;
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 /**
18 * Apache Tika.
19 */
20 @aQute.bnd.annotation.Version("1.0.0")
21 package org.apache.tika;
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20
21 import org.apache.tika.exception.TikaException;
22 import org.apache.tika.metadata.Metadata;
23 import org.xml.sax.ContentHandler;
24 import org.xml.sax.SAXException;
25
26 /**
27 * Abstract base class for new parsers. This method implements the old
28 * deprecated parse method so subclasses won't have to.
29 *
30 * @since Apache Tika 0.10
31 */
32 public abstract class AbstractParser implements Parser {
33
34 /**
35 * Serial version UID.
36 */
37 private static final long serialVersionUID = 7186985395903074255L;
38
39 /**
40 * Calls the
41 * {@link Parser#parse(InputStream, ContentHandler, Metadata, ParseContext)}
42 * method with an empty {@link ParseContext}. This method exists as a
43 * leftover from Tika 0.x when the three-argument parse() method still
44 * existed in the {@link Parser} interface. No new code should call this
45 * method anymore, it's only here for backwards compatibility.
46 *
47 * @deprecated use the {@link Parser#parse(InputStream, ContentHandler, Metadata, ParseContext)} method instead
48 */
49 public void parse(
50 InputStream stream, ContentHandler handler, Metadata metadata)
51 throws IOException, SAXException, TikaException {
52 parse(stream, handler, metadata, new ParseContext());
53 }
54
55 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20
21 import org.apache.tika.config.TikaConfig;
22 import org.apache.tika.detect.DefaultDetector;
23 import org.apache.tika.detect.Detector;
24 import org.apache.tika.exception.TikaException;
25 import org.apache.tika.io.TemporaryResources;
26 import org.apache.tika.io.TikaInputStream;
27 import org.apache.tika.metadata.Metadata;
28 import org.apache.tika.mime.MediaType;
29 import org.apache.tika.mime.MediaTypeRegistry;
30 import org.apache.tika.sax.SecureContentHandler;
31 import org.xml.sax.ContentHandler;
32 import org.xml.sax.SAXException;
33
34 public class AutoDetectParser extends CompositeParser {
35
36 /** Serial version UID */
37 private static final long serialVersionUID = 6110455808615143122L;
38
39 /**
40 * The type detector used by this parser to auto-detect the type
41 * of a document.
42 */
43 private Detector detector; // always set in the constructor
44
45 /**
46 * Creates an auto-detecting parser instance using the default Tika
47 * configuration.
48 */
49 public AutoDetectParser() {
50 this(TikaConfig.getDefaultConfig());
51 }
52
53 public AutoDetectParser(Detector detector) {
54 this(TikaConfig.getDefaultConfig());
55 setDetector(detector);
56 }
57
58 /**
59 * Creates an auto-detecting parser instance using the specified set of parser.
60 * This allows one to create a Tika configuration where only a subset of the
61 * available parsers have their 3rd party jars included, as otherwise the
62 * use of the default TikaConfig will throw various "ClassNotFound" exceptions.
63 *
64 * @param detector Detector to use
65 * @param parsers
66 */
67 public AutoDetectParser(Parser...parsers) {
68 this(new DefaultDetector(), parsers);
69 }
70
71 public AutoDetectParser(Detector detector, Parser...parsers) {
72 super(MediaTypeRegistry.getDefaultRegistry(), parsers);
73 setDetector(detector);
74 }
75
76 public AutoDetectParser(TikaConfig config) {
77 super(config.getMediaTypeRegistry(), config.getParser());
78 setDetector(config.getDetector());
79 }
80
81 /**
82 * Returns the type detector used by this parser to auto-detect the type
83 * of a document.
84 *
85 * @return type detector
86 * @since Apache Tika 0.4
87 */
88 public Detector getDetector() {
89 return detector;
90 }
91
92 /**
93 * Sets the type detector used by this parser to auto-detect the type
94 * of a document.
95 *
96 * @param detector type detector
97 * @since Apache Tika 0.4
98 */
99 public void setDetector(Detector detector) {
100 this.detector = detector;
101 }
102
103 public void parse(
104 InputStream stream, ContentHandler handler,
105 Metadata metadata, ParseContext context)
106 throws IOException, SAXException, TikaException {
107 TemporaryResources tmp = new TemporaryResources();
108 try {
109 TikaInputStream tis = TikaInputStream.get(stream, tmp);
110
111 // Automatically detect the MIME type of the document
112 MediaType type = detector.detect(tis, metadata);
113 metadata.set(Metadata.CONTENT_TYPE, type.toString());
114
115 // TIKA-216: Zip bomb prevention
116 SecureContentHandler sch = new SecureContentHandler(handler, tis);
117 try {
118 // Parse the document
119 super.parse(tis, sch, metadata, context);
120 } catch (SAXException e) {
121 // Convert zip bomb exceptions to TikaExceptions
122 sch.throwIfCauseOf(e);
123 throw e;
124 }
125 } finally {
126 tmp.dispose();
127 }
128 }
129
130 public void parse(
131 InputStream stream, ContentHandler handler, Metadata metadata)
132 throws IOException, SAXException, TikaException {
133 ParseContext context = new ParseContext();
134 context.set(Parser.class, this);
135 parse(stream, handler, metadata, context);
136 }
137
138 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.util.ArrayList;
21 import java.util.Arrays;
22 import java.util.Collections;
23 import java.util.HashMap;
24 import java.util.List;
25 import java.util.Map;
26 import java.util.Set;
27
28 import org.apache.tika.exception.TikaException;
29 import org.apache.tika.io.TemporaryResources;
30 import org.apache.tika.io.TikaInputStream;
31 import org.apache.tika.metadata.Metadata;
32 import org.apache.tika.mime.MediaType;
33 import org.apache.tika.mime.MediaTypeRegistry;
34 import org.apache.tika.sax.TaggedContentHandler;
35 import org.xml.sax.ContentHandler;
36 import org.xml.sax.SAXException;
37
38 /**
39 * Composite parser that delegates parsing tasks to a component parser
40 * based on the declared content type of the incoming document. A fallback
41 * parser is defined for cases where a parser for the given content type is
42 * not available.
43 */
44 public class CompositeParser extends AbstractParser {
45
46 /** Serial version UID */
47 private static final long serialVersionUID = 2192845797749627824L;
48
49 /**
50 * Media type registry.
51 */
52 private MediaTypeRegistry registry;
53
54 /**
55 * List of component parsers.
56 */
57 private List<Parser> parsers;
58
59 /**
60 * The fallback parser, used when no better parser is available.
61 */
62 private Parser fallback = new EmptyParser();
63
64 public CompositeParser(MediaTypeRegistry registry, List<Parser> parsers) {
65 this.parsers = parsers;
66 this.registry = registry;
67 }
68
69 public CompositeParser(MediaTypeRegistry registry, Parser... parsers) {
70 this(registry, Arrays.asList(parsers));
71 }
72
73 public CompositeParser() {
74 this(new MediaTypeRegistry());
75 }
76
77 public Map<MediaType, Parser> getParsers(ParseContext context) {
78 Map<MediaType, Parser> map = new HashMap<MediaType, Parser>();
79 for (Parser parser : parsers) {
80 for (MediaType type : parser.getSupportedTypes(context)) {
81 map.put(registry.normalize(type), parser);
82 }
83 }
84 return map;
85 }
86
87 /**
88 * Utility method that goes through all the component parsers and finds
89 * all media types for which more than one parser declares support. This
90 * is useful in tracking down conflicting parser definitions.
91 *
92 * @since Apache Tika 0.10
93 * @see <a href="https://issues.apache.org/jira/browse/TIKA-660">TIKA-660</a>
94 * @param context parsing context
95 * @return media types that are supported by at least two component parsers
96 */
97 public Map<MediaType, List<Parser>> findDuplicateParsers(
98 ParseContext context) {
99 Map<MediaType, Parser> types = new HashMap<MediaType, Parser>();
100 Map<MediaType, List<Parser>> duplicates =
101 new HashMap<MediaType, List<Parser>>();
102 for (Parser parser : parsers) {
103 for (MediaType type : parser.getSupportedTypes(context)) {
104 MediaType canonicalType = registry.normalize(type);
105 if (types.containsKey(canonicalType)) {
106 List<Parser> list = duplicates.get(canonicalType);
107 if (list == null) {
108 list = new ArrayList<Parser>();
109 list.add(types.get(canonicalType));
110 duplicates.put(canonicalType, list);
111 }
112 list.add(parser);
113 } else {
114 types.put(canonicalType, parser);
115 }
116 }
117 }
118 return duplicates;
119 }
120
121 /**
122 * Returns the media type registry used to infer type relationships.
123 *
124 * @since Apache Tika 0.8
125 * @return media type registry
126 */
127 public MediaTypeRegistry getMediaTypeRegistry() {
128 return registry;
129 }
130
131 /**
132 * Sets the media type registry used to infer type relationships.
133 *
134 * @since Apache Tika 0.8
135 * @param registry media type registry
136 */
137 public void setMediaTypeRegistry(MediaTypeRegistry registry) {
138 this.registry = registry;
139 }
140
141 /**
142 * Returns the component parsers.
143 *
144 * @return component parsers, keyed by media type
145 */
146 public Map<MediaType, Parser> getParsers() {
147 return getParsers(new ParseContext());
148 }
149
150 /**
151 * Sets the component parsers.
152 *
153 * @param parsers component parsers, keyed by media type
154 */
155 public void setParsers(Map<MediaType, Parser> parsers) {
156 this.parsers = new ArrayList<Parser>(parsers.size());
157 for (Map.Entry<MediaType, Parser> entry : parsers.entrySet()) {
158 this.parsers.add(ParserDecorator.withTypes(
159 entry.getValue(), Collections.singleton(entry.getKey())));
160 }
161 }
162
163 /**
164 * Returns the fallback parser.
165 *
166 * @return fallback parser
167 */
168 public Parser getFallback() {
169 return fallback;
170 }
171
172 /**
173 * Sets the fallback parser.
174 *
175 * @param fallback fallback parser
176 */
177 public void setFallback(Parser fallback) {
178 this.fallback = fallback;
179 }
180
181 /**
182 * Returns the parser that best matches the given metadata. By default
183 * looks for a parser that matches the content type metadata property,
184 * and uses the fallback parser if a better match is not found. The
185 * type hierarchy information included in the configured media type
186 * registry is used when looking for a matching parser instance.
187 * <p>
188 * Subclasses can override this method to provide more accurate
189 * parser resolution.
190 *
191 * @param metadata document metadata
192 * @return matching parser
193 */
194 protected Parser getParser(Metadata metadata) {
195 return getParser(metadata, new ParseContext());
196 }
197
198 protected Parser getParser(Metadata metadata, ParseContext context) {
199 Map<MediaType, Parser> map = getParsers(context);
200 MediaType type = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
201 if (type != null) {
202 // We always work on the normalised, canonical form
203 type = registry.normalize(type);
204 }
205
206 while (type != null) {
207 // Try finding a parser for the type
208 Parser parser = map.get(type);
209 if (parser != null) {
210 return parser;
211 }
212
213 // Failing that, try for the parent of the type
214 type = registry.getSupertype(type);
215 }
216 return fallback;
217 }
218
219 public Set<MediaType> getSupportedTypes(ParseContext context) {
220 return getParsers(context).keySet();
221 }
222
223 /**
224 * Delegates the call to the matching component parser.
225 * <p>
226 * Potential {@link RuntimeException}s, {@link IOException}s and
227 * {@link SAXException}s unrelated to the given input stream and content
228 * handler are automatically wrapped into {@link TikaException}s to better
229 * honor the {@link Parser} contract.
230 */
231 public void parse(
232 InputStream stream, ContentHandler handler,
233 Metadata metadata, ParseContext context)
234 throws IOException, SAXException, TikaException {
235 Parser parser = getParser(metadata);
236 TemporaryResources tmp = new TemporaryResources();
237 try {
238 TikaInputStream taggedStream = TikaInputStream.get(stream, tmp);
239 TaggedContentHandler taggedHandler = new TaggedContentHandler(handler);
240 try {
241 parser.parse(taggedStream, taggedHandler, metadata, context);
242 } catch (RuntimeException e) {
243 throw new TikaException(
244 "Unexpected RuntimeException from " + parser, e);
245 } catch (IOException e) {
246 taggedStream.throwIfCauseOf(e);
247 throw new TikaException(
248 "TIKA-198: Illegal IOException from " + parser, e);
249 } catch (SAXException e) {
250 taggedHandler.throwIfCauseOf(e);
251 throw new TikaException(
252 "TIKA-237: Illegal SAXException from " + parser, e);
253 }
254 } finally {
255 tmp.dispose();
256 }
257 }
258
259 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.security.AlgorithmParameters;
21 import java.security.GeneralSecurityException;
22 import java.security.Key;
23 import java.security.Provider;
24 import java.security.SecureRandom;
25 import java.util.Set;
26
27 import javax.crypto.Cipher;
28 import javax.crypto.CipherInputStream;
29
30 import org.apache.tika.exception.EncryptedDocumentException;
31 import org.apache.tika.exception.TikaException;
32 import org.apache.tika.metadata.Metadata;
33 import org.apache.tika.mime.MediaType;
34 import org.xml.sax.ContentHandler;
35 import org.xml.sax.SAXException;
36
37 /**
38 * Decrypts the incoming document stream and delegates further parsing to
39 * another parser instance. The decryption key and other settings as well
40 * as the delegate parser are taken from the parsing context.
41 *
42 * @since Apache Tika 0.10
43 */
44 public abstract class CryptoParser extends DelegatingParser {
45
46 /** Serial version UID */
47 private static final long serialVersionUID = -3507995752666557731L;
48
49 private final String transformation;
50
51 private final Provider provider;
52
53 private final Set<MediaType> types;
54
55 public CryptoParser(
56 String transformation, Provider provider, Set<MediaType> types) {
57 this.transformation = transformation;
58 this.provider = provider;
59 this.types = types;
60 }
61
62 public CryptoParser(
63 String transformation, Set<MediaType> types) {
64 this(transformation, null, types);
65 }
66
67 public Set<MediaType> getSupportedTypes(ParseContext context) {
68 return types;
69 }
70
71 public void parse(
72 InputStream stream, ContentHandler handler,
73 Metadata metadata, ParseContext context)
74 throws IOException, SAXException, TikaException {
75 try {
76 Cipher cipher;
77 if (provider != null) {
78 cipher = Cipher.getInstance(transformation, provider);
79 } else {
80 cipher = Cipher.getInstance(transformation);
81 }
82
83 Key key = context.get(Key.class);
84 if (key == null) {
85 throw new EncryptedDocumentException("No decryption key provided");
86 }
87
88 AlgorithmParameters params = context.get(AlgorithmParameters.class);
89 SecureRandom random = context.get(SecureRandom.class);
90 if (params != null && random != null) {
91 cipher.init(Cipher.DECRYPT_MODE, key, params, random);
92 } else if (params != null) {
93 cipher.init(Cipher.DECRYPT_MODE, key, params);
94 } else if (random != null) {
95 cipher.init(Cipher.DECRYPT_MODE, key, random);
96 } else {
97 cipher.init(Cipher.DECRYPT_MODE, key);
98 }
99
100 super.parse(
101 new CipherInputStream(stream, cipher),
102 handler, metadata, context);
103 } catch (GeneralSecurityException e) {
104 throw new TikaException("Unable to decrypt document stream", e);
105 }
106 }
107
108 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser;
17
18 import java.util.Collections;
19 import java.util.Comparator;
20 import java.util.List;
21 import java.util.Map;
22
23 import org.apache.tika.config.ServiceLoader;
24 import org.apache.tika.mime.MediaType;
25 import org.apache.tika.mime.MediaTypeRegistry;
26
27 /**
28 * A composite parser based on all the {@link Parser} implementations
29 * available through the
30 * {@link javax.imageio.spi.ServiceRegistry service provider mechanism}.
31 *
32 * @since Apache Tika 0.8
33 */
34 public class DefaultParser extends CompositeParser {
35
36 /** Serial version UID */
37 private static final long serialVersionUID = 3612324825403757520L;
38
39 /**
40 * Finds all statically loadable parsers and sort the list by name,
41 * rather than discovery order. CompositeParser takes the last
42 * parser for any given media type, so put the Tika parsers first
43 * so that non-Tika (user supplied) parsers can take precedence.
44 *
45 * @param loader service loader
46 * @return ordered list of statically loadable parsers
47 */
48 private static List<Parser> getDefaultParsers(ServiceLoader loader) {
49 List<Parser> parsers =
50 loader.loadStaticServiceProviders(Parser.class);
51 Collections.sort(parsers, new Comparator<Parser>() {
52 public int compare(Parser p1, Parser p2) {
53 String n1 = p1.getClass().getName();
54 String n2 = p2.getClass().getName();
55 boolean t1 = n1.startsWith("org.apache.tika.");
56 boolean t2 = n2.startsWith("org.apache.tika.");
57 if (t1 == t2) {
58 return n1.compareTo(n2);
59 } else if (t1) {
60 return -1;
61 } else {
62 return 1;
63 }
64 }
65 });
66 return parsers;
67 }
68
69 private transient final ServiceLoader loader;
70
71 public DefaultParser(MediaTypeRegistry registry, ServiceLoader loader) {
72 super(registry, getDefaultParsers(loader));
73 this.loader = loader;
74 }
75
76 public DefaultParser(MediaTypeRegistry registry, ClassLoader loader) {
77 this(registry, new ServiceLoader(loader));
78 }
79
80 public DefaultParser(ClassLoader loader) {
81 this(MediaTypeRegistry.getDefaultRegistry(), new ServiceLoader(loader));
82 }
83
84 public DefaultParser(MediaTypeRegistry registry) {
85 this(registry, new ServiceLoader());
86 }
87
88 public DefaultParser() {
89 this(MediaTypeRegistry.getDefaultRegistry());
90 }
91
92 @Override
93 public Map<MediaType, Parser> getParsers(ParseContext context) {
94 Map<MediaType, Parser> map = super.getParsers(context);
95
96 if (loader != null) {
97 // Add dynamic parser service (they always override static ones)
98 MediaTypeRegistry registry = getMediaTypeRegistry();
99 List<Parser> parsers =
100 loader.loadDynamicServiceProviders(Parser.class);
101 Collections.reverse(parsers); // best parser last
102 for (Parser parser : parsers) {
103 for (MediaType type : parser.getSupportedTypes(context)) {
104 map.put(registry.normalize(type), parser);
105 }
106 }
107 }
108
109 return map;
110 }
111
112 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.util.Set;
21
22 import org.apache.tika.exception.TikaException;
23 import org.apache.tika.metadata.Metadata;
24 import org.apache.tika.mime.MediaType;
25 import org.xml.sax.ContentHandler;
26 import org.xml.sax.SAXException;
27
28 /**
29 * Base class for parser implementations that want to delegate parts of the
30 * task of parsing an input document to another parser. The delegate parser
31 * is looked up from the parsing context using the {@link Parser} class as
32 * the key.
33 *
34 * @since Apache Tika 0.4, major changes in Tika 0.5
35 */
36 public class DelegatingParser extends AbstractParser {
37
38 /**
39 * Returns the parser instance to which parsing tasks should be delegated.
40 * The default implementation looks up the delegate parser from the given
41 * parse context, and uses an {@link EmptyParser} instance as a fallback.
42 * Subclasses can override this method to implement alternative delegation
43 * strategies.
44 *
45 * @since Apache Tika 0.7
46 * @param context parse context
47 * @return delegate parser
48 */
49 protected Parser getDelegateParser(ParseContext context) {
50 return context.get(Parser.class, EmptyParser.INSTANCE);
51 }
52
53 public Set<MediaType> getSupportedTypes(ParseContext context) {
54 return getDelegateParser(context).getSupportedTypes(context);
55 }
56
57 /**
58 * Looks up the delegate parser from the parsing context and
59 * delegates the parse operation to it. If a delegate parser is not
60 * found, then an empty XHTML document is returned.
61 * <p>
62 * Subclasses should override this method to parse the top level
63 * structure of the given document stream. Parsed sub-streams can
64 * be passed to this base class method to be parsed by the configured
65 * delegate parser.
66 */
67 public void parse(
68 InputStream stream, ContentHandler handler,
69 Metadata metadata, ParseContext context)
70 throws SAXException, IOException, TikaException {
71 getDelegateParser(context).parse(stream, handler, metadata, context);
72 }
73
74 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser;
17
18 import java.io.InputStream;
19 import java.util.Collections;
20 import java.util.Set;
21
22 import org.apache.tika.metadata.Metadata;
23 import org.apache.tika.mime.MediaType;
24 import org.apache.tika.sax.XHTMLContentHandler;
25 import org.xml.sax.ContentHandler;
26 import org.xml.sax.SAXException;
27
28 /**
29 * Dummy parser that always produces an empty XHTML document without even
30 * attempting to parse the given document stream. Useful as a sentinel parser
31 * for unknown document types.
32 */
33 public class EmptyParser extends AbstractParser {
34
35 /**
36 * Serial version UID.
37 */
38 private static final long serialVersionUID = -4218649699095732123L;
39
40 /**
41 * Singleton instance of this class.
42 */
43 public static final EmptyParser INSTANCE = new EmptyParser();
44
45 public Set<MediaType> getSupportedTypes(ParseContext context) {
46 return Collections.emptySet();
47 }
48
49 public void parse(
50 InputStream stream, ContentHandler handler,
51 Metadata metadata, ParseContext context)
52 throws SAXException {
53 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
54 xhtml.startDocument();
55 xhtml.endDocument();
56 }
57
58 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser;
17
18 import java.io.InputStream;
19 import java.util.Collections;
20 import java.util.Set;
21
22 import org.apache.tika.exception.TikaException;
23 import org.apache.tika.metadata.Metadata;
24 import org.apache.tika.mime.MediaType;
25 import org.xml.sax.ContentHandler;
26
27 /**
28 * Dummy parser that always throws a {@link TikaException} without even
29 * attempting to parse the given document stream. Useful as a sentinel parser
30 * for unknown document types.
31 */
32 public class ErrorParser extends AbstractParser {
33
34 /**
35 * Singleton instance of this class.
36 */
37 public static final ErrorParser INSTANCE = new ErrorParser();
38
39 public Set<MediaType> getSupportedTypes(ParseContext context) {
40 return Collections.emptySet();
41 }
42
43 public void parse(
44 InputStream stream, ContentHandler handler,
45 Metadata metadata, ParseContext context)
46 throws TikaException {
47 throw new TikaException("Parse error");
48 }
49
50 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser;
17
18 import java.io.FilterOutputStream;
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.io.OutputStream;
22 import java.net.Socket;
23 import java.net.URI;
24 import java.net.URL;
25 import java.net.URLConnection;
26 import java.util.Collections;
27 import java.util.Set;
28
29 import org.apache.tika.exception.TikaException;
30 import org.apache.tika.io.CloseShieldInputStream;
31 import org.apache.tika.io.IOUtils;
32 import org.apache.tika.io.TemporaryResources;
33 import org.apache.tika.io.TikaInputStream;
34 import org.apache.tika.metadata.Metadata;
35 import org.apache.tika.mime.MediaType;
36 import org.apache.tika.sax.TaggedContentHandler;
37 import org.apache.tika.sax.TeeContentHandler;
38 import org.xml.sax.Attributes;
39 import org.xml.sax.ContentHandler;
40 import org.xml.sax.SAXException;
41 import org.xml.sax.helpers.DefaultHandler;
42
43 public class NetworkParser extends AbstractParser {
44
45 private final URI uri;
46
47 private final Set<MediaType> supportedTypes;
48
49 public NetworkParser(URI uri, Set<MediaType> supportedTypes) {
50 this.uri = uri;
51 this.supportedTypes = supportedTypes;
52 }
53
54 public NetworkParser(URI uri) {
55 this(uri, Collections.singleton(MediaType.OCTET_STREAM));
56 }
57
58 public Set<MediaType> getSupportedTypes(ParseContext context) {
59 return supportedTypes;
60 }
61
62 public void parse(
63 InputStream stream, ContentHandler handler,
64 Metadata metadata, ParseContext context)
65 throws IOException, SAXException, TikaException {
66 TemporaryResources tmp = new TemporaryResources();
67 try {
68 TikaInputStream tis = TikaInputStream.get(stream, tmp);
69 parse(tis, handler, metadata, context);
70 } finally {
71 tmp.dispose();
72 }
73 }
74
75 private void parse(
76 TikaInputStream stream, ContentHandler handler,
77 Metadata metadata, ParseContext context)
78 throws IOException, SAXException, TikaException {
79 if ("telnet".equals(uri.getScheme())) {
80 final Socket socket = new Socket(uri.getHost(), uri.getPort());
81 try {
82 new ParsingTask(stream, new FilterOutputStream(socket.getOutputStream()) {
83 @Override
84 public void close() throws IOException {
85 socket.shutdownOutput();
86 }
87 }).parse(
88 socket.getInputStream(), handler, metadata, context);
89 } finally {
90 socket.close();
91 }
92 } else {
93 URL url = uri.toURL();
94 URLConnection connection = url.openConnection();
95 connection.setDoOutput(true);
96 connection.connect();
97 InputStream input = connection.getInputStream();
98 try {
99 new ParsingTask(stream, connection.getOutputStream()).parse(
100 new CloseShieldInputStream(input),
101 handler, metadata, context);
102 } finally {
103 input.close();
104 }
105 }
106
107 }
108
109 private static class ParsingTask implements Runnable {
110
111 private final TikaInputStream input;
112
113 private final OutputStream output;
114
115 private volatile Exception exception = null;
116
117 public ParsingTask(TikaInputStream input, OutputStream output) {
118 this.input = input;
119 this.output = output;
120 }
121
122 public void parse(
123 InputStream stream, ContentHandler handler,
124 Metadata metadata, ParseContext context)
125 throws IOException, SAXException, TikaException {
126 Thread thread = new Thread(this, "Tika network parser");
127 thread.start();
128
129 TaggedContentHandler tagged = new TaggedContentHandler(handler);
130 try {
131 context.getSAXParser().parse(
132 stream, new TeeContentHandler(
133 tagged, new MetaHandler(metadata)));
134 } catch (SAXException e) {
135 tagged.throwIfCauseOf(e);
136 throw new TikaException(
137 "Invalid network parser output", e);
138 } catch (IOException e) {
139 throw new TikaException(
140 "Unable to read network parser output", e);
141 } finally {
142 try {
143 thread.join(1000);
144 } catch (InterruptedException e) {
145 throw new TikaException("Network parser interrupted", e);
146 }
147
148 if (exception != null) {
149 input.throwIfCauseOf(exception);
150 throw new TikaException(
151 "Unexpected network parser error", exception);
152 }
153 }
154 }
155
156 //----------------------------------------------------------<Runnable>
157
158 public void run() {
159 try {
160 try {
161 IOUtils.copy(input, output);
162 } finally {
163 output.close();
164 }
165 } catch (Exception e) {
166 exception = e;
167 }
168 }
169
170 }
171
172 private static class MetaHandler extends DefaultHandler {
173
174 private final Metadata metadata;
175
176 public MetaHandler(Metadata metadata) {
177 this.metadata = metadata;
178 }
179
180 @Override
181 public void startElement(
182 String uri, String localName, String qName,
183 Attributes attributes) throws SAXException {
184 if ("http://www.w3.org/1999/xhtml".equals(uri)
185 && "meta".equals(localName)) {
186 String name = attributes.getValue("", "name");
187 String content = attributes.getValue("", "content");
188 if (name != null && content != null) {
189 metadata.add(name, content);
190 }
191 }
192 }
193
194 }
195
196 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser;
17
18 import java.io.Serializable;
19 import java.util.HashMap;
20 import java.util.Map;
21
22 import javax.xml.XMLConstants;
23 import javax.xml.parsers.ParserConfigurationException;
24 import javax.xml.parsers.SAXParser;
25 import javax.xml.parsers.SAXParserFactory;
26
27 import org.apache.tika.exception.TikaException;
28 import org.xml.sax.SAXException;
29 import org.xml.sax.SAXNotRecognizedException;
30 import org.xml.sax.SAXNotSupportedException;
31
32 /**
33 * Parse context. Used to pass context information to Tika parsers.
34 *
35 * @since Apache Tika 0.5
36 * @see <a href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a>
37 */
38 public class ParseContext implements Serializable {
39
40 /** Serial version UID. */
41 private static final long serialVersionUID = -5921436862145826534L;
42
43 /** Map of objects in this context */
44 private final Map<String, Object> context = new HashMap<String, Object>();
45
46 /**
47 * Adds the given value to the context as an implementation of the given
48 * interface.
49 *
50 * @param key the interface implemented by the given value
51 * @param value the value to be added, or <code>null</code> to remove
52 */
53 public <T> void set(Class<T> key, T value) {
54 if (value != null) {
55 context.put(key.getName(), value);
56 } else {
57 context.remove(key.getName());
58 }
59 }
60
61 /**
62 * Returns the object in this context that implements the given interface.
63 *
64 * @param key the interface implemented by the requested object
65 * @return the object that implements the given interface,
66 * or <code>null</code> if not found
67 */
68 @SuppressWarnings("unchecked")
69 public <T> T get(Class<T> key) {
70 return (T) context.get(key.getName());
71 }
72
73 /**
74 * Returns the object in this context that implements the given interface,
75 * or the given default value if such an object is not found.
76 *
77 * @param key the interface implemented by the requested object
78 * @param defaultValue value to return if the requested object is not found
79 * @return the object that implements the given interface,
80 * or the given default value if not found
81 */
82 public <T> T get(Class<T> key, T defaultValue) {
83 T value = get(key);
84 if (value != null) {
85 return value;
86 } else {
87 return defaultValue;
88 }
89 }
90
91 /**
92 * Returns the SAX parser specified in this parsing context. If a parser
93 * is not explicitly specified, then one is created using the specified
94 * or the default SAX parser factory.
95 *
96 * @see #getSAXParserFactory()
97 * @since Apache Tika 0.8
98 * @return SAX parser
99 * @throws TikaException if a SAX parser could not be created
100 */
101 public SAXParser getSAXParser() throws TikaException {
102 SAXParser parser = get(SAXParser.class);
103 if (parser != null) {
104 return parser;
105 } else {
106 try {
107 return getSAXParserFactory().newSAXParser();
108 } catch (ParserConfigurationException e) {
109 throw new TikaException("Unable to configure a SAX parser", e);
110 } catch (SAXException e) {
111 throw new TikaException("Unable to create a SAX parser", e);
112 }
113 }
114 }
115
116 /**
117 * Returns the SAX parser factory specified in this parsing context.
118 * If a factory is not explicitly specified, then a default factory
119 * instance is created and returned. The default factory instance is
120 * configured to be namespace-aware and to use
121 * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}.
122 *
123 * @since Apache Tika 0.8
124 * @return SAX parser factory
125 */
126 public SAXParserFactory getSAXParserFactory() {
127 SAXParserFactory factory = get(SAXParserFactory.class);
128 if (factory == null) {
129 factory = SAXParserFactory.newInstance();
130 factory.setNamespaceAware(true);
131 try {
132 factory.setFeature(
133 XMLConstants.FEATURE_SECURE_PROCESSING, true);
134 } catch (ParserConfigurationException e) {
135 } catch (SAXNotSupportedException e) {
136 } catch (SAXNotRecognizedException e) {
137 // TIKA-271: Some XML parsers do not support the
138 // secure-processing feature, even though it's required by
139 // JAXP in Java 5. Ignoring the exception is fine here, as
140 // deployments without this feature are inherently vulnerable
141 // to XML denial-of-service attacks.
142 }
143 }
144 return factory;
145 }
146
147 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.io.Serializable;
21 import java.util.Set;
22
23 import org.apache.tika.exception.TikaException;
24 import org.apache.tika.metadata.Metadata;
25 import org.apache.tika.mime.MediaType;
26 import org.xml.sax.ContentHandler;
27 import org.xml.sax.SAXException;
28
29 /**
30 * Tika parser interface.
31 */
32 public interface Parser extends Serializable {
33
34 /**
35 * Returns the set of media types supported by this parser when used
36 * with the given parse context.
37 *
38 * @since Apache Tika 0.7
39 * @param context parse context
40 * @return immutable set of media types
41 */
42 Set<MediaType> getSupportedTypes(ParseContext context);
43
44 /**
45 * Parses a document stream into a sequence of XHTML SAX events.
46 * Fills in related document metadata in the given metadata object.
47 * <p>
48 * The given document stream is consumed but not closed by this method.
49 * The responsibility to close the stream remains on the caller.
50 * <p>
51 * Information about the parsing context can be passed in the context
52 * parameter. See the parser implementations for the kinds of context
53 * information they expect.
54 *
55 * @since Apache Tika 0.5
56 * @param stream the document stream (input)
57 * @param handler handler for the XHTML SAX events (output)
58 * @param metadata document metadata (input and output)
59 * @param context parse context
60 * @throws IOException if the document stream could not be read
61 * @throws SAXException if the SAX events could not be processed
62 * @throws TikaException if the document could not be parsed
63 */
64 void parse(
65 InputStream stream, ContentHandler handler,
66 Metadata metadata, ParseContext context)
67 throws IOException, SAXException, TikaException;
68
69 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.util.Set;
21
22 import org.apache.tika.exception.TikaException;
23 import org.apache.tika.metadata.Metadata;
24 import org.apache.tika.mime.MediaType;
25 import org.xml.sax.ContentHandler;
26 import org.xml.sax.SAXException;
27
28 /**
29 * Decorator base class for the {@link Parser} interface. This class
30 * simply delegates all parsing calls to an underlying decorated parser
31 * instance. Subclasses can provide extra decoration by overriding the
32 * parse method.
33 */
34 public class ParserDecorator extends AbstractParser {
35
36 /** Serial version UID */
37 private static final long serialVersionUID = -3861669115439125268L;
38
39 /**
40 * Decorates the given parser so that it always claims to support
41 * parsing of the given media types.
42 *
43 * @param parser the parser to be decorated
44 * @param types supported media types
45 * @return the decorated parser
46 */
47 public static final Parser withTypes(
48 Parser parser, final Set<MediaType> types) {
49 return new ParserDecorator(parser) {
50 private static final long serialVersionUID = -7345051519565330731L;
51 @Override
52 public Set<MediaType> getSupportedTypes(ParseContext context) {
53 return types;
54 }
55 };
56 }
57
58 /**
59 * The decorated parser instance.
60 */
61 private final Parser parser;
62
63 /**
64 * Creates a decorator for the given parser.
65 *
66 * @param parser the parser instance to be decorated
67 */
68 public ParserDecorator(Parser parser) {
69 this.parser = parser;
70 }
71
72 /**
73 * Delegates the method call to the decorated parser. Subclasses should
74 * override this method (and use <code>super.getSupportedTypes()</code>
75 * to invoke the decorated parser) to implement extra decoration.
76 */
77 public Set<MediaType> getSupportedTypes(ParseContext context) {
78 return parser.getSupportedTypes(context);
79 }
80
81 /**
82 * Delegates the method call to the decorated parser. Subclasses should
83 * override this method (and use <code>super.parse()</code> to invoke
84 * the decorated parser) to implement extra decoration.
85 */
86 public void parse(
87 InputStream stream, ContentHandler handler,
88 Metadata metadata, ParseContext context)
89 throws IOException, SAXException, TikaException {
90 parser.parse(stream, handler, metadata, context);
91 }
92
93
94 /**
95 * Gets the parser wrapped by this ParserDecorator
96 * @return
97 */
98 public Parser getWrappedParser() {
99 return this.parser;
100 }
101
102 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20
21 import org.apache.tika.exception.TikaException;
22 import org.apache.tika.metadata.Metadata;
23 import org.apache.tika.sax.BodyContentHandler;
24 import org.apache.tika.sax.TeeContentHandler;
25 import org.apache.tika.utils.RegexUtils;
26 import org.xml.sax.ContentHandler;
27 import org.xml.sax.SAXException;
28
29 /**
30 * Parser decorator that post-processes the results from a decorated parser.
31 * The post-processing takes care of filling in the "fulltext", "summary",
32 * and "outlinks" metadata entries based on the full text content returned by
33 * the decorated parser.
34 */
35 public class ParserPostProcessor extends ParserDecorator {
36
37 /**
38 * Creates a post-processing decorator for the given parser.
39 *
40 * @param parser the parser to be decorated
41 */
42 public ParserPostProcessor(Parser parser) {
43 super(parser);
44 }
45
46 /**
47 * Forwards the call to the delegated parser and post-processes the
48 * results as described above.
49 */
50 public void parse(
51 InputStream stream, ContentHandler handler,
52 Metadata metadata, ParseContext context)
53 throws IOException, SAXException, TikaException {
54 ContentHandler body = new BodyContentHandler();
55 ContentHandler tee = new TeeContentHandler(handler, body);
56 super.parse(stream, tee, metadata, context);
57
58 String content = body.toString();
59 metadata.set("fulltext", content);
60
61 int length = Math.min(content.length(), 500);
62 metadata.set("summary", content.substring(0, length));
63
64 for (String link : RegexUtils.extractLinks(content)) {
65 metadata.add("outlinks", link);
66 }
67 }
68
69 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser;
17
18 import java.io.BufferedReader;
19 import java.io.File;
20 import java.io.FileInputStream;
21 import java.io.FileNotFoundException;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.io.PipedReader;
25 import java.io.PipedWriter;
26 import java.io.Reader;
27 import java.io.Writer;
28 import java.util.concurrent.Executor;
29
30 import org.apache.tika.metadata.Metadata;
31 import org.apache.tika.sax.BodyContentHandler;
32 import org.xml.sax.ContentHandler;
33
34 /**
35 * Reader for the text content from a given binary stream. This class
36 * uses a background parsing task with a {@link Parser}
37 * ({@link AutoDetectParser} by default) to parse the text content from
38 * a given input stream. The {@link BodyContentHandler} class and a pipe
39 * is used to convert the push-based SAX event stream to the pull-based
40 * character stream defined by the {@link Reader} interface.
41 *
42 * @since Apache Tika 0.2
43 */
44 public class ParsingReader extends Reader {
45
46 /**
47 * Parser instance used for parsing the given binary stream.
48 */
49 private final Parser parser;
50
51 /**
52 * Buffered read end of the pipe.
53 */
54 private final Reader reader;
55
56 /**
57 * Write end of the pipe.
58 */
59 private final Writer writer;
60
61 /**
62 * The binary stream being parsed.
63 */
64 private final InputStream stream;
65
66 /**
67 * Metadata associated with the document being parsed.
68 */
69 private final Metadata metadata;
70
71 /**
72 * The parse context.
73 */
74 private final ParseContext context;
75
76 /**
77 * An exception (if any) thrown by the parsing thread.
78 */
79 private transient Throwable throwable;
80
81 /**
82 * Utility method that returns a {@link Metadata} instance
83 * for a document with the given name.
84 *
85 * @param name resource name (or <code>null</code>)
86 * @return metadata instance
87 */
88 private static Metadata getMetadata(String name) {
89 Metadata metadata = new Metadata();
90 if (name != null && name.length() > 0) {
91 metadata.set(Metadata.RESOURCE_NAME_KEY, name);
92 }
93 return metadata;
94 }
95
/**
 * Creates a reader for the text content of the given binary stream.
 * An {@link AutoDetectParser} is used for parsing and is also made
 * available in the parse context under the {@link Parser} class.
 *
 * @param stream binary stream
 * @throws IOException if the document can not be parsed
 */
public ParsingReader(InputStream stream) throws IOException {
    this(new AutoDetectParser(), stream, new Metadata());
}

/**
 * Creates a reader for the text content of the given binary stream
 * with the given name. An {@link AutoDetectParser} is used for parsing
 * and is also made available in the parse context under the
 * {@link Parser} class.
 *
 * @param stream binary stream
 * @param name document name
 * @throws IOException if the document can not be parsed
 */
public ParsingReader(InputStream stream, String name) throws IOException {
    this(new AutoDetectParser(), stream, getMetadata(name));
}

/**
 * Creates a reader for the text content of the given file.
 *
 * @param file file
 * @throws FileNotFoundException if the given file does not exist
 * @throws IOException if the document can not be parsed
 */
public ParsingReader(File file) throws FileNotFoundException, IOException {
    this(new FileInputStream(file), file.getName());
}

/**
 * Shared entry point of the convenience constructors. The parse context
 * is fully populated before the chained constructor runs, because that
 * constructor already hands the parsing task to an executor; registering
 * the parser afterwards would modify the context while the parsing task
 * may be reading it.
 *
 * @param parser parser instance, also registered in the parse context
 * @param stream binary stream
 * @param metadata document metadata
 * @throws IOException if the document can not be parsed
 */
private ParsingReader(
        Parser parser, InputStream stream, Metadata metadata)
        throws IOException {
    this(parser, stream, metadata, createContext(parser));
}

/**
 * Creates a parse context that has the given parser registered under
 * the {@link Parser} class.
 *
 * @param parser parser to register
 * @return the populated parse context
 */
private static ParseContext createContext(Parser parser) {
    ParseContext populated = new ParseContext();
    populated.set(Parser.class, parser);
    return populated;
}
130
/**
 * Creates a reader for the text content of the given binary stream
 * with the given document metadata, using the given parser. The parsing
 * task runs in a newly started daemon thread whose name is derived from
 * the resource name found in the metadata.
 * <p>
 * The created reader will be responsible for closing the given stream.
 * The stream and any associated resources will be closed at or before
 * the time when the {@link #close()} method is called on this reader.
 *
 * @param parser parser instance
 * @param stream binary stream
 * @param metadata document metadata
 * @param context parsing context
 * @throws IOException if the document can not be parsed
 */
public ParsingReader(
        Parser parser, InputStream stream, final Metadata metadata,
        ParseContext context) throws IOException {
    this(parser, stream, metadata, context, new Executor() {
        public void execute(Runnable command) {
            String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
            String threadName = resourceName != null
                    ? "Apache Tika: " + resourceName
                    : "Apache Tika";
            Thread worker = new Thread(command, threadName);
            worker.setDaemon(true);
            worker.start();
        }
    });
}
162
163 /**
164 * Creates a reader for the text content of the given binary stream
165 * with the given document metadata. The given parser is used for the
166 * parsing task that is run with the given executor. The given executor
167 * <em>must</em> run the parsing task asynchronously in a separate thread,
168 * since the current thread must return to the caller that can then
169 * consume the parsed text through the {@link Reader} interface.
170 * <p>
171 * The created reader will be responsible for closing the given stream.
172 * The stream and any associated resources will be closed at or before
173 * the time when the {@link #close()} method is called on this reader.
174 *
175 * @param parser parser instance
176 * @param stream binary stream
177 * @param metadata document metadata
178 * @param context parsing context
179 * @param executor executor for the parsing task
180 * @throws IOException if the document can not be parsed
181 * @since Apache Tika 0.4
182 */
183 public ParsingReader(
184 Parser parser, InputStream stream, Metadata metadata,
185 ParseContext context, Executor executor) throws IOException {
186 this.parser = parser;
187 PipedReader pipedReader = new PipedReader();
188 this.reader = new BufferedReader(pipedReader);
189 try {
190 this.writer = new PipedWriter(pipedReader);
191 } catch (IOException e) {
192 throw new IllegalStateException(e); // Should never happen
193 }
194 this.stream = stream;
195 this.metadata = metadata;
196 this.context = context;
197
198 executor.execute(new ParsingTask());
199
200 // TIKA-203: Buffer first character to force metadata extraction
201 reader.mark(1);
202 reader.read();
203 reader.reset();
204 }
205
206 /**
207 * The background parsing task.
208 */
209 private class ParsingTask implements Runnable {
210
211 /**
212 * Parses the given binary stream and writes the text content
213 * to the write end of the pipe. Potential exceptions (including
214 * the one caused if the read end is closed unexpectedly) are
215 * stored before the input stream is closed and processing is stopped.
216 */
217 public void run() {
218 try {
219 ContentHandler handler = new BodyContentHandler(writer);
220 parser.parse(stream, handler, metadata, context);
221 } catch (Throwable t) {
222 throwable = t;
223 }
224
225 try {
226 stream.close();
227 } catch (Throwable t) {
228 if (throwable == null) {
229 throwable = t;
230 }
231 }
232
233 try {
234 writer.close();
235 } catch (Throwable t) {
236 if (throwable == null) {
237 throwable = t;
238 }
239 }
240 }
241
242 }
243
244 /**
245 * Reads parsed text from the pipe connected to the parsing thread.
246 * Fails if the parsing thread has thrown an exception.
247 *
248 * @param cbuf character buffer
249 * @param off start offset within the buffer
250 * @param len maximum number of characters to read
251 * @throws IOException if the parsing thread has failed or
252 * if for some reason the pipe does not work properly
253 */
254 @Override
255 public int read(char[] cbuf, int off, int len) throws IOException {
256 if (throwable instanceof IOException) {
257 throw (IOException) throwable;
258 } else if (throwable != null) {
259 IOException exception = new IOException("");
260 exception.initCause(throwable);
261 throw exception;
262 }
263 return reader.read(cbuf, off, len);
264 }
265
266 /**
267 * Closes the read end of the pipe. If the parsing thread is still
268 * running, next write to the pipe will fail and cause the thread
269 * to stop. Thus there is no need to explicitly terminate the thread.
270 *
271 * @throws IOException if the pipe can not be closed
272 */
273 @Override
274 public void close() throws IOException {
275 reader.close();
276 }
277
278 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser;
17
18 import org.apache.tika.metadata.Metadata;
19
20 /**
21 * Interface for providing a password to a Parser for handling Encrypted
22 * and Password Protected Documents.
23 * An implementation of this should be set on the {@link ParseContext}
24 * supplied to {@link Parser#parse(java.io.InputStream, org.xml.sax.ContentHandler, Metadata, ParseContext)}
25 * to provide a way to get the document password.
26 * An implementation of this interface defines some specific selection
27 * or lookup criteria, to be applied against the document metadata passed
28 * to the {@link #getPassword(Metadata)} method.
29 *
30 * @since Apache Tika 1.1
31 */
32 public interface PasswordProvider {
33 /**
34 * Looks up the password for a document with the given metadata,
35 * and returns it for the Parser. If no password is available
36 * for the document, will return null.
37 *
38 * @param metadata document metadata
39 * @return The document decryption password, or <code>null</code> if not known
40 */
41 String getPassword(Metadata metadata);
42 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.external;
17
18 import java.io.IOException;
19 import java.util.List;
20
21 import org.apache.tika.exception.TikaException;
22 import org.apache.tika.mime.MediaTypeRegistry;
23 import org.apache.tika.parser.CompositeParser;
24 import org.apache.tika.parser.Parser;
25
26 /**
27 * A Composite Parser that wraps up all the available External Parsers,
28 * and provides an easy way to access them.
29 * Parser that uses an external program (like catdoc or pdf2txt) to extract
30 * text content and metadata from a given document.
31 */
32 public class CompositeExternalParser extends CompositeParser {
33 private static final long serialVersionUID = 6962436916649024024L;
34
35 public CompositeExternalParser() throws IOException, TikaException {
36 this(new MediaTypeRegistry());
37 }
38
39 @SuppressWarnings("unchecked")
40 public CompositeExternalParser(MediaTypeRegistry registry) throws IOException, TikaException {
41 super(
42 registry,
43 (List<Parser>)(List<? extends Parser>)ExternalParsersFactory.create()
44 );
45 }
46 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.external;
17
18 import java.io.BufferedReader;
19 import java.io.File;
20 import java.io.FileInputStream;
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.io.InputStreamReader;
24 import java.io.OutputStream;
25 import java.io.Reader;
26 import java.util.Collections;
27 import java.util.HashSet;
28 import java.util.Map;
29 import java.util.Set;
30 import java.util.regex.Matcher;
31 import java.util.regex.Pattern;
32
33 import org.apache.tika.exception.TikaException;
34 import org.apache.tika.io.IOUtils;
35 import org.apache.tika.io.NullOutputStream;
36 import org.apache.tika.io.TemporaryResources;
37 import org.apache.tika.io.TikaInputStream;
38 import org.apache.tika.metadata.Metadata;
39 import org.apache.tika.mime.MediaType;
40 import org.apache.tika.parser.AbstractParser;
41 import org.apache.tika.parser.ParseContext;
42 import org.apache.tika.sax.XHTMLContentHandler;
43 import org.xml.sax.ContentHandler;
44 import org.xml.sax.SAXException;
45
46 /**
47 * Parser that uses an external program (like catdoc or pdf2txt) to extract
48 * text content and metadata from a given document.
49 */
50 public class ExternalParser extends AbstractParser {
51 private static final long serialVersionUID = -1079128990650687037L;
52
53 /**
54 * The token, which if present in the Command string, will
55 * be replaced with the input filename.
56 * Alternately, the input data can be streamed over STDIN.
57 */
58 public static final String INPUT_FILE_TOKEN = "${INPUT}";
59 /**
60 * The token, which if present in the Command string, will
61 * be replaced with the output filename.
62 * Alternately, the output data can be collected on STDOUT.
63 */
64 public static final String OUTPUT_FILE_TOKEN = "${OUTPUT}";
65
66 /**
67 * Media types supported by the external program.
68 */
69 private Set<MediaType> supportedTypes = Collections.emptySet();
70
71 /**
72 * Regular Expressions to run over STDOUT to
73 * extract Metadata.
74 */
75 private Map<Pattern,String> metadataPatterns = null;
76
77 /**
78 * The external command to invoke.
79 * @see Runtime#exec(String[])
80 */
81 private String[] command = new String[] { "cat" };
82
83 public Set<MediaType> getSupportedTypes(ParseContext context) {
84 return getSupportedTypes();
85 }
86
87 public Set<MediaType> getSupportedTypes() {
88 return supportedTypes;
89 }
90
91 public void setSupportedTypes(Set<MediaType> supportedTypes) {
92 this.supportedTypes =
93 Collections.unmodifiableSet(new HashSet<MediaType>(supportedTypes));
94 }
95
96
97 public String[] getCommand() {
98 return command;
99 }
100
101 /**
102 * Sets the command to be run. This can include either of
103 * {@link #INPUT_FILE_TOKEN} or {@link #OUTPUT_FILE_TOKEN}
104 * if the command needs filenames.
105 * @see Runtime#exec(String[])
106 */
107 public void setCommand(String... command) {
108 this.command = command;
109 }
110
111
112 public Map<Pattern,String> getMetadataExtractionPatterns() {
113 return metadataPatterns;
114 }
115
116 /**
117 * Sets the map of regular expression patterns and Metadata
118 * keys. Any matching patterns will have the matching
119 * metadata entries set.
120 * Set this to null to disable Metadata extraction.
121 */
122 public void setMetadataExtractionPatterns(Map<Pattern,String> patterns) {
123 this.metadataPatterns = patterns;
124 }
125
126
127 /**
128 * Executes the configured external command and passes the given document
129 * stream as a simple XHTML document to the given SAX content handler.
130 * Metadata is only extracted if {@link #setMetadataExtractionPatterns(Map)}
131 * has been called to set patterns.
132 */
133 public void parse(
134 InputStream stream, ContentHandler handler,
135 Metadata metadata, ParseContext context)
136 throws IOException, SAXException, TikaException {
137 XHTMLContentHandler xhtml =
138 new XHTMLContentHandler(handler, metadata);
139
140 TemporaryResources tmp = new TemporaryResources();
141 try {
142 parse(TikaInputStream.get(stream, tmp),
143 xhtml, metadata, tmp);
144 } finally {
145 tmp.dispose();
146 }
147 }
148
149 private void parse(
150 TikaInputStream stream, XHTMLContentHandler xhtml,
151 Metadata metadata, TemporaryResources tmp)
152 throws IOException, SAXException, TikaException {
153 boolean inputToStdIn = true;
154 boolean outputFromStdOut = true;
155 boolean hasPatterns = (metadataPatterns != null && !metadataPatterns.isEmpty());
156
157 File output = null;
158
159 // Build our command
160 String[] cmd = new String[command.length];
161 System.arraycopy(command, 0, cmd, 0, command.length);
162 for(int i=0; i<cmd.length; i++) {
163 if(cmd[i].indexOf(INPUT_FILE_TOKEN) != -1) {
164 cmd[i] = cmd[i].replace(INPUT_FILE_TOKEN, stream.getFile().getPath());
165 inputToStdIn = false;
166 }
167 if(cmd[i].indexOf(OUTPUT_FILE_TOKEN) != -1) {
168 output = tmp.createTemporaryFile();
169 outputFromStdOut = false;
170 }
171 }
172
173 // Execute
174 Process process;
175 if(cmd.length == 1) {
176 process = Runtime.getRuntime().exec( cmd[0] );
177 } else {
178 process = Runtime.getRuntime().exec( cmd );
179 }
180
181 try {
182 if(inputToStdIn) {
183 sendInput(process, stream);
184 } else {
185 process.getOutputStream().close();
186 }
187
188 InputStream out = process.getInputStream();
189 InputStream err = process.getErrorStream();
190
191 if(hasPatterns) {
192 extractMetadata(err, metadata);
193
194 if(outputFromStdOut) {
195 extractOutput(out, xhtml);
196 } else {
197 extractMetadata(out, metadata);
198 }
199 } else {
200 ignoreStream(err);
201
202 if(outputFromStdOut) {
203 extractOutput(out, xhtml);
204 } else {
205 ignoreStream(out);
206 }
207 }
208 } finally {
209 try {
210 process.waitFor();
211 } catch (InterruptedException ignore) {
212 }
213 }
214
215 // Grab the output if we haven't already
216 if (!outputFromStdOut) {
217 extractOutput(new FileInputStream(output), xhtml);
218 }
219 }
220
221 /**
222 * Starts a thread that extracts the contents of the standard output
223 * stream of the given process to the given XHTML content handler.
224 * The standard output stream is closed once fully processed.
225 *
226 * @param process process
227 * @param xhtml XHTML content handler
228 * @throws SAXException if the XHTML SAX events could not be handled
229 * @throws IOException if an input error occurred
230 */
231 private void extractOutput(InputStream stream, XHTMLContentHandler xhtml)
232 throws SAXException, IOException {
233 Reader reader = new InputStreamReader(stream);
234 try {
235 xhtml.startDocument();
236 xhtml.startElement("p");
237 char[] buffer = new char[1024];
238 for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
239 xhtml.characters(buffer, 0, n);
240 }
241 xhtml.endElement("p");
242 xhtml.endDocument();
243 } finally {
244 reader.close();
245 }
246 }
247
248 /**
249 * Starts a thread that sends the contents of the given input stream
250 * to the standard input stream of the given process. Potential
251 * exceptions are ignored, and the standard input stream is closed
252 * once fully processed. Note that the given input stream is <em>not</em>
253 * closed by this method.
254 *
255 * @param process process
256 * @param stream input stream
257 */
258 private void sendInput(final Process process, final InputStream stream) {
259 new Thread() {
260 public void run() {
261 OutputStream stdin = process.getOutputStream();
262 try {
263 IOUtils.copy(stream, stdin);
264 } catch (IOException e) {
265 }
266 }
267 }.start();
268 }
269
270 /**
271 * Starts a thread that reads and discards the contents of the
272 * standard stream of the given process. Potential exceptions
273 * are ignored, and the stream is closed once fully processed.
274 *
275 * @param process process
276 */
277 private void ignoreStream(final InputStream stream) {
278 new Thread() {
279 public void run() {
280 try {
281 IOUtils.copy(stream, new NullOutputStream());
282 } catch (IOException e) {
283 } finally {
284 IOUtils.closeQuietly(stream);
285 }
286 }
287 }.start();
288 }
289
290 private void extractMetadata(final InputStream stream, final Metadata metadata) {
291 new Thread() {
292 public void run() {
293 BufferedReader reader = new BufferedReader(new InputStreamReader(stream));
294 try {
295 String line;
296 while ( (line = reader.readLine()) != null ) {
297 for(Pattern p : metadataPatterns.keySet()) {
298 Matcher m = p.matcher(line);
299 if(m.find()) {
300 metadata.add( metadataPatterns.get(p), m.group(1) );
301 }
302 }
303 }
304 } catch (IOException e) {
305 } finally {
306 IOUtils.closeQuietly(reader);
307 IOUtils.closeQuietly(stream);
308 }
309 }
310 }.start();
311 }
312
313 /**
314 * Checks to see if the command can be run. Typically used with
315 * something like "myapp --version" to check to see if "myapp"
316 * is installed and on the path.
317 *
318 * @param checkCmd The check command to run
319 * @param errorValue What is considered an error value?
320 */
321 public static boolean check(String checkCmd, int... errorValue) {
322 return check(new String[] {checkCmd}, errorValue);
323 }
324 public static boolean check(String[] checkCmd, int... errorValue) {
325 if(errorValue.length == 0) {
326 errorValue = new int[] { 127 };
327 }
328
329 try {
330 Process process;
331 if(checkCmd.length == 1) {
332 process = Runtime.getRuntime().exec(checkCmd[0]);
333 } else {
334 process = Runtime.getRuntime().exec(checkCmd);
335 }
336 int result = process.waitFor();
337
338 for(int err : errorValue) {
339 if(result == err) return false;
340 }
341 return true;
342 } catch(IOException e) {
343 // Some problem, command is there or is broken
344 return false;
345 } catch (InterruptedException ie) {
346 // Some problem, command is there or is broken
347 return false;
348 }
349 }
350 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.external;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.util.ArrayList;
21 import java.util.HashMap;
22 import java.util.HashSet;
23 import java.util.List;
24 import java.util.Map;
25 import java.util.Set;
26 import java.util.StringTokenizer;
27 import java.util.regex.Pattern;
28
29 import javax.xml.parsers.DocumentBuilder;
30 import javax.xml.parsers.DocumentBuilderFactory;
31 import javax.xml.parsers.ParserConfigurationException;
32
33 import org.apache.tika.exception.TikaException;
34 import org.apache.tika.mime.MediaType;
35 import org.apache.tika.mime.MimeTypeException;
36 import org.w3c.dom.Document;
37 import org.w3c.dom.Element;
38 import org.w3c.dom.Node;
39 import org.w3c.dom.NodeList;
40 import org.xml.sax.InputSource;
41 import org.xml.sax.SAXException;
42
43 /**
44 * Builds up ExternalParser instances based on XML file(s)
45 * which define what to run, for what, and how to process
46 * any output metadata.
47 * Typically used to configure up a series of external programs
48 * (like catdoc or pdf2txt) to extract text content from documents.
49 *
50 * <pre>
51 * TODO XML DTD Here
52 * </pre>
53 */
54 public final class ExternalParsersConfigReader implements ExternalParsersConfigReaderMetKeys {
55
56 public static List<ExternalParser> read(InputStream stream) throws TikaException, IOException {
57 try {
58 DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
59 DocumentBuilder builder = factory.newDocumentBuilder();
60 Document document = builder.parse(new InputSource(stream));
61 return read(document);
62 } catch (ParserConfigurationException e) {
63 throw new TikaException("Unable to create an XML parser", e);
64 } catch (SAXException e) {
65 throw new TikaException("Invalid parser configuration", e);
66 }
67 }
68
69 public static List<ExternalParser> read(Document document) throws TikaException, IOException {
70 return read(document.getDocumentElement());
71 }
72
73 public static List<ExternalParser> read(Element element) throws TikaException, IOException {
74 List<ExternalParser> parsers = new ArrayList<ExternalParser>();
75
76 if (element != null && element.getTagName().equals(EXTERNAL_PARSERS_TAG)) {
77 NodeList nodes = element.getChildNodes();
78 for (int i = 0; i < nodes.getLength(); i++) {
79 Node node = nodes.item(i);
80 if (node.getNodeType() == Node.ELEMENT_NODE) {
81 Element child = (Element) node;
82 if (child.getTagName().equals(PARSER_TAG)) {
83 ExternalParser p = readParser(child);
84 if(p != null) {
85 parsers.add( p );
86 }
87 }
88 }
89 }
90 } else {
91 throw new MimeTypeException(
92 "Not a <" + EXTERNAL_PARSERS_TAG + "/> configuration document: "
93 + element.getTagName());
94 }
95
96 return parsers;
97 }
98
99 /**
100 * Builds and Returns an ExternalParser, or null if a check
101 * command was given that didn't match.
102 */
103 private static ExternalParser readParser(Element parserDef) throws TikaException {
104 ExternalParser parser = new ExternalParser();
105
106 NodeList children = parserDef.getChildNodes();
107 for(int i=0; i<children.getLength(); i++) {
108 Node node = children.item(i);
109 if (node.getNodeType() == Node.ELEMENT_NODE) {
110 Element child = (Element) node;
111 if (child.getTagName().equals(CHECK_TAG)) {
112 boolean present = readCheckTagAndCheck(child);
113 if(! present) {
114 return null;
115 }
116 }
117 else if (child.getTagName().equals(COMMAND_TAG)) {
118 parser.setCommand( getString(child) );
119 }
120 else if (child.getTagName().equals(MIMETYPES_TAG)) {
121 parser.setSupportedTypes(
122 readMimeTypes(child)
123 );
124 }
125 else if (child.getTagName().equals(METADATA_TAG)) {
126 parser.setMetadataExtractionPatterns(
127 readMetadataPatterns(child)
128 );
129 }
130 }
131 }
132
133 return parser;
134 }
135
136 private static Set<MediaType> readMimeTypes(Element mimeTypes) {
137 Set<MediaType> types = new HashSet<MediaType>();
138
139 NodeList children = mimeTypes.getChildNodes();
140 for(int i=0; i<children.getLength(); i++) {
141 Node node = children.item(i);
142 if (node.getNodeType() == Node.ELEMENT_NODE) {
143 Element child = (Element) node;
144 if (child.getTagName().equals(MIMETYPE_TAG)) {
145 types.add( MediaType.parse( getString(child) ) );
146 }
147 }
148 }
149
150 return types;
151 }
152
153 private static Map<Pattern,String> readMetadataPatterns(Element metadataDef) {
154 Map<Pattern, String> metadata = new HashMap<Pattern, String>();
155
156 NodeList children = metadataDef.getChildNodes();
157 for(int i=0; i<children.getLength(); i++) {
158 Node node = children.item(i);
159 if (node.getNodeType() == Node.ELEMENT_NODE) {
160 Element child = (Element) node;
161 if (child.getTagName().equals(METADATA_MATCH_TAG)) {
162 String metadataKey = child.getAttribute(METADATA_KEY_ATTR);
163 Pattern pattern = Pattern.compile( getString(child) );
164 metadata.put(pattern, metadataKey);
165 }
166 }
167 }
168
169 return metadata;
170 }
171
172 private static boolean readCheckTagAndCheck(Element checkDef) {
173 String command = null;
174 List<Integer> errorVals = new ArrayList<Integer>();
175
176 NodeList children = checkDef.getChildNodes();
177 for(int i=0; i<children.getLength(); i++) {
178 Node node = children.item(i);
179 if (node.getNodeType() == Node.ELEMENT_NODE) {
180 Element child = (Element) node;
181 if (child.getTagName().equals(COMMAND_TAG)) {
182 command = getString(child);
183 }
184 if (child.getTagName().equals(ERROR_CODES_TAG)) {
185 String errs = getString(child);
186 StringTokenizer st = new StringTokenizer(errs);
187 while(st.hasMoreElements()) {
188 try {
189 String s = st.nextToken();
190 errorVals.add(Integer.parseInt(s));
191 } catch(NumberFormatException e) {}
192 }
193 }
194 }
195 }
196
197 if(command != null) {
198 int[] errVals = new int[errorVals.size()];
199 for(int i=0; i<errVals.length; i++) {
200 errVals[i] = errorVals.get(i);
201 }
202
203 return ExternalParser.check(command, errVals);
204 }
205
206 // No check command, so assume it's there
207 return true;
208 }
209
210 private static String getString(Element element) {
211 StringBuffer s = new StringBuffer();
212
213 NodeList children = element.getChildNodes();
214 for(int i=0; i<children.getLength(); i++) {
215 Node node = children.item(i);
216 if (node.getNodeType() == Node.TEXT_NODE) {
217 s.append( node.getNodeValue() );
218 }
219 }
220
221 return s.toString();
222 }
223 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.external;
17
/**
 * Names of the XML elements and attributes read by the
 * {@link ExternalParsersConfigReader}.
 */
public interface ExternalParsersConfigReaderMetKeys {

    /** Root element of an external parsers configuration document. */
    String EXTERNAL_PARSERS_TAG = "external-parsers";

    /** Element that defines a single external parser. */
    String PARSER_TAG = "parser";

    /**
     * Element holding a command: the command to run when inside a parser
     * element, the check command when inside a check element.
     */
    String COMMAND_TAG = "command";

    /** Element that defines how to check that the program can be run. */
    String CHECK_TAG = "check";

    /**
     * Element inside a check element listing, separated by whitespace,
     * the exit values that are considered an error.
     */
    String ERROR_CODES_TAG = "error-codes";

    /** Element that wraps the list of supported media types. */
    String MIMETYPES_TAG = "mime-types";

    /** Element holding a single supported media type. */
    String MIMETYPE_TAG = "mime-type";

    /** Element that wraps the metadata extraction patterns. */
    String METADATA_TAG = "metadata";

    /** Element whose text is one metadata extraction regular expression. */
    String METADATA_MATCH_TAG = "match";

    /** Attribute of a match element naming the metadata key to set. */
    String METADATA_KEY_ATTR = "key";
}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.external;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.net.URL;
21 import java.util.ArrayList;
22 import java.util.Collections;
23 import java.util.Enumeration;
24 import java.util.List;
25 import java.util.Map;
26
27 import org.apache.tika.config.ServiceLoader;
28 import org.apache.tika.config.TikaConfig;
29 import org.apache.tika.exception.TikaException;
30 import org.apache.tika.mime.MediaType;
31 import org.apache.tika.parser.CompositeParser;
32 import org.apache.tika.parser.Parser;
33
34 /**
35 * Creates instances of ExternalParser based on XML
36 * configuration files.
37 *
38 * @see ExternalParsersConfigReader
39 */
40 public class ExternalParsersFactory {
41
42 public static List<ExternalParser> create() throws IOException, TikaException {
43 return create(new ServiceLoader());
44 }
45
46 public static List<ExternalParser> create(ServiceLoader loader)
47 throws IOException, TikaException {
48 return create("tika-external-parsers.xml", loader);
49 }
50
51 public static List<ExternalParser> create(String filename, ServiceLoader loader)
52 throws IOException, TikaException {
53 String filepath = ExternalParsersFactory.class.getPackage().getName().replace('.', '/') +
54 "/" + filename;
55 Enumeration<URL> files = loader.findServiceResources(filepath);
56 ArrayList<URL> list = Collections.list(files);
57 URL[] urls = list.toArray(new URL[list.size()]);
58 return create(urls);
59 }
60
61 public static List<ExternalParser> create(URL... urls) throws IOException, TikaException {
62 List<ExternalParser> parsers = new ArrayList<ExternalParser>();
63 for(URL url : urls) {
64 InputStream stream = url.openStream();
65 try {
66 parsers.addAll(
67 ExternalParsersConfigReader.read(stream)
68 );
69 } finally {
70 stream.close();
71 }
72 }
73 return parsers;
74 }
75
76 public static void attachExternalParsers(TikaConfig config) throws IOException, TikaException {
77 attachExternalParsers( create(), config );
78 }
79
80 public static void attachExternalParsers(List<ExternalParser> parsers, TikaConfig config) {
81 Parser parser = config.getParser();
82 if (parser instanceof CompositeParser) {
83 CompositeParser cParser = (CompositeParser)parser;
84 Map<MediaType,Parser> parserMap = cParser.getParsers();
85 }
86 // TODO
87 }
88 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 /**
18 * External parser process.
19 */
20 @aQute.bnd.annotation.Version("1.0.0")
21 package org.apache.tika.parser.external;
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 /**
18 * Tika parsers.
19 */
20 @aQute.bnd.annotation.Version("1.0.0")
21 package org.apache.tika.parser;
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax;
17
18 import java.io.OutputStream;
19 import java.io.Writer;
20
21 import org.apache.tika.sax.xpath.Matcher;
22 import org.apache.tika.sax.xpath.MatchingContentHandler;
23 import org.apache.tika.sax.xpath.XPathParser;
24 import org.xml.sax.ContentHandler;
25 import org.xml.sax.SAXException;
26
27 /**
28 * Content handler decorator that only passes everything inside
29 * the XHTML &lt;body/&gt; tag to the underlying handler. Note that
30 * the &lt;body/&gt; tag itself is <em>not</em> passed on.
31 */
32 public class BodyContentHandler extends ContentHandlerDecorator {
33
34 /**
35 * XHTML XPath parser.
36 */
37 private static final XPathParser PARSER =
38 new XPathParser("xhtml", XHTMLContentHandler.XHTML);
39
40 /**
41 * The XPath matcher used to select the XHTML body contents.
42 */
43 private static final Matcher MATCHER =
44 PARSER.parse("/xhtml:html/xhtml:body/descendant::node()");
45
46 /**
47 * Creates a content handler that passes all XHTML body events to the
48 * given underlying content handler.
49 *
50 * @param handler content handler
51 */
52 public BodyContentHandler(ContentHandler handler) {
53 super(new MatchingContentHandler(handler, MATCHER));
54 }
55
56 /**
57 * Creates a content handler that writes XHTML body character events to
58 * the given writer.
59 *
60 * @param writer writer
61 */
62 public BodyContentHandler(Writer writer) {
63 this(new WriteOutContentHandler(writer));
64 }
65
66 /**
67 * Creates a content handler that writes XHTML body character events to
68 * the given output stream using the default encoding.
69 *
70 * @param stream output stream
71 */
72 public BodyContentHandler(OutputStream stream) {
73 this(new WriteOutContentHandler(stream));
74 }
75
76 /**
77 * Creates a content handler that writes XHTML body character events to
78 * an internal string buffer. The contents of the buffer can be retrieved
79 * using the {@link #toString()} method.
80 * <p>
81 * The internal string buffer is bounded at the given number of characters.
82 * If this write limit is reached, then a {@link SAXException} is thrown.
83 *
84 * @since Apache Tika 0.7
85 * @param writeLimit maximum number of characters to include in the string,
86 * or -1 to disable the write limit
87 */
88 public BodyContentHandler(int writeLimit) {
89 this(new WriteOutContentHandler(writeLimit));
90 }
91
92 /**
93 * Creates a content handler that writes XHTML body character events to
94 * an internal string buffer. The contents of the buffer can be retrieved
95 * using the {@link #toString()} method.
96 * <p>
97 * The internal string buffer is bounded at 100k characters. If this write
98 * limit is reached, then a {@link SAXException} is thrown.
99 */
100 public BodyContentHandler() {
101 this(new WriteOutContentHandler());
102 }
103
104 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax;
17
18 import org.xml.sax.Attributes;
19 import org.xml.sax.ContentHandler;
20 import org.xml.sax.Locator;
21 import org.xml.sax.SAXException;
22 import org.xml.sax.helpers.DefaultHandler;
23
/**
 * Base class for decorators of the {@link ContentHandler} interface. Every
 * SAX event received by this class is passed on unchanged to the decorated
 * handler; subclasses add behaviour by overriding the event methods they
 * care about.
 */
public class ContentHandlerDecorator extends DefaultHandler {

    /**
     * The handler that currently receives all forwarded SAX events.
     */
    private ContentHandler handler;

    /**
     * Creates a decorator that initially discards all events by forwarding
     * them to a no-op handler. Subclasses are expected to install a real
     * target with {@link #setContentHandler(ContentHandler)}.
     */
    protected ContentHandlerDecorator() {
        this(new DefaultHandler());
    }

    /**
     * Creates a decorator around the given SAX event handler.
     *
     * @param handler SAX event handler to be decorated
     */
    public ContentHandlerDecorator(ContentHandler handler) {
        assert handler != null;
        this.handler = handler;
    }

    /**
     * Replaces the decorated handler. Events received after this call go
     * to the new handler rather than the previous one.
     *
     * @param handler content handler
     */
    protected void setContentHandler(ContentHandler handler) {
        assert handler != null;
        this.handler = handler;
    }

    /**
     * Single hook through which every {@link SAXException} raised by the
     * decorated handler passes. The default implementation re-throws the
     * exception as-is; subclasses may override it to handle the failure in
     * some other way.
     *
     * @param exception the exception that was thrown
     * @throws SAXException the exception (if any) thrown to the client
     */
    protected void handleException(SAXException exception) throws SAXException {
        throw exception;
    }

    /**
     * Returns the string representation of the decorated handler.
     */
    @Override
    public String toString() {
        return this.handler.toString();
    }

    //--------------------------------------------------< document events >

    @Override
    public void setDocumentLocator(Locator locator) {
        // Declares no SAXException, so there is nothing to route through
        // handleException here.
        this.handler.setDocumentLocator(locator);
    }

    @Override
    public void startDocument() throws SAXException {
        try {
            this.handler.startDocument();
        } catch (SAXException failure) {
            handleException(failure);
        }
    }

    @Override
    public void endDocument() throws SAXException {
        try {
            this.handler.endDocument();
        } catch (SAXException failure) {
            handleException(failure);
        }
    }

    //-------------------------------------------------< namespace events >

    @Override
    public void startPrefixMapping(String prefix, String uri)
            throws SAXException {
        try {
            this.handler.startPrefixMapping(prefix, uri);
        } catch (SAXException failure) {
            handleException(failure);
        }
    }

    @Override
    public void endPrefixMapping(String prefix) throws SAXException {
        try {
            this.handler.endPrefixMapping(prefix);
        } catch (SAXException failure) {
            handleException(failure);
        }
    }

    //---------------------------------------------------< element events >

    @Override
    public void startElement(
            String uri, String localName, String name, Attributes atts)
            throws SAXException {
        try {
            this.handler.startElement(uri, localName, name, atts);
        } catch (SAXException failure) {
            handleException(failure);
        }
    }

    @Override
    public void endElement(String uri, String localName, String name)
            throws SAXException {
        try {
            this.handler.endElement(uri, localName, name);
        } catch (SAXException failure) {
            handleException(failure);
        }
    }

    //---------------------------------------------------< content events >

    @Override
    public void characters(char[] ch, int start, int length)
            throws SAXException {
        try {
            this.handler.characters(ch, start, length);
        } catch (SAXException failure) {
            handleException(failure);
        }
    }

    @Override
    public void ignorableWhitespace(char[] ch, int start, int length)
            throws SAXException {
        try {
            this.handler.ignorableWhitespace(ch, start, length);
        } catch (SAXException failure) {
            handleException(failure);
        }
    }

    @Override
    public void processingInstruction(String target, String data)
            throws SAXException {
        try {
            this.handler.processingInstruction(target, data);
        } catch (SAXException failure) {
            handleException(failure);
        }
    }

    @Override
    public void skippedEntity(String name) throws SAXException {
        try {
            this.handler.skippedEntity(name);
        } catch (SAXException failure) {
            handleException(failure);
        }
    }

}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax;
17
18 import java.util.Map;
19 import java.util.Collections;
20 import javax.xml.namespace.QName;
21
22 import org.xml.sax.SAXException;
23 import org.xml.sax.ContentHandler;
24 import org.xml.sax.Attributes;
25 import org.xml.sax.helpers.AttributesImpl;
26
27 /**
28 * Content handler decorator that maps element <code>QName</code>s using
29 * a <code>Map</code>. Not mappable elements are not forwarded.
30 * Attributes may also be mapped (for each element different using
31 * a <code>Map</code> for attributes), not mappable attributes are not
32 * forwarded. The default is to not map any attributes and therefore do
33 * not forward any of them.
34 */
35 public class ElementMappingContentHandler extends ContentHandlerDecorator {
36
37 private final Map<QName, TargetElement> mappings;
38
39 public ElementMappingContentHandler(
40 ContentHandler handler, Map<QName, TargetElement> mappings) {
41 super(handler);
42 this.mappings = mappings;
43 }
44
45 @Override
46 public void startElement(
47 String namespaceURI, String localName, String qName,
48 Attributes atts) throws SAXException {
49 TargetElement mapping =
50 mappings.get(new QName(namespaceURI, localName));
51 if (mapping != null) {
52 QName tag = mapping.getMappedTagName();
53 super.startElement(
54 tag.getNamespaceURI(), tag.getLocalPart(),
55 getQNameAsString(tag), mapping.mapAttributes(atts));
56 }
57 }
58
59 @Override
60 public void endElement(String namespaceURI, String localName, String qName)
61 throws SAXException {
62 TargetElement mapping =
63 mappings.get(new QName(namespaceURI, localName));
64 if (mapping != null) {
65 QName tag=mapping.getMappedTagName();
66 super.endElement(
67 tag.getNamespaceURI(), tag.getLocalPart(),
68 getQNameAsString(tag));
69 }
70 }
71
72 protected static final String getQNameAsString(QName qname) {
73 String prefix = qname.getPrefix();
74 if (prefix.length() > 0) {
75 return prefix + ":" + qname.getLocalPart();
76 } else {
77 return qname.getLocalPart();
78 }
79 }
80
81 public static class TargetElement {
82
83 /**
84 * Creates an TargetElement, attributes of this element will
85 * be mapped as specified
86 */
87 public TargetElement(
88 QName mappedTagName, Map<QName, QName> attributesMapping) {
89 this.mappedTagName = mappedTagName;
90 this.attributesMapping = attributesMapping;
91 }
92
93 /**
94 * A shortcut that automatically creates the QName object
95 */
96 public TargetElement(
97 String mappedTagURI, String mappedTagLocalName,
98 Map<QName, QName> attributesMapping) {
99 this(new QName(mappedTagURI, mappedTagLocalName), attributesMapping);
100 }
101
102 /**
103 * Creates an TargetElement with no attributes, all attributes
104 * will be deleted from SAX stream
105 */
106 public TargetElement(QName mappedTagName) {
107 this(mappedTagName, Collections.<QName,QName>emptyMap());
108 }
109
110 /** A shortcut that automatically creates the QName object */
111 public TargetElement(String mappedTagURI, String mappedTagLocalName) {
112 this(mappedTagURI, mappedTagLocalName,
113 Collections.<QName,QName>emptyMap());
114 }
115
116 public QName getMappedTagName() {
117 return mappedTagName;
118 }
119
120 public Map<QName, QName> getAttributesMapping() {
121 return attributesMapping;
122 }
123
124 public Attributes mapAttributes(final Attributes atts) {
125 AttributesImpl natts = new AttributesImpl();
126 for (int i = 0; i < atts.getLength(); i++) {
127 QName name = attributesMapping.get(
128 new QName(atts.getURI(i), atts.getLocalName(i)));
129 if (name!=null) {
130 natts.addAttribute(
131 name.getNamespaceURI(), name.getLocalPart(),
132 getQNameAsString(name),
133 atts.getType(i), atts.getValue(i));
134 }
135 }
136 return natts;
137 }
138
139 private final QName mappedTagName;
140
141 private final Map<QName, QName> attributesMapping;
142
143 }
144
145 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax;
17
18 import org.xml.sax.ContentHandler;
19
20 /**
21 * Content handler decorator that prevents the {@link #startDocument()}
22 * and {@link #endDocument()} events from reaching the decorated handler.
23 * This is useful when you want to direct the results of parsing multiple
24 * different XML documents into a single target document without worrying
25 * about the {@link #startDocument()} and {@link #endDocument()} methods
26 * being called more than once.
27 */
28 public class EmbeddedContentHandler extends ContentHandlerDecorator {
29
30 /**
31 * Created a decorator that prevents the given handler from
32 * receiving {@link #startDocument()} and {@link #endDocument()}
33 * events.
34 *
35 * @param handler the content handler to be decorated
36 */
37 public EmbeddedContentHandler(ContentHandler handler) {
38 super(handler);
39 }
40
41 /**
42 * Ignored.
43 */
44 @Override
45 public void startDocument() {
46 }
47
48 /**
49 * Ignored.
50 */
51 @Override
52 public void endDocument() {
53 }
54
55 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax;
17
18 import org.xml.sax.ContentHandler;
19 import org.xml.sax.SAXException;
20
21 /**
22 * A wrapper around a {@link ContentHandler} which will ignore normal
23 * SAX calls to {@link #endDocument()}, and only fire them later.
24 * This is typically used to ensure that we can output the metadata
25 * before ending the document
26 */
27 public class EndDocumentShieldingContentHandler extends ContentHandlerDecorator {
28 private boolean endDocumentCalled;
29
30 /**
31 * Creates a decorator for the given SAX event handler.
32 *
33 * @param handler SAX event handler to be decorated
34 */
35 public EndDocumentShieldingContentHandler(ContentHandler handler) {
36 super(handler);
37 endDocumentCalled = false;
38 }
39
40 @Override
41 public void endDocument() throws SAXException {
42 endDocumentCalled = true;
43 }
44
45 public void reallyEndDocument() throws SAXException {
46 super.endDocument();
47 }
48
49 public boolean getEndDocumentWasCalled() {
50 return endDocumentCalled;
51 }
52 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax;
17
18 import javax.xml.transform.sax.TransformerHandler;
19
20 import org.xml.sax.Attributes;
21 import org.xml.sax.ContentHandler;
22 import org.xml.sax.SAXException;
23
24 /**
25 * Content handler decorator which wraps a {@link TransformerHandler} in order to
26 * allow the <code>TITLE</code> tag to render as <code>&lt;title&gt;&lt;/title&gt;</code>
27 * rather than <code>&lt;title/&gt;</code> which is accomplished
28 * by calling the {@link TransformerHandler#characters(char[], int, int)} method
29 * with a <code>length</code> of 1 but a zero length char array.
30 * <p>
31 * This workaround is an unfortunate circumstance of the limitations imposed by the
32 * implementation of the XML serialization code in the JDK brought over from
33 * the xalan project which no longer allows for the specification of an
34 * alternate <code>content-handler</code> via xslt templates or other means.
35 *
36 * @see <a href="https://issues.apache.org/jira/browse/TIKA-725">TIKA-725</a>
37 */
38 public class ExpandedTitleContentHandler extends ContentHandlerDecorator {
39
40 private boolean isTitleTagOpen;
41 private static final String TITLE_TAG = "TITLE";
42
43 public ExpandedTitleContentHandler() {
44 super();
45 }
46
47 public ExpandedTitleContentHandler(ContentHandler handler) {
48 super(handler);
49 }
50
51 @Override
52 public void startDocument() throws SAXException {
53 super.startDocument();
54 isTitleTagOpen = false;
55 }
56
57 @Override
58 public void startElement(String uri, String localName, String qName,
59 Attributes atts) throws SAXException {
60 super.startElement(uri, localName, qName, atts);
61 if (TITLE_TAG.equalsIgnoreCase(localName) && XHTMLContentHandler.XHTML.equals(uri)) {
62 isTitleTagOpen = true;
63 }
64 }
65
66 @Override
67 public void endElement(String uri, String localName, String qName)
68 throws SAXException {
69 super.endElement(uri, localName, qName);
70 if (TITLE_TAG.equalsIgnoreCase(localName) && XHTMLContentHandler.XHTML.equals(uri)) {
71 isTitleTagOpen = false;
72 }
73 }
74
75 @Override
76 public void characters(char[] ch, int start, int length)
77 throws SAXException {
78 if (isTitleTagOpen && length == 0) {
79 // Hack to close the title tag
80 try {
81 super.characters(new char[0], 0, 1);
82 } catch (ArrayIndexOutOfBoundsException e) {
83 // Expected, just wanted to close the title tag
84 }
85 } else {
86 super.characters(ch, start, length);
87 }
88 }
89
90 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax;
17
/**
 * Immutable value describing a single link: the kind of element it came
 * from, its target URI, its title, its text and its <code>rel</code> value.
 */
public class Link {

    private final String type;

    private final String uri;

    private final String title;

    private final String text;

    private final String rel;

    /**
     * Creates a link with an empty <code>rel</code> value.
     */
    public Link(String type, String uri, String title, String text) {
        this(type, uri, title, text, "");
    }

    /**
     * Creates a link with all properties given explicitly.
     */
    public Link(String type, String uri, String title, String text, String rel) {
        this.type = type;
        this.uri = uri;
        this.title = title;
        this.text = text;
        this.rel = rel;
    }

    /** @return <code>true</code> if the link type is <code>a</code> */
    public boolean isAnchor() {
        return "a".equals(type);
    }

    /** @return <code>true</code> if the link type is <code>img</code> */
    public boolean isImage() {
        return "img".equals(type);
    }

    public String getType() {
        return type;
    }

    public String getUri() {
        return uri;
    }

    public String getTitle() {
        return title;
    }

    public String getText() {
        return text;
    }

    public String getRel() {
        return rel;
    }

    /**
     * Renders the link in a tag-like notation. Images become
     * <code>&lt;img src="..."/&gt;</code> with optional title and alt
     * parts; every other type becomes an element with an
     * <code>href</code>, optional title and rel parts, and the text as its
     * content. Values are written as-is, without any escaping.
     */
    @Override
    public String toString() {
        StringBuilder out = new StringBuilder();
        if (isImage()) {
            out.append("<img src=\"").append(uri);
            appendIfPresent(out, "\" title=\"", title);
            appendIfPresent(out, "\" alt=\"", text);
            out.append("\"/>");
        } else {
            out.append("<").append(type).append(" href=\"").append(uri);
            appendIfPresent(out, "\" title=\"", title);
            appendIfPresent(out, "\" rel=\"", rel);
            out.append("\">").append(text);
            out.append("</").append(type).append(">");
        }
        return out.toString();
    }

    /**
     * Appends the opener followed by the value, but only when the value is
     * neither <code>null</code> nor empty.
     */
    private static void appendIfPresent(
            StringBuilder out, String opener, String value) {
        if (value != null && value.length() > 0) {
            out.append(opener).append(value);
        }
    }

}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax;
17
18 class LinkBuilder {
19
20 private final String type;
21
22 private String uri = "";
23
24 private String title = "";
25
26 private String rel = "";
27
28 private final StringBuilder text = new StringBuilder();
29
30 public LinkBuilder(String type) {
31 this.type = type;
32 }
33
34 public void setURI(String uri) {
35 if (uri != null) {
36 this.uri = uri;
37 } else {
38 this.uri = "";
39 }
40 }
41
42 public void setTitle(String title) {
43 if (title != null) {
44 this.title = title;
45 } else {
46 this.title = "";
47 }
48 }
49
50 public void setRel(String rel) {
51 if (rel != null) {
52 this.rel = rel;
53 } else {
54 this.rel = "";
55 }
56 }
57
58 public void characters(char[] ch, int offset, int length) {
59 text.append(ch, offset, length);
60 }
61
62 public Link getLink() {
63 return getLink(false);
64 }
65
66 public Link getLink(boolean collapseWhitespace) {
67 String anchor = text.toString();
68
69 if (collapseWhitespace) {
70 anchor = anchor.replaceAll("\\s+", " ").trim();
71 }
72
73 return new Link(type, uri, title, anchor, rel);
74 }
75
76 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax;
17
18 import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
19
20 import java.util.ArrayList;
21 import java.util.LinkedList;
22 import java.util.List;
23
24 import org.xml.sax.Attributes;
25 import org.xml.sax.helpers.DefaultHandler;
26
27 /**
28 * Content handler that collects links from an XHTML document.
29 */
30 public class LinkContentHandler extends DefaultHandler {
31
32 /**
33 * Stack of link builders, one for each level of nested links currently
34 * being processed. A usual case of a nested link would be a hyperlinked
35 * image (<code>&a href="..."&gt;&lt;img src="..."&gt;&lt;&gt;</code>),
36 * but it's possible (though unlikely) for also other kinds of nesting
37 * to occur.
38 */
39 private final LinkedList<LinkBuilder> builderStack =
40 new LinkedList<LinkBuilder>();
41
42 /** Collected links */
43 private final List<Link> links = new ArrayList<Link>();
44
45 /** Whether to collapse whitespace in anchor text */
46 private boolean collapseWhitespaceInAnchor;
47
48 /**
49 * Default constructor
50 */
51 public LinkContentHandler() {
52 this(false);
53 }
54
55 /**
56 * Default constructor
57 *
58 * @boolean collapseWhitespaceInAnchor
59 */
60 public LinkContentHandler(boolean collapseWhitespaceInAnchor) {
61 super();
62
63 this.collapseWhitespaceInAnchor = collapseWhitespaceInAnchor;
64 }
65
66 /**
67 * Returns the list of collected links.
68 *
69 * @return collected links
70 */
71 public List<Link> getLinks() {
72 return links;
73 }
74
75 //-------------------------------------------------------< ContentHandler>
76
77 @Override
78 public void startElement(
79 String uri, String local, String name, Attributes attributes) {
80 if (XHTML.equals(uri)) {
81 if ("a".equals(local)) {
82 LinkBuilder builder = new LinkBuilder("a");
83 builder.setURI(attributes.getValue("", "href"));
84 builder.setTitle(attributes.getValue("", "title"));
85 builder.setRel(attributes.getValue("", "rel"));
86 builderStack.addFirst(builder);
87 } else if ("img".equals(local)) {
88 LinkBuilder builder = new LinkBuilder("img");
89 builder.setURI(attributes.getValue("", "src"));
90 builder.setTitle(attributes.getValue("", "title"));
91 builder.setRel(attributes.getValue("", "rel"));
92 builderStack.addFirst(builder);
93
94 String alt = attributes.getValue("", "alt");
95 if (alt != null) {
96 char[] ch = alt.toCharArray();
97 characters(ch, 0, ch.length);
98 }
99 }
100 }
101 }
102
103 @Override
104 public void characters(char[] ch, int start, int length) {
105 for (LinkBuilder builder : builderStack) {
106 builder.characters(ch, start, length);
107 }
108 }
109
110 @Override
111 public void ignorableWhitespace(char[] ch, int start, int length) {
112 characters(ch, start, length);
113 }
114
115 @Override
116 public void endElement(String uri, String local, String name) {
117 if (XHTML.equals(uri)) {
118 if ("a".equals(local) || "img".equals(local)) {
119 links.add(builderStack.removeFirst().getLink(collapseWhitespaceInAnchor));
120 }
121 }
122 }
123
124 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax;
17
18 import org.apache.tika.io.ClosedInputStream;
19 import org.xml.sax.ContentHandler;
20 import org.xml.sax.InputSource;
21
22 /**
23 * Content handler decorator that always returns an empty stream from the
24 * {@link #resolveEntity(String, String)} method to prevent potential
25 * network or other external resources from being accessed by an XML parser.
26 *
27 * @see <a href="https://issues.apache.org/jira/browse/TIKA-185">TIKA-185</a>
28 */
29 public class OfflineContentHandler extends ContentHandlerDecorator {
30
31 public OfflineContentHandler(ContentHandler handler) {
32 super(handler);
33 }
34
35 /**
36 * Returns an empty stream. This will make an XML parser silently
37 * ignore any external entities.
38 */
39 @Override
40 public InputSource resolveEntity(String publicId, String systemId) {
41 return new InputSource(new ClosedInputStream());
42 }
43
44 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax;
17
18 /*
19 import java.util.ArrayList;
20 import java.util.List;
21 */
22
23 import org.xml.sax.Attributes;
24 import org.xml.sax.ContentHandler;
25 import org.xml.sax.SAXException;
26 import org.xml.sax.helpers.AttributesImpl;
27
28 /**
29 * Content handler decorator that makes sure that the character events
30 * ({@link #characters(char[], int, int)} or
31 * {@link #ignorableWhitespace(char[], int, int)}) passed to the decorated
32 * content handler contain only valid XML characters. All invalid characters
33 * are replaced with spaces.
34 * <p>
35 * The XML standard defines the following Unicode character ranges as
36 * valid XML characters:
37 * <pre>
38 * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
39 * </pre>
40 * <p>
41 * Note that currently this class only detects those invalid characters whose
42 * UTF-16 representation fits a single char. Also, this class does not ensure
43 * that the UTF-16 encoding of incoming characters is correct.
44 */
45 public class SafeContentHandler extends ContentHandlerDecorator {
46
47 /**
48 * Replacement for invalid characters.
49 */
50 private static final char[] REPLACEMENT = new char[] { '\ufffd' };
51
52 /**
53 * Internal interface that allows both character and
54 * ignorable whitespace content to be filtered the same way.
55 */
56 protected interface Output {
57 void write(char[] ch, int start, int length) throws SAXException;
58 }
59
60 private static class StringOutput implements Output {
61
62 private final StringBuilder builder = new StringBuilder();
63
64 public void write(char[] ch, int start, int length) {
65 builder.append(ch, start, length);
66 }
67
68 public String toString() {
69 return builder.toString();
70 }
71
72 }
73
74 /**
75 * Output through the {@link ContentHandler#characters(char[], int, int)}
76 * method of the decorated content handler.
77 */
78 private final Output charactersOutput = new Output() {
79 public void write(char[] ch, int start, int length)
80 throws SAXException {
81 SafeContentHandler.super.characters(ch, start, length);
82 }
83 };
84
85 /**
86 * Output through the
87 * {@link ContentHandler#ignorableWhitespace(char[], int, int)}
88 * method of the decorated content handler.
89 */
90 private final Output ignorableWhitespaceOutput = new Output() {
91 public void write(char[] ch, int start, int length)
92 throws SAXException {
93 SafeContentHandler.super.ignorableWhitespace(ch, start, length);
94 }
95 };
96
97 public SafeContentHandler(ContentHandler handler) {
98 super(handler);
99 }
100
101 /**
102 * Filters and outputs the contents of the given input buffer. Any
103 * invalid characters in the input buffer area handled by sending a
104 * replacement (a space character) to the given output. Any sequences
105 * of valid characters are passed as-is to the given output.
106 *
107 * @param ch input buffer
108 * @param start start offset within the buffer
109 * @param length number of characters to read from the buffer
110 * @param output output channel
111 * @throws SAXException if the filtered characters could not be written out
112 */
113 private void filter(char[] ch, int start, int length, Output output)
114 throws SAXException {
115 int end = start + length;
116
117 int i = start;
118 while (i < end) {
119 int c = Character.codePointAt(ch, i, end);
120 int j = i + Character.charCount(c);
121
122 if (isInvalid(c)) {
123 // Output any preceding valid characters
124 if (i > start) {
125 output.write(ch, start, i - start);
126 }
127
128 // Output the replacement for this invalid character
129 writeReplacement(output);
130
131 // Continue with the rest of the array
132 start = j;
133 }
134
135 i = j;
136 }
137
138 // Output any remaining valid characters
139 output.write(ch, start, end - start);
140 }
141
142 /**
143 * Checks if the given string contains any invalid XML characters.
144 *
145 * @param value string to be checked
146 * @return <code>true</code> if the string contains invalid XML characters,
147 * <code>false</code> otherwise
148 */
149 private boolean isInvalid(String value) {
150 char[] ch = value.toCharArray();
151
152 int i = 0;
153 while (i < ch.length) {
154 int c = Character.codePointAt(ch, i);
155 if (isInvalid(c)) {
156 return true;
157 }
158 i = i + Character.charCount(c);
159 }
160
161 return false;
162 }
163
164 /**
165 * Checks whether the given Unicode character is an invalid XML character
166 * and should be replaced for output. Subclasses can override this method
167 * to use an alternative definition of which characters should be replaced
168 * in the XML output. The default definition from the XML specification is:
169 * <pre>
170 * Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
171 * </pre>
172 *
173 * @param ch character
174 * @return <code>true</code> if the character should be replaced,
175 * <code>false</code> otherwise
176 */
177 protected boolean isInvalid(int ch) {
178 if (ch < 0x20) {
179 return ch != 0x09 && ch != 0x0A && ch != 0x0D;
180 } else if (ch < 0xE000) {
181 return ch > 0xD7FF;
182 } else if (ch < 0x10000) {
183 return ch > 0xFFFD;
184 } else {
185 return ch > 0x10FFFF;
186 }
187 }
188
189 /**
190 * Outputs the replacement for an invalid character. Subclasses can
191 * override this method to use a custom replacement.
192 *
193 * @param output where the replacement is written to
194 * @throws SAXException if the replacement could not be written
195 */
196 protected void writeReplacement(Output output) throws SAXException {
197 output.write(REPLACEMENT, 0, REPLACEMENT.length);
198 }
199
200
201 /*
202 private final List<String> elements = new ArrayList<String>();
203
204 // Called only from assert
205 private boolean verifyStartElement(String name) {
206 // TODO: we could strengthen this to do full
207 // XTHML validation, eg you shouldn't start p inside
208 // another p (but ODF parser, at least, seems to
209 // violate this):
210 //if (name.equals("p")) {
211 //assert elements.size() == 0 || !elements.get(elements.size()-1).equals("p");
212 //}
213 elements.add(name);
214 return true;
215 }
216
217 // Called only from assert
218 private boolean verifyEndElement(String name) {
219 assert elements.size() > 0: "end tag=" + name + " with no startElement";
220 final String currentElement = elements.get(elements.size()-1);
221 assert currentElement.equals(name): "mismatched elements open=" + currentElement + " close=" + name;
222 elements.remove(elements.size()-1);
223 return true;
224 }
225
226 // Called only from assert
227 private boolean verifyEndDocument() {
228 assert elements.size() == 0;
229 return true;
230 }
231 */
232
233 //------------------------------------------------------< ContentHandler >
234
235 @Override
236 public void startElement(
237 String uri, String localName, String name, Attributes atts)
238 throws SAXException {
239 // TODO: enable this, but some parsers currently
240 // trip it
241 //assert verifyStartElement(name);
242 // Look for any invalid characters in attribute values.
243 for (int i = 0; i < atts.getLength(); i++) {
244 if (isInvalid(atts.getValue(i))) {
245 // Found an invalid character, so need to filter the attributes
246 AttributesImpl filtered = new AttributesImpl();
247 for (int j = 0; j < atts.getLength(); j++) {
248 String value = atts.getValue(j);
249 if (j >= i && isInvalid(value)) {
250 // Filter the attribute value when needed
251 Output buffer = new StringOutput();
252 filter(value.toCharArray(), 0, value.length(), buffer);
253 value = buffer.toString();
254 }
255 filtered.addAttribute(
256 atts.getURI(j), atts.getLocalName(j),
257 atts.getQName(j), atts.getType(j), value);
258 }
259 atts = filtered;
260 break;
261 }
262 }
263 super.startElement(uri, localName, name, atts);
264 }
265
266 @Override
267 public void endElement(String uri, String localName, String name)
268 throws SAXException {
269 // TODO: enable this, but some parsers currently
270 // trip it
271 //assert verifyEndElement(name);
272 super.endElement(uri, localName, name);
273 }
274
275 @Override
276 public void endDocument() throws SAXException {
277 // TODO: enable this, but some parsers currently
278 // trip it
279 //assert verifyEndDocument();
280 super.endDocument();
281 }
282
283 @Override
284 public void characters(char[] ch, int start, int length)
285 throws SAXException {
286 filter(ch, start, length, charactersOutput);
287 }
288
289 @Override
290 public void ignorableWhitespace(char[] ch, int start, int length)
291 throws SAXException {
292 filter(ch, start, length, ignorableWhitespaceOutput);
293 }
294
295 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax;
17
18 import java.io.IOException;
19 import java.util.LinkedList;
20
21 import org.apache.tika.exception.TikaException;
22 import org.apache.tika.io.TikaInputStream;
23 import org.xml.sax.Attributes;
24 import org.xml.sax.ContentHandler;
25 import org.xml.sax.SAXException;
26
27 /**
28 * Content handler decorator that attempts to prevent denial of service
29 * attacks against Tika parsers.
30 * <p>
31 * Currently this class simply compares the number of output characters
32 * to to the number of input bytes and keeps track of the XML nesting levels.
33 * An exception gets thrown if the output seems excessive compared to the
34 * input document. This is a strong indication of a zip bomb.
35 *
36 * @since Apache Tika 0.4
37 * @see <a href="https://issues.apache.org/jira/browse/TIKA-216">TIKA-216</a>
38 */
39 public class SecureContentHandler extends ContentHandlerDecorator {
40
41 /**
42 * The input stream that Tika is parsing.
43 */
44 private final TikaInputStream stream;
45
46 /**
47 * Number of output characters that Tika has produced so far.
48 */
49 private long characterCount = 0;
50
51 /**
52 * The current XML element depth.
53 */
54 private int currentDepth = 0;
55
56 /**
57 * Current number of nested &lt;div class="package-entr"&gt; elements.
58 */
59 private LinkedList<Integer> packageEntryDepths = new LinkedList<Integer>();
60
61 /**
62 * Output threshold.
63 */
64 private long threshold = 1000000;
65
66 /**
67 * Maximum compression ratio.
68 */
69 private long ratio = 100;
70
71 /**
72 * Maximum XML element nesting level.
73 */
74 private int maxDepth = 100;
75
76 /**
77 * Maximum package entry nesting level.
78 */
79 private int maxPackageEntryDepth = 10;
80
81 /**
82 * Decorates the given content handler with zip bomb prevention based
83 * on the count of bytes read from the given counting input stream.
84 * The resulting decorator can be passed to a Tika parser along with
85 * the given counting input stream.
86 *
87 * @param handler the content handler to be decorated
88 * @param stream the input stream to be parsed
89 */
90 public SecureContentHandler(
91 ContentHandler handler, TikaInputStream stream) {
92 super(handler);
93 this.stream = stream;
94 }
95
96 /**
97 * Returns the configured output threshold.
98 *
99 * @return output threshold
100 */
101 public long getOutputThreshold() {
102 return threshold;
103 }
104
105
106 /**
107 * Sets the threshold for output characters before the zip bomb prevention
108 * is activated. This avoids false positives in cases where an otherwise
109 * normal document for some reason starts with a highly compressible
110 * sequence of bytes.
111 *
112 * @param threshold new output threshold
113 */
114 public void setOutputThreshold(long threshold) {
115 this.threshold = threshold;
116 }
117
118
119 /**
120 * Returns the maximum compression ratio.
121 *
122 * @return maximum compression ratio
123 */
124 public long getMaximumCompressionRatio() {
125 return ratio;
126 }
127
128
129 /**
130 * Sets the ratio between output characters and input bytes. If this
131 * ratio is exceeded (after the output threshold has been reached) then
132 * an exception gets thrown.
133 *
134 * @param ratio new maximum compression ratio
135 */
136 public void setMaximumCompressionRatio(long ratio) {
137 this.ratio = ratio;
138 }
139
140 /**
141 * Returns the maximum XML element nesting level.
142 *
143 * @return maximum XML element nesting level
144 */
145 public int getMaximumDepth() {
146 return maxDepth;
147 }
148
149
150 /**
151 * Sets the maximum package entry nesting level. If this depth level is
152 * exceeded then an exception gets thrown.
153 *
154 * @param depth maximum package entry nesting level
155 */
156 public void setMaximumPackageEntryDepth(int depth) {
157 this.maxPackageEntryDepth = depth;
158 }
159
160 /**
161 * Returns the maximum package entry nesting level.
162 *
163 * @return maximum package entry nesting level
164 */
165 public int getMaximumPackageEntryDepth() {
166 return maxPackageEntryDepth;
167 }
168
169
170 /**
171 * Sets the maximum XML element nesting level. If this depth level is
172 * exceeded then an exception gets thrown.
173 *
174 * @param depth maximum XML element nesting level
175 */
176 public void setMaximumDepth(int depth) {
177 this.maxDepth = depth;
178 }
179
180 /**
181 * Converts the given {@link SAXException} to a corresponding
182 * {@link TikaException} if it's caused by this instance detecting
183 * a zip bomb.
184 *
185 * @param e SAX exception
186 * @throws TikaException zip bomb exception
187 */
188 public void throwIfCauseOf(SAXException e) throws TikaException {
189 if (e instanceof SecureSAXException
190 && ((SecureSAXException) e).isCausedBy(this)) {
191 throw new TikaException("Zip bomb detected!", e);
192 }
193 }
194
195 private long getByteCount() throws SAXException {
196 try {
197 if (stream.hasLength()) {
198 return stream.getLength();
199 } else {
200 return stream.getPosition();
201 }
202 } catch (IOException e) {
203 throw new SAXException("Unable to get stream length", e);
204 }
205 }
206
207 /**
208 * Records the given number of output characters (or more accurately
209 * UTF-16 code units). Throws an exception if the recorded number of
210 * characters highly exceeds the number of input bytes read.
211 *
212 * @param length number of new output characters produced
213 * @throws SAXException if a zip bomb is detected
214 */
215 private void advance(int length) throws SAXException {
216 characterCount += length;
217 long byteCount = getByteCount();
218 if (characterCount > threshold
219 && characterCount > byteCount * ratio) {
220 throw new SecureSAXException(
221 "Suspected zip bomb: "
222 + byteCount + " input bytes produced "
223 + characterCount + " output characters");
224 }
225 }
226
227 @Override
228 public void startElement(
229 String uri, String localName, String name, Attributes atts)
230 throws SAXException {
231 currentDepth++;
232 if (currentDepth >= maxDepth) {
233 throw new SecureSAXException(
234 "Suspected zip bomb: "
235 + currentDepth + " levels of XML element nesting");
236 }
237
238 if ("div".equals(name)
239 && "package-entry".equals(atts.getValue("class"))) {
240 packageEntryDepths.addLast(currentDepth);
241 if (packageEntryDepths.size() >= maxPackageEntryDepth) {
242 throw new SecureSAXException(
243 "Suspected zip bomb: "
244 + packageEntryDepths.size()
245 + " levels of package entry nesting");
246 }
247 }
248
249 super.startElement(uri, localName, name, atts);
250 }
251
252 @Override
253 public void endElement(
254 String uri, String localName, String name) throws SAXException {
255 super.endElement(uri, localName, name);
256
257 if (!packageEntryDepths.isEmpty()
258 && packageEntryDepths.getLast() == currentDepth) {
259 packageEntryDepths.removeLast();
260 }
261
262 currentDepth--;
263 }
264
265 @Override
266 public void characters(char[] ch, int start, int length)
267 throws SAXException {
268 advance(length);
269 super.characters(ch, start, length);
270 }
271
272 @Override
273 public void ignorableWhitespace(char[] ch, int start, int length)
274 throws SAXException {
275 advance(length);
276 super.ignorableWhitespace(ch, start, length);
277 }
278
279 /**
280 * Private exception class used to indicate a suspected zip bomb.
281 *
282 * @see SecureContentHandler#throwIfCauseOf(SAXException)
283 */
284 private class SecureSAXException extends SAXException {
285
286 /** Serial version UID.*/
287 private static final long serialVersionUID = 2285245380321771445L;
288
289 public SecureSAXException(String message) throws SAXException {
290 super(message);
291 }
292
293 public boolean isCausedBy(SecureContentHandler handler) {
294 return SecureContentHandler.this == handler;
295 }
296
297 }
298
299 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax;
17
18 import org.xml.sax.ContentHandler;
19 import org.xml.sax.SAXException;
20
21 /**
22 * A content handler decorator that tags potential exceptions so that the
23 * handler that caused the exception can easily be identified. This is
24 * done by using the {@link TaggedSAXException} class to wrap all thrown
25 * {@link SAXException}s. See below for an example of using this class.
26 * <pre>
27 * TaggedContentHandler handler = new TaggedContentHandler(...);
28 * try {
29 * // Processing that may throw an SAXException either from this handler
30 * // or from some other XML parsing activity
31 * processXML(handler);
32 * } catch (SAXException e) {
33 * if (handler.isCauseOf(e)) {
34 * // The exception was caused by this handler.
35 * // Use e.getCause() to get the original exception.
36 * } else {
37 * // The exception was caused by something else.
38 * }
39 * }
40 * </pre>
41 * <p>
42 * Alternatively, the {@link #throwIfCauseOf(Exception)} method can be
43 * used to let higher levels of code handle the exception caused by this
44 * stream while other processing errors are being taken care of at this
45 * lower level.
46 * <pre>
47 * TaggedContentHandler handler = new TaggedContentHandler(...);
48 * try {
49 * processXML(handler);
50 * } catch (SAXException e) {
51 * stream.throwIfCauseOf(e);
52 * // ... or process the exception that was caused by something else
53 * }
54 * </pre>
55 *
56 * @see TaggedSAXException
57 */
58 public class TaggedContentHandler extends ContentHandlerDecorator {
59
60 /**
61 * Creates a tagging decorator for the given content handler.
62 *
63 * @param proxy content handler to be decorated
64 */
65 public TaggedContentHandler(ContentHandler proxy) {
66 super(proxy);
67 }
68
69 /**
70 * Tests if the given exception was caused by this handler.
71 *
72 * @param exception an exception
73 * @return <code>true</code> if the exception was thrown by this handler,
74 * <code>false</code> otherwise
75 */
76 public boolean isCauseOf(SAXException exception) {
77 if (exception instanceof TaggedSAXException) {
78 TaggedSAXException tagged = (TaggedSAXException) exception;
79 return this == tagged.getTag();
80 } else {
81 return false;
82 }
83 }
84
85 /**
86 * Re-throws the original exception thrown by this handler. This method
87 * first checks whether the given exception is a {@link TaggedSAXException}
88 * wrapper created by this decorator, and then unwraps and throws the
89 * original wrapped exception. Returns normally if the exception was
90 * not thrown by this handler.
91 *
92 * @param exception an exception
93 * @throws SAXException original exception, if any, thrown by this handler
94 */
95 public void throwIfCauseOf(Exception exception) throws SAXException {
96 if (exception instanceof TaggedSAXException) {
97 TaggedSAXException tagged = (TaggedSAXException) exception;
98 if (this == tagged.getTag()) {
99 throw tagged.getCause();
100 }
101 }
102 }
103
104 /**
105 * Tags any {@link SAXException}s thrown, wrapping and re-throwing.
106 *
107 * @param e The SAXException thrown
108 * @throws SAXException if an XML error occurs
109 */
110 @Override
111 protected void handleException(SAXException e) throws SAXException {
112 throw new TaggedSAXException(e, this);
113 }
114
115 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax;
17
18 import org.xml.sax.SAXException;
19
/**
 * A {@link SAXException} wrapper that tags the wrapped exception with
 * a given object reference. Both the tag and the wrapped original exception
 * can be used to determine further processing when this exception is caught.
 */
public class TaggedSAXException extends SAXException {

    /**
     * The object reference used to tag the exception.
     */
    private final Object tag;

    /**
     * Creates a tagged wrapper for the given exception. The wrapper
     * carries the same message as the original and reports the original
     * as its cause.
     *
     * @param original the exception to be tagged
     * @param tag tag object
     */
    public TaggedSAXException(SAXException original, Object tag) {
        super(original.getMessage(), original);
        // SAXException has its own chaining mechanism, and whether the
        // superclass constructor also records the standard Throwable cause
        // depends on the runtime. Throwable.initCause() throws an
        // IllegalStateException when a cause has already been set, so only
        // call it when the cause is still missing.
        if (super.getCause() == null) {
            initCause(original);
        }
        this.tag = tag;
    }

    /**
     * Returns the object reference used as the tag of this exception.
     *
     * @return tag object
     */
    public Object getTag() {
        return tag;
    }

    /**
     * Returns the wrapped exception. The only difference to the overridden
     * {@link Throwable#getCause()} method is the narrower return type.
     *
     * @return wrapped exception
     */
    @Override
    public SAXException getCause() {
        return (SAXException) super.getCause();
    }

}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax;
17
18 import org.xml.sax.Attributes;
19 import org.xml.sax.ContentHandler;
20 import org.xml.sax.Locator;
21 import org.xml.sax.SAXException;
22 import org.xml.sax.helpers.DefaultHandler;
23
/**
 * Content handler proxy that forwards the received SAX events to zero or
 * more underlying content handlers. Each event is delivered to the
 * handlers in the order in which they were given to the constructor; if
 * a handler throws, the remaining handlers do not receive that event.
 */
public class TeeContentHandler extends DefaultHandler {

    /** The handlers that receive every event, in delivery order. */
    private final ContentHandler[] handlers;

    /**
     * Creates a proxy that forwards events to the given handlers.
     * The array is copied, so later changes made by the caller to the
     * array passed in do not affect this proxy. A <code>null</code> array
     * is treated like an empty one.
     *
     * @param handlers the handlers to forward events to
     */
    public TeeContentHandler(ContentHandler... handlers) {
        this.handlers = (handlers == null)
                ? new ContentHandler[0]
                : handlers.clone();
    }

    @Override
    public void startPrefixMapping(String prefix, String uri)
            throws SAXException {
        for (ContentHandler handler : handlers) {
            handler.startPrefixMapping(prefix, uri);
        }
    }

    @Override
    public void endPrefixMapping(String prefix) throws SAXException {
        for (ContentHandler handler : handlers) {
            handler.endPrefixMapping(prefix);
        }
    }

    @Override
    public void processingInstruction(String target, String data)
            throws SAXException {
        for (ContentHandler handler : handlers) {
            handler.processingInstruction(target, data);
        }
    }

    @Override
    public void setDocumentLocator(Locator locator) {
        for (ContentHandler handler : handlers) {
            handler.setDocumentLocator(locator);
        }
    }

    @Override
    public void startDocument() throws SAXException {
        for (ContentHandler handler : handlers) {
            handler.startDocument();
        }
    }

    @Override
    public void endDocument() throws SAXException {
        for (ContentHandler handler : handlers) {
            handler.endDocument();
        }
    }

    @Override
    public void startElement(
            String uri, String localName, String name, Attributes atts)
            throws SAXException {
        for (ContentHandler handler : handlers) {
            handler.startElement(uri, localName, name, atts);
        }
    }

    @Override
    public void endElement(String uri, String localName, String name)
            throws SAXException {
        for (ContentHandler handler : handlers) {
            handler.endElement(uri, localName, name);
        }
    }

    @Override
    public void characters(char[] ch, int start, int length)
            throws SAXException {
        for (ContentHandler handler : handlers) {
            handler.characters(ch, start, length);
        }
    }

    @Override
    public void ignorableWhitespace(char[] ch, int start, int length)
            throws SAXException {
        for (ContentHandler handler : handlers) {
            handler.ignorableWhitespace(ch, start, length);
        }
    }

    @Override
    public void skippedEntity(String name) throws SAXException {
        for (ContentHandler handler : handlers) {
            handler.skippedEntity(name);
        }
    }

}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax;
17
18 import org.xml.sax.Attributes;
19 import org.xml.sax.ContentHandler;
20 import org.xml.sax.SAXException;
21 import org.xml.sax.helpers.DefaultHandler;
22
/**
 * Content handler decorator that only passes the
 * {@link #characters(char[], int, int)} and
 * {@link #ignorableWhitespace(char[], int, int)} events
 * (plus the {@link #startDocument()}, {@link #endDocument()} and
 * {@link #setDocumentLocator(org.xml.sax.Locator)} events) to
 * the decorated content handler. Optionally a single space character is
 * sent to the decorated handler whenever an element starts.
 */
public class TextContentHandler extends DefaultHandler {

    /** The separator emitted at element starts when enabled. */
    private static final char[] SPACE = " ".toCharArray();

    /** Receiver of the text events. */
    private final ContentHandler target;

    /** Whether a space is emitted each time an element starts. */
    private final boolean spaceOnElementStart;

    /**
     * Creates a decorator that does not add spaces between elements.
     *
     * @param delegate the content handler to be decorated
     */
    public TextContentHandler(ContentHandler delegate) {
        this(delegate, false);
    }

    /**
     * Creates a decorator for the given content handler.
     *
     * @param delegate the content handler to be decorated
     * @param addSpaceBetweenElements whether to send a space character to
     *        the decorated handler each time an element starts
     */
    public TextContentHandler(ContentHandler delegate, boolean addSpaceBetweenElements) {
        this.target = delegate;
        this.spaceOnElementStart = addSpaceBetweenElements;
    }

    @Override
    public void setDocumentLocator(org.xml.sax.Locator locator) {
        target.setDocumentLocator(locator);
    }

    @Override
    public void startDocument() throws SAXException {
        target.startDocument();
    }

    @Override
    public void endDocument() throws SAXException {
        target.endDocument();
    }

    /**
     * The element itself is not forwarded; at most a separating space is
     * sent as a character event.
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes)
            throws SAXException {
        if (!spaceOnElementStart) {
            return;
        }
        target.characters(SPACE, 0, SPACE.length);
    }

    @Override
    public void characters(char[] ch, int start, int length)
            throws SAXException {
        target.characters(ch, start, length);
    }

    @Override
    public void ignorableWhitespace(char[] ch, int start, int length)
            throws SAXException {
        target.ignorableWhitespace(ch, start, length);
    }

    /**
     * Returns the string representation of the decorated handler.
     */
    @Override
    public String toString() {
        return target.toString();
    }

}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax;
17
18 import java.io.OutputStream;
19 import java.io.UnsupportedEncodingException;
20 import java.util.Arrays;
21 import java.util.HashSet;
22 import java.util.Set;
23
24 import org.xml.sax.SAXException;
25
26 /**
27 * SAX event handler that serializes the HTML document to a character stream.
28 * The incoming SAX events are expected to be well-formed (properly nested,
29 * etc.) and valid HTML.
30 *
31 * @since Apache Tika 0.10
32 */
33 public class ToHTMLContentHandler extends ToXMLContentHandler {
34
35 private static final Set<String> EMPTY_ELEMENTS =
36 new HashSet<String>(Arrays.asList(
37 "area", "base", "basefont", "br", "col", "frame", "hr",
38 "img", "input", "isindex", "link", "meta", "param"));
39
40 public ToHTMLContentHandler(OutputStream stream, String encoding)
41 throws UnsupportedEncodingException {
42 super(stream, encoding);
43 }
44
45 public ToHTMLContentHandler() {
46 super();
47 }
48
49 @Override
50 public void startDocument() throws SAXException {
51 }
52
53 @Override
54 public void endElement(String uri, String localName, String qName)
55 throws SAXException {
56 if (inStartElement) {
57 write('>');
58 inStartElement = false;
59
60 if (EMPTY_ELEMENTS.contains(localName)) {
61 namespaces.clear();
62 return;
63 }
64 }
65
66 super.endElement(uri, localName, qName);
67 }
68
69 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax;
17
18 import java.io.IOException;
19 import java.io.OutputStream;
20 import java.io.OutputStreamWriter;
21 import java.io.StringWriter;
22 import java.io.UnsupportedEncodingException;
23 import java.io.Writer;
24
25 import org.xml.sax.SAXException;
26 import org.xml.sax.helpers.DefaultHandler;
27
/**
 * SAX event handler that copies all received character content to a
 * character stream exactly as it arrives; nothing is escaped or
 * otherwise transformed.
 *
 * @since Apache Tika 0.10
 */
public class ToTextContentHandler extends DefaultHandler {

    /**
     * Destination of all character content.
     */
    private final Writer writer;

    /**
     * Creates a content handler that sends character events to
     * the given writer.
     *
     * @param writer writer
     */
    public ToTextContentHandler(Writer writer) {
        this.writer = writer;
    }

    /**
     * Creates a content handler that sends character events to the
     * given output stream, encoded with the platform default encoding.
     *
     * @param stream output stream
     */
    public ToTextContentHandler(OutputStream stream) {
        this(new OutputStreamWriter(stream));
    }

    /**
     * Creates a content handler that sends character events to the
     * given output stream, encoded with the given encoding.
     *
     * @param stream output stream
     * @param encoding output encoding
     * @throws UnsupportedEncodingException if the encoding is unsupported
     */
    public ToTextContentHandler(OutputStream stream, String encoding)
            throws UnsupportedEncodingException {
        this(new OutputStreamWriter(stream, encoding));
    }

    /**
     * Creates a content handler that collects character events in an
     * internal string buffer. The collected text is available through
     * {@link #toString()}.
     */
    public ToTextContentHandler() {
        this(new StringWriter());
    }

    /**
     * Writes the given characters to the character stream.
     *
     * @throws SAXException wrapping the {@link IOException} if the
     *                      characters could not be written
     */
    @Override
    public void characters(char[] ch, int start, int length)
            throws SAXException {
        try {
            writer.write(ch, start, length);
        } catch (IOException e) {
            // Report the text that could not be written along with the cause.
            String unwritten = new String(ch, start, length);
            throw new SAXException("Error writing: " + unwritten, e);
        }
    }

    /**
     * Writes the given ignorable whitespace to the character stream by
     * forwarding the call to {@link #characters(char[], int, int)}.
     */
    @Override
    public void ignorableWhitespace(char[] ch, int start, int length)
            throws SAXException {
        characters(ch, start, length);
    }

    /**
     * Flushes the character stream so that nothing is left behind
     * in internal buffers.
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-179">TIKA-179</a>
     * @throws SAXException if the stream can not be flushed
     */
    @Override
    public void endDocument() throws SAXException {
        try {
            writer.flush();
        } catch (IOException e) {
            throw new SAXException("Error flushing character output", e);
        }
    }

    /**
     * Returns the string form of the underlying writer. This yields the
     * collected characters only when the handler was created with the
     * no-argument constructor or was handed a {@link StringWriter}.
     */
    @Override
    public String toString() {
        return writer.toString();
    }

}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax;
17
18 import java.io.OutputStream;
19 import java.io.UnsupportedEncodingException;
20 import java.util.Collections;
21 import java.util.HashMap;
22 import java.util.Map;
23
24 import org.xml.sax.Attributes;
25 import org.xml.sax.SAXException;
26
27 /**
28 * SAX event handler that serializes the XML document to a character stream.
29 * The incoming SAX events are expected to be well-formed (properly nested,
30 * etc.) and to explicitly include namespace declaration attributes and
31 * corresponding namespace prefixes in element and attribute names.
32 *
33 * @since Apache Tika 0.10
34 */
35 public class ToXMLContentHandler extends ToTextContentHandler {
36
37 private static class ElementInfo {
38
39 private final ElementInfo parent;
40
41 private final Map<String, String> namespaces;
42
43 public ElementInfo(ElementInfo parent, Map<String, String> namespaces) {
44 this.parent = parent;
45 if (namespaces.isEmpty()) {
46 this.namespaces = Collections.emptyMap();
47 } else {
48 this.namespaces = new HashMap<String, String>(namespaces);
49 }
50 }
51
52 public String getPrefix(String uri) throws SAXException {
53 String prefix = namespaces.get(uri);
54 if (prefix != null) {
55 return prefix;
56 } else if (parent != null) {
57 return parent.getPrefix(uri);
58 } else if (uri == null || uri.length() == 0) {
59 return "";
60 } else {
61 throw new SAXException("Namespace " + uri + " not declared");
62 }
63 }
64
65 public String getQName(String uri, String localName)
66 throws SAXException {
67 String prefix = getPrefix(uri);
68 if (prefix.length() > 0) {
69 return prefix + ":" + localName;
70 } else {
71 return localName;
72 }
73 }
74
75 }
76
77 private final String encoding;
78
79 protected boolean inStartElement = false;
80
81 protected final Map<String, String> namespaces =
82 new HashMap<String, String>();
83
84 private ElementInfo currentElement;
85
86 /**
87 * Creates an XML serializer that writes to the given byte stream
88 * using the given character encoding.
89 *
90 * @param stream output stream
91 * @param encoding output encoding
92 * @throws UnsupportedEncodingException if the encoding is unsupported
93 */
94 public ToXMLContentHandler(OutputStream stream, String encoding)
95 throws UnsupportedEncodingException {
96 super(stream, encoding);
97 this.encoding = encoding;
98 }
99
100 public ToXMLContentHandler(String encoding) {
101 super();
102 this.encoding = encoding;
103 }
104
105 public ToXMLContentHandler() {
106 super();
107 this.encoding = null;
108 }
109
110 /**
111 * Writes the XML prefix.
112 */
113 @Override
114 public void startDocument() throws SAXException {
115 if (encoding != null) {
116 write("<?xml version=\"1.0\" encoding=\"");
117 write(encoding);
118 write("\"?>\n");
119 }
120
121 currentElement = null;
122 namespaces.clear();
123 }
124
125 @Override
126 public void startPrefixMapping(String prefix, String uri)
127 throws SAXException {
128 try {
129 if (currentElement != null
130 && prefix.equals(currentElement.getPrefix(uri))) {
131 return;
132 }
133 } catch (SAXException ignore) {
134 }
135 namespaces.put(uri, prefix);
136 }
137
138 @Override
139 public void startElement(
140 String uri, String localName, String qName, Attributes atts)
141 throws SAXException {
142 lazyCloseStartElement();
143
144 currentElement = new ElementInfo(currentElement, namespaces);
145
146 write('<');
147 write(currentElement.getQName(uri, localName));
148
149 for (int i = 0; i < atts.getLength(); i++) {
150 write(' ');
151 write(currentElement.getQName(atts.getURI(i), atts.getLocalName(i)));
152 write('=');
153 write('"');
154 char[] ch = atts.getValue(i).toCharArray();
155 writeEscaped(ch, 0, ch.length, true);
156 write('"');
157 }
158
159 for (Map.Entry<String, String> entry : namespaces.entrySet()) {
160 write(' ');
161 write("xmlns");
162 String prefix = entry.getValue();
163 if (prefix.length() > 0) {
164 write(':');
165 write(prefix);
166 }
167 write('=');
168 write('"');
169 char[] ch = entry.getKey().toCharArray();
170 writeEscaped(ch, 0, ch.length, true);
171 write('"');
172 }
173 namespaces.clear();
174
175 inStartElement = true;
176 }
177
178 @Override
179 public void endElement(String uri, String localName, String qName)
180 throws SAXException {
181 if (inStartElement) {
182 write(" />");
183 inStartElement = false;
184 } else {
185 write("</");
186 write(qName);
187 write('>');
188 }
189
190 namespaces.clear();
191
192 // Reset the position in the tree, to avoid endless stack overflow
193 // chains (see TIKA-1070)
194 currentElement = currentElement.parent;
195 }
196
197 @Override
198 public void characters(char[] ch, int start, int length)
199 throws SAXException {
200 lazyCloseStartElement();
201 writeEscaped(ch, start, start + length, false);
202 }
203
204 private void lazyCloseStartElement() throws SAXException {
205 if (inStartElement) {
206 write('>');
207 inStartElement = false;
208 }
209 }
210
211 /**
212 * Writes the given character as-is.
213 *
214 * @param ch character to be written
215 * @throws SAXException if the character could not be written
216 */
217 protected void write(char ch) throws SAXException {
218 super.characters(new char[] { ch }, 0, 1);
219 }
220
221 /**
222 * Writes the given string of character as-is.
223 *
224 * @param string string of character to be written
225 * @throws SAXException if the character string could not be written
226 */
227 protected void write(String string) throws SAXException {
228 super.characters(string.toCharArray(), 0, string.length());
229 }
230
231 /**
232 * Writes the given characters as-is followed by the given entity.
233 *
234 * @param ch character array
235 * @param from start position in the array
236 * @param to end position in the array
237 * @param entity entity code
238 * @return next position in the array,
239 * after the characters plus one entity
240 * @throws SAXException if the characters could not be written
241 */
242 private int writeCharsAndEntity(char[] ch, int from, int to, String entity)
243 throws SAXException {
244 super.characters(ch, from, to - from);
245 write('&');
246 write(entity);
247 write(';');
248 return to + 1;
249 }
250
251 /**
252 * Writes the given characters with XML meta characters escaped.
253 *
254 * @param ch character array
255 * @param from start position in the array
256 * @param to end position in the array
257 * @param attribute whether the characters should be escaped as
258 * an attribute value or normal character content
259 * @throws SAXException if the characters could not be written
260 */
261 private void writeEscaped(char[] ch, int from, int to, boolean attribute)
262 throws SAXException {
263 int pos = from;
264 while (pos < to) {
265 if (ch[pos] == '<') {
266 from = pos = writeCharsAndEntity(ch, from, pos, "lt");
267 } else if (ch[pos] == '>') {
268 from = pos = writeCharsAndEntity(ch, from, pos, "gt");
269 } else if (ch[pos] == '&') {
270 from = pos = writeCharsAndEntity(ch, from, pos, "amp");
271 } else if (attribute && ch[pos] == '"') {
272 from = pos = writeCharsAndEntity(ch, from, pos, "quot");
273 } else {
274 pos++;
275 }
276 }
277 super.characters(ch, from, to - from);
278 }
279
280 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax;
17
18 import java.io.OutputStream;
19 import java.io.OutputStreamWriter;
20 import java.io.Serializable;
21 import java.io.StringWriter;
22 import java.io.Writer;
23 import java.util.UUID;
24
25 import org.xml.sax.ContentHandler;
26 import org.xml.sax.SAXException;
27
28 /**
29 * SAX event handler that writes content up to an optional write
30 * limit out to a character stream or other decorated handler.
31 */
32 public class WriteOutContentHandler extends ContentHandlerDecorator {
33
34 /**
35 * The unique tag associated with exceptions from stream.
36 */
37 private final Serializable tag = UUID.randomUUID();
38
39 /**
40 * The maximum number of characters to write to the character stream.
41 * Set to -1 for no limit.
42 */
43 private final int writeLimit;
44
45 /**
46 * Number of characters written so far.
47 */
48 private int writeCount = 0;
49
50 /**
51 * Creates a content handler that writes content up to the given
52 * write limit to the given content handler.
53 *
54 * @since Apache Tika 0.10
55 * @param handler content handler to be decorated
56 * @param writeLimit write limit
57 */
58 public WriteOutContentHandler(ContentHandler handler, int writeLimit) {
59 super(handler);
60 this.writeLimit = writeLimit;
61 }
62
63 /**
64 * Creates a content handler that writes content up to the given
65 * write limit to the given character stream.
66 *
67 * @since Apache Tika 0.10
68 * @param writer character stream
69 * @param writeLimit write limit
70 */
71 public WriteOutContentHandler(Writer writer, int writeLimit) {
72 this(new ToTextContentHandler(writer), writeLimit);
73 }
74
75 /**
76 * Creates a content handler that writes character events to
77 * the given writer.
78 *
79 * @param writer writer
80 */
81 public WriteOutContentHandler(Writer writer) {
82 this(writer, -1);
83 }
84
85 /**
86 * Creates a content handler that writes character events to
87 * the given output stream using the default encoding.
88 *
89 * @param stream output stream
90 */
91 public WriteOutContentHandler(OutputStream stream) {
92 this(new OutputStreamWriter(stream));
93 }
94
95 /**
96 * Creates a content handler that writes character events
97 * to an internal string buffer. Use the {@link #toString()}
98 * method to access the collected character content.
99 * <p>
100 * The internal string buffer is bounded at the given number of characters.
101 * If this write limit is reached, then a {@link SAXException} is thrown.
102 * The {@link #isWriteLimitReached(Throwable)} method can be used to
103 * detect this case.
104 *
105 * @since Apache Tika 0.7
106 * @param writeLimit maximum number of characters to include in the string,
107 * or -1 to disable the write limit
108 */
109 public WriteOutContentHandler(int writeLimit) {
110 this(new StringWriter(), writeLimit);
111 }
112
113 /**
114 * Creates a content handler that writes character events
115 * to an internal string buffer. Use the {@link #toString()}
116 * method to access the collected character content.
117 * <p>
118 * The internal string buffer is bounded at 100k characters. If this
119 * write limit is reached, then a {@link SAXException} is thrown. The
120 * {@link #isWriteLimitReached(Throwable)} method can be used to detect
121 * this case.
122 */
123 public WriteOutContentHandler() {
124 this(100 * 1000);
125 }
126
127 /**
128 * Writes the given characters to the given character stream.
129 */
130 @Override
131 public void characters(char[] ch, int start, int length)
132 throws SAXException {
133 if (writeLimit == -1 || writeCount + length <= writeLimit) {
134 super.characters(ch, start, length);
135 writeCount += length;
136 } else {
137 super.characters(ch, start, writeLimit - writeCount);
138 writeCount = writeLimit;
139 throw new WriteLimitReachedException(
140 "Your document contained more than " + writeLimit
141 + " characters, and so your requested limit has been"
142 + " reached. To receive the full text of the document,"
143 + " increase your limit. (Text up to the limit is"
144 + " however available).", tag);
145 }
146 }
147
148 @Override
149 public void ignorableWhitespace(char[] ch, int start, int length)
150 throws SAXException {
151 if (writeLimit == -1 || writeCount + length <= writeLimit) {
152 super.ignorableWhitespace(ch, start, length);
153 writeCount += length;
154 } else {
155 super.ignorableWhitespace(ch, start, writeLimit - writeCount);
156 writeCount = writeLimit;
157 throw new WriteLimitReachedException(
158 "Your document contained more than " + writeLimit
159 + " characters, and so your requested limit has been"
160 + " reached. To receive the full text of the document,"
161 + " increase your limit. (Text up to the limit is"
162 + " however available).", tag);
163 }
164 }
165
166 /**
167 * Checks whether the given exception (or any of it's root causes) was
168 * thrown by this handler as a signal of reaching the write limit.
169 *
170 * @since Apache Tika 0.7
171 * @param t throwable
172 * @return <code>true</code> if the write limit was reached,
173 * <code>false</code> otherwise
174 */
175 public boolean isWriteLimitReached(Throwable t) {
176 if (t instanceof WriteLimitReachedException) {
177 return tag.equals(((WriteLimitReachedException) t).tag);
178 } else {
179 return t.getCause() != null && isWriteLimitReached(t.getCause());
180 }
181 }
182
183 /**
184 * The exception used as a signal when the write limit has been reached.
185 */
186 private static class WriteLimitReachedException extends SAXException {
187
188 /** Serial version UID */
189 private static final long serialVersionUID = -1850581945459429943L;
190
191 /** Serializable tag of the handler that caused this exception */
192 private final Serializable tag;
193
194 public WriteLimitReachedException(String message, Serializable tag) {
195 super(message);
196 this.tag = tag;
197 }
198
199 }
200
201 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax;
17
18 import java.util.Arrays;
19 import java.util.Collections;
20 import java.util.HashSet;
21 import java.util.Set;
22
23 import org.apache.tika.metadata.Metadata;
24 import org.apache.tika.metadata.TikaCoreProperties;
25 import org.xml.sax.Attributes;
26 import org.xml.sax.ContentHandler;
27 import org.xml.sax.SAXException;
28 import org.xml.sax.helpers.AttributesImpl;
29
30 /**
31 * Content handler decorator that simplifies the task of producing XHTML
32 * events for Tika content parsers.
33 */
34 public class XHTMLContentHandler extends SafeContentHandler {
35
36 /**
37 * The XHTML namespace URI
38 */
39 public static final String XHTML = "http://www.w3.org/1999/xhtml";
40
41 /**
42 * The newline character that gets inserted after block elements.
43 */
44 private static final char[] NL = new char[] { '\n' };
45
46 /**
47 * The tab character gets inserted before table cells and list items.
48 */
49 private static final char[] TAB = new char[] { '\t' };
50
51 /**
52 * The elements that are in the <head> section.
53 */
54 private static final Set<String> HEAD =
55 unmodifiableSet("title", "link", "base", "meta");
56
57 /**
58 * The elements that are automatically emitted by lazyStartHead, so
59 * skip them if they get sent to startElement/endElement by mistake.
60 */
61 private static final Set<String> AUTO =
62 unmodifiableSet("html", "head", "body", "frameset");
63
64 /**
65 * The elements that get prepended with the {@link #TAB} character.
66 */
67 private static final Set<String> INDENT =
68 unmodifiableSet("li", "dd", "dt", "td", "th", "frame");
69
70 /**
71 * The elements that get appended with the {@link #NL} character.
72 */
73 public static final Set<String> ENDLINE = unmodifiableSet(
74 "p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "ul", "ol", "dl",
75 "pre", "hr", "blockquote", "address", "fieldset", "table", "form",
76 "noscript", "li", "dt", "dd", "noframes", "br", "tr", "select", "option");
77
78 private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
79
80 private static Set<String> unmodifiableSet(String... elements) {
81 return Collections.unmodifiableSet(
82 new HashSet<String>(Arrays.asList(elements)));
83 }
84
85 /**
86 * Metadata associated with the document. Used to fill in the
87 * &lt;head/&gt; section.
88 */
89 private final Metadata metadata;
90
91 /**
92 * Flag to indicate whether the document has been started.
93 */
94 private boolean documentStarted = false;
95
96 /**
97 * Flags to indicate whether the document head element has been started/ended.
98 */
99 private boolean headStarted = false;
100 private boolean headEnded = false;
101 private boolean useFrameset = false;
102
103 public XHTMLContentHandler(ContentHandler handler, Metadata metadata) {
104 super(handler);
105 this.metadata = metadata;
106 }
107
108 /**
109 * Starts an XHTML document by setting up the namespace mappings
110 * when called for the first time.
111 * The standard XHTML prefix is generated lazily when the first
112 * element is started.
113 */
114 @Override
115 public void startDocument() throws SAXException {
116 if(!documentStarted){
117 documentStarted = true;
118 super.startDocument();
119 startPrefixMapping("", XHTML);
120 }
121 }
122
123 /**
124 * Generates the following XHTML prefix when called for the first time:
125 * <pre>
126 * &lt;html&gt;
127 * &lt;head&gt;
128 * &lt;title&gt;...&lt;/title&gt;
129 * &lt;/head&gt;
130 * &lt;body&gt;
131 * </pre>
132 */
133 private void lazyStartHead() throws SAXException {
134 if (!headStarted) {
135 headStarted = true;
136
137 // Call directly, so we don't go through our startElement(), which will
138 // ignore these elements.
139 super.startElement(XHTML, "html", "html", EMPTY_ATTRIBUTES);
140 newline();
141 super.startElement(XHTML, "head", "head", EMPTY_ATTRIBUTES);
142 newline();
143 }
144 }
145
146 /**
147 * Generates the following XHTML prefix when called for the first time:
148 * <pre>
149 * &lt;html&gt;
150 * &lt;head&gt;
151 * &lt;title&gt;...&lt;/title&gt;
152 * &lt;/head&gt;
153 * &lt;body&gt; (or &lt;frameset&gt;
154 * </pre>
155 */
156 private void lazyEndHead(boolean isFrameset) throws SAXException {
157 lazyStartHead();
158
159 if (!headEnded) {
160 headEnded = true;
161 useFrameset = isFrameset;
162
163 // TIKA-478: Emit all metadata values (other than title). We have to call
164 // startElement() and characters() directly to avoid recursive problems.
165 for (String name : metadata.names()) {
166 if (name.equals("title")) {
167 continue;
168 }
169
170 for (String value : metadata.getValues(name)) {
171 // Putting null values into attributes causes problems, but is
172 // allowed by Metadata, so guard against that.
173 if (value != null) {
174 AttributesImpl attributes = new AttributesImpl();
175 attributes.addAttribute("", "name", "name", "CDATA", name);
176 attributes.addAttribute("", "content", "content", "CDATA", value);
177 super.startElement(XHTML, "meta", "meta", attributes);
178 super.endElement(XHTML, "meta", "meta");
179 newline();
180 }
181 }
182 }
183
184 super.startElement(XHTML, "title", "title", EMPTY_ATTRIBUTES);
185 String title = metadata.get(TikaCoreProperties.TITLE);
186 if (title != null && title.length() > 0) {
187 char[] titleChars = title.toCharArray();
188 super.characters(titleChars, 0, titleChars.length);
189 } else {
190 // TIKA-725: Prefer <title></title> over <title/>
191 super.characters(new char[0], 0, 0);
192 }
193 super.endElement(XHTML, "title", "title");
194 newline();
195
196 super.endElement(XHTML, "head", "head");
197 newline();
198
199 if (useFrameset) {
200 super.startElement(XHTML, "frameset", "frameset", EMPTY_ATTRIBUTES);
201 } else {
202 super.startElement(XHTML, "body", "body", EMPTY_ATTRIBUTES);
203 }
204 }
205 }
206
207 /**
208 * Ends the XHTML document by writing the following footer and
209 * clearing the namespace mappings:
210 * <pre>
211 * &lt;/body&gt;
212 * &lt;/html&gt;
213 * </pre>
214 */
215 @Override
216 public void endDocument() throws SAXException {
217 lazyEndHead(useFrameset);
218
219 if (useFrameset) {
220 super.endElement(XHTML, "frameset", "frameset");
221 } else {
222 super.endElement(XHTML, "body", "body");
223 }
224
225 super.endElement(XHTML, "html", "html");
226
227 endPrefixMapping("");
228 super.endDocument();
229 }
230
231 /**
232 * Starts the given element. Table cells and list items are automatically
233 * indented by emitting a tab character as ignorable whitespace.
234 */
235 @Override
236 public void startElement(
237 String uri, String local, String name, Attributes attributes)
238 throws SAXException {
239
240 if (name.equals("frameset")) {
241 lazyEndHead(true);
242 } else if (!AUTO.contains(name)) {
243 if (HEAD.contains(name)) {
244 lazyStartHead();
245 } else {
246 lazyEndHead(false);
247 }
248
249 if (XHTML.equals(uri) && INDENT.contains(name)) {
250 ignorableWhitespace(TAB, 0, TAB.length);
251 }
252
253 super.startElement(uri, local, name, attributes);
254 }
255 }
256
257 /**
258 * Ends the given element. Block elements are automatically followed
259 * by a newline character.
260 */
261 @Override
262 public void endElement(String uri, String local, String name) throws SAXException {
263 if (!AUTO.contains(name)) {
264 super.endElement(uri, local, name);
265 if (XHTML.equals(uri) && ENDLINE.contains(name)) {
266 newline();
267 }
268 }
269 }
270
271 /**
272 * @see <a href="https://issues.apache.org/jira/browse/TIKA-210">TIKA-210</a>
273 */
274 @Override
275 public void characters(char[] ch, int start, int length) throws SAXException {
276 lazyEndHead(useFrameset);
277 super.characters(ch, start, length);
278 }
279
280 //------------------------------------------< public convenience methods >
281
282 public void startElement(String name) throws SAXException {
283 startElement(XHTML, name, name, EMPTY_ATTRIBUTES);
284 }
285
286 public void startElement(String name, String attribute, String value)
287 throws SAXException {
288 AttributesImpl attributes = new AttributesImpl();
289 attributes.addAttribute("", attribute, attribute, "CDATA", value);
290 startElement(XHTML, name, name, attributes);
291 }
292
293 public void startElement(String name, AttributesImpl attributes)
294 throws SAXException {
295 startElement(XHTML, name, name, attributes);
296 }
297
298 public void endElement(String name) throws SAXException {
299 endElement(XHTML, name, name);
300 }
301
302 public void characters(String characters) throws SAXException {
303 if (characters != null && characters.length() > 0) {
304 characters(characters.toCharArray(), 0, characters.length());
305 }
306 }
307
308 public void newline() throws SAXException {
309 ignorableWhitespace(NL, 0, NL.length);
310 }
311
312 /**
313 * Emits an XHTML element with the given text content. If the given
314 * text value is null or empty, then the element is not written.
315 *
316 * @param name XHTML element name
317 * @param value element value, possibly <code>null</code>
318 * @throws SAXException if the content element could not be written
319 */
320 public void element(String name, String value) throws SAXException {
321 if (value != null && value.length() > 0) {
322 startElement(name);
323 characters(value);
324 endElement(name);
325 }
326 }
327
328 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax;
17
18 import org.apache.tika.metadata.Metadata;
19 import org.apache.tika.metadata.Property;
20 import org.xml.sax.Attributes;
21 import org.xml.sax.ContentHandler;
22 import org.xml.sax.SAXException;
23 import org.xml.sax.helpers.AttributesImpl;
24
25 /**
26 * Content handler decorator that simplifies the task of producing XMP output.
27 *
28 * @since Apache Tika 1.0
29 */
30 public class XMPContentHandler extends SafeContentHandler {
31
32 /**
33 * The RDF namespace URI
34 */
35 public static final String RDF =
36 "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
37
38 /**
39 * The XMP namespace URI
40 */
41 public static final String XMP =
42 "http://ns.adobe.com/xap/1.0/";
43
44 private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
45
46 public XMPContentHandler(ContentHandler handler) {
47 super(handler);
48 }
49
50 /**
51 * Starts an XMP document by setting up the namespace mappings and
52 * writing out the following header:
53 * <pre>
54 * &lt;rdf:RDF&gt;
55 * </pre>
56 */
57 @Override
58 public void startDocument() throws SAXException {
59 super.startDocument();
60
61 startPrefixMapping("rdf", RDF);
62 startPrefixMapping("xmp", XMP);
63
64 startElement(RDF, "RDF", "rdf:RDF", EMPTY_ATTRIBUTES);
65 }
66
67 /**
68 * Ends the XMP document by writing the following footer and
69 * clearing the namespace mappings:
70 * <pre>
71 * &lt;/rdf:RDF&gt;
72 * </pre>
73 */
74 @Override
75 public void endDocument() throws SAXException {
76 endElement(RDF, "RDF", "rdf:RDF");
77
78 endPrefixMapping("xmp");
79 endPrefixMapping("rdf");
80
81 super.endDocument();
82 }
83
84 //------------------------------------------< public convenience methods >
85
86 private String prefix = null;
87
88 private String uri = null;
89
90 public void startDescription(String about, String prefix, String uri)
91 throws SAXException {
92 this.prefix = prefix;
93 this.uri = uri;
94
95 startPrefixMapping(prefix, uri);
96 AttributesImpl attributes = new AttributesImpl();
97 attributes.addAttribute(RDF, "about", "rdf:about", "CDATA", about);
98 startElement(RDF, "Description", "rdf:Description", attributes);
99 }
100
101 public void endDescription() throws SAXException {
102 endElement(RDF, "Description", "rdf:Description");
103 endPrefixMapping(prefix);
104
105 this.uri = null;
106 this.prefix = null;
107 }
108
109 public void property(String name, String value) throws SAXException {
110 String qname = prefix + ":" + name;
111 startElement(uri, name, qname, EMPTY_ATTRIBUTES);
112 characters(value.toCharArray(), 0, value.length());
113 endElement(uri, name, qname);
114 }
115
116 public void metadata(Metadata metadata) throws SAXException {
117 description(metadata, "xmp", XMP);
118 description(metadata, "dc", "http://purl.org/dc/elements/1.1/");
119 description(metadata, "xmpTPg", "http://ns.adobe.com/xap/1.0/t/pg/");
120 description(metadata, "xmpRigths", "http://ns.adobe.com/xap/1.0/rights/");
121 description(metadata, "xmpMM", "http://ns.adobe.com/xap/1.0/mm/");
122 description(metadata, "xmpidq", "http://ns.adobe.com/xmp/identifier/qual/1.0/");
123 description(metadata, "xmpBJ", "http://ns.adobe.com/xap/1.0/bj/");
124 description(metadata, "xmpDM", "http://ns.adobe.com/xmp/1.0/DynamicMedia/");
125 description(metadata, "pdf", "http://ns.adobe.com/pdf/1.3/");
126 description(metadata, "photoshop", "s http://ns.adobe.com/photoshop/1.0/");
127 description(metadata, "crs", "http://ns.adobe.com/camera-raw-settings/1.0/");
128 description(metadata, "tiff", "http://ns.adobe.com/tiff/1.0/");
129 description(metadata, "exif", "http://ns.adobe.com/exif/1.0/");
130 description(metadata, "aux", "http://ns.adobe.com/exif/1.0/aux/");
131 }
132
133 private void description(Metadata metadata, String prefix, String uri)
134 throws SAXException {
135 int count = 0;
136 for (Property property : Property.getProperties(prefix)) {
137 String value = metadata.get(property);
138 if (value != null) {
139 if (count++ == 0) {
140 startDescription("", prefix, uri);
141 }
142 property(property.getName().substring(prefix.length() + 1), value);
143 }
144 }
145
146 if (count > 0) {
147 endDescription();
148 }
149 }
150
151 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 /**
18 * SAX utilities.
19 */
20 @aQute.bnd.annotation.Version("1.0.0")
21 package org.apache.tika.sax;
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax.xpath;
17
18 /**
19 * Final evaluation state of a <code>.../@*</code> XPath expression.
20 * Matches all attributes of the current element.
21 */
22 public class AttributeMatcher extends Matcher {
23
24 public static final Matcher INSTANCE = new AttributeMatcher();
25
26 public boolean matchesAttribute(String namespace, String name) {
27 return true;
28 }
29
30 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax.xpath;
17
18 /**
19 * Intermediate evaluation state of a <code>.../*...</code> XPath expression.
20 * Matches nothing, but specifies the evaluation state for all child elements.
21 */
22 public class ChildMatcher extends Matcher {
23
24 private final Matcher then;
25
26 public ChildMatcher(Matcher then) {
27 this.then = then;
28 }
29
30 public Matcher descend(String namespace, String name) {
31 return then;
32 }
33
34 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax.xpath;
17
18 /**
19 * Composite XPath evaluation state. Used when XPath evaluation results
20 * in two or more branches of independent evaluation states.
21 */
22 public class CompositeMatcher extends Matcher {
23
24 private final Matcher a;
25
26 private final Matcher b;
27
28 public CompositeMatcher(Matcher a, Matcher b) {
29 this.a = a;
30 this.b = b;
31 }
32
33 public Matcher descend(String namespace, String name) {
34 Matcher a = this.a.descend(namespace, name);
35 Matcher b = this.b.descend(namespace, name);
36 if (a == FAIL) {
37 return b;
38 } else if (b == FAIL) {
39 return a;
40 } else if (this.a == a && this.b == b) {
41 return this;
42 } else {
43 return new CompositeMatcher(a, b);
44 }
45 }
46
47 public boolean matchesElement() {
48 return a.matchesElement() || b.matchesElement();
49 }
50
51 public boolean matchesAttribute(String namespace, String name) {
52 return a.matchesAttribute(namespace, name)
53 || b.matchesAttribute(namespace, name);
54 }
55
56 public boolean matchesText() {
57 return a.matchesText() || b.matchesText();
58 }
59
60 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax.xpath;
17
18 /**
19 * Final evaluation state of an XPath expression that targets an element.
20 * Matches the current element.
21 */
22 public class ElementMatcher extends Matcher {
23
24 public static final Matcher INSTANCE = new ElementMatcher();
25
26 public boolean matchesElement() {
27 return true;
28 }
29
30 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax.xpath;
17
/**
 * Base class of all XPath evaluation states. An instance represents one
 * particular state reached while evaluating an XPath expression against
 * a stream of elements. This base implementation matches nothing and
 * always descends to {@link #FAIL}; subclasses override the individual
 * methods to accept elements, attributes or text.
 */
public class Matcher {

    /**
     * Sentinel for a failed evaluation branch. It matches nothing, and
     * is compared by identity wherever a branch needs to be recognised
     * as failed.
     */
    public static final Matcher FAIL = new Matcher();

    /**
     * Computes the evaluation state that applies inside the named
     * child element.
     *
     * @param namespace element namespace or <code>null</code>
     * @param name element name
     * @return next XPath evaluation state; {@link #FAIL} in this base class
     */
    public Matcher descend(String namespace, String name) {
        return FAIL;
    }

    /**
     * Tells whether the element associated with this evaluation state
     * is matched by the XPath expression.
     *
     * @return <code>true</code> if the element is matched;
     *         always <code>false</code> in this base class
     */
    public boolean matchesElement() {
        return false;
    }

    /**
     * Tells whether the named attribute of the element associated with
     * this evaluation state is matched by the XPath expression.
     *
     * @param namespace attribute namespace or <code>null</code>
     * @param name attribute name
     * @return <code>true</code> if the attribute is matched;
     *         always <code>false</code> in this base class
     */
    public boolean matchesAttribute(String namespace, String name) {
        return false;
    }

    /**
     * Tells whether the text nodes whose parent is the element
     * associated with this evaluation state are matched by the XPath
     * expression.
     *
     * @return <code>true</code> if text children are matched;
     *         always <code>false</code> in this base class
     */
    public boolean matchesText() {
        return false;
    }

}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax.xpath;
17
18 import java.util.LinkedList;
19
20 import org.apache.tika.sax.ContentHandlerDecorator;
21 import org.xml.sax.Attributes;
22 import org.xml.sax.ContentHandler;
23 import org.xml.sax.SAXException;
24 import org.xml.sax.helpers.AttributesImpl;
25
26 /**
27 * Content handler decorator that only passes the elements, attributes,
28 * and text nodes that match the given XPath expression.
29 */
30 public class MatchingContentHandler extends ContentHandlerDecorator {
31
32 private final LinkedList<Matcher> matchers = new LinkedList<Matcher>();
33
34 private Matcher matcher;
35
36 public MatchingContentHandler(ContentHandler delegate, Matcher matcher) {
37 super(delegate);
38 this.matcher = matcher;
39 }
40
41 public void startElement(
42 String uri, String localName, String name, Attributes attributes)
43 throws SAXException {
44 matchers.addFirst(matcher);
45 matcher = matcher.descend(uri, localName);
46
47 AttributesImpl matches = new AttributesImpl();
48 for (int i = 0; i < attributes.getLength(); i++) {
49 String attributeURI = attributes.getURI(i);
50 String attributeName = attributes.getLocalName(i);
51 if (matcher.matchesAttribute(attributeURI, attributeName)) {
52 matches.addAttribute(
53 attributeURI, attributeName, attributes.getQName(i),
54 attributes.getType(i), attributes.getValue(i));
55 }
56 }
57
58 if (matcher.matchesElement() || matches.getLength() > 0) {
59 super.startElement(uri, localName, name, matches);
60 if (!matcher.matchesElement()) {
61 // Force the matcher to match the current element, so the
62 // endElement method knows to emit the correct event
63 matcher =
64 new CompositeMatcher(matcher, ElementMatcher.INSTANCE);
65 }
66 }
67 }
68
69 public void endElement(String uri, String localName, String name)
70 throws SAXException {
71 if (matcher.matchesElement()) {
72 super.endElement(uri, localName, name);
73 }
74 // Sometimes tagsoup returns double end tags, so the stack might
75 // be empty! TODO: Remove this when the tagsoup problem is fixed.
76 if (!matchers.isEmpty()) {
77 matcher = matchers.removeFirst();
78 }
79 }
80
81 public void characters(char[] ch, int start, int length)
82 throws SAXException {
83 if (matcher.matchesText()) {
84 super.characters(ch, start, length);
85 }
86 }
87
88 public void ignorableWhitespace(char[] ch, int start, int length)
89 throws SAXException {
90 if (matcher.matchesText()) {
91 super.ignorableWhitespace(ch, start, length);
92 }
93 }
94
95 public void processingInstruction(String target, String data) {
96 // TODO: Support for matching processing instructions
97 }
98
99 public void skippedEntity(String name) throws SAXException {
100 // TODO: Can skipped entities refer to more than text?
101 if (matcher.matchesText()) {
102 super.skippedEntity(name);
103 }
104 }
105
106 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax.xpath;
17
18 /**
19 * Final evaluation state of a <code>.../@name</code> XPath expression.
20 * Matches the named attributes of the current element.
21 */
22 public class NamedAttributeMatcher extends Matcher {
23
24 private final String namespace;
25
26 private final String name;
27
28 public NamedAttributeMatcher(String namespace, String name) {
29 this.namespace = namespace;
30 this.name = name;
31 }
32
33 public boolean matchesAttribute(String namespace, String name) {
34 return equals(namespace, this.namespace) && name.equals(this.name);
35 }
36
37 private static boolean equals(String a, String b) {
38 return (a == null) ? (b == null) : a.equals(b);
39 }
40
41 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax.xpath;
17
18 /**
19 * Intermediate evaluation state of a <code>.../name...</code> XPath
20 * expression. Matches nothing, but specifies the evaluation state
21 * for the child elements with the given name.
22 */
23 public class NamedElementMatcher extends ChildMatcher {
24
25 private final String namespace;
26
27 private final String name;
28
29 protected NamedElementMatcher(String namespace, String name, Matcher then) {
30 super(then);
31 this.namespace = namespace;
32 this.name = name;
33 }
34
35 public Matcher descend(String namespace, String name) {
36 if (equals(namespace, this.namespace) && name.equals(this.name)) {
37 return super.descend(namespace, name);
38 } else {
39 return FAIL;
40 }
41 }
42
43 private static boolean equals(String a, String b) {
44 return (a == null) ? (b == null) : a.equals(b);
45 }
46
47 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax.xpath;
17
18 /**
19 * Final evaluation state of a <code>.../node()</code> XPath expression.
20 * Matches all elements, attributes, and text.
21 */
22 public class NodeMatcher extends Matcher {
23
24 public static final Matcher INSTANCE = new NodeMatcher();
25
26 @Override
27 public boolean matchesElement() {
28 return true;
29 }
30
31 @Override
32 public boolean matchesAttribute(String namespace, String name) {
33 return true;
34 }
35
36 @Override
37 public boolean matchesText() {
38 return true;
39 }
40
41 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax.xpath;
17
18 /**
19 * Evaluation state of a <code>...//...</code> XPath expression. Applies the
20 * contained evaluation state to the current element and all its descendants.
21 */
22 public class SubtreeMatcher extends Matcher {
23
24 private final Matcher then;
25
26 public SubtreeMatcher(Matcher then) {
27 this.then = then;
28 }
29
30 @Override
31 public Matcher descend(String namespace, String name) {
32 Matcher next = then.descend(namespace, name);
33 if (next == FAIL || next == then) {
34 return this;
35 } else {
36 return new CompositeMatcher(next, this);
37 }
38 }
39
40 @Override
41 public boolean matchesElement() {
42 return then.matchesElement();
43 }
44
45 @Override
46 public boolean matchesAttribute(String namespace, String name) {
47 return then.matchesAttribute(namespace, name);
48 }
49
50 @Override
51 public boolean matchesText() {
52 return then.matchesText();
53 }
54
55 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax.xpath;
17
18 /**
19 * Final evaluation state of a <code>.../text()</code> XPath expression.
20 * Matches all text children of the current element.
21 */
22 public class TextMatcher extends Matcher {
23
24 public static final Matcher INSTANCE = new TextMatcher();
25
26 public boolean matchesText() {
27 return true;
28 }
29
30 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax.xpath;
17
18 import java.util.HashMap;
19 import java.util.Map;
20
21 /**
22 * Parser for a very simple XPath subset. Only the following XPath constructs
23 * (with namespaces) are supported:
24 * <ul>
25 * <li><code>.../node()</code></li>
26 * <li><code>.../text()</code></li>
27 * <li><code>.../@*</code></li>
28 * <li><code>.../@name</code></li>
29 * <li><code>.../*...</code></li>
30 * <li><code>.../name...</code></li>
31 * <li><code>...//*...</code></li>
32 * <li><code>...//name...</code></li>
33 * </ul>
34 * <p>
35 * In addition the non-abbreviated <code>.../descendant::node()</code>
36 * construct can be used for cases where the descendant-or-self axis
37 * used by the <code>...//node()</code> construct is not appropriate.
38 */
39 public class XPathParser {
40
41 private final Map<String, String> prefixes = new HashMap<String, String>();
42
43 public XPathParser() {
44 }
45
46 public XPathParser(String prefix, String namespace) {
47 addPrefix(prefix, namespace);
48 }
49
50 public void addPrefix(String prefix, String namespace) {
51 prefixes.put(prefix, namespace);
52 }
53
54 /**
55 * Parses the given simple XPath expression to an evaluation state
56 * initialized at the document node. Invalid expressions are not flagged
57 * as errors, they just result in a failing evaluation state.
58 *
59 * @param xpath simple XPath expression
60 * @return XPath evaluation state
61 */
62 public Matcher parse(String xpath) {
63 if (xpath.equals("/text()")) {
64 return TextMatcher.INSTANCE;
65 } else if (xpath.equals("/node()")) {
66 return NodeMatcher.INSTANCE;
67 } else if (xpath.equals("/descendant::node()")
68 || xpath.equals("/descendant:node()")) { // for compatibility
69 return new CompositeMatcher(
70 TextMatcher.INSTANCE,
71 new ChildMatcher(new SubtreeMatcher(NodeMatcher.INSTANCE)));
72 } else if (xpath.equals("/@*")) {
73 return AttributeMatcher.INSTANCE;
74 } else if (xpath.length() == 0) {
75 return ElementMatcher.INSTANCE;
76 } else if (xpath.startsWith("/@")) {
77 String name = xpath.substring(2);
78 String prefix = null;
79 int colon = name.indexOf(':');
80 if (colon != -1) {
81 prefix = name.substring(0, colon);
82 name = name.substring(colon + 1);
83 }
84 if (prefixes.containsKey(prefix)) {
85 return new NamedAttributeMatcher(prefixes.get(prefix), name);
86 } else {
87 return Matcher.FAIL;
88 }
89 } else if (xpath.startsWith("/*")) {
90 return new ChildMatcher(parse(xpath.substring(2)));
91 } else if (xpath.startsWith("///")) {
92 return Matcher.FAIL;
93 } else if (xpath.startsWith("//")) {
94 return new SubtreeMatcher(parse(xpath.substring(1)));
95 } else if (xpath.startsWith("/")) {
96 int slash = xpath.indexOf('/', 1);
97 if (slash == -1) {
98 slash = xpath.length();
99 }
100 String name = xpath.substring(1, slash);
101 String prefix = null;
102 int colon = name.indexOf(':');
103 if (colon != -1) {
104 prefix = name.substring(0, colon);
105 name = name.substring(colon + 1);
106 }
107 if (prefixes.containsKey(prefix)) {
108 return new NamedElementMatcher(
109 prefixes.get(prefix), name,
110 parse(xpath.substring(slash)));
111 } else {
112 return Matcher.FAIL;
113 }
114 } else {
115 return Matcher.FAIL;
116 }
117 }
118
119 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 /**
18 * XPath utilities
19 */
20 @aQute.bnd.annotation.Version("1.0.0")
21 package org.apache.tika.sax.xpath;
0 /**
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.utils;
17
18 import static java.util.Locale.ENGLISH;
19
20 import java.lang.reflect.Method;
21 import java.nio.charset.Charset;
22 import java.nio.charset.IllegalCharsetNameException;
23 import java.util.HashMap;
24 import java.util.Locale;
25 import java.util.Map;
26 import java.util.regex.Matcher;
27 import java.util.regex.Pattern;
28
/**
 * Utility methods for resolving character set names. Names are cleaned
 * of surrounding cruft, a few common misspellings are corrected, and
 * ICU4J's <code>CharsetICU</code> is consulted through reflection when
 * that class can be loaded.
 */
public class CharsetUtils {

    /** Extracts the bare name from input such as <code>"UTF-8";</code>. */
    private static final Pattern CHARSET_NAME_PATTERN =
            Pattern.compile("[ \\\"]*([^ >,;\\\"]+).*");

    private static final Pattern ISO_NAME_PATTERN =
            Pattern.compile(".*8859-(\\d+)");

    private static final Pattern CP_NAME_PATTERN =
            Pattern.compile("cp-(\\d+)");

    private static final Pattern WIN_NAME_PATTERN =
            Pattern.compile("win-?(\\d+)");

    /** Lower-cased names and aliases of frequently used charsets. */
    private static final Map<String, Charset> COMMON_CHARSETS =
            new HashMap<String, Charset>();

    /** <code>CharsetICU.forNameICU(String)</code>, or null without ICU4J. */
    private static Method getCharsetICU = null;

    /** <code>CharsetICU.isSupported(String)</code>, or null if unavailable. */
    private static Method isSupportedICU = null;

    /**
     * Registers the named charsets and all of their aliases in
     * {@link #COMMON_CHARSETS}, keyed by lower-cased name. Names that
     * cannot be resolved in this JVM are skipped.
     */
    private static void initCommonCharsets(String... names) {
        for (String name : names) {
            try {
                Charset charset = Charset.forName(name);
                COMMON_CHARSETS.put(name.toLowerCase(Locale.ENGLISH), charset);
                for (String alias : charset.aliases()) {
                    COMMON_CHARSETS.put(
                            alias.toLowerCase(Locale.ENGLISH), charset);
                }
            } catch (Exception e) {
                // Best effort: this charset is not available here, so it
                // simply does not get a shortcut entry
            }
        }
    }

    static {
        initCommonCharsets(
                "Big5",
                "EUC-JP", "EUC-KR", "x-EUC-TW",
                "GB18030",
                "IBM855", "IBM866",
                "ISO-2022-CN", "ISO-2022-JP", "ISO-2022-KR",
                "ISO-8859-1", "ISO-8859-2", "ISO-8859-3", "ISO-8859-4",
                "ISO-8859-5", "ISO-8859-6", "ISO-8859-7", "ISO-8859-8",
                "ISO-8859-9", "ISO-8859-11", "ISO-8859-13", "ISO-8859-15",
                "KOI8-R",
                "x-MacCyrillic",
                "SHIFT_JIS",
                "UTF-8", "UTF-16BE", "UTF-16LE",
                "windows-1251", "windows-1252", "windows-1253", "windows-1255");

        // Common aliases/typos not included in standard charset definitions
        COMMON_CHARSETS.put("iso-8851-1", COMMON_CHARSETS.get("iso-8859-1"));
        COMMON_CHARSETS.put("windows", COMMON_CHARSETS.get("windows-1252"));
        COMMON_CHARSETS.put("koi8r", COMMON_CHARSETS.get("koi8-r"));

        // See if we can load the icu4j CharsetICU class
        Class<?> icuCharset = null;
        try {
            icuCharset = CharsetUtils.class.getClassLoader().loadClass(
                    "com.ibm.icu.charset.CharsetICU");
        } catch (ClassNotFoundException e) {
            // ICU4J is optional; without it only the JDK charsets are used
        }
        if (icuCharset != null) {
            try {
                getCharsetICU = icuCharset.getMethod("forNameICU", String.class);
            } catch (Throwable t) {
                throw new RuntimeException(t);
            }
            try {
                isSupportedICU = icuCharset.getMethod("isSupported", String.class);
            } catch (Throwable t) {
                // Optional method; isSupported() then relies on the JDK alone
            }
            // TODO: would be nice to somehow log that we
            // successfully found ICU
        }
    }

    /**
     * Safely return whether {@code charsetName} is supported, without
     * throwing exceptions.
     *
     * @param charsetName Name of charset (can be null)
     * @return true if the character set is supported
     */
    public static boolean isSupported(String charsetName) {
        try {
            if (isSupportedICU != null
                    && ((Boolean) isSupportedICU.invoke(null, charsetName)).booleanValue()) {
                return true;
            }
            return Charset.isSupported(charsetName);
        } catch (IllegalCharsetNameException e) {
            return false;
        } catch (IllegalArgumentException e) {
            // null, for example
            return false;
        } catch (Exception e) {
            // Unexpected exception (e.g. from the reflective ICU call):
            // treat the charset as unsupported rather than propagate
            return false;
        }
    }

    /**
     * Handle various common charset name errors, and return something
     * that will be considered valid (and is normalized).
     *
     * @param charsetName name of charset to process
     * @return potentially remapped/cleaned up version of charset name,
     *         or <code>null</code> if the name cannot be resolved
     */
    public static String clean(String charsetName) {
        try {
            return forName(charsetName).name();
        } catch (Exception e) {
            // Includes null, malformed and unknown names
            return null;
        }
    }

    /**
     * Returns Charset impl, if one exists. This method
     * optionally uses ICU4J's CharsetICU.forNameICU,
     * if it is found on the classpath, else only uses
     * JDK's builtin Charset.forName.
     *
     * @param name charset name, possibly surrounded by quotes or
     *             followed by separators
     * @return the resolved charset
     * @throws IllegalArgumentException if the name is <code>null</code>
     * @throws IllegalCharsetNameException if no name can be extracted,
     *         or the name is "none" or "no"
     */
    public static Charset forName(String name) {
        if (name == null) {
            throw new IllegalArgumentException("Charset name must not be null");
        }

        // Get rid of cruft around names, like <>, trailing commas, etc.
        Matcher m = CHARSET_NAME_PATTERN.matcher(name);
        if (!m.matches()) {
            throw new IllegalCharsetNameException(name);
        }
        name = m.group(1);

        String lower = name.toLowerCase(Locale.ENGLISH);
        Charset charset = COMMON_CHARSETS.get(lower);
        if (charset != null) {
            return charset;
        } else if ("none".equals(lower) || "no".equals(lower)) {
            throw new IllegalCharsetNameException(name);
        } else {
            Matcher iso = ISO_NAME_PATTERN.matcher(lower);
            Matcher cp = CP_NAME_PATTERN.matcher(lower);
            Matcher win = WIN_NAME_PATTERN.matcher(lower);
            if (iso.matches()) {
                // Handle "iso 8859-x" error
                name = "iso-8859-" + iso.group(1);
                charset = COMMON_CHARSETS.get(name);
            } else if (cp.matches()) {
                // Handle "cp-xxx" error
                name = "cp" + cp.group(1);
                charset = COMMON_CHARSETS.get(name);
            } else if (win.matches()) {
                // Handle "winxxx" and "win-xxx" errors
                name = "windows-" + win.group(1);
                charset = COMMON_CHARSETS.get(name);
            }
            if (charset != null) {
                return charset;
            }
        }

        // Not a common charset: try ICU4J first when it is available.
        // The corrected spelling from above, if any, is the one looked up.
        if (getCharsetICU != null) {
            try {
                Charset cs = (Charset) getCharsetICU.invoke(null, name);
                if (cs != null) {
                    return cs;
                }
            } catch (Exception e) {
                // ICU could not resolve the name; fall back to the JDK
            }
        }

        return Charset.forName(name);
    }
}
0 /**
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.utils;
17
18 import java.util.Calendar;
19 import java.util.Date;
20 import java.util.GregorianCalendar;
21 import java.util.Locale;
22 import java.util.TimeZone;
23
/**
 * Date related utility methods and constants
 */
public class DateUtils {
    /**
     * The UTC time zone. Not sure if {@link TimeZone#getTimeZone(String)}
     * understands "UTC" in all environments, but it'll fall back to GMT
     * in such cases, which is in practice equivalent to UTC.
     */
    public static final TimeZone UTC = TimeZone.getTimeZone("UTC");

    /**
     * Custom time zone used to interpret date values without a time
     * component in a way that most likely falls within the same day
     * regardless of in which time zone it is later interpreted. For
     * example, the "2012-02-17" date would map to "2012-02-17T12:00:00Z"
     * (instead of the default "2012-02-17T00:00:00Z"), which would still
     * map to "2012-02-17" if interpreted in say Pacific time (while the
     * default mapping would result in "2012-02-16" for UTC-8).
     */
    public static final TimeZone MIDDAY = TimeZone.getTimeZone("GMT-12:00");

    /**
     * Returns a ISO 8601 representation of the given date, expressed in
     * UTC with second precision. This method is thread safe and
     * non-blocking. The output always uses ASCII digits, regardless of
     * the JVM's default locale.
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-495">TIKA-495</a>
     * @param date given date
     * @return ISO 8601 date string
     * @throws NullPointerException if the given date is null
     */
    public static String formatDate(Date date) {
        // A fresh calendar per call keeps the method free of shared state.
        Calendar calendar = new GregorianCalendar(UTC, Locale.US);
        calendar.setTime(date);
        // Locale.ROOT: without an explicit locale, %d is rendered with the
        // default locale's digits, which breaks the ISO 8601 format in
        // locales that do not use ASCII numerals.
        return String.format(
                Locale.ROOT,
                "%04d-%02d-%02dT%02d:%02d:%02dZ",
                calendar.get(Calendar.YEAR),
                calendar.get(Calendar.MONTH) + 1, // Calendar months are 0-based
                calendar.get(Calendar.DAY_OF_MONTH),
                calendar.get(Calendar.HOUR_OF_DAY),
                calendar.get(Calendar.MINUTE),
                calendar.get(Calendar.SECOND));
    }
}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.utils;
17
18 import java.util.ArrayList;
19 import java.util.Collections;
20 import java.util.List;
21 import java.util.regex.Matcher;
22 import java.util.regex.Pattern;
23
/**
 * Regex based text extraction helpers, inspired by the OutlinkExtractor
 * class of Nutch.
 */
public class RegexUtils {

    /**
     * Regex pattern to get URLs within a plain text: a scheme, a colon,
     * the address characters (including %XX escapes) and an optional
     * #fragment.
     *
     * @see <a
     *      href="http://www.truerwords.net/articles/ut/urlactivation.html">http://www.truerwords.net/articles/ut/urlactivation.html
     *      </a>
     */
    private static final String LINKS_REGEX =
            "([A-Za-z][A-Za-z0-9+.-]{1,120}:"
            + "[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}"
            + "(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)";

    /** Compiled once and shared by every call. */
    private static final Pattern LINKS_PATTERN =
            Pattern.compile(LINKS_REGEX, Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);

    /**
     * Extract urls from plain text.
     *
     * @param content The plain text content to examine
     * @return List of urls found in the plain text, in order of
     *         appearance; an empty list if the content is null or empty
     */
    public static List<String> extractLinks(String content) {
        boolean hasText = (content != null) && (content.length() != 0);
        if (!hasText) {
            return Collections.emptyList();
        }

        List<String> links = new ArrayList<String>();
        for (Matcher scan = LINKS_PATTERN.matcher(content); scan.find();) {
            links.add(scan.group());
        }
        return links;
    }
}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.utils;
17
18 import java.io.BufferedInputStream;
19 import java.io.BufferedOutputStream;
20 import java.io.ByteArrayInputStream;
21 import java.io.File;
22 import java.io.FileInputStream;
23 import java.io.FileOutputStream;
24 import java.io.IOException;
25 import java.io.InputStream;
26 import java.io.OutputStream;
27
28
/**
 * Wraps an input stream, reading it only once, but making it available
 * for rereading an arbitrary number of times.  The stream's bytes are
 * stored in memory up to a user specified maximum, and then stored in a
 * temporary file which is deleted when this class' close() method is called.
 */
public class RereadableInputStream extends InputStream {


    /**
     * Input stream originally passed to the constructor.
     */
    private InputStream originalInputStream;

    /**
     * The inputStream currently being used by this object to read contents;
     * may be the original stream passed in, or a stream that reads
     * the saved copy.
     */
    private InputStream inputStream;

    /**
     * Maximum number of bytes that can be stored in memory before
     * storage will be moved to a temporary file.
     */
    private int maxBytesInMemory;

    /**
     * True when the original stream is being read; set to false when
     * reading is set to use the stored data instead.
     */
    private boolean firstPass = true;

    /**
     * Whether or not the stream's contents are being stored in a file
     * as opposed to memory.
     */
    private boolean bufferIsInFile;

    /**
     * The buffer used to store the stream's content; this storage is moved
     * to a file when the stored data's size exceeds maxBytesInMemory.
     */
    private byte[] byteBuffer;

    /**
     * The total number of bytes read from the original stream at the time.
     * The end-of-stream marker is not counted.
     */
    private int size;

    /**
     * File used to store the stream's contents; is null until the stored
     * content's size exceeds maxBytesInMemory.
     */
    private File storeFile;

    /**
     * OutputStream used to save the content of the input stream in a
     * temporary file.
     */
    private OutputStream storeOutputStream;


    /**
     * Specifies whether or not to read to the end of stream on first
     * rewind.  This defaults to true.  If this is set to false,
     * then the first time when rewind() is called, only those bytes
     * already read from the original stream will be available from then on.
     */
    private boolean readToEndOfStreamOnFirstRewind = true;


    /**
     * Specifies whether or not to close the original input stream
     * when close() is called.  Defaults to true.
     */
    private boolean closeOriginalStreamOnClose = true;


    // TODO: At some point it would be better to replace the current approach
    // (specifying the above) with more automated behavior.  The stream could
    // keep the original stream open until EOF was reached.  For example, if:
    //
    // the original stream is 10 bytes, and
    // only 2 bytes are read on the first pass
    // rewind() is called
    // 5 bytes are read
    //
    // In this case, this instance gets the first 2 from its store,
    // and the next 3 from the original stream, saving those additional 3
    // bytes in the store.  In this way, only the maximum number of bytes
    // ever needed must be saved in the store; unused bytes are never read.
    // The original stream is closed when EOF is reached, or when close()
    // is called, whichever comes first.  Using this approach eliminates
    // the need to specify the flag (though makes implementation more complex).



    /**
     * Creates a rereadable input stream.
     *
     * @param inputStream stream containing the source of data
     * @param maxBytesInMemory maximum number of bytes to use to store
     *     the stream's contents in memory before switching to disk; note that
     *     the instance will preallocate a byte array whose size is
     *     maxBytesInMemory.  This byte array will be made available for
     *     garbage collection (i.e. its reference set to null) when the
     *     content size exceeds the array's size, or when there are no more
     *     references to the instance.
     * @param readToEndOfStreamOnFirstRewind Specifies whether or not to
     *     read to the end of stream on first rewind.  If this is set to false,
     *     then when rewind() is first called, only those bytes already read
     *     from the original stream will be available from then on.
     * @param closeOriginalStreamOnClose Specifies whether or not the
     *     original input stream is closed when it stops being the stream
     *     currently read from (on rewind() or close()).
     */
    public RereadableInputStream(InputStream inputStream, int maxBytesInMemory,
            boolean readToEndOfStreamOnFirstRewind,
            boolean closeOriginalStreamOnClose) {
        this.inputStream = inputStream;
        this.originalInputStream = inputStream;
        this.maxBytesInMemory = maxBytesInMemory;
        byteBuffer = new byte[maxBytesInMemory];
        this.readToEndOfStreamOnFirstRewind = readToEndOfStreamOnFirstRewind;
        this.closeOriginalStreamOnClose = closeOriginalStreamOnClose;
    }

    /**
     * Reads a byte from the stream, saving it in the store if it is being
     * read from the original stream.  Implements the abstract
     * InputStream.read().
     *
     * @return the read byte, or -1 on end of stream.
     * @throws IOException if the underlying stream fails, or if there is
     *     no current stream to read from because it has been closed
     */
    @Override
    public int read() throws IOException {
        if (inputStream == null) {
            throw new IOException("Stream is closed");
        }
        int inputByte = inputStream.read();
        // The end-of-stream marker is not data: storing it would append a
        // bogus 0xFF byte to the saved copy and inflate the size.
        if (firstPass && inputByte != -1) {
            saveByte(inputByte);
        }
        return inputByte;
    }

    /**
     * "Rewinds" the stream to the beginning for rereading.
     * @throws IOException
     */
    public void rewind() throws IOException {

        if (firstPass && readToEndOfStreamOnFirstRewind) {
            // Force read to end of stream to fill store with any
            // remaining bytes from original stream.
            while (read() != -1) {
                // empty loop
            }
        }

        closeStream();
        if (storeOutputStream != null) {
            // Closing flushes the buffered bytes so the file is complete
            // before it is reopened for reading.
            storeOutputStream.close();
            storeOutputStream = null;
        }
        firstPass = false;
        // Decide from where the bytes actually live, not from the size:
        // exactly maxBytesInMemory stored bytes are still held in memory.
        // The in-memory view is limited to the bytes really read so that
        // the unused tail of the preallocated array is never replayed.
        inputStream = bufferIsInFile
                ? new BufferedInputStream(new FileInputStream(storeFile))
                : new ByteArrayInputStream(byteBuffer, 0, size);
    }

    /**
     * Closes the input stream currently used for reading (may either be
     * the original stream or a memory or file stream after the first pass).
     * The original stream is left open when closeOriginalStreamOnClose
     * is false.
     *
     * @throws IOException
     */
    // Does anyone need/want for this to be public?
    private void closeStream() throws IOException {
        if (inputStream != null
                &&
                (inputStream != originalInputStream
                        || closeOriginalStreamOnClose)) {
            inputStream.close();
            inputStream = null;
        }
    }

    /**
     * Closes the input stream and removes the temporary file if one was
     * created.
     *
     * @throws IOException
     */
    @Override
    public void close() throws IOException {
        try {
            closeStream();
            super.close();
        } finally {
            try {
                // Still open if rewind() was never called after the
                // content spilled to disk; release the file handle.
                if (storeOutputStream != null) {
                    storeOutputStream.close();
                }
            } finally {
                storeOutputStream = null;
                if (storeFile != null) {
                    storeFile.delete();
                }
            }
        }
    }

    /**
     * Returns the number of bytes read from the original stream.
     *
     * @return number of bytes read
     */
    public int getSize() {
        return size;
    }

    /**
     * Saves the byte read from the original stream to the store.  The
     * first byte that does not fit in memory moves the whole store to a
     * temporary file.
     *
     * @param inputByte byte read from original stream
     * @throws IOException
     */
    private void saveByte(int inputByte) throws IOException {

        if (!bufferIsInFile) {
            boolean switchToFile = (size == (maxBytesInMemory));
            if (switchToFile) {
                storeFile = File.createTempFile("TIKA_streamstore_", ".tmp");
                bufferIsInFile = true;
                storeOutputStream = new BufferedOutputStream(
                        new FileOutputStream(storeFile));
                // Copy what was collected in memory, then the new byte.
                storeOutputStream.write(byteBuffer, 0, size);
                storeOutputStream.write(inputByte);
                byteBuffer = null; // release for garbage collection
            } else {
                byteBuffer[size] = (byte) inputByte;
            }
        } else {
            storeOutputStream.write(inputByte);
        }
        ++size;
    }
}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 /**
18 * Utilities.
19 */
20 @aQute.bnd.annotation.Version("1.0.0")
21 package org.apache.tika.utils;
0 # Licensed to the Apache Software Foundation (ASF) under one or more
1 # contributor license agreements. See the NOTICE file distributed with
2 # this work for additional information regarding copyright ownership.
3 # The ASF licenses this file to You under the Apache License, Version 2.0
4 # (the "License"); you may not use this file except in compliance with
5 # the License. You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 _па 3116
15 _і_ 2556
16 _на 2147
17 _пр 2138
18 на_ 2079
19 ай_ 1954
20 ста 1826
21 _ка 1654
22 пра 1646
23 аў_ 1634
24 _ў_ 1489
25 _з_ 1484
26 ава 1461
27 _ст 1451
28 ць_ 1391
29 га_ 1369
30 кі_ 1308
31 ага 1303
32 _у_ 1298
33 ны_ 1297
34 _ад 1277
35 _за 1271
36 _як 1261
37 ска 1234
38 _вы 1167
39 _да 1149
40 ам_ 1123
41 ii_ 1108
42 ых_ 1094
43 пер 1074
44 ара 1055
45 дзе 1022
46 _ра 997
47 _ма 950
48 ым_ 946
49 ая_ 934
50 най 933
51 ана 932
52 ыя_ 920
53 ца_ 919
54 не_ 899
55 льн 883
56 ла_ 846
57 пад 842
58 ван 836
59 ера 832
60 алі 831
61 пры 819
62 ае_ 818
63 цца 811
64 год 792
65 _пе 783
66 мі_ 779
67 аль 776
68 ня_ 773
69 лі_ 772
70 _th 765
71 анн 760
72 _i_ 748
73 да_ 744
74 ад_ 741
75 ала 737
76 ада 736
77 _ii 733
78 ва_ 729
79 асц 724
80 скі 714
81 рад 710
82 _не 701
83 _бы 688
84 рац 681
85 аст 679
86 кам 668
87 ся_ 668
88 ных 651
89 _са 650
90 іст 648
91 ары 640
92 ыі_ 636
93 оль 635
94 тар 633
95 the 625
96 ння 625
97 кай 618
98 ля_ 610
99 _аб 607
100 ка_ 607
101 ама 604
102 іка 604
103 кар 599
104 _of 594
105 of_ 587
106 he_ 586
107 ki_ 584
108 on_ 580
109 дзі 580
110 аны 579
111 ным 579
112 рам 578
113 рас 577
114 рав 569
115 але 568
116 _та 561
117 er_ 561
118 ры_ 555
119 які 555
120 ці_ 553
121 дзя 552
122 вал 548
123 тра 548
124 кан 545
125 нас 544
126 наг 542
127 _го 540
128 таг 540
129 аго 539
130 тва 539
131 адз 536
132 нік 535
133 амі 534
134 ні_ 533
135 ран 527
136 ку_ 526
137 нне 517
138 ата 513
139 іі_ 508
140 сці 507
141 ski 505
142 ія_ 505
143 _ча 503
144 раз 501
145 ацы 499
146 es_ 498
147 зе_ 493
148 нск 492
149 _ва 486
150 зна 485
151 _лі 484
152 та_ 481
153 _св 480
154 ую_ 476
155 _co 472
156 аць 471
157 лас 468
158 тры 466
159 ддз 460
160 час 460
161 лад 459
162 го_ 457
163 ств 450
164 іх_ 450
165 стр 449
166 ion 445
167 сам 442
168 _га 441
169 ная 441
170 чна 441
171 _сі 437
172 так 437
173 _дз 436
174 ім_ 436
175 тал 435
176 ныя 434
177 us_ 428
178 ецц 428
179 _de 426
180 _ал 424
181 одд 424
182 ра_ 424
183 ыў_ 424
184 iii 418
185 аро 416
186 лік 415
187 _су 414
188 кла 413
189 лен 410
190 таў 408
191 рым 406
192 вае 404
193 тан 402
194 еры 400
195 каг 400
196 _рэ 398
197 энн 398
198 оўн 395
199 одз 394
200 _ін 393
201 вы_ 392
202 _wi 387
203 был 386
204 па_ 386
205 там 384
206 пас 382
207 раў 381
208 ова 380
209 дна 379
210 рыс 379
211 енн 373
212 нав 372
213 and 371
214 ычн 371
215 _re 370
216 апа 369
217 ма_ 369
218 _по 368
219 йск 368
220 ход 368
221 _ме 366
222 _гр 364
223 ах_ 364
224 адн 363
225 _in 357
226 is_ 356
227 ані 355
228 кім 355
229 рыя 355
230 аве 353
231 для 353
232 _an 352
233 _вя 351
234 tio 351
235 аві 349
236 гра 348
237 овы 348
238 анс 345
239 _дл 343
240 _сп 343
241 ача 343
242 ыка 343
243 ыст 343
244 аўн 342
245 al_ 341
246 _бе 339
247 _li 338
248 кал 338
249 _ас 337
250 ows 336
251 _ma 335
252 _шт 335
253 _ве 333
254 _кр 333
255 вар 333
256 ьна 331
257 ман 330
258 стэ 330
259 _ба 328
260 спа 328
261 оў_ 327
262 ўна 327
263 _ар 326
264 яго 325
265 кія 323
266 _vi 322
267 аля 319
268 пар 319
269 цыя 318
270 кая 317
271 _po 316
272 _яг 316
273 мен 316
274 пал 316
275 нач 315
276 рат 312
277 анд 311
278 аса 309
279 ой_ 308
280 то_ 308
281 ар_ 307
282 ле_ 307
283 пам 307
284 _тэ 306
285 ati 306
286 nd_ 306
287 ter 306
288 _ім 303
289 род 303
290 an_ 302
291 вай 302
292 тэм 302
293 цыі 301
294 _а_ 300
295 ваў 299
296 вер 299
297 мы_ 299
298 нал 299
299 _мо 298
300 быў 298
301 асн 296
302 en_ 295
303 ліс 295
304 ак_ 291
305 аўс 290
306 што 290
307 сва 288
308 вык 287
309 рус 287
310 аме 286
311 ака 284
312 нам 284
313 чны 284
314 _ко 283
315 рал 283
316 яў_ 282
317 ены 281
318 чан 281
319 _ат 280
320 авы 280
321 сту 279
322 ася 276
323 ахо 276
324 ора 276
325 сіс 276
326 сто 275
327 _xi 274
328 аец 274
329 як_ 274
330 ялі 274
331 гэт 273
332 кра 273
333 _be 272
334 _ar 271
335 нні 271
336 одн 271
337 ты_ 271
338 му_ 270
339 паў 269
340 яка 269
341 _мі 268
342 іна 268
343 ду_ 267
344 аб_ 266
345 ына 266
346 ўся 266
347 de_ 265
348 _гэ 264
349 кіх 264
350 аюц 263
351 уль 262
352 _pr 261
353 ed_ 261
354 _ск 260
355 ia_ 260
356 чын 260
357 ыма 260
358 _pa 259
359 ўва 259
360 ало 257
361 мар 257
362 _ро 255
363 ды_ 255
364 ела 255
365 сць 255
366 асп 254
367 кас 254
368 льк 254
369 оры 254
370 чы_ 254
371 эта 254
372 азв 252
373 даў 252
374 іта 252
375 _во 251
376 _зн 249
377 бел 249
378 _то 248
379 _гу 247
380 iv_ 247
381 цый 247
382 зен 246
383 ыла 246
384 асл 245
385 мал 245
386 wsk 244
387 атр 244
388 зя_ 244
389 йна 244
390 ing 242
391 уск 242
392 кса 241
393 мас 241
394 чэн 241
395 ады 240
396 вац 240
397 ода 240
398 _st 239
399 ch_ 239
400 арт 239
401 _ап 237
402 _xv 236
403 _ся 236
404 _тр 236
405 in_ 236
406 вял 236
407 за_ 236
408 тэр 236
409 цы_ 236
410 ена 235
411 пав 235
412 _to 234
413 ng_ 234
414 акс 234
415 _бо 233
416 _ві 233
417 кав 233
418 іра 232
419 re_ 231
420 аба 231
421 le_ 230
422 сты 230
423 ант 229
424 ную 229
425 яй_ 229
426 раб 228
427 чал 228
428 іў_ 228
429 sta 227
430 тор 227
431 амп 226
432 піс 226
433 ix_ 225
434 нта 224
435 ent 223
436 to_ 223
437 _no 222
438 _ан 221
439 дав 221
440 каў 221
441 тур 221
442 раг 220
443 _ге 219
444 ist 219
445 ыва 219
446 ьны 219
447 or_ 218
448 зва 218
449 _so 217
450 ару 217
451 нар 216
452 ьні 216
453 ерш 215
454 рын 215
455 rus 214
456 at_ 213
457 ве_ 213
458 пач 213
459 іла 213
460 _ca 212
461 ем_ 212
462 мер 212
463 ст_ 212
464 _bo 211
465 або 211
466 ніц 210
467 ымі 210
468 ўны 210
469 _iv 209
470 іва 209
471 ўля 209
472 vii 208
473 маг 208
474 _ру 207
475 вых 207
476 дар 207
477 _a_ 206
478 _ты 206
479 xvi 206
480 нов 206
481 юць 206
482 _un 205
483 _се 205
484 nte 205
485 аза 205
486 анц 205
487 ель 205
488 _la 204
489 ылі 204
490 _ўс 203
491 ыйн 203
492 бра 202
493 льш 202
494 літ 202
495 нае 202
496 тым 202
497 ўні 202
498 _se 201
499 ndo 201
500 оўв 201
501 пан 201
502 яе_ 201
503 win 200
504 іцы 200
505 _ды 199
506 _кі 199
507 наў 199
508 рыі 199
509 шча 199
510 аты 198
511 мат 198
512 пол 198
513 сна 198
514 _mi 197
515 _аў 197
516 ate 197
517 агр 197
518 ога 197
519 са_ 197
520 спр 197
521 dow 196
522 буд 195
523 лав 195
524 ль_ 195
525 рак 195
526 гад 194
527 рма 194
528 мов 193
529 сла 193
530 ькі 193
531 адк 192
532 дал 192
533 рык 192
534 іць 192
535 ind 191
536 гру 191
537 ічн 191
538 _na 190
539 et_ 190
540 na_ 190
541 амо 190
542 _fo 189
543 тна 189
544 _sa 188
545 асі 188
546 ган 188
547 ўск 188
548 int 187
549 рыт 187
550 дам 186
551 лан 186
552 мпе 186
553 туп 186
554 імп 186
555 _др 185
556 _ры 185
557 лар 185
558 тоў 185
559 _бу 184
560 la_ 184
561 інш 183
562 ver 182
563 тав 182
564 ікі 182
565 _le 181
566 гор 181
567 іза 181
568 _al 180
569 дра 180
570 яўл 180
571 vi_ 179
572 ляе 179
573 ту_ 179
574 ta_ 177
575 te_ 177
576 вой 177
577 вор 177
578 lar 175
579 аду 175
580 жна 175
581 ков 175
582 кры 174
583 нцы 174
584 яко 174
585 абі 173
586 гал 173
587 рна 173
588 ік_ 173
589 _му 172
590 ан_ 172
591 ачы 172
592 ваю 172
593 нен 172
594 che 171
595 for 171
596 ліц 171
597 ce_ 170
598 lin 170
599 адс 170
600 наз 170
601 _ro 169
602 ws_ 169
603 зі_ 169
604 од_ 168
605 ers 167
606 st_ 167
607 ас_ 167
608 зіц 167
609 тка 167
610 ён_ 167
611 _ку 166
612 tor 166
613 йны 166
614 ne_ 165
615 апі 165
616 акі 164
617 бар 164
618 кір 164
619 ліч 164
620 рск 164
621 тыч 164
622 ыю_ 164
623 _ус 163
624 _do 162
625 аво 162
626 аўт 162
627 вед 162
628 дан 162
629 дны 162
630 _am 161
631 ns_ 161
632 вым 161
633 ніі 161
634 рап 161
635 _ца 160
636 os_ 160
637 акт 160
638 жан 160
639 нак 160
640 шы_ 160
641 ілі 160
642 _чы 159
643 арм 159
644 бол 159
645 ей_ 159
646 каз 159
647 тол 159
648 ума 159
649 адо 158
650 ейс 158
651 ыкл 158
652 ыць 158
653 _но 157
654 гар 157
655 _ле 156
656 пы_ 156
657 рон 156
658 ін_ 156
659 няў 155
660 скл 155
661 це_ 155
662 _di 154
663 вет 154
664 ека 154
665 роў 154
666 сав 154
667 _fr 153
668 _зв 153
669 ste 153
670 апр 153
671 ica 152
672 гер 152
673 заб 152
674 зін 152
675 паз 152
676 _mo 151
677 el_ 151
678 ie_ 151
679 тай 151
680 ызн 151
681 эмы 151
682 ra_ 150
683 um_ 150
684 гул 150
685 пла 150
686 руп 150
687 ыні 150
688 _ch 149
689 _пл 149
690 арэ 149
691 вод 149
692 рта 149
693 цар 149
694 am_ 148
695 аму 148
696 est 147
697 адр 147
698 лів 147
699 сан 147
700 as_ 146
701 апе 146
702 аск 146
703 дад 146
704 ект 146
705 зас 146
706 цэн 146
707 іль 146
708 _ак 145
709 con 145
710 азн 145
711 кую 145
712 нда 145
713 нем 145
714 рга 145
715 ску 145
716 тво 145
717 цтв 145
718 _c_ 144
719 _яў 144
720 ot_ 144
721 лін 144
722 нап 144
723 яна 144
724 яшч 144
725 кол 143
726 льс 143
727 ну_ 143
728 ршы 143
729 _ne 142
730 _te 142
731 his 142
732 блі 142
733 вяр 142
734 кта 142
735 _me 141
736 ms_ 141
737 nt_ 141
738 азі 141
739 анг 141
740 віл 141
741 аін 140
742 зам 140
743 суп 140
744 _sp 139
745 _фа 139
746 ari 139
747 вып 139
748 ент 139
749 коў 139
750 мін 139
751 рэн 139
752 тро 139
753 ill 138
754 ran 138
755 sto 138
756 амы 138
757 кае 138
758 роз 138
759 чым 138
760 іча 138
761 bel 137
762 кат 137
763 ноў 137
764 ост 137
765 пак 137
766 уда 137
767 юцц 137
768 _xx 136
769 _ўз 136
770 ry_ 136
771 амн 136
772 аўл 136
773 ерс 136
774 зел 136
775 лам 136
776 мны 136
777 ншы 136
778 рыз 136
779 рэд 136
780 ючы 136
781 all 135
782 ic_ 135
783 пап 135
784 _ba 134
785 _фр 134
786 _ён 134
787 афі 134
788 зав 134
789 нія 134
790 све 134
791 яме 134
792 _іс 133
793 lan 133
794 se_ 133
795 айн 133
796 арс 133
797 ацэ 133
798 раі 133
799 уча 133
800 іся 133
801 _gr 132
802 ela 132
803 men 132
804 uni 132
805 зац 132
806 ней 132
807 оду 132
808 энт 132
809 _гі 131
810 зах 131
811 мац 131
812 нст 131
813 она 131
814 спе 131
815 ула 131
816 _ge 130
817 _v_ 130
818 xii 130
819 вен 130
820 вік 130
821 ру_ 130
822 ьск 130
823 _зм 129
824 edi 129
825 per 129
826 ане 129
827 мад 129
828 маў 129
829 фар 129
830 arm 128
831 bot 128
832 аз_ 128
833 нтр 128
834 ско 128
835 цэс 128
836 ыяд 128
837 іку 128
838 ўта 128
839 ета 127
840 між 127
841 іса 127
842 _ho 126
843 _ха 126
844 com 126
845 ome 126
846 бал 126
847 ярж 126
848 _si 125
849 _кл 125
850 era 125
851 атк 125
852 мес 125
853 ржа 125
854 _su 124
855 _зб 124
856 ar_ 124
857 nal 124
858 rea 124
859 він 124
860 зап 124
861 кул 124
862 лаў 124
863 ніз 124
864 сля 124
865 ўле 124
866 _кн 123
867 ani 123
868 aru 123
869 ich 123
870 ona 123
871 адп 123
872 бер 123
873 ваг 123
874 зяр 123
875 мпа 123
876 цка 123
877 імі 123
878 _ам 122
879 _ну 122
880 mar 122
881 ńsk 122
882 асо 122
883 гу_ 122
884 зан 122
885 нан 122
886 нах 122
887 lit 121
888 nce 121
889 sch 121
890 ату 121
891 уст 121
892 чат 121
893 ўст 121
894 _at 120
895 ian 120
896 pol 120
897 ris 120
898 агу 120
899 дст 119
900 дын 119
901 ерн 119
902 зал 119
903 сар 119
904 тны 119
905 удз 119
906 _ni 118
907 _ла 118
908 бо_ 118
909 ніч 118
910 усі 118
911 ive 117
912 rat 117
913 абр 117
914 дск 117
915 кцы 117
916 мет 117
917 нна 117
918 тат 117
919 ію_ 117
920 _бр 116
921 der 116
922 no_ 116
923 саб 116
924 яды 116
925 _г_ 115
926 ens 115
927 ons 115
928 rs_ 115
929 ано 115
930 ато 115
931 эра 115
932 _is 114
933 ali 114
934 les 114
935 lis 114
936 man 114
937 tan 114
938 два 114
939 дыч 114
940 кты 114
941 над 114
942 ярэ 114
943 _da 113
944 ten 113
945 зей 113
946 ло_ 113
947 льт 113
948 нны 113
949 нум 113
950 яец 113
951 _дв 112
952 _лю 112
953 _сл 112
954 tra 112
955 адм 112
956 дае 112
957 нек 112
958 озн 112
959 упы 112
960 іны 112
961 _жа 111
962 _яе 111
963 ll_ 111
964 арк 111
965 ліз 111
966 стк 111
967 цоў 111
968 _ha 110
969 _pe 110
970 _ян 110
971 ka_ 110
972 дка 110
973 ойс 110
974 рай 110
975 pro 109
976 ros 109
977 абл 109
978 авя 109
979 аха 109
980 ву_ 109
981 выя 109
982 еда 109
983 раф 109
984 фра 109
985 цыю 109
986 _ja 108
987 th_ 108
988 арг 108
989 пат 108
990 ыло 108
991 эты 108
992 ях_ 108
993 іцц 108
994 _on 107
995 ard 107
996 ell 107
997 lle 107
998 om_ 107
999 str 107
1000 пус 107
1001 _he 106
1002 _дэ 106
1003 art 106
1004 coa 106
1005 por 106
1006 адт 106
1007 аца 106
1008 бав 106
1009 эры 106
1010 _tr 105
1011 oli 105
1012 выс 105
1013 нты 105
0 # Licensed to the Apache Software Foundation (ASF) under one or more
1 # contributor license agreements. See the NOTICE file distributed with
2 # this work for additional information regarding copyright ownership.
3 # The ASF licenses this file to You under the Apache License, Version 2.0
4 # (the "License"); you may not use this file except in compliance with
5 # the License. You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 _de 516022
15 es_ 365077
16 de_ 334150
17 la_ 263275
18 el_ 261190
19 _el 243111
20 _la 233181
21 _co 177338
22 _i_ 171080
23 en_ 169592
24 ent 166275
25 que 166181
26 ls_ 157560
27 nt_ 150408
28 _a_ 143945
29 _es 142224
30 _qu 141042
31 _se 140738
32 _en 134188
33 _pe 132199
34 er_ 131611
35 _un 123347
36 per 123282
37 al_ 120012
38 ia_ 118076
39 del 116515
40 _al 107051
41 _l_ 105983
42 at_ 105377
43 est 105341
44 men 104773
45 ue_ 104665
46 na_ 104442
47 els 101956
48 ar_ 101658
49 _ca 100754
50 va_ 97716
51 les 97400
52 ts_ 95576
53 _va 93846
54 _pr 91960
55 _re 90487
56 ió_ 89704
57 és_ 87754
58 ra_ 86970
59 ant 86210
60 _d_ 84565
61 ns_ 83984
62 res 79034
63 con 78700
64 com 76189
65 re_ 75747
66 _le 73315
67 ció 71365
68 ta_ 71034
69 _ma 69384
70 tat 69135
71 _po 67807
72 des 67019
73 _pa 65755
74 sta 65224
75 aci 64934
76 amb 64831
77 _no 63591
78 ica 62191
79 da_ 61970
80 ons 61946
81 un_ 60044
82 _am 59421
83 tre 58786
84 era 55582
85 _di 55225
86 ter 55129
87 an_ 54289
88 ca_ 54098
89 _in 53935
90 ita 53038
91 ues 52668
92 una 52339
93 _an 49240
94 _te 49089
95 mb_ 48417
96 pro 47675
97 nci 47057
98 _mo 47048
99 cia 46374
100 ion 46213
101 ran 46058
102 ona 45850
103 par 45623
104 ist 44820
105 _tr 43920
106 ada 43914
107 tra 43026
108 als 41706
109 _fo 41079
110 om_ 40973
111 _ll 40939
112 _és 40055
113 ria 39718
114 _ha 39263
115 ir_ 38985
116 tes 37588
117 ame 37502
118 eix 36953
119 _so 36817
120 os_ 36738
121 rs_ 36655
122 ser 36569
123 nta 35894
124 ell 35382
125 ntr 35005
126 or_ 34878
127 aqu 34384
128 ura 34186
129 cio 33689
130 ren 33576
131 tan 33546
132 esp 33545
133 ste 33496
134 ici 33411
135 car 33386
136 _ta 33334
137 pre 33213
138 any 33166
139 eri 33092
140 ina 32951
141 tal 32531
142 str 32499
143 _ba 32466
144 més 32434
145 art 32365
146 _me 32132
147 lla 32045
148 nts 31902
149 _aq 31778
150 ats 31473
151 tic 31409
152 _si 31263
153 sa_ 31258
154 _ar 31183
155 bre 30897
156 _to 30832
157 ten 30645
158 le_ 30308
159 _fi 30264
160 ort 30218
161 man 30192
162 ver 30142
163 for 29957
164 _ex 29143
165 sti 29068
166 _ac 28797
167 nte 28630
168 us_ 28612
169 eu_ 28538
170 nom 28515
171 lit 28502
172 sen 28418
173 seg 28278
174 tor 28235
175 err 28123
176 _vi 28083
177 _mé 28005
178 ers 27946
179 _su 27596
180 _sa 27595
181 mar 27488
182 ic_ 27478
183 lle 27184
184 ari 26678
185 ont 26666
186 ara 26489
187 ori 26486
188 _fe 26413
189 seu 26393
190 _gr 26304
191 qua 26246
192 _hi 26199
193 is_ 26180
194 int 25926
195 gra 25649
196 tar 25610
197 _mi 25597
198 no_ 25452
199 nes 25065
200 alt 24511
201 ix_ 24456
202 ll_ 24451
203 _ve 24208
204 por 24096
205 ere 24061
206 ans 24031
207 arr 23938
208 ime 23877
209 fer 23807
210 ali 23651
211 all 23633
212 ial 23506
213 ass 23454
214 pri 23452
215 _o_ 23436
216 ual 23234
217 ana 23127
218 rre 23061
219 esc 22896
220 ect 22871
221 ome 22854
222 nal 22845
223 _s_ 22560
224 ral 22518
225 nic 22285
226 _do 22283
227 rt_ 22264
228 st_ 22258
229 mer 22240
230 tam 22231
231 uni 22189
232 ma_ 22184
233 ble 22180
234 ene 22143
235 nti 22140
236 tur 22048
237 van 22021
238 orm 22005
239 act 21946
240 ins 21897
241 ies 21892
242 lar 21738
243 _ci 21641
244 cs_ 21161
245 se_ 20858
246 pos 20683
247 rra 20605
248 ava 20555
249 fin 20460
250 rec 20446
251 tro 20443
252 rac 20323
253 rat 20233
254 tot 20207
255 ade 20156
256 ssi 20136
257 ner 20125
258 cat 19966
259 emp 19904
260 sev 19886
261 one 19873
262 enc 19861
263 rma 19747
264 gen 19728
265 it_ 19693
266 den 19676
267 lan 19577
268 mat 19547
269 itz 19446
270 _ge 19434
271 der 19382
272 rta 19204
273 ènc 19186
274 ata 19034
275 ot_ 18970
276 fic 18950
277 ate 18866
278 nat 18829
279 _cr 18819
280 ide 18781
281 eva 18674
282 on_ 18627
283 _ce 18458
284 rti 18387
285 tit 18385
286 _mu 18237
287 rim 18230
288 _fa 18218
289 ens 18112
290 mes 18112
291 cap 18107
292 omp 18045
293 eta 18011
294 ric 17998
295 reg 17890
296 tri 17879
297 ess 17709
298 inc 17709
299 ri_ 17687
300 cci 17676
301 _or 17654
302 cte 17632
303 min 17550
304 ode 17544
305 nar 17519
306 egu 17514
307 erm 17464
308 sos 17459
309 aix 17443
310 ost 17412
311 olt 17332
312 cas 17292
313 tza 17252
314 cor 17228
315 ssa 17182
316 tem 16913
317 qui 16901
318 ili 16857
319 dor 16849
320 ón_ 16828
321 obl 16777
322 ret 16755
323 bar 16710
324 cal 16685
325 ena 16678
326 te_ 16677
327 rad 16633
328 alm 16596
329 ien 16571
330 rop 16567
331 vol 16554
332 obr 16529
333 rme 16443
334 are 16439
335 mun 16433
336 _fr 16417
337 ven 16399
338 ors 16387
339 san 16350
340 _im 16318
341 can 16285
342 dia 16285
343 bé_ 16281
344 mol 16137
345 ill 16059
346 me_ 16036
347 ese 15975
348 ixe 15972
349 ha_ 15947
350 _ai 15854
351 et_ 15830
352 lic 15800
353 val 15792
354 _ro 15789
355 ale 15784
356 _du 15675
357 rit 15631
358 cip 15576
359 ert 15538
360 lme 15519
361 ord 15447
362 and 15425
363 ltr 15408
364 gue 15316
365 ado 15312
366 ern 15098
367 iqu 15087
368 len 14989
369 ris 14984
370 nya 14948
371 cul 14944
372 cen 14918
373 ol_ 14907
374 ou_ 14906
375 hi_ 14874
376 rei 14862
377 ula 14766
378 ora 14745
379 tin 14743
380 pel 14711
381 itu 14694
382 eni 14603
383 arc 14590
384 dic 14407
385 _ob 14388
386 imp 14387
387 dis 14310
388 nit 14227
389 ele 14009
390 ani 13971
391 ast 13971
392 eme 13932
393 ny_ 13839
394 ive 13822
395 _na 13820
396 tua 13809
397 oma 13726
398 _pl 13717
399 uer 13693
400 osa 13686
401 ron 13581
402 ini 13555
403 ala 13512
404 ida 13500
405 cie 13468
406 cre 13468
407 mon 13465
408 end 13447
409 tac 13438
410 _er 13426
411 mpo 13404
412 ure 13374
413 rie 13339
414 tei 13267
415 edi 13254
416 unt 13234
417 iu_ 13221
418 nse 13173
419 nen 13153
420 nor 13079
421 _só 13030
422 rin 12959
423 són 12945
424 mit 12928
425 oca 12908
426 ine 12900
427 ave 12869
428 bla 12846
429 mbé 12845
430 mpl 12834
431 _ap 12825
432 ics 12814
433 cad 12803
434 nst 12802
435 ves 12789
436 nia 12782
437 sit 12782
438 rar 12769
439 pla 12734
440 _as 12727
441 _gu 12721
442 _ja 12720
443 _ho 12715
444 uta 12678
445 ota 12648
446 ses 12620
447 via 12599
448 ega 12561
449 _li 12555
450 lac 12526
451 dre 12515
452 gle 12477
453 sol 12468
454 pob 12447
455 col 12441
456 sió 12419
457 anc 12363
458 nca 12341
459 ind 12339
460 sse 12307
461 rob 12271
462 lat 12262
463 nda 12249
464 _ju 12215
465 cri 12161
466 rri 12146
467 ane 12139
468 mbr 12080
469 rea 12049
470 llo 11949
471 ema 11932
472 gua 11875
473 rd_ 11866
474 sar 11789
475 rel 11778
476 ber 11736
477 erò 11715
478 sic 11598
479 _ne 11592
480 cam 11520
481 ple 11474
482 pod 11458
483 met 11451
484 ctu 11440
485 rep 11438
486 pal 11427
487 _da 11420
488 rib 11400
489 ati 11390
490 ndi 11390
491 rò_ 11380
492 rio 11368
493 esa 11341
494 _ab 11328
495 lli 11321
496 dur 11318
497 cos 11280
498 iva 11228
499 igu 11219
500 _be 11216
501 avi 11215
502 ut_ 11203
503 fra 11164
504 bli 11153
505 lem 11146
506 ys_ 11134
507 uan 11131
508 ces 11109
509 rom 11076
510 ove 11038
511 nys 11002
512 gui 10995
513 nça 10985
514 tiv 10957
515 egl 10943
516 lia 10896
517 ya_ 10893
518 ult 10843
519 erv 10831
520 _ga 10734
521 rna 10732
522 _ri 10676
523 _jo 10674
524 ing 10662
525 emb 10644
526 eus 10637
527 nce 10612
528 nde 10594
529 lor 10593
530 nsi 10581
531 ete 10554
532 rca 10542
533 ja_ 10524
534 spe 10499
535 lta 10474
536 ifi 10465
537 _em 10456
538 pol 10444
539 dir 10436
540 ngu 10431
541 nis 10409
542 cer 10390
543 don 10375
544 _pu 10323
545 eco 10300
546 _ra 10262
547 sob 10260
548 ivi 10238
549 ban 10229
550 ian 10170
551 sis 10162
552 _fu 10153
553 alg 10112
554 rod 10078
555 lt_ 10050
556 _cl 10043
557 pan 10040
558 dif 10028
559 ua_ 10012
560 tel 9978
561 li_ 9977
562 cti 9957
563 si_ 9943
564 mor 9937
565 gon 9889
566 rés 9888
567 egi 9884
568 ni_ 9864
569 abl 9848
570 _bo 9844
571 olu 9815
572 mal 9813
573 pas 9800
574 ap_ 9785
575 sme 9781
576 cta 9727
577 _ad 9715
578 hav 9713
579 ima 9703
580 mic 9686
581 apa 9669
582 tru 9662
583 ça_ 9643
584 tir 9636
585 mil 9608
586 til 9572
587 ego 9564
588 ano 9507
589 pen 9507
590 _au 9488
591 tge 9463
592 _oc 9455
593 spr 9442
594 cla 9407
595 orr 9393
596 ciu 9374
597 ola 9352
598 oni 9339
599 loc 9337
600 ram 9321
601 _on 9317
602 _cu 9285
603 _lo 9251
604 sco 9204
605 _vo 9195
606 nad 9194
607 ès_ 9179
608 rov 9167
609 esi 9153
610 div 9145
611 as_ 9130
612 vis 9114
613 fou 9048
614 iut 9043
615 ard 8995
616 omi 8972
617 nac 8969
618 tiu 8965
619 oc_ 8913
620 aba 8860
621 olo 8854
622 ire 8841
623 pré 8827
624 il_ 8820
625 oba 8808
626 ang 8790
627 erc 8775
628 òri 8767
629 _pi 8739
630 ior 8736
631 ite 8721
632 ii_ 8705
633 bra 8694
634 eny 8693
635 ism 8690
636 exp 8687
637 pot 8675
638 oli 8626
639 ore 8624
640 _ti 8613
641 ipa 8589
642 din 8544
643 lls 8534
644 omb 8529
645 mpe 8517
646 cel 8502
647 son 8501
648 atr 8482
649 dar 8460
650 _at 8397
651 sso 8378
652 sid 8365
653 its 8361
654 _go 8338
655 aca 8333
656 pec 8306
657 ga_ 8289
658 ps_ 8274
659 ge_ 8262
660 atu 8257
661 eng 8206
662 rqu 8206
663 etr 8170
664 oci 8168
665 lgu 8156
666 stà 8124
667 ler 8122
668 lon 8118
669 gun 8109
670 roc 8104
671 rer 8100
672 lis 8081
673 odu 8065
674 tà_ 8062
675 xen 8043
676 his 7998
677 ela 7945
678 sem 7945
679 tad 7919
680 atg 7895
681 in_ 7889
682 sup 7878
683 ote 7860
684 uir 7809
685 sal 7776
686 ust 7768
687 ama 7756
688 reu 7735
689 fil 7690
690 spa 7671
691 neg 7669
692 ís_ 7662
693 uns 7656
694 amp 7646
695 vil 7641
696 ndr 7631
697 adi 7617
698 spo 7599
699 eur 7590
700 _xi 7571
701 eci 7552
702 tòr 7550
703 reb 7523
704 onc 7522
705 ixa 7517
706 uci 7514
707 bal 7503
708 eve 7490
709 duc 7474
710 rce 7431
711 han 7411
712 oss 7406
713 bri 7389
714 rig 7372
715 vin 7362
716 gut 7357
717 jor 7343
718 ros 7317
719 eti 7308
720 sca 7294
721 ars 7286
722 rn_ 7286
723 ud_ 7268
724 ucc 7258
725 uit 7244
726 ira 7243
727 ipi 7239
728 _br 7237
729 var 7233
730 anç 7223
731 gre 7223
732 ila 7217
733 ei_ 7205
734 rso 7194
735 mpr 7170
736 riu 7125
737 efe 7122
738 eli 7116
739 isi 7105
740 té_ 7098
741 rem 7075
742 und 7059
743 ova 7043
744 àni 7041
745 inf 7026
746 vid 7026
747 sin 7008
748 pon 7004
749 nir 7000
750 pli 6922
751 ife 6896
752 bas 6873
753 arl 6871
754 osi 6859
755 aus 6851
756 sia 6834
757 upa 6821
758 iar 6812
759 _ag 6800
760 scr 6758
761 ovi 6753
762 uen 6729
763 gia 6727
764 uti 6727
765 xem 6727
766 fon 6719
767 rci 6717
768 nve 6706
769 iss 6705
770 lin 6698
771 aco 6694
772 org 6677
773 tim 6668
774 cac 6662
775 gad 6659
776 ref 6647
777 lec 6646
778 ext 6638
779 sig 6636
780 stò 6622
781 llu 6611
782 sat 6606
783 pat 6604
784 acc 6577
785 orn 6571
786 ià_ 6556
787 igi 6555
788 lad 6555
789 iat 6553
790 ume 6547
791 ajo 6541
792 gan 6525
793 rro 6523
794 nov 6518
795 ope 6518
796 ba_ 6515
797 tja 6512
798 _ed 6493
799 cit 6468
800 ger 6462
801 _he 6451
802 fun 6434
803 eda 6426
804 maj 6401
805 arg 6374
806 _ni 6371
807 pa_ 6369
808 ito 6356
809 enç 6344
810 env 6341
811 udi 6337
812 ui_ 6317
813 aig 6306
814 dat 6296
815 _eu 6285
816 lim 6278
817 rot 6262
818 sor 6261
819 iga 6249
820 xa_ 6247
821 íti 6224
822 leg 6221
823 dos 6196
824 pet 6192
825 au_ 6181
826 olí 6152
827 let 6144
828 uny 6143
829 _av 6136
830 eso 6133
831 ebr 6125
832 bat 6104
833 jun 6092
834 tud 6092
835 cto 6077
836 asa 6062
837 upe 6055
838 tab 6053
839 ben 6044
840 pi_ 6039
841 cle 6011
842 rga 6005
843 ne_ 6004
844 imi 6002
845 ond 5990
846 rà_ 5990
847 poc 5965
848 tav 5944
849 ges 5932
850 rav 5920
851 iba 5915
852 tia 5906
853 rev 5905
854 àri 5897
855 itj 5881
856 lun 5875
857 exe 5874
858 gar 5843
859 usa 5839
860 abi 5825
861 ots 5818
862 vel 5816
863 soc 5815
864 aut 5798
865 sio 5794
866 exi 5791
867 ole 5782
868 tàn 5769
869 mpa 5757
870 lti 5752
871 ac_ 5741
872 elo 5728
873 red 5719
874 què 5714
875 ogr 5712
876 em_ 5710
877 _bi 5697
878 uro 5689
879 evo 5683
880 cid 5666
881 ius 5663
882 rte 5640
883 nvi 5633
884 bit 5623
885 èri 5621
886 òni 5609
887 une 5606
888 veg 5591
889 evi 5576
890 dem 5573
891 vit 5542
892 _té 5541
893 uda 5491
894 sub 5486
895 mad 5484
896 nsa 5484
897 uct 5483
898 uè_ 5475
899 uar 5472
900 opo 5470
901 _af 5465
902 lam 5454
903 mod 5441
904 eal 5429
905 tig 5412
906 sul 5409
907 mos 5398
908 obe 5388
909 im_ 5375
910 _ut 5373
911 pit 5373
912 stu 5364
913 di_ 5346
914 fet 5320
915 ig_ 5313
916 eno 5311
917 ton 5311
918 cis 5293
919 nim 5292
920 sud 5289
921 ecc 5284
922 uin 5245
923 pul 5239
924 alu 5233
925 àti 5229
926 fa_ 5220
927 ece 5213
928 dec 5200
929 cep 5178
930 dei 5171
931 ruc 5169
932 là_ 5168
933 gov 5166
934 due 5160
935 _hu 5159
936 ea_ 5159
937 ixò 5154
938 xò_ 5152
939 dit 5145
940 emi 5129
941 set 5128
942 ero 5125
943 xí_ 5123
944 gin 5117
945 opi 5112
946 mps 5108
947 ose 5083
948 ixí 5082
949 mas 5078
950 ono 5071
951 nyo 5070
952 sto 5049
953 uat 5042
954 dep 5035
955 ocu 5033
956 _of 5021
957 ede 5004
958 rsi 5002
959 cin 5001
960 cab 4986
961 uei 4979
962 equ 4973
963 gat 4972
964 _ec 4964
965 uel 4957
966 die 4930
967 jan 4924
968 ign 4916
969 iet 4909
970 oll 4909
971 onv 4907
972 lte 4905
973 nqu 4905
974 epr 4892
975 omu 4883
976 ms_ 4880
977 los 4879
978 um_ 4877
979 isc 4871
980 ltu 4866
981 log 4865
982 mis 4861
983 dan 4844
984 lít 4799
985 ras 4784
986 med 4778
987 nvo 4770
988 scu 4769
989 iri 4766
990 ibl 4754
991 ols 4753
992 ecu 4750
993 sim 4733
994 mac 4730
995 zar 4727
996 rag 4721
997 bai 4720
998 sec 4717
999 cau 4712
1000 lig 4708
1001 agr 4687
1002 rla 4680
1003 uri 4671
1004 apr 4646
1005 odi 4637
1006 zac 4636
1007 rts 4634
1008 rup 4634
1009 cli 4629
1010 hor 4624
1011 def 4611
1012 fes 4607
1013 rol 4600
0 # Licensed to the Apache Software Foundation (ASF) under one or more
1 # contributor license agreements. See the NOTICE file distributed with
2 # this work for additional information regarding copyright ownership.
3 # The ASF licenses this file to You under the Apache License, Version 2.0
4 # (the "License"); you may not use this file except in compliance with
5 # the License. You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 er_ 132232
15 _de 103517
16 en_ 82666
17 et_ 80661
18 for 65286
19 _fo 57945
20 de_ 51382
21 der 44049
22 at_ 41915
23 det 41381
24 _og 40344
25 _at 39482
26 ing 38707
27 den 36795
28 og_ 36577
29 _me 34924
30 nde 34528
31 _i_ 33267
32 _vi 32625
33 or_ 32053
34 om_ 31206
35 _er 29398
36 il_ 29247
37 re_ 28969
38 _af 28934
39 til 28370
40 _ti 28270
41 ke_ 27854
42 ere 27756
43 ne_ 26820
44 _en 25210
45 lig 24909
46 ed_ 24168
47 af_ 23702
48 ter 23332
49 es_ 22109
50 ger 22046
51 ge_ 21757
52 and 21487
53 ion 21203
54 lle 21168
55 _be 21089
56 nin 20565
57 te_ 20381
58 kke 19844
59 nge 19835
60 ng_ 19621
61 med 18904
62 end 18886
63 men 18796
64 ske 18757
65 som 18654
66 _ha 18627
67 els 18561
68 _ko 18448
69 _om 18051
70 tte 17863
71 ede 17830
72 le_ 17790
73 _so 17747
74 gen 17236
75 lse 17224
76 ind 17109
77 _st 16887
78 ige 16683
79 ern 16450
80 _in 16439
81 ste 16357
82 se_ 16253
83 ar_ 15962
84 ikk 15896
85 _på 15640
86 ig_ 15274
87 rne 15042
88 vi_ 14945
89 på_ 14727
90 ver 14378
91 isk 14355
92 _ud 14205
93 ent 14033
94 an_ 13917
95 _je 13885
96 eg_ 13844
97 _re 13815
98 jeg 13707
99 _si 13697
100 _fr 13341
101 ret 13313
102 har 13273
103 igt 13233
104 del 13131
105 ler 13016
106 _ik 12818
107 mme 12523
108 res 12471
109 vil 12426
110 hed 12411
111 _sk 12198
112 one 12163
113 rin 12026
114 nne 11960
115 gt_ 11839
116 ska 11777
117 kom 11592
118 _eu 11223
119 _hv 11033
120 man 10992
121 iss 10867
122 omm 10770
123 nte 10763
124 age 10696
125 _an 10556
126 und 10124
127 ner 10119
128 nd_ 10098
129 _he 10089
130 ser 9903
131 _mi 9852
132 get 9849
133 _et 9839
134 ett 9756
135 tio 9752
136 ene 9541
137 tet 9477
138 ens 9417
139 så_ 9402
140 enn 9299
141 mis 9287
142 ive 9128
143 _ma 9119
144 _pr 9051
145 nen 9049
146 ati 8977
147 lan 8972
148 uro 8935
149 _ve 8868
150 eur 8851
151 tig 8762
152 al_ 8741
153 ell 8695
154 ors 8678
155 sta 8674
156 øre 8614
157 ren 8526
158 _sa 8460
159 ore 8431
160 mer 8304
161 _al 8294
162 rop 8291
163 år_ 8254
164 ord 8218
165 sig 8135
166 _ka 8105
167 rer 8001
168 ove 7998
169 ære 7940
170 kon 7892
171 ghe 7782
172 pro 7745
173 lem 7738
174 vær 7738
175 igh 7723
176 eri 7684
177 vor 7669
178 ngs 7609
179 orm 7607
180 ale 7580
181 ang 7540
182 eli 7441
183 str 7435
184 ssi 7430
185 ker 7413
186 kal 7342
187 kan 7320
188 ans 7306
189 _op 7271
190 vis 7262
191 _un 7256
192 sio 7254
193 ege 7233
194 tal 7142
195 før 7123
196 mmi 7100
197 em_ 7019
198 _bl 6957
199 åde 6865
200 iti 6855
201 dle 6843
202 tni 6828
203 _pa 6782
204 _ge 6712
205 ag_ 6666
206 ndr 6626
207 _væ 6623
208 sen 6586
209 _så 6583
210 dig 6562
211 bes 6500
212 lag 6461
213 _ov 6448
214 red 6435
215 lin 6431
216 dre 6409
217 ved 6325
218 sam 6318
219 par 6284
220 bet 6246
221 ve_ 6237
222 _se 6234
223 est 6223
224 _la 6213
225 ill 6205
226 ide 6125
227 on_ 6109
228 fre 6086
229 tag 6083
230 nes 6043
231 _di 5942
232 _må 5910
233 lit 5909
234 lde 5898
235 rbe 5884
236 råd 5866
237 ven 5815
238 sse 5776
239 ers 5745
240 var 5666
241 _ta 5600
242 ns_ 5544
243 st_ 5538
244 _gr 5521
245 tat 5501
246 kel 5480
247 ogs 5449
248 ten 5440
249 min 5423
250 gså 5390
251 ekt 5388
252 sla 5380
253 _tr 5376
254 han 5374
255 ndl 5355
256 ame 5318
257 fra 5314
258 rem 5301
259 rla 5286
260 reg 5266
261 ate 5254
262 is_ 5251
263 all 5217
264 arl 5188
265 kti 5187
266 old 5187
267 hr_ 5178
268 _hr 5176
269 _fø 5112
270 _sp 5086
271 rt_ 5079
272 dt_ 5023
273 tiv 5019
274 oli 5011
275 des 5003
276 rma 4963
277 lt_ 4909
278 ra_ 4909
279 tis 4889
280 rke 4867
281 alt 4846
282 tra 4828
283 udv 4811
284 mål 4804
285 tid 4800
286 sk_ 4780
287 el_ 4779
288 ble 4764
289 fte 4744
290 ist 4744
291 _no 4735
292 kni 4733
293 tem 4733
294 hol 4713
295 rst 4688
296 lam 4662
297 gel 4654
298 rde 4617
299 gru 4609
300 arb 4586
301 ejd 4545
302 ort 4545
303 emm 4540
304 bej 4528
305 dis 4518
306 _li 4479
307 gør 4467
308 pol 4451
309 orb 4435
310 sti 4427
311 esk 4361
312 nsk 4339
313 ænd 4324
314 rsl 4308
315 opæ 4287
316 _va 4281
317 bli 4247
318 _fa 4238
319 mod 4205
320 me_ 4185
321 æis 4185
322 pæi 4180
323 hvi 4166
324 gge 4114
325 art 4105
326 _fi 4050
327 _po 4045
328 ndi 4042
329 ets 4036
330 rli 4022
331 _da 4018
332 _ku 4010
333 kab 3991
334 hvo 3978
335 amm 3963
336 tor 3924
337 mel 3909
338 sto 3868
339 hen 3867
340 nst 3866
341 giv 3857
342 lev 3847
343 nog 3840
344 ems 3831
345 ele 3804
346 les 3797
347 _mo 3794
348 opa 3774
349 ørg 3774
350 vet 3746
351 ør_ 3744
352 elt 3707
353 ts_ 3707
354 ber 3705
355 dem 3704
356 gan 3699
357 are 3691
358 edl 3676
359 _ar 3649
360 ken 3639
361 ise 3633
362 dvi 3632
363 _vo 3607
364 stø 3590
365 lli 3585
366 tik 3581
367 fin 3576
368 rig 3574
369 _el 3567
370 val 3566
371 gti 3562
372 dri 3558
373 gsm 3549
374 _ef 3545
375 ite 3534
376 lut 3525
377 akt 3522
378 tæn 3508
379 små 3486
380 dst 3482
381 liv 3458
382 spø 3456
383 nds 3445
384 pør 3441
385 led 3402
386 eks 3396
387 kun 3386
388 pa_ 3386
389 jde 3376
390 her 3375
391 ad_ 3374
392 dni 3373
393 nat 3371
394 kri 3370
395 ffe 3344
396 run 3334
397 bru 3327
398 fæl 3325
399 yde 3313
400 rti 3288
401 sær 3279
402 nal 3261
403 ess 3252
404 nem 3250
405 sid 3234
406 sik 3226
407 lge 3222
408 ål_ 3187
409 vid 3175
410 rel 3165
411 _kr 3153
412 sæt 3133
413 int 3128
414 per 3125
415 kte 3113
416 sst 3113
417 hel 3112
418 gle 3102
419 rat 3100
420 eds 3098
421 rgs 3074
422 sel 3069
423 rre 3048
424 ons 3046
425 tro 3037
426 ænk 3037
427 ran 3031
428 ppe 3030
429 mar 3026
430 tel 3023
431 ert 3003
432 rug 2996
433 uni 2990
434 meg 2978
435 slu 2954
436 esl 2919
437 mul 2914
438 _na 2913
439 eve 2904
440 att 2902
441 os_ 2901
442 rge 2895
443 rdi 2879
444 _gø 2875
445 tur 2873
446 tti 2858
447 in_ 2857
448 ik_ 2843
449 orh 2843
450 rfo 2831
451 _fæ 2828
452 eft 2819
453 kla 2803
454 omr 2799
455 sfo 2793
456 ris 2792
457 mrå 2788
458 erf 2787
459 dli 2783
460 tre 2780
461 _rå 2779
462 nt_ 2778
463 må_ 2758
464 org 2747
465 hav 2729
466 dan 2726
467 ona 2725
468 ali 2707
469 syn 2707
470 pri 2699
471 agt 2685
472 _ny 2680
473 kt_ 2679
474 dag 2676
475 nu_ 2676
476 _or 2675
477 od_ 2668
478 _br 2657
479 let 2656
480 tan 2646
481 ørs 2644
482 _ne 2641
483 uli 2637
484 eu_ 2631
485 læg 2628
486 _mu 2620
487 ærk 2614
488 ave 2608
489 mil 2608
490 æll 2585
491 ld_ 2582
492 sat 2578
493 sva 2572
494 gra 2563
495 _år 2559
496 abe 2557
497 ndt 2557
498 nis 2555
499 gte 2552
500 rsk 2547
501 _ga 2535
502 tli 2533
503 nio 2531
504 nkt 2504
505 _æn 2499
506 ket 2487
507 len 2483
508 ast 2480
509 id_ 2461
510 sag 2456
511 elv 2451
512 vir 2433
513 _nå 2428
514 etæ 2426
515 oge 2424
516 nce 2415
517 irk 2412
518 lad 2409
519 _os 2408
520 stå 2407
521 øde 2404
522 træ 2398
523 ier 2395
524 nse 2393
525 vig 2392
526 ude 2378
527 ant 2363
528 omi 2357
529 jer 2354
530 oll 2353
531 ram 2347
532 _bø 2337
533 nsi 2335
534 ætt 2321
535 _ba 2319
536 alg 2311
537 kra 2310
538 ikl 2303
539 nkn 2300
540 vik 2296
541 bør 2281
542 tøt 2274
543 unk 2274
544 rek 2272
545 øtt 2266
546 ine 2265
547 _ek 2261
548 _få 2261
549 nve 2258
550 ted 2250
551 åle 2246
552 _fl 2236
553 _sy 2230
554 _gi 2228
555 _nu 2225
556 ode 2224
557 æng 2224
558 æld 2220
559 egi 2203
560 mid 2202
561 dva 2201
562 _ho 2199
563 gsf 2194
564 _kl 2189
565 går 2188
566 lar 2180
567 sin 2177
568 ald 2164
569 dta 2162
570 _pe 2155
571 udg 2144
572 ien 2142
573 ina 2142
574 _go 2133
575 idl 2131
576 kre 2128
577 rte 2124
578 bed 2122
579 pun 2115
580 aft 2110
581 net 2110
582 ytt 2109
583 ial 2101
584 _næ 2099
585 lke 2099
586 ade 2097
587 bin 2096
588 _hø 2094
589 _lo 2093
590 _bi 2092
591 dfø 2088
592 _nø 2085
593 når 2082
594 ges 2080
595 ire 2078
596 eme 2074
597 eni 2073
598 ids 2067
599 ntr 2058
600 itu 2055
601 ono 2053
602 edr 2051
603 ær_ 2050
604 _le 2049
605 god 2045
606 _to 2042
607 øko 2032
608 van 2024
609 nom 2017
610 skr 2004
611 lis 2003
612 rbi 1990
613 føl 1989
614 beh 1984
615 ked 1984
616 ure 1975
617 sit 1974
618 rag 1973
619 un_ 1969
620 øje 1966
621 kol 1961
622 rme 1956
623 utn 1956
624 iv_ 1948
625 ægg 1946
626 spe 1944
627 vad 1938
628 sni 1936
629 tri 1936
630 hva 1932
631 nke 1930
632 _bo 1926
633 spr 1925
634 ærd 1925
635 mss 1917
636 rit 1915
637 kli 1911
638 ud_ 1907
639 pla 1904
640 nød 1901
641 ukt 1899
642 nta 1897
643 cen 1894
644 erh 1891
645 øge 1887
646 afs 1877
647 tru 1868
648 fri 1845
649 ini 1842
650 sky 1840
651 upp 1838
652 rod 1836
653 _øk 1833
654 ræn 1830
655 tyd 1819
656 rak 1818
657 _sæ 1814
658 _fu 1811
659 dsk 1809
660 ete 1806
661 ont 1797
662 ntl 1792
663 bek 1787
664 obl 1784
665 mær 1776
666 fer 1774
667 ins 1774
668 fat 1769
669 eta 1764
670 idt 1763
671 bor 1758
672 dir 1756
673 rog 1755
674 søg 1755
675 øns 1749
676 efo 1745
677 beg 1741
678 _do 1732
679 ils 1725
680 ses 1717
681 raf 1715
682 rks 1713
683 ogl 1712
684 ilk 1709
685 rol 1706
686 _bu 1694
687 rup 1694
688 _ra 1688
689 eje 1685
690 mig 1683
691 por 1680
692 sku 1680
693 uge 1680
694 dve 1678
695 pe_ 1677
696 spo 1668
697 rob 1666
698 olk 1660
699 _hu 1659
700 hve 1659
701 cer 1658
702 fru 1657
703 soc 1657
704 lov 1655
705 oci 1655
706 ølg 1646
707 rho 1644
708 _te 1641
709 edt 1639
710 _im 1637
711 ark 1636
712 unn 1636
713 nan 1631
714 nti 1628
715 jen 1625
716 emt 1622
717 mpe 1622
718 _gå 1620
719 cia 1617
720 ora 1617
721 ori 1604
722 erv 1603
723 op_ 1600
724 enc 1598
725 enh 1594
726 ru_ 1594
727 pen 1592
728 fun 1591
729 _dr 1587
730 fol 1587
731 evi 1584
732 mt_ 1570
733 ækk 1570
734 era 1562
735 rs_ 1557
736 ogr 1554
737 vel 1554
738 leg 1553
739 ror 1553
740 ral 1551
741 nye 1549
742 igg 1547
743 _pl 1535
744 ev_ 1535
745 di_ 1534
746 ika 1530
747 met 1528
748 to_ 1519
749 _of 1517
750 onk 1511
751 ilj 1505
752 roc 1503
753 præ 1502
754 uds 1501
755 udt 1497
756 teg 1496
757 oce 1489
758 ødv 1488
759 nsy 1486
760 _ri 1485
761 gne 1483
762 ned 1483
763 nli 1482
764 ult 1480
765 lik 1478
766 yn_ 1475
767 _øn 1468
768 ræk 1465
769 ikr 1461
770 mst 1451
771 kso 1450
772 tes 1450
773 egr 1444
774 rid 1443
775 gsp 1441
776 bud 1440
777 bel 1439
778 sis 1437
779 løs 1435
780 ilf 1434
781 fle 1433
782 ye_ 1432
783 bev 1428
784 fal 1425
785 høj 1425
786 _lø 1420
787 tyr 1419
788 ari 1416
789 rdn 1415
790 ynd 1406
791 dde 1395
792 egn 1394
793 opf 1394
794 rie 1393
795 _kv 1386
796 off 1384
797 rik 1384
798 ssæ 1381
799 tin 1380
800 uti 1377
801 fas 1374
802 _læ 1372
803 sme 1372
804 vin 1365
805 lta 1362
806 åbe 1362
807 bil 1354
808 je_ 1353
809 omh 1352
810 yst 1343
811 nit 1342
812 idi 1339
813 _ty 1331
814 kat 1331
815 tje 1331
816 mød 1329
817 æse 1326
818 ful 1323
819 erl 1320
820 tør 1317
821 sek 1316
822 _sv 1315
823 eml 1303
824 dsp 1302
825 ane 1300
826 emo 1296
827 ety 1296
828 ljø 1296
829 kul 1294
830 såd 1290
831 åda 1290
832 odu 1288
833 erd 1287
834 drø 1286
835 da_ 1281
836 jds 1281
837 ltn 1279
838 dra 1277
839 gio 1276
840 uld 1275
841 _pu 1274
842 fti 1274
843 tak 1269
844 bef 1267
845 få_ 1267
846 måd 1263
847 nel 1261
848 sko 1261
849 atu 1256
850 etn 1255
851 rve 1255
852 græ 1252
853 kræ 1250
854 eho 1243
855 _su 1239
856 emf 1239
857 tit 1238
858 tab 1237
859 eng 1232
860 ået 1231
861 mfø 1229
862 amt 1228
863 lid 1227
864 _ak 1223
865 emn 1222
866 top 1221
867 _hå 1220
868 anv 1220
869 iel 1220
870 rhe 1220
871 ili 1218
872 hør 1215
873 sor 1213
874 isi 1212
875 lv_ 1210
876 igs 1208
877 imi 1208
878 sie 1206
879 ama 1203
880 vej 1203
881 _gl 1201
882 æde 1201
883 dar 1193
884 log 1193
885 ani 1192
886 bag 1189
887 ørt 1189
888 rha 1187
889 ssk 1185
890 ben 1184
891 vne 1183
892 _s_ 1182
893 it_ 1182
894 orv 1180
895 _ad 1174
896 stæ 1171
897 dsa 1170
898 nsv 1164
899 erk 1163
900 oms 1160
901 cip 1159
902 tår 1158
903 adi 1155
904 eva 1152
905 ks_ 1152
906 rbr 1144
907 nci 1142
908 sty 1139
909 tek 1139
910 æft 1137
911 odt 1133
912 anl 1131
913 try 1130
914 rsø 1129
915 mhe 1128
916 ldt 1127
917 mat 1124
918 rts 1124
919 gæl 1121
920 skæ 1120
921 inc 1119
922 lst 1118
923 rso 1115
924 ats 1114
925 eha 1109
926 ryk 1107
927 æve 1107
928 ktu 1104
929 afg 1099
930 dom 1098
931 ivi 1097
932 kst 1094
933 æns 1093
934 esu 1089
935 yrk 1089
936 _is 1088
937 mti 1088
938 mli 1087
939 igv 1086
940 ann 1082
941 dtr 1080
942 kyt 1077
943 lte 1077
944 rør 1077
945 fta 1075
946 æst 1073
947 sys 1071
948 _eg 1069
949 slå 1068
950 dge 1063
951 sål 1063
952 tut 1060
953 ank 1058
954 nsp 1056
955 ves 1056
956 bar 1054
957 lys 1053
958 yld 1050
959 mes 1049
960 øst 1049
961 ds_ 1047
962 nyt 1045
963 orl 1045
964 gni 1044
965 hov 1044
966 okr 1044
967 pre 1044
968 opm 1043
969 eto 1041
970 be_ 1037
971 set 1035
972 utt 1033
973 pek 1032
974 _sl 1026
975 rej 1025
976 mok 1022
977 gvi 1021
978 tua 1020
979 pli 1019
980 ibe 1018
981 ldn 1018
982 ruk 1015
983 ræs 1015
984 _hj 1014
985 uat 1011
986 dlæ 1008
987 ndn 1006
988 kor 1005
989 son 1005
990 ndb 1003
991 hæn 1000
992 vni 1000
993 fen 997
994 rga 996
995 ref 993
996 ug_ 991
997 ism 988
998 ab_ 987
999 gis 987
1000 edu 984
1001 rev 984
1002 ass 979
1003 ærl 975
1004 _yd 974
1005 dnu 973
1006 ksi 973
1007 rda 971
1008 duk 968
1009 abs 967
1010 aff 967
1011 dbr 966
1012 ost 966
1013 ævn 964
0 # Licensed to the Apache Software Foundation (ASF) under one or more
1 # contributor license agreements. See the NOTICE file distributed with
2 # this work for additional information regarding copyright ownership.
3 # The ASF licenses this file to You under the Apache License, Version 2.0
4 # (the "License"); you may not use this file except in compliance with
5 # the License. You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 en_ 212612
15 er_ 98528
16 _de 85811
17 ich 77459
18 der 70583
19 _di 70518
20 die 69388
21 ie_ 67971
22 sch 56571
23 ein 53912
24 ch_ 53758
25 _un 52654
26 ung 50103
27 che 45855
28 nd_ 44410
29 den 43865
30 _da 41966
31 cht 41763
32 _be 39578
33 gen 38128
34 und 38106
35 _ei 36321
36 in_ 34380
37 ng_ 34359
38 ten 34283
39 es_ 33601
40 _au 33306
41 _wi 31780
42 _in 30781
43 _zu 30678
44 hen 30448
45 _ge 30199
46 on_ 28096
47 nde 28026
48 eit 27645
49 ine 27612
50 _vo 26131
51 ver 25977
52 ere 23829
53 men 23322
54 wir 23199
55 ht_ 23170
56 ent 23092
57 ber 23063
58 _we 23010
59 lic 22696
60 te_ 22547
61 ion 22420
62 _si 22298
63 _ve 21928
64 nen 20719
65 it_ 20653
66 ter 19791
67 nge 19645
68 ste 19037
69 ese 18527
70 isc 18191
71 rde 18054
72 ren 17925
73 _mi 17599
74 _an 17282
75 ies 17035
76 as_ 16836
77 em_ 16828
78 zu_ 16778
79 her 16547
80 mit 16540
81 _er 16445
82 _ko 16370
83 run 16334
84 lle 16293
85 ist 16265
86 ne_ 16251
87 st_ 16035
88 auf 15984
89 _ha 15980
90 ers 15463
91 aus 15387
92 ier 14586
93 sen 14337
94 ir_ 14331
95 _so 14264
96 rei 14034
97 sse 14014
98 für 13527
99 aß_ 13516
100 _fü 13464
101 nte 13283
102 ben 13251
103 daß 13186
104 ür_ 13142
105 end 13019
106 ige 12985
107 vor 12973
108 das 12972
109 von 12898
110 _ic 12743
111 he_ 12692
112 ges 12653
113 ern 12573
114 ach 12465
115 bei 12431
116 des 12271
117 wer 12200
118 _ni 12156
119 _is 12089
120 _st 11950
121 and 11857
122 _al 11768
123 ert 11570
124 omm 11562
125 sic 11525
126 len 11498
127 ge_ 11405
128 _re 11377
129 _se 11228
130 nic 11202
131 eru 11090
132 abe 11066
133 kom 11050
134 uch 11046
135 _pr 11019
136 ind 10872
137 wei 10852
138 _eu 10566
139 _he 10564
140 uf_ 10457
141 erd 10409
142 sta 10319
143 _es 10313
144 tig 10262
145 tio 10247
146 hte 10203
147 hre 10019
148 eur 9917
149 ner 9909
150 ser 9905
151 ell 9828
152 _me 9768
153 sie 9765
154 ens 9748
155 iss 9720
156 ech 9665
157 age 9652
158 _ab 9610
159 uro 9599
160 ege 9577
161 _im 9551
162 um_ 9504
163 übe 9493
164 nn_ 9394
165 rop 9378
166 _fr 9366
167 oll 9332
168 ngs 9308
169 cha 9204
170 _sc 9195
171 se_ 9154
172 dem 9078
173 _en 9049
174 re_ 9003
175 mme 9000
176 bes 8937
177 auc 8928
178 chl 8837
179 ite 8774
180 rn_ 8764
181 wie 8744
182 est 8732
183 im_ 8662
184 tli 8577
185 de_ 8540
186 lie 8522
187 ati 8510
188 err 8489
189 all 8428
190 eic 8415
191 ang 8405
192 rt_ 8376
193 _üb 8359
194 nt_ 8299
195 rst 8116
196 haf 8115
197 rec 8067
198 tel 7937
199 mei 7917
200 le_ 7887
201 _na 7840
202 tte 7823
203 _um 7791
204 an_ 7767
205 hei 7712
206 iti 7707
207 eri 7651
208 ric 7646
209 uns 7638
210 rau 7623
211 ins 7567
212 tun 7559
213 ene 7556
214 kei 7517
215 ur_ 7503
216 ssi 7480
217 fra 7472
218 ger 7447
219 tra 7440
220 mis 7425
221 ede 7268
222 _ma 7246
223 aft 7238
224 iel 7166
225 fen 7065
226 änd 7064
227 pro 7039
228 rag 7025
229 lei 7007
230 och 6998
231 ei_ 6988
232 alt 6974
233 rte 6958
234 mmi 6906
235 lte 6905
236 hal 6893
237 at_ 6843
238 gli 6829
239 ied 6819
240 sti 6809
241 ame 6804
242 sio 6720
243 ord 6714
244 ide 6691
245 opä 6650
246 ns_ 6649
247 enn 6618
248 sei 6568
249 äis 6563
250 päi 6560
251 rge 6560
252 eis 6559
253 hme 6533
254 tsc 6528
255 ebe 6450
256 ehr 6443
257 wen 6434
258 nne 6401
259 etz 6367
260 lt_ 6366
261 ann 6319
262 hr_ 6308
263 _wa 6306
264 ls_ 6301
265 hab 6292
266 _gr 6280
267 ing 6273
268 rsc 6227
269 sin 6215
270 _ih 6212
271 ini 6144
272 gt_ 6121
273 als 6114
274 ate 6027
275 rbe 6007
276 ehe 6006
277 itt 6000
278 eme 5984
279 _hi 5953
280 unt 5938
281 for 5937
282 lun 5917
283 ig_ 5865
284 erh 5831
285 _pa 5818
286 rat 5816
287 ft_ 5799
288 ts_ 5757
289 par 5693
290 nsc 5686
291 ahr 5645
292 rd_ 5591
293 ete 5558
294 lit 5524
295 zie 5522
296 et_ 5500
297 eil 5483
298 era 5474
299 nts 5467
300 leg 5465
301 dar 5440
302 sam 5429
303 nis 5392
304 spr 5383
305 ien 5381
306 ird 5359
307 el_ 5349
308 ele 5326
309 str 5277
310 gel 5254
311 geb 5196
312 rla 5173
313 imm 5169
314 ffe 5162
315 _le 5139
316 nse 5082
317 hat 5052
318 esc 5043
319 lan 5033
320 tei 4978
321 sol 4971
322 ant 4965
323 _mö 4949
324 nun 4947
325 erf 4946
326 nst 4940
327 erb 4886
328 ar_ 4875
329 zus 4862
330 gem 4840
331 zur 4837
332 one 4779
333 egi 4753
334 prä 4748
335 nac 4721
336 _ra 4707
337 kon 4694
338 reg 4639
339 ort 4615
340 us_ 4613
341 chu 4535
342 oli 4513
343 org 4510
344 zen 4503
345 _no 4500
346 _ka 4496
347 _je 4479
348 ihr 4472
349 uss 4459
350 geh 4458
351 eut 4444
352 elt 4443
353 tis 4437
354 _zw 4432
355 wic 4430
356 zei 4407
357 anz 4406
358 ran 4375
359 arb 4336
360 uß_ 4336
361 hie 4327
362 kan 4316
363 erw 4308
364 gun 4303
365 pol 4302
366 nie 4298
367 rin 4296
368 ale 4294
369 _wo 4288
370 hin 4273
371 _nu 4256
372 erl 4241
373 räs 4201
374 _bi 4187
375 rr_ 4187
376 sid 4185
377 ass 4184
378 ken 4173
379 kti 4170
380 ntr 4149
381 äsi 4132
382 _la 4128
383 _ar 4125
384 zun 4123
385 lam 4065
386 llt 4060
387 arl 4058
388 ntw 4058
389 uni 4057
390 geg 4056
391 lag 4047
392 so_ 4036
393 han 4011
394 ahm 4004
395 ll_ 3999
396 rch 3972
397 aat 3969
398 ess 3966
399 gan 3963
400 art 3954
401 son 3936
402 erg 3933
403 taa 3923
404 urc 3923
405 ors 3914
406 _po 3913
407 tim 3911
408 int 3900
409 tre 3878
410 tze 3856
411 kt_ 3842
412 erk 3833
413 hne 3797
414 amm 3774
415 nig 3767
416 tzt 3758
417 set 3744
418 ond 3741
419 dur 3737
420 tik 3735
421 its 3728
422 rit 3696
423 rer 3658
424 was 3645
425 or_ 3616
426 üss 3615
427 rie 3613
428 tet 3613
429 nat 3609
430 seh 3592
431 hti 3590
432 _kö 3581
433 _du 3572
434 ag_ 3565
435 me_ 3557
436 inn 3548
437 nio 3548
438 öch 3531
439 dig 3528
440 gru 3515
441 _fo 3494
442 _ja 3487
443 kön 3479
444 _ne 3468
445 akt 3459
446 _mü 3443
447 tie 3437
448 att 3429
449 ise 3419
450 tat 3407
451 bet 3401
452 ode 3371
453 gew 3366
454 vie 3358
455 gie 3338
456 önn 3312
457 man 3303
458 hts 3298
459 ühr 3283
460 _vi 3266
461 ute 3263
462 wor 3262
463 etr 3254
464 ndl 3254
465 nah 3251
466 füh 3243
467 res 3234
468 _mu 3219
469 tzu 3211
470 möc 3206
471 itg 3193
472 rli 3184
473 chs 3176
474 mer 3176
475 ses 3164
476 tät 3161
477 eht 3157
478 hli 3137
479 lem 3113
480 noc 3107
481 eid 3104
482 war 3102
483 stä 3098
484 nkt 3089
485 chi 3082
486 fin 3078
487 chr 3075
488 _ke 3069
489 ler 3049
490 ick 3032
491 wel 3017
492 erz 2992
493 müs 2979
494 ekt 2977
495 ans 2974
496 zum 2971
497 neh 2964
498 bed 2958
499 tiv 2957
500 eue 2952
501 sel 2952
502 ona 2943
503 teh 2938
504 rts 2935
505 ndi 2932
506 orm 2921
507 tri 2909
508 dan 2904
509 län 2898
510 ons 2894
511 ehm 2889
512 lin 2886
513 ina 2885
514 nal 2871
515 tgl 2859
516 itä 2857
517 tan 2855
518 muß 2846
519 au_ 2840
520 enz 2837
521 _fi 2836
522 rdn 2824
523 dun 2797
524 wis 2796
525 nz_ 2789
526 jah 2769
527 wur 2769
528 ik_ 2761
529 kte 2754
530 rke 2749
531 nur 2744
532 ali 2741
533 _fe 2721
534 abs 2720
535 hau 2717
536 is_ 2717
537 be_ 2694
538 nnt 2692
539 htl 2688
540 zt_ 2686
541 usa 2675
542 nem 2672
543 ble 2670
544 _te 2669
545 nan 2665
546 gef 2660
547 rne 2659
548 nze 2652
549 _ze 2651
550 igt 2651
551 _ri 2648
552 neu 2645
553 rhe 2645
554 sem 2641
555 rha 2625
556 jed 2617
557 doc 2616
558 _wu 2610
559 unk 2590
560 _sa 2571
561 nti 2570
562 rtr 2561
563 sge 2555
564 _li 2554
565 _sp 2554
566 _gl 2551
567 chn 2548
568 sun 2548
569 ück 2544
570 net 2543
571 meh 2541
572 fal 2538
573 ara 2525
574 hun 2517
575 fol 2511
576 _fa 2501
577 inz 2491
578 off 2489
579 gke 2477
580 gra 2476
581 igk 2476
582 urd 2474
583 mög 2472
584 olg 2472
585 ank 2466
586 fre 2465
587 irt 2464
588 kol 2449
589 ive 2444
590 ntl 2441
591 _od 2440
592 rre 2439
593 sha 2439
594 rac 2435
595 ris 2433
596 il_ 2426
597 _ga 2424
598 sag 2414
599 san 2412
600 ust 2405
601 twi 2397
602 eig 2382
603 eso 2378
604 sit 2360
605 ieß 2359
606 bar 2358
607 nke 2355
608 bez 2345
609 hla 2344
610 rze 2343
611 ögl 2342
612 fer 2338
613 nes 2338
614 deu 2337
615 tro 2323
616 _tr 2318
617 _do 2317
618 zwe 2312
619 del 2310
620 ßen 2307
621 itu 2306
622 enh 2304
623 dam 2289
624 inf 2272
625 bew 2271
626 bst 2269
627 _lä 2268
628 _gi 2260
629 äge 2255
630 al_ 2247
631 opa 2242
632 eds 2240
633 lis 2233
634 ckl 2230
635 _ta 2225
636 uen 2225
637 chw 2221
638 ena 2218
639 rüc 2218
640 dst 2213
641 dli 2212
642 gsa 2207
643 bra 2199
644 mal 2195
645 gle 2192
646 tsp 2179
647 isi 2178
648 zug 2170
649 irk 2161
650 ieg 2153
651 lge 2152
652 äch 2148
653 usg 2139
654 ial 2137
655 sor 2137
656 ssc 2135
657 kun 2127
658 _zi 2124
659 alb 2124
660 tz_ 2123
661 bt_ 2114
662 eln 2106
663 ret 2096
664 ähr 2094
665 tes 2090
666 fe_ 2085
667 ßer 2075
668 sbe 2068
669 uti 2063
670 fah 2051
671 _mo 2047
672 maß 2044
673 utz 2044
674 pun 2039
675 nzi 2036
676 gre 2035
677 lls 2032
678 las 2028
679 ewe 2024
680 _än 2022
681 fun 2019
682 _br 2016
683 ili 2012
684 erm 2010
685 trä 2007
686 ill 2006
687 rif 2005
688 hst 2004
689 rwe 2003
690 bil 2002
691 abg 2001
692 nsi 2001
693 fte 2000
694 rfo 1997
695 pie 1992
696 lig 1980
697 chk 1977
698 rti 1977
699 pre 1975
700 are 1967
701 ami 1966
702 min 1966
703 mmt 1963
704 am_ 1961
705 ütz 1958
706 _dr 1941
707 bge 1941
708 klu 1937
709 igu 1927
710 nsa 1926
711 lch 1911
712 obl 1910
713 lau 1908
714 hri 1906
715 _ho 1901
716 beg 1900
717 uße 1897
718 ema 1890
719 mun 1890
720 vol 1890
721 lli 1888
722 hru 1885
723 chä 1883
724 wäh 1881
725 rob 1880
726 _am 1872
727 hke 1869
728 gte 1854
729 los 1847
730 bin 1844
731 _kl 1834
732 tän 1834
733 räg 1831
734 rak 1830
735 uer 1829
736 stü 1828
737 din 1821
738 _wä 1817
739 aue 1816
740 _kr 1815
741 sow 1803
742 ahl 1795
743 ufg 1792
744 mt_ 1790
745 els 1789
746 rle 1784
747 uge 1782
748 tur 1781
749 mmu 1777
750 rfa 1776
751 kra 1770
752 lb_ 1768
753 ari 1761
754 mac 1756
755 hrt 1753
756 ibt 1745
757 ieh 1745
758 ssa 1744
759 ät_ 1741
760 _ak 1733
761 rkl 1733
762 nhe 1732
763 ark 1730
764 soz 1730
765 nha 1728
766 ltu 1723
767 ani 1719
768 hle 1715
769 ont 1710
770 ozi 1706
771 ck_ 1702
772 nzu 1698
773 tru 1693
774 per 1690
775 hl_ 1688
776 sat 1688
777 un_ 1688
778 fes 1684
779 fri 1676
780 heu 1676
781 egt 1673
782 iff 1668
783 rkt 1666
784 elb 1664
785 rig 1664
786 stu 1658
787 fti 1641
788 äre 1640
789 bri 1637
790 zia 1633
791 tüt 1631
792 bür 1618
793 fts 1616
794 ndu 1614
795 spi 1603
796 rga 1602
797 beh 1597
798 gro 1581
799 pra 1581
800 ln_ 1577
801 pri 1577
802 eni 1574
803 rbr 1571
804 egr 1569
805 tag 1566
806 zwi 1565
807 ßna 1561
808 dnu 1560
809 pa_ 1558
810 fas 1556
811 ats 1554
812 aßn 1551
813 tem 1550
814 _wü 1543
815 mat 1541
816 kli 1540
817 kri 1540
818 ukt 1539
819 esa 1536
820 quo 1530
821 wür 1530
822 ogr 1528
823 nds 1521
824 edi 1519
825 hör 1519
826 nom 1519
827 bli 1514
828 woh 1514
829 htu 1513
830 äft 1513
831 roß 1512
832 ürd 1512
833 äng 1505
834 det 1503
835 rum 1498
836 edo 1496
837 gib 1495
838 ker 1491
839 enk 1489
840 mar 1484
841 ut_ 1484
842 da_ 1478
843 dne 1478
844 gio 1477
845 _bü 1473
846 kla 1469
847 rs_ 1469
848 _eb 1467
849 _ob 1465
850 ftl 1465
851 itr 1465
852 ihn 1464
853 tür 1459
854 ral 1457
855 bie 1456
856 ika 1456
857 two 1455
858 tor 1454
859 ewi 1452
860 ürg 1450
861 hem 1447
862 let 1445
863 rem 1443
864 nre 1442
865 ram 1441
866 iet 1436
867 nsp 1435
868 ßt_ 1435
869 ven 1434
870 grü 1432
871 ike 1430
872 squ 1430
873 aff 1419
874 leb 1415
875 gs_ 1410
876 rog 1409
877 bek 1403
878 rol 1401
879 agt 1400
880 spe 1397
881 swe 1397
882 twe 1390
883 mpf 1388
884 eih 1386
885 eng 1385
886 erp 1378
887 nfa 1376
888 amt 1374
889 hil 1371
890 örd 1370
891 weg 1369
892 atz 1356
893 lfe 1355
894 oße 1352
895 mic 1349
896 ref 1346
897 mte 1345
898 ast 1333
899 sar 1333
900 ezi 1330
901 geo 1330
902 _ba 1329
903 eug 1324
904 _et 1323
905 rma 1320
906 cho 1319
907 uo_ 1318
908 ohl 1314
909 wär 1314
910 eu_ 1312
911 rtu 1312
912 eor 1308
913 häf 1306
914 ätz 1306
915 nft 1304
916 tit 1302
917 wes 1299
918 rdi 1298
919 ilf 1293
920 hlu 1291
921 the 1288
922 zte 1288
923 _ku 1280
924 tge 1277
925 ori 1273
926 dis 1268
927 cke 1266
928 tär 1266
929 mil 1265
930 rwa 1265
931 zah 1265
932 nwe 1256
933 umw 1253
934 ieb 1251
935 rrn 1250
936 izi 1249
937 bis 1248
938 owi 1248
939 ibe 1247
940 fli 1246
941 ehö 1244
942 rah 1240
943 ums 1240
944 mwe 1239
945 eno 1235
946 llu 1235
947 nve 1233
948 ze_ 1230
949 ohn 1228
950 nit 1224
951 tt_ 1223
952 nfo 1222
953 ett 1218
954 ost 1216
955 esh 1215
956 ruc 1215
957 daf 1213
958 yst 1213
959 _ls 1212
960 lsq 1212
961 _fl 1209
962 eha 1207
963 ile 1206
964 sys 1206
965 ume 1200
966 pfe 1199
967 lbs 1196
968 kel 1195
969 rme 1194
970 bef 1193
971 efü 1189
972 esi 1188
973 hof 1188
974 dri 1185
975 zeu 1184
976 ebi 1182
977 emo 1180
978 nma 1175
979 ade 1174
980 inb 1174
981 tin 1173
982 okr 1169
983 rf_ 1166
984 ve_ 1166
985 om_ 1165
986 gab 1164
987 lts 1164
988 ush 1164
989 _nä 1163
990 eff 1160
991 lsc 1159
992 hwe 1158
993 gut 1157
994 sis 1147
995 _ch 1146
996 hut 1143
997 eif 1140
998 enr 1136
999 not 1133
1000 suc 1132
1001 auß 1126
1002 eib 1123
1003 nbe 1122
1004 ünd 1121
1005 _or 1119
1006 _th 1118
1007 rmi 1115
1008 get 1114
1009 lar 1112
1010 spa 1112
1011 nau 1111
1012 tal 1108
1013 lär 1107
0 # Licensed to the Apache Software Foundation (ASF) under one or more
1 # contributor license agreements. See the NOTICE file distributed with
2 # this work for additional information regarding copyright ownership.
3 # The ASF licenses this file to You under the Apache License, Version 2.0
4 # (the "License"); you may not use this file except in compliance with
5 # the License. You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 αι_ 73292
15 _χ_ 72116
16 _τη 67413
17 _κα 66823
18 _το 61620
19 ου_ 58182
20 να_ 51915
21 ης_ 50982
22 _πρ 50886
23 και 44545
24 _να 42766
25 ων_ 42218
26 ια_ 39929
27 _στ 38665
28 ην_ 37376
29 την 36472
30 με_ 36145
31 _πο 35813
32 ει_ 35673
33 το_ 34708
34 ση_ 33913
35 _επ 32206
36 _απ 31689
37 τικ 31116
38 του 31099
39 της 29811
40 ας_ 29544
41 _συ 29493
42 τα_ 29344
43 _με 26939
44 _αν 26136
45 προ 25989
46 _δι 25857
47 ις_ 25679
48 ία_ 25305
49 _η_ 24782
50 ες_ 24638
51 των 24440
52 που 24082
53 _γι 22457
54 _αυ 22034
55 για 21684
56 αυτ 21185
57 ών_ 20933
58 _εί 20414
59 ική 20053
60 τη_ 20028
61 _τω 19621
62 επι 19352
63 στη 18905
64 ής_ 18677
65 ται 17329
66 _θα 16927
67 ματ 16894
68 ος_ 16877
69 τι_ 16833
70 θα_ 16699
71 ές_ 16670
72 δια 16531
73 υς_ 16512
74 σει 16489
75 ους 16463
76 είν 16459
77 κατ 16414
78 οι_ 16395
79 ως_ 16295
80 εί_ 16276
81 σε_ 16078
82 κή_ 16060
83 ον_ 15967
84 στο 15711
85 _υπ 15611
86 πολ 15525
87 οπο 15499
88 _δε 15021
89 σης 14828
90 ναι 14699
91 ίνα 14581
92 ουμ 14531
93 υμε 14346
94 _ευ 14263
95 από 14186
96 _πα 14039
97 ότι 14014
98 _σε 13929
99 τις 13852
100 παρ 13503
101 ουν 13481
102 _τα 13419
103 υν_ 13392
104 _ότ 13385
105 ατα 13313
106 εν_ 13301
107 συν 13188
108 περ 13128
109 ετα 13105
110 πό_ 13057
111 τρο 12829
112 ιο_ 12789
113 _οι 12740
114 μα_ 12475
115 μέν 12474
116 ντα 12467
117 _πε 12327
118 ηση 12080
119 _τι 12050
120 ροπ 11987
121 _έ_ 11888
122 ερι 11865
123 κά_ 11796
124 _εν 11735
125 απο 11708
126 ιστ 11698
127 τον 11604
128 ευρ 11565
129 ικά 11562
130 δεν 11364
131 _ει 11261
132 πει 11098
133 πρό 11044
134 εις 11034
135 _κο 10943
136 τερ 10760
137 αν_ 10723
138 τε_ 10698
139 ανα 10526
140 τό_ 10388
141 ίας 10369
142 αντ 10365
143 ικό 10049
144 οιν 9986
145 _μι 9981
146 πιτ 9846
147 ρα_ 9784
148 κοι 9749
149 φορ 9615
150 ού_ 9464
151 συμ 9361
152 ημα 9284
153 στι 9263
154 έπε 9222
155 κής 9207
156 ισμ 9165
157 ποί 9103
158 ωση 9064
159 υτό 9009
160 οντ 8911
161 μια 8832
162 δικ 8605
163 σου 8568
164 ένα 8537
165 ρισ 8473
166 κό_ 8448
167 _εκ 8395
168 ιτρ 8346
169 ικο 8331
170 στα 8304
171 στε 8266
172 ούν 8262
173 τά_ 8261
174 ρωπ 8246
175 _ου 8210
176 _αρ 8180
177 ρέπ 8012
178 _οπ 7931
179 τητ 7885
180 _μέ 7831
181 αστ 7785
182 πορ 7719
183 μετ 7665
184 ροσ 7661
185 νου 7648
186 πρέ 7532
187 ολο 7516
188 υρω 7488
189 σία 7480
190 ότη 7463
191 ωπα 7339
192 ατο 7335
193 _εξ 7266
194 _θέ 7253
195 νομ 7248
196 _έν 7244
197 ντι 7234
198 _μα 7225
199 ικα 7136
200 οπή 7088
201 αϊκ 7081
202 ποι 7043
203 ύν_ 6974
204 μπο 6902
205 επί 6861
206 ός_ 6816
207 παϊ 6811
208 _πλ 6799
209 λογ 6789
210 ατά 6787
211 ολι 6777
212 _ο_ 6775
213 τασ 6735
214 αλλ 6679
215 θεσ 6665
216 κών 6651
217 εια 6594
218 νικ 6582
219 θεί 6549
220 ασί 6533
221 ρά_ 6490
222 ατι 6481
223 κές 6440
224 υτή 6360
225 τή_ 6359
226 ίες 6350
227 αση 6291
228 αφο 6286
229 σημ 6261
230 _μπ 6244
231 αρα 6220
232 νο_ 6177
233 νει 6176
234 υπο 6121
235 ητα 6120
236 άλλ 6114
237 ήσε 6062
238 _τρ 6049
239 _κά 6010
240 ινο 6003
241 _σ_ 6002
242 ικέ 5940
243 ικώ 5913
244 νων 5777
245 όσο 5754
246 ίου 5748
247 καν 5668
248 _θε 5583
249 κού 5580
250 _ση 5573
251 τος 5571
252 ώς_ 5553
253 βου 5550
254 ούμ 5535
255 ετι 5498
256 ουλ 5496
257 πως 5491
258 μας 5486
259 πισ 5471
260 λιτ 5457
261 οτε 5443
262 ποτ 5440
263 μικ 5433
264 _αλ 5380
265 ιτι 5369
266 ιση 5343
267 ερα 5333
268 ύρι 5289
269 ύμε 5282
270 ταν 5183
271 ρού 5121
272 _δη 5079
273 _κρ 5077
274 ρία 5071
275 ορά 5067
276 τελ 5063
277 _ή_ 5059
278 λά_ 5028
279 ρο_ 5021
280 τού 4969
281 γμα 4959
282 ελε 4916
283 λου 4886
284 _όλ 4882
285 _ετ 4878
286 νισ 4864
287 ρατ 4843
288 εδρ 4822
289 _κύ 4786
290 μεν 4786
291 ότε 4779
292 οβο 4761
293 πή_ 4750
294 καθ 4730
295 ένο 4724
296 μερ 4719
297 _αφ 4716
298 λει 4710
299 λλο 4697
300 σο_ 4684
301 λευ 4661
302 νωσ 4657
303 υνα 4646
304 ρικ 4636
305 ριο 4628
306 κύρ 4613
307 ργα 4602
308 εργ 4564
309 λα_ 4555
310 εων 4553
311 ρου 4498
312 _μο 4489
313 τησ 4486
314 εση 4475
315 ομι 4472
316 _κυ 4433
317 κρι 4424
318 άσε 4313
319 ορι 4295
320 βού 4281
321 σεω 4276
322 υμβ 4246
323 λλά 4236
324 μη_ 4210
325 τών 4199
326 ανά 4197
327 δημ 4179
328 ερο 4173
329 νη_ 4173
330 ομέ 4167
331 _ορ 4163
332 ριε 4152
333 μού 4145
334 ίνε 4125
335 απα 4091
336 σμο 4088
337 οικ 4087
338 _ερ 4047
339 τηρ 4044
340 _ακ 4032
341 ονο 4029
342 μεί 4020
343 ήμα 4018
344 τάσ 3997
345 υνε 3994
346 _ελ 3968
347 οστ 3966
348 τας 3963
349 _σύ 3947
350 ούλ 3935
351 _όπ 3934
352 ιών 3934
353 ραγ 3916
354 τομ 3914
355 τεί 3884
356 ουρ 3880
357 ιε_ 3877
358 _γε 3869
359 γία 3859
360 λόγ 3858
361 ρος 3846
362 θέσ 3844
363 ϊκή 3842
364 ορο 3830
365 ιμέ 3829
366 πάρ 3827
367 ύλι 3794
368 καλ 3788
369 ορε 3778
370 νοβ 3752
371 τήσ 3752
372 κει 3746
373 ολύ 3718
374 _ό_ 3716
375 ιας 3688
376 ρόε 3687
377 ίς_ 3686
378 όεδ 3685
379 μου 3684
380 διά 3683
381 αρ_ 3670
382 _νο 3668
383 ρει 3660
384 σω_ 3642
385 λιο 3639
386 σμό 3633
387 ιδι 3626
388 _σα 3624
389 _πι 3611
390 αρά 3604
391 θού 3602
392 νες 3602
393 ρεί 3581
394 λη_ 3568
395 κον 3557
396 γρα 3549
397 πίσ 3535
398 στό 3525
399 _εμ 3519
400 _ώρ 3518
401 αμε 3511
402 ρε_ 3505
403 σα_ 3496
404 πρα 3480
405 ύτε 3478
406 _νω 3472
407 ύς_ 3472
408 δρε 3471
409 ανο 3467
410 _έκ 3456
411 ίο_ 3443
412 οία 3440
413 τές 3431
414 σμα 3426
415 _δυ 3424
416 ευτ 3410
417 φέρ 3408
418 υπά 3396
419 ξη_ 3393
420 ούς 3375
421 _ισ 3323
422 ιακ 3319
423 συγ 3316
424 ρώπ 3303
425 δυν 3294
426 τες 3292
427 ρες 3288
428 φων 3268
429 ουσ 3258
430 _άλ 3255
431 _λό 3241
432 υλί 3239
433 διε 3237
434 άτω 3220
435 εξα 3189
436 όνο 3189
437 ίζο 3156
438 εία 3155
439 ροτ 3146
440 μασ 3138
441 μαν 3123
442 μάτ 3122
443 κυρ 3120
444 ρίζ 3119
445 ώσε 3117
446 υρώ 3096
447 ήσο 3085
448 λύ_ 3071
449 _τε 3070
450 αγμ 3066
451 μό_ 3066
452 τημ 3066
453 ιότ 3063
454 ποσ 3058
455 επε 3043
456 _ασ 3039
457 νία 3009
458 _πω 3004
459 διο 3003
460 ρήσ 3003
461 εκτ 3002
462 νον 2999
463 ρότ 2998
464 _ως 2996
465 έσε 2994
466 όπο 2986
467 μβο 2984
468 ορί 2979
469 ίτε 2974
470 τοι 2964
471 ότα 2953
472 ύσε 2946
473 λεί 2933
474 λικ 2928
475 ιατ 2927
476 ταξ 2917
477 βασ 2915
478 _μό 2910
479 ψη_ 2901
480 ημε 2899
481 ίση 2895
482 μία 2894
483 στή 2894
484 υρί 2894
485 θηκ 2890
486 άρ_ 2874
487 ίζε 2863
488 _αγ 2861
489 μεγ 2857
490 ζου 2849
491 _βο 2847
492 τιμ 2845
493 ωνι 2842
494 ρη_ 2841
495 εσμ 2838
496 ερί 2837
497 νατ 2836
498 _ομ 2833
499 κρα 2833
500 ράτ 2822
501 ήθε 2821
502 λον 2816
503 όπω 2810
504 μόν 2805
505 ευθ 2803
506 όλο 2801
507 πο_ 2790
508 λο_ 2786
509 είς 2785
510 αρμ 2782
511 όν_ 2776
512 ήτη 2771
513 ατε 2763
514 κθε 2753
515 μέλ 2748
516 γασ 2743
517 ούσ 2742
518 υση 2731
519 ένω 2725
520 κολ 2719
521 θέμ 2718
522 κρά 2710
523 αιρ 2709
524 ραμ 2695
525 ρησ 2692
526 ντο 2688
527 τόσ 2685
528 _όσ 2680
529 αδι 2674
530 έμα 2668
531 μφω 2655
532 ογι 2655
533 _εγ 2651
534 ανε 2644
535 ειρ 2637
536 υγκ 2637
537 ρον 2632
538 ζητ 2627
539 υστ 2623
540 έκθ 2621
541 _γν 2618
542 σμέ 2616
543 κόμ 2609
544 _ωρ 2596
545 υργ 2581
546 έα_ 2575
547 οίο 2568
548 όμε 2566
549 ροφ 2565
550 ασφ 2561
551 νός 2558
552 ράσ 2558
553 ιες 2557
554 ζήτ 2549
555 λίο 2549
556 εντ 2546
557 αφέ 2542
558 υμφ 2533
559 στρ 2527
560 _ιδ 2522
561 πικ 2522
562 ειτ 2520
563 γικ 2513
564 ρωτ 2511
565 πής 2510
566 οδο 2509
567 οί_ 2504
568 οβλ 2497
569 ινω 2494
570 τήρ 2491
571 άστ 2487
572 ονι 2481
573 έρο 2467
574 ωρί 2467
575 ροβ 2466
576 _όμ 2464
577 ακο 2462
578 ναν 2456
579 οιο 2454
580 μμα 2453
581 σια 2452
582 υτο 2444
583 τρα 2438
584 θετ 2436
585 ληρ 2429
586 ωστ 2428
587 ιαφ 2420
588 εισ 2415
589 εμπ 2411
590 τεύ 2411
591 ίνο 2408
592 κασ 2407
593 παν 2405
594 _κ_ 2395
595 κε_ 2388
596 άδε 2383
597 αρι 2382
598 ίων 2379
599 ενο 2378
600 ρώτ 2375
601 κεί 2372
602 άπο 2367
603 εωρ 2365
604 ζει 2361
605 υσι 2357
606 ειδ 2356
607 νετ 2355
608 βάλ 2342
609 ρια 2342
610 ροκ 2342
611 ρασ 2338
612 _τέ 2337
613 ρών 2329
614 _ρη 2321
615 κότ 2316
616 ονό 2313
617 ημο 2311
618 συζ 2309
619 ώρα 2307
620 άθε 2300
621 πλη 2300
622 ρετ 2300
623 ιασ 2298
624 ακό 2295
625 ητή 2290
626 οτι 2284
627 όμα 2280
628 ιου 2276
629 γνω 2263
630 στά 2261
631 _νέ 2258
632 _τό 2254
633 ναφ 2251
634 _εδ 2248
635 σιο 2243
636 αγω 2242
637 ενι 2235
638 έρε 2229
639 γορ 2214
640 σκο 2213
641 ιαδ 2205
642 ζετ 2204
643 υτέ 2203
644 υπό 2193
645 ώπη 2193
646 νθρ 2190
647 αίτ 2189
648 εθν 2189
649 ογί 2174
650 ριμ 2174
651 πτυ 2172
652 ελα 2168
653 άς_ 2165
654 σας 2163
655 _εφ 2159
656 γον 2159
657 πλα 2158
658 είτ 2152
659 ερη 2152
660 λισ 2147
661 ηκε 2146
662 _βα 2138
663 σήμ 2137
664 _οδ 2135
665 γκε 2134
666 _δρ 2129
667 ένε 2129
668 άνο 2127
669 υνο 2126
670 μβά 2123
671 _ρό 2119
672 έλο 2119
673 άνε 2116
674 γάλ 2116
675 αίν 2115
676 έλε 2110
677 ρόν 2107
678 ψηφ 2106
679 ρίσ 2103
680 έτο 2087
681 ατό 2084
682 ετά 2080
683 _ολ 2072
684 μως 2071
685 έση 2068
686 τισ 2068
687 λού 2066
688 ριβ 2066
689 ερε 2064
690 θελ 2064
691 μέσ 2061
692 ναλ 2060
693 σύν 2052
694 μελ 2045
695 είμ 2044
696 έλη 2041
697 αλύ 2035
698 _αξ 2033
699 λύτ 2033
700 νερ 2030
701 νησ 2029
702 ιτα 2028
703 ίσε 2023
704 νε_ 2010
705 αιτ 2008
706 ίσο 2006
707 ένη 2005
708 ρώ_ 2001
709 οδη 1995
710 ομά 1992
711 γο_ 1991
712 σον 1988
713 άση 1986
714 νότ 1986
715 ατη 1984
716 όμω 1980
717 ισ_ 1979
718 λλα 1978
719 στώ 1977
720 σότ 1974
721 ομο 1969
722 θει 1964
723 ηθε 1962
724 σαν 1962
725 άτη 1959
726 _σή 1958
727 αξι 1956
728 _ήθ 1950
729 ατί 1950
730 τέλ 1941
731 _σο 1940
732 ακρ 1940
733 σκε 1939
734 ξει 1935
735 _μη 1932
736 ολλ 1924
737 σφα 1921
738 _βρ 1916
739 θέλ 1914
740 ανθ 1911
741 οφο 1908
742 λαί 1906
743 ευσ 1905
744 δο_ 1903
745 τοπ 1898
746 _ρε 1896
747 ιμε 1895
748 ρακ 1894
749 δει 1893
750 _άρ 1891
751 ωνί 1891
752 εγά 1888
753 λοι 1888
754 ρμο 1884
755 αμμ 1882
756 ρων 1881
757 σιμ 1881
758 ινή 1878
759 μαι 1877
760 δηγ 1876
761 λαμ 1872
762 ιμο 1867
763 επα 1866
764 ζον 1865
765 μάδ 1865
766 αίσ 1864
767 _αμ 1856
768 _πά 1847
769 ακτ 1841
770 ούτ 1835
771 τρό 1832
772 ετε 1824
773 ντί 1823
774 άλε 1822
775 γή_ 1822
776 ενδ 1819
777 οκρ 1818
778 υλε 1818
779 κεκ 1811
780 εκρ 1809
781 ίσι 1804
782 πρά 1802
783 λες 1801
784 κάν 1800
785 αμβ 1796
786 είο 1795
787 οθε 1791
788 νοι 1788
789 οίη 1787
790 θερ 1785
791 ίησ 1784
792 εφα 1783
793 μιο 1783
794 _ζη 1778
795 ώρε 1775
796 φερ 1772
797 αρο 1768
798 ποδ 1767
799 γου 1765
800 υξη 1763
801 σσό 1760
802 ογρ 1759
803 ζω_ 1758
804 φαλ 1755
805 αθε 1754
806 βάσ 1753
807 μέρ 1753
808 οια 1750
809 ριν 1750
810 θεω 1745
811 λος 1745
812 λλε 1743
813 ημι 1741
814 όγο 1739
815 κτι 1738
816 ώμα 1737
817 υπε 1736
818 _σκ 1733
819 ρόπ 1733
820 κάπ 1731
821 _δύ 1730
822 ίδι 1713
823 υντ 1709
824 τής 1708
825 υτά 1708
826 άν_ 1699
827 αλι 1699
828 ήμε 1698
829 ράγ 1695
830 _φο 1691
831 ανι 1689
832 πε_ 1689
833 άμε 1684
834 ομα 1683
835 ισσ 1681
836 παι 1680
837 γαλ 1679
838 _βά 1678
839 ηρί 1674
840 ητι 1669
841 πίτ 1657
842 κρί 1655
843 γισ 1653
844 νω_ 1649
845 πρώ 1645
846 ητο 1644
847 σή_ 1643
848 δα_ 1641
849 λεσ 1641
850 ελέ 1639
851 ίμα 1638
852 λλη 1634
853 εγκ 1633
854 ενό 1632
855 δή_ 1628
856 νοτ 1625
857 ειμ 1621
858 μέα 1620
859 ρόκ 1620
860 εδο 1617
861 τρέ 1613
862 τατ 1612
863 λησ 1609
864 βλη 1606
865 αγο 1605
866 γει 1604
867 _κό 1603
868 μισ 1602
869 έρω 1601
870 ηρο 1601
871 όντ 1599
872 ποφ 1595
873 τότ 1594
874 ορέ 1592
875 τυξ 1591
876 ίζω 1588
877 ραφ 1588
878 ποκ 1584
879 όκε 1584
880 πάν 1583
881 ύντ 1582
882 άδα 1579
883 λημ 1579
884 ξύ_ 1579
885 μβα 1578
886 _πό 1576
887 ρεσ 1575
888 αξύ 1574
889 πη_ 1573
890 γίν 1571
891 ασμ 1570
892 ριθ 1568
893 _γί 1564
894 άζε 1564
895 ομί 1561
896 _λα 1555
897 ιγμ 1555
898 ιάσ 1554
899 κιν 1554
900 ήρι 1552
901 νση 1552
902 ρωσ 1549
903 λή_ 1547
904 υνέ 1542
905 βάν 1536
906 _ίδ 1535
907 γεν 1535
908 ιαί 1534
909 εάν 1533
910 ευ_ 1527
911 τευ 1527
912 φαρ 1526
913 ωτι 1524
914 γκρ 1522
915 αιώ 1521
916 ασι 1519
917 δη_ 1518
918 λών 1514
919 άλο 1511
920 νας 1511
921 ίως 1509
922 νωρ 1509
923 κλη 1508
924 νή_ 1508
925 _εά 1505
926 ρημ 1505
927 _ήτ 1500
928 _λε 1500
929 θήκ 1497
930 _ψη 1492
931 λων 1488
932 κυβ 1486
933 σικ 1486
934 τόν 1486
935 εξε 1484
936 πιο 1484
937 ανό 1481
938 μον 1479
939 ήτα 1477
940 ταλ 1476
941 δομ 1472
942 _λο 1470
943 ρομ 1469
944 θρω 1466
945 ρόσ 1463
946 φασ 1461
947 ωμέ 1461
948 ημέ 1458
949 υνά 1455
950 ηρε 1453
951 ώτη 1453
952 όμη 1451
953 ίστ 1450
954 θυμ 1450
955 εκπ 1440
956 γεί 1437
957 νια 1437
958 άπτ 1435
959 _κε 1431
960 κοπ 1431
961 ξου 1431
962 ίπε 1430
963 ογή 1430
964 τήμ 1428
965 ωρώ 1428
966 αφε 1425
967 _εθ 1420
968 τυ_ 1420
969 νης 1418
970 εγο 1416
971 ενε 1410
972 νόμ 1410
973 _ον 1409
974 δρο 1407
975 πεδ 1406
976 αρκ 1405
977 φάλ 1404
978 γεγ 1399
979 ενη 1397
980 αγκ 1395
981 μός 1395
982 εκε 1393
983 υμπ 1393
984 αθώ 1391
985 ινό 1391
986 _αι 1389
987 κάθ 1389
988 πρω 1388
989 σαφ 1388
990 _μί 1385
991 νάπ 1383
992 υζή 1383
993 τώσ 1382
994 λάβ 1380
995 ταγ 1380
996 πηρ 1378
997 αίω 1376
998 λία 1376
999 τία 1375
1000 όγω 1375
1001 ρές 1372
1002 ργί 1371
1003 _ι_ 1369
1004 γω_ 1368
1005 ίτρ 1366
1006 ολί 1363
1007 πιθ 1361
1008 νού 1360
1009 ύσα 1360
1010 ιβά 1357
1011 _ωσ 1356
1012 υνθ 1355
1013 γνώ 1349
0 # Licensed to the Apache Software Foundation (ASF) under one or more
1 # contributor license agreements. See the NOTICE file distributed with
2 # this work for additional information regarding copyright ownership.
3 # The ASF licenses this file to You under the Apache License, Version 2.0
4 # (the "License"); you may not use this file except in compliance with
5 # the License. You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 _th 154732
15 the 117027
16 he_ 95427
17 on_ 52529
18 ion 52199
19 _in 48458
20 _of 47302
21 _to 46961
22 of_ 45622
23 _co 44626
24 to_ 43979
25 _an 42881
26 nd_ 41007
27 is_ 38826
28 and 38294
29 ent 36998
30 in_ 34623
31 tio 34216
32 ed_ 34118
33 ing 32587
34 ng_ 31999
35 es_ 31252
36 at_ 29583
37 re_ 29200
38 _re 28769
39 nt_ 28050
40 er_ 26375
41 _pr 26146
42 _be 25910
43 _is 23791
44 hat 23622
45 men 23540
46 al_ 23387
47 ati 23109
48 tha 22799
49 for 22603
50 _a_ 21520
51 _wh 20668
52 thi 20188
53 an_ 20163
54 ly_ 19930
55 com 19295
56 _fo 19226
57 _we 19058
58 ve_ 18877
59 or_ 18842
60 as_ 18763
61 _ha 18065
62 res 17961
63 _wi 17873
64 pro 17415
65 con 17228
66 _on 17153
67 ts_ 16822
68 his 16273
69 ate 16103
70 we_ 15339
71 se_ 15279
72 _no 15270
73 _it 15183
74 her 15085
75 ons 15030
76 en_ 14965
77 ll_ 14957
78 ch_ 14888
79 st_ 14506
80 are 13968
81 it_ 13573
82 le_ 13524
83 _de 13487
84 _i_ 13424
85 ns_ 13375
86 ere 13326
87 ter 13267
88 _ar 13207
89 _ma 13164
90 rop 13064
91 ope 12799
92 omm 12656
93 _st 12216
94 _as 12134
95 all 12093
96 ld_ 11997
97 _al 11972
98 be_ 11913
99 sio 11812
100 _po 11634
101 ce_ 11563
102 ect 11498
103 _eu 11388
104 not 11209
105 ort 11098
106 ty_ 11038
107 ver 11001
108 iti 10943
109 ssi 10940
110 rea 10912
111 ive 10901
112 whi 10843
113 th_ 10842
114 eur 10744
115 nce 10725
116 te_ 10720
117 _pa 10667
118 pre 10537
119 eve 10507
120 hic 10420
121 ot_ 10304
122 ich 10302
123 ith 10301
124 sta 10296
125 our 10103
126 _me 10010
127 wit 10005
128 oul 9789
129 ide 9714
130 int 9709
131 par 9700
132 uld 9685
133 uro 9676
134 ut_ 9587
135 _se 9581
136 cti 9559
137 por 9497
138 rs_ 9464
139 _wo 9429
140 _so 9411
141 ave 9392
142 cou 9388
143 mmi 9375
144 _un 9292
145 hav 9263
146 _ca 9260
147 me_ 9077
148 iss 9045
149 ill 8964
150 ry_ 8820
151 ess 8804
152 ted 8694
153 ame 8672
154 pea 8584
155 ean 8514
156 _su 8446
157 ies 8403
158 _di 8378
159 oun 8351
160 mis 7974
161 ity 7914
162 _mo 7885
163 ble 7852
164 ers 7829
165 nte 7713
166 ust 7684
167 rat 7577
168 est 7548
169 _wa 7541
170 _ne 7493
171 _ex 7407
172 uni 7396
173 tiv 7327
174 tic 7229
175 _li 7016
176 tin 6991
177 one 6980
178 ur_ 6899
179 _ac 6810
180 _fi 6780
181 pos 6778
182 rt_ 6738
183 wil 6705
184 by_ 6667
185 hou 6661
186 _sh 6659
187 ica 6641
188 ure 6638
189 sti 6611
190 ore 6597
191 der 6592
192 _by 6563
193 tat 6556
194 str 6527
195 _en 6518
196 eme 6456
197 _fr 6434
198 _ou 6430
199 sid 6416
200 so_ 6395
201 ant 6363
202 art 6357
203 _do 6343
204 act 6339
205 ain 6294
206 nti 6292
207 den 6275
208 _bu 6274
209 nal 6262
210 _le 6236
211 ple 6230
212 igh 6205
213 ay_ 6192
214 ese 6152
215 ial 6121
216 _mr 6119
217 end 6114
218 ke_ 6109
219 has 6030
220 ntr 5939
221 ct_ 5919
222 _at 5885
223 ome 5870
224 ear 5841
225 ght 5795
226 ern 5788
227 ove 5784
228 ne_ 5782
229 tra 5697
230 enc 5664
231 abl 5632
232 ic_ 5621
233 ber 5605
234 lit 5586
235 lat 5534
236 _fa 5517
237 und 5468
238 per 5467
239 een 5464
240 out 5459
241 nci 5453
242 _pe 5438
243 als 5433
244 us_ 5403
245 ces 5394
246 _ta 5391
247 eas 5387
248 _im 5378
249 hin 5303
250 man 5292
251 nts 5283
252 imp 5274
253 _mu 5273
254 arl 5241
255 _ho 5236
256 cal 5214
257 mr_ 5178
258 ss_ 5145
259 tur 5131
260 can 5122
261 esi 5119
262 ste 5114
263 _or 5113
264 but 5113
265 _sa 5110
266 ar_ 5093
267 rec 5079
268 ake 5076
269 _tr 5049
270 rep 5047
271 spe 5034
272 lic 5023
273 use 5004
274 sho 5003
275 _la 4965
276 oli 4937
277 era 4936
278 ow_ 4926
279 tan 4864
280 pol 4854
281 _ag 4842
282 nat 4840
283 inc 4835
284 sed 4831
285 _he 4830
286 mbe 4812
287 rom 4798
288 _am 4795
289 ist 4783
290 cia 4772
291 nit 4767
292 app 4761
293 nde 4759
294 _yo 4751
295 you 4729
296 anc 4690
297 ren 4688
298 eco 4643
299 wou 4636
300 lly 4634
301 tri 4632
302 mpl 4613
303 tte 4602
304 wor 4597
305 _gr 4571
306 et_ 4567
307 unt 4559
308 om_ 4552
309 reg 4543
310 rit 4518
311 ds_ 4489
312 eed 4484
313 lia 4474
314 _si 4471
315 ten 4439
316 min 4428
317 ona 4373
318 _fu 4368
319 ina 4368
320 _ri 4348
321 eat 4343
322 nta 4327
323 tes 4319
324 _ch 4301
325 emb 4271
326 han 4254
327 ont 4243
328 gre 4229
329 nio 4224
330 ous 4187
331 oth 4183
332 ose 4181
333 ind 4172
334 ven 4150
335 din 4136
336 ral 4136
337 lea 4118
338 lso 4103
339 ge_ 4089
340 de_ 4076
341 ard 4074
342 ee_ 4065
343 ref 4053
344 _s_ 4050
345 ine 4047
346 rin 4041
347 ens 4040
348 tho 4026
349 _ab 3978
350 ner 3946
351 ise 3935
352 nsi 3924
353 pec 3921
354 nst 3914
355 mus 3913
356 rli 3888
357 rou 3860
358 ues 3842
359 hes 3840
360 ins 3826
361 rie 3826
362 il_ 3822
363 _ad 3820
364 fro 3813
365 ree 3811
366 cy_ 3803
367 rig 3799
368 dis 3789
369 _go 3783
370 fic 3780
371 mit 3774
372 mor 3767
373 ula 3757
374 rta 3747
375 sen 3735
376 ey_ 3732
377 _ba 3729
378 vel 3729
379 eri 3722
380 tal 3721
381 pe_ 3720
382 ran 3674
383 ini 3671
384 its 3670
385 eci 3669
386 _qu 3668
387 _mi 3655
388 gra 3651
389 mem 3650
390 uti 3616
391 iam 3614
392 age 3610
393 whe 3608
394 sit 3593
395 sin 3592
396 onc 3591
397 lar 3577
398 _ap 3524
399 ou_ 3522
400 omp 3519
401 ast 3517
402 _te 3515
403 lis 3515
404 _us 3508
405 ote 3504
406 _sp 3497
407 ew_ 3486
408 rti 3481
409 ppo 3480
410 ery 3458
411 ely 3417
412 kin 3415
413 bee 3390
414 oin 3386
415 any 3384
416 tor 3357
417 red 3350
418 ire 3349
419 pri 3348
420 cul 3338
421 cer 3328
422 ite 3310
423 gen 3300
424 tak 3295
425 unc 3294
426 fin 3290
427 orm 3287
428 ene 3284
429 ili 3272
430 lem 3253
431 ans 3238
432 hen 3238
433 ms_ 3232
434 ord 3229
435 em_ 3227
436 mon 3202
437 ali 3200
438 sur 3180
439 ves 3172
440 ny_ 3163
441 egi 3156
442 esp 3143
443 was 3139
444 ish 3138
445 ue_ 3137
446 cha 3114
447 ori 3099
448 efo 3098
449 ime 3093
450 now 3079
451 fer 3078
452 _cr 3048
453 _cl 3047
454 ead 3045
455 acc 3036
456 _lo 3031
457 _ev 3025
458 ndi 3016
459 aus 3001
460 ice 3001
461 cil 2995
462 _ra 2992
463 isi 2988
464 tim 2975
465 ase 2974
466 que 2972
467 ned 2967
468 _ve 2964
469 les 2956
470 tai 2948
471 ult 2943
472 wha 2939
473 _op 2933
474 lin 2931
475 tie 2929
476 ary 2920
477 mat 2911
478 up_ 2911
479 opo 2909
480 pla 2900
481 ffe 2883
482 ht_ 2872
483 ike 2867
484 do_ 2843
485 lik 2827
486 eal 2819
487 how 2818
488 rov 2817
489 ser 2811
490 tre 2807
491 nis 2805
492 ork 2804
493 ual 2802
494 ir_ 2799
495 ega 2792
496 rd_ 2789
497 epo 2779
498 lan 2774
499 id_ 2751
500 oci 2751
501 mpo 2746
502 itu 2737
503 pen 2732
504 who 2731
505 ade 2728
506 ls_ 2715
507 _pu 2712
508 soc 2706
509 _ge 2699
510 sse 2698
511 _if 2691
512 if_ 2691
513 bou 2685
514 fac 2685
515 ici 2682
516 nme 2682
517 ext 2680
518 duc 2678
519 _ye 2672
520 cle 2667
521 nin 2666
522 _pl 2650
523 ari 2648
524 sec 2648
525 hey 2646
526 tly 2625
527 abo 2622
528 tit 2616
529 _vi 2612
530 equ 2605
531 ost 2600
532 nee 2592
533 ert 2581
534 iat 2578
535 my_ 2574
536 cat 2570
537 icu 2563
538 ris 2552
539 exp 2551
540 sol 2547
541 uct 2535
542 _na 2530
543 rel 2522
544 _ju 2519
545 itt 2519
546 nly 2512
547 uch 2510
548 tia 2505
549 no_ 2504
550 sib 2499
551 ugh 2491
552 omi 2485
553 bli 2481
554 tab 2478
555 _hi 2474
556 mme 2473
557 ach 2469
558 agr 2469
559 cau 2465
560 rge 2462
561 _ot 2443
562 bec 2435
563 att 2433
564 ong 2428
565 eir 2426
566 nge 2411
567 oug 2411
568 hei 2410
569 ssu 2400
570 rst 2399
571 tem 2397
572 am_ 2396
573 cre 2386
574 ani 2385
575 mak 2381
576 ced 2371
577 ura 2369
578 bil 2367
579 ded 2363
580 tee 2360
581 owe 2357
582 _ti 2356
583 jec 2354
584 eli 2346
585 elo 2344
586 ric 2342
587 mar 2334
588 wer 2331
589 mea 2326
590 leg 2323
591 ond 2321
592 gro 2314
593 ifi 2309
594 eth 2303
595 _ec 2294
596 sel 2290
597 _up 2289
598 sup 2288
599 hts 2282
600 nda 2273
601 som 2270
602 ad_ 2267
603 _my 2252
604 ses 2248
605 irs 2246
606 cie 2240
607 ono 2239
608 ell 2231
609 tru 2230
610 dec 2226
611 iou 2225
612 ace 2212
613 dev 2212
614 tar 2212
615 fir 2211
616 cis 2197
617 rio 2196
618 way 2195
619 rem 2193
620 ece 2189
621 new 2189
622 nes 2182
623 ass 2173
624 erm 2166
625 mun 2165
626 _bo 2163
627 eca 2162
628 _af 2145
629 har 2145
630 lle 2145
631 isa 2140
632 ang 2139
633 spo 2138
634 war 2137
635 ele 2129
636 lie 2128
637 erv 2121
638 _hu 2119
639 mmu 2114
640 roc 2112
641 nsu 2109
642 ibl 2103
643 _ce 2096
644 get 2096
645 uri 2080
646 el_ 2077
647 mer 2076
648 ria 2074
649 ron 2074
650 ram 2070
651 lop 2067
652 upp 2066
653 _fe 2049
654 dit 2044
655 eop 2043
656 rod 2042
657 nk_ 2039
658 jus 2038
659 sal 2038
660 ffi 2034
661 gh_ 2022
662 ogr 2021
663 pon 2020
664 qui 2014
665 rk_ 2012
666 _ob 2011
667 suc 2011
668 asi 2010
669 dem 2003
670 ein 2001
671 aid 1998
672 sue 1998
673 icy 1989
674 sis 1986
675 opl 1983
676 fun 1980
677 peo 1973
678 bet 1970
679 oce 1969
680 ann 1968
681 cce 1968
682 nom 1959
683 mad 1948
684 lut 1945
685 see 1945
686 poi 1941
687 chi 1940
688 rke 1939
689 _es 1934
690 yea 1928
691 arg 1927
692 ivi 1927
693 onl 1925
694 ita 1921
695 low 1912
696 rog 1909
697 day 1904
698 own 1903
699 clu 1894
700 emp 1894
701 rre 1891
702 tro 1887
703 led 1885
704 wn_ 1881
705 ho_ 1877
706 urs 1876
707 _da 1875
708 cur 1874
709 hem 1874
710 dir 1861
711 rly 1857
712 tec 1856
713 ful 1855
714 _ci 1847
715 ia_ 1836
716 ume 1834
717 iev 1833
718 oll 1833
719 vin 1833
720 od_ 1826
721 rce 1816
722 gai 1806
723 rte 1806
724 rac 1805
725 cri 1804
726 _em 1800
727 _vo 1797
728 tua 1797
729 isc 1793
730 rm_ 1793
731 oss 1791
732 sh_ 1787
733 sat 1783
734 mes 1782
735 ovi 1778
736 rde 1776
737 osi 1769
738 eff 1766
739 rse 1763
740 llo 1761
741 try 1760
742 odu 1758
743 _gi 1757
744 cts 1755
745 wee 1754
746 _br 1750
747 cit 1745
748 olu 1744
749 son 1744
750 dam 1743
751 _cu 1739
752 lf_ 1738
753 ntl 1736
754 giv 1733
755 osa 1731
756 _ef 1726
757 ink 1716
758 hos 1715
759 cen 1714
760 nds 1706
761 ppr 1700
762 bat 1698
763 aga 1693
764 eac 1692
765 ndm 1687
766 ark 1685
767 ctu 1683
768 eso 1680
769 nto 1680
770 sub 1680
771 hum 1679
772 uma 1678
773 bel 1673
774 thr 1670
775 dme 1667
776 obl 1667
777 rna 1667
778 hal 1665
779 edu 1664
780 _sc 1661
781 vid 1661
782 emo 1659
783 _au 1657
784 des 1657
785 ved 1655
786 nan 1652
787 oup 1651
788 hea 1647
789 eti 1644
790 bas 1643
791 tel 1641
792 ara 1635
793 ela 1634
794 ncl 1625
795 ept 1624
796 nvi 1624
797 rds 1621
798 wel 1621
799 bei 1619
800 ood 1617
801 qua 1614
802 inf 1610
803 oti 1609
804 fec 1604
805 nfo 1604
806 gov 1597
807 dif 1596
808 lon 1596
809 lve 1594
810 etw 1591
811 ems 1585
812 alt 1584
813 dea 1583
814 bje 1582
815 cip 1578
816 ack 1576
817 rma 1576
818 gar 1574
819 ien 1571
820 rob 1570
821 vot 1560
822 _ru 1559
823 dy_ 1557
824 lac 1557
825 ign 1547
826 say 1545
827 onf 1544
828 edi 1540
829 inv 1539
830 cor 1534
831 imi 1533
832 med 1529
833 twe 1529
834 iff 1524
835 mic 1524
836 rth 1513
837 ian 1505
838 _va 1504
839 kno 1504
840 plo 1502
841 gio 1496
842 pli 1493
843 che 1490
844 _sy 1489
845 ruc 1487
846 ole 1486
847 ppl 1478
848 rtu 1477
849 _ai 1475
850 eem 1475
851 mos 1474
852 ana 1456
853 sum 1456
854 sts 1455
855 mil 1453
856 ks_ 1451
857 hor 1450
858 cas 1449
859 lli 1449
860 adi 1443
861 pme 1442
862 rdi 1440
863 _ea 1438
864 off 1437
865 uat 1435
866 ars 1434
867 deb 1432
868 cep 1431
869 dge 1431
870 rnm 1424
871 wan 1422
872 usi 1418
873 ges 1416
874 amm 1412
875 _kn 1411
876 pin 1407
877 ied 1406
878 nno 1404
879 vis 1402
880 rev 1400
881 del 1398
882 opm 1394
883 nve 1390
884 hel 1383
885 cra 1382
886 ubl 1380
887 wev 1380
888 cus 1378
889 rap 1377
890 tut 1375
891 ism 1372
892 lev 1371
893 rn_ 1371
894 ute 1367
895 _ro 1366
896 rot 1366
897 eu_ 1364
898 ea_ 1363
899 _el 1354
900 lud 1352
901 aki 1348
902 ech 1347
903 ete 1347
904 _ov 1343
905 cco 1341
906 eba 1341
907 ict 1338
908 efe 1334
909 env 1334
910 yst 1333
911 gin 1332
912 shi 1332
913 egu 1331
914 fre 1327
915 urt 1325
916 dur 1322
917 add 1320
918 vir 1318
919 _dr 1316
920 oes 1312
921 iro 1303
922 she 1302
923 vie 1301
924 air 1296
925 car 1296
926 ck_ 1296
927 mpe 1293
928 sla 1292
929 opi 1288
930 sys 1288
931 too 1288
932 ncy 1287
933 err 1282
934 til 1281
935 uss 1281
936 ask 1279
937 ps_ 1278
938 opt 1277
939 aff 1272
940 ank 1266
941 ngs 1266
942 ail 1264
943 exa 1263
944 ors 1248
945 sto 1248
946 ket 1247
947 cto 1246
948 uth 1241
949 rad 1239
950 onm 1238
951 atu 1234
952 mpr 1234
953 fra 1229
954 loy 1229
955 ize 1226
956 hy_ 1225
957 tle 1225
958 pub 1224
959 lec 1223
960 req 1219
961 iew 1218
962 nic 1217
963 hil 1216
964 _du 1214
965 nse 1211
966 two 1210
967 xt_ 1207
968 aft 1206
969 vic 1204
970 rol 1202
971 coo 1199
972 evi 1199
973 exc 1196
974 fte 1191
975 col 1190
976 cte 1190
977 rms 1186
978 nor 1184
979 log 1183
980 ema 1180
981 eta 1178
982 ipl 1175
983 dat 1174
984 gs_ 1174
985 rts 1174
986 mai 1169
987 far 1166
988 gul 1166
989 amp 1164
990 oni 1164
991 rne 1164
992 ger 1163
993 abi 1162
994 aut 1157
995 las 1154
996 erg 1153
997 _tw 1151
998 let 1150
999 sou 1148
1000 olv 1144
1001 len 1143
1002 ibi 1141
1003 pt_ 1141
1004 ocr 1140
1005 doe 1136
1006 set 1133
1007 mpa 1132
1008 hol 1131
1009 sha 1131
1010 ken 1129
1011 _gu 1126
1012 sk_ 1126
1013 lig 1125
0 # Licensed to the Apache Software Foundation (ASF) under one or more
1 # contributor license agreements. See the NOTICE file distributed with
2 # this work for additional information regarding copyright ownership.
3 # The ASF licenses this file to You under the Apache License, Version 2.0
4 # (the "License"); you may not use this file except in compliance with
5 # the License. You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 _la 382428
15 la_ 374029
16 aj_ 246196
17 _de 243032
18 de_ 224736
19 oj_ 196353
20 is_ 184184
21 as_ 167445
22 _ka 158716
23 en_ 145494
24 _en 143916
25 _es 136176
26 kaj 133721
27 est 127476
28 to_ 119334
29 on_ 111006
30 sta 102398
31 io_ 94844
32 ro_ 92272
33 _pr 86851
34 ta_ 84035
35 _ko 81850
36 _po 78991
37 _li 78651
38 ant 78055
39 tas 76074
40 jn_ 75117
41 _ki 71830
42 ran 67142
43 _al 66971
44 per 64839
45 _ma 64513
46 nto 63544
47 an_ 63496
48 ist 63058
49 tis 62800
50 li_ 59418
51 no_ 58385
52 sti 58354
53 aŭ_ 57064
54 kon 56876
55 do_ 56827
56 ita 56600
57 _re 55250
58 toj 52344
59 ent 51516
60 _an 49254
61 _in 49196
62 el_ 49185
63 _ti 48980
64 aro 47609
65 ia_ 47484
66 era 46867
67 _se 46678
68 ter 46674
69 ojn 46607
70 sto 44974
71 _pl 44131
72 nta 42572
73 lo_ 42413
74 _si 41789
75 pro 41744
76 and 41350
77 al_ 40597
78 taj 40368
79 _mo 39952
80 _ku 39584
81 nte 39520
82 tra 39196
83 _su 38668
84 _tr 37801
85 _pa 37749
86 te_ 36918
87 _el 36863
88 _ja 35980
89 lan 35290
90 _no 35285
91 str 35267
92 int 35237
93 iu_ 34758
94 ĝis 34526
95 mal 34395
96 spe 34114
97 por 34002
98 ili 33780
99 _ne 33742
100 ono 33221
101 roj 33107
102 er_ 32980
103 _pe 32912
104 ver 32687
105 kiu 32398
106 ata 32238
107 mo_ 32101
108 _ek 31851
109 par 31812
110 na_ 31546
111 ko_ 31323
112 esp 31152
113 oni 31136
114 ne_ 31100
115 ado 30618
116 ajn 30394
117 gra 30281
118 pre 30086
119 art 30018
120 _di 29742
121 ra_ 29729
122 ank 29714
123 _un 29391
124 lin 28832
125 un_ 28792
126 or_ 28659
127 da_ 28594
128 _vi 28489
129 tro 28462
130 ort 28424
131 mon 28335
132 _du 28273
133 _gr 27759
134 kun 27460
135 kie 27371
136 iĝi 27310
137 pri 27274
138 _te 26849
139 nda 26844
140 ali 26703
141 eri 26702
142 jar 26668
143 unu 26566
144 men 26435
145 _ĝi 26411
146 _fa 26339
147 igi 26298
148 _fo 26208
149 ris 26161
150 lia 26035
151 ori 25930
152 man 25760
153 iel 25478
154 _me 25465
155 ri_ 25226
156 _ve 25125
157 ndo 24996
158 kom 24890
159 ato 24793
160 iaj 24374
161 ing 24319
162 loj 24202
163 noj 24059
164 ano 23865
165 _ĉe 23736
166 ost 23591
167 ont 23504
168 iko 23470
169 ara 23402
170 ari 23393
171 ton 23389
172 ion 23088
173 _aŭ 23086
174 _ĉi 23039
175 olo 23002
176 co_ 22791
177 dis 22751
178 tan 22705
179 ero 22650
180 cio 22578
181 _ke 22566
182 rto 22427
183 ple 22424
184 ona 22317
185 _mi 22130
186 for 22075
187 vas 22049
188 res 21573
189 _ar 21495
190 kaŭ 21484
191 ni_ 21472
192 ino 21409
193 pos 21408
194 ond 21383
195 aci 21203
196 nom 21114
197 _sa 21097
198 rio 20941
199 don 20939
200 _a_ 20895
201 nis 20827
202 _fi 20721
203 ste 20709
204 ekt 20560
205 rbo 20334
206 ala 20211
207 nka 20005
208 ova 19919
209 ika 19896
210 am_ 19768
211 _st 19732
212 tri 19715
213 urb 19704
214 _kr 19629
215 kto 19451
216 ult 19403
217 enc 19375
218 _mu 19363
219 iuj 19315
220 nti 19227
221 in_ 19191
222 pli 19189
223 _ha 19073
224 tem 18910
225 _na 18887
226 mil 18707
227 re_ 18705
228 _ba 18595
229 ron 18581
230 cia 18499
231 ntr 18457
232 ana 18423
233 _fr 18310
234 ani 18277
235 tiu 18229
236 ons 18088
237 aĵo 18007
238 lit 17946
239 alo 17914
240 ilo 17844
241 bo_ 17783
242 _ri 17713
243 le_ 17588
244 ke_ 17581
245 lon 17579
246 rti 17464
247 nas 17458
248 tur 17452
249 sia 17362
250 ern 17178
251 go_ 17178
252 uj_ 17178
253 _lo 17024
254 oro 16925
255 ven 16925
256 eno 16894
257 _on 16892
258 tat 16859
259 ava 16840
260 ian 16642
261 _nu 16636
262 ka_ 16588
263 tik 16586
264 ur_ 16573
265 omo 16555
266 nst 16538
267 _ur 16429
268 nio 16379
269 vo_ 16233
270 ito 16230
271 tor 16132
272 _il 16099
273 _ge 16082
274 tio 15963
275 ive 15933
276 ndi 15926
277 raj 15875
278 _or 15831
279 _da 15720
280 eni 15702
281 _ap 15700
282 eks 15621
283 ten 15575
284 ers 15567
285 ngv 15546
286 po_ 15509
287 ren 15297
288 ide 15274
289 rma 15270
290 ulo 15264
291 jo_ 15216
292 vis 15183
293 sed 15177
294 reg 15151
295 bro 15138
296 ed_ 15089
297 son 15082
298 ena 14920
299 ome 14913
300 doj 14875
301 tar 14853
302 mar 14814
303 _so 14788
304 eto 14735
305 _ok 14593
306 hav 14577
307 _ho 14547
308 gis 14523
309 erm 14504
310 end 14420
311 um_ 14412
312 ina 14368
313 ma_ 14351
314 ati 14266
315 iam 14239
316 rit 14234
317 sen 14217
318 moj 14113
319 dum 14108
320 eco 14094
321 _vo 14090
322 _ta 14053
323 mul 13983
324 va_ 13963
325 emp 13961
326 rad 13941
327 cen 13928
328 den 13816
329 ĝo_ 13816
330 lej 13808
331 _ro 13735
332 _le 13729
333 ovi 13656
334 anc 13632
335 akt 13568
336 dan 13568
337 ejo 13532
338 ĝi_ 13508
339 ila 13493
340 ini 13443
341 lio 13402
342 tru 13367
343 _ak 13345
344 ie_ 13328
345 ame 13290
346 ele 13288
347 ama 13234
348 mor 13229
349 ora 13145
350 las 13125
351 elo 13065
352 ioj 13044
353 uni 13020
354 oli 12996
355 fra 12922
356 so_ 12895
357 es_ 12890
358 ere 12858
359 kis 12853
360 _br 12843
361 nov 12840
362 ord 12810
363 pol 12732
364 nco 12711
365 tin 12663
366 _ra 12624
367 _be 12621
368 egi 12619
369 _do 12514
370 kan 12476
371 nci 12449
372 fer 12414
373 ast 12395
374 _tu 12322
375 st_ 12288
376 ĝas 12242
377 rat 12180
378 iĝa 12129
379 gio 12120
380 lta 12063
381 rov 11941
382 erk 11928
383 _hi 11908
384 kol 11901
385 kti 11884
386 iga 11863
387 orm 11830
388 ura 11829
389 rez 11785
390 ej_ 11779
391 tal 11764
392 emo 11739
393 _he 11716
394 kva 11615
395 ale 11518
396 van 11515
397 ras 11353
398 voj 11321
399 _va 11270
400 lig 11270
401 oma 11257
402 _ce 11252
403 ber 11247
404 ici 11247
405 far 11205
406 _bo 11192
407 sur 11166
408 ins 11159
409 rig 11142
410 rop 11142
411 rna 11083
412 oka 11064
413 gvo 11044
414 nua 11032
415 bor 10997
416 kri 10988
417 ene 10942
418 vid 10902
419 je_ 10854
420 omi 10824
421 opo 10818
422 ua_ 10805
423 uro 10804
424 gan 10751
425 ien 10745
426 _ju 10692
427 tie 10659
428 koj 10645
429 ola 10592
430 ebl 10591
431 ate 10582
432 ang 10579
433 mbr 10567
434 _fe 10554
435 san 10539
436 kta 10510
437 tre 10499
438 rom 10473
439 sis 10433
440 rin 10430
441 zis 10427
442 nat 10422
443 ekz 10418
444 _sp 10394
445 duk 10386
446 kre 10344
447 _je 10325
448 _of 10275
449 nor 10263
450 ong 10263
451 ism 10248
452 laŭ 10140
453 tit 10138
454 iti 10095
455 fin 10090
456 ria 10073
457 rte 10044
458 lek 10043
459 der 10028
460 rik 9921
461 abo 9900
462 ski 9879
463 bla 9867
464 kar 9850
465 min 9845
466 fon 9826
467 taŭ 9780
468 mat 9762
469 sim 9749
470 ind 9745
471 ezi 9735
472 iri 9728
473 apa 9712
474 ĉef 9708
475 non 9702
476 azi 9636
477 rta 9611
478 nac 9601
479 lis 9578
480 pan 9562
481 avi 9551
482 pov 9495
483 eci 9453
484 naj 9452
485 ask 9416
486 eta 9409
487 maj 9369
488 sek 9363
489 tia 9287
490 _am 9280
491 lik 9255
492 sub 9221
493 omp 9208
494 laj 9207
495 ser 9152
496 kul 9143
497 ger 9128
498 ial 9125
499 _ru 9070
500 igo 9059
501 raŭ 9046
502 me_ 9036
503 ida 9030
504 zo_ 9000
505 _kv 8968
506 git 8945
507 rie 8921
508 _uz 8908
509 kor 8895
510 nde 8895
511 ti_ 8894
512 rdo 8860
513 ans 8847
514 nca 8841
515 ĵoj 8814
516 nur 8812
517 ĉe_ 8802
518 nu_ 8775
519 lib 8764
520 mpe 8760
521 bli 8675
522 ler 8663
523 mer 8651
524 ert 8625
525 lab 8617
526 nko 8589
527 ca_ 8547
528 _lu 8535
529 pen 8515
530 var 8511
531 nan 8505
532 rol 8477
533 eli 8446
534 kci 8444
535 gas 8440
536 vol 8436
537 iki 8433
538 ago 8432
539 are 8425
540 pon 8425
541 _ga 8379
542 edi 8369
543 his 8347
544 adi 8332
545 ami 8313
546 bon 8299
547 rib 8267
548 nsi 8252
549 rki 8225
550 nga 8223
551 lat 8218
552 alt 8205
553 uso 8173
554 mun 8162
555 mpl 8152
556 vin 8148
557 met 8128
558 lar 8126
559 fil 8095
560 _bi 8088
561 rav 8044
562 rak 8006
563 ral 7991
564 mpo 7977
565 emb 7944
566 _fl 7915
567 daj 7911
568 ard 7894
569 erv 7862
570 lor 7850
571 tiv 7815
572 oci 7803
573 mem 7792
574 sam 7792
575 uzi 7777
576 gen 7773
577 ies 7757
578 rko 7735
579 tig 7711
580 iva 7708
581 ape 7695
582 _kl 7691
583 ite 7671
584 ema 7663
585 ar_ 7661
586 len 7660
587 oko 7644
588 reĝ 7617
589 ksi 7570
590 smo 7566
591 rus 7562
592 atu 7544
593 dio 7540
594 org 7527
595 ĵo_ 7527
596 us_ 7523
597 eso 7504
598 viv 7493
599 plu 7487
600 _ni 7483
601 ove 7460
602 gi_ 7444
603 mas 7437
604 kia 7411
605 kaz 7410
606 sit 7408
607 tut 7380
608 eko 7372
609 iĝo 7331
610 evi 7318
611 imp 7311
612 ela 7300
613 ret 7277
614 stu 7273
615 log 7272
616 kla 7264
617 kiĝ 7257
618 ine 7249
619 ner 7249
620 gar 7244
621 ĉiu 7200
622 _at 7195
623 bel 7162
624 sin 7161
625 emi 7141
626 rek 7131
627 pla 7112
628 sa_ 7108
629 _sc 7082
630 hom 7082
631 vor 7077
632 gre 7063
633 red 7043
634 kat 7041
635 kel 7002
636 pe_ 6954
637 val 6945
638 arc 6941
639 eo_ 6903
640 ldo 6869
641 pra 6868
642 ark 6865
643 ĝin 6848
644 kur 6841
645 se_ 6840
646 tek 6822
647 soc 6804
648 sig 6788
649 rec 6751
650 _ŝi 6742
651 ido 6736
652 dek 6730
653 rce 6722
654 con 6714
655 bra 6711
656 cie 6709
657 anĝ 6703
658 cid 6664
659 lok 6655
660 ga_ 6626
661 ens 6623
662 _sk 6586
663 vaj 6584
664 ĉi_ 6578
665 ivi 6576
666 ira 6575
667 tel 6565
668 _as 6562
669 du_ 6561
670 os_ 6560
671 ust 6543
672 ril 6520
673 riĝ 6513
674 sci 6504
675 inf 6473
676 rmo 6433
677 rka 6415
678 ska 6415
679 arm 6411
680 aĝo 6403
681 bri 6402
682 sko 6400
683 nce 6349
684 _pi 6346
685 dia 6341
686 elk 6332
687 aso 6331
688 rmi 6330
689 ega 6321
690 mis 6314
691 uda 6305
692 rok 6295
693 eti 6293
694 _pu 6285
695 nun 6281
696 _ol 6247
697 dev 6238
698 ako 6232
699 odo 6209
700 _us 6179
701 atr 6177
702 omu 6174
703 ign 6161
704 one 6159
705 cis 6154
706 rso 6153
707 ula 6151
708 eda 6144
709 loĝ 6125
710 fam 6122
711 sup 6115
712 idi 6105
713 _bu 6102
714 bar 6087
715 rme 6072
716 uti 6068
717 lka 6044
718 tiĝ 6037
719 uza 6037
720 kst 6015
721 ofi 5991
722 umo 5935
723 sio 5925
724 _ŝt 5905
725 omb 5903
726 ibe 5891
727 olu 5891
728 div 5889
729 coj 5878
730 asi 5861
731 _hu 5845
732 izo 5839
733 und 5825
734 rga 5818
735 mit 5815
736 pec 5814
737 amo 5809
738 _im 5784
739 itu 5769
740 rua 5762
741 ve_ 5756
742 ogi 5734
743 flu 5727
744 ice 5722
745 rsa 5720
746 inc 5718
747 efe 5711
748 hel 5698
749 kde 5695
750 ole 5691
751 ote 5677
752 ukt 5650
753 imi 5622
754 etr 5619
755 eve 5618
756 ekd 5610
757 ete 5600
758 _ĉa 5596
759 rla 5584
760 kap 5563
761 oft 5563
762 skr 5550
763 _ŝa 5543
764 ume 5537
765 kro 5533
766 das 5529
767 una 5526
768 _jo 5523
769 ipo 5522
770 ces 5520
771 esi 5506
772 upo 5488
773 ol_ 5487
774 ble 5474
775 iun 5474
776 uo_ 5471
777 eze 5469
778 ima 5452
779 ime 5438
780 ngl 5419
781 spa 5416
782 esk 5414
783 kal 5405
784 rac 5398
785 kam 5397
786 ric 5386
787 rui 5380
788 tad 5364
789 rda 5359
790 lud 5350
791 ede 5349
792 ivo 5336
793 niv 5335
794 nik 5332
795 rem 5328
796 ce_ 5318
797 poj 5317
798 rev 5301
799 tol 5297
800 iro 5292
801 din 5282
802 saj 5280
803 leg 5275
804 om_ 5275
805 enk 5270
806 uta 5261
807 nig 5258
808 cip 5254
809 alp 5250
810 uri 5248
811 kio 5246
812 oje 5244
813 ram 5229
814 ŭro 5229
815 imo 5221
816 vil 5195
817 rba 5191
818 izi 5181
819 lte 5179
820 udo 5176
821 ŝta 5169
822 aga 5160
823 _fu 5149
824 ier 5149
825 tam 5145
826 boj 5103
827 gru 5093
828 sud 5090
829 til 5088
830 ndu 5087
831 ĝan 5084
832 pat 5076
833 nit 5065
834 nal 5058
835 vad 5056
836 _ag 5053
837 dec 5047
838 rup 5045
839 zon 5045
840 _ci 5026
841 uli 5021
842 goj 5018
843 ore 5007
844 riv 4998
845 alf 4983
846 _eŭ 4981
847 eĝo 4968
848 _to 4947
849 foj 4935
850 ubl 4922
851 uis 4915
852 _eb 4902
853 soj 4898
854 arb 4892
855 apo 4887
856 num 4884
857 all 4855
858 _ad 4837
859 ald 4836
860 kce 4819
861 jam 4815
862 rdi 4805
863 opa 4802
864 viĝ 4795
865 edo 4791
866 lim 4788
867 ibr 4783
868 zio 4779
869 sla 4766
870 eŭr 4764
871 fte 4760
872 mpa 4759
873 isp 4755
874 raf 4755
875 erl 4751
876 jun 4751
877 ekv 4737
878 _ca 4735
879 evo 4733
880 oto 4723
881 kas 4714
882 _ed 4711
883 pas 4707
884 ogr 4701
885 irk 4674
886 ept 4663
887 mov 4645
888 zid 4639
889 her 4636
890 tag 4636
891 sol 4613
892 pub 4602
893 kzi 4600
894 gla 4592
895 rim 4571
896 mio 4566
897 esa 4562
898 dit 4556
899 ge_ 4549
900 oks 4535
901 eme 4533
902 opr 4528
903 isk 4516
904 zas 4516
905 tim 4503
906 bur 4501
907 nse 4489
908 amp 4481
909 rei 4480
910 bat 4478
911 rel 4472
912 aco 4470
913 fla 4468
914 aŭt 4460
915 ung 4459
916 ile 4446
917 iza 4443
918 rsi 4443
919 odi 4429
920 nkt 4417
921 lam 4403
922 mia 4403
923 eld 4402
924 ksa 4397
925 _ev 4394
926 spo 4390
927 ipe 4377
928 alk 4364
929 unk 4362
930 nia 4361
931 plo 4361
932 nen 4354
933 pop 4349
934 als 4348
935 aza 4347
936 fun 4343
937 vi_ 4329
938 _n_ 4320
939 kte 4320
940 dep 4301
941 joj 4301
942 _co 4292
943 ezo 4290
944 rod 4276
945 von 4262
946 ea_ 4253
947 rea 4249
948 oku 4243
949 cer 4223
950 mez 4223
951 sal 4204
952 mi_ 4199
953 riz 4195
954 sil 4195
955 dir 4194
956 _m_ 4190
957 gon 4186
958 ĝoj 4180
959 dik 4177
960 ĝen 4167
961 del 4161
962 aka 4156
963 mik 4147
964 tud 4138
965 _bl 4135
966 dem 4127
967 sma 4124
968 fic 4119
969 ego 4098
970 lem 4091
971 ave 4078
972 iis 4076
973 ire 4068
974 kad 4049
975 rne 4049
976 rof 4046
977 okc 4035
978 muz 4015
979 dat 4013
980 gni 4007
981 ngo 4000
982 kze 3997
983 oĝa 3988
984 ĉar 3981
985 ane 3979
986 rap 3978
987 api 3977
988 rde 3969
989 rot 3968
990 rep 3954
991 ikt 3953
992 sul 3951
993 nej 3948
994 dua 3944
995 ek_ 3932
996 zik 3927
997 amb 3899
998 vit 3897
999 kut 3887
1000 kvi 3880
1001 dor 3879
1002 _et 3877
1003 nar 3873
1004 hun 3869
1005 erb 3867
1006 bru 3863
1007 nic 3861
1008 zen 3853
1009 kit 3846
1010 ŝi_ 3841
1011 ada 3837
1012 eva 3817
1013 nge 3801
0 # Licensed to the Apache Software Foundation (ASF) under one or more
1 # contributor license agreements. See the NOTICE file distributed with
2 # this work for additional information regarding copyright ownership.
3 # The ASF licenses this file to You under the Apache License, Version 2.0
4 # (the "License"); you may not use this file except in compliance with
5 # the License. You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 _de 136162
15 os_ 104432
16 de_ 96031
17 _la 81054
18 _co 64732
19 la_ 62987
20 es_ 60027
21 as_ 58615
22 _qu 56200
23 que 55727
24 ue_ 53693
25 ent 53516
26 en_ 52339
27 el_ 50612
28 _en 49870
29 _es 48725
30 ón_ 46841
31 ión 46068
32 est 43016
33 nte 42192
34 te_ 38687
35 _el 38461
36 con 37552
37 _se 37134
38 _pr 35772
39 do_ 34887
40 _lo 34502
41 _y_ 32030
42 _un 30742
43 _re 29625
44 _po 29551
45 aci 29218
46 _a_ 29199
47 los 28792
48 ció 28717
49 _pa 28600
50 to_ 28476
51 ra_ 27785
52 res 27200
53 men 26415
54 ar_ 25118
55 ado 25005
56 com 24457
57 sta 23713
58 par 23570
59 or_ 22898
60 _in 22253
61 se_ 21776
62 al_ 21224
63 por 20662
64 _ha 20266
65 no_ 19782
66 _no 19637
67 las 19623
68 pro 18796
69 nto 18515
70 pre 18215
71 tra 18215
72 ien 18094
73 ta_ 18071
74 cia 18041
75 ion 17727
76 ida 17065
77 nci 16820
78 na_ 16728
79 ica 16595
80 cio 16590
81 mos 16275
82 one 16216
83 del 15972
84 dad 15624
85 lo_ 15488
86 ara 15288
87 ia_ 14766
88 una 14753
89 ant 14486
90 nes 14249
91 _si 14167
92 er_ 14052
93 ame 13930
94 dos 13906
95 _al 13877
96 rec 13848
97 rop 13745
98 on_ 13491
99 an_ 13268
100 _so 13252
101 nta 13173
102 da_ 13104
103 ro_ 13032
104 _di 12932
105 _su 12895
106 ues 12709
107 _me 12586
108 era 12502
109 des 12453
110 per 12285
111 _pe 12224
112 un_ 12196
113 re_ 11905
114 ste 11899
115 mis 11726
116 ten 11678
117 ter 11476
118 enc 11340
119 ad_ 11252
120 tar 11116
121 isi 11112
122 _ca 10938
123 ici 10928
124 les 10770
125 uro 10756
126 sió 10697
127 pue 10688
128 tad 10621
129 mo_ 10163
130 io_ 10146
131 str 10101
132 ons 9952
133 nos 9905
134 _cu 9835
135 ari 9810
136 ada 9771
137 tos 9746
138 ido 9642
139 omi 9583
140 eur 9560
141 tiv 9560
142 ntr 9498
143 _eu 9482
144 esp 9477
145 _tr 9414
146 _mi 9363
147 ect 9362
148 tic 9322
149 ero 9214
150 den 9204
151 emo 9181
152 ere 9157
153 tan 9147
154 ier 9049
155 _ma 8996
156 _ta 8994
157 ía_ 8957
158 tam 8818
159 orm 8810
160 rio 8749
161 _ac 8686
162 esi 8656
163 bre 8655
164 sid 8627
165 ca_ 8595
166 ale 8551
167 ndo 8536
168 mie 8477
169 ist 8448
170 der 8325
171 ide 8284
172 for 8259
173 eci 8249
174 sti 8242
175 ble 8210
176 ene 8196
177 _to 8194
178 ona 8191
179 esa 8081
180 ran 8077
181 _te 7956
182 end 7934
183 uni 7910
184 ope 7900
185 tro 7898
186 ont 7887
187 deb 7873
188 ros 7787
189 mbi 7751
190 sto 7662
191 _pu 7656
192 cho 7529
193 ha_ 7509
194 amb 7491
195 _ex 7420
196 ese 7364
197 int 7253
198 ina 7226
199 das 7081
200 ría 7030
201 _mu 6965
202 art 6917
203 gra 6882
204 cto 6867
205 tes 6857
206 so_ 6839
207 ir_ 6792
208 spe 6704
209 ma_ 6692
210 edi 6656
211 ás_ 6644
212 _nu 6501
213 señ 6470
214 co_ 6455
215 más 6441
216 tod 6398
217 omo 6376
218 ora 6350
219 lam 6322
220 ece 6312
221 ser 6290
222 ita 6283
223 ea_ 6280
224 ria 6261
225 me_ 6256
226 qui 6227
227 sar 6206
228 _as 6167
229 lar 6121
230 obr 6092
231 cue 6081
232 eño 6050
233 iva 6034
234 nue 5959
235 _ap 5930
236 fic 5924
237 amo 5908
238 ali 5900
239 nal 5838
240 rta 5836
241 egu 5791
242 ech 5761
243 iza 5731
244 eri 5730
245 ñor 5723
246 ort 5694
247 rac 5692
248 ico 5684
249 dic 5667
250 ces 5661
251 pon 5640
252 ade 5636
253 sob 5607
254 ren 5603
255 and 5589
256 odo 5582
257 _má 5529
258 emp 5523
259 rma 5485
260 ner 5478
261 ebe 5448
262 ace 5437
263 mpl 5411
264 ios 5407
265 ura 5405
266 ial 5353
267 cer 5337
268 le_ 5271
269 inc 5236
270 su_ 5233
271 uer 5220
272 man 5215
273 sen 5211
274 unt 5206
275 eo_ 5197
276 tas 5168
277 va_ 5165
278 ras 5158
279 tal 5158
280 cas 5154
281 mer 5148
282 cie 5130
283 imp 5117
284 arl 5078
285 mpo 5051
286 nda 5030
287 dec 5018
288 ema 5009
289 ili 4972
290 reg 4968
291 lic 4876
292 pos 4860
293 _ve 4848
294 _gr 4823
295 _im 4816
296 lid 4782
297 ati 4776
298 go_ 4763
299 tur 4735
300 uci 4702
301 med 4693
302 omp 4690
303 én_ 4638
304 _mo 4635
305 ién 4631
306 _fu 4619
307 _cr 4615
308 ate 4578
309 cci 4569
310 anc 4568
311 cre 4551
312 gun 4544
313 abl 4531
314 nfo 4528
315 rla 4516
316 tor 4512
317 ued 4491
318 inf 4486
319 min 4483
320 mbr 4479
321 pec 4452
322 imi 4425
323 car 4406
324 _fi 4395
325 pri 4389
326 bié 4365
327 nió 4365
328 eso 4355
329 rar 4352
330 jo_ 4338
331 nse 4338
332 llo 4335
333 rme 4321
334 iem 4305
335 nti 4303
336 nde 4301
337 _he 4300
338 seg 4296
339 ede 4292
340 íti 4288
341 cul 4283
342 pol 4276
343 ore 4270
344 rad 4258
345 rat 4258
346 rea 4240
347 cua 4237
348 _ad 4228
349 ral 4227
350 ber 4225
351 sin 4202
352 _ci 4201
353 _le 4201
354 dem 4197
355 oci 4196
356 ho_ 4169
357 ivo 4168
358 ejo 4157
359 ust 4130
360 cti 4120
361 emb 4112
362 ver 4101
363 eco 4094
364 erc 4092
365 ual 4083
366 ce_ 4081
367 _an 4068
368 cad 4066
369 _vi 4063
370 _ob 4051
371 tie 4051
372 mas 4028
373 _ti 4026
374 ias 4006
375 _fo 4000
376 ndi 3986
377 hac 3960
378 otr 3955
379 bie 3949
380 rte 3940
381 ert 3938
382 uen 3928
383 pea 3898
384 cos 3884
385 osi 3878
386 olí 3869
387 itu 3867
388 lít 3862
389 ond 3830
390 uie 3829
391 _em 3827
392 gar 3820
393 vo_ 3815
394 ses 3796
395 _ne 3768
396 act 3761
397 lan 3753
398 ano 3752
399 ela 3735
400 sa_ 3724
401 leg 3684
402 uda 3682
403 nic 3664
404 tre 3662
405 son 3648
406 und 3647
407 pli 3635
408 nst 3629
409 ers 3619
410 tem 3608
411 fin 3594
412 dis 3574
413 mun 3572
414 nsi 3564
415 sol 3562
416 spo 3562
417 mar 3555
418 ome 3555
419 gen 3525
420 ya_ 3515
421 aba 3508
422 ini 3502
423 ori 3488
424 are 3485
425 _sa 3467
426 ata 3448
427 ern 3428
428 nac 3427
429 rti 3426
430 ven 3417
431 ism 3368
432 rá_ 3358
433 in_ 3345
434 _li 3344
435 dir 3340
436 lac 3283
437 nid 3274
438 si_ 3260
439 _ot 3258
440 ram 3248
441 cha 3246
442 ult 3241
443 fer 3233
444 ne_ 3232
445 uch 3216
446 ime 3214
447 duc 3206
448 aís 3191
449 _va 3189
450 ref 3180
451 paí 3177
452 egi 3164
453 mpr 3151
454 ito 3147
455 did 3144
456 lem 3142
457 hos 3128
458 tri 3124
459 sos 3118
460 _pl 3114
461 liz 3114
462 _au 3112
463 eda 3111
464 ula 3098
465 tua 3093
466 ifi 3092
467 alm 3090
468 pla 3086
469 ens 3079
470 sit 3076
471 pod 3075
472 rol 3071
473 nas 3064
474 lme 3051
475 stá 3045
476 ele 3044
477 rob 3044
478 _lu 3042
479 _am 3028
480 erd 3021
481 nad 3003
482 _sr 3002
483 sus 2988
484 onc 2986
485 rid 2986
486 tac 2982
487 po_ 2953
488 hay 2931
489 _ni 2929
490 ino 2926
491 peo 2926
492 nec 2913
493 rse 2912
494 alg 2904
495 cac 2898
496 erv 2887
497 _ju 2883
498 ami 2876
499 eme 2876
500 ete 2870
501 _na 2864
502 dar 2863
503 ple 2848
504 can 2846
505 bil 2844
506 ota 2843
507 eti 2833
508 obl 2827
509 mit 2792
510 bro 2790
511 oce 2787
512 han 2775
513 smo 2766
514 opu 2758
515 _fa 2755
516 zar 2749
517 rim 2746
518 rab 2736
519 rro 2736
520 ecu 2730
521 err 2724
522 imo 2724
523 arr 2720
524 soc 2713
525 roc 2704
526 cam 2703
527 cla 2695
528 eno 2678
529 ega 2677
530 ell 2677
531 _or 2676
532 tid 2674
533 _o_ 2673
534 hab 2673
535 dam 2666
536 cir 2660
537 cid 2642
538 ctu 2640
539 ins 2639
540 _ba 2632
541 án_ 2620
542 sis 2617
543 aso 2601
544 nen 2596
545 baj 2594
546 col 2593
547 eni 2592
548 _vo 2591
549 ani 2585
550 efe 2575
551 _do 2570
552 erm 2567
553 cor 2558
554 olu 2547
555 ric 2542
556 eva 2539
557 omu 2535
558 efi 2534
559 sej 2529
560 _ho 2525
561 exi 2525
562 luc 2523
563 nsa 2521
564 fun 2516
565 lta 2515
566 _da 2504
567 rel 2501
568 eta 2498
569 us_ 2489
570 año 2479
571 rmi 2479
572 ret 2474
573 dor 2469
574 tim 2466
575 cen 2463
576 rem 2447
577 igu 2444
578 be_ 2439
579 tab 2438
580 _ge 2428
581 arg 2428
582 erí 2426
583 lec 2420
584 _añ 2418
585 nar 2418
586 asa 2417
587 ima 2411
588 sas 2410
589 uno 2406
590 esu 2405
591 pa_ 2400
592 tit 2399
593 nve 2392
594 sio 2389
595 pet 2388
596 uev 2383
597 ún_ 2383
598 rca 2373
599 _ag 2372
600 sup 2371
601 dif 2370
602 rod 2370
603 asi 2361
604 nce 2358
605 _op 2357
606 _ya 2349
607 ord 2330
608 ire 2328
609 cip 2322
610 muc 2313
611 exp 2312
612 cis 2308
613 sab 2308
614 oda 2307
615 rso 2304
616 vis 2293
617 reo 2290
618 eba 2276
619 rno 2272
620 sie 2272
621 íse 2272
622 dio 2271
623 ena 2267
624 hor 2263
625 esc 2262
626 sig 2261
627 opi 2243
628 abi 2240
629 ogr 2238
630 ced 2230
631 eal 2230
632 oy_ 2229
633 rep 2225
634 vid 2221
635 ind 2215
636 uy_ 2209
637 ibl 2208
638 muy 2207
639 cta 2197
640 dan 2196
641 mpe 2191
642 ama 2184
643 ola 2180
644 gui 2174
645 nan 2173
646 clu 2164
647 sib 2157
648 zac 2153
649 ono 2144
650 opa 2139
651 lib 2125
652 _ec 2123
653 gur 2121
654 ana 2118
655 fue 2118
656 isa 2117
657 nor 2114
658 oso 2113
659 ará 2110
660 lla 2106
661 onf 2106
662 fec 2105
663 abo 2101
664 ocu 2100
665 bar 2093
666 rre 2089
667 odu 2085
668 sr_ 2080
669 iti 2075
670 pen 2075
671 ay_ 2074
672 vos 2073
673 yo_ 2071
674 tin 2070
675 _ll 2067
676 ast 2062
677 _fr 2056
678 ode 2055
679 sea 2047
680 za_ 2041
681 _ce 2031
682 dia 2031
683 sec 2031
684 opo 2020
685 uan 2019
686 vas 2014
687 ans 2007
688 sí_ 2007
689 oll 2000
690 _hu 1988
691 lgu 1988
692 lle 1985
693 evi 1984
694 tir 1984
695 gan 1981
696 rda 1965
697 bra 1964
698 mi_ 1962
699 acu 1957
700 ext 1953
701 rie 1950
702 ga_ 1949
703 rqu 1948
704 tió 1947
705 _ra 1933
706 ala 1932
707 _ab 1931
708 uct 1930
709 lat 1922
710 uga 1915
711 _cl 1914
712 apr 1913
713 rgo 1913
714 rin 1912
715 bat 1911
716 orí 1910
717 acc 1900
718 tru 1897
719 rev 1895
720 cal 1882
721 len 1878
722 amp 1874
723 eje 1874
724 ior 1874
725 nmi 1873
726 pac 1865
727 mod 1864
728 tá_ 1863
729 uma 1862
730 ías 1861
731 lad 1859
732 upo 1856
733 _ar 1850
734 ron 1849
735 bli 1846
736 ume 1844
737 ost 1838
738 sic 1838
739 ncl 1831
740 ños 1831
741 pta 1823
742 _ga 1822
743 uir 1816
744 ole 1812
745 lab 1809
746 icu 1801
747 arc 1795
748 rdo 1785
749 abe 1783
750 lug 1782
751 _ej 1777
752 lim 1774
753 rog 1767
754 iac 1765
755 eli 1764
756 rán 1763
757 iad 1762
758 _só 1756
759 ars 1755
760 gua 1754
761 sól 1754
762 rib 1739
763 ibi 1732
764 ajo 1722
765 zad 1714
766 ivi 1712
767 jet 1711
768 aqu 1697
769 iar 1696
770 pas 1694
771 has 1690
772 lia 1687
773 onv 1686
774 _ah 1684
775 ólo 1682
776 _bi 1681
777 enm 1680
778 apo 1676
779 oca 1673
780 ayo 1670
781 nza 1670
782 ecc 1665
783 _du 1663
784 uac 1660
785 rup 1659
786 ane 1657
787 _ef 1656
788 ez_ 1651
789 may 1650
790 oba 1642
791 ben 1638
792 bje 1637
793 nom 1632
794 obj 1629
795 ote 1628
796 unc 1627
797 asu 1623
798 _ay 1614
799 rci 1610
800 equ 1609
801 rna 1609
802 cep 1608
803 nco 1605
804 ing 1599
805 mac 1597
806 bor 1589
807 agr 1581
808 mej 1581
809 mil 1580
810 uto 1578
811 gru 1575
812 ase 1573
813 hec 1573
814 apl 1564
815 orq 1563
816 mic 1562
817 yor 1561
818 rga 1557
819 rit 1557
820 cit 1554
821 ich 1552
822 abr 1550
823 iga 1549
824 cri 1548
825 ato 1547
826 evo 1541
827 nit 1541
828 ive 1540
829 ine 1537
830 mba 1535
831 omb 1535
832 vot 1534
833 pio 1532
834 mat 1528
835 pun 1527
836 uri 1526
837 cur 1525
838 tec 1525
839 lti 1518
840 sad 1517
841 és_ 1517
842 bem 1516
843 bas 1512
844 _us 1510
845 nis 1509
846 osa 1509
847 hem 1508
848 hum 1500
849 _oc 1498
850 bla 1496
851 isp 1496
852 aho 1494
853 jor 1478
854 así 1476
855 rde 1476
856 _aq 1475
857 anz 1470
858 red 1468
859 sul 1466
860 xis 1462
861 eng 1454
862 sum 1448
863 uis 1447
864 apa 1445
865 orr 1444
866 ayu 1442
867 cum 1439
868 quí 1436
869 rto 1434
870 ite 1433
871 _bu 1431
872 rot 1430
873 yud 1429
874 eto 1424
875 noc 1424
876 die 1420
877 ian 1420
878 pal 1418
879 ulo 1416
880 ibu 1407
881 val 1403
882 arí 1402
883 emá 1402
884 _ev 1401
885 adi 1401
886 sca 1400
887 iud 1392
888 ciu 1386
889 alt 1380
890 bit 1378
891 cab 1378
892 sun 1375
893 fra 1374
894 _at 1373
895 _fe 1361
896 van 1359
897 esd 1358
898 ego 1353
899 he_ 1353
900 sal 1353
901 sde 1352
902 aut 1349
903 erá 1346
904 poy 1341
905 lig 1338
906 jer 1331
907 teg 1331
908 nem 1326
909 nóm 1323
910 ruc 1319
911 oco 1311
912 uta 1310
913 _pi 1308
914 aya 1306
915 qué 1306
916 rom 1306
917 upu 1304
918 spu 1302
919 gue 1300
920 tán 1300
921 ibe 1299
922 mid 1296
923 rtu 1293
924 il_ 1292
925 aña 1289
926 onó 1286
927 plo 1284
928 ife 1280
929 _af 1276
930 dim 1272
931 cim 1271
932 ño_ 1271
933 log 1268
934 ómi 1268
935 lus 1263
936 ué_ 1250
937 uid 1244
938 sam 1238
939 oli 1232
940 def 1231
941 _go 1228
942 ept 1227
943 lis 1227
944 isc 1225
945 obs 1220
946 til 1218
947 mad 1213
948 gio 1212
949 aro 1211
950 cup 1209
951 lev 1208
952 egl 1204
953 rra 1204
954 usi 1199
955 cil 1195
956 ba_ 1191
957 aca 1188
958 vez 1188
959 sla 1184
960 _és 1183
961 nsp 1178
962 inv 1177
963 rlo 1176
964 tom 1174
965 _tu 1166
966 gla 1166
967 ill 1165
968 odr 1163
969 mpa 1161
970 spa 1161
971 eña 1157
972 obi 1155
973 odi 1155
974 is_ 1152
975 pes 1150
976 hoy 1148
977 ted 1146
978 var 1141
979 día 1140
980 nsu 1140
981 rco 1138
982 nam 1136
983 _yo 1131
984 rvi 1131
985 cap 1128
986 rri 1127
987 nif 1124
988 uel 1121
989 eja 1120
990 udi 1119
991 pin 1115
992 tud 1114
993 vel 1112
994 tuc 1107
995 uso 1106
996 vic 1106
997 sem 1105
998 últ 1105
999 ipa 1103
1000 dop 1100
1001 fre 1098
1002 aja 1097
1003 aus 1097
1004 gas 1096
1005 ipi 1094
1006 pid 1089
1007 exc 1086
1008 det 1083
1009 _úl 1082
1010 _gu 1077
1011 mes 1077
1012 zo_ 1077
1013 bri 1075
0 # Licensed to the Apache Software Foundation (ASF) under one or more
1 # contributor license agreements. See the NOTICE file distributed with
2 # this work for additional information regarding copyright ownership.
3 # The ASF licenses this file to You under the Apache License, Version 2.0
4 # (the "License"); you may not use this file except in compliance with
5 # the License. You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 se_ 21814
15 mis 19747
16 ise 19736
17 ja_ 17655
18 te_ 17213
19 use 16214
20 _ja 15845
21 st_ 13526
22 est 13298
23 ud_ 13260
24 ste 13210
25 sta 12994
26 _ko 12908
27 ust 12804
28 ast 11912
29 ist 11713
30 ami 11710
31 _ka 11705
32 le_ 11520
33 dus 11464
34 on_ 10832
35 ne_ 10685
36 _te 10569
37 sel 10497
38 tus 10458
39 ine 10055
40 _va 9735
41 end 9638
42 es_ 9627
43 _on 9512
44 ks_ 9494
45 de_ 9290
46 and 9066
47 us_ 9013
48 lis 8954
49 nda 8754
50 _se 8736
51 id_ 8699
52 aja 8273
53 eri 8195
54 ed_ 8156
55 ava 8116
56 sti 8109
57 lik 8028
58 _ee 7949
59 el_ 7931
60 min 7913
61 ga_ 7697
62 val 7636
63 tud 7571
64 ti_ 7392
65 tee 7203
66 ali 7197
67 ate 7136
68 _ta 7108
69 ing 7031
70 ele 6956
71 ees 6954
72 ide 6935
73 _ku 6890
74 lt_ 6750
75 is_ 6643
76 kon 6565
77 da_ 6556
78 tsi 6474
79 töö 6444
80 ndu 6377
81 eks 6338
82 ete 6334
83 as_ 6328
84 aks 6245
85 _ra 6227
86 _ma 6185
87 ad_ 6040
88 tam 5993
89 _ar 5952
90 atu 5921
91 _võ 5914
92 eer 5902
93 iku 5846
94 aas 5743
95 _vä 5723
96 _ni 5677
97 _in 5534
98 sed 5525
99 tat 5511
100 _mi 5382
101 nna 5339
102 uur 5295
103 met 5180
104 dam 5128
105 või 5104
106 _pr 5064
107 _to 5017
108 _ol 5016
109 _me 4954
110 ren 4867
111 lus 4818
112 oli 4795
113 tme 4751
114 _tö 4749
115 sek 4747
116 ega 4735
117 ng_ 4702
118 _jä 4678
119 _su 4642
120 tte 4565
121 imi 4564
122 lii 4557
123 _tu 4526
124 eva 4520
125 ata 4480
126 kes 4435
127 ses 4423
128 _li 4335
129 _sa 4331
130 ite 4320
131 tel 4309
132 stu 4294
133 oon 4263
134 koo 4259
135 al_ 4256
136 are 4251
137 ade 4247
138 teg 4239
139 _et 4214
140 tse 4206
141 oni 4202
142 _ke 4195
143 kas 4179
144 rit 4176
145 tes 4174
146 tav 4159
147 _pa 4130
148 ita 4064
149 nin 4040
150 sus 4033
151 emi 4025
152 maa 4001
153 aal 3976
154 ole 3969
155 eta 3968
156 ta_ 3899
157 ili 3886
158 vas 3881
159 vad 3868
160 asu 3852
161 ool 3842
162 ada 3756
163 ka_ 3737
164 ema 3701
165 ma_ 3701
166 vus 3665
167 vat 3638
168 pro 3621
169 _põ 3618
170 itu 3616
171 ndi 3609
172 si_ 3609
173 na_ 3582
174 ima 3543
175 koh 3515
176 nud 3507
177 rii 3506
178 ald 3483
179 ama 3465
180 ab_ 3461
181 kse 3452
182 uri 3444
183 ngu 3439
184 kus 3433
185 its 3425
186 ku_ 3420
187 uta 3405
188 ini 3372
189 des 3367
190 suu 3352
191 ika 3342
192 _si 3321
193 jan 3315
194 lit 3309
195 esk 3307
196 ni_ 3307
197 _aa 3296
198 lin 3277
199 et_ 3261
200 eet 3257
201 ioo 3251
202 _el 3248
203 lle 3248
204 ime 3245
205 loo 3238
206 ala 3235
207 sio 3229
208 sut 3227
209 ekt 3224
210 umi 3223
211 _ül 3208
212 ood 3198
213 maj 3176
214 mee 3166
215 inn 3135
216 _ha 3124
217 onn 3102
218 jää 3093
219 _pe 3089
220 _al 3085
221 tak 3061
222 _os 3055
223 elt 3036
224 ule 3026
225 kui 3025
226 eli 3019
227 ett 3018
228 _re 3000
229 utu 2982
230 tik 2977
231 _ju 2941
232 ell 2940
233 ter 2893
234 ene 2884
235 too 2883
236 _pi 2879
237 li_ 2868
238 võt 2854
239 ase 2835
240 väl 2832
241 lem 2826
242 str 2816
243 saa 2804
244 _po 2803
245 _a_ 2790
246 mus 2784
247 kti 2773
248 tas 2768
249 nis 2763
250 _an 2756
251 gus 2738
252 see 2736
253 _so 2718
254 etu 2709
255 ik_ 2699
256 igi 2690
257 va_ 2683
258 tul 2671
259 di_ 2670
260 _ri 2667
261 mat 2634
262 ure 2629
263 õim 2624
264 alt 2622
265 alu 2621
266 mal 2615
267 tal 2602
268 ent 2600
269 me_ 2592
270 gu_ 2569
271 sse 2569
272 osa 2564
273 _tä 2548
274 kku 2529
275 idu 2525
276 rim 2494
277 ida 2493
278 äät 2492
279 ots 2490
280 esi 2478
281 sis 2476
282 ätm 2474
283 lda 2473
284 oma 2473
285 eis 2467
286 arv 2463
287 ahe 2456
288 rin 2454
289 lli 2452
290 kog 2450
291 ööt 2450
292 eel 2446
293 hen 2440
294 kor 2439
295 ori 2439
296 gi_ 2428
297 eid 2422
298 ege 2421
299 _ne 2419
300 äär 2407
301 iti 2391
302 rak 2391
303 _üh 2385
304 tev 2383
305 _la 2378
306 aga 2368
307 _kõ 2360
308 isa 2355
309 all 2354
310 oos 2354
311 ike 2347
312 lev 2341
313 ead 2314
314 ri_ 2312
315 hin 2307
316 ind 2304
317 _es 2300
318 ogu 2293
319 las 2290
320 ats 2288
321 iit 2282
322 _er 2274
323 rah 2273
324 ond 2272
325 adu 2255
326 evõ 2245
327 nim 2234
328 set 2231
329 jal 2225
330 ati 2224
331 pea 2219
332 iik 2218
333 ari 2201
334 põl 2189
335 lek 2188
336 _ki 2183
337 kaa 2172
338 lja 2168
339 gev 2164
340 iig 2159
341 iir 2159
342 eed 2153
343 uma 2151
344 ude 2141
345 tsu 2134
346 ui_ 2132
347 kul 2126
348 tis 2117
349 tea 2116
350 roo 2114
351 ke_ 2111
352 _as 2097
353 tid 2097
354 _mu 2093
355 ikk 2089
356 pii 2089
357 ei_ 2073
358 ani 2072
359 seg 2072
360 ute 2064
361 dad 2055
362 oet 2053
363 mas 2043
364 nde 2043
365 nik 2029
366 evu 2025
367 ost 2025
368 mes 2021
369 _ei 2019
370 jär 2012
371 nts 2009
372 tek 2009
373 tur 2003
374 sei 2000
375 tab 1996
376 eem 1984
377 ves 1962
378 tad 1948
379 nõu 1940
380 sid 1934
381 _pu 1929
382 _i_ 1921
383 toe 1921
384 del 1915
385 _kä 1911
386 ähe 1908
387 _om 1906
388 ess 1902
389 älj 1899
390 ran 1896
391 _e_ 1887
392 jat 1885
393 la_ 1885
394 mi_ 1882
395 tuu 1872
396 ale 1867
397 põh 1864
398 ant 1854
399 rid 1854
400 ärg 1847
401 vah 1844
402 pol 1843
403 _en 1841
404 tii 1834
405 par 1829
406 uli 1828
407 dat 1826
408 iga 1823
409 lan 1815
410 ets 1814
411 isi 1814
412 sii 1813
413 oht 1807
414 _lo 1800
415 _ve 1799
416 res 1789
417 _vi 1788
418 _st 1779
419 mid 1778
420 vii 1778
421 rat 1777
422 iiv 1764
423 odu 1764
424 oha 1764
425 lse 1759
426 _uu 1749
427 _hi 1740
428 sam 1740
429 med 1735
430 ame 1734
431 il_ 1729
432 sal 1727
433 imu 1715
434 olu 1709
435 ras 1708
436 ede 1706
437 sea 1694
438 usl 1690
439 ia_ 1689
440 nev 1687
441 usi 1687
442 _le 1686
443 tlu 1686
444 tu_ 1684
445 nen 1679
446 teh 1675
447 iva 1669
448 soo 1668
449 mak 1667
450 llu 1662
451 sli 1653
452 eda 1651
453 aid 1650
454 _lä 1649
455 nd_ 1649
456 poo 1648
457 ul_ 1648
458 ane 1646
459 iis 1643
460 art 1641
461 juh 1641
462 oot 1638
463 egi 1630
464 kko 1630
465 üle 1630
466 mil 1629
467 sem 1623
468 mit 1620
469 _na 1618
470 ord 1618
471 _mä 1616
472 anu 1616
473 tle 1611
474 asi 1608
475 mär 1608
476 rgi 1605
477 at_ 1603
478 ris 1598
479 sit 1598
480 rra 1589
481 uud 1589
482 _nõ 1585
483 gia 1579
484 kat 1579
485 een 1577
486 inf 1575
487 hel 1570
488 etm 1566
489 sia 1564
490 ära 1563
491 _tõ 1560
492 tmi 1556
493 äit 1542
494 _s_ 1541
495 sak 1533
496 tei 1532
497 ara 1527
498 hal 1525
499 har 1520
500 he_ 1518
501 õig 1516
502 aad 1515
503 uut 1514
504 _mõ 1512
505 muu 1512
506 ake 1509
507 als 1504
508 orr 1504
509 tra 1502
510 eur 1501
511 _kü 1498
512 _eu 1496
513 õi_ 1496
514 ral 1493
515 taj 1491
516 ken 1482
517 sa_ 1478
518 uro 1477
519 uru 1477
520 oom 1475
521 ill 1472
522 raa 1466
523 aig 1464
524 isk 1464
525 enu 1460
526 ige 1460
527 era 1456
528 nem 1456
529 ese 1454
530 ulu 1454
531 em_ 1453
532 skk 1453
533 aar 1446
534 ium 1444
535 jad 1441
536 tor 1440
537 ion 1436
538 per 1434
539 eng 1423
540 ult 1423
541 uko 1420
542 ühi 1420
543 sva 1413
544 und 1411
545 aka 1410
546 gra 1407
547 ien 1402
548 let 1401
549 vaj 1401
550 kõi 1398
551 iaa 1397
552 dis 1394
553 ldu 1393
554 tsa 1392
555 ina 1382
556 _t_ 1375
557 gut 1375
558 tru 1370
559 eti 1366
560 ee_ 1365
561 eku 1364
562 kid 1359
563 uht 1359
564 hai 1358
565 tar 1355
566 jek 1354
567 dav 1348
568 udu 1347
569 ahv 1344
570 eld 1344
571 _kr 1341
572 oog 1341
573 his 1336
574 nat 1333
575 väh 1332
576 usa 1330
577 elu 1326
578 kli 1321
579 kut 1321
580 eb_ 1320
581 toi 1320
582 du_ 1319
583 esm 1317
584 täi 1315
585 sen 1306
586 uss 1302
587 uva 1299
588 sim 1295
589 hta 1293
590 aam 1292
591 edi 1291
592 ela 1291
593 võr 1291
594 ruk 1287
595 _nä 1284
596 vis 1284
597 les 1282
598 õtt 1281
599 oor 1279
600 gil 1278
601 van 1277
602 ki_ 1275
603 usk 1270
604 rve 1263
605 ogi 1262
606 iid 1261
607 mää 1261
608 asa 1259
609 igu 1259
610 õpp 1245
611 iim 1244
612 iks 1244
613 ile 1244
614 isu 1242
615 rdi 1241
616 akt 1239
617 nas 1238
618 üsi 1236
619 dit 1230
620 tet 1227
621 kal 1226
622 riu 1226
623 ako 1224
624 asv 1224
625 lum 1220
626 _th 1215
627 ram 1215
628 ukt 1213
629 aot 1212
630 lat 1208
631 rvi 1208
632 iko 1204
633 ktu 1204
634 ogr 1204
635 olo 1203
636 sin 1202
637 nus 1200
638 erv 1197
639 it_ 1194
640 tag 1194
641 abi 1192
642 ait 1191
643 smä 1191
644 tri 1191
645 _is 1189
646 oop 1187
647 omi 1175
648 aa_ 1169
649 nve 1167
650 ain 1162
651 rio 1160
652 lu_ 1154
653 amm 1153
654 ge_ 1153
655 net 1150
656 nte 1144
657 ühe 1144
658 _he 1143
659 hul 1143
660 kin 1141
661 reg 1141
662 mad 1133
663 uid 1133
664 re_ 1131
665 opa 1129
666 una 1129
667 taa 1126
668 õll 1126
669 rtu 1117
670 er_ 1116
671 iin 1111
672 vee 1111
673 ivi 1110
674 men 1109
675 kir 1108
676 pa_ 1108
677 aat 1106
678 iki 1105
679 oje 1105
680 the 1105
681 ksu 1104
682 roj 1104
683 vai 1103
684 uts 1102
685 kau 1101
686 _õp 1100
687 _ed 1099
688 ub_ 1097
689 ssi 1095
690 rog 1094
691 san 1094
692 sku 1093
693 õhi 1092
694 aan 1090
695 ska 1089
696 kav 1085
697 eni 1082
698 nan 1077
699 puu 1076
700 det 1075
701 sol 1074
702 pri 1072
703 ere 1071
704 inu 1069
705 üst 1068
706 kum 1066
707 kõr 1065
708 nee 1062
709 ppe 1061
710 iss 1056
711 eme 1054
712 nii 1052
713 ib_ 1050
714 in_ 1048
715 ang 1045
716 sko 1043
717 htu 1039
718 tun 1039
719 nil 1038
720 ivs 1035
721 sas 1032
722 oll 1030
723 has 1028
724 ena 1027
725 ire 1027
726 kai 1026
727 kom 1026
728 spo 1026
729 lul 1024
730 egu 1023
731 süs 1023
732 ötu 1021
733 _av 1020
734 ato 1019
735 por 1013
736 rus 1012
737 õte 1012
738 kud 1010
739 rem 1010
740 rel 1009
741 nal 1008
742 aha 1007
743 vää 1006
744 näi 1003
745 ehn 1002
746 ve_ 1001
747 avi 995
748 pid 995
749 rva 995
750 avu 990
751 _l_ 989
752 täh 989
753 aru 988
754 til 988
755 ii_ 982
756 _pä 980
757 ulg 980
758 eal 976
759 üld 976
760 das 971
761 seo 971
762 _sü 970
763 üks 968
764 lee 967
765 lla 967
766 uni 967
767 inv 960
768 õrg 960
769 abe 959
770 nit 955
771 pet 951
772 naa 947
773 ana 946
774 _sõ 944
775 käi 944
776 tut 944
777 _mo 942
778 ive 941
779 ree 941
780 orm 936
781 ber 935
782 nni 935
783 ven 935
784 kva 932
785 kto 930
786 otl 930
787 bi_ 923
788 äht 923
789 õle 923
790 ont 920
791 ann 918
792 öta 917
793 kun 916
794 rja 915
795 nfo 914
796 jus 912
797 ort 912
798 öös 912
799 emu 910
800 ndl 909
801 sot 909
802 spe 909
803 man 907
804 smi 905
805 _ai 901
806 gis 896
807 dud 895
808 ärt 895
809 nne 892
810 ra_ 889
811 sat 886
812 tli 886
813 tao 885
814 nam 881
815 lid 879
816 mei 878
817 oim 878
818 _ot 877
819 _ho 873
820 kok 873
821 kol 873
822 rko 873
823 gem 871
824 pär 868
825 av_ 865
826 uba 864
827 eeg 855
828 ikl 847
829 iri 845
830 okk 845
831 int 843
832 kuu 842
833 tum 841
834 hte 840
835 did 839
836 _di 836
837 ner 836
838 lge 834
839 ial 832
840 nt_ 832
841 lma 828
842 pan 828
843 nid 827
844 _tr 826
845 dme 826
846 stö 826
847 irj 822
848 _lõ 821
849 ass 821
850 _de 820
851 ngi 820
852 ars 817
853 lmi 817
854 vit 817
855 _ro 816
856 amu 816
857 mek 813
858 ria 813
859 han 811
860 rav 811
861 tan 809
862 tit 806
863 ism 802
864 aud 800
865 usv 800
866 den 798
867 jõu 798
868 alg 797
869 _ab 796
870 sil 796
871 eko 795
872 uun 795
873 õiv 794
874 _n_ 792
875 ha_ 792
876 kan 790
877 var 790
878 hti 789
879 itm 789
880 mõj 789
881 ona 789
882 sul 789
883 õju 789
884 irk 787
885 kur 787
886 itl 786
887 _no 785
888 õik 785
889 ndm 783
890 of_ 783
891 ilm 779
892 uti 779
893 vse 779
894 odi 778
895 olt 777
896 õhj 773
897 ila 770
898 gas 769
899 ju_ 767
900 _fi 765
901 äli 765
902 _aj 764
903 _ak 764
904 _of 764
905 lga 762
906 aeg 761
907 _ti 759
908 an_ 759
909 sik 757
910 luk 756
911 mmi 756
912 ior 755
913 lad 755
914 nti 754
915 tim 751
916 ärk 750
917 _u_ 747
918 pal 745
919 äie 744
920 _kv 743
921 kel 743
922 for 742
923 lam 742
924 su_ 740
925 uha 740
926 kül 738
927 alm 737
928 gud 737
929 dab 736
930 ski 734
931 ldi 731
932 äbi 731
933 otu 729
934 rma 729
935 suh 729
936 _o_ 728
937 _kl 726
938 läb 725
939 _hu 724
940 hoi 720
941 olm 720
942 sto 720
943 öst 720
944 ula 717
945 juu 716
946 tin 715
947 ügi 715
948 hus 714
949 len 714
950 uvi 714
951 hva 713
952 _sä 712
953 pla 712
954 ksi 708
955 rge 706
956 usm 704
957 osi 700
958 noo 699
959 _jo 698
960 ekk 697
961 jon 697
962 dal 695
963 eos 693
964 vab 692
965 um_ 690
966 aba 687
967 gik 686
968 rik 686
969 aav 683
970 api 683
971 ten 682
972 _m_ 680
973 lep 680
974 vi_ 680
975 _r_ 678
976 fra 677
977 jaa 677
978 kts 676
979 öö_ 676
980 ilj 674
981 ode 674
982 eha 673
983 kri 671
984 onk 670
985 rot 670
986 nei 669
987 sur 669
988 nst 666
989 aki 665
990 ahu 662
991 erg 662
992 jul 662
993 _d_ 660
994 eka 660
995 otm 660
996 nes 658
997 õus 658
998 pin 657
999 dko 655
1000 üli 654
1001 eas 653
1002 hõi 653
1003 oid 650
1004 ske 649
1005 itt 648
1006 eam 646
1007 sad 644
1008 rst 643
1009 ll_ 642
1010 _ag 641
1011 _ük 640
1012 ev_ 640
1013 väi 640
0 # Licensed to the Apache Software Foundation (ASF) under one or more
1 # contributor license agreements. See the NOTICE file distributed with
2 # this work for additional information regarding copyright ownership.
3 # The ASF licenses this file to You under the Apache License, Version 2.0
4 # (the "License"); you may not use this file except in compliance with
5 # the License. You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 en_ 106131
15 in_ 58268
16 ta_ 55626
17 tä_ 53283
18 an_ 53187
19 ist 47697
20 sta 43336
21 on_ 43007
22 ja_ 39277
23 _ja 34833
24 ett 34518
25 ttä 34382
26 ise 34375
27 sa_ 31387
28 tta 31331
29 sen 31131
30 _on 30416
31 _va 29376
32 _jo 27765
33 mis 27422
34 lis 27257
35 aan 26778
36 ssa 26557
37 _ko 26334
38 ksi 25541
39 itt 25386
40 ais 25194
41 än_ 25023
42 isi 23975
43 est 23320
44 _tä 22725
45 _ta 22098
46 taa 21547
47 _si 21503
48 lla 21331
49 _ka 21096
50 _et 20955
51 aa_ 20905
52 lli 20722
53 si_ 20631
54 ten 19829
55 ste 19820
56 _se 19749
57 den 19664
58 _ol 19431
59 stä 19131
60 iss 19127
61 ell 19029
62 me_ 18781
63 een 18638
64 ast 18147
65 lle 17984
66 ia_ 17761
67 uks 17657
68 itä 17586
69 kse 17163
70 mme 17101
71 ill 16970
72 _ku 16949
73 _mi 16823
74 voi 16728
75 le_ 16698
76 _to 16348
77 at_ 16024
78 sia 15940
79 la_ 15811
80 all 15602
81 tel 15469
82 ti_ 15218
83 ois 15157
84 sti 14987
85 ust 14795
86 ses 14778
87 val 14766
88 iin 14638
89 ään 14348
90 sit 14245
91 et_ 14188
92 nen 14129
93 oit 14082
94 ole 13982
95 _pa 13728
96 ava 13727
97 mie 13701
98 toi 13651
99 tte 13598
100 utt 13595
101 ine 13313
102 tei 13274
103 _tu 13273
104 ka_ 13210
105 sii 13167
106 tav 13165
107 kan 13159
108 täm 13097
109 aik 12891
110 ide 12841
111 _sa 12591
112 ite 12212
113 kin 12161
114 ent 12138
115 vat 12064
116 oll 12062
117 lai 12050
118 _jä 12011
119 eis 11981
120 omi 11784
121 eur 11749
122 _mu 11697
123 tää 11692
124 ain 11679
125 _me 11653
126 _ma 11650
127 sä_ 11514
128 ssä 11466
129 tam 11435
130 lä_ 11242
131 stu 11222
132 _eu 11214
133 kai 11196
134 iit 11194
135 tet 11103
136 ämä 11095
137 ien 11081
138 ää_ 10806
139 _yh 10753
140 maa 10734
141 oli 10717
142 _ei 10690
143 llä 10687
144 _pu 10684
145 min 10678
146 _te 10528
147 suu 10307
148 ess 10289
149 men 10188
150 alt 10177
151 uro 10157
152 ion 9915
153 _ha 9860
154 oim 9812
155 emm 9786
156 roo 9786
157 ita 9692
158 sio 9489
159 tee 9445
160 _ke 9393
161 oop 9353
162 nta 9269
163 ikk 9224
164 ans 9216
165 ott 9213
166 ude 9208
167 hte 9161
168 ssi 9141
169 arv 9081
170 pan 8995
171 yht 8981
172 tuk 8825
173 sel 8791
174 alu 8765
175 _as 8758
176 kom 8684
177 _vo 8664
178 eri 8658
179 ike 8619
180 imi 8605
181 tti 8560
182 iel 8555
183 sin 8480
184 uut 8383
185 ei_ 8363
186 lta 8338
187 yks 8321
188 _my 8216
189 ost 8198
190 ami 8070
191 tar 8014
192 se_ 8001
193 _al 7974
194 myö 7965
195 na_ 7923
196 tai 7911
197 täv 7907
198 kaa 7906
199 _ai 7886
200 _kä 7856
201 set 7850
202 opa 7849
203 tus 7841
204 vas 7747
205 _su 7738
206 _ti 7713
207 asi 7683
208 sis 7671
209 _hy 7586
210 see 7577
211 ala 7524
212 nne 7523
213 par 7490
214 äse 7465
215 tin 7453
216 _ra 7450
217 kos 7409
218 _ni 7394
219 _li 7360
220 aat 7266
221 jäs 7242
222 _en 7198
223 eid 7197
224 nsa 7196
225 int 7180
226 tii 7157
227 _pe 7131
228 un_ 7123
229 ass 7122
230 _ar 7116
231 vai 7115
232 puh 7108
233 nna 7086
234 hal 7084
235 isu 7001
236 per 6960
237 att 6924
238 lit 6913
239 hdo 6893
240 _la 6884
241 ali 6869
242 saa 6868
243 nti 6845
244 _vi 6836
245 muk 6823
246 ity 6822
247 elm 6815
248 esi 6812
249 ika 6810
250 oik 6776
251 jot 6699
252 lin 6684
253 ann 6683
254 yös 6678
255 äyt 6660
256 jen 6637
257 joi 6633
258 tie 6626
259 _nä 6566
260 rit 6551
261 hyv 6530
262 _vä 6529
263 nii 6492
264 kun 6440
265 lma 6438
266 ioi 6408
267 nki 6382
268 oni 6379
269 osk 6377
270 _pi 6371
271 _es 6358
272 ark 6352
273 ama 6333
274 yvä 6320
275 _pä 6266
276 eet 6266
277 sal 6257
278 del 6245
279 ävä 6228
280 oid 6226
281 pää 6208
282 lii 6182
283 ske 6176
284 man 6170
285 _po 6160
286 ät_ 6152
287 _er 6149
288 va_ 6146
289 vaa 6117
290 us_ 6083
291 uud 6056
292 ant 6053
293 äis 6009
294 rvo 5977
295 keu 5971
296 rki 5942
297 eli 5928
298 äll 5924
299 sto 5872
300 iet 5871
301 ene 5840
302 nte 5829
303 tio 5781
304 ös_ 5773
305 väl 5753
306 _so 5744
307 kui 5738
308 ttu 5738
309 mei 5711
310 _ne 5695
311 aja 5690
312 män 5673
313 ame 5657
314 uhe 5631
315 isa 5604
316 vuo 5591
317 ut_ 5587
318 lam 5586
319 oma 5565
320 ytt 5558
321 _ki 5554
322 tka 5541
323 dän 5525
324 mit 5504
325 eva 5492
326 rla 5492
327 enk 5470
328 alo 5460
329 imm 5451
330 mää 5445
331 uol 5435
332 _os 5408
333 ele 5355
334 arl 5345
335 sek 5341
336 sil 5322
337 _hu 5300
338 ni_ 5297
339 iä_ 5296
340 itu 5293
341 ova 5288
342 elu 5286
343 mä_ 5259
344 vät 5258
345 apa 5245
346 kka 5244
347 _he 5223
348 unn 5201
349 käy 5187
350 san 5176
351 oht 5169
352 kki 5166
353 oin 5156
354 inn 5147
355 laa 5122
356 net 5115
357 _ky 5101
358 ehd 5067
359 pol 5052
360 van 5049
361 ano 5036
362 kä_ 5027
363 tän 5013
364 idä 5008
365 työ 4985
366 lem 4964
367 lti 4938
368 lua 4937
369 tan 4921
370 emi 4907
371 ute 4896
372 isä 4889
373 tun 4881
374 ön_ 4841
375 enn 4839
376 oka 4839
377 äst 4827
378 nee 4822
379 _vu 4818
380 ime 4817
381 eks 4805
382 var 4776
383 tis 4751
384 äli 4713
385 nio 4694
386 uka 4681
387 sty 4675
388 amm 4668
389 uus 4660
390 äks 4659
391 nni 4644
392 kes 4639
393 muu 4576
394 nin 4572
395 vie 4569
396 its 4563
397 dot 4561
398 tal 4550
399 huo 4549
400 uni 4541
401 hta 4537
402 eit 4522
403 mai 4508
404 tty 4481
405 lue 4476
406 osi 4454
407 _lu 4445
408 ota 4434
409 jok 4428
410 _un 4427
411 _ed 4416
412 iaa 4414
413 kuu 4410
414 kys 4397
415 utu 4393
416 iva 4376
417 iik 4371
418 _ov 4353
419 teh 4340
420 eru 4335
421 äin 4333
422 sku 4307
423 oje 4287
424 elt 4274
425 osa 4268
426 rus 4267
427 uom 4256
428 _ty 4252
429 tul 4240
430 ne_ 4239
431 ote 4238
432 taj 4235
433 sim 4214
434 len 4180
435 ens 4177
436 tse 4170
437 jos 4167
438 pit 4146
439 yt_ 4140
440 mat 4137
441 tuu 4136
442 pal 4096
443 imu 4077
444 nä_ 4068
445 eil 4052
446 des 4051
447 _oi 4048
448 esk 4035
449 mer 4001
450 ino 3999
451 sää 3976
452 tys 3965
453 lan 3955
454 ies 3926
455 äät 3926
456 hem 3915
457 lee 3907
458 nyt 3901
459 aut 3896
460 es_ 3893
461 nis 3890
462 täi 3888
463 hin 3863
464 täy 3840
465 _an 3834
466 mas 3830
467 näi 3829
468 kev 3827
469 uur 3818
470 ata 3817
471 lev 3816
472 neu 3807
473 tek 3791
474 les 3775
475 äär 3772
476 aal 3765
477 koh 3760
478 _eh 3749
479 pai 3740
480 ana 3736
481 otk 3728
482 uvo 3722
483 _uu 3717
484 euv 3715
485 ati 3711
486 ris 3706
487 ina 3699
488 ken 3691
489 ian 3684
490 yis 3681
491 ua_ 3676
492 vii 3641
493 kau 3630
494 ihe 3628
495 lou 3628
496 soi 3624
497 joh 3606
498 ämi 3600
499 vä_ 3568
500 ahd 3561
501 stö 3561
502 env 3555
503 yst 3552
504 uit 3542
505 oa_ 3530
506 kis 3525
507 rke 3525
508 dis 3518
509 hen 3513
510 oja 3511
511 lmi 3505
512 uot 3500
513 tuo 3498
514 puo 3494
515 hän 3484
516 sem 3483
517 kut 3470
518 ilm 3457
519 aks 3452
520 ltä 3452
521 nka 3442
522 ila 3438
523 til 3433
524 kei 3423
525 ysy 3408
526 erk 3391
527 täs 3376
528 aki 3363
529 kol 3363
530 jär 3360
531 ail 3357
532 ode 3344
533 vos 3342
534 loi 3335
535 oks 3334
536 ran 3332
537 iti 3326
538 eta 3325
539 mmä 3325
540 uun 3325
541 väk 3325
542 ton 3324
543 opi 3316
544 nut 3307
545 jat 3302
546 uin 3299
547 oss 3288
548 _py 3284
549 aus 3271
550 kii 3253
551 iks 3244
552 unt 3244
553 li_ 3236
554 ska 3231
555 onn 3223
556 ee_ 3198
557 tap 3190
558 ätt 3186
559 elä 3169
560 rje 3167
561 eut 3166
562 kas 3164
563 nva 3160
564 sym 3160
565 onk 3141
566 uva 3135
567 käs 3134
568 eti 3133
569 eel 3125
570 ulu 3115
571 ppa 3099
572 tyi 3096
573 otu 3095
574 atk 3092
575 sop 3089
576 lut 3084
577 ihi 3079
578 ivä 3079
579 mmi 3078
580 ink 3077
581 jon 3067
582 _sy 3062
583 ntä 3044
584 ema 3029
585 nnö 3023
586 nan 3019
587 äne 3009
588 ärj 3002
589 io_ 2989
590 kok 2988
591 ull 2987
592 rat 2984
593 äri 2982
594 ima 2978
595 äss 2976
596 ara 2973
597 vin 2956
598 keh 2954
599 jes 2953
600 vis 2949
601 aka 2942
602 luo 2942
603 _el 2939
604 rvi 2934
605 ätö 2931
606 uri 2928
607 _ri 2915
608 mut 2898
609 ärk 2898
610 _ny 2895
611 kus 2894
612 te_ 2881
613 koi 2873
614 _ve 2870
615 ntö 2870
616 ikä 2863
617 oon 2840
618 rin 2827
619 hee 2826
620 nsi 2824
621 mah 2823
622 äsi 2816
623 tyy 2808
624 _yk 2806
625 ko_ 2805
626 pim 2805
627 ees 2803
628 iis 2799
629 _ot 2793
630 jel 2790
631 oko 2790
632 hmi 2789
633 dol 2787
634 sam 2786
635 tyk 2785
636 ytä 2780
637 nno 2776
638 ein 2775
639 yde 2773
640 tto 2770
641 hit 2764
642 opp 2762
643 tas 2743
644 etu 2742
645 kon 2742
646 ai_ 2739
647 _mo 2737
648 ivi 2734
649 ehi 2729
650 _il 2727
651 os_ 2726
652 syy 2722
653 kea 2718
654 _lä 2711
655 tu_ 2708
656 vit 2695
657 rah 2681
658 yvi 2681
659 aih 2676
660 ase 2670
661 aav 2668
662 ri_ 2665
663 sik 2665
664 use 2661
665 _yl 2657
666 noi 2644
667 yhd 2638
668 nit 2635
669 tum 2628
670 ämm 2620
671 kit 2615
672 ohj 2603
673 _ju 2595
674 täl 2589
675 ymy 2585
676 kke 2582
677 suo 2575
678 _lo 2565
679 ede 2565
680 _it 2563
681 eni 2558
682 uis 2555
683 siä 2554
684 auk 2534
685 oi_ 2531
686 tär 2528
687 ani 2526
688 elv 2526
689 kil 2524
690 kee 2513
691 iko 2511
692 _hä 2509
693 _ih 2500
694 avo 2498
695 nss 2494
696 raa 2494
697 uta 2489
698 ki_ 2486
699 umi 2467
700 tur 2464
701 kie 2459
702 änt 2459
703 eht 2456
704 aas 2454
705 ida 2454
706 eik 2434
707 ihm 2432
708 ntt 2432
709 oud 2420
710 oil 2418
711 sie 2415
712 ryh 2403
713 yty 2402
714 ito 2393
715 kul 2387
716 evi 2386
717 yy_ 2380
718 iiv 2376
719 keä 2374
720 kal 2372
721 ätä 2370
722 pau 2367
723 tod 2365
724 tot 2365
725 ank 2358
726 ros 2356
727 mal 2348
728 det 2347
729 ule 2344
730 uod 2341
731 uon 2326
732 nai 2325
733 usk 2325
734 rkk 2323
735 uul 2319
736 ys_ 2317
737 ena 2315
738 _mä 2314
739 etä 2314
740 tia 2312
741 aam 2310
742 kem 2307
743 ker 2307
744 han 2299
745 ias 2299
746 avi 2292
747 ete 2286
748 tak 2283
749 ter 2283
750 kää 2277
751 hde 2275
752 tom 2273
753 _no 2272
754 pro 2265
755 yön 2256
756 ksy 2255
757 läh 2245
758 avu 2243
759 vel 2239
760 ait 2236
761 hei 2217
762 nto 2217
763 ärä 2217
764 mia 2216
765 ria 2216
766 tut 2212
767 isk 2211
768 yä_ 2193
769 nes 2183
770 rii 2183
771 jan 2181
772 htä 2180
773 ähe 2178
774 kir 2176
775 nei 2172
776 mio 2158
777 inu 2154
778 _us 2146
779 ous 2144
780 ake 2138
781 ohd 2136
782 rva 2122
783 oiv 2106
784 ulk 2103
785 toj 2098
786 uee 2093
787 uu_ 2091
788 nsä 2088
789 ely 2084
790 mon 2084
791 ied 2083
792 ält 2083
793 irj 2080
794 kko 2077
795 vää 2069
796 _ym 2065
797 hoi 2062
798 myk 2054
799 mus 2053
800 aha 2051
801 tym 2050
802 lau 2047
803 ekä 2042
804 _sä 2040
805 kor 2034
806 opu 2034
807 inä 2031
808 yyt 2031
809 ttö 2030
810 ski 2023
811 änn 2022
812 ma_ 2017
813 rik 2010
814 nal 2001
815 tyv 1985
816 eus 1983
817 mik 1982
818 to_ 1970
819 mät 1969
820 äni 1969
821 _ry 1968
822 lei 1966
823 mui 1964
824 tim 1958
825 enä 1953
826 tuv 1948
827 lop 1946
828 tau 1946
829 yri 1945
830 mil 1936
831 näk 1936
832 isö 1934
833 seu 1931
834 kti 1930
835 uss 1929
836 ymi 1928
837 tös 1923
838 iih 1916
839 ate 1913
840 ona 1908
841 sei 1908
842 vak 1903
843 ura 1900
844 sos 1897
845 jä_ 1892
846 iot 1887
847 sva 1883
848 vän 1879
849 alv 1853
850 önt 1852
851 rja 1851
852 mma 1849
853 usi 1836
854 iku 1829
855 oso 1824
856 änä 1817
857 tui 1811
858 noa 1809
859 da_ 1808
860 yhm 1804
861 uai 1802
862 nge 1799
863 ini 1796
864 hje 1792
865 iso 1791
866 ont 1784
867 mak 1782
868 _av 1773
869 ert 1770
870 vap 1768
871 eto 1766
872 uor 1763
873 sov 1758
874 mpä 1755
875 daa 1754
876 iid 1754
877 mar 1754
878 _ää 1753
879 nnu 1752
880 ea_ 1749
881 not 1744
882 jo_ 1739
883 eud 1738
884 _pr 1735
885 päi 1734
886 kat 1732
887 nos 1732
888 paa 1732
889 ltt 1727
890 uos 1724
891 euk 1723
892 aaj 1722
893 _aj 1720
894 tät 1718
895 urv 1715
896 rak 1711
897 lus 1707
898 dä_ 1706
899 tön 1706
900 yöt 1704
901 ntu 1698
902 aho 1695
903 von 1695
904 tyn 1693
905 tua 1684
906 mpi 1679
907 is_ 1667
908 aad 1661
909 vir 1657
910 väh 1655
911 eiv 1646
912 nom 1644
913 ilt 1638
914 lke 1634
915 sai 1632
916 ekt 1628
917 ire 1628
918 nat 1624
919 lmä 1617
920 _om 1615
921 ivo 1614
922 uok 1614
923 las 1613
924 pid 1611
925 tos 1611
926 isy 1606
927 nmu 1606
928 ot_ 1606
929 siv 1606
930 via 1602
931 ras 1599
932 _in 1596
933 pa_ 1591
934 hti 1590
935 lyt 1590
936 ilu 1587
937 jäl 1584
938 ty_ 1582
939 ver 1580
940 toa 1578
941 lvo 1577
942 oku 1575
943 iki 1573
944 ong 1572
945 uht 1562
946 _oh 1560
947 _n_ 1554
948 ljo 1553
949 ävi 1552
950 ähä 1550
951 nim 1549
952 elk 1542
953 kou 1539
954 erä 1538
955 pär 1536
956 olt 1535
957 säl 1533
958 uma 1533
959 ouk 1532
960 gel 1531
961 ulo 1528
962 hdi 1521
963 ija 1517
964 nem 1517
965 sla 1517
966 ane 1516
967 arm 1515
968 yse 1511
969 art 1509
970 oro 1509
971 ira 1507
972 aht 1500
973 pah 1500
974 rko 1495
975 sko 1495
976 ins 1494
977 öön 1489
978 lim 1488
979 ymp 1488
980 yyd 1486
981 anu 1479
982 ari 1475
983 _na 1469
984 het 1467
985 lun 1466
986 yte 1465
987 njo 1463
988 aji 1462
989 hmä 1460
990 enj 1457
991 väs 1457
992 _em 1454
993 iir 1454
994 ose 1451
995 rjo 1449
996 _di 1446
997 aar 1446
998 yle 1441
999 iok 1436
1000 eki 1429
1001 yn_ 1428
1002 luk 1423
1003 töö 1423
1004 ppu 1416
1005 htu 1415
1006 suh 1411
1007 ääs 1411
1008 ial 1410
1009 non 1410
1010 öst 1408
1011 alk 1405
1012 lko 1403
1013 llu 1403
0 # Licensed to the Apache Software Foundation (ASF) under one or more
1 # contributor license agreements. See the NOTICE file distributed with
2 # this work for additional information regarding copyright ownership.
3 # The ASF licenses this file to You under the Apache License, Version 2.0
4 # (the "License"); you may not use this file except in compliance with
5 # the License. You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 es_ 124024
15 _de 109755
16 de_ 78274
17 nt_ 74666
18 ent 73427
19 ion 65376
20 _le 62856
21 on_ 57250
22 le_ 55725
23 _co 53514
24 _qu 52602
25 que 50913
26 _la 49619
27 re_ 49502
28 la_ 49477
29 ns_ 48796
30 tio 44394
31 men 41151
32 ue_ 40603
33 ons 40266
34 les 37708
35 et_ 37315
36 _pa 36424
37 ne_ 36124
38 _pr 35906
39 _l_ 34158
40 _un 33129
41 _en 32396
42 ur_ 32380
43 _et 31951
44 _ce 31086
45 us_ 30924
46 eme 30266
47 er_ 30202
48 ati 29403
49 _po 29237
50 _no 28573
51 des 28357
52 con 28332
53 _à_ 28000
54 eur 27741
55 en_ 27260
56 _au 26289
57 ce_ 25075
58 te_ 23773
59 our 23353
60 est 23013
61 ous 22967
62 par 22793
63 _d_ 22601
64 com 22259
65 omm 20361
66 tre 19922
67 _so 19837
68 ant 19778
69 ont 19637
70 _es 19421
71 is_ 19350
72 it_ 19074
73 pro 19063
74 nou 19028
75 ts_ 19005
76 res 18850
77 ire 18398
78 me_ 17460
79 _dé 17376
80 ans 17332
81 pou 17199
82 _in 17162
83 té_ 17135
84 il_ 16880
85 lle 16762
86 une 16272
87 st_ 16127
88 _re 16119
89 _su 15949
90 _ma 15798
91 _se 15554
92 ui_ 15354
93 iqu 15271
94 dan 15176
95 ssi 15065
96 rop 14695
97 qui 14633
98 _du 14571
99 _il 14473
100 du_ 14410
101 _da 14232
102 un_ 14210
103 iss 13840
104 se_ 13739
105 nce 13678
106 ux_ 13396
107 _mo 13362
108 ell 13134
109 _ré 13115
110 sio 13078
111 rs_ 13041
112 nte 12940
113 ait 12843
114 ais 12724
115 air 12687
116 rai 12244
117 ort 12135
118 _di 12086
119 ité 12035
120 ter 12019
121 lem 11984
122 ien 11962
123 iti 11938
124 mme 11896
125 nne 11869
126 sur 11838
127 pré 11770
128 _fa 11747
129 au_ 11636
130 _je 11603
131 as_ 11533
132 _vo 11276
133 _av 11227
134 _tr 11221
135 _pe 11178
136 je_ 11106
137 _do 11086
138 mis 11081
139 pas 10835
140 ale 10650
141 rés 10622
142 _eu 10537
143 son 10475
144 pos 10392
145 tra 10361
146 _ét 10282
147 por 10242
148 cti 10072
149 uro 10031
150 _me 10021
151 int 9942
152 _to 9829
153 ntr 9806
154 tou 9805
155 ces 9782
156 ir_ 9689
157 onn 9638
158 ues 9634
159 _pl 9542
160 ure 9473
161 app 9402
162 out 9368
163 aut 9323
164 ain 9197
165 mes 9177
166 qu_ 9177
167 és_ 9128
168 ect 9069
169 tte 9069
170 lit 9026
171 ère 9010
172 ens 8997
173 ten 8872
174 ett 8855
175 ut_ 8792
176 _a_ 8759
177 ouv 8681
178 sti 8643
179 ar_ 8607
180 aux 8537
181 _ne 8519
182 mai 8509
183 mmi 8482
184 end 8433
185 ive 8378
186 ble 8307
187 ide 8242
188 _ex 8222
189 enc 8137
190 urs 8133
191 ren 8006
192 opé 7977
193 onc 7838
194 tai 7806
195 mon 7792
196 _ac 7779
197 nsi 7755
198 ie_ 7749
199 ser 7748
200 tan 7706
201 ieu 7677
202 tat 7660
203 oit 7656
204 _ra 7644
205 art 7605
206 fai 7575
207 ins 7506
208 ée_ 7467
209 nts 7358
210 nti 7356
211 lus 7344
212 ess 7310
213 éta 7253
214 anc 7163
215 sse 7144
216 uve 7128
217 éen 7111
218 _ca 7106
219 ran 7070
220 pée 7067
221 plu 7017
222 _fo 7001
223 si_ 6980
224 _on 6934
225 rat 6897
226 ven 6887
227 _si 6821
228 pri 6650
229 ièr 6647
230 _sa 6628
231 tiq 6604
232 sit 6592
233 _te 6577
234 cet 6575
235 tes 6542
236 sen 6508
237 ine 6474
238 _ch 6454
239 in_ 6396
240 nse 6387
241 den 6360
242 uti 6354
243 uni 6332
244 ern 6311
245 vou 6286
246 cer 6243
247 ite 6243
248 nde 6242
249 ave 6229
250 nta 6201
251 and 6195
252 ier 6167
253 sou 6096
254 rap 6020
255 sid 6005
256 ise 5940
257 for 5864
258 rt_ 5836
259 uel 5822
260 rta 5817
261 enn 5777
262 rit 5745
263 ond 5707
264 ers 5703
265 che 5681
266 _ai 5679
267 bre 5589
268 ron 5574
269 eux 5560
270 ési 5542
271 tie 5496
272 pre 5456
273 ver 5424
274 tiv 5401
275 abl 5357
276 ppo 5350
277 rti 5340
278 rem 5338
279 don 5336
280 _ap 5333
281 ali 5327
282 voi 5309
283 emb 5303
284 roi 5295
285 oir 5282
286 ois 5214
287 dre 5212
288 san 5209
289 _n_ 5197
290 _li 5186
291 _im 5124
292 gra 5118
293 per 5097
294 man 5074
295 ist 5068
296 imp 5066
297 str 5021
298 sie 4991
299 omp 4954
300 von 4939
301 ris 4938
302 ses 4912
303 ili 4906
304 avo 4873
305 _vi 4852
306 arl 4822
307 ées 4802
308 fin 4794
309 ve_ 4793
310 era 4788
311 nes 4785
312 ill 4784
313 leu 4783
314 pe_ 4771
315 dem 4769
316 rme 4765
317 ste 4763
318 ute 4750
319 _s_ 4708
320 mat 4707
321 ssa 4699
322 oli 4695
323 ès_ 4669
324 ert 4643
325 act 4614
326 ava 4606
327 ass 4587
328 pol 4569
329 teu 4559
330 _gr 4552
331 rie 4509
332 mbr 4505
333 nst 4494
334 age 4492
335 nom 4492
336 rle 4467
337 éri 4447
338 _mi 4436
339 tés 4425
340 lis 4420
341 rte 4420
342 isa 4409
343 cha 4398
344 _êt 4396
345 ame 4387
346 oin 4386
347 orm 4378
348 osi 4369
349 acc 4356
350 sem 4345
351 dev 4332
352 êtr 4318
353 rec 4293
354 mpl 4291
355 dis 4277
356 pen 4263
357 ra_ 4237
358 _c_ 4217
359 ord 4215
360 jou 4211
361 opo 4184
362 rd_ 4157
363 eau 4146
364 nio 4139
365 peu 4130
366 ème 4129
367 _an 4097
368 cel 4088
369 tur 4084
370 tro 4053
371 itu 4046
372 cor 4035
373 ge_ 4025
374 utr 4016
375 ls_ 4004
376 esp 3998
377 nat 3987
378 éra 3983
379 _cr 3971
380 emp 3970
381 al_ 3968
382 uis 3968
383 _ou 3964
384 tit 3943
385 nan 3934
386 ica 3929
387 éga 3926
388 dir 3911
389 lat 3909
390 eil 3906
391 cou 3892
392 ec_ 3884
393 _el 3863
394 nté 3840
395 été 3820
396 ina 3818
397 ela 3805
398 tif 3797
399 _lo 3786
400 mpo 3772
401 vec 3765
402 ori 3759
403 sei 3751
404 sta 3743
405 déc 3725
406 égi 3720
407 ici 3719
408 ndr 3716
409 nem 3716
410 min 3712
411 fon 3704
412 _fi 3669
413 ani 3644
414 nal 3640
415 rou 3635
416 _am 3606
417 its 3604
418 ése 3567
419 ats 3559
420 doi 3556
421 uss 3541
422 ime 3499
423 nci 3498
424 el_ 3491
425 _éc 3472
426 vel 3460
427 dro 3457
428 der 3454
429 rne 3413
430 sai 3410
431 soi 3405
432 vai 3403
433 roc 3394
434 pay 3390
435 rès 3385
436 spo 3376
437 mun 3374
438 met 3361
439 ére 3352
440 cia 3332
441 nda 3331
442 cul 3320
443 _dr 3310
444 at_ 3304
445 _ci 3301
446 mem 3287
447 oci 3276
448 mmu 3273
449 not 3270
450 ifi 3250
451 aus 3245
452 ppe 3245
453 tri 3235
454 erm 3229
455 fic 3215
456 rer 3211
457 _bi 3208
458 _as 3194
459 _ég 3194
460 cit 3181
461 tic 3180
462 ays 3170
463 _ju 3169
464 ema 3163
465 gal 3157
466 att 3155
467 éci 3154
468 tem 3145
469 ys_ 3133
470 ial 3114
471 oi_ 3091
472 rce 3088
473 ndi 3074
474 _ob 3071
475 inc 3068
476 ini 3066
477 cat 3063
478 lon 3062
479 nis 3059
480 ail 3044
481 ême 3026
482 mer 3024
483 col 3019
484 lai 3019
485 _ad 3014
486 van 3011
487 cte 3005
488 lie 3004
489 lan 3000
490 ner 2996
491 vis 2995
492 _or 2994
493 os_ 2986
494 otr 2986
495 ara 2983
496 erc 2980
497 ita 2980
498 ose 2975
499 sat 2954
500 _j_ 2949
501 squ 2940
502 cie 2936
503 éco 2930
504 sui 2926
505 rég 2925
506 isi 2924
507 dép 2917
508 eff 2917
509 mar 2915
510 qua 2913
511 _al 2904
512 bie 2901
513 _ag 2896
514 rav 2883
515 _ef 2879
516 gue 2868
517 ign 2855
518 _pu 2850
519 ez_ 2830
520 uit 2817
521 lut 2810
522 _él 2809
523 bil 2788
524 lor 2785
525 _y_ 2772
526 _ve 2770
527 elo 2767
528 _af 2757
529 dit 2749
530 née 2741
531 ors 2741
532 bli 2740
533 ai_ 2738
534 nir 2733
535 dra 2731
536 non 2729
537 erv 2728
538 rep 2727
539 omi 2724
540 _m_ 2717
541 moi 2716
542 soc 2709
543 tal 2703
544 ou_ 2693
545 ote 2691
546 ate 2687
547 uer 2687
548 oup 2680
549 _at 2676
550 _mê 2675
551 jet 2674
552 mêm 2662
553 arc 2658
554 ctu 2658
555 _né 2650
556 ult 2628
557 rel 2603
558 _st 2597
559 cla 2570
560 dam 2559
561 ura 2552
562 ona 2549
563 sol 2548
564 tue 2538
565 ffi 2524
566 dif 2522
567 ple 2519
568 pon 2515
569 pli 2510
570 vie 2510
571 he_ 2499
572 tér 2489
573 opp 2478
574 ral 2477
575 pla 2474
576 ler 2459
577 uct 2458
578 eut 2454
579 ein 2453
580 tab 2453
581 _ba 2450
582 ric 2448
583 ind 2444
584 ice 2432
585 uri 2419
586 eni 2407
587 pér 2407
588 éve 2387
589 epr 2366
590 oye 2365
591 ono 2363
592 som 2361
593 nco 2360
594 ace 2350
595 nné 2339
596 udr 2338
597 nvi 2333
598 ore 2329
599 cis 2324
600 ule 2324
601 use 2318
602 rod 2315
603 lic 2313
604 amm 2312
605 _na 2310
606 _év 2299
607 olu 2296
608 omb 2276
609 cip 2274
610 fér 2274
611 mbl 2269
612 icu 2264
613 exp 2260
614 obl 2259
615 _ar 2244
616 urr 2242
617 arg 2241
618 nau 2239
619 fau 2226
620 oll 2222
621 reu 2219
622 _hu 2213
623 eu_ 2213
624 ang 2212
625 rch 2188
626 oul 2180
627 poi 2177
628 _ta 2162
629 ttr 2156
630 vot 2156
631 gen 2151
632 cco 2147
633 _be 2141
634 lar 2139
635 ibl 2129
636 aie 2123
637 ope 2116
638 _sé 2115
639 uli 2115
640 nd_ 2114
641 ds_ 2112
642 pte 2106
643 uté 2105
644 vea 2104
645 éce 2104
646 spe 2103
647 _ab 2102
648 cep 2088
649 els 2088
650 pui 2085
651 rre 2082
652 rog 2081
653 dui 2078
654 deu 2075
655 env 2075
656 ann 2068
657 rob 2066
658 agi 2065
659 rée 2062
660 tiè 2057
661 odu 2055
662 ogr 2052
663 err 2051
664 exi 2051
665 iff 2049
666 nna 2049
667 _cl 2048
668 trè 2048
669 ci_ 2043
670 vre 2041
671 rép 2039
672 tru 2036
673 yen 2032
674 ujo 2031
675 réc 2024
676 niè 2017
677 onf 2010
678 oud 2008
679 nfo 2006
680 ves 1999
681 iat 1974
682 _ho 1972
683 ito 1972
684 cri 1971
685 uat 1971
686 ust 1971
687 gle 1963
688 dév 1960
689 auc 1957
690 gne 1957
691 iso 1955
692 upe 1954
693 car 1940
694 sib 1940
695 lop 1931
696 oss 1922
697 bat 1920
698 imi 1920
699 loi 1905
700 ils 1897
701 nit 1894
702 una 1892
703 _va 1887
704 ger 1885
705 réa 1883
706 rqu 1882
707 ppr 1881
708 evo 1873
709 idé 1873
710 ard 1871
711 rma 1869
712 vra 1865
713 cas 1862
714 gar 1858
715 ché 1850
716 if_ 1843
717 ffe 1841
718 isp 1840
719 seu 1833
720 déb 1831
721 _bo 1825
722 gis 1822
723 nai 1822
724 rin 1821
725 rge 1817
726 tis 1816
727 sé_ 1815
728 nc_ 1799
729 lib 1796
730 abi 1792
731 ume 1792
732 cen 1790
733 abo 1779
734 bje 1776
735 déf 1773
736 obj 1765
737 jus 1760
738 ssu 1760
739 nie 1758
740 rio 1756
741 isé 1754
742 lig 1748
743 gio 1745
744 emi 1744
745 rni 1742
746 ies 1739
747 her 1738
748 éme 1738
749 ret 1737
750 liq 1734
751 rna 1734
752 lég 1730
753 vit 1729
754 pec 1726
755 enf 1724
756 ora 1724
757 ndu 1723
758 équ 1717
759 ept 1716
760 _ni 1715
761 exe 1712
762 blè 1710
763 cur 1710
764 lèm 1710
765 nsa 1709
766 réf 1709
767 éli 1702
768 ivi 1690
769 enu 1687
770 esu 1685
771 foi 1685
772 cré 1681
773 rve 1680
774 evr 1678
775 tel 1673
776 tin 1670
777 ena 1665
778 cho 1662
779 plo 1661
780 ubl 1659
781 _bu 1657
782 mil 1656
783 _em 1653
784 rci 1653
785 all 1651
786 an_ 1650
787 gro 1648
788 _jo 1647
789 eus 1646
790 uvo 1646
791 céd 1641
792 nge 1640
793 nve 1638
794 ext 1637
795 eul 1632
796 bor 1631
797 inf 1626
798 _fr 1621
799 ile 1621
800 rim 1620
801 édu 1619
802 tec 1618
803 ari 1616
804 lim 1602
805 mie 1598
806 hai 1596
807 _ga 1595
808 rôl 1589
809 ets 1584
810 ero 1579
811 nos 1570
812 iel 1562
813 urd 1560
814 uen 1555
815 aid 1553
816 har 1552
817 aqu 1550
818 pem 1550
819 sée 1549
820 _lu 1548
821 sso 1541
822 ruc 1537
823 ram 1532
824 tir 1529
825 dér 1523
826 _ha 1522
827 til 1521
828 _ti 1519
829 vir 1518
830 ôle 1517
831 onv 1515
832 éné 1513
833 fs_ 1511
834 tée 1508
835 quo 1505
836 nel 1502
837 niq 1496
838 tég 1496
839 mpr 1495
840 bon 1494
841 adr 1493
842 nor 1492
843 tor 1486
844 uan 1484
845 _ri 1481
846 dém 1480
847 _fe 1479
848 ota 1477
849 éba 1474
850 dom 1473
851 olo 1473
852 rra 1471
853 isc 1464
854 log 1464
855 sab 1463
856 sav 1461
857 aur 1460
858 tim 1460
859 rso 1459
860 _sy 1457
861 néc 1456
862 rév 1455
863 hui 1454
864 éch 1454
865 cro 1452
866 gou 1451
867 ipe 1447
868 dur 1439
869 uvr 1439
870 oma 1434
871 règ 1434
872 éte 1434
873 écu 1433
874 _rè 1431
875 éfi 1429
876 avi 1425
877 eco 1424
878 ré_ 1424
879 ppl 1423
880 éso 1416
881 fra 1412
882 han 1410
883 miq 1410
884 nds 1401
885 dia 1398
886 tut 1396
887 épo 1396
888 uta 1395
889 ègl 1392
890 ole 1391
891 _op 1390
892 ges 1390
893 évi 1389
894 ula 1388
895 iro 1387
896 uto 1375
897 rot 1373
898 len 1369
899 épu 1367
900 _lé 1366
901 git 1366
902 hom 1364
903 rts 1364
904 ifs 1363
905 jec 1360
906 oie 1354
907 aci 1353
908 mal 1350
909 mit 1344
910 auj 1337
911 vic 1335
912 osé 1330
913 tag 1328
914 usi 1323
915 émo 1323
916 sme 1321
917 nen 1320
918 éla 1315
919 rvi 1311
920 ada 1310
921 gie 1309
922 lio 1306
923 llè 1306
924 mod 1305
925 toy 1303
926 lli 1302
927 aff 1293
928 tua 1293
929 _mé 1292
930 nue 1292
931 rri 1289
932 ps_ 1288
933 cun 1287
934 mma 1281
935 pel 1281
936 sec 1275
937 _vu 1266
938 _gé 1265
939 rêt 1264
940 cra 1259
941 mpt 1259
942 rib 1258
943 opt 1257
944 cad 1255
945 lèg 1254
946 ix_ 1253
947 nqu 1252
948 nct 1248
949 bas 1247
950 urq 1247
951 _go 1245
952 _où 1243
953 occ 1243
954 où_ 1243
955 déj 1240
956 niv 1239
957 nér 1237
958 jà_ 1234
959 éjà 1233
960 dic 1229
961 édi 1225
962 sus 1224
963 pub 1221
964 veu 1219
965 né_ 1216
966 val 1214
967 put 1212
968 och 1206
969 ègu 1206
970 duc 1204
971 yst 1202
972 agr 1197
973 ech 1196
974 ism 1192
975 _ut 1191
976 alo 1188
977 ibu 1187
978 nfi 1187
979 uoi 1184
980 idi 1183
981 mps 1183
982 sys 1183
983 emm 1179
984 lée 1179
985 cce 1176
986 tau 1170
987 _oc 1167
988 ior 1167
989 lec 1165
990 gan 1160
991 _br 1159
992 vue 1159
993 stè 1156
994 lui 1151
995 ffé 1148
996 gén 1145
997 clu 1143
998 ton 1143
999 dop 1141
1000 rab 1134
1001 lac 1130
1002 rom 1126
1003 ude 1126
1004 hé_ 1125
1005 océ 1121
1006 cle 1116
1007 ièm 1114
1008 lab 1114
1009 sa_ 1113
1010 org 1108
1011 afi 1107
1012 éal 1104
1013 _x_ 1101
0 # Licensed to the Apache Software Foundation (ASF) under one or more
1 # contributor license agreements. See the NOTICE file distributed with
2 # this work for additional information regarding copyright ownership.
3 # The ASF licenses this file to You under the Apache License, Version 2.0
4 # (the "License"); you may not use this file except in compliance with
5 # the License. You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 _de 362561
15 de_ 340477
16 os_ 289211
17 as_ 230680
18 _co 219796
19 do_ 211969
20 _a_ 176030
21 _e_ 148881
22 ent 146587
23 _o_ 144578
24 da_ 144100
25 que 129396
26 _do 125347
27 en_ 124546
28 nte 124516
29 es_ 123737
30 _en 122599
31 _po 119323
32 ión 117837
33 te_ 116335
34 _se 116250
35 _qu 114451
36 ón_ 111221
37 ue_ 110174
38 ra_ 108855
39 con 104102
40 _no 103372
41 _es 102588
42 est 99822
43 _da 97953
44 _pa 96504
45 _ca 95634
46 _un 94718
47 ció 92440
48 _pr 91703
49 se_ 91606
50 ado 90387
51 _re 88993
52 to_ 86582
53 no_ 82834
54 on_ 82389
55 ia_ 75136
56 men 74366
57 par 73303
58 sta 72331
59 ica 72002
60 al_ 70803
61 aci 69207
62 dos 69086
63 res 68742
64 or_ 68497
65 ta_ 68082
66 ant 67278
67 is_ 66568
68 com 64441
69 na_ 63758
70 _na 62458
71 ro_ 62412
72 _in 62205
73 _ma 60636
74 ida 58388
75 ar_ 57239
76 des 56158
77 tra 55994
78 un_ 54236
79 ade 54102
80 ist 53931
81 nha 53367
82 unh 52921
83 ter 52897
84 ha_ 52204
85 _fo 52198
86 cia 52169
87 ada 51535
88 ou_ 51441
89 ndo 50795
90 por 50783
91 an_ 50601
92 lo_ 50452
93 ara 49902
94 _as 48813
95 _te 48670
96 dad 48495
97 _pe 48329
98 ca_ 48070
99 and 47367
100 ran 47229
101 mo_ 47160
102 nto 47157
103 nci 47155
104 co_ 46709
105 _os 46533
106 io_ 46177
107 _me 45312
108 per 44965
109 pro 44009
110 _an 42944
111 la_ 42826
112 _mo 42662
113 ste 42248
114 _di 42188
115 eir 41812
116 ns_ 41575
117 ía_ 40807
118 era 40030
119 _é_ 38847
120 nta 38725
121 _so 38693
122 das 38426
123 ico 38241
124 ntr 37887
125 pre 37350
126 er_ 37330
127 ici 37158
128 ita 37003
129 pol 36612
130 nde 36196
131 _tr 36019
132 tes 35915
133 ano 35863
134 ont 35798
135 ali 35501
136 str 34745
137 _al 34459
138 re_ 34247
139 ina 34102
140 tro 33428
141 rio 33394
142 tic 33180
143 _ou 33167
144 ito 33072
145 art 32766
146 ron 32618
147 den 32334
148 ura 31657
149 tor 31585
150 tos 31346
151 nos 31266
152 ari 31176
153 tan 30842
154 ons 30673
155 omo 30623
156 ido 30315
157 for 30282
158 ame 30184
159 ais 30126
160 cas 29991
161 ort 29863
162 _ba 29474
163 rte 29021
164 ese 28913
165 eri 28638
166 esp 28464
167 óns 28461
168 enc 28443
169 end 28407
170 _sa 28381
171 _ex 28360
172 ras 28352
173 nda 28275
174 _su 28176
175 tad 28044
176 rec 27981
177 sti 27846
178 ten 27829
179 olo 27590
180 _ao 27559
181 ria 27535
182 can 27525
183 iro 27513
184 ona 27272
185 eu_ 27225
186 _vi 26867
187 ma_ 26654
188 _li 26245
189 rad 26126
190 oi_ 26110
191 go_ 25982
192 gra 25962
193 car 25913
194 ros 25908
195 man 25708
196 nti 25581
197 cid 25536
198 _ac 25512
199 err 25340
200 _gr 25284
201 ao_ 25267
202 _ta 25168
203 tre 25002
204 mar 24844
205 ira 24809
206 ome 24729
207 seu 24658
208 _fi 24384
209 sen 24324
210 der 24286
211 ase 24256
212 _sú 24128
213 pri 24061
214 súa 24007
215 int 23838
216 ori 23784
217 ver 23650
218 aba 23626
219 tiv 23595
220 cio 23426
221 _to 23407
222 tal 23275
223 pos 23196
224 _si 23148
225 _ar 22915
226 esc 22887
227 _du 22810
228 ale 22764
229 _ve 22697
230 _ga 22689
231 tar 22611
232 ero 22392
233 tas 22355
234 _má 22351
235 ata 22275
236 ion 22168
237 _fa 22134
238 _at 22090
239 und 21998
240 ser 21863
241 uni 21857
242 iza 21734
243 _ci 21689
244 ece 21554
245 foi 21482
246 nic 21406
247 rma 21402
248 tur 21273
249 llo 21264
250 fic 21171
251 ela 21100
252 cos 20953
253 qui 20792
254 edi 20788
255 ect 20784
256 so_ 20756
257 úa_ 20748
258 cul 20692
259 sto 20669
260 lic 20603
261 sa_ 20585
262 nse 20508
263 áis 20448
264 mái 20400
265 eci 20299
266 inc 20223
267 orm 20212
268 ola 20209
269 cor 20103
270 ide 20102
271 ría 20021
272 anc 19900
273 emp 19852
274 _lo 19761
275 non 19706
276 _ch 19631
277 lla 19566
278 _mi 19521
279 ren 19514
280 tem 19491
281 dor 19474
282 tam 19357
283 ime 19241
284 min 19064
285 cen 18957
286 _fr 18927
287 _mu 18869
288 nas 18844
289 nal 18819
290 cal 18730
291 ric 18711
292 ell 18705
293 ing 18689
294 las 18574
295 _cu 18510
296 ore 18426
297 egu 18394
298 los 18298
299 _le 18115
300 ña_ 18107
301 ber 18042
302 ial 17988
303 rim 17899
304 za_ 17888
305 les 17879
306 ere 17782
307 mpo 17780
308 cad 17751
309 nce 17739
310 ost 17638
311 ill 17603
312 mai 17603
313 _ce 17545
314 ers 17538
315 ele 17519
316 rra 17518
317 ema 17506
318 us_ 17492
319 el_ 17440
320 rei 17436
321 ios 17413
322 _cr 17364
323 ral 17314
324 rti 17312
325 gal 17248
326 rac 17190
327 act 17182
328 ili 17131
329 ens 17071
330 are 17051
331 _á_ 16946
332 ern 16896
333 tri 16887
334 _va 16876
335 seg 16856
336 san 16814
337 uci 16718
338 out 16631
339 _el 16579
340 bre 16577
341 lle 16456
342 obr 16301
343 arr 16256
344 eme 16221
345 lar 16216
346 lan 16132
347 _xe 16131
348 son 16126
349 ode 16058
350 ind 15926
351 ora 15897
352 mes 15889
353 _ha 15869
354 oma 15822
355 _la 15814
356 ana 15813
357 esa 15801
358 én_ 15759
359 nad 15707
360 ior 15661
361 _fe 15510
362 rea 15450
363 ond 15418
364 exi 15350
365 le_ 15347
366 esi 15222
367 dia 15199
368 sió 15199
369 _or 15171
370 rit 15167
371 _ne 15165
372 rop 15128
373 omp 15055
374 dis 14925
375 _ro 14913
376 _im 14803
377 vo_ 14698
378 fer 14659
379 nst 14615
380 _ap 14576
381 dic 14556
382 bra 14554
383 liz 14540
384 cto 14514
385 ces 14433
386 ega 14361
387 rre 14343
388 rta 14315
389 ias 14224
390 _au 14207
391 ban 14164
392 imp 14103
393 ula 14086
394 mer 14081
395 tin 14062
396 ert 14042
397 ivo 13967
398 mei 13910
399 ala 13877
400 ndi 13833
401 án_ 13824
402 utr 13815
403 nom 13757
404 ama 13688
405 ini 13619
406 xa_ 13614
407 odo 13596
408 mun 13512
409 _ti 13511
410 cri 13487
411 ast 13447
412 oca 13409
413 all 13356
414 itu 13293
415 alm 13278
416 ga_ 13244
417 smo 13209
418 onc 13200
419 col 13194
420 oa_ 13184
421 ian 13181
422 sid 13175
423 ulo 13143
424 aro 13137
425 lme 13124
426 cip 13101
427 gue 13066
428 ato 13042
429 gar 12993
430 ba_ 12984
431 _ho 12954
432 nor 12952
433 cel 12939
434 cha 12923
435 po_ 12920
436 mos 12885
437 eta 12816
438 nov 12809
439 cer 12788
440 amé 12750
441 lin 12726
442 _ad 12650
443 med 12649
444 ir_ 12640
445 fra 12610
446 ati 12580
447 pod 12550
448 tua 12527
449 ate 12524
450 ino 12452
451 mas 12400
452 cam 12384
453 me_ 12364
454 dun 12354
455 iva 12351
456 ous 12351
457 rro 12332
458 nac 12329
459 erm 12318
460 use 12244
461 _nu 12213
462 va_ 12207
463 eno 12141
464 _am 12080
465 rat 12073
466 lia 11982
467 rel 11982
468 gun 11979
469 tid 11955
470 tod 11888
471 _em 11870
472 ans 11750
473 cci 11699
474 mad 11660
475 xe_ 11600
476 ani 11593
477 spo 11587
478 mén 11560
479 ima 11538
480 apa 11527
481 ble 11527
482 eit 11474
483 eus 11472
484 uen 11466
485 fin 11465
486 _xu 11360
487 sco 11257
488 gre 11191
489 mil 11166
490 _ob 11130
491 arc 11089
492 _go 11057
493 mon 11055
494 bro 11039
495 tac 11030
496 ace 11011
497 nun 10990
498 _br 10984
499 ego 10983
500 coa 10964
501 uto 10953
502 moi 10925
503 rin 10892
504 spa 10890
505 spe 10810
506 dou 10803
507 _ra 10763
508 ive 10735
509 ven 10731
510 rna 10723
511 lec 10719
512 vol 10680
513 lid 10654
514 ult 10648
515 ena 10629
516 uer 10628
517 sic 10577
518 uro 10571
519 _er 10518
520 iña 10499
521 nsi 10498
522 _ab 10492
523 ses 10466
524 orr 10454
525 tab 10450
526 _lu 10418
527 asa 10411
528 _fu 10365
529 eco 10353
530 cre 10339
531 aio 10336
532 _xa 10328
533 stá 10311
534 rex 10309
535 var 10287
536 vid 10281
537 pal 10260
538 _gu 10253
539 oit 10247
540 pas 10243
541 mor 10211
542 _be 10197
543 nar 10190
544 reg 10189
545 asi 10187
546 ien 10186
547 nid 10156
548 mpe 10073
549 erí 10041
550 rar 10037
551 xo_ 10031
552 bri 10026
553 ite 10010
554 ris 10002
555 és_ 9966
556 pen 9881
557 _bo 9850
558 nes 9833
559 rib 9798
560 dur 9780
561 lem 9778
562 lon 9765
563 rom 9749
564 emb 9745
565 ai_ 9725
566 lis 9721
567 ifi 9720
568 ngu 9703
569 oñe 9670
570 ivi 9635
571 che 9622
572 rep 9607
573 sob 9552
574 oro 9551
575 _pu 9527
576 _hi 9481
577 ret 9476
578 ce_ 9473
579 ual 9446
580 ipa 9417
581 erc 9372
582 mpl 9357
583 bli 9331
584 lit 9329
585 obe 9280
586 rod 9267
587 unt 9249
588 sas 9230
589 lac 9226
590 mpr 9180
591 gua 9179
592 ord 9158
593 sit 9149
594 isi 9129
595 scr 9105
596 oci 9062
597 pañ 9040
598 duc 9027
599 axe 9009
600 poi 8958
601 nis 8940
602 ust 8940
603 ea_ 8935
604 tel 8904
605 uga 8901
606 aña 8877
607 igo 8876
608 cat 8867
609 ete 8867
610 ago 8858
611 aca 8851
612 abi 8848
613 bal 8808
614 vis 8803
615 oni 8793
616 pec 8793
617 xer 8788
618 roc 8743
619 leg 8728
620 eli 8694
621 ota 8684
622 eo_ 8683
623 úas 8682
624 rno 8677
625 lta 8642
626 bar 8631
627 ede 8630
628 tit 8600
629 obo 8587
630 ono 8567
631 sar 8551
632 fun 8548
633 ove 8531
634 gui 8512
635 rde 8509
636 gan 8496
637 ard 8474
638 _vo 8472
639 ez_ 8467
640 rto 8460
641 gos 8447
642 _cl 8444
643 eso 8436
644 cti 8434
645 xen 8407
646 dem 8391
647 dio 8391
648 mat 8364
649 emi 8358
650 rso 8334
651 _pi 8293
652 osi 8293
653 sis 8248
654 coñ 8233
655 tru 8228
656 mic 8217
657 rem 8197
658 aso 8187
659 rab 8164
660 alg 8160
661 índ 8157
662 lor 8131
663 erv 8111
664 pob 8106
665 nza 8104
666 mit 8070
667 ois 8070
668 tig 8069
669 _sé 8062
670 cie 8048
671 iu_ 8036
672 lad 8034
673 il_ 8000
674 exa 7979
675 ixi 7976
676 _oc 7970
677 atr 7966
678 met 7951
679 mbr 7949
680 bas 7946
681 in_ 7941
682 val 7934
683 ctu 7922
684 alt 7902
685 zad 7892
686 cap 7870
687 ane 7861
688 adi 7858
689 ume 7838
690 ism 7834
691 imi 7823
692 ixo 7809
693 ñec 7803
694 ram 7798
695 pa_ 7782
696 elo 7769
697 boa 7755
698 fil 7752
699 hab 7737
700 exp 7725
701 iga 7706
702 _ó_ 7675
703 ii_ 7662
704 écu 7649
705 ins 7643
706 nve 7642
707 _on 7620
708 arí 7619
709 osa 7619
710 red 7619
711 ami 7607
712 pla 7596
713 rov 7582
714 íti 7570
715 dir 7566
716 día 7526
717 ine 7524
718 abe 7499
719 sos 7496
720 ecu 7478
721 rca 7475
722 bit 7457
723 imo 7450
724 nia 7423
725 séc 7409
726 til 7379
727 eda 7317
728 dec 7311
729 esm 7310
730 zo_ 7296
731 _xo 7286
732 rse 7285
733 ño_ 7283
734 _bi 7272
735 rri 7254
736 ogr 7242
737 pon 7234
738 dep 7233
739 atu 7227
740 usa 7196
741 omi 7188
742 rqu 7185
743 aut 7173
744 _ag 7131
745 eal 7131
746 dar 7120
747 eza 7119
748 ene 7106
749 gad 7098
750 stu 7087
751 rid 7086
752 oqu 7076
753 _he 7072
754 sup 7068
755 cla 7052
756 nco 7047
757 erd 7044
758 ovi 7026
759 tud 7026
760 lat 7021
761 río 7019
762 emo 7010
763 _ni 6998
764 tán 6992
765 efe 6988
766 sin 6964
767 _il 6946
768 del 6943
769 bel 6936
770 etr 6925
771 len 6898
772 _eu 6891
773 loc 6890
774 sca 6888
775 ova 6883
776 opa 6880
777 dif 6865
778 amp 6859
779 oli 6853
780 rci 6853
781 sol 6818
782 odu 6814
783 rie 6814
784 equ 6804
785 oso 6804
786 ler 6798
787 ís_ 6780
788 exe 6776
789 cac 6775
790 tou 6755
791 deb 6739
792 _is 6728
793 _pl 6711
794 alo 6711
795 ire 6709
796 bai 6685
797 lev 6674
798 eva 6669
799 xis 6667
800 aín 6665
801 dan 6638
802 rga 6615
803 iti 6611
804 ext 6592
805 inf 6586
806 _ri 6566
807 aix 6562
808 ila 6524
809 _aí 6513
810 aco 6499
811 aís 6499
812 tá_ 6488
813 vas 6476
814 ref 6471
815 cab 6468
816 olí 6448
817 abr 6427
818 ull 6420
819 sem 6403
820 edo 6393
821 his 6392
822 gob 6374
823 tim 6361
824 rmi 6329
825 ogo 6321
826 tiñ 6290
827 abl 6288
828 ose 6280
829 rix 6280
830 vin 6258
831 ás_ 6258
832 año 6250
833 div 6227
834 mpa 6225
835 tir 6224
836 nat 6222
837 fac 6207
838 isc 6197
839 lei 6196
840 ol_ 6166
841 lib 6160
842 ton 6141
843 paí 6137
844 iad 6131
845 ped 6109
846 sal 6086
847 ner 6083
848 ham 6073
849 rev 6056
850 lti 6054
851 dei 6052
852 rda 6044
853 sur 6038
854 uia 6008
855 ías 5990
856 opi 5979
857 _us 5975
858 ei_ 5962
859 opo 5951
860 aqu 5940
861 mac 5923
862 _of 5905
863 rme 5898
864 soc 5887
865 amb 5874
866 van 5846
867 rai 5843
868 zar 5841
869 rou 5834
870 ecc 5824
871 vos 5812
872 zas 5804
873 cun 5799
874 vil 5790
875 eto 5779
876 bor 5774
877 mal 5716
878 anz 5701
879 eur 5696
880 rob 5686
881 adr 5682
882 orn 5682
883 ixe 5672
884 roq 5662
885 rav 5649
886 oce 5641
887 lab 5628
888 ote 5622
889 gas 5610
890 rot 5605
891 onv 5604
892 lug 5599
893 lim 5591
894 mel 5589
895 tec 5589
896 ong 5579
897 mul 5568
898 lít 5564
899 log 5563
900 _ed 5562
901 vel 5561
902 amo 5554
903 ovo 5553
904 unc 5546
905 uti 5522
906 oac 5520
907 ua_ 5511
908 _id 5508
909 ave 5505
910 aos 5486
911 nai 5484
912 ein 5460
913 ole 5452
914 evi 5441
915 did 5438
916 sul 5432
917 ur_ 5427
918 ben 5419
919 cur 5409
920 nsa 5408
921 one 5404
922 uel 5398
923 olu 5376
924 sec 5371
925 mez 5362
926 cem 5356
927 mod 5348
928 raz 5337
929 nan 5326
930 pit 5325
931 hom 5292
932 _ec 5272
933 ope 5245
934 abo 5242
935 rón 5228
936 org 5223
937 xic 5218
938 ve_ 5217
939 zac 5199
940 scu 5182
941 orí 5181
942 pad 5172
943 dra 5171
944 ues 5167
945 heg 5146
946 oto 5143
947 ibe 5142
948 _af 5134
949 así 5133
950 rsi 5117
951 _ig 5107
952 plo 5104
953 ibi 5091
954 ne_ 5090
955 ced 5075
956 mán 5054
957 xió 5043
958 olv 5041
959 _rí 5035
960 arq 5025
961 lve 5018
962 ío_ 5013
963 ían 5003
964 iño 4995
965 ves 4994
966 _dí 4990
967 tei 4989
968 cin 4980
969 edr 4980
970 fal 4976
971 xec 4976
972 mis 4957
973 ipo 4954
974 ple 4954
975 apo 4943
976 rup 4941
977 mbi 4927
978 _ir 4924
979 pra 4919
980 ars 4917
981 xun 4904
982 _bu 4903
983 pul 4902
984 pan 4898
985 arl 4897
986 xía 4891
987 áti 4886
988 aza 4880
989 bo_ 4873
990 evo 4873
991 ilo 4869
992 tom 4860
993 pio 4854
994 cou 4850
995 inv 4847
996 sub 4846
997 bil 4845
998 iar 4841
999 igr 4839
1000 nca 4830
1001 sí_ 4825
1002 _xi 4816
1003 pou 4796
1004 via 4787
1005 api 4775
1006 upo 4769
1007 vad 4764
1008 ile 4762
1009 egr 4759
1010 hai 4759
1011 _th 4752
1012 lam 4750
1013 aus 4736
0 # Licensed to the Apache Software Foundation (ASF) under one or more
1 # contributor license agreements. See the NOTICE file distributed with
2 # this work for additional information regarding copyright ownership.
3 # The ASF licenses this file to You under the Apache License, Version 2.0
4 # (the "License"); you may not use this file except in compliance with
5 # the License. You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 _a_ 38146
15 _sz 17915
16 _az 14292
17 és_ 13996
18 sze 12590
19 az_ 12544
20 ek_ 11509
21 en_ 10831
22 _és 9755
23 _me 9709
24 tás 9013
25 gy_ 8327
26 an_ 7873
27 meg 7821
28 ak_ 7519
29 egy 7511
30 ség 7504
31 ele 7499
32 _te 7457
33 _el 7419
34 zer 7386
35 _pr 6800
36 zet 6690
37 pro 6517
38 tt_ 6485
39 _eg 6437
40 et_ 6393
41 _fe 6318
42 _kö 6220
43 ok_ 6199
44 is_ 5969
45 _in 5910
46 ere 5855
47 szá 5791
48 int 5673
49 szt 5606
50 gra 5568
51 ram 5558
52 tés 5512
53 nak 5498
54 rog 5418
55 ás_ 5400
56 ogr 5386
57 _ha 5290
58 ter 5278
59 fel 5226
60 ban 5218
61 eze 5182
62 _be 5135
63 atá 5040
64 ése 5033
65 ény 4982
66 ben 4981
67 _mi 4955
68 let 4914
69 es_ 4903
70 ete 4898
71 _fo 4856
72 _ne 4807
73 ala 4759
74 al_ 4746
75 _ta 4741
76 tel 4737
77 ész 4715
78 _ho 4684
79 nt_ 4649
80 min 4643
81 zés 4620
82 _va 4619
83 nek 4548
84 hat 4547
85 el_ 4528
86 at_ 4502
87 si_ 4501
88 am_ 4499
89 köz 4491
90 vez 4449
91 nye 4419
92 ely 4413
93 ja_ 4355
94 ra_ 4345
95 ett 4342
96 _ki 4341
97 _le 4323
98 ont 4232
99 tal 4212
100 _al 4195
101 ind 4188
102 ság 4187
103 sza 4168
104 ti_ 4167
105 ott 4075
106 ált 4068
107 ai_ 4063
108 áll 3878
109 szo 3873
110 ítá 3848
111 _is 3831
112 het 3819
113 ell 3816
114 gye 3800
115 _ke 3747
116 tat 3696
117 end 3657
118 ogy 3646
119 zám 3645
120 agy 3629
121 esz 3604
122 len 3589
123 _ké 3572
124 os_ 3561
125 pon 3533
126 for 3479
127 se_ 3459
128 hog 3391
129 ásá 3389
130 ége 3354
131 ssz 3335
132 _ér 3331
133 ni_ 3321
134 ány 3313
135 eke 3310
136 ció 3294
137 sít 3278
138 re_ 3265
139 _re 3264
140 nem 3256
141 erv 3250
142 unk 3247
143 sok 3239
144 lis 3225
145 val 3219
146 lat 3212
147 mel 3188
148 ik_ 3168
149 ent 3163
150 ért 3161
151 em_ 3153
152 jel 3131
153 kez 3129
154 kat 3099
155 lta 3091
156 ren 3049
157 tár 3049
158 zás 3028
159 dít 3024
160 ato 3016
161 tet 3003
162 áso 3000
163 rés 2988
164 _ál 2980
165 kép 2950
166 tan 2902
167 dés 2892
168 _ko 2885
169 _tá 2874
170 rve 2861
171 _po 2841
172 kor 2813
173 ték 2811
174 on_ 2799
175 _mu 2757
176 art 2748
177 át_ 2746
178 ma_ 2740
179 lap 2731
180 tó_ 2717
181 ató 2707
182 mun 2699
183 _cs 2697
184 áli 2696
185 lla 2688
186 asz 2676
187 eté 2657
188 _ma 2655
189 st_ 2648
190 _ré 2641
191 ása 2625
192 lt_ 2597
193 nde 2582
194 lye 2581
195 gya 2571
196 _id 2563
197 nté 2538
198 koz 2535
199 öss 2534
200 ét_ 2522
201 _je 2517
202 _am 2516
203 bb_ 2512
204 ető 2512
205 cso 2500
206 gál 2496
207 lás 2495
208 vál 2479
209 áci 2465
210 tar 2453
211 ezé 2433
212 sa_ 2429
213 tek 2418
214 _vá 2416
215 ésé 2416
216 kel 2409
217 íté 2404
218 _bi 2399
219 _ez 2394
220 eve 2390
221 alm 2385
222 zat 2382
223 alk 2381
224 er_ 2359
225 eg_ 2342
226 ndí 2317
227 ame 2312
228 alá 2311
229 leg 2304
230 hel 2302
231 ák_ 2302
232 lle 2300
233 nte 2299
234 ket 2293
235 oly 2293
236 _ös 2280
237 olg 2257
238 rté 2245
239 hoz 2242
240 ág_ 2238
241 ási 2217
242 zak 2211
243 ési 2183
244 ozá 2173
245 tja 2171
246 mén 2159
247 nál 2149
248 biz 2145
249 ny_ 2144
250 tő_ 2143
251 del 2141
252 ese 2134
253 eti 2129
254 vet 2125
255 ól_ 2122
256 fog 2119
257 ló_ 2119
258 öve 2116
259 orm 2103
260 _ka 2102
261 án_ 2097
262 _gy 2078
263 elő 2072
264 ten 2066
265 ább 2065
266 _ve 2064
267 ég_ 2063
268 _eu 2061
269 kül 2057
270 ada 2041
271 idő 2031
272 us_ 2031
273 _má 2023
274 elé 2001
275 tos 1997
276 men 1995
277 ot_ 1974
278 szí 1957
279 yek 1953
280 lal 1929
281 lés 1929
282 mán 1926
283 _he 1922
284 tok 1915
285 lam 1914
286 ána 1913
287 _ny 1910
288 ez_ 1906
289 nev 1892
290 kon 1884
291 ysz 1883
292 _es 1882
293 _to 1874
294 ozó 1871
295 ára 1871
296 elm 1857
297 dsz 1852
298 les 1852
299 gi_ 1850
300 tik 1846
301 lem 1844
302 lya 1840
303 _ad 1838
304 lma 1838
305 _vi 1836
306 csa 1830
307 dás 1827
308 _cé 1809
309 mi_ 1808
310 _de 1805
311 ve_ 1803
312 _tö 1775
313 rt_ 1771
314 éte 1766
315 szi 1759
316 yes 1750
317 oga 1745
318 sek 1740
319 vel 1734
320 ző_ 1728
321 ors 1727
322 ati 1720
323 nka 1719
324 vag 1715
325 iál 1711
326 ber 1705
327 áma 1695
328 mer 1694
329 sen 1692
330 zó_ 1689
331 ker 1685
332 ll_ 1681
333 fej 1679
334 it_ 1678
335 ehe 1676
336 emb 1673
337 kal 1667
338 ély 1666
339 rmá 1659
340 zot 1656
341 cél 1655
342 ová 1654
343 épz 1647
344 tot 1644
345 rsz 1643
346 tve 1642
347 tes 1640
348 zol 1640
349 gat 1639
350 ciá 1638
351 mbe 1637
352 or_ 1637
353 tér 1636
354 rek 1629
355 amo 1619
356 nk_ 1617
357 has 1614
358 ék_ 1601
359 erm 1599
360 enn 1598
361 rta 1598
362 _na 1597
363 lko 1594
364 éle 1583
365 isz 1582
366 kör 1575
367 özö 1573
368 _se 1567
369 éke 1564
370 áro 1557
371 _ak 1554
372 vén 1554
373 oka 1552
374 ata 1550
375 eur 1550
376 _e_ 1546
377 _té 1545
378 tsé 1545
379 ába 1543
380 aló 1537
381 _en 1524
382 ami 1524
383 edé 1523
384 rül 1521
385 szü 1520
386 ly_ 1518
387 elt 1517
388 nyo 1517
389 lys 1511
390 dat 1496
391 fon 1496
392 tov 1496
393 _do 1495
394 váb 1492
395 _an 1488
396 nyi 1488
397 ül_ 1487
398 ébe 1486
399 tén 1481
400 eri 1480
401 yez 1473
402 ító 1464
403 ező 1459
404 ama 1456
405 fol 1453
406 ort 1452
407 rin 1447
408 yel 1447
409 leh 1441
410 ály 1441
411 ege 1439
412 más 1439
413 zon 1439
414 maz 1438
415 _vé 1436
416 mag 1434
417 lgá 1433
418 azo 1430
419 már 1430
420 por 1424
421 ós_ 1420
422 _so 1417
423 ók_ 1416
424 pzé 1412
425 ges 1408
426 ene 1407
427 ond 1403
428 elv 1402
429 efo 1397
430 rán 1394
431 tor 1391
432 ika 1387
433 rás 1384
434 tud 1384
435 ons 1383
436 yan 1382
437 ió_ 1379
438 szé 1379
439 kap 1378
440 ére 1369
441 gén 1368
442 yen 1365
443 szn 1362
444 zág 1362
445 zem 1360
446 gaz 1359
447 _kü 1358
448 éne 1349
449 _t_ 1348
450 ta_ 1343
451 ony 1341
452 sme 1337
453 zt_ 1331
454 ndo 1330
455 lak 1329
456 üle 1323
457 mat 1321
458 ola 1321
459 ár_ 1321
460 _er 1317
461 ism 1313
462 _ol 1307
463 okt 1305
464 _mo 1304
465 erü 1294
466 én_ 1291
467 _ig 1289
468 iós 1287
469 oz_ 1285
470 ülö 1282
471 uta 1281
472 ámo 1279
473 uró 1278
474 oci 1277
475 dol 1273
476 omá 1273
477 sol 1273
478 gon 1268
479 ejl 1265
480 pes 1264
481 yi_ 1264
482 ako 1263
483 _tu 1262
484 sán 1257
485 kés 1246
486 ret 1245
487 lön 1241
488 ála 1238
489 elk 1237
490 uk_ 1237
491 lát 1234
492 zta 1233
493 nfo 1232
494 zoc 1228
495 égi 1223
496 izo 1220
497 eny 1214
498 zeg 1213
499 pol 1210
500 tts 1210
501 ill 1208
502 opo 1207
503 _lé 1202
504 sop 1202
505 ia_ 1199
506 zel 1199
507 mok 1195
508 te_ 1193
509 éve 1191
510 res 1189
511 ssé 1189
512 nsz 1188
513 ást 1188
514 érd 1180
515 eme 1178
516 emz 1166
517 oló 1160
518 seg 1159
519 árs 1159
520 zín 1157
521 ess 1154
522 gít 1154
523 gal 1146
524 lek 1145
525 ken 1144
526 zte 1136
527 els 1133
528 emé 1133
529 ekt 1131
530 _át 1129
531 éko 1129
532 ött 1128
533 elj 1127
534 lha 1122
535 zab 1121
536 gek 1120
537 ágo 1119
538 nds 1117
539 inf 1116
540 tte 1115
541 mód 1113
542 lcs 1112
543 róp 1112
544 ert 1111
545 zen 1107
546 est 1103
547 zté 1103
548 nik 1102
549 zná 1101
550 sz_ 1099
551 dő_ 1098
552 ki_ 1098
553 _il 1097
554 sor 1092
555 tag 1089
556 áló 1088
557 dal 1086
558 ntj 1086
559 ord 1084
560 _mé 1083
561 rvé 1081
562 lít 1078
563 ona 1075
564 ük_ 1075
565 _jo 1074
566 _s_ 1074
567 ked 1074
568 tsá 1071
569 ne_ 1070
570 nős 1067
571 zto 1067
572 inő 1065
573 ván 1062
574 egí 1058
575 zük 1056
576 _st 1054
577 téz 1050
578 kol 1049
579 tál 1045
580 vis 1044
581 set 1042
582 vég 1033
583 _ok 1032
584 li_ 1031
585 lit 1031
586 den 1028
587 osí 1028
588 ől_ 1028
589 _új 1027
590 ksé 1027
591 ópa 1027
592 eng 1026
593 nag 1021
594 nys 1016
595 uni 1016
596 _mó 1015
597 ha_ 1014
598 mác 1012
599 and 1010
600 kai 1010
601 the 1007
602 bel 1006
603 ebb 1006
604 dok 1005
605 nya 1005
606 ion 1003
607 ügy 1003
608 nos 1002
609 hez 1001
610 köv 1001
611 lka 1001
612 roz 999
613 egé 997
614 zhe 997
615 jog 993
616 jle 991
617 ját 990
618 _hi 988
619 kér 988
620 zi_ 988
621 ján 985
622 lgo 984
623 vev 984
624 ei_ 979
625 ver 979
626 nge 976
627 égé 974
628 mél 973
629 egf 972
630 mot 971
631 iká 967
632 émá 965
633 _fi 964
634 ri_ 964
635 pcs 963
636 _go 961
637 _pé 961
638 lel 961
639 etk 960
640 gla 960
641 sak 954
642 ogl 953
643 zek 953
644 zik 949
645 ezd 948
646 ana 946
647 apc 946
648 nti 946
649 até 943
650 yam 943
651 ize 941
652 bef 940
653 rsa 934
654 vő_ 934
655 yak 933
656 evő 932
657 lef 928
658 ran 927
659 ysé 926
660 izt 925
661 sér 924
662 ani 922
663 ged 922
664 toz 922
665 pai 920
666 kke 914
667 kén 914
668 lmi 911
669 abb 909
670 ról 906
671 sal 906
672 érv 906
673 ft_ 905
674 van 905
675 be_ 904
676 net 903
677 _vo 902
678 fiz 902
679 get 902
680 épe 902
681 _né 901
682 szö 901
683 gyi 899
684 egn 898
685 ezh 896
686 doz 893
687 mai 893
688 oli 892
689 von 891
690 _pa 890
691 nto 889
692 oss 889
693 alo 887
694 ősé 887
695 üks 885
696 tha 883
697 tle 882
698 ztv 882
699 gok 881
700 ori 881
701 cím 880
702 osz 880
703 tám 879
704 lő_ 876
705 áza 876
706 vek 873
707 iku 871
708 tör 871
709 _ba 868
710 kus 867
711 apo 865
712 old 865
713 lét 864
714 _év 860
715 _pe 859
716 rez 859
717 lom 858
718 rte 858
719 rde 855
720 par 854
721 _cí 848
722 tra 848
723 elh 847
724 bké 846
725 _há 844
726 dél 844
727 iti 843
728 yos 842
729 evé 841
730 rme 841
731 me_ 840
732 ig_ 839
733 kis 839
734 akk 836
735 llá 832
736 _k_ 831
737 esí 831
738 mze 831
739 tét 831
740 ént 831
741 ági 830
742 itá 829
743 lad 828
744 máj 823
745 om_ 823
746 kto 821
747 tam 821
748 bbk 820
749 tke 819
750 ers 818
751 aná 817
752 sko 817
753 ult 817
754 goz 815
755 ést 814
756 köd 812
757 oza 809
758 att 808
759 dek 803
760 ine 802
761 nci 802
762 zok 802
763 ndő 801
764 azd 800
765 ács 800
766 jár 799
767 gne 797
768 ech 796
769 tém 796
770 efi 795
771 yet 795
772 de_ 794
773 bbi 793
774 nny 793
775 sát 793
776 ozt 789
777 öbb 789
778 ősí 789
779 ják 788
780 olt 788
781 zít 788
782 las 786
783 töb 786
784 anu 784
785 két 784
786 _di 783
787 asá 783
788 igé 783
789 mog 783
790 eli 782
791 ldá 782
792 zmé 780
793 gyo 779
794 kbe 779
795 um_ 779
796 tem 778
797 ája 778
798 ka_ 777
799 juk 776
800 lmé 776
801 elü 775
802 _ku 772
803 íne 771
804 _ga 770
805 _or 770
806 _em 768
807 gfe 768
808 kba 768
809 tec 767
810 _n_ 766
811 dom 765
812 egk 762
813 rla 762
814 san 761
815 sár 760
816 _bu 757
817 rmé 757
818 ból 756
819 ol_ 754
820 orl 753
821 sel 753
822 véd 753
823 ape 752
824 ela 752
825 ozz 752
826 enc 751
827 bál 748
828 _la 747
829 mér 747
830 önb 747
831 ann 745
832 dőt 745
833 nká 745
834 teg 743
835 íte 743
836 uda 742
837 új_ 742
838 ves 739
839 zle 739
840 han 735
841 dik 734
842 _ór 733
843 mek 733
844 lje 732
845 sad 732
846 él_ 731
847 _ci 730
848 _él 729
849 etl 728
850 lja 728
851 udo 728
852 eum 726
853 ölt 725
854 _un 724
855 óds 724
856 _ft 722
857 yer 721
858 gés 720
859 őta 720
860 bek 718
861 chn 718
862 pia 718
863 tak 718
864 kom 717
865 nél 717
866 osa 717
867 pít 716
868 red 716
869 str 715
870 adá 714
871 nna 713
872 tta 713
873 rtj 712
874 tős 712
875 lke 711
876 jes 709
877 _ja 708
878 dőp 708
879 olá 707
880 lóg 706
881 szó 705
882 _ir 704
883 kul 703
884 zin 702
885 ina 701
886 rdí 700
887 dig 699
888 irá 698
889 őpo 698
890 yar 697
891 lan 696
892 agá 695
893 iac 694
894 ezt 692
895 sét 692
896 kka 689
897 ped 689
898 ulá 689
899 nác 686
900 nd_ 685
901 íme 685
902 _mú 684
903 ljá 684
904 ing 683
905 zöt 682
906 za_ 681
907 eni 680
908 ba_ 677
909 füg 677
910 kko 677
911 zös 677
912 mut 676
913 óra 676
914 llí 675
915 aka 674
916 ton 674
917 ügg 674
918 egh 673
919 isk 672
920 tív 672
921 dó_ 669
922 ódo 669
923 akt 668
924 ámá 668
925 apj 667
926 lál 667
927 zda 667
928 nat 666
929 tán 666
930 _lá 663
931 cik 662
932 _u_ 661
933 ntá 661
934 cs_ 659
935 ikk 659
936 _ut 658
937 gia 658
938 nap 658
939 tók 658
940 álá 658
941 nal 657
942 rto 657
943 áto 657
944 aki 656
945 erz 655
946 nul 655
947 ul_ 655
948 ago 654
949 atj 652
950 _m_ 651
951 dap 651
952 etv 651
953 lté 651
954 oko 651
955 lto 650
956 lós 649
957 szk 649
958 elõ 648
959 bud 647
960 gol 647
961 _pi 646
962 kra 646
963 das 645
964 ise 642
965 rál 642
966 éri 641
967 edi 640
968 vét 640
969 ece 639
970 zél 639
971 azá 637
972 son 637
973 _ar 636
974 abá 636
975 gys 636
976 gos 634
977 any 633
978 böz 633
979 nbö 633
980 zeu 633
981 ava 632
982 hal 632
983 múz 631
984 rat 630
985 úze 630
986 ein 628
987 ang 626
988 tre 626
989 orr 625
990 ssá 624
991 ede 623
992 _kí 622
993 le_ 622
994 bi_ 621
995 éde 621
996 lni 620
997 _ön 619
998 oro 619
999 ósá 619
1000 tjá 616
1001 zül 615
1002 élc 615
1003 _mű 614
1004 _pá 613
1005 ng_ 612
1006 viz 612
1007 zér 609
1008 áva 609
1009 ite 608
1010 ara 607
1011 ztá 607
1012 egi 606
1013 va_ 605
0 # Licensed to the Apache Software Foundation (ASF) under one or more
1 # contributor license agreements. See the NOTICE file distributed with
2 # this work for additional information regarding copyright ownership.
3 # The ASF licenses this file to You under the Apache License, Version 2.0
4 # (the "License"); you may not use this file except in compliance with
5 # the License. You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 um_ 9904
15 og_ 9208
16 _og 9172
17 að_ 8930
18 ar_ 8829
19 _að 7647
20 _í_ 6730
21 ur_ 6670
22 ir_ 6423
23 ið_ 6069
24 ing 6017
25 _á_ 5368
26 _er 5106
27 inn 5098
28 _vi 4520
29 er_ 4359
30 _se 4316
31 ein 4206
32 _ve 4163
33 ver 4139
34 _st 3850
35 and 3655
36 sem 3409
37 _ei 3356
38 ra_ 3330
39 til 3319
40 na_ 3289
41 nar 3273
42 _þe 3263
43 em_ 3209
44 _ti 3176
45 við 3133
46 _he 3115
47 sta 3071
48 nga 2999
49 _me 2993
50 ni_ 2881
51 gar 2863
52 ndi 2820
53 in_ 2787
54 nna 2752
55 nin 2734
56 nn_ 2721
57 _a_ 2697
58 _sk 2655
59 ta_ 2651
60 ri_ 2612
61 il_ 2588
62 st_ 2579
63 _sa 2523
64 nda 2498
65 ða_ 2497
66 ann 2480
67 ði_ 2472
68 num 2412
69 _fy 2407
70 _um 2403
71 _fr 2352
72 _ha 2327
73 _þa 2283
74 nni 2278
75 sam 2253
76 fyr 2230
77 _r_ 2094
78 un_ 2078
79 rir 2054
80 _n_ 2049
81 ngu 2038
82 und 2037
83 yri 2029
84 ns_ 2025
85 erð 2024
86 ður 2018
87 ga_ 1987
88 _va 1930
89 lan 1926
90 enn 1906
91 með 1904
92 _en 1887
93 di_ 1887
94 ins 1883
95 leg 1868
96 _s_ 1862
97 la_ 1849
98 efn 1837
99 var 1832
100 end 1828
101 _i_ 1822
102 _ge 1817
103 ru_ 1796
104 tar 1783
105 men 1757
106 _ne 1719
107 nnu 1684
108 sin 1682
109 ki_ 1672
110 _mi 1666
111 lag 1652
112 stu 1652
113 ndu 1637
114 ast 1623
115 ja_ 1623
116 vin 1623
117 ari 1579
118 eð_ 1555
119 an_ 1548
120 gin 1543
121 _al 1536
122 kja 1513
123 arf 1493
124 _af 1479
125 _in 1478
126 ti_ 1477
127 _ma 1454
128 ess 1450
129 lei 1447
130 ng_ 1438
131 man 1432
132 nu_ 1417
133 rey 1417
134 ðar 1416
135 _re 1408
136 tir 1406
137 far 1404
138 _hv 1383
139 _un 1380
140 ski 1372
141 dur 1366
142 all 1363
143 ega 1360
144 erk 1353
145 inu 1349
146 rei 1330
147 eru 1322
148 nd_ 1318
149 ngi 1301
150 _ef 1297
151 _fa 1296
152 _te 1292
153 eng 1287
154 ger 1287
155 tt_ 1283
156 tæk 1278
157 ram 1277
158 rið 1273
159 fra 1269
160 gu_ 1264
161 _t_ 1257
162 mál 1252
163 eir 1243
164 _fj 1236
165 þes 1234
166 ist 1221
167 en_ 1220
168 _sé 1217
169 da_ 1215
170 _ra 1213
171 æði 1213
172 _gr 1205
173 þei 1201
174 una 1200
175 _no 1195
176 gi_ 1191
177 tu_ 1182
178 ekk 1178
179 ang 1172
180 fni 1172
181 _le 1166
182 ræð 1165
183 tur 1165
184 haf 1150
185 unn 1150
186 fa_ 1149
187 rði 1141
188 slu 1137
189 eik 1131
190 ka_ 1131
191 _l_ 1126
192 mar 1121
193 _e_ 1119
194 hei 1114
195 kki 1089
196 kni 1088
197 ust 1087
198 rða 1082
199 upp 1082
200 hve 1081
201 rin 1077
202 eða 1076
203 ynd 1069
204 _up 1068
205 eim 1065
206 _u_ 1062
207 _ár 1057
208 _m_ 1040
209 nem 1037
210 sla 1036
211 _la 1034
212 ig_ 1030
213 mið 1030
214 óla 1028
215 gum 1022
216 for 1021
217 ars 1008
218 af_ 1003
219 ara 1001
220 eit 1000
221 nað 1000
222 skó 999
223 tti 998
224 tak 990
225 _kr 989
226 aða 985
227 lin 983
228 _þv 981
229 _ko 979
230 rna 979
231 myn 977
232 jál 964
233 gre 958
234 _má 953
235 nir 952
236 rst 951
237 kól 950
238 tin 942
239 eið 940
240 on_ 934
241 ald 932
242 ðin 930
243 _ð_ 926
244 _sv 922
245 ví_ 920
246 því 915
247 fræ 912
248 það 909
249 _ná 903
250 _g_ 902
251 _at 897
252 nám 895
253 dan 893
254 _eð 890
255 aði 887
256 fti 884
257 veg 882
258 rau 881
259 _ek 879
260 afn 878
261 _sí 868
262 tað 866
263 _an 856
264 kyn 852
265 ina 843
266 sto 843
267 rðu 840
268 sér 839
269 _fo 838
270 lað 832
271 _fl 829
272 _ís 828
273 gur 828
274 tta 828
275 is_ 823
276 dir 822
277 fur 822
278 átt 822
279 hf_ 820
280 lu_ 816
281 not 816
282 _sj 815
283 éla 815
284 nns 813
285 ran 809
286 ygg 809
287 hef 807
288 ðum 807
289 _th 804
290 ma_ 803
291 þar 802
292 _k_ 798
293 bor 796
294 gt_ 795
295 ér_ 792
296 jöl 790
297 li_ 790
298 ísl 788
299 eme 785
300 rá_ 785
301 _ke 780
302 fél 777
303 frá 775
304 fna 767
305 þjó 767
306 nan 766
307 stö 766
308 ss_ 765
309 _li 764
310 rð_ 764
311 _be 762
312 jar 761
313 nds 761
314 rsl 761
315 _f_ 757
316 lda 757
317 nnt 757
318 afa 756
319 _lí 755
320 _si 753
321 lum 753
322 álf 749
323 lla 746
324 jón 740
325 ldi 738
326 fjö 737
327 nt_ 737
328 run 737
329 kil 733
330 nig 733
331 okk 731
332 tun 731
333 rni 729
334 _ta 728
335 kar 727
336 kur 727
337 gja 726
338 _hl 722
339 lut 721
340 ers 720
341 jór 720
342 _þá 719
343 ðir 719
344 _br 718
345 _út 716
346 ðu_ 716
347 rar 713
348 the 713
349 est 711
350 rra 708
351 kom 707
352 rt_ 706
353 stj 706
354 _þj 705
355 al_ 701
356 ynn 701
357 ita 698
358 fer 697
359 fi_ 695
360 rfi 692
361 ett 689
362 gan 687
363 fin 686
364 eri 685
365 ska 684
366 era 683
367 kip 679
368 eft 675
369 ild 673
370 mi_ 671
371 org 670
372 rke 670
373 _my 667
374 mun 664
375 lög 663
376 fan 662
377 nfa 661
378 arn 658
379 agn 657
380 vei 654
381 dar 653
382 din 652
383 efu 651
384 ten 651
385 ill 649
386 egi 644
387 set 642
388 tra 642
389 ste 640
390 nun 639
391 eyr 638
392 sti 638
393 ark 637
394 tjó 636
395 mis 633
396 rtæ 633
397 öld 632
398 _ba 631
399 ern 631
400 _eh 626
401 iði 626
402 skr 626
403 ehf 625
404 gna 625
405 ggi 623
406 kun 623
407 ótt 623
408 _hj 622
409 han 617
410 ags 616
411 ikn 616
412 raf 616
413 tla 616
414 _of 614
415 _tö 614
416 _bo 612
417 nsk 612
418 töl 611
419 eig 610
420 anf 608
421 tei 608
422 ent 603
423 vík 603
424 net 602
425 rum 602
426 _áh 601
427 ion 601
428 orð 601
429 ken 600
430 yfi 599
431 lar 598
432 stæ 597
433 tan 597
434 rfs 595
435 ngs 594
436 her 593
437 nið 592
438 nst 591
439 iti 589
440 _au 588
441 lau 588
442 ld_ 588
443 tum 587
444 ráð 585
445 he_ 582
446 eyk 580
447 lun 579
448 fir 578
449 lli 578
450 sjá 577
451 _tí 576
452 irt 574
453 _há 573
454 ama 573
455 kin 573
456 hug 572
457 erf 571
458 arl 570
459 hlu 570
460 fær 568
461 hjá 568
462 ban 565
463 jóð 565
464 síð 563
465 vir 563
466 rit 561
467 ind 557
468 eld 556
469 aví 555
470 ldu 555
471 lið 555
472 töð 554
473 vor 553
474 fun 552
475 iða 551
476 nus 551
477 iðs 550
478 ækj 549
479 lit 548
480 gru 547
481 ini 546
482 ykj 545
483 tal 544
484 hal 543
485 kef 543
486 _ga 542
487 eil 542
488 str 542
489 arð 541
490 son 541
491 irk 540
492 jav 540
493 _d_ 539
494 dum 539
495 ker 539
496 rkj 539
497 fja 537
498 aðu 535
499 _fe 534
500 ter 533
501 ík_ 532
502 önn 527
503 irr 525
504 am_ 524
505 min 523
506 ota 522
507 bro 520
508 aus 515
509 aga 514
510 _tr 511
511 lýs 510
512 ðst 510
513 ans 506
514 mik 506
515 get 502
516 _þr 500
517 saf 500
518 fnu 499
519 iki 498
520 rot 497
521 _lo 496
522 iðu 495
523 aka 494
524 eyt 494
525 _hú 492
526 _vo 492
527 kju 492
528 vel 492
529 len 491
530 sle 490
531 val 490
532 _hæ 489
533 þá_ 489
534 nis 488
535 rri 487
536 ögu 487
537 áms 486
538 _hi 484
539 ætt 484
540 _fi 482
541 arg 481
542 aðr 481
543 rla 480
544 íða 479
545 _úr 478
546 ll_ 478
547 hel 476
548 _ky 473
549 ttu 473
550 ala 472
551 art 472
552 mil 472
553 íma 471
554 _ný 469
555 ams 467
556 lok 464
557 hey 461
558 _fé 460
559 hús 460
560 kum 459
561 _ka 458
562 ens 456
563 ðra 456
564 ölv 456
565 sum 455
566 amt 454
567 si_ 454
568 ske 454
569 aut 453
570 kku 453
571 nas 453
572 ýsi 453
573 _lö 452
574 at_ 452
575 _hö 451
576 já_ 451
577 agi 450
578 uðu 448
579 ðan 448
580 öðu 447
581 _h_ 446
582 auð 446
583 ipt 446
584 sku 446
585 bre 443
586 _ja 442
587 sa_ 442
588 unu 442
589 lík 441
590 _da 440
591 ber 440
592 im_ 440
593 tof 439
594 ðni 438
595 int 437
596 nte 437
597 _áf 436
598 ds_ 436
599 ung 435
600 æki 435
601 itt 434
602 _hu 432
603 lis 432
604 étt 432
605 _fó 431
606 _o_ 431
607 tni 431
608 fat 428
609 lvu 427
610 ljó 426
611 ðal 426
612 _v_ 424
613 ule 424
614 _tæ 423
615 _pr 422
616 kka 422
617 tím 421
618 rs_ 420
619 ile 419
620 kvæ 419
621 of_ 419
622 _yf 418
623 aðs 417
624 byg 417
625 sso 417
626 _b_ 416
627 bra 416
628 dag 414
629 oru 414
630 ali 413
631 ku_ 413
632 egn 412
633 egu 410
634 rsk 410
635 tvi 410
636 _su 409
637 auk 409
638 bók 409
639 tek 409
640 es_ 408
641 kið 408
642 ssu 408
643 _ör 407
644 _fu 406
645 lt_ 406
646 rne 406
647 _by 404
648 iss 404
649 kri 404
650 ónu 404
651 ári 402
652 ækn 402
653 úr_ 402
654 ðsl 401
655 les 400
656 líf 400
657 nsl 400
658 rif 400
659 kra 398
660 ssa 398
661 _or 397
662 rf_ 397
663 órn 396
664 ggj 395
665 ssi 395
666 ert 394
667 þeg 393
668 _ok 392
669 rét 391
670 tis 391
671 gir 390
672 kna 388
673 hen 386
674 llu 386
675 tið 383
676 asa 381
677 atl 381
678 mei 379
679 tíð 379
680 _kl 378
681 jum 378
682 ðun 378
683 _ég 376
684 ég_ 376
685 fól 375
686 öll 375
687 ólk 374
688 atv 371
689 _co 370
690 jal 370
691 _j_ 369
692 ism 369
693 vef 369
694 _mu 368
695 áfa 367
696 iðn 366
697 búa 365
698 nle 365
699 yrn 363
700 gð_ 361
701 sig 361
702 _sp 360
703 ag_ 360
704 lur 360
705 tri 360
706 rfa 359
707 urð 359
708 ær_ 359
709 par 358
710 hva 357
711 rle 356
712 rn_ 356
713 svo 355
714 bún 353
715 nta 353
716 it_ 352
717 ryg 352
718 ögð 351
719 _ó_ 349
720 akl 349
721 alm 348
722 ris 348
723 els 347
724 _hr 346
725 fle 346
726 ðis 346
727 ft_ 345
728 ála 345
729 llt 343
730 _fæ 340
731 _þ_ 340
732 gun 340
733 tio 340
734 ppl 339
735 rka 339
736 íðu 339
737 dót 337
738 áhe 337
739 erj 336
740 höf 335
741 kli 335
742 eti 334
743 yrk 334
744 ef_ 332
745 hin 332
746 lst 332
747 sen 332
748 ell 331
749 _bæ 330
750 _na 330
751 etu 330
752 el_ 329
753 eta 328
754 ed_ 326
755 uta 326
756 svi 325
757 yti 325
758 _rá 324
759 afl 324
760 kr_ 324
761 tök 324
762 ilk 323
763 rga 323
764 bar 321
765 taf 320
766 uð_ 320
767 vo_ 320
768 æri 320
769 sve 319
770 flu 318
771 sjó 318
772 jaf 317
773 oma 317
774 plý 316
775 ál_ 316
776 ókn 316
777 sst 315
778 vör 315
779 pp_ 314
780 anu 313
781 sar 313
782 óða 313
783 _ið 312
784 msk 312
785 ort 312
786 reg 312
787 _sö 311
788 lky 311
789 ons 311
790 ros 311
791 tku 311
792 las 310
793 sín 310
794 ull 310
795 mur 309
796 otk 309
797 _þó 308
798 _hé 306
799 afi 306
800 lfu 306
801 _læ 305
802 _sm 305
803 amk 305
804 ani 305
805 gið 305
806 ára 305
807 íbú 305
808 öry 305
809 jár 304
810 örn 304
811 _íb 303
812 igi 303
813 imi 303
814 _fá 302
815 væm 302
816 ati 300
817 urs 300
818 ðas 300
819 _bi 299
820 gs_ 299
821 nur 299
822 et_ 298
823 etn 298
824 ren 298
825 rki 298
826 sko 298
827 kle 297
828 úa_ 297
829 ldr 296
830 tas 296
831 _bó 295
832 _ým 295
833 or_ 295
834 gri 294
835 kan 294
836 álp 294
837 gas 293
838 mæl 292
839 orm 292
840 sók 292
841 vað 292
842 mt_ 291
843 gni 290
844 glu 289
845 ls_ 289
846 nef 289
847 þet 289
848 ley 288
849 arv 287
850 æða 287
851 lme 286
852 uga 286
853 þjá 286
854 ut_ 285
855 úna 285
856 ost 284
857 sýn 284
858 ðla 284
859 dra 283
860 gen 283
861 ákv 283
862 _mæ 282
863 _öl 282
864 krá 282
865 fal 281
866 kað 281
867 au_ 280
868 ift 280
869 ona 280
870 _hó 279
871 eg_ 279
872 ötu 279
873 _sn 278
874 egg 278
875 ægt 278
876 æðu 278
877 gag 277
878 tel 277
879 þan 277
880 aðg 275
881 rg_ 275
882 þau 275
883 iga 274
884 oða 274
885 rek 274
886 ðge 274
887 þát 274
888 rli 273
889 gra 272
890 ila 272
891 áðu 272
892 jas 271
893 ðfe 271
894 arh 270
895 fel 270
896 hön 270
897 bei 269
898 nor 269
899 re_ 269
900 su_ 269
901 ár_ 269
902 ún_ 269
903 itu 268
904 uni 268
905 hér 267
906 oss 267
907 jör 265
908 væð 265
909 ætl 265
910 ask 264
911 jun 264
912 uri 264
913 amb 263
914 ema 263
915 iðl 263
916 _gu 262
917 dal 262
918 pur 262
919 smu 262
920 stó 262
921 ofn 260
922 ot_ 260
923 try 260
924 vél 260
925 afr 259
926 gis 259
927 _þæ 257
928 fre 257
929 pa_ 257
930 rma 257
931 ana 256
932 hóp 256
933 kis 256
934 ndr 256
935 rík 256
936 ts_ 256
937 fla 255
938 mst 255
939 sé_ 255
940 tuð 255
941 _is 254
942 öru 254
943 _nú 253
944 _tv 253
945 fst 253
946 ika 253
947 ili 253
948 van 253
949 íðn 253
950 _de 252
951 kos 252
952 urn 252
953 ám_ 252
954 _ri 251
955 _sl 251
956 ful 251
957 ors 251
958 ðil 251
959 ög_ 251
960 _to 250
961 egl 249
962 eyn 249
963 sky 249
964 má_ 248
965 oft 248
966 rgu 248
967 hri 247
968 mkv 247
969 öku 247
970 bur 246
971 fn_ 246
972 fum 246
973 gil 246
974 aft 245
975 kif 245
976 ðið 245
977 jan 244
978 sdó 244
979 sme 244
980 lpa 243
981 rgi 243
982 tæð 243
983 aun 242
984 æmi 242
985 _ás 241
986 _ö_ 241
987 avi 241
988 _pa 240
989 _öð 240
990 pta 240
991 fjá 238
992 íka 238
993 fyl 237
994 hre 237
995 hæg 237
996 nal 237
997 tve 237
998 aml 236
999 kal 236
1000 gul 235
1001 rað 235
1002 rk_ 235
1003 _væ 234
1004 tær 234
1005 _næ 231
1006 göt 231
1007 mót 231
1008 róf 231
1009 _ræ 230
1010 as_ 229
1011 egt 229
1012 öng 229
1013 _ho 228
0 # Licensed to the Apache Software Foundation (ASF) under one or more
1 # contributor license agreements. See the NOTICE file distributed with
2 # this work for additional information regarding copyright ownership.
3 # The ASF licenses this file to You under the Apache License, Version 2.0
4 # (the "License"); you may not use this file except in compliance with
5 # the License. You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 to_ 75545
15 _di 71801
16 _co 69441
17 _de 69420
18 ion 65997
19 re_ 61205
20 ne_ 59892
21 la_ 58690
22 ent 56018
23 di_ 55565
24 one 50667
25 _in 50615
26 le_ 48644
27 del 44222
28 zio 42592
29 che 41776
30 ell 41403
31 he_ 41009
32 _pr 40198
33 ti_ 40157
34 te_ 39046
35 con 38401
36 _ch 36586
37 men 35513
38 nte 35359
39 no_ 34461
40 _un 34023
41 _la 32971
42 _e_ 32966
43 per 32168
44 _pe 30225
45 azi 29233
46 lla 28803
47 _qu 28795
48 _al 28585
49 are 26622
50 ta_ 26401
51 li_ 26117
52 in_ 25316
53 _ri 25268
54 _si 24870
55 com 24561
56 _no 23220
57 nto 23173
58 _il 22688
59 il_ 22487
60 pro 22263
61 ni_ 22123
62 on_ 21511
63 sta 21407
64 er_ 21129
65 io_ 21113
66 est 21043
67 si_ 20926
68 ame 20419
69 ere 20411
70 pre 20275
71 ato 20092
72 ssi 20073
73 _pa 19646
74 _po 19609
75 _so 19277
76 el_ 19187
77 ono 19085
78 ess 18759
79 all 18645
80 _ne 18598
81 ett 18428
82 ia_ 18278
83 ati 18029
84 que 17941
85 _a_ 17921
86 ro_ 17878
87 ll_ 17740
88 ter 17687
89 _se 17583
90 gli 17425
91 res 17381
92 _re 16700
93 rop 16632
94 tat 16350
95 un_ 16325
96 ra_ 16189
97 mo_ 16135
98 tti 16134
99 _da 16074
100 nti 16049
101 tra 15995
102 ale 15977
103 _st 15870
104 ri_ 15863
105 att 15813
106 tto 15623
107 ica 15621
108 ost 15621
109 lle 15578
110 na_ 15453
111 par 15395
112 _su 15319
113 _l_ 15307
114 ei_ 14940
115 ali 14864
116 non 14823
117 _le 14629
118 sio 14621
119 do_ 14374
120 ant 14130
121 oni 14051
122 _es 14023
123 _è_ 13824
124 tà_ 13644
125 ari 13494
126 ues 13356
127 pos 12935
128 str 12738
129 lo_ 12580
130 _ma 12485
131 so_ 12466
132 iam 12421
133 sti 12321
134 esi 12224
135 tiv 12178
136 se_ 12143
137 ont 11693
138 uro 11613
139 tta 11450
140 una 11383
141 enz 11379
142 nel 11367
143 tic 11356
144 _an 11345
145 _i_ 11290
146 ma_ 11274
147 za_ 11255
148 nta 11191
149 rat 11122
150 uni 11097
151 eri 11095
152 _mo 11084
153 ico 11075
154 _me 10854
155 qua 10852
156 ist 10840
157 mis 10811
158 ten 10727
159 nor 10687
160 ca_ 10641
161 eur 10545
162 iss 10490
163 col 10446
164 ore 10443
165 _eu 10415
166 mmi 10402
167 ver 10359
168 ità 10328
169 ste 10250
170 int 10197
171 _mi 10182
172 amo 10081
173 sto 10068
174 omm 10058
175 rit 10043
176 _tr 9969
177 al_ 9745
178 ide 9732
179 anc 9652
180 lit 9564
181 ope 9534
182 vol 9532
183 tan 9493
184 utt 9489
185 bil 9455
186 tar 9412
187 da_ 9397
188 tro 9387
189 era 9354
190 nza 9246
191 _ci 9155
192 ori 9088
193 ire 9062
194 _fa 9045
195 cor 9036
196 olt 9023
197 dei 8945
198 ons 8906
199 ris 8902
200 sen 8831
201 oli 8814
202 _ca 8777
203 ran 8748
204 chi 8704
205 tor 8691
206 ese 8677
207 me_ 8656
208 end 8651
209 ric 8605
210 sse 8407
211 iti 8371
212 ndo 8367
213 tut 8301
214 den 8290
215 ora 8279
216 ser 8244
217 ito 8193
218 son 8175
219 sig 8170
220 ond 8127
221 der 8099
222 ili 8076
223 rio 8046
224 _do 8004
225 ann 7982
226 ssa 7934
227 ort 7920
228 _te 7909
229 ina 7871
230 por 7846
231 gra 7838
232 ita 7826
233 ntr 7822
234 pri 7762
235 izi 7696
236 sid 7674
237 and 7657
238 tal 7657
239 _tu 7654
240 ano 7599
241 _ha 7588
242 ata 7581
243 ci_ 7579
244 nzi 7541
245 _pi 7423
246 art 7418
247 _im 7395
248 ndi 7333
249 ome 7328
250 sso 7247
251 sul 7163
252 zia 7136
253 gio 7125
254 ini 7122
255 fic 7087
256 spe 7067
257 erc 7060
258 oss 7023
259 rev 6990
260 dir 6989
261 for 6966
262 dal 6903
263 mi_ 6885
264 co_ 6875
265 _vo 6871
266 ual 6851
267 app 6828
268 va_ 6789
269 evo 6782
270 po_ 6776
271 ura 6729
272 dis 6706
273 ona 6693
274 anz 6668
275 ass 6642
276 _fo 6634
277 ggi 6630
278 imp 6597
279 sar 6565
280 _vi 6543
281 nsi 6524
282 ela 6514
283 gno 6498
284 nch 6490
285 tri 6480
286 sia 6448
287 ene 6423
288 sa_ 6410
289 nal 6397
290 ate 6330
291 ria 6323
292 iar 6298
293 iva 6298
294 fin 6284
295 rta 6282
296 opo 6273
297 nda 6268
298 cia 6253
299 _ve 6250
300 tre 6242
301 rre 6209
302 acc 6203
303 _gi 6193
304 itt 6191
305 _ad 6176
306 ien 6172
307 nno 6115
308 ola 6115
309 tte 6106
310 olo 6098
311 arl 6081
312 zza 6078
313 sol 6039
314 ici 6015
315 ich 5947
316 isc 5946
317 _ta 5888
318 ppo 5876
319 _li 5848
320 ven 5844
321 _lo 5801
322 _ra 5780
323 ive 5757
324 ole 5742
325 ltr 5723
326 pol 5719
327 tur 5705
328 vor 5701
329 lam 5695
330 mpo 5692
331 raz 5652
332 llo 5636
333 man 5626
334 _fi 5593
335 alt 5590
336 ine 5573
337 ero 5566
338 ime 5554
339 rla 5544
340 min 5543
341 ott 5517
342 ve_ 5500
343 orm 5499
344 _pu 5491
345 ea_ 5467
346 _at 5444
347 lio 5443
348 mer 5437
349 rel 5413
350 cos 5401
351 nde 5376
352 _sa 5369
353 _gr 5358
354 rti 5355
355 nos 5292
356 rim 5292
357 ign 5279
358 ani 5248
359 ren 5234
360 _as 5209
361 orr 5190
362 tit 5148
363 ior 5147
364 ial 5142
365 _gl 5140
366 _ac 5138
367 rma 5111
368 nio 5110
369 _ap 5097
370 ha_ 5085
371 ers 5083
372 isp 5066
373 ebb 5047
374 ern 5032
375 _av 5021
376 ile 5020
377 laz 4961
378 bia 4922
379 cco 4903
380 omp 4891
381 iù_ 4881
382 più 4873
383 uzi 4858
384 ce_ 4851
385 spo 4851
386 vo_ 4849
387 inc 4842
388 ces 4825
389 _am 4795
390 ui_ 4785
391 pet 4776
392 cat 4716
393 ivi 4693
394 uto 4690
395 occ 4682
396 tem 4669
397 ove 4653
398 nit 4643
399 ara 4617
400 izz 4572
401 fer 4544
402 ert 4539
403 lat 4536
404 _sp 4524
405 eci 4508
406 ret 4505
407 ull 4489
408 onc 4483
409 uan 4468
410 bbi 4464
411 _or 4455
412 _on 4454
413 rà_ 4453
414 reg 4451
415 abi 4447
416 ifi 4420
417 oro 4415
418 eme 4414
419 oll 4383
420 mat 4382
421 car 4368
422 egl 4365
423 vi_ 4359
424 ind 4351
425 nci 4339
426 ttu 4328
427 ral 4317
428 omu 4310
429 igl 4303
430 ner 4294
431 itu 4289
432 bbe 4279
433 de_ 4259
434 leg 4259
435 agg 4256
436 emp 4243
437 eco 4236
438 rte 4226
439 ie_ 4209
440 ord 4208
441 ece 4201
442 giu 4158
443 _cr 4157
444 sit 4138
445 avo 4132
446 reb 4093
447 cen 4080
448 emb 4066
449 _va 4062
450 or_ 4045
451 osi 4030
452 ai_ 4028
453 mun 4020
454 _cu 4018
455 pea 4005
456 iat 4001
457 nat 3990
458 ima 3958
459 ede 3920
460 ibi 3918
461 _sc 3902
462 egi 3897
463 upp 3876
464 rie 3875
465 _ai 3862
466 amb 3862
467 cit 3852
468 ivo 3813
469 dic 3805
470 rog 3804
471 erm 3803
472 tua 3795
473 rar 3792
474 lar 3772
475 lor 3770
476 ché 3767
477 hé_ 3767
478 iso 3767
479 mod 3766
480 deg 3757
481 odo 3749
482 ens 3745
483 mbi 3732
484 ue_ 3711
485 _nu 3702
486 lta 3698
487 erv 3682
488 lic 3680
489 aff 3678
490 nco 3627
491 sem 3625
492 rov 3624
493 ino 3619
494 cre 3618
495 ian 3616
496 sis 3588
497 cer 3583
498 mbr 3579
499 iri 3567
500 qui 3543
501 aes 3531
502 imi 3527
503 _au 3518
504 go_ 3494
505 pae 3493
506 eve 3469
507 gen 3464
508 tes 3449
509 ult 3443
510 pon 3433
511 rin 3421
512 _ab 3414
513 _sv 3406
514 tam 3393
515 hia 3375
516 ema 3368
517 err 3365
518 _ba 3362
519 omi 3347
520 ram 3329
521 _af 3319
522 fat 3313
523 alc 3302
524 iet 3302
525 dam 3294
526 nom 3280
527 rso 3275
528 ffi 3255
529 via 3254
530 _ec 3253
531 imo 3250
532 be_ 3241
533 gua 3241
534 esp 3224
535 rsi 3221
536 uti 3218
537 fon 3212
538 ure 3208
539 ppr 3205
540 ite 3186
541 dov 3182
542 ad_ 3171
543 oci 3168
544 cui 3165
545 _fr 3164
546 isi 3161
547 nan 3151
548 soc 3130
549 uta 3126
550 rif 3116
551 mol 3115
552 _ag 3111
553 rea 3096
554 ard 3088
555 mpi 3086
556 seg 3085
557 ova 3083
558 inf 3072
559 pen 3064
560 pot 3062
561 _ce 3059
562 abb 3055
563 edi 3053
564 din 3036
565 des 3029
566 opr 3028
567 vis 3024
568 ron 3012
569 roc 2961
570 sco 2943
571 dev 2938
572 cio 2937
573 uar 2937
574 naz 2935
575 tim 2935
576 rem 2911
577 bri 2897
578 dia 2893
579 ice 2875
580 uel 2866
581 far 2861
582 lto 2857
583 sci 2854
584 _oc 2851
585 hi_ 2849
586 niz 2839
587 lia 2815
588 _na 2814
589 ngo 2808
590 ber 2806
591 ezz 2805
592 egu 2804
593 _og 2796
594 div 2795
595 lme 2792
596 unt 2792
597 pli 2786
598 liz 2774
599 cce 2773
600 ume 2765
601 lem 2754
602 _ar 2748
603 _ge 2747
604 mpr 2743
605 ors 2741
606 rno 2740
607 lav 2739
608 rca 2734
609 mem 2732
610 bie 2699
611 can 2695
612 ras 2686
613 uov 2685
614 ffe 2684
615 nse 2682
616 rig 2675
617 vit 2664
618 eo_ 2663
619 rod 2658
620 cun 2656
621 ote 2648
622 taz 2639
623 amm 2634
624 rda 2631
625 lan 2619
626 tin 2611
627 eno 2610
628 _du 2600
629 dif 2595
630 hie 2590
631 van 2590
632 cip 2589
633 rdi 2586
634 ing 2583
635 lti 2581
636 rch 2580
637 ele 2575
638 ust 2572
639 sce 2554
640 tio 2547
641 eng 2545
642 _el 2543
643 ave 2542
644 emo 2542
645 rec 2540
646 oi_ 2537
647 egn 2536
648 rme 2528
649 tru 2527
650 oce 2526
651 dat 2525
652 sib 2521
653 rap 2519
654 mpe 2518
655 nic 2512
656 _o_ 2511
657 _em 2503
658 nce 2503
659 vil 2488
660 ble 2476
661 _op 2475
662 ego 2470
663 iut 2465
664 inv 2460
665 rmi 2457
666 ogg 2453
667 adi 2449
668 _bi 2427
669 nar 2390
670 pa_ 2388
671 sca 2388
672 evi 2385
673 ete 2380
674 lcu 2379
675 fro 2354
676 nuo 2340
677 alm 2337
678 ger 2327
679 _be 2316
680 rag 2309
681 cam 2306
682 rci 2296
683 rob 2295
684 sos 2294
685 orn 2286
686 olu 2284
687 cur 2281
688 cis 2277
689 ier 2276
690 dot 2273
691 sim 2265
692 sic 2260
693 onf 2257
694 ced 2251
695 opp 2250
696 eni 2244
697 mon 2244
698 lin 2241
699 peo 2238
700 asp 2232
701 gia 2230
702 nve 2229
703 ze_ 2229
704 scu 2212
705 _fu 2208
706 han 2206
707 nec 2204
708 zi_ 2203
709 ccu 2196
710 esc 2192
711 ogr 2189
712 val 2182
713 dur 2178
714 _is 2174
715 dar 2173
716 fra 2172
717 cas 2171
718 ota 2170
719 rei 2169
720 ua_ 2164
721 rad 2163
722 emi 2161
723 teg 2150
724 lim 2145
725 nsa 2144
726 mpl 2143
727 dec 2135
728 set 2120
729 agl 2115
730 lib 2105
731 mar 2102
732 opa 2102
733 cup 2101
734 isu 2097
735 svi 2097
736 ega 2094
737 ovr 2092
738 esa 2088
739 paz 2087
740 _lu 2084
741 nis 2082
742 obl 2065
743 asi 2064
744 rna 2061
745 rez 2060
746 cri 2059
747 iò_ 2057
748 pia 2056
749 ciò 2054
750 uss 2045
751 cie 2040
752 rdo 2032
753 ilu 2028
754 gol 2026
755 su_ 2022
756 ade 2015
757 igu 2015
758 _ed 2013
759 nze 2006
760 mag 2002
761 lup 2001
762 gni 1999
763 sec 1999
764 icu 1987
765 rib 1980
766 obi 1973
767 eva 1972
768 vat 1969
769 avi 1968
770 met 1968
771 pun 1967
772 rto 1966
773 vre 1962
774 _d_ 1958
775 pio 1957
776 riv 1955
777 ros 1945
778 uno 1943
779 bas 1937
780 rol 1932
781 upa 1932
782 ed_ 1930
783 nut 1929
784 ben 1927
785 obb 1924
786 _en 1915
787 nei 1914
788 siz 1913
789 _ob 1909
790 det 1908
791 iun 1897
792 eff 1890
793 zaz 1880
794 bli 1877
795 mic 1870
796 rse 1868
797 atu 1865
798 gan 1863
799 gar 1861
800 ogn 1856
801 mit 1855
802 ado 1852
803 _ef 1847
804 ute 1846
805 avv 1842
806 gna 1834
807 enu 1832
808 ana 1821
809 bit 1816
810 mes 1815
811 red 1815
812 bbl 1813
813 nqu 1811
814 let 1807
815 var 1799
816 une 1797
817 tir 1794
818 nni 1789
819 uma 1789
820 dim 1786
821 ius 1783
822 mig 1779
823 ubb 1778
824 ila 1776
825 uin 1771
826 ga_ 1769
827 pes 1769
828 amp 1768
829 gi_ 1765
830 get 1746
831 ul_ 1744
832 san 1739
833 ins 1733
834 lte 1724
835 caz 1718
836 ena 1715
837 sì_ 1714
838 odi 1705
839 don 1697
840 ied 1697
841 rid 1687
842 ife 1677
843 diz 1676
844 aus 1671
845 ida 1663
846 uri 1663
847 ved 1657
848 uaz 1653
849 lut 1651
850 sua 1651
851 cus 1647
852 vel 1635
853 ezi 1628
854 suo 1628
855 rup 1626
856 già 1609
857 ià_ 1609
858 ast 1606
859 edo 1606
860 luz 1606
861 eal 1604
862 aut 1603
863 mme 1600
864 ung 1600
865 gru 1597
866 vot 1596
867 erà 1592
868 rni 1586
869 mil 1581
870 idi 1576
871 oma 1565
872 nol 1562
873 _ev 1561
874 dit 1558
875 _ga 1546
876 log 1544
877 agi 1535
878 eta 1534
879 maz 1534
880 neg 1530
881 può 1529
882 uò_ 1529
883 ecc 1527
884 clu 1526
885 cci 1519
886 tab 1519
887 ibe 1516
888 lus 1514
889 ibu 1511
890 pi_ 1510
891 rve 1510
892 sot 1510
893 vvi 1509
894 gue 1499
895 ogl 1497
896 sat 1497
897 pra 1491
898 eti 1489
899 tav 1489
900 osc 1488
901 _ul 1487
902 ane 1485
903 ace 1484
904 lis 1473
905 otr 1472
906 rut 1471
907 dan 1470
908 zat 1469
909 gov 1468
910 _go 1458
911 ars 1449
912 asc 1448
913 dob 1448
914 ghi 1448
915 tad 1437
916 sch 1435
917 mio 1426
918 osa 1425
919 tec 1421
920 mma 1419
921 ovi 1418
922 rne 1418
923 spi 1418
924 alu 1416
925 nso 1416
926 sab 1414
927 noi 1413
928 iff 1411
929 ava 1407
930 ttr 1398
931 arm 1394
932 gui 1394
933 nia 1394
934 _ho 1391
935 nam 1390
936 usi 1390
937 ho_ 1386
938 pub 1383
939 nfo 1382
940 cca 1375
941 fav 1372
942 lli 1372
943 vid 1372
944 raf 1371
945 uir 1369
946 org 1366
947 uo_ 1365
948 mia 1360
949 cal 1358
950 gge 1355
951 _os 1354
952 cap 1354
953 efi 1353
954 pie 1351
955 rro 1349
956 rga 1348
957 vve 1347
958 med 1346
959 oti 1345
960 lie 1344
961 api 1338
962 liv 1337
963 età 1336
964 rvi 1330
965 tuz 1330
966 rav 1328
967 odu 1327
968 spa 1327
969 idu 1322
970 sor 1322
971 aro 1320
972 ase 1320
973 nga 1319
974 ovo 1318
975 inu 1315
976 uit 1313
977 erg 1305
978 edu 1303
979 fet 1300
980 pur 1299
981 _er 1296
982 orz 1296
983 ism 1295
984 til 1295
985 alo 1292
986 rom 1287
987 _az 1280
988 agr 1279
989 nge 1279
990 ngr 1278
991 uis 1278
992 _ru 1277
993 aci 1270
994 sin 1268
995 onv 1266
996 ssu 1264
997 pas 1260
998 pec 1256
999 sur 1251
1000 _fe 1248
1001 nir 1248
1002 nsu 1241
1003 tol 1240
1004 pit 1238
1005 tia 1234
1006 gon 1232
1007 due 1231
1008 nea 1231
1009 ddi 1230
1010 vam 1229
1011 aiu 1220
1012 ipa 1218
1013 ipi 1217
0 # Licensed to the Apache Software Foundation (ASF) under one or more
1 # contributor license agreements. See the NOTICE file distributed with
2 # this work for additional information regarding copyright ownership.
3 # The ASF licenses this file to You under the Apache License, Version 2.0
4 # (the "License"); you may not use this file except in compliance with
5 # the License. You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 ini 159130
15 s_p 136654
16 ijo 122792
17 usi 109788
18 jos 107645
19 ien 102237
20 iau 101203
21 tin 98655
22 ali 97119
23 aus 96445
24 s_s 90636
25 o_p 82919
26 pas 76537
27 iet 75896
28 uvo 73539
29 ink 72204
30 kai 70264
31 met 69573
32 oje 69524
33 s_i 69391
34 sta 69212
35 s_a 68807
36 etu 68576
37 lie 67524
38 tai 67366
39 s_k 67201
40 iai 65565
41 pri 64365
42 cij 63754
43 pra 62815
44 ent 62543
45 tas 61206
46 uri 60404
47 die 60339
48 s_v 59941
49 s_t 59421
50 ant 59244
51 kur 58824
52 ist 58773
53 ais 58720
54 par 57557
55 tar 56888
56 i_p 56639
57 min 55905
58 mas 55042
59 lai 53592
60 adi 53477
61 nin 53428
62 imo 53282
63 eri 52816
64 gal 52650
65 rin 52324
66 ius 51954
67 vie 51668
68 ina 51595
69 tuv 51364
70 s_d 51340
71 inė 50967
72 s_n 50915
73 o_s 50529
74 per 50345
75 asi 50040
76 čia 49622
77 sti 49324
78 ria 48966
79 s_b 48614
80 tik 48489
81 ų_p 47566
82 tei 47284
83 s_m 46994
84 pro 46728
85 ija 46622
86 int 46240
87 lin 46223
88 oli 45286
89 aug 45194
90 nės 45165
91 iki 45097
92 val 44965
93 kad 44762
94 rei 44727
95 eik 44595
96 ran 43262
97 ios 42986
98 art 42983
99 iam 42904
100 ama 42389
101 dar 41920
102 tra 41800
103 ari 41088
104 kar 40704
105 buv 40452
106 išk 40340
107 i_s 39698
108 nių 39538
109 toj 39340
110 o_k 39240
111 uot 39079
112 avo 38925
113 sto 38437
114 aik 38434
115 sak 38392
116 pat 38170
117 oja 38106
118 nti 38046
119 and 37960
120 nuo 37925
121 o_a 37528
122 vai 37109
123 s_r 37027
124 rie 36575
125 ima 36250
126 dži 36150
127 ino 35879
128 kom 35652
129 enį 35646
130 ies 35440
131 aip 35412
132 o_t 35270
133 uos 35175
134 vos 35022
135 eli 35016
136 sav 34879
137 ame 34802
138 nas 34747
139 men 34734
140 est 34700
141 s_g 34575
142 kas 34537
143 end 34503
144 ų_s 34397
145 ras 34268
146 nis 34174
147 kal 34092
148 ori 34018
149 pre 33908
150 avi 33907
151 rti 33845
152 lio 33617
153 nes 33416
154 ėjo 33094
155 ren 33046
156 o_v 33041
157 o_m 33007
158 tur 32939
159 s_l 32885
160 ika 32882
161 ter 32591
162 aut 32530
163 nio 32467
164 mon 32251
165 sia 31903
166 ris 31726
167 lia 31598
168 eis 31331
169 ų_k 31214
170 sio 31193
171 var 30935
172 pir 30876
173 nia 30796
174 ose 30724
175 aci 30646
176 etų 30333
177 eni 30096
178 eig 29982
179 iti 29963
180 rad 29834
181 ats 29741
182 ing 29643
183 nka 29366
184 kon 29259
185 mis 29240
186 aud 29189
187 eči 29187
188 omi 29121
189 tos 28908
190 o_i 28902
191 a_p 28666
192 nta 28206
193 e_p 28094
194 irt 28062
195 iko 28035
196 i_i 28032
197 kla 28004
198 are 27968
199 din 27854
200 ili 27784
201 i_n 27745
202 tis 27740
203 lau 27597
204 tat 27590
205 kin 27293
206 nau 27284
207 asa 27223
208 o_d 27142
209 o_n 27018
210 kel 26795
211 vis 26752
212 ati 26751
213 irm 26533
214 jam 26466
215 auj 26466
216 ų_m 26457
217 oma 26174
218 o_r 26138
219 s_į 26036
220 gia 26032
221 oni 25744
222 vir 25725
223 tor 25626
224 riu 25603
225 nči 25544
226 sus 25479
227 i_k 25472
228 oti 25423
229 vyk 25288
230 iek 25282
231 gin 25205
232 ala 25159
233 ntr 25023
234 i_a 25011
235 sij 24954
236 i_t 24940
237 ici 24880
238 kos 24822
239 enk 24585
240 čių 24491
241 sau 24328
242 eti 24311
243 raš 24191
244 joj 24153
245 ane 24141
246 žia 24121
247 sie 24115
248 ams 24113
249 ben 24113
250 auk 24112
251 ita 24110
252 isi 24026
253 dau 24009
254 pie 23921
255 str 23827
256 i_b 23715
257 lan 23667
258 imu 23615
259 ų_a 23535
260 tvi 23473
261 ide 23466
262 ndr 23419
263 ato 23399
264 dėl 23394
265 api 23356
266 rij 23263
267 nto 23215
268 yra 23185
269 ova 23113
270 čio 23058
271 ste 22985
272 man 22817
273 lit 22796
274 iuo 22568
275 ani 22553
276 aty 22469
277 den 22448
278 ket 22424
279 kia 22396
280 ojo 22359
281 į_p 22318
282 anč 22302
283 nim 22280
284 šal 22185
285 ank 22164
286 ena 22156
287 ų_i 22156
288 nus 22144
289 jus 22125
290 ona 22054
291 yti 22020
292 ami 22001
293 arb 21994
294 sis 21922
295 rio 21744
296 mok 21682
297 did 21672
298 ybė 21613
299 ver 21543
300 ska 21465
301 rau 21450
302 ekt 21440
303 e_s 21419
304 imą 21261
305 tus 21109
306 vil 21091
307 ven 21066
308 žin 21062
309 ara 21052
310 rus 21051
311 nij 21022
312 nam 21022
313 kti 20974
314 ado 20929
315 eta 20911
316 tuo 20890
317 rta 20873
318 s_š 20708
319 ion 20705
320 lių 20689
321 vei 20533
322 lis 20474
323 rių 20379
324 ast 20374
325 nai 20365
326 čiu 20363
327 tie 20293
328 mos 20283
329 bus 20273
330 s_j 20268
331 r_p 20206
332 eng 20174
333 i_v 20169
334 o_b 20158
335 gos 20137
336 ele 20087
337 ieš 20063
338 ą_p 20059
339 jau 20027
340 uti 20025
341 era 19968
342 lik 19958
343 tij 19908
344 ung 19882
345 vad 19855
346 eto 19718
347 ų_t 19633
348 liu 19624
349 i_d 19601
350 oki 19582
351 ngt 19529
352 mai 19489
353 neš 19415
354 yri 19405
355 rit 19243
356 yje 19202
357 kim 19120
358 osi 19078
359 nki 19069
360 pol 19036
361 rim 19028
362 alt 19022
363 als 19011
364 eno 18995
365 rma 18938
366 dal 18885
367 tad 18884
368 ndi 18860
369 ų_v 18848
370 bos 18781
371 o_l 18727
372 niu 18697
373 ači 18695
374 oto 18673
375 net 18634
376 jai 18624
377 ait 18614
378 sit 18587
379 kus 18537
380 vim 18536
381 ald 18495
382 rez 18473
383 a_s 18465
384 ata 18455
385 amo 18413
386 ava 18311
387 aly 18250
388 ana 18208
389 tre 18204
390 uli 18184
391 rov 18140
392 atv 18079
393 uoj 18032
394 ano 17974
395 nos 17947
396 imi 17884
397 ada 17864
398 tel 17860
399 o_g 17819
400 uro 17810
401 ard 17719
402 pal 17667
403 tan 17658
404 itų 17649
405 gyv 17619
406 ome 17569
407 pag 17541
408 dam 17540
409 vyr 17522
410 ask 17461
411 tro 17412
412 kst 17335
413 akė 17298
414 bei 17263
415 spa 17211
416 kra 17183
417 ros 17135
418 rod 17133
419 uto 17098
420 ias 17082
421 maž 17076
422 omo 17037
423 vas 17028
424 iči 17022
425 nal 17009
426 esi 16800
427 dos 16758
428 duo 16757
429 jas 16752
430 lei 16688
431 ate 16619
432 roc 16602
433 iem 16589
434 tam 16562
435 ybo 16532
436 u_p 16409
437 iją 16383
438 ers 16264
439 e_t 16234
440 ono 16134
441 ų_b 16110
442 lst 16086
443 ų_n 16065
444 kta 16061
445 jim 16039
446 tom 16004
447 mus 15977
448 kci 15963
449 ų_d 15960
450 sty 15882
451 ų_g 15857
452 pos 15821
453 rek 15812
454 aid 15739
455 bal 15684
456 dra 15594
457 i_į 15593
458 mie 15582
459 ary 15559
460 a_i 15555
461 rai 15550
462 rat 15545
463 ijų 15534
464 kie 15508
465 ian 15508
466 ioj 15482
467 dro 15466
468 bės 15465
469 iln 15425
470 nkt 15412
471 ral 15410
472 ert 15393
473 tyb 15390
474 o_į 15324
475 ėje 15293
476 žmo 15281
477 igi 15265
478 aba 15257
479 lni 15239
480 bai 15239
481 ą_s 15144
482 ė_p 15124
483 ų_r 15008
484 alė 14978
485 uom 14974
486 der 14955
487 ovė 14936
488 aka 14870
489 ugi 14862
490 dov 14824
491 ruo 14818
492 aro 14818
493 s_e 14673
494 ikt 14650
495 arp 14646
496 ain 14644
497 s_ž 14641
498 pen 14640
499 ški 14593
500 san 14569
501 cia 14546
502 sir 14505
503 lti 14465
504 dai 14456
505 a_n 14448
506 kit 14440
507 aži 14437
508 gra 14319
509 ime 14286
510 kam 14253
511 tri 14251
512 e_v 14250
513 eši 14202
514 imt 14193
515 dim 14179
516 r_k 14168
517 ą_i 14135
518 spr 14131
519 ito 14124
520 pan 14115
521 aul 14114
522 yve 14095
523 pav 14071
524 ner 14051
525 eur 13954
526 tau 13937
527 žio 13930
528 ų_l 13916
529 rto 13877
530 ust 13854
531 pak 13852
532 pad 13829
533 eki 13779
534 šia 13774
535 aig 13762
536 kau 13753
537 nep 13732
538 kri 13644
539 ski 13633
540 sių 13605
541 ost 13600
542 lim 13597
543 ėju 13587
544 aun 13583
545 ban 13544
546 i_g 13517
547 tov 13513
548 nko 13466
549 ira 13449
550 ėja 13447
551 len 13440
552 yta 13425
553 nar 13424
554 nie 13401
555 ėti 13316
556 dėj 13293
557 nor 13275
558 siu 13253
559 e_b 13248
560 a_a 13246
561 lic 13246
562 tim 13243
563 jav 13213
564 ale 13208
565 uja 13190
566 sin 13189
567 ram 13151
568 kre 13121
569 ezi 13114
570 uma 13099
571 eks 13075
572 tač 13002
573 a_t 12982
574 voj 12981
575 ial 12980
576 urė 12968
577 uol 12911
578 ieč 12891
579 ida 12884
580 kan 12866
581 i_l 12853
582 jog 12839
583 das 12838
584 bil 12835
585 į_s 12830
586 kto 12829
587 nei 12813
588 rik 12801
589 būt 12789
590 por 12770
591 i_m 12759
592 aim 12632
593 ang 12611
594 rga 12591
595 jis 12591
596 šio 12576
597 ngi 12558
598 imų 12541
599 ota 12524
600 eim 12513
601 zid 12504
602 s_u 12449
603 dir 12442
604 e_i 12415
605 šim 12412
606 iva 12405
607 ine 12393
608 ikė 12320
609 i_r 12308
610 r_s 12298
611 ort 12293
612 eko 12282
613 jun 12256
614 ovo 12171
615 onė 12170
616 bin 12164
617 lės 12157
618 jan 12151
619 cen 12147
620 epa 12142
621 gim 12127
622 u_s 12107
623 ind 12092
624 ite 12003
625 e_a 12002
626 a_k 11998
627 pla 11983
628 mer 11978
629 uta 11950
630 nga 11945
631 idž 11939
632 tru 11932
633 u_k 11907
634 pau 11865
635 e_n 11864
636 ngo 11838
637 i_j 11823
638 e_k 11812
639 eid 11810
640 for 11809
641 akc 11799
642 kir 11796
643 nat 11777
644 nda 11692
645 reč 11641
646 aps 11640
647 p_p 11639
648 gai 11634
649 paž 11629
650 sut 11625
651 emo 11619
652 ryt 11614
653 rna 11610
654 one 11601
655 opo 11589
656 tyn 11551
657 ešė 11544
658 ern 11540
659 ene 11540
660 nan 11536
661 akt 11533
662 kio 11529
663 mat 11527
664 mad 11519
665 gru 11477
666 isa 11403
667 kov 11353
668 gan 11309
669 dav 11291
670 udo 11290
671 tūr 11243
672 oji 11237
673 tst 11236
674 tok 11229
675 a_d 11220
676 adė 11220
677 nte 11201
678 ikr 11196
679 cin 11174
680 iri 11157
681 erg 11122
682 aga 11120
683 las 11109
684 etv 11089
685 šin 11066
686 ikl 11053
687 obi 11044
688 gti 11041
689 tal 11032
690 spe 11027
691 u_n 11020
692 ųjų 11019
693 rop 10969
694 vin 10960
695 kol 10877
696 r_t 10860
697 nom 10852
698 uda 10836
699 eną 10829
700 ans 10822
701 ger 10812
702 lta 10812
703 i_š 10779
704 nci 10778
705 mln 10764
706 olo 10754
707 lyg 10730
708 lij 10719
709 gen 10718
710 ūks 10695
711 jon 10669
712 vės 10657
713 r_v 10651
714 dyt 10638
715 ybi 10624
716 itu 10607
717 evi 10584
718 sek 10570
719 s_f 10547
720 idė 10534
721 orm 10517
722 e_d 10516
723 stu 10514
724 į_k 10512
725 įst 10484
726 tsi 10461
727 inę 10412
728 ako 10388
729 umo 10365
730 yva 10357
731 alb 10341
732 tūk 10323
733 ė_s 10298
734 tės 10268
735 nėj 10233
736 a_b 10218
737 o_š 10201
738 igo 10143
739 ška 10112
740 a_v 10108
741 mob 10106
742 sim 10086
743 rog 10077
744 rtu 10048
745 ndo 10024
746 r_n 10015
747 ūna 9995
748 ero 9990
749 uni 9954
750 tyt 9935
751 mpi 9904
752 ere 9871
753 gri 9864
754 edi 9816
755 ark 9766
756 ins 9754
757 kių 9737
758 eiš 9737
759 alo 9715
760 idi 9710
761 gas 9700
762 ten 9695
763 ėji 9687
764 vid 9668
765 sei 9648
766 o_j 9646
767 ą_k 9642
768 bar 9565
769 usk 9557
770 ber 9487
771 pin 9481
772 klu 9472
773 mėn 9467
774 gam 9454
775 dij 9439
776 inu 9430
777 įsi 9410
778 iej 9390
779 rem 9381
780 rmi 9348
781 dvi 9348
782 nku 9326
783 res 9307
784 igū 9273
785 sik 9270
786 esn 9248
787 ugo 9235
788 dan 9230
789 mar 9202
790 vak 9193
791 uga 9182
792 r_a 9181
793 gūn 9179
794 vus 9171
795 atl 9171
796 elb 9153
797 rda 9147
798 zij 9123
799 užs 9112
800 į_v 9094
801 rup 9082
802 roj 9043
803 ems 9041
804 kis 9016
805 šta 8987
806 muo 8960
807 udi 8915
808 lat 8903
809 lėj 8902
810 ute 8892
811 ote 8883
812 ą_a 8855
813 rak 8834
814 ą_v 8823
815 ėne 8784
816 iks 8762
817 nyb 8762
818 pil 8709
819 r_d 8697
820 ske 8691
821 arn 8676
822 ult 8649
823 tol 8648
824 ymo 8646
825 žai 8632
826 nkl 8601
827 iui 8557
828 fin 8546
829 u_t 8539
830 u_a 8508
831 ido 8505
832 kli 8473
833 sid 8470
834 ntu 8469
835 kyt 8465
836 sni 8462
837 kil 8451
838 ldy 8426
839 rės 8421
840 gar 8403
841 ė_v 8374
842 lam 8356
843 ėli 8344
844 gau 8327
845 rac 8301
846 rėj 8293
847 sme 8293
848 sen 8274
849 mui 8272
850 sas 8272
851 iza 8269
852 ojų 8250
853 les 8233
854 a_l 8227
855 aiš 8217
856 ryb 8199
857 esa 8186
858 ūro 8184
859 ega 8181
860 kat 8166
861 egi 8165
862 ė_a 8149
863 sla 8142
864 mot 8130
865 usy 8121
866 eit 8115
867 inį 8108
868 air 8102
869 tit 8093
870 ole 8084
871 lyv 8084
872 jie 8068
873 lek 8047
874 suo 8046
875 spo 8030
876 bas 8030
877 ukt 8023
878 ula 8016
879 ūti 8008
880 del 8003
881 enc 7995
882 kšt 7982
883 arė 7972
884 mės 7972
885 emp 7971
886 los 7970
887 dom 7933
888 bia 7927
889 van 7922
890 enė 7913
891 ško 7910
892 run 7906
893 iku 7895
894 u_b 7892
895 yni 7887
896 sku 7881
897 žsi 7880
898 šių 7877
899 iav 7872
900 mes 7847
901 lyj 7841
902 pus 7826
903 uva 7823
904 ton 7813
905 rny 7813
906 sve 7811
907 ašt 7803
908 siū 7797
909 ivi 7797
910 tek 7796
911 myb 7781
912 alų 7767
913 omp 7745
914 į_a 7742
915 emi 7739
916 med 7732
917 e_r 7726
918 ė_k 7715
919 škė 7710
920 ont 7709
921 sul 7683
922 lėt 7675
923 ą_d 7675
924 ėse 7655
925 suk 7651
926 tym 7643
927 eda 7620
928 ire 7614
929 įvy 7610
930 dėt 7605
931 tap 7602
932 vau 7601
933 sči 7598
934 oms 7597
935 iūl 7587
936 u_i 7575
937 yto 7565
938 žiu 7563
939 išs 7562
940 org 7543
941 šči 7533
942 mėj 7529
943 ars 7528
944 šiu 7516
945 pel 7513
946 gre 7512
947 aur 7494
948 ešt 7478
949 izi 7458
950 agr 7457
951 idu 7453
952 sva 7443
953 ą_b 7431
954 tyv 7431
955 a_g 7422
956 irb 7416
957 e_g 7411
958 sur 7410
959 rba 7401
960 gum 7396
961 ė_n 7383
962 sos 7381
963 ų_į 7378
964 e_l 7378
965 urn 7372
966 vič 7370
967 nkų 7367
968 ksl 7365
969 ler 7364
970 ėtų 7359
971 u_v 7355
972 syb 7347
973 aki 7340
974 kij 7339
975 kėj 7334
976 oks 7327
977 lab 7291
978 rys 7280
979 gij 7272
980 ašk 7259
981 pio 7258
982 ism 7250
983 oka 7236
984 ves 7209
985 o_ž 7205
986 igė 7199
987 bri 7191
988 iad 7190
989 o_e 7180
990 juo 7175
991 r_m 7171
992 ipa 7170
993 stų 7157
994 ons 7153
995 klo 7148
996 ovi 7126
997 nut 7117
998 puo 7111
999 šeš 7107
1000 o_u 7105
1001 etr 7100
1002 vok 7097
1003 oju 7077
1004 via 7064
1005 imė 7056
1006 e_m 7052
1007 tyr 7050
1008 ete 7022
1009 sum 7021
1010 ogi 7011
1011 log 6992
1012 sud 6968
1013 r_j 6961
1014 amu 6953
1015 bet 6952
1016 rbi 6951
1017 pta 6947
1018 urt 6937
1019 ure 6934
1020 žei 6930
1021 tys 6930
1022 gus 6903
1023 šve 6891
1024 ens 6889
1025 ąją 6882
1026 žem 6863
1027 įmo 6858
1028 sar 6852
1029 abi 6849
1030 gty 6849
1031 inf 6844
1032 eži 6830
1033 taš 6818
1034 šti 6811
1035 aru 6808
1036 s_o 6785
1037 rtą 6773
1038 ą_n 6766
1039 uvi 6754
1040 ė_i 6750
1041 į_i 6749
1042 mil 6747
1043 odė 6745
1044 rėt 6727
1045 yba 6726
1046 aėj 6725
1047 raė 6716
1048 reg 6715
1049 dym 6713
1050 eln 6712
1051 ujo 6681
1052 s_y 6680
1053 a_m 6679
1054 vių 6670
1055 age 6668
1056 rep 6661
1057 rdu 6660
1058 ilo 6634
1059 uod 6619
1060 viz 6616
1061 sko 6610
1062 eka 6603
1063 ėmi 6584
1064 koj 6572
1065 yks 6556
1066 neb 6554
1067 ake 6552
1068 į_l 6545
1069 oci 6520
1070 dyb 6500
1071 ų_š 6478
1072 nst 6474
1073 t_p 6463
1074 bol 6459
1075 bra 6457
1076 lub 6451
1077 oro 6450
1078 čem 6442
1079 aja 6440
1080 doj 6440
1081 lyd 6430
1082 rol 6428
1083 pab 6428
1084 lig 6426
1085 bėj 6382
1086 a_į 6372
1087 amų 6369
1088 amb 6363
1089 į_t 6362
1090 ykl 6362
1091 r_l 6362
1092 asm 6334
1093 yvi 6333
1094 į_n 6333
1095 ktu 6322
1096 iga 6321
1097 anc 6317
1098 niz 6308
1099 nks 6306
1100 s_c 6296
1101 fer 6289
1102 pti 6277
1103 yko 6276
1104 ą_t 6271
1105 ser 6268
1106 nde 6267
1107 ė_d 6263
1108 ola 6261
1109 kor 6250
1110 ė_t 6220
1111 sąj 6208
1112 ret 6200
1113 lbė 6194
1114 tli 6188
1115 oku 6185
1116 ąju 6179
1117 ntų 6173
1118 u_m 6128
1119 ema 6118
1120 sli 6111
1121 ipė 6106
1122 ugu 6100
1123 mac 6082
1124 sig 6070
1125 uki 6049
1126 nfo 6033
1127 atr 6028
1128 iso 6025
1129 pap 6023
1130 šau 6010
1131 u_d 5991
1132 r_b 5990
1133 iju 5987
1134 ogr 5984
1135 rob 5981
1136 num 5979
1137 paj 5978
1138 iru 5977
1139 raj 5965
1140 u_j 5961
1141 ajo 5956
1142 ora 5950
1143 tes 5948
1144 oda 5935
1145 išv 5935
1146 upė 5932
1147 vėl 5928
1148 soc 5926
1149 amą 5923
1150 lgi 5917
1151 o_f 5917
1152 a_r 5913
1153 uno 5913
1154 ael 5903
1155 rae 5898
1156 kiu 5895
1157 moj 5894
1158 jek 5873
1159 sil 5873
1160 ėsi 5863
1161 isu 5862
1162 ų_e 5862
1163 i_ž 5858
1164 rst 5853
1165 uje 5851
1166 įta 5848
1167 goj 5845
1168 ruk 5842
1169 mti 5839
1170 imy 5787
1171 ukš 5781
1172 sky 5777
1173 neg 5770
1174 noj 5746
1175 apt 5736
1176 dab 5732
1177 vij 5718
1178 enų 5714
1179 mir 5710
1180 iuj 5710
1181 uzi 5708
1182 nty 5706
1183 kyb 5693
1184 isk 5683
1185 gel 5680
1186 lon 5678
1187 mpa 5668
1188 e_į 5664
1189 ė_j 5659
1190 mia 5653
1191 l_k 5650
1192 apo 5643
1193 riv 5634
1194 ašy 5614
1195 rbu 5611
1196 kty 5602
1197 ond 5597
1198 enu 5595
1199 ope 5587
1200 ila 5585
1201 kei 5578
1202 tėj 5562
1203 lav 5552
1204 rbo 5547
1205 šie 5530
1206 nkė 5517
1207 iky 5504
1208 tet 5500
0 # Licensed to the Apache Software Foundation (ASF) under one or more
1 # contributor license agreements. See the NOTICE file distributed with
2 # this work for additional information regarding copyright ownership.
3 # The ASF licenses this file to You under the Apache License, Version 2.0
4 # (the "License"); you may not use this file except in compliance with
5 # the License. You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 en_ 224429
15 de_ 112710
16 _de 109806
17 an_ 70218
18 et_ 69985
19 _he 61754
20 _va 52542
21 van 52050
22 er_ 48056
23 ing 42549
24 het 41683
25 ver 40916
26 oor 40483
27 _in 37934
28 at_ 35969
29 _be 35875
30 een 35631
31 ie_ 35579
32 _ge 35577
33 _da 35422
34 _vo 33870
35 _en 33361
36 gen 33291
37 ten 31913
38 nde 30738
39 ng_ 30393
40 aar 29998
41 den 29952
42 voo 29229
43 _ve 28528
44 _ee 28490
45 men 28180
46 in_ 27376
47 te_ 27265
48 dat 26734
49 _te 26081
50 ste 25332
51 _di 24146
52 _me 23484
53 aan 22945
54 der 22495
55 ijk 22042
56 lij 21706
57 cht 20751
58 ter 20595
59 or_ 20528
60 is_ 20333
61 and 20223
62 ij_ 20086
63 _op 20049
64 ijn 19588
65 _we 19127
66 ere 18987
67 _on 18954
68 eli 18952
69 tie 18790
70 ord 18668
71 nie 18634
72 eer 18591
73 sch 18441
74 _zi 18090
75 rde 17912
76 _is 17281
77 _wi 17078
78 _co 16956
79 ers 16856
80 nge 16637
81 _aa 16324
82 die 16194
83 ren 16025
84 ar_ 15980
85 lle 15979
86 _al 15922
87 ent 15769
88 op_ 15741
89 it_ 15486
90 ken 15329
91 _mo 15263
92 ele 14932
93 el_ 14901
94 len 14892
95 _st 14746
96 jn_ 14701
97 ik_ 14152
98 zij 14141
99 ens 13788
100 _ni 13770
101 eid 13612
102 wij 13390
103 _ma 13380
104 _ik 13362
105 _wa 13306
106 uit 13297
107 _mi 13000
108 ove 12952
109 end 12873
110 eme 12865
111 ond 12826
112 eur 12710
113 hee 12652
114 _wo 12630
115 iet 12513
116 le_ 12397
117 ze_ 12218
118 al_ 12165
119 wor 12126
120 _zo 12007
121 lan 11995
122 ede 11994
123 _to 11904
124 sta 11799
125 gel 11682
126 _re 11597
127 ege 11582
128 erd 11384
129 nt_ 11380
130 nen 11376
131 tel 11375
132 met 11232
133 jk_ 10989
134 mis 10918
135 moe 10881
136 om_ 10615
137 iss 10582
138 ati 10561
139 _om 10511
140 _eu 10493
141 oet 10486
142 com 10371
143 est 10264
144 _ov 10196
145 erk 10156
146 ns_ 10114
147 rop 10039
148 maa 10038
149 _er 10035
150 eze 9879
151 mmi 9872
152 ete 9815
153 aat 9801
154 omm 9800
155 del 9754
156 ls_ 9724
157 id_ 9629
158 ke_ 9603
159 uro 9565
160 _oo 9464
161 eri 9420
162 _na 9390
163 ang 9264
164 nd_ 9250
165 _ui 9240
166 eve 9215
167 ssi 9180
168 ven 9150
169 ech 9130
170 nte 9041
171 rin 9016
172 ope 8992
173 es_ 8931
174 als 8917
175 rij 8862
176 dit 8793
177 _do 8752
178 sie 8741
179 se_ 8735
180 wer 8723
181 pro 8618
182 eel 8515
183 ich 8407
184 ige 8401
185 st_ 8331
186 _pr 8283
187 hte 8203
188 _bi 8192
189 lin 8186
190 ag_ 8042
191 gev 7973
192 eke 7934
193 nne 7920
194 daa 7904
195 waa 7870
196 ite 7722
197 raa 7712
198 ok_ 7691
199 ame 7668
200 ook 7661
201 ben 7614
202 hei 7604
203 ien 7595
204 mij 7578
205 ft_ 7548
206 bes 7521
207 bel 7502
208 _ze 7478
209 tre 7475
210 eld 7371
211 eft 7364
212 mee 7320
213 re_ 7316
214 _la 7300
215 tte 7285
216 rd_ 7266
217 dig 7262
218 we_ 7212
219 dez 7155
220 bij 7138
221 jke 7082
222 con 7071
223 heb 6911
224 ind 6852
225 _pa 6821
226 _no 6751
227 ate 6670
228 sen 6610
229 che 6605
230 kel 6566
231 rst 6528
232 ge_ 6513
233 ht_ 6338
234 _ho 6282
235 isc 6230
236 eef 6223
237 lem 6174
238 toe 6172
239 par 6167
240 all 6154
241 str 6128
242 ont 6123
243 gro 6114
244 min 6109
245 ant 6097
246 vol 6094
247 kom 6084
248 dt_ 6024
249 bet 5999
250 ger 5903
251 ese 5899
252 rec 5891
253 ier 5839
254 reg 5817
255 _gr 5785
256 _ka 5742
257 _li 5692
258 ot_ 5691
259 ale 5617
260 taa 5610
261 _ha 5604
262 geb 5599
263 kt_ 5598
264 rle 5585
265 rs_ 5580
266 _vr 5551
267 ome 5549
268 ijd 5548
269 ach 5535
270 rge 5523
271 ld_ 5517
272 rzi 5503
273 ell 5495
274 wil 5490
275 bbe 5457
276 kke 5296
277 ebb 5281
278 doo 5274
279 nst 5243
280 ig_ 5220
281 ela 5164
282 kin 5164
283 ges 5161
284 pes 5133
285 orz 5122
286 dan 5089
287 ard 5085
288 ch_ 5073
289 pen 5014
290 ake 5000
291 tot 4977
292 aal 4946
293 eni 4930
294 sti 4886
295 iti 4829
296 ons 4787
297 lei 4783
298 cha 4771
299 zit 4722
300 itt 4693
301 ene 4682
302 ouw 4679
303 ngs 4677
304 oed 4626
305 _ko 4619
306 arl 4600
307 _af 4594
308 sla 4592
309 nhe 4581
310 doe 4538
311 han 4523
312 kan 4523
313 age 4448
314 leg 4444
315 _za 4414
316 erv 4402
317 wel 4400
318 _ra 4387
319 kun 4387
320 voe 4381
321 _ku 4370
322 ids 4368
323 ari 4345
324 oud 4344
325 ort 4327
326 _hi 4304
327 nse 4262
328 naa 4248
329 tin 4231
330 erg 4227
331 rdt 4222
332 gin 4219
333 _ec 4205
334 eit 4204
335 uni 4192
336 ree 4190
337 ide 4180
338 teg 4144
339 _an 4134
340 ieu 4124
341 etr 4121
342 cti 4099
343 din 4098
344 ntw 4093
345 unn 4083
346 org 4067
347 ies 4060
348 uw_ 4057
349 og_ 4036
350 evo 4028
351 ern 4027
352 spr 4021
353 rek 4006
354 aag 4002
355 her 3997
356 he_ 3982
357 sse 3959
358 orm 3933
359 lee 3916
360 gem 3906
361 lag 3903
362 _un 3887
363 gaa 3851
364 laa 3850
365 hie 3844
366 tij 3832
367 _le 3819
368 _du 3812
369 rag 3812
370 nin 3806
371 enk 3799
372 ran 3799
373 of_ 3792
374 ert 3776
375 ied 3767
376 oge 3766
377 tat 3752
378 ins 3749
379 ad_ 3737
380 oen 3717
381 wee 3710
382 wat 3684
383 gee 3653
384 ill 3644
385 ili 3634
386 _hu 3597
387 nze 3578
388 art 3576
389 zic 3573
390 nti 3564
391 zou 3547
392 ist 3543
393 oer 3524
394 _ga 3511
395 zie 3509
396 uwe 3491
397 erh 3479
398 era 3463
399 egi 3457
400 _of 3454
401 _u_ 3452
402 ats 3446
403 tei 3444
404 il_ 3426
405 chi 3419
406 tro 3418
407 rt_ 3413
408 gd_ 3396
409 int 3394
410 vin 3393
411 tra 3390
412 ude 3384
413 mer 3370
414 hap 3353
415 nom 3351
416 aad 3348
417 ade 3345
418 _vi 3339
419 nat 3330
420 ion 3318
421 nog 3316
422 erl 3312
423 ndi 3292
424 per 3289
425 aak 3281
426 ect 3276
427 woo 3275
428 edi 3274
429 eno 3274
430 iek 3274
431 rei 3270
432 un_ 3266
433 _so 3261
434 hou 3260
435 gez 3243
436 ron 3241
437 erm 3230
438 ed_ 3226
439 ker 3223
440 jnh 3219
441 ali 3196
442 nem 3195
443 pla 3194
444 nis 3187
445 ees 3181
446 ts_ 3178
447 erw 3177
448 teu 3168
449 _pl 3165
450 lit 3150
451 vor 3150
452 rsc 3147
453 dra 3139
454 erb 3123
455 me_ 3123
456 gra 3116
457 _go 3114
458 lge 3112
459 app 3110
460 vra 3109
461 rou 3107
462 esc 3104
463 rke 3098
464 res 3097
465 aro 3095
466 _po 3093
467 lid 3080
468 ek_ 3074
469 act 3070
470 ast 3069
471 ris 3057
472 tee 3055
473 tan 3050
474 ona 3040
475 dst 3028
476 zal 3012
477 rac 3005
478 ber 3001
479 lli 2984
480 _sp 2974
481 eva 2968
482 tig 2960
483 goe 2958
484 _sc 2950
485 beg 2943
486 rdi 2942
487 rot 2935
488 uur 2935
489 ors 2921
490 onz 2920
491 oek 2902
492 els 2895
493 rsl 2890
494 erz 2872
495 _bu 2867
496 nke 2867
497 rte 2860
498 gri 2854
499 ric 2850
500 tge 2834
501 dee 2831
502 ou_ 2821
503 uid 2818
504 ee_ 2792
505 bie 2791
506 rme 2788
507 rat 2785
508 lie 2775
509 are 2770
510 pre 2762
511 dem 2761
512 ijz 2745
513 tem 2742
514 euw 2737
515 mst 2737
516 tio 2735
517 rui 2728
518 ure 2720
519 olg 2712
520 oel 2704
521 vaa 2700
522 unt 2698
523 bre 2697
524 oms 2697
525 ank 2693
526 ote 2693
527 rki 2681
528 _ja 2680
529 nda 2673
530 rmi 2653
531 twi 2653
532 zon 2645
533 ikk 2640
534 tal 2637
535 _nu 2636
536 nal 2635
537 _ne 2626
538 ak_ 2622
539 ema 2611
540 ein 2603
541 oeg 2589
542 val 2577
543 ne_ 2576
544 red 2575
545 rom 2573
546 rbe 2567
547 ero 2565
548 hoo 2559
549 twe 2556
550 _am 2553
551 mog 2550
552 vro 2549
553 zel 2548
554 lde 2541
555 dde 2527
556 bli 2525
557 lis 2524
558 _s_ 2522
559 _zu 2514
560 omi 2514
561 zen 2497
562 rvo 2491
563 _ac 2485
564 _ri 2484
565 vee 2476
566 ans 2468
567 gge 2457
568 ged 2451
569 eed 2448
570 elf 2435
571 bev 2427
572 nam 2424
573 ebr 2419
574 idi 2416
575 ur_ 2399
576 tst 2377
577 sel 2372
578 _ei 2368
579 nu_ 2364
580 wet 2360
581 ezi 2351
582 rli 2347
583 ini 2343
584 oli 2342
585 mil 2337
586 esl 2332
587 ina 2325
588 hoe 2319
589 lat 2314
590 gew 2312
591 ds_ 2306
592 zeg 2302
593 inn 2300
594 igd 2291
595 eco 2283
596 lig 2278
597 oe_ 2278
598 beh 2274
599 ffe 2274
600 pun 2265
601 haa 2256
602 roe 2255
603 ega 2250
604 sin 2246
605 egr 2244
606 oll 2238
607 rkt 2231
608 spe 2220
609 ngr 2218
610 ner 2215
611 zak 2214
612 eng 2207
613 epa 2205
614 nds 2204
615 _sa 2188
616 mid 2183
617 nta 2176
618 idd 2175
619 enw 2169
620 eun 2168
621 evr 2168
622 rne 2168
623 bed 2166
624 elk 2164
625 wen 2163
626 enl 2157
627 eem 2156
628 tis 2154
629 lic 2151
630 ull 2144
631 wik 2140
632 ssa 2138
633 egg 2136
634 zo_ 2128
635 erp 2119
636 ref 2119
637 ppe 2115
638 uss 2104
639 em_ 2102
640 nsc 2102
641 one 2099
642 bur 2097
643 ani 2095
644 fin 2095
645 ntr 2094
646 rti 2094
647 us_ 2092
648 _tw 2084
649 dui 2081
650 hun 2080
651 _pe 2075
652 rie 2066
653 nee 2061
654 pel 2052
655 _bl 2048
656 gan 2048
657 tuu 2046
658 egd 2045
659 sam 2043
660 _br 2041
661 _tu 2035
662 ser 2035
663 igh 2027
664 lev 2027
665 gt_ 2024
666 its 2024
667 ono 2022
668 ijv 2017
669 akt 2016
670 rob 2013
671 pee 2010
672 ghe 2004
673 _ba 1999
674 nci 1996
675 opa 1993
676 _ti 1990
677 zul 1988
678 ewe 1980
679 _bo 1979
680 cia 1972
681 iev 1972
682 rga 1970
683 tor 1966
684 nig 1957
685 rva 1956
686 rhe 1950
687 tri 1945
688 jd_ 1942
689 uct 1939
690 bep 1938
691 hed 1938
692 man 1938
693 pa_ 1928
694 ral 1923
695 fen 1921
696 ble 1917
697 mat 1910
698 ial 1908
699 fra 1906
700 lui 1904
701 enh 1902
702 as_ 1898
703 oal 1896
704 mak 1887
705 ebi 1879
706 por 1877
707 _kr 1876
708 ost 1875
709 zoa 1871
710 led 1866
711 rit 1864
712 jaa 1861
713 air 1858
714 _fr 1857
715 ett 1847
716 mme 1842
717 ise 1838
718 dus 1830
719 pol 1823
720 tek 1819
721 roo 1818
722 eff 1813
723 _ar 1808
724 cte 1808
725 ijf 1808
726 _tr 1801
727 hel 1797
728 khe 1789
729 _ie 1778
730 ekk 1778
731 geh 1778
732 ire 1778
733 ram 1773
734 itg 1772
735 urg 1770
736 ann 1763
737 _kw 1761
738 zet 1757
739 jkh 1756
740 eko 1755
741 _sl 1749
742 anc 1749
743 dri 1749
744 _dr 1737
745 epe 1737
746 hti 1737
747 mar 1735
748 nan 1719
749 ori 1719
750 nwe 1712
751 ap_ 1710
752 ma_ 1697
753 obl 1696
754 edr 1692
755 sit 1692
756 mev 1685
757 ong 1685
758 _el 1681
759 oep 1678
760 evi 1669
761 oci 1664
762 soc 1662
763 pri 1661
764 baa 1654
765 esp 1653
766 rak 1645
767 _fi 1643
768 tai 1643
769 emm 1640
770 ef_ 1632
771 col 1629
772 eil 1627
773 ief 1625
774 rod 1625
775 eig 1621
776 afg 1620
777 zor 1619
778 eho 1618
779 hts 1614
780 eg_ 1613
781 bru 1612
782 tus 1602
783 ike 1599
784 eds 1596
785 on_ 1592
786 oog 1591
787 zoe 1589
788 gde 1587
789 gek 1584
790 nli 1583
791 two 1583
792 roc 1579
793 onc 1574
794 zee 1574
795 oce 1567
796 loo 1566
797 jde 1564
798 ust 1560
799 bin 1559
800 tru 1559
801 ve_ 1559
802 _ke 1552
803 ume 1552
804 _ev 1546
805 fge 1537
806 ena 1536
807 slu 1535
808 ler 1534
809 mt_ 1532
810 zaa 1532
811 rre 1529
812 kri 1522
813 weg 1520
814 vri 1519
815 pra 1518
816 odi 1517
817 ara 1514
818 rzo 1510
819 zin 1506
820 ban 1505
821 rwe 1505
822 noo 1503
823 pas 1502
824 rol 1491
825 sol 1487
826 nk_ 1485
827 eb_ 1484
828 kki 1479
829 ore 1479
830 lt_ 1475
831 noe 1473
832 amm 1466
833 ogr 1466
834 _ju 1456
835 arm 1456
836 dse 1456
837 rel 1454
838 stu 1452
839 wan 1452
840 eu_ 1450
841 cho 1445
842 ark 1444
843 _pu 1441
844 nko 1438
845 na_ 1433
846 arb 1430
847 eba 1428
848 rdr 1426
849 omd 1425
850 mel 1424
851 uis 1421
852 pec 1419
853 ode 1409
854 bee 1407
855 _ro 1406
856 ass 1405
857 bou 1404
858 uik 1403
859 duc 1400
860 for 1397
861 sar 1397
862 och 1396
863 _se 1394
864 mda 1394
865 ets 1391
866 rbi 1391
867 olk 1390
868 _si 1385
869 emo 1379
870 ult 1376
871 aut 1375
872 deb 1375
873 hij 1375
874 orb 1374
875 rog 1372
876 rok 1372
877 had 1371
878 lot 1365
879 atr 1358
880 lke 1355
881 opg 1355
882 odu 1354
883 elo 1352
884 emi 1345
885 enr 1345
886 rig 1342
887 rna 1342
888 tli 1340
889 itu 1334
890 ijs 1332
891 jze 1330
892 jve 1328
893 htl 1327
894 ood 1326
895 nel 1321
896 pge 1317
897 je_ 1313
898 med 1312
899 slo 1312
900 uti 1310
901 _ta 1309
902 _fe 1308
903 rha 1307
904 ine 1306
905 ijg 1302
906 ële 1301
907 cen 1291
908 ije 1290
909 eda 1281
910 geg 1281
911 err 1274
912 ald 1271
913 oei 1271
914 mma 1267
915 paa 1267
916 _lo 1264
917 gst 1264
918 kte 1258
919 gio 1255
920 oti 1255
921 iël 1252
922 gie 1251
923 rma 1251
924 was 1251
925 inz 1243
926 _or 1240
927 ehe 1236
928 spa 1235
929 get 1230
930 cië 1229
931 ves 1228
932 chr 1225
933 rap 1223
934 igi 1222
935 nit 1220
936 fer 1214
937 vas 1214
938 elt 1213
939 ga_ 1208
940 alt 1205
941 dhe 1205
942 bat 1197
943 ct_ 1197
944 dur 1195
945 zig 1191
946 olu 1187
947 zek 1186
948 hil 1180
949 sbe 1179
950 _ki 1178
951 atu 1173
952 oop 1172
953 sto 1172
954 tur 1170
955 eha 1164
956 nod 1160
957 bar 1159
958 dru 1159
959 ita 1159
960 arv 1157
961 dis 1156
962 log 1156
963 uat 1156
964 eta 1155
965 net 1155
966 rik 1154
967 ebe 1153
968 oon 1153
969 af_ 1149
970 nni 1147
971 _kl 1145
972 hul 1144
973 ole 1144
974 ora 1144
975 akk 1143
976 oot 1141
977 ple 1132
978 ruc 1132
979 sme 1128
980 ekt 1122
981 _im 1118
982 ive 1113
983 kwe 1111
984 rk_ 1111
985 nre 1105
986 lst 1097
987 har 1084
988 sle 1083
989 dag 1081
990 uch 1075
991 wes 1065
992 ruk 1062
993 los 1059
994 lf_ 1055
995 nad 1055
996 rea 1054
997 uk_ 1054
998 oos 1052
999 etg 1051
1000 emd 1048
1001 cra 1046
1002 tse 1044
1003 nsp 1043
1004 ppo 1043
1005 hui 1041
1006 ane 1040
1007 att 1037
1008 keu 1024
1009 ezo 1023
1010 bui 1017
1011 nau 1017
1012 eru 1016
1013 edu 1012
0 # Licensed to the Apache Software Foundation (ASF) under one or more
1 # contributor license agreements. See the NOTICE file distributed with
2 # this work for additional information regarding copyright ownership.
3 # The ASF licenses this file to You under the Apache License, Version 2.0
4 # (the "License"); you may not use this file except in compliance with
5 # the License. You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 er_ 43153
15 en_ 26541
16 et_ 22335
17 for 18182
18 _de 18107
19 ing 16986
20 _fo 15665
21 _og 15184
22 og_ 14311
23 or_ 13145
24 _i_ 12591
25 re_ 11699
26 _av 10970
27 om_ 10970
28 til 10716
29 _ti 10674
30 ter 10662
31 ne_ 10351
32 det 9655
33 av_ 9596
34 _me 9565
35 ng_ 9493
36 ene 8925
37 de_ 8847
38 il_ 8724
39 te_ 8327
40 ere 8267
41 _en 8175
42 _er 7956
43 som 7882
44 ed_ 7804
45 _so 7659
46 tte 7622
47 lle 7592
48 nge 7533
49 der 7421
50 ver 7272
51 ler 6963
52 _in 6894
53 ke_ 6881
54 ett 6857
55 _st 6628
56 es_ 6510
57 ste 6404
58 and 6285
59 ell 6190
60 _vi 6176
61 _ve 6113
62 lig 6085
63 nde 6053
64 _ha 6049
65 nin 6036
66 med 5943
67 _be 5895
68 ten 5874
69 sjo 5852
70 _på 5831
71 jon 5831
72 _re 5779
73 ser 5764
74 ar_ 5720
75 den 5611
76 ent 5599
77 rin 5552
78 _ut 5489
79 gen 5480
80 ens 5436
81 _ko 5418
82 _tr 5412
83 på_ 5401
84 le_ 5344
85 kon 5309
86 tor 5211
87 ner 5099
88 sen 5019
89 _å_ 4951
90 ge_ 4931
91 _an 4919
92 kke 4899
93 _no 4850
94 rt_ 4814
95 ig_ 4722
96 at_ 4631
97 ikk 4619
98 _sk 4597
99 ger 4583
100 nne 4581
101 men 4497
102 ren 4476
103 _fr 4454
104 _pr 4445
105 _se 4403
106 ker 4361
107 ers 4338
108 an_ 4325
109 _at 4202
110 inn 4129
111 del 4113
112 eri 4102
113 _om 4077
114 tt_ 4043
115 end 4009
116 res 3920
117 ekt 3912
118 tet 3821
119 lin 3813
120 _et 3808
121 ngs 3755
122 els 3750
123 ont 3750
124 _ka 3746
125 nte 3718
126 se_ 3696
127 per 3670
128 har 3662
129 isk 3647
130 opp 3632
131 one 3612
132 _sa 3572
133 est 3571
134 ans 3533
135 dre 3517
136 enn 3503
137 ert 3467
138 pro 3462
139 dek 3353
140 tra 3330
141 ove 3286
142 het 3263
143 ang 3256
144 _op 3255
145 mme 3233
146 tal 3214
147 nen 3209
148 _ma 3204
149 try 3197
150 gde 3175
151 ret 3165
152 ska 3161
153 ord 3142
154 ske 3131
155 lse 3129
156 jen 3126
157 str 3104
158 nor 3085
159 all 3077
160 lan 3029
161 asj 3017
162 on_ 3014
163 eko 2981
164 al_ 2969
165 ryg 2961
166 nto 2940
167 rer 2938
168 _el 2930
169 ygd 2925
170 ort 2880
171 _pe 2879
172 kan 2879
173 ige 2858
174 fra 2842
175 var 2842
176 _gj 2808
177 ate 2797
178 ran 2794
179 sam 2765
180 mer 2763
181 år_ 2762
182 ide 2760
183 len 2753
184 _si 2749
185 ern 2744
186 ll_ 2725
187 sse 2718
188 st_ 2712
189 ors 2710
190 sta 2696
191 gje 2663
192 kom 2656
193 ra_ 2627
194 ise 2618
195 _bl 2595
196 net 2593
197 ved 2568
198 _la 2564
199 att 2543
200 kti 2531
201 _al 2529
202 tiv 2528
203 ale 2521
204 _ba 2520
205 eli 2514
206 vil 2498
207 _he 2482
208 _mi 2479
209 _ik 2475
210 are 2466
211 kt_ 2466
212 sk_ 2455
213 kal 2448
214 lde 2442
215 el_ 2429
216 nse 2420
217 bru 2415
218 nd_ 2410
219 old 2386
220 ruk 2384
221 ist 2375
222 lt_ 2353
223 ore 2345
224 _gr 2343
225 lik 2336
226 _br 2335
227 ete 2327
228 ndr 2321
229 _ar 2313
230 _va 2307
231 ns_ 2303
232 ite 2297
233 tre 2289
234 _hv 2281
235 øre 2236
236 _ov 2226
237 ien 2224
238 eve 2216
239 sti 2214
240 sto 2214
241 omm 2212
242 hol 2210
243 rte 2209
244 nes 2202
245 ons 2186
246 vei 2184
247 sli 2172
248 kte 2160
249 _li 2155
250 unn 2155
251 _fi 2136
252 por 2132
253 nn_ 2116
254 elt 2092
255 ive 2087
256 rbe 2087
257 ill 2083
258 eng 2072
259 pen 2070
260 før 2068
261 org 2055
262 eks 2051
263 ele 2050
264 man 2044
265 rke 2042
266 _di 2021
267 ven 2015
268 rsk 2009
269 bar 2005
270 sik 1996
271 eid 1994
272 arb 1983
273 tat 1982
274 und 1982
275 _le 1973
276 ann 1955
277 reg 1954
278 ede 1946
279 tid 1945
280 nt_ 1943
281 _ta 1937
282 _to 1936
283 ier 1929
284 bei 1921
285 vær 1921
286 _sp 1918
287 erk 1905
288 sel 1902
289 art 1901
290 tur 1886
291 sni 1884
292 vis 1871
293 sko 1860
294 red 1858
295 _da 1856
296 set 1838
297 ære 1837
298 kje 1836
299 rne 1827
300 _fa 1818
301 rso 1807
302 så_ 1796
303 ant 1793
304 gra 1790
305 _un 1786
306 _må 1782
307 itt 1779
308 ess 1776
309 leg 1776
310 son 1768
311 akt 1766
312 nom 1762
313 _ak 1761
314 _ku 1757
315 uts 1755
316 dri 1737
317 kel 1737
318 ikt 1728
319 lag 1727
320 ati 1723
321 _bi 1722
322 gru 1708
323 orm 1707
324 ram 1686
325 kni 1685
326 eld 1681
327 ass 1669
328 nta 1669
329 eie 1668
330 gan 1661
331 _fø 1659
332 rel 1655
333 _mo 1647
334 jør 1640
335 _kr 1639
336 lit 1637
337 ake 1634
338 amm 1621
339 kol 1610
340 _år 1606
341 _te 1605
342 tak 1590
343 vik 1581
344 tem 1580
345 dig 1577
346 _os 1574
347 bli 1574
348 han 1573
349 ind 1568
350 _væ 1566
351 alt 1552
352 tig 1552
353 bil 1551
354 ken 1551
355 rek 1551
356 ble 1542
357 ift 1540
358 ffe 1537
359 pla 1537
360 igh 1530
361 spo 1523
362 egg 1519
363 mil 1506
364 nsk 1505
365 _ne 1504
366 slo 1495
367 _sy 1493
368 ør_ 1491
369 _ga 1477
370 let 1476
371 rde 1469
372 rog 1464
373 ghe 1459
374 ppe 1453
375 rge 1446
376 gge 1445
377 nst 1433
378 min 1419
379 ld_ 1413
380 eme 1403
381 eta 1400
382 osl 1396
383 is_ 1391
384 ole 1391
385 åde 1391
386 kap 1390
387 ag_ 1380
388 dis 1376
389 in_ 1375
390 lo_ 1374
391 ike 1371
392 die 1369
393 ntr 1366
394 ali 1360
395 ros 1356
396 dle 1347
397 jer 1336
398 sin 1329
399 nno 1326
400 tan 1326
401 las 1314
402 age 1310
403 eg_ 1310
404 irk 1306
405 lta 1292
406 ogs 1282
407 id_ 1281
408 _sl 1279
409 ir_ 1276
410 uli 1274
411 nal 1267
412 met 1264
413 erd 1263
414 ids 1260
415 vir 1258
416 gså 1257
417 jel 1256
418 _po 1255
419 _ek 1253
420 ine 1248
421 ets 1246
422 _ho 1242
423 oms 1240
424 uke 1236
425 _gi 1235
426 spe 1235
427 mel 1234
428 bes 1227
429 tel 1222
430 mun 1221
431 tes 1218
432 vek 1216
433 ilj 1214
434 ytt 1210
435 sat 1208
436 ode 1203
437 råd 1203
438 rve 1201
439 _fe 1200
440 rst 1199
441 bet 1197
442 van 1186
443 _of 1182
444 par 1182
445 tek 1182
446 hel 1181
447 ve_ 1180
448 kk_ 1177
449 les 1177
450 _fl 1173
451 kun 1172
452 ljø 1172
453 ark 1170
454 hen 1169
455 _kj 1165
456 nsp 1165
457 int 1161
458 _na 1152
459 mar 1152
460 nis 1149
461 mmu 1143
462 arn 1142
463 kri 1141
464 run 1139
465 ipp 1136
466 vi_ 1134
467 ket 1129
468 ese 1128
469 ost 1128
470 era 1127
471 ins 1127
472 orb 1126
473 ion 1125
474 ta_ 1123
475 pri 1121
476 ris 1116
477 _by 1113
478 mid 1113
479 _th 1107
480 _lo 1106
481 nfo 1104
482 use 1104
483 _ny 1103
484 ves 1099
485 dli 1096
486 fer 1095
487 ndi 1095
488 tin 1088
489 fte 1087
490 me_ 1085
491 _pa 1081
492 erg 1079
493 gre 1077
494 ure 1076
495 sva 1075
496 the 1074
497 lev 1070
498 mis 1069
499 rli 1067
500 rat 1060
501 tsl 1059
502 ast 1056
503 nsj 1051
504 ege 1050
505 ber 1048
506 pp_ 1048
507 _øk 1046
508 gis 1045
509 tvi 1043
510 idl 1038
511 ss_ 1035
512 lke 1033
513 utv 1033
514 mas 1032
515 val 1030
516 kre 1028
517 rdi 1028
518 lis 1026
519 rif 1021
520 _mu 1017
521 bel 1016
522 gne 1011
523 egi 1008
524 lip 1007
525 tis 1006
526 nke 1004
527 nns 1003
528 vor 1001
529 seg 999
530 kli 998
531 ilt 993
532 rma 992
533 _hø 990
534 dus 990
535 ksj 985
536 riv 985
537 ult 982
538 nat 981
539 ski 981
540 pre 979
541 off 978
542 rie 978
543 nær 976
544 oll 974
545 dag 972
546 erv 967
547 skr 967
548 tro 965
549 _dr 964
550 skj 963
551 ata 961
552 ute 960
553 atu 959
554 stu 959
555 tni 956
556 rem 955
557 sit 955
558 rn_ 953
559 _pl 949
560 ogr 949
561 sys 947
562 vel 946
563 kes 944
564 fre 943
565 fis 941
566 mot 941
567 tab 941
568 øke 940
569 _co 938
570 gjø 938
571 yst 935
572 ikl 934
573 jem 934
574 mål 932
575 _ra 931
576 raf 931
577 vet 930
578 må_ 929
579 ndl 929
580 ak_ 922
581 orh 922
582 ivi 921
583 iti 920
584 rod 919
585 _ul 916
586 lte 915
587 nsi 913
588 get 912
589 lys 912
590 odu 912
591 sis 912
592 kse 911
593 lek 911
594 eis 910
595 lov 910
596 _eu 908
597 erf 908
598 ekn 906
599 beh 904
600 ted 904
601 sve 903
602 hvo 901
603 une 898
604 lom 896
605 sje 896
606 _få 895
607 ekk 895
608 dan 892
609 ire 887
610 mpe 886
611 rei 886
612 ur_ 885
613 duk 883
614 ika 882
615 ts_ 878
616 _nå 876
617 rs_ 875
618 lge 874
619 lli 874
620 dni 870
621 _kl 869
622 us_ 868
623 tri 867
624 gat 866
625 nyt 865
626 rme 865
627 far 864
628 ile 862
629 stø 860
630 omr 858
631 elv 857
632 _or 856
633 rk_ 856
634 enk 853
635 ft_ 853
636 llo 852
637 ial 849
638 iss 849
639 mat 848
640 he_ 843
641 rre 842
642 uk_ 842
643 _så 839
644 led 839
645 upp 838
646 mul 837
647 fin 836
648 gel 834
649 lem 834
650 ane 831
651 ldr 831
652 kra 828
653 ori 828
654 gi_ 827
655 ils 827
656 ave 826
657 tje 826
658 edu 824
659 dat 822
660 ull 821
661 esi 820
662 ad_ 818
663 des 818
664 ses 817
665 app 815
666 _fy 813
667 ona 813
668 rho 812
669 tør 811
670 log 809
671 ørs 809
672 ik_ 807
673 mrå 807
674 oli 805
675 dde 804
676 em_ 803
677 _a_ 800
678 fle 800
679 emp 799
680 ade 796
681 rup 796
682 byg 794
683 edr 794
684 ose 794
685 fik 793
686 ut_ 793
687 rti 790
688 syn 789
689 noe 787
690 rks 787
691 fol 786
692 ye_ 784
693 emi 781
694 sie 778
695 tli 778
696 esk 776
697 vid 775
698 _bo 773
699 inf 773
700 kst 773
701 rit 773
702 ned 772
703 sid 772
704 oru 770
705 omf 768
706 rhe 768
707 syk 766
708 tas 766
709 rap 765
710 lat 764
711 søk 764
712 to_ 752
713 ari 751
714 æri 750
715 avg 749
716 rad 747
717 olo 745
718 tud 745
719 utt 745
720 hje 741
721 tik 741
722 um_ 740
723 kjø 736
724 orv 736
725 uks 736
726 høy 735
727 _sv 732
728 egn 732
729 ukt 732
730 tar 727
731 aks 725
732 sem 718
733 _go 715
734 _lø 715
735 kul 714
736 eff 713
737 gin 713
738 hov 713
739 ks_ 711
740 _bu 710
741 as_ 710
742 dst 710
743 føl 710
744 pet 709
745 ygg 709
746 ei_ 706
747 ppl 706
748 unk 706
749 iv_ 704
750 _bø 702
751 vin 698
752 urs 697
753 rse 696
754 tyr 695
755 ogi 692
756 ølg 691
757 kin 687
758 god 685
759 ota 683
760 fel 681
761 rea 681
762 tti 681
763 kla 680
764 rfo 680
765 rds 678
766 nli 677
767 nit 674
768 erh 673
769 tse 673
770 _næ 672
771 pas 672
772 når 670
773 ppo 668
774 ras 667
775 dir 663
776 dra 663
777 ela 661
778 _fu 660
779 inu 659
780 rav 658
781 _ef 657
782 udi 656
783 øye 655
784 sor 653
785 sre 653
786 kso 651
787 eha 650
788 tof 650
789 of_ 649
790 vit 649
791 dt_ 646
792 pos 646
793 fek 645
794 fyl 643
795 nti 642
796 tio 642
797 gst 641
798 gg_ 639
799 spr 638
800 _sø 637
801 bed 637
802 bus 637
803 ot_ 635
804 afi 633
805 isj 631
806 ssi 631
807 tru 631
808 rev 629
809 aml 627
810 nel 626
811 fri 625
812 ono 625
813 gsp 624
814 rna 624
815 ges 623
816 omi 623
817 rsi 623
818 eho 622
819 gt_ 622
820 nds 622
821 mes 621
822 oen 621
823 lar 620
824 rga 620
825 nas 619
826 sty 619
827 fat 616
828 kar 616
829 rio 616
830 ært 616
831 ald 614
832 ilb 614
833 oks 613
834 tøy 612
835 pes 611
836 ase 610
837 nsa 609
838 øy_ 608
839 olk 607
840 løs 606
841 stå 606
842 sla 605
843 fun 603
844 tst 599
845 ini 598
846 rut 598
847 _eg 597
848 _kn 597
849 iel 597
850 kto 597
851 ben 596
852 ssu 595
853 egr 592
854 hvi 590
855 kle 588
856 rik 588
857 uss 588
858 no_ 587
859 emm 586
860 lir 586
861 lie 585
862 kil 583
863 enh 581
864 erl 581
865 bas 580
866 _ri 579
867 _bå 578
868 kos 577
869 abe 576
870 edi 576
871 omh 575
872 rd_ 575
873 tot 575
874 mhe 574
875 ple 574
876 _im 573
877 beg 573
878 nnt 573
879 sur 573
880 pol 572
881 nsv 571
882 nux 571
883 rom 569
884 ha_ 567
885 mst 567
886 ild 566
887 erm 565
888 ral 565
889 _kv 564
890 hve 564
891 _do 563
892 _my 562
893 ape 561
894 rol 561
895 mle 559
896 ety 558
897 itu 555
898 ust 554
899 rgi 553
900 pe_ 550
901 rda 550
902 her 549
903 ykk 549
904 vgi 548
905 ems 547
906 avt 546
907 omp 546
908 efo 545
909 _as 544
910 ani 544
911 ux_ 543
912 utg 541
913 gif 537
914 gir 537
915 je_ 536
916 bør 534
917 nkt 534
918 urd 534
919 rep 531
920 nni 530
921 ara 527
922 bok 527
923 pel 527
924 ksi 523
925 spi 522
926 vta 522
927 nye 521
928 bak 520
929 rdr 520
930 sgr 519
931 tyd 519
932 am_ 518
933 stn 516
934 sek 515
935 na_ 514
936 _ki 513
937 få_ 512
938 sak 511
939 dom 510
940 ema 510
941 hus 510
942 _ge 509
943 nnl 509
944 _s_ 508
945 lel 507
946 lær 507
947 ltu 506
948 yde 506
949 _ro 505
950 osi 503
951 sul 503
952 pps 502
953 pun 502
954 teg 497
955 ita 496
956 anl 495
957 gur 495
958 igu 492
959 ud_ 492
960 yr_ 492
961 ifi 491
962 rak 491
963 rig 491
964 _ad 489
965 jor 489
966 øko 489
967 fig 488
968 sig 487
969 sst 487
970 _ww 485
971 _x_ 485
972 kas 485
973 ørt 485
974 api 484
975 jek 484
976 edl 483
977 ude 483
978 ilk 482
979 ål_ 482
980 _ty 481
981 ikr 478
982 bre 477
983 nve 475
984 kva 474
985 ies 473
986 lav 473
987 ogn 473
988 osj 473
989 ref 473
990 _ca 472
991 dep 472
992 ein 472
993 bud 471
994 kor 471
995 nkl 471
996 vur 469
997 bef 468
998 fal 468
999 sia 468
1000 mod 467
1001 nad 467
1002 tit 467
1003 _du 466
1004 igg 466
1005 asi 463
1006 sku 463
1007 _su 462
1008 amv 462
1009 nda 462
1010 _gå 461
1011 ask 461
1012 ria 459
1013 små 459
0 # Licensed to the Apache Software Foundation (ASF) under one or more
1 # contributor license agreements. See the NOTICE file distributed with
2 # this work for additional information regarding copyright ownership.
3 # The ASF licenses this file to You under the Apache License, Version 2.0
4 # (the "License"); you may not use this file except in compliance with
5 # the License. You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 ie_ 11047
15 nie 10353
16 _pr 9159
17 _po 8142
18 ch_ 7969
19 _na 6678
20 ani 6213
21 _w_ 5759
22 ia_ 5718
23 ych 5534
24 prz 5323
25 rze 5307
26 owa 5228
27 nia 5174
28 _za 5071
29 na_ 5016
30 _do 4773
31 _i_ 4470
32 wan 4352
33 _je 4264
34 ej_ 4177
35 dzi 4116
36 _wy 4045
37 eni 4018
38 ów_ 3946
39 go_ 3882
40 sta 3785
41 ego 3747
42 ne_ 3539
43 rzy 3417
44 pro 3256
45 _ni 3219
46 est 3193
47 wie 3160
48 acj 3088
49 _ko 3060
50 _z_ 2938
51 ści 2922
52 nyc 2846
53 ji_ 2811
54 em_ 2639
55 cze 2614
56 czn 2608
57 _in 2538
58 _ro 2529
59 _st 2504
60 cji 2486
61 owe 2455
62 st_ 2447
63 yst 2416
64 ym_ 2412
65 ny_ 2389
66 do_ 2371
67 _si 2319
68 _mo 2304
69 jes 2270
70 ci_ 2190
71 owi 2186
72 row 2146
73 owy 2109
74 _te 2077
75 kie 2046
76 ost 2030
77 wa_ 1992
78 się 1990
79 zie 1988
80 ki_ 1968
81 ośc 1966
82 czy 1938
83 zy_ 1938
84 _wi 1924
85 zen 1921
86 _sp 1886
87 ien 1866
88 ię_ 1859
89 kon 1857
90 _cz 1850
91 ane 1840
92 cie 1835
93 pra 1834
94 any 1832
95 je_ 1828
96 _od 1803
97 jąc 1801
98 str 1790
99 ier 1787
100 _mi 1786
101 nik 1786
102 pow 1759
103 mie 1741
104 _ma 1711
105 ku_ 1699
106 ach 1667
107 ać_ 1666
108 nic 1649
109 ka_ 1639
110 pod 1626
111 mi_ 1617
112 _pa 1614
113 dni 1614
114 war 1600
115 rac 1582
116 ste 1545
117 tow 1535
118 kow 1533
119 icz 1523
120 ywa 1516
121 zys 1509
122 _ja 1507
123 iej 1486
124 ter 1483
125 ent 1478
126 _to 1472
127 ale 1455
128 tyc 1431
129 gra 1424
130 trz 1408
131 ami 1401
132 zna 1393
133 _o_ 1389
134 to_ 1388
135 _a_ 1384
136 _ob 1382
137 roz 1366
138 tem 1362
139 now 1344
140 szy 1336
141 edn 1332
142 moż 1322
143 ist 1319
144 nej 1315
145 cza 1314
146 ycz 1297
147 _ty 1296
148 za_ 1291
149 ram 1276
150 sze 1265
151 ski 1262
152 ska 1259
153 arz 1256
154 _op 1251
155 spo 1250
156 ora 1248
157 iał 1237
158 ra_ 1237
159 la_ 1231
160 neg 1230
161 _ws 1229
162 wyc 1228
163 ze_ 1228
164 _re 1223
165 noś 1222
166 cja 1218
167 zan 1218
168 wni 1216
169 któ 1214
170 _ta 1209
171 ali 1209
172 dow 1200
173 rog 1198
174 zac 1194
175 że_ 1194
176 dan 1193
177 aln 1189
178 orz 1183
179 era 1182
180 for 1175
181 ogr 1166
182 _wa 1165
183 nym 1163
184 jak 1159
185 ają 1156
186 nal 1154
187 awi 1147
188 ika 1146
189 zyc 1140
190 wer 1128
191 orm 1126
192 _sk 1125
193 ja_ 1123
194 _sy 1119
195 tan 1116
196 _kt 1114
197 _us 1114
198 ść_ 1114
199 sto 1112
200 stw 1107
201 zia 1106
202 pie 1105
203 ich 1103
204 tór 1096
205 raz 1094
206 iem 1092
207 ony 1091
208 ez_ 1087
209 ce_ 1085
210 zas 1073
211 wia 1071
212 iu_ 1057
213 tra 1056
214 _dz 1053
215 ied 1050
216 lik 1050
217 rod 1048
218 li_ 1047
219 raw 1044
220 ący 1043
221 _se 1042
222 twa 1042
223 od_ 1039
224 ty_ 1037
225 tor 1036
226 ran 1034
227 wy_ 1033
228 _tr 1032
229 zne 1029
230 _or 1028
231 ast 1024
232 zez 1022
233 iec 1021
234 kom 1021
235 cho 1017
236 iel 1016
237 akt 1014
238 _pl 1013
239 poz 1010
240 jed 1008
241 ał_ 1005
242 cy_ 991
243 _da 989
244 mu_ 987
245 we_ 985
246 ków 983
247 ak_ 982
248 by_ 979
249 _ba 971
250 ją_ 967
251 ry_ 963
252 az_ 961
253 stę 961
254 le_ 960
255 my_ 959
256 ada 954
257 rów 953
258 zes 953
259 ona 952
260 tu_ 952
261 yjn 951
262 _sz 947
263 ecz 946
264 taw 945
265 ość 944
266 usz 944
267 dy_ 937
268 ędz 937
269 sty 933
270 ko_ 932
271 rma 932
272 art 931
273 ek_ 929
274 _dl 927
275 ta_ 926
276 _ka 923
277 ują 922
278 wsz 916
279 ekt 914
280 zcz 912
281 szc 910
282 _lu 903
283 tęp 903
284 sys 902
285 sie 899
286 two 897
287 liz 896
288 niu 894
289 cyj 889
290 ion 887
291 tni 884
292 sow 881
293 odz 880
294 _by 879
295 one 875
296 er_ 874
297 ini 874
298 uje 872
299 bra 865
300 kac 865
301 _no 864
302 wym 864
303 acz 857
304 nac 857
305 por 855
306 yci 848
307 pol 840
308 _ok 839
309 dla 837
310 zon 837
311 aty 834
312 ące 830
313 ust 828
314 ana 818
315 pli 818
316 _co 813
317 sza 813
318 erw 811
319 uży 809
320 tak 806
321 men 803
322 ucz 801
323 acy 800
324 bie 800
325 erz 800
326 lic 800
327 _ak 798
328 ni_ 796
329 zed 796
330 _fu 794
331 pos 794
332 mia 791
333 wią 790
334 tał 788
335 yć_ 788
336 _li 782
337 lub 781
338 leż 779
339 mat 777
340 wyk 774
341 tal 761
342 fun 757
343 ocz 752
344 eń_ 749
345 inf 747
346 zap 740
347 _pi 737
348 on_ 736
349 adz 733
350 nio 732
351 min 731
352 cia 727
353 ozw 725
354 tów 725
355 wej 724
356 _ra 722
357 ero 721
358 oni 721
359 tro 721
360 dno 720
361 ną_ 719
362 _ch 718
363 ub_ 718
364 unk 718
365 oda 716
366 opr 712
367 ład 709
368 mow 705
369 ży_ 703
370 isk 702
371 zwi 702
372 oso 700
373 ies 699
374 zaw 699
375 own 698
376 ktu 697
377 omi 696
378 _an 694
379 zny 694
380 _kr 692
381 kcj 692
382 mac 691
383 iza 688
384 wać 688
385 ech 686
386 int 686
387 cen 685
388 _zn 684
389 ała 684
390 ła_ 682
391 edz 681
392 _uż 680
393 iąz 679
394 nte 679
395 świ 677
396 _os 676
397 _kl 675
398 arc 673
399 lne 671
400 pis 671
401 ję_ 667
402 ele 665
403 ymi 664
404 tko 662
405 awa 661
406 oże 655
407 tyw 655
408 bez 652
409 zni 650
410 _be 649
411 kre 649
412 teg 646
413 ako 644
414 tar 644
415 _uc 643
416 esz 643
417 ić_ 643
418 ano 637
419 _sa 636
420 aki 636
421 kła 636
422 rzą 636
423 _bi 635
424 nas 633
425 ian 632
426 ser 632
427 dos 631
428 uch 628
429 odu 627
430 odn 626
431 _fi 625
432 omo 624
433 ącz 622
434 res 621
435 ówn 620
436 eci 619
437 oku 616
438 ięc 615
439 weg 615
440 stn 613
441 łąc 613
442 wię 612
443 nar 610
444 pom 609
445 _dr 606
446 liw 606
447 zec 604
448 zeg 602
449 _że 596
450 wor 596
451 ieg 594
452 naj 592
453 kty 591
454 yko 591
455 eży 590
456 ejs 588
457 dza 587
458 _ab 586
459 amo 586
460 ząd 586
461 tek 585
462 _go 584
463 ros 584
464 ato 583
465 stk 582
466 tki 582
467 ows 581
468 osz 580
469 słu 580
470 nta 579
471 _me 578
472 aby 577
473 ca_ 577
474 owo 576
475 _są 574
476 wo_ 574
477 ńst 569
478 kor 568
479 _we 565
480 _ró 564
481 _ce 563
482 ma_ 563
483 tym 563
484 zos 563
485 roc 562
486 _al 561
487 nfo 561
488 _zo 560
489 cje 558
490 ers 557
491 lny 555
492 no_ 555
493 bli 554
494 ort 554
495 ole 553
496 _ze 551
497 rto 551
498 zym 550
499 _wł 549
500 inn 549
501 są_ 548
502 _zw 547
503 dze 547
504 oce 547
505 cyc 546
506 ły_ 545
507 jny 544
508 ate 543
509 zeń 542
510 dar 541
511 ron 540
512 po_ 539
513 czo 538
514 und 537
515 iad 536
516 _dy 534
517 _zm 534
518 omp 534
519 rzę 534
520 emu 533
521 ont 533
522 _r_ 530
523 eśl 530
524 chn 526
525 am_ 525
526 kto 525
527 spr 524
528 _de 522
529 tru 522
530 kan 521
531 rak 520
532 _gr 519
533 rok 517
534 _fo 516
535 wys 516
536 moc 514
537 rz_ 513
538 wsk 512
539 iow 511
540 ono 507
541 ard 506
542 re_ 504
543 ala 503
544 wyb 503
545 lac 502
546 _zb 500
547 asz 500
548 oko 500
549 zęd 499
550 ii_ 498
551 met 498
552 toś 498
553 api 496
554 sz_ 496
555 zar 496
556 wad 494
557 mis 493
558 zak 493
559 ańs 491
560 nst 489
561 ikó 488
562 tac 488
563 żli 488
564 tur 487
565 cję 486
566 nan 486
567 spó 486
568 dus 484
569 iet 484
570 iek 483
571 szu 482
572 ożl 481
573 bud 480
574 odo 480
575 ual 480
576 om_ 479
577 óry 479
578 ndu 478
579 zmi 478
580 sam 474
581 ara 473
582 as_ 471
583 or_ 471
584 par 471
585 ło_ 471
586 będ 469
587 hod 469
588 ik_ 469
589 pon 469
590 en_ 467
591 ins 467
592 ado 465
593 te_ 465
594 ysk 464
595 _ki 463
596 ryw 463
597 wis 463
598 ąc_ 463
599 _zd 462
600 at_ 462
601 im_ 462
602 lem 462
603 odp 462
604 _bu 461
605 edy 461
606 ług 460
607 cis 458
608 bio 457
609 obi 457
610 _pe 455
611 ięk 455
612 pla 455
613 ruc 455
614 dro 454
615 _ul 453
616 god 453
617 mni 453
618 sob 452
619 tos 451
620 łow 451
621 czą 449
622 ań_ 448
623 kic 448
624 kol 448
625 yma 448
626 _śr 447
627 tec 447
628 wła 447
629 osi 446
630 óre 446
631 dom 445
632 ąza 444
633 ans 443
634 ata 443
635 czę 443
636 tua 443
637 and 441
638 kli 441
639 zyn 441
640 być 440
641 yta 440
642 ępn 440
643 ior 438
644 esi 437
645 pły 437
646 _wp 435
647 gan 435
648 _um 434
649 nos 434
650 wal 434
651 lko 433
652 zu_ 433
653 enc 431
654 obr 431
655 _ją 429
656 oje 429
657 oli 429
658 tyk 428
659 tyl 426
660 an_ 425
661 koś 425
662 dys 424
663 ież 424
664 żyt 424
665 ewn 423
666 ęci 423
667 mod 421
668 red 421
669 zer 421
670 jne 419
671 ząc 419
672 aga 418
673 wsp 418
674 dat 417
675 rdz 417
676 okr 414
677 łów 414
678 pot 413
679 da_ 412
680 eks 412
681 ezp 412
682 uni 412
683 nis 410
684 eli 409
685 _ry 408
686 eto 408
687 low 408
688 mów 408
689 _ap 405
690 ars 405
691 pop 405
692 zow 405
693 ern 403
694 lec 402
695 ces 401
696 co_ 400
697 _św 399
698 dok 399
699 _ur 398
700 su_ 398
701 rob 395
702 wod 395
703 zej 395
704 pół 394
705 ura 394
706 wid 394
707 zyk 394
708 _bę 393
709 nny 393
710 nt_ 393
711 sku 393
712 oty 392
713 ądz 390
714 ały 389
715 log 389
716 ols 389
717 sko 389
718 śni 389
719 cha 388
720 hni 388
721 _wo 387
722 _au 386
723 lu_ 385
724 ntr 385
725 lsk 384
726 mon 383
727 win 383
728 poc 382
729 zwa 382
730 _gł 381
731 lon 381
732 opi 381
733 zal 380
734 dzo 379
735 ten 379
736 pad 378
737 ogi 377
738 pań 377
739 akc 376
740 _tw 375
741 ict 374
742 lan 374
743 ctw 373
744 lni 373
745 per 373
746 _pł 372
747 otr 372
748 yni 372
749 śli 371
750 śro 371
751 _ci 370
752 _gd 370
753 rad 370
754 zew 370
755 był 369
756 ute 369
757 _e_ 368
758 lat 368
759 net 367
760 du_ 366
761 ed_ 366
762 gi_ 366
763 ałe 365
764 es_ 365
765 ina 362
766 kat 362
767 ytk 362
768 zbi 362
769 rat 361
770 szt 360
771 cel 358
772 cią 357
773 dst 357
774 ksz 357
775 róż 357
776 óln 357
777 et_ 356
778 żna 356
779 _s_ 355
780 aci 355
781 zaj 355
782 _ar 354
783 in_ 354
784 umi 353
785 _wz 352
786 oma 351
787 ożn 351
788 bar 350
789 zek 349
790 zi_ 349
791 duk 348
792 eko 347
793 obo 347
794 rus 347
795 ga_ 346
796 ybr 345
797 ywn 344
798 ory 343
799 try 343
800 _mu 342
801 _pu 342
802 eż_ 342
803 ią_ 342
804 kra 342
805 ad_ 341
806 ało 341
807 ncj 340
808 _oc 339
809 kar 339
810 nad 339
811 mar 338
812 nwe 338
813 żen 337
814 _ud 336
815 óżn 336
816 _ca 335
817 lno 335
818 rea 335
819 ope 334
820 adk 333
821 ame 333
822 dpo 333
823 elo 333
824 onf 333
825 och 332
826 wno 332
827 ema 330
828 woś 330
829 naz 329
830 nu_ 329
831 ods 329
832 zad 329
833 łem 329
834 _rz 328
835 wał 327
836 olo 326
837 _is 325
838 ełn 325
839 kry 325
840 odk 325
841 peł 325
842 szk 325
843 ąć_ 325
844 _m_ 324
845 alo 323
846 ru_ 323
847 ul_ 323
848 dku 322
849 rsz 322
850 rci 321
851 cow 320
852 der 320
853 nat 320
854 ysz 320
855 poł 319
856 _sł 318
857 rwe 318
858 wne 317
859 iki 316
860 len 316
861 stu 316
862 _ha 315
863 nię 315
864 _n_ 314
865 oto 314
866 _on 313
867 _sw 313
868 _le 312
869 nty 312
870 yza 312
871 zyp 311
872 _br 310
873 ruk 310
874 _dn 309
875 man 309
876 mię 309
877 obl 309
878 tat 309
879 wą_ 309
880 ner 308
881 run 308
882 _el 307
883 kaz 307
884 ryc 307
885 wyd 307
886 mag 306
887 pcj 305
888 udz 305
889 _ad 304
890 emy 304
891 bib 303
892 ing 303
893 nak 303
894 ylk 303
895 ypa 302
896 zda 302
897 zeb 302
898 dob 300
899 baz 299
900 cz_ 299
901 tom 299
902 wny 299
903 zło 299
904 _ot 298
905 pyt 298
906 rol 298
907 wyn 298
908 ycj 298
909 azw 297
910 pan 297
911 rza 297
912 sk_ 297
913 ume 297
914 dcz 296
915 pre 296
916 woj 295
917 duj 294
918 lis 294
919 nku 294
920 ukt 294
921 _of 293
922 wir 293
923 żyw 293
924 pni 292
925 spe 292
926 zam 292
927 śre 292
928 _og 290
929 tio 290
930 łan 290
931 azy 289
932 ozy 289
933 atu 288
934 eme 288
935 tel 288
936 ury 288
937 uwa 288
938 mog 287
939 reś 287
940 amy 286
941 nag 286
942 sa_ 286
943 zyw 286
944 etr 285
945 irm 285
946 łu_ 285
947 _zł 284
948 ere 284
949 fir 284
950 las 283
951 wow 283
952 _dw 282
953 _mn 281
954 aro 281
955 gól 281
956 twi 281
957 ywi 281
958 awn 280
959 daj 280
960 dot 280
961 każ 280
962 oka 280
963 zem 280
964 anu 279
965 ela 279
966 oln 279
967 awo 278
968 jeś 278
969 rej 278
970 uro 278
971 iwo 277
972 put 277
973 amu 276
974 eru 276
975 ode 276
976 zpi 276
977 _zg 275
978 mpu 275
979 rtu 275
980 ył_ 275
981 cą_ 274
982 emi 274
983 nkc 274
984 rst 274
985 wat 274
986 _la 273
987 gen 273
988 rys 273
989 umo 273
990 ępu 273
991 cej 272
992 uto 272
993 ark 271
994 ozn 271
995 _ed 270
996 bow 270
997 eta 270
998 iat 270
999 iru 270
1000 opc 270
1001 pew 270
1002 wol 270
1003 dod 269
1004 ntó 269
1005 wes 269
1006 wi_ 269
1007 aut 268
1008 ord 268
1009 atk 267
1010 dne 267
1011 rne 267
1012 iew 266
1013 nni 266
0 # Licensed to the Apache Software Foundation (ASF) under one or more
1 # contributor license agreements. See the NOTICE file distributed with
2 # this work for additional information regarding copyright ownership.
3 # The ASF licenses this file to You under the Apache License, Version 2.0
4 # (the "License"); you may not use this file except in compliance with
5 # the License. You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 os_ 106562
15 _de 94943
16 de_ 78029
17 ão_ 73364
18 _co 70082
19 as_ 67724
20 ent 63763
21 _qu 59914
22 que 56968
23 _a_ 55050
24 do_ 50486
25 _se 49212
26 ue_ 48768
27 nte 47706
28 te_ 43622
29 es_ 41035
30 men 38593
31 da_ 37937
32 to_ 37913
33 _pr 37585
34 _re 37101
35 est 35702
36 com 34832
37 con 33816
38 em_ 33441
39 _do 32672
40 _es 32519
41 _e_ 32507
42 _po 32472
43 ção 32297
44 _pa 31603
45 _o_ 31584
46 _da 31273
47 ia_ 28540
48 ar_ 28269
49 res 28202
50 ra_ 27948
51 sta 25994
52 dos 25956
53 par 25402
54 _um 25401
55 _no 25025
56 ado 24915
57 nto 23364
58 or_ 22082
59 er_ 21774
60 ma_ 21744
61 açã 21040
62 ame 20887
63 se_ 20757
64 pre 20137
65 _em 19954
66 _in 19858
67 pro 19578
68 ida 19522
69 is_ 18817
70 ara 18446
71 ta_ 18437
72 ica 18404
73 _pe 18337
74 sen 18221
75 _di 17530
76 por 17260
77 _ma 17070
78 mos 16899
79 ter 16877
80 _as 16774
81 _me 16634
82 dad 16600
83 cia 16511
84 uma 16455
85 ess 16398
86 tra 16270
87 ade 16261
88 des 15992
89 ria 15577
90 al_ 15111
91 no_ 15108
92 _te 14909
93 ões 14808
94 um_ 14573
95 nta 14430
96 das 14350
97 são 14212
98 _os 14194
99 io_ 14192
100 ais 14167
101 ant 14084
102 iss 13625
103 não 13461
104 rop 13397
105 _nã 13395
106 _é_ 13171
107 ont 13118
108 _na 13074
109 nos 13062
110 tiv 12897
111 ito 12896
112 ada 12829
113 tad 12776
114 ons 12392
115 ela 12316
116 nci 12089
117 ste 11923
118 rio 11757
119 enh 11729
120 so_ 11496
121 tar 11461
122 tos 11247
123 uro 11192
124 hor 11032
125 pos 11024
126 na_ 11017
127 _eu 10972
128 mo_ 10866
129 nho 10866
130 sso 10690
131 omi 10640
132 çõe 10399
133 ist 10262
134 mis 10234
135 era 10204
136 ro_ 10200
137 am_ 10193
138 eur 10130
139 rec 10096
140 om_ 10046
141 _so 9861
142 _ao 9830
143 ser 9818
144 _tr 9786
145 ca_ 9780
146 tam 9767
147 per 9754
148 _ap 9673
149 _al 9658
150 ect 9564
151 ver 9488
152 for 9481
153 ntr 9414
154 _ac 9353
155 _ca 9305
156 _ex 9293
157 tem 9248
158 eit 9235
159 _en 9219
160 qua 9034
161 ime 8935
162 esp 8865
163 ran 8858
164 sid 8855
165 _fa 8763
166 _fo 8732
167 mas 8639
168 ora 8629
169 _ne 8572
170 tic 8563
171 ope 8466
172 ide 8439
173 tes 8388
174 _su 8376
175 ssã 8355
176 re_ 8323
177 ese 8299
178 oss 8245
179 ido 8211
180 uni 8210
181 int 8208
182 _à_ 8189
183 iva 8175
184 ssa 8166
185 ári 8155
186 ndo 8121
187 _ta 8067
188 emo 8061
189 _to 7968
190 ece 7913
191 sse 7912
192 ura 7784
193 rel 7763
194 ass 7728
195 mai 7648
196 ram 7548
197 ita 7497
198 ost 7487
199 den 7473
200 omo 7470
201 ou_ 7463
202 sa_ 7368
203 der 7355
204 ali 7352
205 eri 7351
206 lo_ 7317
207 ten 7311
208 spe 7290
209 ir_ 7147
210 egu 7129
211 ros 7115
212 pel 7067
213 eu_ 7040
214 gra 6993
215 lat 6985
216 ere 6934
217 ao_ 6878
218 fic 6878
219 orm 6832
220 eir 6796
221 cio 6770
222 ort 6754
223 str 6660
224 tan 6635
225 ati 6620
226 eve 6602
227 rma 6598
228 sti 6573
229 _mu 6512
230 amb 6510
231 ém_ 6467
232 ina 6454
233 pri 6443
234 nde 6440
235 _ou 6344
236 açõ 6287
237 _si 6277
238 eci 6249
239 reg 6226
240 _ve 6210
241 dir 6158
242 cto 6140
243 ênc 6128
244 obr 6108
245 art 6093
246 pei 6087
247 lho 6046
248 end 6024
249 raç 5997
250 and 5978
251 ico 5954
252 ona 5904
253 vel 5857
254 co_ 5802
255 el_ 5771
256 tas 5765
257 va_ 5763
258 iza 5727
259 seg 5718
260 lam 5713
261 ion 5694
262 emp 5692
263 _im 5671
264 ual 5667
265 ire 5664
266 _an 5649
267 tur 5635
268 dev 5609
269 ho_ 5572
270 imp 5569
271 ode 5568
272 tro 5561
273 lar 5555
274 esi 5545
275 _vi 5512
276 bre 5444
277 cor 5371
278 eia 5360
279 amo 5358
280 dep 5357
281 _fi 5339
282 ili 5321
283 _mo 5306
284 ces 5289
285 ios 5276
286 cas 5275
287 tor 5273
288 ias 5222
289 ião 5200
290 nti 5178
291 cti 5166
292 pen 5166
293 tod 5161
294 act 5155
295 uta 5153
296 pod 5129
297 _un 5090
298 lid 5078
299 rta 5033
300 fer 5032
301 ele 5025
302 nda 5019
303 sob 5018
304 nse 5009
305 ici 4964
306 rem 4950
307 nst 4946
308 ore 4919
309 ome 4891
310 ind 4883
311 itu 4841
312 min 4839
313 mpo 4831
314 rei 4827
315 mpr 4795
316 efe 4776
317 ref 4772
318 pol 4761
319 ses 4755
320 íti 4736
321 uit 4734
322 car 4717
323 odo 4717
324 bém 4713
325 mbé 4713
326 dem 4679
327 rat 4667
328 und 4666
329 ema 4658
330 ssi 4650
331 rad 4634
332 tão 4628
333 cer 4612
334 man 4585
335 ivo 4584
336 dis 4581
337 ens 4561
338 sto 4547
339 ord 4540
340 lic 4534
341 rá_ 4530
342 eco 4528
343 qui 4525
344 nha 4521
345 _gr 4500
346 nal 4500
347 spo 4493
348 niã 4449
349 vo_ 4449
350 ira 4439
351 ano 4426
352 emb 4419
353 are 4417
354 cad 4410
355 edi 4377
356 elh 4361
357 elo 4357
358 mui 4351
359 _li 4340
360 tal 4329
361 pon 4322
362 ari 4290
363 nas 4289
364 inc 4271
365 sem 4269
366 _cr 4229
367 ori 4229
368 ial 4227
369 _or 4208
370 opo 4188
371 vid 4158
372 la_ 4129
373 olí 4126
374 ras 4118
375 aci 4110
376 _at 4109
377 lme 4106
378 go_ 4097
379 put 4093
380 _ob 4079
381 me_ 4076
382 col 4070
383 epu 4048
384 liz 4048
385 óri 4048
386 lít 4036
387 rla 4024
388 _fu 4005
389 ate 3992
390 ern 3983
391 nes 3975
392 rar 3975
393 erc 3961
394 uer 3926
395 omp 3919
396 erm 3911
397 tór 3905
398 ula 3903
399 _mi 3896
400 cid 3889
401 _sa 3857
402 nça 3850
403 egi 3847
404 arl 3827
405 ata 3817
406 rte 3813
407 _go 3808
408 íve 3780
409 apr 3769
410 ça_ 3758
411 nic 3743
412 ini 3739
413 _is 3733
414 iro 3702
415 alm 3672
416 anç 3671
417 _ci 3662
418 _le 3650
419 mun 3641
420 oci 3640
421 tri 3619
422 onc 3592
423 stã 3582
424 _el 3580
425 tua 3575
426 sua 3550
427 ifi 3546
428 ien 3536
429 enc 3532
430 omu 3514
431 rti 3507
432 sas 3494
433 _ag 3493
434 vol 3493
435 taç 3492
436 sár 3491
437 mer 3480
438 ua_ 3466
439 pec 3465
440 ssá 3451
441 ndi 3437
442 fun 3410
443 alt 3409
444 fin 3402
445 tid 3383
446 cul 3370
447 ues 3348
448 sos 3346
449 bil 3339
450 aís 3330
451 cri 3328
452 gar 3326
453 paí 3314
454 poi 3308
455 dec 3290
456 mes 3286
457 uan 3283
458 gos 3282
459 gun 3280
460 ató 3270
461 bal 3265
462 leg 3264
463 tre 3263
464 mem 3262
465 içã 3251
466 _ad 3249
467 ins 3248
468 equ 3232
469 roc 3227
470 dam 3223
471 zer 3223
472 erá 3216
473 ret 3210
474 ren 3200
475 rda 3199
476 esc 3190
477 aba 3180
478 unt 3179
479 out 3168
480 esa 3167
481 _vo 3165
482 mbr 3156
483 sol 3149
484 laç 3132
485 tit 3128
486 erv 3120
487 utr 3120
488 ega 3113
489 _ju 3110
490 _am 3103
491 did 3098
492 mpl 3090
493 mei 3086
494 aqu 3082
495 aco 3073
496 nce 3072
497 bro 3067
498 _ba 3065
499 lem 3064
500 ral 3061
501 ven 3058
502 nco 3055
503 nsi 3054
504 ena 3053
505 rim 3049
506 dor 3045
507 sej 3045
508 cos 3034
509 peu 3033
510 eja 3032
511 uçã 3029
512 ond 3021
513 cei 3020
514 seu 3017
515 ha_ 3007
516 aos 3002
517 smo 2996
518 ve_ 2988
519 cis 2986
520 rea 2986
521 nec 2951
522 po_ 2939
523 vis 2935
524 sit 2932
525 uto 2924
526 ult 2922
527 env 2919
528 ero 2897
529 _sã 2889
530 rmo 2885
531 tin 2885
532 enç 2883
533 mit 2881
534 ior 2880
535 olv 2880
536 dia 2860
537 soc 2859
538 eme 2857
539 _au 2843
540 sel 2825
541 vam 2817
542 ala 2814
543 até 2801
544 ois 2775
545 nov 2773
546 pes 2768
547 ber 2765
548 jec 2763
549 zaç 2763
550 ova 2752
551 alh 2748
552 vos 2731
553 evi 2730
554 inh 2730
555 ust 2722
556 _ho 2709
557 _ab 2708
558 fac 2704
559 rab 2700
560 cre 2697
561 err 2687
562 ert 2678
563 ama 2673
564 áve 2664
565 mar 2662
566 nvo 2658
567 anc 2657
568 _cu 2651
569 im_ 2647
570 exi 2643
571 ete 2641
572 rna 2634
573 faz 2626
574 olu 2612
575 alg 2608
576 ape 2608
577 ota 2601
578 orç 2590
579 rev 2582
580 sim 2575
581 mbi 2569
582 _ce 2568
583 éri 2561
584 cla 2555
585 tir 2554
586 abe 2541
587 caç 2536
588 ani 2529
589 orr 2514
590 já_ 2512
591 _já 2509
592 nad 2508
593 tim 2479
594 _be 2469
595 oce 2462
596 esm 2455
597 rov 2454
598 rig 2451
599 ove 2449
600 rod 2446
601 imo 2443
602 dar 2441
603 aze 2426
604 _lo 2419
605 nid 2419
606 _fe 2414
607 osi 2409
608 eno 2398
609 ima 2394
610 dic 2384
611 _va 2375
612 rmi 2375
613 ans 2370
614 lte 2364
615 erg 2362
616 _ch 2360
617 tud 2358
618 erd 2356
619 cen 2352
620 vas 2337
621 íse 2336
622 iti 2334
623 _ec 2327
624 lgu 2325
625 oi_ 2325
626 stá 2324
627 foi 2316
628 _aq 2315
629 ja_ 2315
630 vez 2314
631 pa_ 2312
632 imi 2311
633 ez_ 2302
634 squ 2299
635 _lu 2289
636 med 2288
637 igo 2286
638 ace 2285
639 gur 2283
640 _ge 2281
641 rre 2275
642 rep 2271
643 nsa 2265
644 ric 2264
645 ce_ 2259
646 sis 2258
647 ns_ 2256
648 ui_ 2251
649 _op 2243
650 gui 2243
651 nom 2240
652 _ef 2238
653 pli 2232
654 às_ 2226
655 ble 2224
656 eis 2223
657 rca 2222
658 tec 2222
659 içõ 2220
660 oca 2218
661 _às 2216
662 nve 2192
663 olo 2191
664 _ra 2188
665 _nu 2186
666 tá_ 2186
667 _la 2180
668 dif 2178
669 ext 2171
670 iar 2165
671 lhe 2163
672 inf 2149
673 rog 2141
674 lvi 2134
675 gua 2133
676 us_ 2133
677 rid 2129
678 abi 2127
679 opa 2126
680 nçã 2124
681 ego 2119
682 _há 2116
683 iam 2115
684 tru 2114
685 ço_ 2112
686 iga 2100
687 odu 2097
688 rin 2090
689 ume 2090
690 _fr 2089
691 oda 2088
692 oje 2088
693 há_ 2087
694 vem 2087
695 ogr 2084
696 nac 2076
697 los 2072
698 bat 2066
699 diz 2061
700 ovo 2059
701 lta 2052
702 isa 2050
703 rno 2050
704 pas 2046
705 bor 2045
706 isc 2033
707 nan 2031
708 gor 2027
709 onf 2026
710 cam 2020
711 ato 2019
712 fei 2017
713 ive 2014
714 ocu 2014
715 lha 2007
716 lim 2003
717 vei 2003
718 exe 1998
719 uir 1998
720 ain 1996
721 apo 1996
722 bri 1996
723 mpe 1995
724 efi 1994
725 uas 1994
726 rib 1990
727 agr 1988
728 ola 1984
729 sab 1984
730 bem 1983
731 cim 1981
732 tár 1976
733 eus 1975
734 cçã 1970
735 ibu 1967
736 clu 1963
737 dei 1960
738 tui 1946
739 aso 1944
740 rit 1941
741 bas 1938
742 rên 1937
743 quo 1932
744 rdo 1932
745 sar 1932
746 exp 1928
747 can 1927
748 cie 1926
749 pró 1924
750 _ai 1922
751 ple 1910
752 eal 1898
753 age 1891
754 itá 1883
755 siç 1871
756 udo 1868
757 _cl 1867
758 mat 1862
759 bje 1855
760 iad 1847
761 obj 1841
762 rça 1840
763 gen 1838
764 lan 1838
765 uo_ 1835
766 vim 1834
767 nor 1832
768 _ti 1828
769 ana 1825
770 abo 1823
771 ole 1817
772 utu 1810
773 rga 1804
774 sam 1804
775 ava 1802
776 isp 1802
777 upo 1801
778 atr 1797
779 ced 1789
780 num 1785
781 rde 1782
782 ize 1778
783 ris 1777
784 mod 1768
785 nai 1759
786 mic 1757
787 gem 1744
788 def 1741
789 lis 1740
790 iaç 1739
791 ssu 1732
792 rão 1726
793 via 1723
794 cur 1718
795 rup 1718
796 til 1712
797 rob 1707
798 lei 1706
799 tér 1702
800 luç 1698
801 ers 1696
802 eio 1694
803 raz 1693
804 xis 1693
805 ham 1692
806 uti 1687
807 ase 1678
808 bli 1677
809 rav 1669
810 tai 1667
811 deb 1663
812 obl 1657
813 red 1656
814 _ha 1651
815 uaç 1650
816 lec 1647
817 _ar 1646
818 ite 1644
819 lev 1641
820 rqu 1640
821 ivi 1639
822 nhe 1639
823 _ir 1632
824 onh 1630
825 uiç 1627
826 gru 1625
827 rom 1624
828 uga 1622
829 eba 1621
830 ctu 1619
831 eva 1619
832 gia 1615
833 ote 1610
834 sív 1603
835 atu 1598
836 ibi 1597
837 vot 1597
838 arm 1594
839 _af 1591
840 ave 1590
841 maç 1590
842 ncl 1584
843 nar 1581
844 aio 1580
845 mel 1578
846 _tu 1574
847 hum 1572
848 sub 1569
849 ger 1566
850 val 1561
851 íci 1554
852 _ev 1553
853 ves 1552
854 egr 1551
855 dão 1545
856 _du 1541
857 anh 1538
858 nis 1529
859 ecu 1525
860 uin 1524
861 las 1522
862 ede 1521
863 onv 1517
864 icu 1513
865 uda 1513
866 ós_ 1513
867 ife 1511
868 ian 1510
869 nív 1509
870 arg 1507
871 eta 1505
872 sõe 1502
873 efo 1496
874 plo 1480
875 nóm 1478
876 uns 1478
877 tom 1475
878 les 1474
879 soa 1474
880 çam 1474
881 rvi 1473
882 vad 1473
883 rot 1470
884 eda 1468
885 lug 1459
886 ene 1456
887 zad 1450
888 gum 1448
889 mil 1444
890 tou 1443
891 eli 1438
892 aut 1432
893 jud 1430
894 ei_ 1428
895 sco 1426
896 _ga 1423
897 mul 1423
898 gov 1419
899 duz 1417
900 rie 1414
901 uai 1414
902 uad 1413
903 rce 1412
904 eti 1411
905 ga_ 1408
906 orn 1408
907 _pl 1407
908 ómi 1407
909 nit 1406
910 nfo 1402
911 har 1399
912 afi 1397
913 nso 1396
914 _ní 1393
915 sum 1393
916 eso 1392
917 fal 1392
918 rgu 1392
919 ssí 1392
920 apl 1390
921 onó 1390
922 _bo 1388
923 unc 1388
924 lad 1387
925 rme 1387
926 gul 1386
927 áti 1386
928 hec 1385
929 lti 1385
930 ilh 1379
931 gis 1377
932 cta 1374
933 rol 1370
934 _só 1364
935 tab 1363
936 acç 1357
937 rtu 1356
938 mon 1348
939 sec 1347
940 _hu 1345
941 ino 1343
942 cip 1340
943 nam 1340
944 tig 1335
945 sul 1332
946 lia 1331
947 ism 1326
948 loc 1326
949 ços 1324
950 inu 1323
951 lib 1322
952 iai 1321
953 ânc 1317
954 oso 1315
955 rto 1315
956 fra 1311
957 oma 1311
958 isã 1302
959 nsu 1301
960 ãos 1298
961 irm 1290
962 _lh 1284
963 meu 1281
964 il_ 1277
965 ong 1274
966 só_ 1274
967 cit 1273
968 igu 1272
969 mad 1268
970 té_ 1267
971 che 1263
972 ulo 1258
973 nua 1257
974 ner 1254
975 ago 1251
976 gas 1249
977 sad 1249
978 nat 1246
979 gad 1245
980 mid 1244
981 lor 1243
982 rac 1243
983 pio 1242
984 fec 1240
985 ast 1236
986 bra 1236
987 púb 1236
988 úbl 1235
989 nsp 1231
990 paç 1231
991 nif 1230
992 róp 1230
993 dim 1229
994 ltu 1229
995 fir 1226
996 ópr 1224
997 bit 1223
998 gue 1223
999 ign 1222
1000 pla 1222
1001 ipa 1219
1002 adã 1217
1003 ane 1216
1004 ecç 1211
1005 tân 1211
1006 adi 1207
1007 ibe 1207
1008 je_ 1207
1009 bie 1200
1010 za_ 1198
1011 az_ 1195
1012 apa 1194
1013 len 1194
0 # Licensed to the Apache Software Foundation (ASF) under one or more
1 # contributor license agreements. See the NOTICE file distributed with
2 # this work for additional information regarding copyright ownership.
3 # The ASF licenses this file to You under the Apache License, Version 2.0
4 # (the "License"); you may not use this file except in compliance with
5 # the License. You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 _de 275797
15 de_ 221617
16 _în 201809
17 te_ 185452
18 ul_ 173696
19 re_ 147553
20 şi_ 144596
21 în_ 138751
22 le_ 138646
23 _a_ 133790
24 _şi 129587
25 are 128007
26 ea_ 126532
27 _ca 107537
28 in_ 103624
29 _di 103605
30 _pr 101695
31 est 101638
32 _co 100783
33 ui_ 98427
34 ia_ 96188
35 ii_ 94435
36 lui 91306
37 din 89879
38 _pe 87943
39 la_ 87244
40 or_ 86340
41 _la 85811
42 ste 85394
43 _ma 83317
44 ate 82009
45 _un 81730
46 _al 80569
47 ent 76873
48 at_ 76298
49 _cu 76154
50 ei_ 75281
51 tă_ 74565
52 _re 73436
53 ele 69971
54 al_ 68556
55 ntr 67751
56 ulu 67602
57 rea 66513
58 tat 65973
59 _se 65202
60 _es 62123
61 _in 61236
62 eri 60921
63 ie_ 60412
64 car 60313
65 _ce 60304
66 _fo 59970
67 lor 59578
68 tor 56819
69 con 56788
70 _o_ 55465
71 cu_ 55403
72 _ac 54614
73 că_ 54083
74 tul 53892
75 se_ 53831
76 ori 52641
77 st_ 51801
78 ale 51347
79 ri_ 51093
80 pri 50682
81 au_ 50343
82 ist 49381
83 _su 48666
84 ter 48613
85 tru 48049
86 _po 47312
87 ita 46805
88 ile 46762
89 pre 45339
90 _ro 44548
91 ce_ 44471
92 aţi 44169
93 ne_ 43210
94 tre 42947
95 un_ 42868
96 int 42797
97 _lu 42307
98 nte 42206
99 uri 42127
100 pe_ 41815
101 _sa 41679
102 ar_ 41444
103 ace 41264
104 ai_ 41170
105 ani 41113
106 _fi 40769
107 ru_ 40371
108 ca_ 39967
109 _pa 39949
110 ali 39944
111 rom 39778
112 sta 39493
113 _au 39046
114 _tr 38650
115 ost 38644
116 _nu 38163
117 pro 37941
118 ilo 37912
119 tra 37768
120 _st 37626
121 _ar 37507
122 mai 37165
123 nd_ 37160
124 ici 37105
125 să_ 37075
126 _an 37073
127 ra_ 37044
128 art 36938
129 mân 36409
130 lit 36119
131 era 36065
132 tic 36047
133 ere 35981
134 tur 35373
135 str 35258
136 men 34656
137 _mo 34654
138 par 33961
139 _să 33809
140 it_ 32860
141 rat 32832
142 _or 32526
143 fos 32407
144 _lo 32368
145 tea 32242
146 nă_ 32048
147 uni 31985
148 pen 31966
149 an_ 31682
150 ată 31524
151 nt_ 31441
152 nia 31202
153 ica 31013
154 mar 30991
155 ră_ 30959
156 _mu 30944
157 rin 30651
158 mul 30630
159 per 30598
160 ine 30560
161 eni 30285
162 oar 30205
163 _că 30187
164 ta_ 30063
165 rii 29904
166 _do 29778
167 ril 29631
168 _si 29620
169 ni_ 29000
170 rul 28851
171 iei 28667
172 com 28654
173 omâ 28575
174 rit 28561
175 _mi 28466
176 une 28391
177 ari 28066
178 _li 28048
179 ran 27960
180 _da 27286
181 ţii 27231
182 nul 27089
183 loc 27076
184 rie 26978
185 na_ 26900
186 ice 26761
187 ind 26483
188 man 26445
189 cel 26408
190 ric 26341
191 rma 26153
192 ică 26017
193 atu 25888
194 el_ 25888
195 ion 25661
196 ces 25525
197 num 25470
198 şti 25305
199 ând 25271
200 ţi_ 25157
201 sti 25125
202 ona 25056
203 eşt 24934
204 _ap 24786
205 lă_ 24517
206 ite 24505
207 _te 24363
208 _le 24038
209 ili 24008
210 ora 23784
211 ast 23778
212 ant 23667
213 rec 23641
214 ic_ 23606
215 cal 23390
216 _me 23380
217 ări 23378
218 _ex 23216
219 ult 23097
220 nic 23019
221 cul 22957
222 tel 22804
223 _fa 22657
224 iul 22622
225 _pu 22592
226 _du 22549
227 _no 22348
228 ini 22340
229 edi 22323
230 cea 22318
231 ţie 22301
232 chi 22228
233 ită 22094
234 ina 22017
235 mun 21963
236 ial 21869
237 ara 21834
238 imp 21773
239 ati 21765
240 ţia 21688
241 ons 21687
242 anu 21565
243 înt 21525
244 esc 21469
245 ut_ 21469
246 col 21419
247 nu_ 21377
248 _as 21375
249 _ju 21163
250 _sc 21151
251 tri 21022
252 ect 21010
253 iun 20861
254 for 20836
255 rim 20699
256 ci_ 20657
257 nea 20620
258 olo 20619
259 res 20539
260 _sp 20396
261 ză_ 20353
262 _ve 20352
263 tin 20342
264 mat 20314
265 nal 20170
266 tar 20159
267 _vi 20021
268 rte 19783
269 ria 19682
270 nde 19673
271 cat 19627
272 _cr 19624
273 ire 19448
274 min 19442
275 _ba 19400
276 lul 19370
277 tiv 19350
278 scu 19335
279 itu 19309
280 ura 19041
281 reg 18994
282 va_ 18959
283 nta 18888
284 oca 18815
285 âni 18781
286 unt 18566
287 ume 18522
288 tan 18465
289 lic 18432
290 ene 18413
291 ală 18381
292 cur 18370
293 ti_ 18359
294 elo 18349
295 _so 18122
296 ito 18094
297 _ge 18037
298 _va 18023
299 and 17944
300 dat 17919
301 ver 17885
302 nce 17880
303 _ti 17757
304 _ne 17693
305 ţa_ 17524
306 înc 17402
307 des 17256
308 orm 17254
309 _to 17249
310 iar 17157
311 nat 17137
312 cia 17125
313 us_ 17115
314 enţ 17010
315 nti 16978
316 por 16959
317 nst 16737
318 ato 16732
319 şte 16599
320 oli 16590
321 ort 16550
322 _ad 16437
323 stă 16374
324 lea 16339
325 _fr 16327
326 der 16276
327 nit 16204
328 mit 16122
329 ude 16097
330 iu_ 16065
331 _gr 15912
332 fic 15902
333 _bu 15804
334 ian 15776
335 rti 15706
336 ame 15696
337 ază 15645
338 ral 15613
339 ers 15562
340 eas 15560
341 tal 15544
342 iţi 15504
343 cri 15396
344 reş 15368
345 iza 15344
346 _bi 15331
347 tim 15305
348 _im 15246
349 er_ 15231
350 cer 15093
351 rep 15028
352 raş 15027
353 pă_ 15023
354 bil 15010
355 ocu 14954
356 ier 14897
357 nţi 14856
358 ten 14815
359 _s_ 14793
360 _câ 14771
361 ser 14758
362 rop 14727
363 ma_ 14705
364 eaz 14646
365 ern 14644
366 _el 14613
367 ont 14610
368 erm 14590
369 fer 14569
370 nci 14556
371 oni 14450
372 on_ 14415
373 act 14376
374 pul 14374
375 pol 14224
376 ven 14180
377 jud 14150
378 _ia 14119
379 inc 14114
380 eze 14079
381 ţă_ 14078
382 _na 14076
383 cre 14047
384 ase 14036
385 rez 14028
386 sa_ 14006
387 cut 13982
388 deţ 13968
389 alt 13958
390 rel 13854
391 imi 13777
392 ond 13770
393 ăţi 13770
394 _at 13756
395 nii 13750
396 put 13732
397 olu 13724
398 egi 13722
399 ndu 13588
400 ide 13572
401 iin 13571
402 ula 13547
403 ţul 13510
404 lte 13471
405 sun 13468
406 ime 13442
407 cţi 13421
408 nsi 13400
409 tăţ 13400
410 iil 13387
411 sit 13381
412 ntu 13360
413 eci 13333
414 ţio 13325
415 inţ 13295
416 ata 13275
417 ima 13274
418 gra 13272
419 ris 13179
420 omu 13168
421 cen 13146
422 ans 13128
423 _pi 13115
424 _ci 13026
425 nţa 13012
426 ive 13011
427 ger 13005
428 es_ 12895
429 upă 12888
430 ins 12825
431 eţu 12816
432 toa 12698
433 _ră 12692
434 eme 12640
435 ară 12616
436 me_ 12609
437 dup 12591
438 oma 12577
439 nie 12492
440 lat 12464
441 unc 12455
442 iti 12453
443 sau 12450
444 tit 12450
445 ane 12440
446 _er 12429
447 stu 12410
448 lia 12375
449 nor 12349
450 _av 12259
451 ure 12203
452 sto 12200
453 ţin 12155
454 lim 12104
455 lan 12054
456 ope 12050
457 ecu 12006
458 lin 11992
459 dia 11967
460 nis 11947
461 umi 11933
462 oru 11913
463 fii 11815
464 _am 11811
465 _ur 11803
466 eco 11786
467 mon 11769
468 ioa 11684
469 sat 11672
470 sul 11665
471 ute 11646
472 dec 11637
473 vin 11632
474 _ch 11596
475 dar 11594
476 ord 11567
477 cum 11532
478 ndi 11528
479 ece 11523
480 lar 11483
481 mer 11481
482 sec 11441
483 uit 11433
484 den 11401
485 rar 11401
486 ner 11385
487 _vo 11194
488 cti 11181
489 cep 11157
490 leg 11154
491 ovi 11127
492 mel 11122
493 can 11094
494 oi_ 11066
495 tem 11061
496 abi 10975
497 _ni 10917
498 is_ 10913
499 duc 10884
500 _fe 10882
501 imb 10857
502 sup 10852
503 mic 10850
504 mpo 10846
505 ctu 10795
506 _br 10787
507 bli 10786
508 inu 10786
509 rio 10760
510 fra 10753
511 um_ 10716
512 eli 10698
513 rta 10698
514 tro 10691
515 dic 10676
516 ala 10593
517 scr 10567
518 gen 10542
519 omi 10530
520 zat 10487
521 bri 10453
522 spe 10437
523 _ra 10434
524 und 10427
525 eta 10416
526 rei 10395
527 ână 10370
528 oat 10362
529 _bo 10357
530 _pl 10357
531 _af 10350
532 anţ 10346
533 mil 10262
534 dis 10251
535 nda 10243
536 uno 10179
537 păr 10146
538 emi 10116
539 ivi 10111
540 ana 10107
541 _dr 10102
542 rad 10078
543 da_ 10040
544 iv_ 10031
545 ise 10026
546 ăto 10023
547 pun 9989
548 ren 9984
549 uţi 9965
550 ome 9936
551 mbr 9881
552 nei 9872
553 nţe 9847
554 ing 9824
555 elu 9752
556 one 9736
557 _cl 9676
558 nar 9620
559 ifi 9541
560 lie 9535
561 pân 9508
562 _pă 9502
563 fi_ 9471
564 ust 9457
565 sub 9442
566 omp 9419
567 spr 9408
568 cor 9392
569 cip 9338
570 _îm 9319
571 sil 9317
572 upr 9282
573 vol 9273
574 _is 9267
575 pra 9251
576 si_ 9224
577 ede 9211
578 ânt 9199
579 log 9165
580 epu 9159
581 căt 9139
582 iri 9115
583 cii 9099
584 ezi 9069
585 ore 9069
586 cop 9065
587 sem 9020
588 una 9016
589 ept 9015
590 ber 9012
591 unu 9006
592 alu 8999
593 liz 8996
594 dul 8976
595 che 8918
596 opu 8918
597 ade 8878
598 ege 8863
599 anc 8856
600 van 8851
601 gin 8838
602 pec 8833
603 asc 8832
604 tua 8806
605 ela 8790
606 sco 8771
607 iet 8752
608 oas 8741
609 mpl 8721
610 eve 8689
611 ung 8685
612 dus 8674
613 ete 8671
614 apa 8668
615 rem 8648
616 ubl 8575
617 _tu 8553
618 cto 8553
619 atr 8508
620 pop 8505
621 val 8497
622 esp 8495
623 _i_ 8474
624 il_ 8454
625 _ru 8421
626 iat 8412
627 eti 8372
628 ien 8351
629 riu 8348
630 ole 8340
631 dev 8337
632 cân 8332
633 şul 8321
634 ova 8315
635 nil 8309
636 uce 8307
637 dep 8251
638 împ 8243
639 naţ 8143
640 urm 8128
641 ură 8106
642 ean 8101
643 ese 8040
644 emb 8033
645 sal 7996
646 ico 7986
647 org 7980
648 etr 7979
649 dre 7977
650 _fu 7957
651 _ob 7941
652 ană 7928
653 ât_ 7925
654 fie 7882
655 _pâ 7880
656 nţă 7873
657 sc_ 7868
658 cun 7866
659 gre 7811
660 pat 7780
661 cât 7771
662 apr 7758
663 ron 7747
664 pub 7730
665 mpu 7712
666 nel 7710
667 med 7706
668 ret 7698
669 en_ 7683
670 red 7677
671 fac 7670
672 căr 7657
673 apo 7656
674 ntă 7631
675 _oc 7619
676 rac 7593
677 eal 7591
678 măr 7589
679 pla 7585
680 ech 7558
681 los 7549
682 _be 7547
683 evi 7536
684 til 7527
685 _ta 7522
686 sch 7499
687 vit 7491
688 osi 7488
689 tr_ 7482
690 asi 7472
691 oci 7464
692 uro 7447
693 les 7433
694 mod 7373
695 asa 7368
696 lon 7361
697 bis 7338
698 tei 7331
699 arc 7328
700 za_ 7302
701 ola 7295
702 ave 7291
703 os_ 7288
704 ină 7264
705 pot 7263
706 ău_ 7257
707 _ga 7256
708 lta 7254
709 _op 7251
710 oan 7248
711 ami 7216
712 ono 7211
713 gur 7195
714 mă_ 7191
715 exi 7190
716 rd_ 7184
717 uat 7182
718 luc 7171
719 ge_ 7164
720 ism 7155
721 scă 7150
722 zi_ 7130
723 ogi 7121
724 _fă 7118
725 dă_ 7116
726 lec 7080
727 dea 7050
728 uto 7042
729 lis 7039
730 enu 7029
731 cte 7020
732 ţiu 7012
733 _ec 7010
734 aş_ 7002
735 odu 6998
736 tră 6984
737 eor 6979
738 tot 6966
739 rme 6965
740 riv 6960
741 dov 6921
742 erc 6907
743 iec 6900
744 _ai 6879
745 imu 6878
746 dom 6876
747 _eu 6875
748 aşu 6872
749 ţil 6870
750 pie 6865
751 raf 6865
752 ună 6853
753 cin 6852
754 ove 6834
755 _mă 6818
756 _ha 6815
757 izi 6809
758 vie 6793
759 ct_ 6789
760 aşi 6788
761 raţ 6774
762 lun 6766
763 rev 6763
764 ict 6757
765 buc 6755
766 ba_ 6743
767 lel 6743
768 îns 6742
769 ze_ 6731
770 laţ 6694
771 opo 6682
772 ot_ 6676
773 rţi 6662
774 gat 6657
775 udi 6656
776 sin 6595
777 cla 6565
778 âne 6562
779 apt 6555
780 soc 6555
781 acu 6542
782 old 6542
783 iva 6539
784 ucr 6532
785 eur 6531
786 ătr 6521
787 et_ 6512
788 dem 6508
789 esi 6508
790 icu 6498
791 fel 6493
792 ens 6489
793 ero 6453
794 adi 6442
795 use 6435
796 adu 6429
797 dez 6425
798 poa 6421
799 dou 6419
800 _ev 6413
801 opi 6405
802 rus 6405
803 he_ 6399
804 mol 6397
805 ega 6396
806 mis 6396
807 lum 6393
808 tui 6392
809 osc 6372
810 ref 6366
811 ves 6332
812 uă_ 6331
813 exp 6327
814 cra 6300
815 lva 6288
816 ote 6263
817 ol_ 6261
818 zar 6260
819 lt_ 6254
820 clu 6251
821 epr 6246
822 rna 6245
823 fol 6232
824 cap 6227
825 _zi 6219
826 _sf 6217
827 _of 6215
828 igi 6210
829 aut 6177
830 eţi 6172
831 răz 6168
832 xis 6135
833 eau 6121
834 ozi 6116
835 hia 6111
836 cup 6106
837 aju 6105
838 nes 6099
839 fin 6095
840 pan 6093
841 ndr 6091
842 fil 6084
843 nom 6063
844 fun 6060
845 giu 6057
846 utu 6056
847 ram 6054
848 mba 6042
849 pli 6036
850 gan 6031
851 ub_ 6016
852 acă 6010
853 eca 6006
854 boi 5998
855 zen 5995
856 ns_ 5993
857 ătu 5991
858 erv 5985
859 sur 5980
860 ada 5976
861 ăra 5973
862 ton 5969
863 ipa 5966
864 rod 5936
865 nui 5922
866 vic 5920
867 _ul 5897
868 end 5888
869 del 5868
870 afl 5863
871 mor 5861
872 ouă 5857
873 isc 5854
874 tud 5850
875 ua_ 5841
876 uma 5838
877 pa_ 5836
878 azi 5817
879 ard 5811
880 nos 5811
881 rân 5802
882 gar 5800
883 ang 5781
884 zbo 5777
885 inf 5776
886 ăru 5770
887 aru 5769
888 var 5760
889 ţel 5756
890 mur 5752
891 _gu 5751
892 urs 5744
893 cie 5743
894 ogr 5736
895 bra 5719
896 tii 5701
897 du_ 5697
898 las 5693
899 ltu 5691
900 emn 5684
901 sar 5671
902 cui 5668
903 rui 5658
904 ena 5655
905 arm 5654
906 _n_ 5653
907 _jo 5647
908 ilv 5630
909 eră 5629
910 evo 5613
911 zon 5603
912 nsu 5599
913 afi 5596
914 ema 5578
915 met 5576
916 dit 5574
917 roa 5574
918 olt 5571
919 _ed 5569
920 niv 5569
921 dru 5566
922 ghe 5553
923 tăr 5545
924 vor 5529
925 eva 5527
926 sen 5526
927 aco 5516
928 eru 5498
929 obi 5496
930 ără 5486
931 oie 5485
932 roc 5467
933 ăzb 5465
934 ama 5463
935 rov 5461
936 ual 5461
937 aci 5440
938 _io 5432
939 mag 5426
940 lem 5406
941 ert 5396
942 ior 5395
943 reu 5389
944 rup 5383
945 ucu 5375
946 tie 5373
947 vec 5369
948 dif 5343
949 dac 5321
950 ose 5320
951 ain 5312
952 ext 5290
953 niu 5289
954 cit 5284
955 pet 5281
956 sim 5281
957 ape 5280
958 len 5278
959 agi 5272
960 ban 5272
961 bel 5272
962 mpe 5267
963 rol 5256
964 onf 5255
965 _en 5251
966 uta 5251
967 eea 5250
968 rt_ 5238
969 ve_ 5238
970 cet 5235
971 pal 5230
972 cad 5218
973 pus 5216
974 as_ 5213
975 _ri 5208
976 _ab 5196
977 iaţ 5196
978 cas 5194
979 ldo 5193
980 onu 5182
981 siu 5169
982 _l_ 5160
983 mea 5155
984 gal 5154
985 rmi 5149
986 _d_ 5145
987 ruc 5135
988 sud 5130
989 nou 5128
990 via 5118
991 mir 5117
992 am_ 5113
993 rtu 5095
994 nez 5090
995 _th 5075
996 ptu 5075
997 _ho 5074
998 dir 5074
999 uşi 5065
1000 vă_ 5053
1001 nge 5041
1002 eat 5030
1003 nse 5030
1004 ule 5017
1005 rsi 5016
1006 cei 4994
1007 pos 4987
1008 li_ 4983
1009 lti 4973
1010 dur 4966
1011 tab 4964
1012 ple 4956
1013 alb 4954
0 # Licensed to the Apache Software Foundation (ASF) under one or more
1 # contributor license agreements. See the NOTICE file distributed with
2 # this work for additional information regarding copyright ownership.
3 # The ASF licenses this file to You under the Apache License, Version 2.0
4 # (the "License"); you may not use this file except in compliance with
5 # the License. You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 _пр 9592
15 _по 8070
16 ени 7619
17 ия_ 6631
18 _в_ 6350
19 _и_ 5933
20 _на 5572
21 ния 5354
22 ост 5176
23 ой_ 5065
24 го_ 5034
25 ть_ 4905
26 ани 4896
27 ых_ 4822
28 про 4721
29 ие_ 4584
30 ого 4559
31 ии_ 4511
32 ств 4390
33 ов_ 4369
34 на_ 4272
35 льн 4235
36 _ко 4214
37 _за 3993
38 _ра 3990
39 аци 3945
40 ова 3943
41 ных 3920
42 ред 3688
43 нны 3676
44 ние 3659
45 _со 3651
46 _об 3634
47 тел 3578
48 ров 3376
49 ста 3370
50 ет_ 3268
51 ся_ 3263
52 пол 3219
53 при 3196
54 _ка 3169
55 _не 3092
56 мен 3061
57 енн 3026
58 ван 2947
59 _вы 2901
60 но_ 2880
61 ель 2875
62 лен 2862
63 пре 2856
64 сти 2841
65 ом_ 2758
66 _до 2739
67 _ст 2732
68 нно 2699
69 анн 2691
70 ист 2663
71 ми_ 2632
72 _ре 2558
73 ые_ 2544
74 ног 2531
75 ий_ 2526
76 ции 2506
77 ли_ 2484
78 нос 2471
79 ти_ 2468
80 чес 2446
81 ной 2439
82 тов 2404
83 стр 2399
84 раз 2381
85 ьно 2373
86 тся 2359
87 _из 2357
88 пер 2345
89 дан 2342
90 ате 2303
91 иче 2236
92 ля_ 2232
93 ите 2220
94 _от 2218
95 ско 2213
96 _с_ 2211
97 оль 2211
98 ест 2188
99 нов 2187
100 ент 2175
101 _ин 2172
102 ки_ 2154
103 тор 2129
104 еск 2123
105 те_ 2119
106 ая_ 2105
107 их_ 2101
108 аль 2056
109 етс 2037
110 _да 2027
111 ать 2002
112 та_ 1962
113 тве 1959
114 кон 1935
115 _те 1922
116 рав 1922
117 _пе 1917
118 вле 1910
119 ера 1904
120 ка_ 1903
121 _мо 1888
122 ные 1888
123 ово 1832
124 ый_ 1828
125 пра 1824
126 ей_ 1810
127 ран 1802
128 сте 1796
129 ото 1776
130 ем_ 1756
131 не_ 1756
132 _дл 1744
133 _оп 1730
134 нен 1713
135 для 1703
136 еде 1701
137 дел 1700
138 ком 1679
139 орм 1674
140 аст 1667
141 сто 1663
142 ски 1652
143 то_ 1651
144 _во 1623
145 ере 1601
146 тем 1581
147 иро 1575
148 рат 1566
149 ьны 1537
150 _си 1534
151 тро 1530
152 тер 1515
153 фор 1503
154 _о_ 1500
155 ако 1493
156 ое_ 1491
157 _ос 1483
158 ков 1460
159 ным 1448
160 чен 1439
161 по_ 1437
162 тан 1436
163 тра 1431
164 ва_ 1422
165 авл 1414
166 _то 1412
167 _та 1405
168 _де 1400
169 нал 1399
170 сть 1391
171 или 1385
172 _эт 1384
173 _ис 1376
174 _го 1374
175 _сп 1367
176 ват 1363
177 ный 1352
178 дер 1340
179 сис 1336
180 ты_ 1326
181 спо 1322
182 ает 1309
183 _ме 1307
184 рос 1305
185 под 1302
186 ден 1301
187 оди 1293
188 вен 1270
189 ион 1265
190 оро 1264
191 ове 1255
192 ить 1254
193 ект 1251
194 пос 1251
195 еле 1239
196 _а_ 1238
197 _ма 1231
198 ном 1230
199 або 1223
200 тно 1223
201 бор 1213
202 _но 1211
203 зак 1211
204 так 1208
205 рма 1207
206 мож 1205
207 _к_ 1186
208 вер 1185
209 тав 1184
210 _ус 1179
211 иск 1179
212 анд 1176
213 ра_ 1176
214 ым_ 1176
215 _сл 1171
216 оже 1167
217 дат 1165
218 вля 1155
219 ию_ 1155
220 ива 1151
221 раб 1142
222 вод 1141
223 али 1140
224 ны_ 1128
225 зов 1127
226 тив 1120
227 ник 1116
228 льз 1115
229 его 1111
230 сов 1111
231 _ди 1109
232 оло 1109
233 том 1108
234 сле 1106
235 _се 1103
236 рас 1100
237 _чт 1099
238 име 1097
239 это 1092
240 _ли 1090
241 жен 1086
242 рем 1086
243 _ил 1083
244 что 1080
245 ход 1079
246 дст 1076
247 лов 1073
248 ую_ 1066
249 гра 1063
250 онн 1062
251 нии 1058
252 как 1052
253 да_ 1050
254 кан 1043
255 ак_ 1038
256 спе 1037
257 кой 1029
258 _па 1026
259 рам 1025
260 цио 1021
261 инф 1020
262 яет 1019
263 выб 1015
264 ляе 1015
265 рац 1012
266 ах_ 1011
267 жно 1008
268 сно 1007
269 бра 1002
270 нфо 986
271 бот 984
272 тре 973
273 мы_ 971
274 она 970
275 род 968
276 ами 949
277 вре 939
278 ита 939
279 мат 933
280 ний 932
281 ее_ 931
282 ери 929
283 пис 927
284 кот 926
285 печ 920
286 ати 919
287 мац 919
288 ная 918
289 ика 912
290 еме 905
291 исп 900
292 ска 899
293 вет 898
294 мер 897
295 му_ 896
296 дос 894
297 _бы 892
298 иза 890
299 ког 889
300 уст 887
301 ато 884
302 дит 884
303 ора 883
304 от_ 880
305 очн 879
306 едс 878
307 ода 878
308 спр 875
309 ющи 873
310 _вс 867
311 из_ 865
312 олн 865
313 зна 864
314 чет 864
315 обр 862
316 уме 862
317 одн 860
318 ное 856
319 осу 856
320 _св 854
321 вед 851
322 ане 850
323 дар 844
324 иал 843
325 _вр 841
326 зац 835
327 опр 835
328 ара 834
329 им_ 834
330 овы 833
331 емы 832
332 арт 830
333 ри_ 830
334 ких 829
335 ма_ 829
336 же_ 828
337 за_ 827
338 ция 824
339 тва 823
340 ано 821
341 кци 816
342 тич 814
343 лас 813
344 нди 813
345 оду 811
346 ана 804
347 ам_ 802
348 нач 795
349 сси 794
350 тат 794
351 во_ 792
352 ида 792
353 бир 788
354 ира 787
355 огр 781
356 каз 780
357 дис 779
358 од_ 779
359 ыбо 777
360 изб 771
361 лит 771
362 лож 770
363 нта 762
364 рог 762
365 лог 761
366 ко_ 755
367 док 754
368 зво 754
369 оку 752
370 _тр 751
371 _е_ 748
372 зда 748
373 ры_ 748
374 овл 746
375 пар 742
376 _ср 739
377 зби 738
378 овн 738
379 _ни 734
380 пла 734
381 осн 730
382 уще 730
383 ерж 729
384 акт 726
385 ожн 725
386 _ве 723
387 вны 723
388 лед 722
389 _им 721
390 аво 721
391 ующ 721
392 дид 720
393 нит 720
394 тац 719
395 оры 716
396 тст 715
397 еди 714
398 рег 712
399 ьзо 712
400 ерв 711
401 ла_ 711
402 слу 710
403 асс 704
404 бъе 704
405 час 704
406 рно 703
407 иру 702
408 опе 700
409 отв 699
410 _че 693
411 рез 693
412 ико 692
413 _уч 690
414 тво 690
415 щес 689
416 ок_ 687
417 все 685
418 соб 685
419 _пл 684
420 мин 684
421 хра 684
422 лич 683
423 шен 683
424 _од 680
425 кол 678
426 ле_ 678
427 ни_ 678
428 зан 677
429 ори 677
430 оде 673
431 имо 672
432 аче 671
433 вол 671
434 дов 670
435 ку_ 670
436 луч 669
437 уча 669
438 ях_ 667
439 уда 666
440 вой 664
441 есп 662
442 ст_ 662
443 _т_ 661
444 аза 658
445 ыми 658
446 едо 657
447 жет 657
448 воз 656
449 нти 655
450 ели 653
451 сре 653
452 вно 651
453 ует 651
454 низ 650
455 _су 649
456 еду 649
457 ата 647
458 цен 647
459 _н_ 644
460 ер_ 641
461 асп 640
462 нте 639
463 _ор 637
464 _ми 636
465 бли 635
466 ке_ 635
467 соо 634
468 _фе 633
469 ант 632
470 рои 630
471 ивн 629
472 оце 629
473 рен 628
474 кум 627
475 _ба 626
476 обл 626
477 объ 626
478 вит 624
479 кто 624
480 вае 623
481 зде 623
482 клю 623
483 _см 616
484 _фо 616
485 оли 614
486 зап 613
487 зме 612
488 йст 611
489 люч 608
490 _бу 607
491 аги 607
492 тны 606
493 явл 606
494 тву 605
495 гот 604
496 атн 602
497 ока 602
498 ини 601
499 ием 599
500 суд 599
501 орг 598
502 пор 598
503 _це 597
504 дол 597
505 оле 597
506 тия 597
507 _фа 596
508 _ро 593
509 тех 593
510 _ан 592
511 ина 592
512 сли 592
513 амм 591
514 ейс 591
515 ит_ 591
516 лат 590
517 рга 586
518 _бо 585
519 ало 583
520 ема 581
521 ган 579
522 нто 579
523 чно 579
524 оме 578
525 фед 576
526 бол 575
527 тви 575
528 льк 574
529 обы 574
530 дин 573
531 осс 573
532 рал 572
533 _бл 571
534 ную 571
535 реб 571
536 лей 570
537 са_ 570
538 мет 569
539 ютс 568
540 _ва 565
541 бес 565
542 озд 563
543 тур 560
544 осо 559
545 три 559
546 год 558
547 еда 558
548 кти 557
549 рес 557
550 одо 556
551 инт 555
552 еля 554
553 ене 554
554 нию 554
555 око 554
556 отр 553
557 рой 552
558 _аг 551
559 ль_ 551
560 _ес 550
561 ави 550
562 азд 549
563 ехн 549
564 ому 549
565 кла 548
566 омп 548
567 сь_ 547
568 нич 546
569 еги 544
570 отк 544
571 рим 544
572 ют_ 543
573 _уп 539
574 вых 539
575 сло 539
576 изв 538
577 щен 535
578 _бе 534
579 _ок 533
580 тек 532
581 гит 531
582 _гр 529
583 вую 529
584 дно 528
585 оля 527
586 ыва 527
587 ити 526
588 обе 524
589 сту 524
590 бы_ 523
591 олж 523
592 быт 522
593 апр 521
594 рти 521
595 тол 521
596 оно 520
597 соз 519
598 рит 517
599 уль 515
600 _кр 514
601 стн 512
602 ета 511
603 код 508
604 мас 507
605 нст 507
606 сер 505
607 сущ 505
608 ься 505
609 нда 504
610 оиз 504
611 тьс 503
612 ен_ 502
613 оот 502
614 льт 500
615 мя_ 499
616 ела 497
617 аме 496
618 кам 495
619 гов 494
620 есс 493
621 чат 493
622 ций 491
623 чни 490
624 ерн 489
625 иях 489
626 лиз 489
627 общ 489
628 убл 489
629 бла 488
630 ови 488
631 рст 486
632 оги 485
633 лок 484
634 нас 484
635 _ку 483
636 _ти 483
637 поз 483
638 ут_ 482
639 чит 481
640 воч 480
641 емо 479
642 бло 478
643 обо 478
644 _ча 476
645 _яв 476
646 рус 476
647 нтр 475
648 дук 474
649 ичн 470
650 упр 470
651 ыть 470
652 _вк 469
653 змо 467
654 чны 467
655 мос 466
656 нар 466
657 лек 465
658 туп 464
659 есл 463
660 гос 461
661 вкл 459
662 вто 458
663 иям 458
664 ают 457
665 ме_ 456
666 _др 453
667 отн 453
668 сок 453
669 нес 452
670 _хр 450
671 ло_ 450
672 раж 449
673 _фи 448
674 лиц 447
675 тоб 447
676 има 446
677 еро 445
678 кие 445
679 лик 445
680 ляю 444
681 сво 443
682 ици 440
683 мес 440
684 яти 440
685 ью_ 439
686 ерс 438
687 орн 438
688 одс 437
689 рна 437
690 арс 436
691 гла 436
692 ежд 436
693 щих 436
694 рия 435
695 сос 435
696 ете 434
697 онт 433
698 _п_ 432
699 мог 432
700 ним 432
701 тру 431
702 чае 431
703 чис 429
704 циа 427
705 ома 426
706 реж 426
707 вып 425
708 озм 425
709 _зн 423
710 там 423
711 _он 422
712 ве_ 422
713 ан_ 421
714 выс 421
715 цию 421
716 еча 420
717 лис 420
718 _вн 419
719 айл 418
720 до_ 418
721 ече 417
722 ре_ 417
723 дет 416
724 ена 416
725 об_ 416
726 руг 416
727 фай 416
728 вир 415
729 льс 415
730 пом 415
731 ада 414
732 ду_ 414
733 _ру 412
734 лад 412
735 пус 412
736 изм 411
737 ики 411
738 сче 411
739 ько 411
740 ечи 409
741 он_ 408
742 вор 407
743 пов 407
744 ето 405
745 рое 404
746 юще 402
747 азо 401
748 ено 401
749 опи 401
750 ром 401
751 тог 401
752 ять 401
753 еоб 399
754 рол 399
755 уче 399
756 буд 398
757 урн 397
758 рок 396
759 вы_ 395
760 _ви 394
761 бще 394
762 екс 394
763 роц 394
764 тоя 394
765 азр 393
766 тит 393
767 щие 393
768 иде 392
769 _г_ 391
770 ающ 391
771 ава 390
772 зат 390
773 ими 390
774 осл 390
775 они 389
776 исл 388
777 ями 387
778 _же 386
779 аки 386
780 вид 384
781 сод 384
782 ор_ 383
783 де_ 382
784 рио 382
785 овк 381
786 пан 381
787 мир 379
788 се_ 378
789 ала 377
790 ади 376
791 жны 375
792 нап 375
793 ъек 375
794 _са 374
795 дей 374
796 емя 374
797 _пу 373
798 _х_ 372
799 риа 372
800 хни 372
801 дач 371
802 _ав 370
803 кая 370
804 дим 369
805 ари 368
806 гру 367
807 ким 367
808 тип 365
809 жде 364
810 рив 364
811 усл 363
812 ьзу 363
813 лжн 361
814 яте 360
815 дру 359
816 жит 359
817 ссо 359
818 _чи 358
819 был 358
820 _р_ 357
821 иви 357
822 мно 357
823 пуб 357
824 кры 356
825 озв 356
826 _ск 355
827 иса 355
828 нео 355
829 обн 354
830 кно 353
831 ень 352
832 кор 352
833 ту_ 352
834 баз 351
835 нут 351
836 рин 351
837 нак 350
838 _га 349
839 спи 349
840 едв 346
841 инс 346
842 _эф 345
843 гис 345
844 ебо 345
845 ям_ 345
846 _ег 343
847 едн 343
848 лем 343
849 авт 342
850 акж 342
851 ащи 342
852 ды_ 342
853 кже 342
854 вне 341
855 дея 341
856 зад 340
857 яза 340
858 _ар 338
859 аем 337
860 вос 337
861 рон 337
862 сам 337
863 _ло 336
864 ены 335
865 жур 335
866 сми 334
867 ака 332
868 рис 332
869 тик 332
870 чив 332
871 вал 331
872 _эк 330
873 йте 330
874 кта 329
875 _жу 328
876 ажд 328
877 кро 328
878 мод 328
879 ачи 327
880 сет 327
881 той 327
882 укт 327
883 кий 325
884 твл 325
885 _мн 324
886 рек 324
887 _кл 323
888 _м_ 323
889 er_ 323
890 цес 323
891 гор 322
892 изи 322
893 мощ 322
894 сан 322
895 унк 322
896 уро 321
897 дак 320
898 еят 320
899 ота 320
900 ело 319
901 окн 319
902 чи_ 319
903 _их 318
904 чер 318
905 яют 318
906 рми 317
907 уде 317
908 азы 316
909 тир 316
910 _л_ 315
911 _сч 315
912 ней 315
913 эле 315
914 ару 314
915 лер 313
916 рот 313
917 бов 312
918 омо 311
919 обх 310
920 ине 309
921 наз 309
922 оми 309
923 _эл 308
924 бхо 308
925 ино 308
926 _ук 307
927 вяз 307
928 рир 307
929 аже 306
930 еще 306
931 пот 305
932 фир 305
933 укц 304
934 аты 303
935 мпа 303
936 ез_ 301
937 нци 301
938 смо 301
939 _дв 300
940 ины 300
941 лее 300
942 рск 300
943 уск 300
944 щег 300
945 абл 298
946 льш 298
947 опу 298
948 _ад 297
949 _хо 297
950 азм 296
951 изн 296
952 реа 296
953 тки 296
954 точ 296
955 ача 295
956 азн 294
957 анс 294
958 ин_ 294
959 олу 294
960 _я_ 293
961 вия 293
962 нят 293
963 нна 292
964 па_ 292
965 тар 292
966 лос 291
967 тв_ 291
968 доб 290
969 ило 290
970 ука 290
971 мот 289
972 реш 289
973 иен 288
974 ваю 287
975 айт 286
976 иод 286
977 кул 286
978 нты 286
979 рве 286
980 авн 285
981 меж 285
982 над 285
983 оче 285
984 рет 285
985 _ур 284
986 ети 284
987 рук 284
988 юча 284
989 _ле 283
990 _уд 283
991 бле 283
992 уры 283
993 еща 282
994 _лю 281
995 але 281
996 дал 279
997 рак 279
998 двы 278
999 аро 277
1000 ноп 277
1001 _пи 276
1002 ибо 276
1003 нь_ 275
1004 оне 275
1005 риг 275
1006 нт_ 274
1007 рев 274
1008 ему 273
1009 изд 273
1010 кра 273
1011 пле 273
1012 _би 272
1013 обс 272
0 # Licensed to the Apache Software Foundation (ASF) under one or more
1 # contributor license agreements. See the NOTICE file distributed with
2 # this work for additional information regarding copyright ownership.
3 # The ASF licenses this file to You under the Apache License, Version 2.0
4 # (the "License"); you may not use this file except in compliance with
5 # the License. You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 _pr 155897
15 _po 137147
16 _na 121520
17 ch_ 116648
18 _a_ 113592
19 _v_ 111891
20 na_ 91317
21 om_ 89631
22 _je 85847
23 ej_ 81274
24 _sa 79764
25 ho_ 76408
26 sa_ 74412
27 je_ 71789
28 _ro 71501
29 ie_ 68043
30 ova 64481
31 ov_ 63563
32 pre 62694
33 ých 62491
34 ku_ 59617
35 tor 56395
36 ne_ 56231
37 _do 56170
38 ia_ 51819
39 _ob 50633
40 _ne 49951
41 ost 49668
42 sta 48663
43 _bo 48630
44 _za 47920
45 ou_ 47508
46 kto 46980
47 li_ 46972
48 _st 46539
49 né_ 45844
50 la_ 45571
51 nie 42336
52 rok 42247
53 to_ 41499
54 _vy 40339
55 bol 40215
56 ko_ 39864
57 pri 39599
58 mi_ 39465
59 ick 39399
60 _ma 39211
61 _kt 38807
62 ého 38445
63 al_ 36907
64 ove 36896
65 nsk 36874
66 sti 36793
67 _ko 36701
68 _me 35723
69 ky_ 35264
70 van 35257
71 str 34549
72 ný_ 34542
73 ka_ 34484
74 lov 33726
75 ani 33554
76 nej 33137
77 _sp 32541
78 nov 32193
79 _ve 32053
80 oku 32041
81 rov 31196
82 val 30961
83 lo_ 30783
84 est 30773
85 _ak 30008
86 _od 29711
87 pod 29618
88 ali 29591
89 _z_ 29143
90 _vo 29126
91 ti_ 28958
92 kov 28917
93 ako 28622
94 _sl 27982
95 eni 27515
96 ná_ 27387
97 ale 27291
98 ast 27112
99 ven 26964
100 ol_ 26614
101 _sú 26468
102 sto 26243
103 _te 26137
104 eho 26093
105 _to 25870
106 _s_ 25732
107 ým_ 25699
108 red 25580
109 ny_ 25518
110 _kr 25224
111 voj 25217
112 nos 24871
113 čas 24839
114 ist 24693
115 kon 24605
116 slo 24576
117 kej 24490
118 men 24473
119 va_ 24440
120 _sv 24363
121 _vý 24006
122 _al 23634
123 aj_ 23234
124 _pa 23203
125 cho 23096
126 _ná 23064
127 kom 23016
128 och 22649
129 ens 22498
130 ran 22482
131 že_ 22451
132 do_ 22407
133 sko 22197
134 olo 21954
135 rie 21845
136 nýc 21748
137 _mo 21706
138 od_ 21560
139 _aj 21552
140 _ka 21397
141 odn 21263
142 pro 21257
143 il_ 21221
144 re_ 20805
145 len 20695
146 spo 20689
147 _mi 20657
148 _ni 20633
149 ske 20477
150 tov 20429
151 _zá 20362
152 vo_ 20310
153 ent 20271
154 oko 20266
155 pol 20215
156 edn 19959
157 za_ 19816
158 roz 19730
159 ať_ 19717
160 _re 19695
161 ate 19638
162 ovi 19395
163 le_ 19235
164 naj 19197
165 pra 19125
166 _ho 19120
167 ci_ 19115
168 sku 19100
169 _ta 19024
170 ich 18959
171 de_ 18775
172 dov 18361
173 oli 18234
174 ce_ 18126
175 te_ 18101
176 den 18010
177 jú_ 17919
178 ké_ 17915
179 tvo 17913
180 _tr 17899
181 tic 17897
182 lad 17806
183 nia 17801
184 ba_ 17779
185 ach 17662
186 dy_ 17569
187 hod 17544
188 sť_ 17509
189 mu_ 17327
190 _de 17298
191 ter 17273
192 _in 17196
193 tom 17172
194 tre 16933
195 rod 16925
196 uje 16923
197 _le 16908
198 ri_ 16860
199 ta_ 16765
200 tra 16747
201 kla 16731
202 dob 16721
203 _že 16627
204 ak_ 16608
205 ria 16575
206 _ch 16503
207 ský 16443
208 prí 16363
209 ati 16354
210 tro 16312
211 dne 16226
212 teľ 16223
213 nom 16199
214 ajú 16140
215 ala 16133
216 tav 16085
217 _ce 16017
218 _št 15873
219 ili 15858
220 ra_ 15837
221 kýc 15807
222 mie 15767
223 áci 15740
224 jed 15628
225 ký_ 15578
226 stv 15552
227 ské 15524
228 _no 15484
229 ele 15482
230 rav 15346
231 ový 15340
232 ami 15221
233 ený 15196
234 po_ 15019
235 tie 14862
236 zna 14852
237 oro 14843
238 rad 14777
239 _ča 14773
240 vie 14760
241 tu_ 14748
242 ní_ 14696
243 ným 14673
244 res 14626
245 mes 14603
246 pos 14525
247 iac 14523
248 _o_ 14521
249 alo 14437
250 néh 14435
251 obe 14398
252 _se 14288
253 by_ 14202
254 _vi 14193
255 _so 14177
256 ver 14165
257 _ok 14068
258 cký 14025
259 rsk 13982
260 _sk 13971
261 veľ 13945
262 kéh 13932
263 iu_ 13903
264 osť 13856
265 ori 13773
266 mer 13669
267 iek 13635
268 _si 13608
269 ska 13575
270 rom 13573
271 orý 13571
272 ola 13497
273 tal 13458
274 áln 13397
275 _vš 13392
276 eto 13392
277 er_ 13380
278 ká_ 13363
279 _di 13353
280 _ja 13342
281 tri 13341
282 ím_ 13337
283 ovo 13295
284 hov 13274
285 ite 13215
286 oré 13210
287 ebo 13191
288 lav 13191
289 cie 13189
290 kol 13160
291 áva 13148
292 mal 13111
293 _pl 13095
294 _os 12988
295 _vz 12986
296 ten 12941
297 vod 12922
298 ste 12848
299 svo 12841
300 nic 12836
301 hra 12822
302 jeh 12717
303 cké 12707
304 šie 12688
305 eme 12680
306 en_ 12648
307 nik 12648
308 ty_ 12647
309 odo 12610
310 ies 12593
311 ené 12575
312 ekt 12537
313 oje 12519
314 _dr 12512
315 vor 12489
316 _zo 12479
317 ca_ 12427
318 sú_ 12421
319 _hr 12399
320 las 12376
321 ych 12326
322 dno 12315
323 ame 12249
324 ádz 12247
325 stu 12075
326 pov 12062
327 ene 12038
328 ujú 12035
329 ved 12029
330 ré_ 12001
331 eno 11965
332 nes 11946
333 era 11939
334 ané 11908
335 výc 11868
336 dza 11857
337 nu_ 11849
338 prv 11822
339 iel 11819
340 iť_ 11807
341 lne 11768
342 me_ 11762
343 _ži 11751
344 chá 11736
345 _an 11726
346 _br 11651
347 rat 11649
348 da_ 11631
349 okr 11629
350 ren 11578
351 med 11568
352 tak 11554
353 ové 11548
354 cov 11490
355 hla 11461
356 ec_ 11424
357 bra 11373
358 lan 11353
359 _k_ 11316
360 prá 11222
361 ve_ 11182
362 _ra 11128
363 ená 11116
364 ret 11041
365 ok_ 11000
366 leb 10986
367 vé_ 10975
368 ero 10969
369 dom 10959
370 avi 10947
371 sky 10834
372 ern 10774
373 lit 10730
374 ave 10697
375 júc 10691
376 el_ 10666
377 ení 10635
378 kra 10624
379 ore 10605
380 ina 10573
381 stn 10552
382 eri 10538
383 por 10515
384 ii_ 10494
385 _li 10493
386 hor 10466
387 nam 10413
388 eli 10401
389 _vl 10394
390 očn 10394
391 adn 10379
392 tan 10374
393 hád 10368
394 aný 10367
395 du_ 10307
396 _ti 10297
397 nou 10296
398 bo_ 10285
399 ans 10279
400 _be 10247
401 die 10221
402 ými 10193
403 _pe 10187
404 poč 10177
405 ade 10156
406 ilo 10117
407 ke_ 10113
408 ede 10087
409 cia 10081
410 ry_ 10070
411 ad_ 10066
412 esk 10017
413 ier 9992
414 oto 9968
415 ľa_ 9966
416 vej 9880
417 mba 9853
418 áro 9848
419 _hl 9817
420 mat 9810
421 oho 9802
422 edz 9743
423 vať 9707
424 sla 9673
425 _zn 9648
426 rý_ 9640
427 vá_ 9611
428 nem 9607
429 _ba 9545
430 zem 9537
431 iny 9531
432 ovn 9514
433 ráv 9493
434 kre 9407
435 _mb 9395
436 sle 9364
437 krá 9360
438 nap 9344
439 kým 9342
440 nen 9338
441 pom 9314
442 dos 9298
443 _zv 9283
444 raj 9238
445 poz 9207
446 ric 9198
447 edo 9133
448 si_ 9116
449 pla 9103
450 via 9078
451 pad 9043
452 _bu 9027
453 _by 9006
454 _ke 8986
455 sob 8973
456 rne 8936
457 tol 8888
458 ová 8884
459 obc 8864
460 nú_ 8863
461 ľov 8818
462 dzi 8813
463 _či 8794
464 ejš 8772
465 vat 8762
466 vne 8760
467 ník 8745
468 nár 8733
469 rob 8733
470 ano 8723
471 pot 8722
472 _op 8704
473 ách 8687
474 čen 8684
475 nto 8662
476 hu_ 8654
477 cha 8636
478 es_ 8612
479 eľk 8587
480 eko 8579
481 akt 8571
482 ant 8544
483 cke 8539
484 dný 8518
485 edi 8513
486 _tu 8496
487 obr 8495
488 _ju 8464
489 _má 8421
490 eck 8398
491 oby 8390
492 vet 8371
493 cel 8368
494 ech 8342
495 tup 8310
496 ila 8306
497 obn 8300
498 ari 8256
499 vý_ 8251
500 _fi 8194
501 aní 8182
502 bec 8168
503 ato 8160
504 del 8124
505 dia 8087
506 lia 8084
507 no_ 8078
508 sve 8077
509 ine 8066
510 žen 8063
511 čia 8062
512 vsk 8040
513 ovs 8036
514 ada 8018
515 odi 8004
516 adi 8001
517 zov 8001
518 kou 7998
519 iat 7992
520 tel 7977
521 din 7955
522 man 7953
523 and 7914
524 vše 7912
525 vys 7909
526 _mu 7886
527 ru_ 7885
528 aro 7854
529 iad 7851
530 ane 7845
531 led 7843
532 eda 7833
533 tat 7813
534 ino 7811
535 _ar 7805
536 se_ 7784
537 dru 7767
538 rej 7763
539 tar 7754
540 sia 7728
541 _dv 7720
542 sov 7711
543 ože 7709
544 dné 7696
545 rá_ 7671
546 vin 7624
547 udo 7620
548 vom 7612
549 stá 7596
550 ado 7588
551 isk 7555
552 ode 7548
553 osl 7542
554 zač 7521
555 _ci 7517
556 per 7507
557 zni 7476
558 avn 7469
559 bud 7468
560 tva 7464
561 ním 7457
562 ere 7443
563 us_ 7434
564 tia 7427
565 tsk 7404
566 lin 7388
567 _ri 7384
568 kos 7381
569 orá 7381
570 nci 7372
571 iet 7370
572 rch 7365
573 ole 7345
574 anc 7339
575 min 7317
576 raz 7307
577 rát 7253
578 ezd 7249
579 _sy 7240
580 as_ 7236
581 _pô 7222
582 ora 7220
583 ome 7219
584 _ic 7217
585 on_ 7204
586 mov 7200
587 vy_ 7189
588 žia 7181
589 for 7178
590 elo 7173
591 or_ 7162
592 obi 7146
593 rot 7137
594 pou 7127
595 vol 7104
596 uto 7098
597 avo 7083
598 väč 7075
599 äčš 7068
600 vša 7053
601 etk 7038
602 oti 7032
603 tát 7032
604 ruh 6996
605 šak 6970
606 ek_ 6968
607 obl 6956
608 ská 6945
609 lu_ 6934
610 _fr 6933
611 až_ 6922
612 met 6919
613 íva 6918
614 lož 6909
615 emi 6907
616 orm 6905
617 ien 6889
618 tne 6878
619 pok 6875
620 lic 6864
621 _ku 6832
622 ivo 6832
623 ava 6826
624 mno 6810
625 atr 6806
626 spr 6804
627 zi_ 6799
628 zal 6798
629 ber 6787
630 ete 6786
631 apr 6784
632 lat 6777
633 _až 6774
634 eľo 6768
635 iti 6760
636 cen 6759
637 nám 6743
638 ied 6738
639 šet 6728
640 jej 6723
641 tok 6721
642 _čl 6715
643 _už 6705
644 _fa 6704
645 _tv 6695
646 _če 6695
647 lie 6691
648 par 6679
649 ozo 6668
650 _vr 6666
651 rí_ 6650
652 odu 6629
653 ice 6619
654 nut 6615
655 tin 6611
656 _or 6607
657 ela 6604
658 ni_ 6601
659 orn 6591
660 ris 6584
661 ín_ 6576
662 ly_ 6575
663 ouž 6561
664 ozn 6561
665 omo 6547
666 amo 6544
667 úci 6538
668 dal 6511
669 isl 6488
670 enc 6487
671 _ab 6485
672 mor 6483
673 vi_ 6481
674 _lo 6479
675 ici 6474
676 tej 6469
677 tik 6456
678 lom 6439
679 yst 6431
680 ian 6426
681 pat 6420
682 ril 6400
683 _da 6397
684 rac 6388
685 trá 6379
686 rýc 6366
687 žív 6361
688 em_ 6360
689 ara 6356
690 iná 6352
691 ide 6326
692 úča 6268
693 am_ 6267
694 uží 6262
695 _zm 6231
696 gra 6220
697 zák 6217
698 _ľu 6210
699 nac 6184
700 obo 6178
701 ed_ 6169
702 ram 6169
703 rak 6163
704 čný 6161
705 ďal 6147
706 rác 6144
707 dok 6125
708 vil 6125
709 eve 6119
710 jen 6115
711 zná 6112
712 vu_ 6108
713 ích 6096
714 olu 6085
715 túr 6084
716 les 6079
717 ust 6072
718 tí_ 6064
719 má_ 6041
720 jav 6040
721 ner 6016
722 ež_ 6006
723 nad 6005
724 ros 5996
725 dan 5994
726 so_ 5984
727 áto 5981
728 už_ 5969
729 bli 5966
730 _gr 5964
731 bor 5960
732 rit 5950
733 čin 5934
734 čne 5934
735 rev 5913
736 rem 5893
737 asť 5886
738 iez 5871
739 iko 5860
740 žit 5858
741 pis 5854
742 vlá 5854
743 vla 5841
744 atk 5837
745 ini 5804
746 mar 5794
747 keď 5791
748 tív 5789
749 aná 5786
750 _bi 5782
751 dol 5779
752 _ru 5775
753 nep 5767
754 _as 5761
755 _čo 5760
756 ana 5759
757 uho 5756
758 roj 5754
759 kal 5753
760 aci 5752
761 _kl 5749
762 spe 5749
763 poj 5745
764 _ha 5736
765 eti 5733
766 not 5732
767 _mn 5726
768 mož 5704
769 _sm 5697
770 los 5677
771 ačn 5645
772 eď_ 5637
773 ika 5635
774 art 5633
775 ona 5631
776 ľud 5618
777 tis 5606
778 nil 5601
779 _ex 5599
780 kat 5599
781 pan 5597
782 dvo 5554
783 asn 5553
784 etr 5544
785 _hi 5542
786 _tý 5530
787 sil 5513
788 fil 5502
789 _i_ 5501
790 ojn 5488
791 kú_ 5478
792 an_ 5475
793 lek 5470
794 čo_ 5465
795 kup 5456
796 my_ 5451
797 _zl 5442
798 tný 5442
799 _la 5439
800 ral 5439
801 hol 5433
802 níc 5433
803 _ze 5426
804 živ 5416
805 fra 5411
806 lád 5409
807 nan 5406
808 neh 5405
809 reb 5396
810 nyc 5391
811 vyš 5390
812 oni 5362
813 dľa 5361
814 ica 5356
815 nti 5349
816 _ďa 5342
817 mic 5333
818 _us 5330
819 kul 5325
820 jši 5324
821 ívn 5305
822 ožn 5304
823 vid 5303
824 obj 5300
825 hvi 5289
826 ese 5287
827 ľad 5284
828 lik 5280
829 ito 5275
830 ces 5274
831 vov 5274
832 inu 5272
833 uni 5272
834 _au 5270
835 štá 5267
836 rán 5266
837 iež 5263
838 kor 5251
839 toč 5230
840 čov 5230
841 žil 5230
842 ýva 5228
843 cky 5223
844 _fo 5211
845 rep 5211
846 tky 5199
847 zo_ 5194
848 cko 5184
849 nte 5165
850 pís 5165
851 maj 5164
852 odp 5164
853 orí 5159
854 pôs 5157
855 ajv 5146
856 čné 5144
857 eži 5140
858 um_ 5132
859 ôso 5125
860 ško 5116
861 asi 5107
862 erá 5104
863 nal 5099
864 noh 5098
865 ha_ 5083
866 vaj 5074
867 vzn 5061
868 ors 5059
869 dis 5056
870 kam 5036
871 odľ 5020
872 ojo 5019
873 _vä 5013
874 súč 5012
875 int 5004
876 ma_ 5003
877 čno 4994
878 _um 4993
879 dpo 4983
880 ciu 4974
881 rel 4972
882 ono 4971
883 tné 4969
884 môž 4966
885 ápa 4962
886 edy 4956
887 ras 4950
888 kci 4944
889 roc 4943
890 _he 4941
891 zor 4938
892 koc 4925
893 záp 4923
894 _hu 4913
895 esi 4897
896 oru 4874
897 mus 4869
898 _hv 4862
899 nka 4860
900 poh 4860
901 eta 4857
902 _ot 4856
903 bez 4854
904 be_ 4846
905 kar 4843
906 dst 4814
907 náz 4814
908 ľko 4814
909 _rí 4799
910 nak 4799
911 sad 4798
912 uch 4797
913 tál 4791
914 sam 4790
915 oci 4786
916 ému 4780
917 ôvo 4779
918 _su 4778
919 kôr 4775
920 tur 4774
921 dro 4752
922 ôr_ 4750
923 lis 4747
924 _ge 4737
925 vek 4729
926 iál 4726
927 vým 4725
928 hy_ 4709
929 moc 4699
930 ont 4699
931 ovc 4675
932 tým 4673
933 rál 4665
934 oča 4663
935 bil 4656
936 rst 4655
937 ted 4655
938 ena 4654
939 aco 4648
940 omi 4641
941 jaz 4639
942 oji 4636
943 úze 4632
944 che 4628
945 _šk 4627
946 nač 4627
947 ron 4622
948 hrá 4610
949 ší_ 4599
950 šte 4598
951 arc 4597
952 _zd 4596
953 aji 4596
954 omu 4581
955 he_ 4579
956 epo 4576
957 ráľ 4574
958 bla 4572
959 ača 4569
960 skô 4568
961 iar 4567
962 šíc 4567
963 bie 4563
964 árn 4563
965 nas 4553
966 jov 4550
967 _mô 4546
968 ík_ 4544
969 vot 4540
970 aby 4537
971 ita 4536
972 kan 4533
973 ote 4533
974 mos 4532
975 iky 4530
976 _kd 4529
977 izm 4524
978 iku 4503
979 _dn 4498
980 čši 4483
981 epr 4481
982 dil 4466
983 ult 4461
984 _sc 4455
985 olí 4449
986 til 4449
987 upi 4447
988 sch 4439
989 ja_ 4435
990 ťou 4433
991 ang 4429
992 let 4426
993 kde 4424
994 sne 4422
995 íci 4421
996 mec 4420
997 výs 4402
998 oži 4398
999 rez 4391
1000 _dô 4390
1001 vit 4386
1002 _zr 4385
1003 nit 4376
1004 eľm 4370
1005 _úz 4369
1006 zen 4362
1007 hľa 4361
1008 ákl 4352
1009 iká 4345
1010 árs 4342
1011 com 4333
1012 onc 4325
1013 liv 4321
0 # Licensed to the Apache Software Foundation (ASF) under one or more
1 # contributor license agreements. See the NOTICE file distributed with
2 # this work for additional information regarding copyright ownership.
3 # The ASF licenses this file to You under the Apache License, Version 2.0
4 # (the "License"); you may not use this file except in compliance with
5 # the License. You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 je_ 254248
15 _je 175460
16 _po 157653
17 _pr 153889
18 _na 146947
19 na_ 123177
20 in_ 113877
21 _in 113626
22 _za 108679
23 _v_ 105174
24 ih_ 97106
25 ki_ 92808
26 no_ 86653
27 ja_ 85091
28 _se 77030
29 ni_ 76362
30 ta_ 74384
31 ne_ 71570
32 _so 69646
33 pre 69178
34 ko_ 68876
35 ga_ 63773
36 li_ 61861
37 _ko 61274
38 em_ 61169
39 jo_ 60836
40 _le 60552
41 sta 59864
42 se_ 58235
43 ost 57799
44 so_ 56906
45 ti_ 56256
46 nje 54926
47 la_ 53616
48 let 52995
49 ke_ 52660
50 _ka 52035
51 _ki 51309
52 pri 51232
53 _bi 51020
54 _iz 50936
55 ega 50289
56 anj 49748
57 _te 48980
58 _do 46792
59 di_ 44935
60 _ve 44305
61 _me 44109
62 ka_ 44005
63 il_ 43393
64 _pa 42651
65 bil 42312
66 _ra 42223
67 _ne 42023
68 _ob 41725
69 ali 41722
70 _st 41402
71 red 40850
72 ija 40419
73 _ta 40073
74 eta 39957
75 za_ 39902
76 ov_ 39794
77 ate 39283
78 da_ 38884
79 _de 38825
80 er_ 37691
81 al_ 37578
82 ski 36890
83 sti 36831
84 _da 36544
85 _od 36507
86 del 36116
87 lo_ 35619
88 ove 34994
89 ske 34973
90 ter 34840
91 raz 34746
92 ova 34441
93 nsk 34339
94 ma_ 34279
95 udi 33094
96 nih 32076
97 jen 32003
98 lov 31555
99 voj 31522
100 nik 31499
101 ji_ 31310
102 ran 31287
103 _ma 31099
104 jem 30990
105 nov 30857
106 to_ 30412
107 kat 30212
108 _tu 30092
109 eni 29985
110 ori 29808
111 men 29725
112 sto 29672
113 rij 29321
114 _dr 29243
115 pa_ 29163
116 lje 28994
117 str 28868
118 en_ 28448
119 _mo 28431
120 lja 28205
121 pos 28096
122 _z_ 27873
123 ani 27817
124 _vo 27719
125 rav 27380
126 eli 27363
127 pro 27329
128 tud 26854
129 ri_ 26093
130 est 26042
131 _re 25787
132 lik 25711
133 val 25540
134 ego 25498
135 nos 25415
136 ist 24943
137 ije 24902
138 pod 24852
139 _sl 24764
140 sko 24588
141 ila 24560
142 _sv 24431
143 van 24302
144 _sp 24297
145 va_ 24237
146 ovi 24198
147 ven 24181
148 kov 24047
149 por 23836
150 ed_ 23644
151 mi_ 23615
152 el_ 23524
153 zna 23509
154 ičn 23399
155 tem 23226
156 ati 23225
157 od_ 23218
158 eno 23180
159 pol 23079
160 aj_ 23043
161 ast 22905
162 ili 22787
163 tal 22700
164 _kr 22621
165 ime 22571
166 le_ 22519
167 po_ 22482
168 eri 22230
169 ju_ 22174
170 om_ 22160
171 med 22136
172 avi 22104
173 ora 22081
174 _bo 22028
175 naj 22023
176 ot_ 21923
177 elo 21921
178 ema 21917
179 lju 21876
180 vo_ 21876
181 _im 21781
182 _nj 21706
183 ve_ 21647
184 pra 21642
185 iko 21561
186 lan 21521
187 kot 21494
188 nja 21463
189 dru 21355
190 ijo 21284
191 gor 21263
192 cij 21189
193 teg 21149
194 _to 21118
195 _s_ 21054
196 oli 21053
197 vil 21011
198 ene 20912
199 vi_ 20900
200 _tr 20846
201 gra 20754
202 jsk 20669
203 tra 20502
204 _la 20379
205 ako 20315
206 ena 20188
207 _al 20104
208 kih 20002
209 kra 19922
210 tan 19741
211 elj 19717
212 _si 19625
213 _sk 19592
214 edn 19589
215 rad 19556
216 _sa 19404
217 ina 19337
218 _ni 19252
219 več 19104
220 aje 19061
221 slo 18658
222 st_ 18647
223 ral 18640
224 nem 18521
225 vel 18450
226 im_ 18442
227 jan 18400
228 rat 18361
229 ele 18351
230 ev_ 18310
231 mer 18285
232 _vs 18048
233 an_ 17950
234 aln 17831
235 _us 17830
236 do_ 17794
237 ajo 17770
238 ala 17748
239 jih 17579
240 gov 17521
241 ste 17511
242 ska 17474
243 ilo 17404
244 neg 17379
245 ika 17266
246 ans 17152
247 _en 17109
248 nji 17030
249 stv 16891
250 eva 16874
251 ira 16860
252 _no 16825
253 uje 16698
254 ik_ 16558
255 ome 16536
256 te_ 16499
257 tev 16473
258 adi 16453
259 olj 16392
260 ar_ 16280
261 kon 16213
262 avn 16170
263 vlj 16028
264 ine 16020
265 ane 16006
266 tni 15839
267 iti 15832
268 ra_ 15796
269 pov 15773
270 ana 15735
271 enj 15733
272 iz_ 15711
273 jev 15673
274 _ce 15604
275 tov 15518
276 mo_ 15463
277 ara 15442
278 ato 15430
279 tak 15418
280 ank 15343
281 edi 15339
282 nas 15262
283 tro 15252
284 _os 15220
285 odn 15154
286 ame 15153
287 čin 15104
288 zaj 15089
289 _čl 14978
290 pom 14862
291 _gl 14758
292 las 14724
293 aja 14713
294 ovo 14692
295 _o_ 14611
296 olo 14555
297 tav 14517
298 vet 14495
299 ano 14387
300 _ga 14342
301 iji 14320
302 ini 14240
303 imi 14231
304 ca_ 14221
305 iki 14210
306 ent 14180
307 tor 14180
308 ogo 14169
309 vni 14162
310 aci 14114
311 _ro 14109
312 dob 14102
313 ela 14020
314 prv 13986
315 čni 13940
316 tre 13907
317 _vi 13899
318 svo 13875
319 lni 13847
320 vse 13810
321 _up 13796
322 še_ 13782
323 _št 13712
324 evi 13671
325 eti 13669
326 _gr 13647
327 dno 13579
328 nij 13550
329 gla 13491
330 nim 13473
331 _mi 13429
332 dni 13314
333 oma 13289
334 bi_ 13273
335 ite 13266
336 ari 13239
337 jeg 13230
338 ver 13226
339 rug 13224
340 nan 13126
341 ede 13113
342 ovn 13029
343 ust 13013
344 sve 12956
345 ino 12871
346 odo 12864
347 ce_ 12845
348 mes 12838
349 led 12830
350 rsk 12823
351 nar 12810
352 man 12806
353 nam 12768
354 tri 12761
355 _pe 12712
356 rej 12706
357 _br 12660
358 čla 12632
359 rje 12616
360 pot 12615
361 de_ 12604
362 ava 12598
363 lad 12591
364 dal 12536
365 _go 12514
366 _zn 12487
367 _lj 12468
368 met 12403
369 ah_ 12402
370 etn 12318
371 re_ 12314
372 eka 12311
373 rab 12302
374 alo 12278
375 mor 12241
376 ens 12239
377 nek 12227
378 on_ 12225
379 raj 12191
380 keg 12186
381 eje 12182
382 kol 12152
383 rem 12080
384 blj 12075
385 _še 12003
386 odi 12000
387 eto 11986
388 _vr 11968
389 ita 11907
390 tu_ 11905
391 upo 11885
392 ose 11836
393 _pl 11830
394 rja 11818
395 ški 11732
396 ojn 11718
397 rod 11693
398 dan 11674
399 ica 11590
400 avl 11535
401 bol 11509
402 bli 11497
403 am_ 11491
404 tel 11456
405 kar 11449
406 _be 11446
407 pis 11401
408 rim 11394
409 čas 11379
410 ale 11378
411 če_ 11318
412 nic 11300
413 spo 11295
414 eve 11294
415 _op 11285
416 jal 11282
417 eda 11239
418 mu_ 11236
419 oko 11221
420 lin 11215
421 alj 11191
422 ima 11161
423 išk 11151
424 vno 11148
425 ole 11129
426 _an 11122
427 bra 11115
428 oto 11086
429 kem 11068
430 ci_ 11061
431 lav 11056
432 eme 11036
433 vod 11027
434 oda 11020
435 rni 10998
436 arj 10997
437 sku 10965
438 tič 10955
439 ada 10939
440 _ba 10931
441 dov 10926
442 dnj 10923
443 tno 10871
444 čne 10835
445 or_ 10833
446 vez 10809
447 mat 10732
448 es_ 10699
449 oje 10699
450 lj_ 10655
451 lah 10616
452 nej 10592
453 dar 10557
454 šte 10546
455 ave 10545
456 amo 10487
457 čno 10478
458 sam 10431
459 dil 10388
460 ore 10383
461 ese 10375
462 rit 10311
463 ice 10305
464 ijs 10303
465 era 10287
466 spr 10138
467 oči 10116
468 _di 10086
469 ved 10063
470 _ja 10059
471 kom 9989
472 vin 9947
473 _ok 9943
474 ške 9940
475 ško 9940
476 top 9926
477 sed 9914
478 ami 9896
479 ins 9864
480 ače 9858
481 var 9844
482 ezn 9839
483 obl 9818
484 gle 9817
485 _ju 9811
486 hod 9777
487 oja 9739
488 dol 9734
489 ris 9732
490 et_ 9730
491 ike 9713
492 iva 9702
493 kri 9688
494 čen 9661
495 jav 9634
496 kup 9603
497 nal 9602
498 _že 9591
499 nke 9554
500 ode 9554
501 ote 9508
502 ahk 9499
503 oro 9471
504 ere 9380
505 ek_ 9369
506 sre 9355
507 ren 9335
508 _jo 9327
509 nap 9263
510 hko 9256
511 ej_ 9246
512 obi 9240
513 lij 9199
514 _fr 9146
515 ono 9137
516 _fi 9097
517 _ča 9095
518 log 9095
519 seb 9086
520 lji 9074
521 bo_ 9025
522 and 9015
523 ona 8967
524 ejo 8935
525 anc 8915
526 tar 8880
527 nad 8879
528 drž 8866
529 stn 8854
530 _zg 8845
531 _ze 8836
532 eč_ 8814
533 nač 8807
534 rst 8784
535 ant 8759
536 den 8756
537 ros 8728
538 tek 8721
539 per 8716
540 ro_ 8699
541 mar 8691
542 živ 8684
543 nom 8675
544 tur 8659
545 adn 8658
546 ril 8626
547 _dv 8608
548 res 8581
549 pad 8522
550 aro 8460
551 ici 8429
552 rek 8426
553 nju 8419
554 _zd 8415
555 jud 8405
556 _sr 8391
557 rža 8375
558 zar 8372
559 _oz 8368
560 nes 8360
561 _ji 8352
562 išč 8352
563 sa_ 8341
564 oti 8338
565 lit 8337
566 si_ 8335
567 ete 8333
568 ern 8330
569 žav 8305
570 tvo 8291
571 omo 8272
572 zem 8272
573 obr 8256
574 rep 8252
575 _lo 8246
576 _ti 8218
577 ekt 8218
578 _pi 8183
579 _va 8164
580 ozn 8144
581 _li 8137
582 _ar 8136
583 dij 8116
584 nav 8079
585 rot 8076
586 iln 8064
587 ku_ 8060
588 rom 8059
589 ejš 8058
590 _če 8056
591 rev 8023
592 tva 8011
593 zap 8008
594 rov 7977
595 at_ 7961
596 ij_ 7959
597 oji 7947
598 sno 7937
599 isa 7920
600 vne 7912
601 dne 7906
602 ben 7887
603 be_ 7881
604 oni 7850
605 žen 7843
606 reb 7838
607 moč 7828
608 emi 7815
609 lno 7800
610 sla 7800
611 roč 7787
612 čil 7763
613 spe 7726
614 sem 7714
615 zve 7713
616 jub 7702
617 erj 7672
618 me_ 7666
619 nil 7655
620 _or 7603
621 nat 7593
622 zač 7586
623 msk 7547
624 lič 7545
625 iso 7536
626 ubl 7500
627 god 7459
628 oče 7457
629 eh_ 7450
630 opi 7440
631 azl 7437
632 _ži 7431
633 _hi 7419
634 kor 7400
635 otr 7391
636 osl 7382
637 tih 7354
638 kan 7350
639 eds 7330
640 mel 7319
641 emo 7315
642 par 7311
643 nda 7307
644 izv 7305
645 tne 7274
646 oka 7241
647 lek 7237
648 dst 7228
649 ruž 7191
650 ade 7188
651 min 7184
652 fra 7182
653 gos 7175
654 plo 7174
655 _vz 7150
656 vor 7149
657 ss_ 7119
658 kal 7110
659 zgo 7088
660 ogr 7087
661 riš 7081
662 ec_ 7080
663 orj 7080
664 orn 7076
665 _um 7064
666 apo 7018
667 rvi 7005
668 _zv 6991
669 ata 6978
670 uni 6968
671 cel 6965
672 že_ 6960
673 ram 6958
674 izi 6934
675 ero 6917
676 lu_ 6909
677 ge_ 6907
678 arn 6878
679 azi 6877
680 ars 6868
681 zel 6844
682 nst 6835
683 daj 6826
684 ive 6818
685 lne 6814
686 pog 6807
687 jer 6801
688 tik 6799
689 niš 6784
690 zli 6777
691 emb 6767
692 nis 6755
693 ron 6754
694 poz 6750
695 aka 6746
696 loč 6743
697 ret 6739
698 rno 6738
699 etu 6736
700 go_ 6728
701 tol 6706
702 rne 6701
703 nci 6699
704 ča_ 6693
705 isk 6687
706 jši 6687
707 sle 6632
708 sil 6630
709 avo 6615
710 ba_ 6588
711 ile 6588
712 uss 6588
713 sod 6584
714 šči 6583
715 bor 6581
716 ner 6581
717 ašk 6577
718 mal 6565
719 nce 6557
720 _ri 6546
721 mlj 6541
722 tin 6533
723 _kl 6530
724 _ke 6528
725 pla 6515
726 jaj 6512
727 ion 6510
728 _sm 6507
729 eza 6496
730 are 6495
731 _ge 6472
732 ten 6469
733 sov 6465
734 ozi 6444
735 dra 6437
736 sel 6432
737 sne 6429
738 tve 6385
739 zda 6368
740 rog 6358
741 du_ 6338
742 rez 6337
743 odp 6323
744 zra 6322
745 asn 6320
746 otn 6319
747 sli 6317
748 eko 6316
749 gan 6305
750 _mu 6297
751 abl 6296
752 _am 6272
753 itv 6254
754 gij 6240
755 ope 6228
756 end 6218
757 av_ 6214
758 not 6200
759 ang 6198
760 ton 6195
761 zij 6186
762 kje 6185
763 _av 6176
764 bno 6167
765 opa 6163
766 cer 6161
767 šče 6139
768 ugi 6138
769 kla 6137
770 igr 6136
771 obe 6128
772 ide 6122
773 obn 6121
774 rop 6120
775 dom 6119
776 kaj 6100
777 emu 6088
778 ber 6084
779 vit 6082
780 ak_ 6067
781 tom 6058
782 jet 6051
783 rva 6018
784 jni 6006
785 _kj 6003
786 _el 6002
787 ons 5999
788 ob_ 5973
789 čel 5971
790 nak 5963
791 nu_ 5957
792 reg 5934
793 les 5933
794 zni 5924
795 lic 5917
796 kro 5908
797 din 5907
798 _ig 5900
799 _ur 5897
800 maj 5890
801 asi 5882
802 mem 5876
803 zan 5864
804 _ha 5854
805 jno 5842
806 enc 5823
807 gen 5816
808 len 5812
809 ad_ 5805
810 pin 5798
811 eči 5787
812 opo 5780
813 hov 5779
814 kim 5767
815 rih 5765
816 nte 5760
817 eml 5758
818 okr 5755
819 nti 5731
820 art 5725
821 očj 5713
822 pon 5703
823 tis 5690
824 taj 5679
825 co_ 5661
826 šča 5649
827 dat 5632
828 su_ 5627
829 gi_ 5593
830 naš 5591
831 ogi 5587
832 tiv 5587
833 ivi 5578
834 zav 5555
835 net 5552
836 cev 5546
837 rin 5544
838 ezi 5540
839 ase 5536
840 rik 5517
841 lem 5513
842 rip 5507
843 zad 5505
844 obo 5487
845 niz 5485
846 _ev 5474
847 nta 5468
848 sev 5454
849 mno 5447
850 esa 5427
851 bni 5418
852 odr 5414
853 jez 5409
854 mag 5407
855 rič 5407
856 vol 5406
857 omi 5381
858 jej 5369
859 nit 5360
860 una 5360
861 vih 5354
862 klj 5344
863 asl 5343
864 ači 5343
865 upi 5339
866 lom 5335
867 či_ 5323
868 san 5322
869 ado 5308
870 rok 5303
871 sni 5303
872 nog 5301
873 ečj 5292
874 rti 5285
875 ses 5281
876 ugo 5279
877 til 5270
878 bri 5262
879 ume 5262
880 ogl 5260
881 epr 5245
882 čan 5239
883 leg 5227
884 tok 5221
885 alc 5216
886 ešk 5215
887 iri 5212
888 pet 5212
889 as_ 5205
890 ru_ 5205
891 reč 5199
892 vaj 5176
893 zik 5174
894 zat 5170
895 ple 5169
896 _he 5160
897 osk 5159
898 oln 5152
899 _vl 5145
900 bel 5145
901 son 5135
902 zvo 5132
903 onc 5130
904 bit 5129
905 zdr 5121
906 tsk 5107
907 vij 5101
908 vir 5097
909 dos 5095
910 iza 5095
911 imo 5089
912 rel 5075
913 rač 5050
914 očn 5049
915 ška 5048
916 ard 5046
917 azv 5044
918 api 5027
919 ši_ 5020
920 žno 5019
921 lat 5015
922 bre 5014
923 eja 5007
924 esn 4997
925 for 4975
926 gre 4969
927 vis 4969
928 vid 4956
929 etr 4949
930 oča 4948
931 dro 4947
932 izr 4945
933 ivn 4940
934 zah 4931
935 lsk 4928
936 kaz 4921
937 jim 4906
938 vla 4892
939 etj 4879
940 pok 4878
941 vrs 4868
942 _is 4866
943 rid 4861
944 ker 4860
945 ebn 4856
946 boj 4855
947 rib 4853
948 lko 4839
949 nič 4832
950 bar 4817
951 vsa 4803
952 oj_ 4800
953 is_ 4796
954 ure 4795
955 onč 4792
956 rak 4789
957 cen 4788
958 saj 4750
959 zde 4747
960 rob 4738
961 ito 4736
962 vsk 4736
963 loš 4731
964 ial 4723
965 žel 4719
966 rož 4716
967 pop 4712
968 vna 4707
969 čet 4700
970 _un 4689
971 gal 4688
972 uži 4683
973 evr 4680
974 vro 4676
975 ras 4674
976 ico 4671
977 zas 4664
978 abi 4655
979 obs 4644
980 ing 4640
981 org 4634
982 jst 4633
983 _ca 4628
984 _mn 4615
985 kop 4590
986 čev 4570
987 ago 4560
988 rna 4556
989 enu 4554
990 opr 4549
991 tru 4549
992 _co 4548
993 ces 4542
994 _ru 4535
995 _wi 4523
996 _bl 4513
997 elu 4508
998 _ot 4501
999 rma 4501
1000 izm 4489
1001 _ku 4484
1002 aga 4469
1003 mon 4454
1004 zal 4450
1005 zme 4443
1006 _fo 4439
1007 nj_ 4422
1008 mov 4419
1009 izd 4417
1010 oga 4417
1011 zvi 4405
1012 ojs 4403
1013 poj 4402
0 # Licensed to the Apache Software Foundation (ASF) under one or more
1 # contributor license agreements. See the NOTICE file distributed with
2 # this work for additional information regarding copyright ownership.
3 # The ASF licenses this file to You under the Apache License, Version 2.0
4 # (the "License"); you may not use this file except in compliance with
5 # the License. You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 en_ 92283
15 _de 80300
16 et_ 65529
17 tt_ 62645
18 för 61570
19 er_ 59169
20 att 50169
21 _fö 49849
22 om_ 48354
23 _at 44382
24 det 42846
25 de_ 39705
26 _oc 38975
27 ar_ 35734
28 ch_ 35025
29 och 35007
30 ing 34078
31 ör_ 32266
32 _vi 32126
33 _in 31054
34 _i_ 30495
35 nde 30447
36 är_ 29798
37 and 28993
38 som 28602
39 _so 28589
40 an_ 27688
41 ter 27216
42 _me 27166
43 na_ 27106
44 den 26707
45 ll_ 25984
46 ra_ 25362
47 ill 25097
48 _av 24638
49 ska 24452
50 _en 23817
51 _ko 23771
52 ion 23550
53 _ti 21676
54 lig 20583
55 av_ 20481
56 ag_ 20283
57 _är 20198
58 te_ 20181
59 til 20149
60 ta_ 19519
61 _ha 19362
62 nte 19169
63 med 19136
64 gen 19065
65 ka_ 18825
66 ett 18812
67 _om 18718
68 isk 18716
69 nin 18697
70 _fr 18084
71 one 17519
72 rna 17506
73 kom 17054
74 _be 17045
75 men 15944
76 lle 15910
77 _st 15861
78 ga_ 15640
79 vi_ 15539
80 der 15492
81 _på 15488
82 ng_ 15337
83 int 15304
84 all 15285
85 omm 15031
86 _sk 14735
87 _ja 14591
88 jag 14120
89 _ut 14114
90 på_ 13913
91 era 13666
92 ern 13636
93 la_ 13405
94 ens 13249
95 _an 12925
96 ste 12832
97 ed_ 12717
98 ätt 12682
99 as_ 12589
100 var 12560
101 har 12468
102 het 12390
103 nen 12157
104 ent 12063
105 man 11899
106 tta 11826
107 _va 11679
108 sta 11672
109 lla 11340
110 _re 11095
111 ns_ 10879
112 _et 10857
113 rin 10843
114 sam 10801
115 ler 10711
116 ver 10695
117 _eu 10678
118 ara 10628
119 gar 10424
120 frå 10330
121 örs 10214
122 ten 10210
123 tio 10160
124 ell 10020
125 ade 10013
126 mis 10012
127 kan 10011
128 iga 9922
129 änd 9862
130 gt_ 9849
131 iss 9835
132 nge 9731
133 ans 9689
134 uro 9647
135 eur 9634
136 upp 9482
137 re_ 9413
138 tet 9405
139 igt 9403
140 ati 9374
141 rop 9343
142 ete 9336
143 _sa 9328
144 _ta 9317
145 ran 9232
146 bet 9156
147 _mi 9142
148 _pr 9059
149 vil 9005
150 _al 8984
151 ser 8942
152 _ge 8925
153 _ka 8909
154 kon 8906
155 nna 8905
156 mer 8891
157 ig_ 8722
158 tal 8694
159 lan 8592
160 så_ 8546
161 _si 8518
162 sio 8464
163 nom 8453
164 kti 8364
165 ner 8341
166 ssi 8330
167 _må 8289
168 _un 8270
169 nga 8253
170 und 8222
171 mmi 8147
172 _ma 8124
173 eri 8035
174 dra 8026
175 äll 7879
176 mme 7730
177 pro 7636
178 del 7570
179 _he 7543
180 ngs 7534
181 _up 7421
182 ram 7421
183 lag 7355
184 iti 7286
185 are 7203
186 år_ 7127
187 _sä 7070
188 nsk 7067
189 kt_ 6912
190 ndr 6806
191 öra 6805
192 ts_ 6795
193 inn 6771
194 lit 6720
195 ghe 6642
196 öre 6619
197 ren 6617
198 mma 6569
199 or_ 6513
200 oli 6430
201 str 6412
202 stä 6279
203 des 6274
204 igh 6272
205 ekt 6267
206 on_ 6265
207 tig 6264
208 ger 6236
209 råd 6233
210 _så 6232
211 res 6179
212 rt_ 6110
213 åde 6100
214 da_ 6078
215 gan 6048
216 ikt 6038
217 at_ 5991
218 ord 5972
219 par 5959
220 när 5953
221 ket 5932
222 ess 5929
223 _nä 5923
224 lt_ 5910
225 han 5901
226 rät 5845
227 tan 5820
228 tte 5817
229 yck 5797
230 vis 5767
231 råg 5743
232 ers 5736
233 min 5724
234 tiv 5706
235 kal 5701
236 ets 5681
237 lut 5664
238 tat 5598
239 dan 5584
240 ope 5564
241 els 5555
242 ort 5546
243 amm 5543
244 _gr 5535
245 sa_ 5532
246 _se 5458
247 fra 5457
248 eis 5411
249 pei 5407
250 ad_ 5385
251 kli 5384
252 tni 5379
253 sla 5321
254 lar 5310
255 ala 5232
256 sät 5222
257 kap 5192
258 st_ 5183
259 ull 5175
260 _vä 5169
261 _hä 5147
262 dig 5135
263 öve 5130
264 rbe 5119
265 _fi 5113
266 nst 5109
267 for 5107
268 pol 5074
269 åst 5071
270 err 5063
271 ock 5022
272 _fa 5009
273 cke 4955
274 _po 4933
275 eme 4910
276 reg 4910
277 ker 4909
278 sku 4859
279 _dä 4855
280 där 4845
281 _tr 4840
282 ån_ 4838
283 _än 4827
284 rat 4804
285 nat 4801
286 _rä 4796
287 mås 4795
288 _lä 4786
289 ång 4783
290 eda 4776
291 ame 4772
292 tis 4753
293 fte 4746
294 red 4726
295 _ve 4714
296 _öv 4703
297 arl 4688
298 arb 4678
299 ssa 4678
300 tor 4674
301 rde 4667
302 rån 4665
303 fin 4648
304 lin 4640
305 tra 4617
306 _pa 4577
307 tar 4563
308 ist 4547
309 kul 4539
310 gra 4511
311 llt 4502
312 _mo 4481
313 nd_ 4474
314 sen 4473
315 el_ 4462
316 gör 4446
317 rst 4441
318 ant 4437
319 ågo 4436
320 akt 4433
321 ige 4416
322 ate 4401
323 in_ 4386
324 ins 4372
325 le_ 4362
326 nda 4362
327 tid 4356
328 art 4355
329 erk 4323
330 rar 4320
331 vår 4309
332 cks 4306
333 _li 4305
334 uta 4280
335 mar 4261
336 _ar 4255
337 ot_ 4246
338 bar 4193
339 lem 4186
340 ven 4171
341 _ef 4168
342 tro 4166
343 ast 4140
344 _åt 4135
345 slu 4129
346 nse 4125
347 gru 4121
348 ilj 4093
349 mot 4078
350 åga 4074
351 arn 4048
352 eta 4043
353 enn 4028
354 uni 3995
355 age 3988
356 stå 3981
357 nt_ 3967
358 rla 3967
359 tik 3967
360 her 3953
361 kså 3944
362 ss_ 3943
363 äns 3930
364 end 3927
365 _my 3922
366 _nå 3920
367 _fo 3888
368 ite 3868
369 rr_ 3864
370 rik 3852
371 kla 3846
372 _or 3812
373 rad 3809
374 sto 3809
375 lam 3808
376 tli 3805
377 rsl 3802
378 pa_ 3782
379 _ba 3779
380 vet 3764
381 nne 3763
382 tur 3754
383 eno 3723
384 _vå 3721
385 kte 3707
386 ntr 3705
387 tag 3701
388 bes 3697
389 nad 3692
390 _di 3686
391 ras 3686
392 nio 3664
393 ma_ 3649
394 per 3634
395 någ 3630
396 ege 3623
397 ike 3593
398 ärd 3592
399 pen 3589
400 _sy 3577
401 opa 3577
402 nis 3563
403 rli 3552
404 sig 3542
405 sko 3520
406 omr 3512
407 sti 3512
408 hål 3488
409 åll 3483
410 änn 3482
411 ret 3462
412 _få 3459
413 ndl 3449
414 get 3439
415 kni 3429
416 em_ 3427
417 vid 3425
418 stö 3421
419 län 3415
420 lik 3397
421 nta 3392
422 _kr 3390
423 oll 3389
424 _la 3348
425 _er 3339
426 ons 3336
427 tän 3336
428 nas 3325
429 utv 3323
430 _ku 3321
431 ken 3314
432 ja_ 3307
433 id_ 3299
434 ckl 3297
435 est 3287
436 lse 3286
437 led 3283
438 eko 3279
439 eft 3270
440 ika 3263
441 es_ 3262
442 ela 3258
443 myc 3242
444 gem 3236
445 tre 3234
446 vän 3225
447 ris 3224
448 eck 3208
449 mel 3202
450 män 3196
451 kra 3192
452 am_ 3190
453 orm 3160
454 se_ 3148
455 ndi 3142
456 _gä 3141
457 hel 3137
458 vär 3137
459 mrå 3136
460 sva 3110
461 al_ 3108
462 täl 3102
463 _el 3086
464 här 3084
465 _gö 3080
466 _ny 3073
467 ats 3072
468 _ni 3061
469 nar 3055
470 ber 3052
471 dri 3045
472 vik 3029
473 trä 3026
474 uts 3023
475 alm 3022
476 _bl 3017
477 _da 3010
478 lma 3009
479 let 2987
480 sst 2979
481 ina 2977
482 dni 2944
483 run 2936
484 far 2891
485 mil 2887
486 sse 2872
487 ena 2866
488 tve 2859
489 rit 2850
490 _le 2847
491 sin 2840
492 nsa 2834
493 gäl 2833
494 _br 2824
495 sk_ 2818
496 rek 2799
497 ilk 2789
498 _rå 2787
499 vec 2782
500 itt 2780
501 ono 2778
502 läg 2762
503 ttn 2759
504 edl 2751
505 omi 2747
506 äve 2747
507 kri 2745
508 pp_ 2742
509 aga 2731
510 änk 2726
511 org 2723
512 is_ 2720
513 dli 2704
514 rän 2704
515 nkt 2703
516 rfö 2698
517 dag 2693
518 _na 2692
519 _os 2689
520 tti 2689
521 ske 2672
522 amt 2669
523 tvi 2662
524 rag 2654
525 avs 2645
526 beh 2644
527 ial 2634
528 ino 2630
529 mfö 2609
530 ede 2604
531 spe 2596
532 nns 2591
533 lja 2581
534 ni_ 2576
535 bör 2570
536 dle 2568
537 sat 2568
538 dem 2561
539 ari 2559
540 ive 2558
541 bli 2543
542 esl 2531
543 lis 2528
544 egi 2521
545 ror 2519
546 kun 2514
547 unk 2514
548 pri 2508
549 inf 2498
550 va_ 2495
551 bor 2493
552 oss 2489
553 _ty 2484
554 it_ 2483
555 apa 2476
556 gär 2455
557 sit 2400
558 ski 2398
559 kar 2397
560 _mö 2394
561 _bö 2393
562 _ek 2391
563 _äv 2384
564 _år 2384
565 gor 2373
566 öst 2373
567 ems 2371
568 _gå 2356
569 ur_ 2356
570 ind 2343
571 nu_ 2339
572 rsk 2337
573 rig 2336
574 rso 2335
575 ali 2328
576 kor 2324
577 _hu 2303
578 kat 2301
579 sys 2298
580 iv_ 2293
581 lls 2289
582 kna 2286
583 mån 2279
584 sfö 2276
585 skt 2275
586 rre 2265
587 ier 2263
588 ott 2263
589 töd 2262
590 mst 2258
591 ini 2253
592 ere 2252
593 ark 2248
594 ke_ 2245
595 pun 2245
596 nan 2242
597 _mä 2240
598 lke 2233
599 vad 2232
600 ägg 2226
601 örd 2219
602 ise 2218
603 rga 2215
604 _bi 2211
605 rs_ 2207
606 isa 2206
607 nka 2206
608 jäl 2204
609 gga 2199
610 soc 2197
611 _nu 2196
612 jli 2196
613 ppe 2195
614 öjl 2195
615 tas 2191
616 _ex 2190
617 gsf 2190
618 möj 2186
619 ruk 2179
620 ida 2176
621 oci 2175
622 nel 2160
623 rkl 2158
624 cia 2151
625 iva 2144
626 _ra 2134
627 rer 2133
628 val 2133
629 _sl 2132
630 fat 2119
631 _pe 2116
632 säk 2109
633 säg 2103
634 ik_ 2095
635 spr 2090
636 ttr 2089
637 rka 2083
638 gån 2079
639 len 2077
640 ift 2076
641 _ho 2072
642 ytt 2071
643 ck_ 2062
644 ut_ 2061
645 äng 2060
646 uti 2041
647 ärf 2036
648 itu 2029
649 ire 2025
650 pas 2022
651 kän 2011
652 ukt 2011
653 bil 2005
654 fal 2005
655 nor 2002
656 elt 1998
657 ann 1994
658 cka 1993
659 tyd 1989
660 rup 1981
661 äga 1980
662 än_ 1978
663 tru 1977
664 dir 1975
665 unn 1974
666 nti 1972
667 _tv 1969
668 kil 1969
669 kol 1959
670 örb 1953
671 jor 1948
672 rti 1938
673 tsl 1937
674 ori 1932
675 cen 1929
676 ffe 1927
677 _sp 1924
678 tem 1919
679 väl 1919
680 mss 1917
681 dam 1912
682 ont 1909
683 _kv 1908
684 vin 1904
685 ert 1903
686 emo 1897
687 erh 1896
688 _hö 1894
689 tad 1888
690 rog 1884
691 _ri 1881
692 läm 1881
693 ära 1880
694 _sj 1878
695 nli 1874
696 omf 1873
697 tts 1868
698 ble 1865
699 por 1861
700 ogr 1860
701 _ju 1854
702 dar 1849
703 ide 1841
704 ust 1828
705 mig 1827
706 nfö 1814
707 teg 1814
708 _lå 1811
709 skr 1810
710 ge_ 1802
711 _fl 1790
712 pek 1784
713 ämn 1784
714 ntl 1781
715 etä 1779
716 rhe 1773
717 ämp 1767
718 met 1761
719 sol 1760
720 idi 1750
721 dfö 1749
722 gon 1749
723 leg 1743
724 vat 1741
725 rol 1740
726 mål 1737
727 åtg 1732
728 tgä 1726
729 ktu 1723
730 ljö 1704
731 ien 1703
732 amh 1694
733 nni 1687
734 äke 1687
735 ånd 1687
736 syn 1686
737 ape 1685
738 _bo 1682
739 erl 1672
740 _sv 1666
741 pla 1665
742 obl 1661
743 _go 1660
744 nal 1657
745 roc 1648
746 _do 1643
747 tån 1643
748 gni 1641
749 åte 1640
750 då_ 1635
751 vå_ 1635
752 _ol 1624
753 _kl 1621
754 opp 1621
755 pre 1621
756 hän 1619
757 rma 1619
758 dis 1618
759 sli 1618
760 gio 1615
761 tsk 1613
762 oce 1612
763 kta 1611
764 spo 1608
765 ani 1597
766 sni 1596
767 lös 1593
768 rob 1590
769 fri 1589
770 ang 1584
771 rme 1577
772 onk 1575
773 rkn 1572
774 _kä 1566
775 onä 1565
776 raf 1563
777 _fu 1556
778 nit 1553
779 dla 1548
780 sak 1548
781 _te 1545
782 lat 1541
783 ale 1539
784 nya 1531
785 täm 1531
786 yss 1528
787 efo 1523
788 _hå 1522
789 olk 1518
790 atu 1515
791 sek 1513
792 ona 1511
793 tri 1511
794 gre 1507
795 åda 1504
796 rdn 1503
797 _rö 1502
798 got 1502
799 krä 1488
800 nsv 1486
801 ult 1484
802 ya_ 1483
803 tst 1481
804 ume 1480
805 kot 1478
806 rdf 1477
807 tit 1477
808 ags 1473
809 mli 1466
810 hur 1465
811 alt 1464
812 _of 1459
813 mat 1454
814 få_ 1453
815 pel 1449
816 lad 1447
817 ora 1446
818 ack 1440
819 sky 1439
820 _to 1438
821 liv 1438
822 nhe 1438
823 åra 1438
824 try 1433
825 yst 1432
826 möt 1428
827 äck 1428
828 ärk 1427
829 nsi 1426
830 räd 1426
831 god 1425
832 ank 1420
833 får 1415
834 ost 1415
835 ute 1407
836 öka 1407
837 sjä 1405
838 vs_ 1405
839 häl 1403
840 lni 1401
841 örh 1397
842 egr 1389
843 mna 1388
844 öte 1379
845 _ga 1375
846 ha_ 1375
847 lde 1375
848 mti 1375
849 rte 1375
850 _eg 1374
851 lta 1373
852 rva 1370
853 _då 1364
854 ölj 1364
855 rts 1361
856 nke 1359
857 ört 1357
858 tin 1356
859 ärs 1355
860 van 1353
861 edb 1352
862 väg 1348
863 ämm 1348
864 orn 1347
865 tör 1347
866 edr 1344
867 beg 1339
868 rot 1335
869 ful 1334
870 älv 1331
871 lln 1330
872 bef 1329
873 tjä 1328
874 sid 1326
875 ana 1323
876 föl 1323
877 sel 1320
878 låt 1319
879 nvä 1318
880 ild 1317
881 gst 1312
882 mpe 1310
883 enh 1308
884 etr 1307
885 inr 1307
886 amf 1306
887 enl 1306
888 gna 1305
889 mit 1305
890 rös 1304
891 bat 1303
892 äge 1303
893 ore 1301
894 utt 1301
895 ral 1300
896 sök 1300
897 je_ 1299
898 las 1299
899 sik 1295
900 nds 1292
901 tif 1292
902 _pl 1289
903 ors 1287
904 jän 1285
905 tsä 1280
906 stn 1279
907 ivi 1278
908 åt_ 1274
909 sna 1271
910 örv 1270
911 mt_ 1268
912 dbo 1262
913 äst 1262
914 llv 1259
915 _no 1257
916 sär 1257
917 deb 1256
918 esu 1256
919 räv 1253
920 riv 1251
921 fis 1248
922 fol 1247
923 pos 1247
924 änt 1244
925 kos 1243
926 _lö 1242
927 byg 1241
928 eba 1241
929 tår 1235
930 rod 1233
931 rn_ 1231
932 app 1226
933 nci 1226
934 ung 1226
935 ode 1224
936 ygg 1224
937 _nö 1223
938 oms 1222
939 ama 1219
940 fek 1217
941 _tu 1215
942 grä 1212
943 sis 1209
944 ehö 1206
945 går 1206
946 ilt 1202
947 cip 1200
948 fle 1200
949 inc 1199
950 lst 1198
951 örf 1197
952 lsä 1196
953 rör 1194
954 tse 1193
955 dom 1191
956 fer 1191
957 _dr 1189
958 kad 1186
959 kur 1186
960 tac 1185
961 bät 1184
962 kel 1184
963 näm 1183
964 lva 1179
965 vit 1178
966 olu 1175
967 eu_ 1173
968 ick 1171
969 anv 1167
970 årt 1165
971 ls_ 1162
972 eff 1158
973 pap 1158
974 aft 1156
975 ots 1154
976 esk 1153
977 rel 1152
978 ppm 1151
979 _bä 1149
980 egl 1147
981 kas 1147
982 urr 1145
983 erv 1144
984 frä 1144
985 hög 1142
986 nno 1141
987 um_ 1140
988 ars 1138
989 gis 1138
990 _pu 1132
991 såd 1131
992 okr 1130
993 nsl 1123
994 rfa 1116
995 mor 1115
996 ane 1114
997 _fe 1112
998 hör 1111
999 kto 1110
1000 fru 1109
1001 mok 1098
1002 bek 1097
1003 _sn 1096
1004 ele 1096
1005 nku 1096
1006 ågr 1095
1007 _hi 1094
1008 örl 1091
1009 das 1090
1010 tom 1090
1011 ref 1088
1012 rdr 1087
1013 ton 1086
0 # Licensed to the Apache Software Foundation (ASF) under one or more
1 # contributor license agreements. See the NOTICE file distributed with
2 # this work for additional information regarding copyright ownership.
3 # The ASF licenses this file to You under the Apache License, Version 2.0
4 # (the "License"); you may not use this file except in compliance with
5 # the License. You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 การ 38686
15 _อง 13941
16 _าง 12827
17 ประ 12609
18 _าน 11182
19 ของ 11179
20 วาม 10809
21 เป_ 9899
22 ได_ 9380
23 _กา 9046
24 ควา 9007
25 ให_ 8846
26 และ 8769
27 _น_ 8413
28 นท_ 7353
29 _ม_ 7302
30 นกา 7139
31 ารเ 6906
32 _ร_ 6515
33 เร_ 6479
34 งกา 6385
35 _อน 6258
36 อย_ 6124
37 งาน 6114
38 _กษ 6073
39 _ต_ 5948
40 จาก 5853
41 _ท_ 5787
42 หน_ 5761
43 ไม_ 5626
44 _นท 5562
45 _อย 5562
46 ใช_ 5524
47 _ส_ 5399
48 _นเ 5255
49 ระเ 5006
50 บร_ 4949
51 _าเ 4868
52 งท_ 4844
53 ทาง 4801
54 กษา 4778
55 หร_ 4774
56 _วย 4768
57 _นก 4563
58 _าร 4511
59 าร_ 4500
60 เพ_ 4448
61 _ว_ 4432
62 _อก 4352
63 เก_ 4288
64 _งเ 4199
65 เง_ 4183
66 องก 4172
67 _ก_ 4170
68 าน_ 4066
69 _จ_ 3832
70 _าห 3809
71 เด_ 3759
72 รรม 3730
73 _ยน 3709
74 _ด_ 3706
75 ามา 3630
76 _าย 3625
77 เล_ 3586
78 _อม 3585
79 คร_ 3584
80 _วน 3573
81 กล_ 3556
82 นต_ 3495
83 ารท 3449
84 ระช 3447
85 _คว 3431
86 หล_ 3431
87 ะเท 3430
88 มาก 3427
89 เทศ 3357
90 าท_ 3332
91 _อ_ 3282
92 _บ_ 3237
93 เม_ 3180
94 _าม 3172
95 _บร 3138
96 _าใ 3087
97 _ใน 3053
98 _ค_ 3041
99 _าว 3010
100 _ปร 2961
101 งน_ 2958
102 _ยง 2951
103 ปร_ 2937
104 ในก 2925
105 กต_ 2920
106 _ป_ 2899
107 _นอ 2897
108 กว_ 2894
109 มาร 2893
110 ากา 2890
111 าต_ 2890
112 _งก 2869
113 ารส 2868
114 _ย_ 2862
115 อร_ 2845
116 _ยว 2832
117 _าก 2829
118 ภาพ 2810
119 เข_ 2799
120 เน_ 2789
121 ามเ 2758
122 าม_ 2755
123 _นต 2750
124 รท_ 2723
125 เช_ 2712
126 _อเ 2696
127 ตร_ 2678
128 นส_ 2671
129 ารถ 2650
130 ะชา 2649
131 ารป 2631
132 บกา 2625
133 งส_ 2616
134 กระ 2614
135 ระบ 2613
136 ะด_ 2609
137 สร_ 2591
138 องเ 2586
139 ไทย 2580
140 _พ_ 2564
141 _ตร 2558
142 รก_ 2549
143 าก_ 2549
144 _จะ 2539
145 ราย 2537
146 _ได 2500
147 งก_ 2498
148 งปร 2467
149 ารใ 2466
150 _งค 2453
151 _ง_ 2450
152 างก 2447
153 ระก 2444
154 รศ_ 2433
155 งต_ 2413
156 _เป 2403
157 นร_ 2398
158 นก_ 2393
159 _แล 2377
160 _กร 2373
161 ยละ 2337
162 านว 2328
163 กรร 2307
164 เส_ 2305
165 งม_ 2285
166 นใน 2281
167 หม_ 2279
168 _ฒน 2253
169 อน_ 2253
170 _ละ 2247
171 ารศ 2241
172 าให 2236
173 งค_ 2220
174 สาม 2186
175 _าค 2183
176 _นส 2178
177 ตาม 2170
178 _า_ 2160
179 _งส 2149
180 ะกา 2146
181 _บก 2138
182 _มา 2130
183 ะท_ 2128
184 มต_ 2123
185 ลาย 2111
186 กร_ 2094
187 นน_ 2089
188 รณ_ 2085
189 มร_ 2080
190 คล_ 2076
191 _งน 2069
192 หาร 2062
193 _ล_ 2052
194 _เก 2049
195 สาร 2047
196 โดย 2010
197 _ดก 2009
198 _ดเ 2009
199 _นค 2006
200 _นา 2000
201 มท_ 1998
202 ชาช 1994
203 องท 1993
204 ละเ 1980
205 าชน 1977
206 _นไ 1976
207 นด_ 1972
208 ฒนา 1970
209 แก_ 1965
210 _งแ 1963
211 _นใ 1954
212 ามส 1942
213 _งห 1936
214 _าท 1932
215 นพ_ 1925
216 ออก 1923
217 อยล 1921
218 ระด 1909
219 _นแ 1891
220 าค_ 1889
221 _ระ 1884
222 เคร 1880
223 _ช_ 1876
224 _นร 1875
225 งร_ 1866
226 อม_ 1866
227 าว_ 1855
228 หมา 1854
229 าล_ 1850
230 นช_ 1846
231 นขอ 1843
232 ชน_ 1842
233 าส_ 1833
234 องค 1829
235 อกา 1824
236 หว_ 1809
237 นว_ 1806
238 มาย 1803
239 นปร 1795
240 มพ_ 1795
241 นอ_ 1793
242 ะบบ 1780
243 ารณ 1780
244 สถา 1778
245 งงา 1774
246 นมา 1768
247 ละก 1767
248 ากก 1767
249 แต_ 1762
250 นวน 1757
251 ารแ 1755
252 _ให 1753
253 ดกา 1749
254 รปร 1747
255 _งท 1736
256 ารพ 1726
257 _งม 1723
258 _ผ_ 1720
259 าบ_ 1703
260 ารก 1701
261 บปร 1694
262 าหน 1691
263 เท_ 1687
264 นระ 1675
265 าคา 1668
266 ากร 1664
267 _ทย 1662
268 ารร 1650
269 องร 1644
270 ชาต 1642
271 านก 1641
272 _าแ 1630
273 าหร 1629
274 นป_ 1628
275 แบบ 1617
276 ญหา 1615
277 มาณ 1615
278 ามร 1614
279 ารจ 1612
280 รส_ 1611
281 ารว 1607
282 _นด 1604
283 ละ_ 1599
284 ปล_ 1593
285 ทยา 1591
286 _นม 1584
287 ครง 1583
288 _วม 1582
289 แล_ 1582
290 _าไ 1578
291 ารต 1574
292 กท_ 1573
293 างเ 1567
294 งด_ 1565
295 หญ_ 1558
296 _ญห 1553
297 _าล 1549
298 นแล 1548
299 อง_ 1547
300 รเง 1546
301 _าจ 1539
302 ราะ 1537
303 _สา 1535
304 ะม_ 1534
305 รจ_ 1532
306 _งข 1529
307 ตรา 1529
308 วก_ 1529
309 _อส 1519
310 ารอ 1518
311 ยาก 1514
312 _ขอ 1503
313 กน_ 1502
314 ารค 1482
315 _อา 1481
316 องป 1480
317 ะส_ 1479
318 _นน 1477
319 _าส 1474
320 าย_ 1469
321 ในเ 1469
322 _วเ 1465
323 งใน 1464
324 นาค 1461
325 ศาส 1460
326 จะเ 1444
327 งข_ 1439
328 องส 1438
329 _ข_ 1436
330 งจา 1431
331 _ศ_ 1429
332 ระม 1429
333 ายไ 1429
334 มส_ 1428
335 _บเ 1423
336 _อไ 1415
337 _ไม 1414
338 งขอ 1408
339 าเป 1406
340 _บป 1405
341 สตร 1400
342 _เพ 1396
343 าระ 1396
344 _นว 1392
345 ยได 1390
346 _นจ 1389
347 ภาค 1389
348 _จา 1384
349 รใช 1384
350 ยใน 1380
351 _กเ 1377
352 หลา 1377
353 _รก 1374
354 ยท_ 1374
355 างา 1373
356 าช_ 1372
357 นให 1371
358 _ใช 1369
359 ายใ 1367
360 _นล 1364
361 กกา 1364
362 _บส 1362
363 _ทธ 1355
364 าสต 1352
365 รด_ 1346
366 _นข 1344
367 _งใ 1339
368 านา 1339
369 มกา 1336
370 งเป 1335
371 ธรร 1335
372 ะก_ 1334
373 _งจ 1329
374 _นป 1325
375 นม_ 1323
376 _งไ 1318
377 _งต 1315
378 รงก 1307
379 รพ_ 1307
380 ะต_ 1305
381 ะมา 1305
382 รวจ 1303
383 _นห 1297
384 าด_ 1296
385 ถาบ 1295
386 _รา 1289
387 _เร 1283
388 _ภา 1280
389 ละส 1274
390 ยวก 1270
391 ารด 1268
392 _ดส 1267
393 ใหญ 1267
394 ผล_ 1264
395 กลา 1260
396 งว_ 1259
397 โคร 1259
398 _บค 1258
399 อกเ 1258
400 อนไ 1257
401 ารข 1256
402 รวม 1254
403 _วง 1252
404 สน_ 1246
405 _งอ 1242
406 ะน_ 1236
407 ทร_ 1233
408 างๆ 1231
409 บาล 1227
410 _าป 1219
411 ารา 1217
412 มน_ 1216
413 ในป 1215
414 _ยม 1211
415 รร_ 1205
416 นค_ 1203
417 รรค 1202
418 รม_ 1201
419 โลก 1201
420 _อใ 1198
421 บท_ 1194
422 ายเ 1194
423 ระท 1193
424 ารล 1192
425 นเป 1186
426 นได 1186
427 นข_ 1183
428 ในร 1183
429 _ยบ 1182
430 ากน 1182
431 ะช_ 1180
432 เห_ 1180
433 าจะ 1178
434 ารบ 1178
435 แห_ 1177
436 _นธ 1176
437 ละค 1174
438 คาร 1171
439 นละ 1171
440 พาะ 1169
441 _หา 1161
442 ดท_ 1161
443 รษฐ 1157
444 ศรษ 1157
445 าง_ 1157
446 รเล 1156
447 _นช 1154
448 อกต 1151
449 านเ 1149
450 _ธ_ 1148
451 งเท 1148
452 _คร 1145
453 งกล 1143
454 เบ_ 1142
455 ยกา 1140
456 ระห 1137
457 บาท 1131
458 _งป 1130
459 รอง 1130
460 _บท 1124
461 รต_ 1124
462 นผ_ 1123
463 มก_ 1122
464 บว_ 1116
465 ปฏ_ 1116
466 _เห 1114
467 _ญญ 1113
468 ทธ_ 1112
469 องจ 1109
470 งมา 1105
471 _วา 1101
472 ดส_ 1101
473 เหล 1100
474 านท 1099
475 ายก 1099
476 นล_ 1098
477 องม 1098
478 _มพ 1097
479 นทา 1096
480 วลา 1094
481 _ดม 1093
482 _งร 1090
483 าเน 1090
484 ษฐก 1084
485 นย_ 1083
486 _ดต 1080
487 ระส 1080
488 _ตา 1072
489 นเร 1071
490 ฐก_ 1070
491 ใหม 1070
492 _กล 1067
493 ลย_ 1067
494 _ดข 1065
495 างไ 1064
496 _ถ_ 1061
497 ดต_ 1059
498 ถาน 1054
499 รกา 1052
500 ยงา 1051
501 _ออ 1048
502 งคว 1048
503 หต_ 1048
504 นาร 1045
505 ยาย 1044
506 กอบ 1041
507 ะหว 1040
508 งแล 1039
509 _นพ 1035
510 นธ_ 1033
511 มข_ 1032
512 ฐาน 1028
513 ะเป 1027
514 เศร 1026
515 นอย 1025
516 เวล 1023
517 าปร 1022
518 างป 1020
519 งระ 1018
520 อาจ 1016
521 จร_ 1011
522 ฉพา 1011
523 บด_ 1011
524 พล_ 1009
525 _กท 1007
526 นไป 1007
527 าใน 1007
528 ลอด 1002
529 ะกอ 1002
530 บค_ 999
531 ยร_ 999
532 สนอ 998
533 นาย 995
534 _าต 992
535 ญญา 992
536 _งา 991
537 นอก 990
538 โรง 990
539 _ผล 986
540 เหต 986
541 เปล 985
542 _นย 982
543 องอ 981
544 งผ_ 980
545 _เค 978
546 บต_ 974
547 รค_ 974
548 รทา 973
549 องใ 971
550 นโล 965
551 อท_ 962
552 านค 960
553 _มข 958
554 ะว_ 958
555 พรา 957
556 _จจ 955
557 ควร 955
558 าแล 954
559 สม_ 953
560 าหา 951
561 _อค 950
562 ลาง 950
563 านอ 950
564 _นผ 949
565 _แก 948
566 เฉพ 948
567 _ยก 945
568 องต 944
569 ทย_ 943
570 พยา 941
571 ารน 939
572 างส 938
573 _อร 937
574 กรณ 936
575 นจ_ 935
576 จจ_ 934
577 นาด 934
578 งช_ 931
579 ยต_ 929
580 งให 928
581 แนว 926
582 แผน 926
583 กข_ 923
584 ตอร 922
585 าตร 922
586 งเร 920
587 นบา 917
588 อให 916
589 _ดท 915
590 าธ_ 915
591 เสร 914
592 มศ_ 911
593 เตอ 911
594 _ชา 910
595 กส_ 909
596 เอ_ 909
597 เอก 906
598 าเร 901
599 _พย 898
600 รให 896
601 งเส 894
602 องแ 894
603 _บอ 892
604 ระจ 892
605 งป_ 891
606 ากเ 890
607 งอ_ 884
608 กจา 883
609 _อท 882
610 านบ 882
611 ามต 882
612 ลาด 881
613 รน_ 880
614 _สม 878
615 เคล 878
616 _เอ 872
617 บสน 872
618 พรร 872
619 ไว_ 870
620 _ษ_ 869
621 ครา 869
622 นเม 869
623 าขอ 866
624 จาร 865
625 สดง 863
626 ยน_ 861
627 ยว_ 860
628 นกล 859
629 _งพ 858
630 _าด 858
631 งจ_ 858
632 หนด 856
633 มม_ 855
634 อก_ 855
635 แรง 855
636 งคม 854
637 งถ_ 854
638 กาศ 852
639 เอง 851
640 างท 850
641 ยม_ 849
642 _กค 848
643 นกร 847
644 สหร 847
645 _มเ 844
646 ลงท 842
647 บาย 840
648 รว_ 840
649 นหน 839
650 _อข 838
651 ะจ_ 837
652 ตรว 836
653 อกจ 836
654 _อว 835
655 งเก 834
656 องผ 831
657 ในส 831
658 บคว 828
659 งไม 827
660 อส_ 825
661 งพ_ 824
662 สภา 823
663 งชา 821
664 องน 820
665 รมก 819
666 จะม 816
667 นคว 815
668 คโน 814
669 ละป 813
670 ารห 810
671 _นโ 809
672 มหา 808
673 อนเ 808
674 _กส 807
675 _าอ 807
676 โนโ 806
677 ทคโ 805
678 ารไ 803
679 _อป 801
680 โลย 801
681 นเด 799
682 เทค 798
683 ละร 794
684 นรา 792
685 ครอ 787
686 ฐบา 787
687 รงง 787
688 _ฐบ 785
689 หกร 784
690 ราค 782
691 _งง 781
692 _งช 776
693 ากท 775
694 ราก 774
695 รเป 772
696 รเร 771
697 _คน 769
698 าจา 766
699 ดล_ 765
700 าณ_ 765
701 แทน 765
702 _วอ 764
703 _กก 762
704 รอบ 762
705 _หล 761
706 งหน 760
707 สาห 759
708 โรค 759
709 รบร 758
710 ยก_ 755
711 อว_ 754
712 ดข_ 753
713 ดมศ 753
714 เจ_ 752
715 _เด 751
716 ะปร 747
717 _อห 745
718 นเง 744
719 รกร 744
720 _ดค 742
721 _เข 742
722 าแห 742
723 ขนา 741
724 ดยเ 741
725 างค 741
726 าพ_ 741
727 _าข 740
728 างแ 740
729 นอง 739
730 นเอ 738
731 ยส_ 737
732 วน_ 737
733 ะยะ 737
734 ามค 736
735 ละอ 733
736 ระย 732
737 านส 732
738 _ดย 729
739 พระ 729
740 _ตส 727
741 _กต 726
742 านข 724
743 _เว 722
744 จะไ 720
745 กษ_ 719
746 วนก 719
747 งย_ 718
748 สอบ 718
749 _จก 717
750 _บผ 717
751 ผลก 717
752 งได 716
753 ารผ 716
754 _หน 714
755 คณะ 713
756 พร_ 713
757 _อต 712
758 _เล 712
759 าคว 712
760 ภาย 711
761 มด_ 710
762 ารม 709
763 ายท 708
764 แสด 708
765 องพ 706
766 งละ 705
767 ชาก 705
768 นภา 705
769 ลกา 705
770 _เม 703
771 นาก 703
772 _นบ 700
773 องไ 700
774 นไห 699
775 บาง 699
776 งหม 698
777 องข 697
778 งอย 696
779 มาต 695
780 ะห_ 695
781 รขอ 693
782 ากข 693
783 ตอบ 691
784 อาห 688
785 หมด 686
786 _เน 685
787 นอน 685
788 าหก 685
789 ธาน 682
790 าใช 682
791 _ห_ 679
792 าะห 679
793 ไหว 679
794 _ชน 678
795 ษา_ 678
796 กอง 677
797 ธนา 677
798 ายต 676
799 เสน 676
800 งแต 675
801 ราช 675
802 าไป 675
803 างร 674
804 วย_ 672
805 กรม 671
806 ทศไ 671
807 นเพ 670
808 นตร 669
809 เทพ 668
810 านม 664
811 ละม 663
812 เหม 662
813 งไร 660
814 _บา 658
815 งจะ 658
816 _มต 657
817 เต_ 656
818 ยนร 654
819 าเส 654
820 ภาษ 653
821 ลาก 653
822 นคร 651
823 ระธ 650
824 รแก 649
825 างด 648
826 มค_ 646
827 องห 646
828 าจ_ 646
829 _กว 645
830 ดว_ 645
831 กงา 644
832 นคน 643
833 นทร 642
834 _บต 641
835 ตสา 641
836 ะกร 641
837 กด_ 639
838 ณะท 639
839 ยแล 639
840 นจะ 637
841 วร_ 635
842 ในช 635
843 _มอ 633
844 กษต 632
845 รแล 632
846 ษตร 632
847 เกษ 631
848 แรก 630
849 _ยา 629
850 บผ_ 629
851 ตลา 628
852 อาก 627
853 กษณ 626
854 งเด 626
855 ยบา 626
856 _อบ 625
857 างม 625
858 เว_ 625
859 _ดห 624
860 นเท 624
861 _ดแ 623
862 มเค 622
863 ในอ 622
864 _งล 621
865 นจา 621
866 _ยร 620
867 ะธา 619
868 เรา 619
869 กก_ 617
870 ศไท 616
871 กใน 615
872 วมท 615
873 วยก 615
874 หาก 615
875 ณฑ_ 614
876 ราง 614
877 ายน 614
878 าได 614
879 ะได 613
880 รณา 612
881 วนใ 612
882 _กง 608
883 _อแ 607
884 ยาล 606
885 วดล 606
886 _บด 605
887 นาม 605
888 ายา 605
889 ายแ 605
890 จะต 604
891 ดขอ 604
892 บน_ 604
893 หาว 604
894 _ดร 603
895 ะเภ 603
896 _าพ 602
897 ะแน 602
898 _าบ 601
899 แวด 601
900 ะบ_ 598
901 ะร_ 597
902 _ยด 596
903 มปร 596
904 เภท 596
905 ยให 595
906 _ซ_ 593
907 _มก 592
908 _แน 592
909 านใ 592
910 าศา 592
911 _ดอ 591
912 ยชน 590
913 อนข 590
914 ายอ 590
915 แม_ 590
916 _กอ 588
917 มอง 585
918 กกว 583
919 บอ_ 582
920 _อจ 581
921 นหล 581
922 บส_ 580
923 พย_ 580
924 ยขอ 580
925 วมก 580
926 ปท_ 579
927 โยบ 579
928 _มท 578
929 าอ_ 578
930 มเส 577
931 ยาศ 577
932 รเม 577
933 าะส 577
934 กสา 576
935 ระโ 576
936 บรร 575
937 นบ_ 574
938 อต_ 574
939 ากจ 574
940 ปลง 572
941 วอย 572
942 มเป 571
943 มสา 570
944 ะสา 570
945 ายง 570
946 นสา 569
947 อกส 569
948 องโ 568
949 าคม 568
950 งผล 567
951 _บน 566
952 างต 566
953 _มช 565
954 _าา 565
955 ดน_ 565
956 งคร 563
957 างช 563
958 คน_ 562
959 อแก 562
960 แกน 562
961 _หร 561
962 นเช 561
963 รช_ 561
964 ารโ 561
965 วม_ 560
966 อนา 560
967 ดให 559
968 มว_ 559
969 ยาม 558
970 วยเ 558
971 ะโย 558
972 มคว 557
973 ามพ 557
974 งสถ 556
975 ลท_ 556
976 นาน 555
977 มขอ 555
978 วาง 555
979 อคว 555
980 ะคว 555
981 แปล 555
982 กมา 553
983 ฝาก 553
984 _นๆ 552
985 นใจ 551
986 อไป 551
987 างอ 551
988 _อด 550
989 _เช 550
990 งหว 548
991 _งผ 546
992 _ดใ 546
993 อนท 546
994 ะไม 545
995 แหน 544
996 นห_ 543
997 ยเฉ 543
998 ละน 543
999 ษณะ 543
1000 นไม 542
1001 รบ_ 541
1002 รอน 541
1003 าวะ 541
1004 รลง 540
1005 ยด_ 539
1006 าป_ 538
1007 แข_ 538
1008 _งว 537
1009 นอา 537
1010 าศ_ 537
1011 บก_ 536
1012 ะพ_ 536
1013 _มค 535
0 #
1 # Licensed to the Apache Software Foundation (ASF) under one or more
2 # contributor license agreements. See the NOTICE file distributed with
3 # this work for additional information regarding copyright ownership.
4 # The ASF licenses this file to You under the Apache License, Version 2.0
5 # (the "License"); you may not use this file except in compliance with
6 # the License. You may obtain a copy of the License at
7 #
8 # http://www.apache.org/licenses/LICENSE-2.0
9 #
10 # Unless required by applicable law or agreed to in writing, software
11 # distributed under the License is distributed on an "AS IS" BASIS,
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 # See the License for the specific language governing permissions and
14 # limitations under the License.
15 #
16 # This is a Tika LanguageIdentifier properties file.
17 # Its name is org/apache/tika/language/tika.language.properties
18 # You can override it by placing a copy on the classpath in a file called
19 # org/apache/tika/language/tika.language.override.properties
20
21 # List of languages for which there are <language>.ngp profiles
22 # If an ISO 639-1 two-letter code exists for the language, it should be used;
23 # if not, you can choose an ISO 639-2 three-letter code instead.
24 # See http://www.loc.gov/standards/iso639-2/php/code_list.php
25 languages=be,ca,da,de,eo,et,el,en,es,fi,fr,gl,hu,is,it,lt,nl,no,pl,pt,ro,ru,sk,sl,sv,th,uk
26
27 # List of language names in English
28 name.be=Belarusian
29 name.ca=Catalan
30 name.da=Danish
31 name.de=German
32 name.eo=Esperanto
33 name.et=Estonian
34 name.el=Greek
35 name.en=English
36 name.es=Spanish
37 name.fi=Finnish
38 name.fr=French
39 name.gl=Galician
40 name.hu=Hungarian
41 name.is=Icelandic
42 name.it=Italian
43 name.lt=Lithuanian
44 name.nl=Dutch
45 name.no=Norwegian
46 name.pl=Polish
47 name.pt=Portuguese
48 name.ro=Romanian
49 name.ru=Russian
50 name.sk=Slovakian
51 name.sl=Slovenian
52 name.sv=Swedish
53 name.th=Thai
54 name.uk=Ukrainian
0 # Licensed to the Apache Software Foundation (ASF) under one or more
1 # contributor license agreements. See the NOTICE file distributed with
2 # this work for additional information regarding copyright ownership.
3 # The ASF licenses this file to You under the Apache License, Version 2.0
4 # (the "License"); you may not use this file except in compliance with
5 # the License. You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 на_ 31005
15 _на 30413
16 _пр 27783
17 _по 25204
18 ого 22546
19 _і_ 22494
20 го_ 21091
21 _за 20327
22 ий_ 20183
23 _в_ 20089
24 их_ 20023
25 _ви 19862
26 _ро 19378
27 _та 19078
28 _ст 18927
29 ів_ 18762
30 _ко 18571
31 ня_ 18400
32 ння 17493
33 _ві 16666
34 _до 16607
35 та_ 16281
36 про 16163
37 ся_ 16121
38 _як 15972
39 _з_ 15953
40 _у_ 15901
41 ськ 15538
42 ні_ 14883
43 _th 14863
44 сто 14729
45 від 14181
46 ти_ 13996
47 the 13870
48 ії_ 13749
49 ом_ 13601
50 ми_ 13215
51 ста 12930
52 ере 12896
53 ть_ 12835
54 пер 12814
55 ою_ 12775
56 ува 12375
57 he_ 12322
58 енн 11280
59 _пі 11279
60 ої_ 11274
61 льн 11135
62 ку_ 10750
63 ка_ 10741
64 ван 10642
65 _бу 10602
66 ист 10393
67 аль 10192
68 _пе 10151
69 них 10037
70 ком 9970
71 іст 9955
72 му_ 9909
73 _що 9841
74 _не 9808
75 on_ 9775
76 er_ 9750
77 ний 9654
78 ько 9587
79 ати 9472
80 що_ 9445
81 ки_ 9293
82 анн 9289
83 ли_ 9275
84 ля_ 9265
85 олі 9258
86 ii_ 9168
87 им_ 9115
88 при 9070
89 но_ 8988
90 _мо 8963
91 ост 8878
92 ла_ 8858
93 іль 8796
94 _ма 8783
95 ті_ 8699
96 es_ 8469
97 ому 8239
98 тьс 8186
99 ься 8181
100 літ 8175
101 ала 8069
102 не_ 7970
103 до_ 7967
104 ій_ 7880
105 роз 7833
106 _of 7682
107 _мі 7638
108 кор 7525
109 _сп 7516
110 ion 7514
111 _ін 7488
112 _in 7412
113 of_ 7353
114 ови 7296
115 акт 7195
116 ова 7175
117 тор 7130
118 ика 7023
119 тол 6990
120 нов 6948
121 and 6892
122 ьки 6868
123 ори 6860
124 nd_ 6740
125 _су 6629
126 _co 6626
127 для 6620
128 ав_ 6588
129 ног 6576
130 оло 6571
131 _дл 6564
132 рис 6483
133 ним 6435
134 під 6426
135 _de 6391
136 ані 6362
137 _an 6356
138 ід_ 6269
139 рок 6196
140 сти 6181
141 ові 6129
142 тан 6125
143 так 6070
144 ико 6038
145 аці 6035
146 вер 6012
147 роб 5988
148 _ве 5978
149 аст 5953
150 кий 5945
151 ітт 5902
152 al_ 5899
153 ово 5897
154 лен 5834
155 ну_ 5815
156 сті 5800
157 _ma 5784
158 _ка 5754
159 tio 5750
160 али 5746
161 ють 5735
162 час 5723
163 ент 5675
164 ами 5660
165 тер 5657
166 _то 5635
167 ни_ 5596
168 стр 5553
169 in_ 5547
170 оро 5536
171 _ре 5524
172 _се 5512
173 _св 5506
174 кон 5476
175 _де 5474
176 _те 5445
177 _ба 5391
178 ці_ 5385
179 пов 5355
180 бул 5331
181 міс 5313
182 ter 5306
183 мен 5287
184 ття 5276
185 кла 5261
186 зна 5256
187 кти 5251
188 an_ 5221
189 тов 5161
190 род 5160
191 _га 5147
192 орі 5110
193 _па 5103
194 _ii 5092
195 вик 5087
196 ва_ 5054
197 ном 5052
198 она 5018
199 ту_ 4997
200 єть 4977
201 оди 4956
202 ред 4926
203 ati 4905
204 гал 4902
205 ing 4883
206 оку 4879
207 ng_ 4863
208 ког 4849
209 тя_ 4841
210 _ме 4826
211 ає_ 4813
212 тра 4809
213 пра 4768
214 вал 4763
215 нсь 4732
216 ір_ 4726
217 яки 4721
218 _re 4712
219 _си 4712
220 ло_ 4692
221 ції 4662
222 en_ 4628
223 ної 4628
224 _во 4627
225 is_ 4626
226 _а_ 4607
227 ах_ 4600
228 ник 4599
229 як_ 4574
230 омп 4565
231 ent 4564
232 гра 4562
233 _гр 4560
234 ани 4551
235 вор 4530
236 три 4524
237 біл 4514
238 за_ 4511
239 тво 4476
240 _ча 4467
241 тик 4454
242 але 4453
243 ри_ 4444
244 le_ 4443
245 алі 4442
246 _ал 4433
247 ими 4416
248 ват 4400
249 рав 4384
250 _ос 4373
251 дно 4373
252 ідн 4368
253 ичн 4342
254 сту 4312
255 ія_ 4301
256 рів 4280
257 лас 4276
258 кра 4273
259 _об 4249
260 лад 4248
261 ері 4207
262 ову 4195
263 одн 4169
264 мін 4123
265 _st 4116
266 ков 4101
267 us_ 4097
268 зір 4094
269 сть 4082
270 рим 4081
271 кі_ 4078
272 ако 4074
273 узі 4072
274 _i_ 4060
275 ed_ 4058
276 був 4018
277 том 4003
278 сер 3991
279 _бі 3985
280 _ра 3978
281 ано 3971
282 _wi 3956
283 _ї_ 3948
284 re_ 3943
285 ійн 3924
286 _рі 3910
287 ера 3908
288 ив_ 3902
289 нал 3886
290 спі 3885
291 лі_ 3870
292 ени 3856
293 ніс 3856
294 сте 3856
295 _ca 3843
296 чен 3841
297 суз 3836
298 _a_ 3826
299 ічн 3825
300 дов 3822
301 ia_ 3808
302 кол 3793
303 ато 3785
304 ків 3779
305 ких 3762
306 ств 3756
307 овн 3752
308 _тр 3751
309 _pa 3742
310 рал 3738
311 _од 3736
312 ран 3736
313 ров 3734
314 чно 3733
315 фор 3713
316 пол 3687
317 _to 3680
318 тів 3663
319 ра_ 3662
320 iii 3660
321 рен 3657
322 жен 3637
323 ана 3619
324 _li 3584
325 ити 3576
326 тив 3572
327 _но 3553
328 роц 3533
329 кож 3524
330 тро 3518
331 то_ 3513
332 йсь 3502
333 _це 3495
334 рит 3488
335 мож 3483
336 лак 3473
337 кої 3468
338 дин 3466
339 _pr 3463
340 ною 3462
341 ьно 3423
342 _ск 3421
343 de_ 3419
344 оді 3408
345 вол 3387
346 рам 3380
347 _йо 3369
348 тал 3365
349 вни 3364
350 рат 3363
351 _vi 3357
352 еле 3357
353 _mi 3354
354 орм 3343
355 _кр 3342
356 ок_ 3339
357 пис 3329
358 ром 3326
359 мер 3315
360 віт 3299
361 оні 3283
362 ві_ 3279
363 or_ 3277
364 ше_ 3249
365 _че 3234
366 _го 3214
367 _fo 3213
368 ійс 3195
369 ден 3189
370 лив 3184
371 ага 3181
372 _xv 3179
373 ув_ 3161
374 ьни 3155
375 _mo 3149
376 анд 3145
377 ne_ 3135
378 анс 3132
379 нач 3127
380 _se 3113
381 пор 3110
382 ини 3107
383 for 3105
384 _зн 3103
385 _s_ 3096
386 _са 3094
387 ект 3090
388 арт 3084
389 _be 3081
390 te_ 3074
391 ож_ 3073
392 _po 3068
393 вно 3067
394 вел 3058
395 рос 3058
396 ній 3057
397 чни 3054
398 вид 3046
399 _вс 3044
400 ду_ 3040
401 _бе 3038
402 ина 3038
403 _xi 3022
404 ію_ 3022
405 _є_ 3012
406 _so 3011
407 ce_ 3010
408 цій 3010
409 рот 3008
410 або 3003
411 льш 2994
412 мов 2970
413 мпа 2970
414 пос 2969
415 дом 2963
416 дан 2960
417 пів 2960
418 же_ 2946
419 ход 2945
420 оці 2937
421 сно 2922
422 nte 2921
423 ує_ 2914
424 тав 2911
425 ді_ 2891
426 сво 2874
427 _ch 2868
428 _аб 2868
429 ить 2859
430 ені 2856
431 оча 2855
432 _лі 2852
433 тич 2851
434 to_ 2843
435 _la 2839
436 вся 2832
437 ало 2824
438 vii 2814
439 ers 2812
440 асн 2809
441 ися 2808
442 тин 2806
443 тат 2799
444 _ан 2795
445 вих 2790
446 ерш 2789
447 лик 2778
448 st_ 2777
449 ант 2775
450 вав 2773
451 буд 2771
452 які 2769
453 _di 2766
454 ери 2763
455 нос 2760
456 ман 2759
457 кан 2757
458 ара 2749
459 лов 2746
460 _sa 2738
461 ry_ 2731
462 інш 2730
463 рон 2727
464 se_ 2717
465 ica 2710
466 льк 2709
467 nt_ 2708
468 най 2707
469 ver 2705
470 _да 2702
471 _me 2701
472 ено 2701
473 обл 2700
474 _ти 2699
475 _al 2689
476 йог 2685
477 xvi 2676
478 ели 2676
479 бо_ 2667
480 івн 2659
481 ic_ 2651
482 вні 2650
483 дже 2649
484 зап 2649
485 тем 2647
486 раї 2645
487 во_ 2643
488 ают 2642
489 поч 2636
490 тур 2636
491 _un 2635
492 рес 2634
493 ада 2630
494 мат 2621
495 lin 2620
496 бра 2613
497 _ar 2608
498 ила 2608
499 нні 2607
500 нці 2602
501 рог 2595
502 int 2589
503 et_ 2588
504 трі 2586
505 тис 2580
506 _ba 2575
507 ива 2570
508 опо 2566
509 ns_ 2563
510 оли 2561
511 или 2556
512 ода 2556
513 _no 2554
514 ate 2551
515 аїн 2549
516 ерс 2536
517 бер 2530
518 сві 2529
519 ерн 2527
520 ist 2518
521 нь_ 2511
522 ch_ 2494
523 оль 2494
524 ам_ 2491
525 ix_ 2488
526 ома 2484
527 ine 2474
528 рії 2471
529 _fr 2470
530 her 2467
531 _зв 2465
532 _ді 2461
533 отр 2461
534 tor 2452
535 рез 2448
536 _ні 2444
537 ізн 2443
538 раз 2442
539 _пл 2440
540 ро_ 2439
541 яко 2437
542 _ro 2433
543 ste 2428
544 ким 2423
545 лос 2420
546 вит 2418
547 вою 2416
548 _ва 2413
549 ньо 2408
550 рац 2404
551 одо 2403
552 _ав 2398
553 туп 2395
554 ін_ 2394
555 sta 2391
556 обо 2390
557 der 2388
558 оти 2388
559 ивн 2386
560 лис 2383
561 нів 2383
562 ind 2377
563 вій 2375
564 _чи 2374
565 оре 2372
566 чи_ 2371
567 el_ 2367
568 огр 2359
569 ена 2352
570 дос 2351
571 as_ 2350
572 _ди 2348
573 над 2347
574 наз 2347
575 _кі 2341
576 юва 2341
577 дни 2337
578 чер 2337
579 пре 2334
580 омо 2329
581 рі_ 2329
582 _кл 2324
583 _фо 2324
584 ьна 2314
585 пан 2306
586 лів 2304
587 ди_ 2303
588 нта 2302
589 _is 2300
590 єю_ 2300
591 ру_ 2299
592 рик 2298
593 _ne 2287
594 con 2287
595 eri 2287
596 оду 2286
597 _бо 2281
598 без 2274
599 ll_ 2271
600 ям_ 2263
601 ідо 2260
602 ест 2256
603 ма_ 2252
604 кою 2249
605 аме 2246
606 _со 2245
607 _le 2244
608 аєт 2243
609 мет 2243
610 він 2240
611 сь_ 2238
612 com 2234
613 вої 2233
614 пір 2233
615 res 2227
616 at_ 2226
617 нув 2223
618 ора 2223
619 che 2221
620 мал 2220
621 rs_ 2218
622 апи 2218
623 ле_ 2216
624 іра 2213
625 ill 2212
626 под 2211
627 art 2210
628 ики 2207
629 змі 2202
630 _na 2200
631 la_ 2197
632 th_ 2197
633 дні 2192
634 _бр 2178
635 ула 2174
636 ву_ 2171
637 нап 2169
638 зас 2164
639 um_ 2159
640 iv_ 2158
641 ото 2158
642 сис 2158
643 кіл 2154
644 lan 2149
645 _ел 2148
646 роп 2147
647 скл 2147
648 зро 2146
649 рол 2145
650 ge_ 2136
651 _te 2133
652 _ge 2132
653 ючи 2127
654 дер 2126
655 tra 2123
656 _he 2119
657 нен 2116
658 ням 2116
659 пар 2116
660 _da 2115
661 пла 2115
662 спо 2115
663 ією 2114
664 арі 2103
665 гор 2100
666 ем_ 2100
667 кри 2095
668 ькі 2095
669 вод 2089
670 лан 2086
671 дів 2073
672 _do 2071
673 _му 2070
674 елі 2069
675 all 2068
676 man 2066
677 рма 2066
678 _зм 2064
679 ts_ 2063
680 аро 2058
681 _ар 2057
682 уло 2057
683 су_ 2056
684 вип 2055
685 ta_ 2054
686 _sc 2052
687 ви_ 2047
688 гол 2047
689 und 2041
690 тар 2037
691 ей_ 2031
692 per 2025
693 ьог 2022
694 est 2020
695 іні 2014
696 кар 2012
697 іон 2012
698 омі 2010
699 ер_ 2009
700 вий 2007
701 сел 2003
702 _ук 2002
703 _pe 2001
704 лог 2001
705 екс 1994
706 озв 1990
707 ell 1988
708 ко_ 1988
709 існ 1986
710 _оп 1984
711 _фі 1984
712 вто 1983
713 _ор 1980
714 ино 1980
715 ive 1977
716 ава 1969
717 _su 1968
718 _ли 1968
719 пот 1967
720 ов_ 1966
721 _bo 1964
722 _si 1963
723 иці 1961
724 _ha 1960
725 ель 1960
726 _ге 1957
727 ене 1955
728 рем 1954
729 _із 1953
730 нім 1951
731 age 1949
732 ль_ 1949
733 дал 1947
734 ас_ 1945
735 str 1941
736 лек 1941
737 ran 1938
738 na_ 1929
739 ниц 1929
740 оно 1929
741 всь 1925
742 чин 1925
743 нте 1916
744 цен 1912
745 ька 1912
746 ніш 1908
747 ve_ 1905
748 ону 1901
749 уют 1900
750 вим 1898
751 авс 1897
752 nal 1896
753 pro 1893
754 пак 1892
755 еро 1889
756 вог 1884
757 ави 1883
758 зав 1883
759 ин_ 1881
760 зал 1877
761 me_ 1876
762 із_ 1876
763 ідт 1874
764 _gr 1873
765 ове 1873
766 піс 1870
767 ons 1868
768 win 1865
769 сі_ 1864
770 _tr 1863
771 _en 1862
772 иро 1862
773 уєт 1861
774 sto 1860
775 изн 1860
776 sch 1858
777 _ho 1857
778 ant 1855
779 иво 1855
780 ім_ 1855
781 ra_ 1853
782 де_ 1851
783 кам 1851
784 лін 1847
785 нав 1846
786 тті 1842
787 ess 1838
788 _зо 1837
789 різ 1835
790 укр 1833
791 ідп 1833
792 обр 1832
793 аче 1829
794 це_ 1829
795 хід 1823
796 тув 1822
797 men 1820
798 ty_ 1820
799 по_ 1820
800 _дв 1816
801 ari 1815
802 ших 1813
803 ад_ 1811
804 окр 1811
805 опе 1811
806 інн 1805
807 бли 1804
808 соб 1802
809 гру 1801
810 он_ 1801
811 _ім 1799
812 гат 1795
813 ита 1790
814 ian 1789
815 ули 1789
816 ипу 1788
817 орт 1787
818 _ри 1786
819 _й_ 1785
820 _ле 1785
821 оле 1784
822 ar_ 1781
823 вст 1780
824 цьк 1780
825 нтр 1779
826 сам 1779
827 йни 1778
828 вир 1777
829 нас 1777
830 зви 1776
831 om_ 1775
832 йно 1775
833 mic 1772
834 вле 1769
835 ор_ 1763
836 ect 1759
837 ona 1759
838 лиш 1758
839 _fi 1757
840 _зб 1757
841 вин 1757
842 rd_ 1755
843 uni 1754
844 авл 1754
845 ерт 1754
846 сни 1753
847 ss_ 1752
848 _іс 1750
849 ndo 1750
850 нут 1749
851 тва 1749
852 cha 1747
853 _гу 1746
854 тру 1743
855 уль 1743
856 аді 1741
857 mar 1734
858 ard 1733
859 era 1732
860 пом 1732
861 азв 1731
862 ern 1730
863 вис 1729
864 _on 1728
865 око 1726
866 ско 1724
867 ез_ 1722
868 озр 1721
869 тни 1720
870 дав 1718
871 ень 1718
872 дит 1714
873 ame 1711
874 ак_ 1710
875 лем 1710
876 _lo 1700
877 les 1698
878 ган 1697
879 икі 1695
880 нар 1691
881 nce 1690
882 ber 1689
883 ain 1683
884 edi 1680
885 авт 1673
886 tur 1672
887 _ку 1668
888 осл 1668
889 ord 1667
890 _ra 1663
891 вон 1660
892 ie_ 1659
893 має 1659
894 уча 1657
895 лон 1656
896 рту 1656
897 рив 1651
898 _фр 1647
899 rat 1646
900 ros 1645
901 нти 1640
902 нши 1638
903 има 1637
904 тим 1631
905 анц 1630
906 cor 1629
907 it_ 1629
908 rea 1629
909 _ту 1628
910 ric 1627
911 _wa 1626
912 ерв 1626
913 оме 1625
914 икл 1624
915 one 1623
916 вил 1623
917 пус 1623
918 ача 1614
919 лу_ 1613
920 нії 1613
921 nde 1611
922 зво 1610
923 мог 1609
924 поз 1609
925 яка 1609
926 ктн 1608
927 дна 1606
928 вла 1605
929 зі_ 1604
930 нан 1604
931 ути 1604
932 ито 1603
933 іме 1603
934 ях_ 1601
935 os_ 1599
936 ris 1599
937 _ja 1598
938 _фа 1598
939 обі 1597
940 між 1596
941 оби 1594
942 рад 1594
943 тна 1593
944 іка 1592
945 кат 1589
946 нст 1588
947 dow 1587
948 _as 1584
949 кці 1584
950 _др 1582
951 ряд 1580
952 рин 1579
953 _sh 1576
954 тог 1575
955 tic 1574
956 дат 1574
957 зов 1574
958 _вл 1572
959 амі 1571
960 _au 1570
961 апр 1569
962 баг 1568
963 рно 1567
964 ідк 1563
965 par 1561
966 ція 1561
967 _c_ 1560
968 бом 1560
969 об_ 1560
970 іал 1560
971 они 1557
972 пон 1552
973 авн 1551
974 vi_ 1550
975 ате 1550
976 чі_ 1550
977 ws_ 1549
978 шен 1546
979 _br 1545
980 бор 1540
981 осо 1539
982 мар 1538
983 _iv 1537
984 кал 1536
985 щен 1536
986 min 1535
987 реж 1533
988 інт 1532
989 рни 1531
990 nat 1530
991 _sp 1525
992 важ 1525
993 _дж 1524
994 льб 1523
995 _сл 1520
996 ліз 1520
997 ан_ 1519
998 оля 1519
999 тел 1519
1000 іта 1517
1001 онс 1515
1002 поп 1515
1003 іх_ 1514
1004 атк 1511
1005 ope 1509
1006 ема 1509
1007 ині 1509
1008 ожн 1509
1009 там 1509
1010 орг 1508
1011 his 1505
1012 бут 1505
1013 _x_ 1504
0 <?xml version="1.0" encoding="UTF-8"?>
1 <!--
2 Licensed to the Apache Software Foundation (ASF) under one or more
3 contributor license agreements. See the NOTICE file distributed with
4 this work for additional information regarding copyright ownership.
5 The ASF licenses this file to You under the Apache License, Version 2.0
6 (the "License"); you may not use this file except in compliance with
7 the License. You may obtain a copy of the License at
8
9 http://www.apache.org/licenses/LICENSE-2.0
10
11 Unless required by applicable law or agreed to in writing, software
12 distributed under the License is distributed on an "AS IS" BASIS,
13 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 See the License for the specific language governing permissions and
15 limitations under the License.
16 -->
17 <!--
18 Description: This xml file defines the valid mime types used by Tika.
19 The mime type data within this file is based on information from various
20 sources like Apache Nutch, Apache HTTP Server, the file(1) command, etc.
21 -->
22 <mime-info>
23
24 <mime-type type="application/activemessage"/>
25 <mime-type type="application/andrew-inset">
26 <glob pattern="*.ez"/>
27 </mime-type>
28 <mime-type type="application/applefile"/>
29 <mime-type type="application/applixware">
30 <glob pattern="*.aw"/>
31 </mime-type>
32
33 <mime-type type="application/atom+xml">
34 <root-XML localName="feed" namespaceURI="http://purl.org/atom/ns#"/>
35 <glob pattern="*.atom"/>
36 </mime-type>
37
38 <mime-type type="application/atomcat+xml">
39 <glob pattern="*.atomcat"/>
40 </mime-type>
41 <mime-type type="application/atomicmail"/>
42 <mime-type type="application/atomsvc+xml">
43 <glob pattern="*.atomsvc"/>
44 </mime-type>
45 <mime-type type="application/auth-policy+xml"/>
46 <mime-type type="application/batch-smtp"/>
47 <mime-type type="application/beep+xml"/>
48
49 <mime-type type="application/bizagi-modeler">
50 <_comment>BizAgi Process Modeler</_comment>
51 <sub-class-of type="application/zip"/>
52 <glob pattern="*.bpm"/>
53 </mime-type>
54
55 <mime-type type="application/cals-1840"/>
56 <mime-type type="application/ccxml+xml">
57 <glob pattern="*.ccxml"/>
58 </mime-type>
59 <mime-type type="application/cea-2018+xml"/>
60 <mime-type type="application/cellml+xml"/>
61 <mime-type type="application/cnrp+xml"/>
62 <mime-type type="application/commonground"/>
63 <mime-type type="application/conference-info+xml"/>
64 <mime-type type="application/cpl+xml"/>
65 <mime-type type="application/csta+xml"/>
66 <mime-type type="application/cstadata+xml"/>
67 <mime-type type="application/cu-seeme">
68 <glob pattern="*.cu"/>
69 </mime-type>
70 <mime-type type="application/cybercash"/>
71 <mime-type type="application/davmount+xml">
72 <glob pattern="*.davmount"/>
73 </mime-type>
74 <mime-type type="application/dca-rft"/>
75 <mime-type type="application/dec-dx"/>
76 <mime-type type="application/dialog-info+xml"/>
77 <mime-type type="application/dicom"/>
78
79 <mime-type type="application/dita+xml">
80 <sub-class-of type="application/xml"/>
81 <_comment>Darwin Information Typing Architecture</_comment>
82 </mime-type>
83
84 <mime-type type="application/dita+xml;format=map">
85 <sub-class-of type="application/dita+xml"/>
86 <_comment>DITA Map</_comment>
87 <root-XML localName="map"/>
88 <root-XML localName="map" namespaceURI="http://docs.oasis-open.org/namespace"/>
89 <glob pattern="*.ditamap"/>
90 </mime-type>
91 <mime-type type="application/dita+xml;format=topic">
92 <sub-class-of type="application/dita+xml"/>
93 <_comment>DITA Topic</_comment>
94 <root-XML localName="topic"/>
95 <root-XML localName="topic" namespaceURI="http://docs.oasis-open.org/namespace"/>
96 <!-- Topic is the default, Task and Concept are specialisations -->
97 <glob pattern="*.dita"/>
98 </mime-type>
99 <mime-type type="application/dita+xml;format=task">
100 <sub-class-of type="application/dita+xml;format=topic"/>
101 <_comment>DITA Task Topic</_comment>
102 <root-XML localName="task"/>
103 <root-XML localName="task" namespaceURI="http://docs.oasis-open.org/namespace"/>
104 </mime-type>
105 <mime-type type="application/dita+xml;format=concept">
106 <sub-class-of type="application/dita+xml;format=topic"/>
107 <_comment>DITA Concept Topic</_comment>
108 <root-XML localName="concept"/>
109 <root-XML localName="concept" namespaceURI="http://docs.oasis-open.org/namespace"/>
110 </mime-type>
111 <mime-type type="application/dita+xml;format=val">
112 <sub-class-of type="application/dita+xml"/>
113 <_comment>DITA Conditional Processing Profile</_comment>
114 <root-XML localName="val"/>
115 <root-XML localName="val" namespaceURI="http://docs.oasis-open.org/namespace"/>
116 <glob pattern="*.ditaval"/>
117 </mime-type>
118
119 <mime-type type="application/dns"/>
120 <mime-type type="application/dvcs"/>
121 <mime-type type="application/ecmascript">
122 <glob pattern="*.ecma"/>
123 </mime-type>
124 <mime-type type="application/edi-consent"/>
125 <mime-type type="application/edi-x12"/>
126 <mime-type type="application/edifact"/>
127 <mime-type type="application/emma+xml">
128 <glob pattern="*.emma"/>
129 </mime-type>
130 <mime-type type="application/epp+xml"/>
131
132 <mime-type type="application/epub+zip">
133 <acronym>EPUB</acronym>
134 <_comment>Electronic Publication</_comment>
135 <magic priority="50">
136 <match value="PK\003\004" type="string" offset="0">
137 <match value="mimetypeapplication/epub+zip" type="string" offset="30"/>
138 </match>
139 </magic>
140 <glob pattern="*.epub"/>
141 </mime-type>
142
143 <mime-type type="application/eshop"/>
144 <mime-type type="application/example"/>
145 <mime-type type="application/fastinfoset"/>
146 <mime-type type="application/fastsoap"/>
147
148 <mime-type type="application/fits">
149 <acronym>FITS</acronym>
150 <_comment>Flexible Image Transport System</_comment>
151 <magic priority="50">
152 <match value="SIMPLE = T" type="string" offset="0"/>
153 </magic>
154 <glob pattern="*.fits"/>
155 <glob pattern="*.fit"/>
156 <glob pattern="*.fts"/>
157 </mime-type>
158
159 <mime-type type="application/font-tdpfr">
160 <glob pattern="*.pfr"/>
161 </mime-type>
162 <mime-type type="application/h224"/>
163 <mime-type type="application/http"/>
164 <mime-type type="application/hyperstudio">
165 <glob pattern="*.stk"/>
166 </mime-type>
167 <mime-type type="application/ibe-key-request+xml"/>
168 <mime-type type="application/ibe-pkg-reply+xml"/>
169 <mime-type type="application/ibe-pp-data"/>
170 <mime-type type="application/iges"/>
171
172 <mime-type type="application/illustrator">
173 <acronym>AI</acronym>
174 <_comment>Adobe Illustrator Artwork</_comment>
175 <tika:link>http://en.wikipedia.org/wiki/Adobe_Illustrator_Artwork</tika:link>
176 <glob pattern="*.ai"/>
177 <sub-class-of type="application/postscript"/>
178 </mime-type>
179
180 <mime-type type="application/im-iscomposing+xml"/>
181 <mime-type type="application/index"/>
182 <mime-type type="application/index.cmd"/>
183 <mime-type type="application/index.obj"/>
184 <mime-type type="application/index.response"/>
185 <mime-type type="application/index.vnd"/>
186 <mime-type type="application/iotp"/>
187 <mime-type type="application/ipp"/>
188 <mime-type type="application/isup"/>
189
190 <mime-type type="application/java-archive">
191 <_comment>Java Archive</_comment>
192 <tika:link>http://en.wikipedia.org/wiki/.jar</tika:link>
193 <tika:uti>com.sun.java-archive</tika:uti>
194 <sub-class-of type="application/zip"/>
195 <glob pattern="*.jar"/>
196 </mime-type>
197
198 <mime-type type="application/vnd.android.package-archive">
199 <sub-class-of type="application/java-archive"/>
200 <glob pattern="*.apk"/>
201 </mime-type>
202 <mime-type type="application/x-tika-java-enterprise-archive">
203 <sub-class-of type="application/java-archive"/>
204 <glob pattern="*.ear"/>
205 </mime-type>
206 <mime-type type="application/x-tika-java-web-archive">
207 <sub-class-of type="application/java-archive"/>
208 <glob pattern="*.war"/>
209 </mime-type>
210
211 <mime-type type="application/x-tika-unix-dump"/>
212
213 <mime-type type="application/java-serialized-object">
214 <glob pattern="*.ser"/>
215 </mime-type>
216
217 <mime-type type="application/javascript">
218 <alias type="application/x-javascript"/>
219 <alias type="text/javascript"/>
220 <sub-class-of type="text/plain"/>
221 <_comment>JavaScript Source Code</_comment>
222 <glob pattern="*.js"/>
223 </mime-type>
224
225 <mime-type type="application/json">
226 <sub-class-of type="application/javascript"/>
227 <glob pattern="*.json"/>
228 </mime-type>
229
230 <mime-type type="application/java-vm">
231 <_comment>Java Class File</_comment>
232 <alias type="application/x-java-vm"/>
233 <alias type="application/x-java"/>
234 <magic priority="40">
235 <match value="0xcafebabe" type="string" offset="0" />
236 </magic>
237 <glob pattern="*.class"/>
238 </mime-type>
239
240 <mime-type type="application/x-java-jnilib">
241 <_comment>Java Native Library for OSX</_comment>
242 <magic priority="50">
243 <match value="0xcafebabe" type="string" offset="0">
244 <match value="0xfeedface" type="string" offset="4096"/>
245 </match>
246 </magic>
247 <glob pattern="*.jnilib"/>
248 </mime-type>
249
250 <mime-type type="application/kpml-request+xml"/>
251 <mime-type type="application/kpml-response+xml"/>
252 <mime-type type="application/lost+xml">
253 <glob pattern="*.lostxml"/>
254 </mime-type>
255
256 <mime-type type="application/mac-binhex40">
257 <alias type="application/mac-binhex"/>
258 <alias type="application/binhex"/>
259 <magic priority="50">
260 <match value="must\ be\ converted\ with\ BinHex" type="string" offset="11"/>
261 </magic>
262 <glob pattern="*.hqx"/>
263 </mime-type>
264
265 <mime-type type="application/mac-compactpro">
266 <glob pattern="*.cpt"/>
267 </mime-type>
268
269 <mime-type type="application/macwriteii"/>
270 <mime-type type="application/marc">
271 <glob pattern="*.mrc"/>
272 </mime-type>
273 <mime-type type="application/mathematica">
274 <glob pattern="*.ma"/>
275 <glob pattern="*.nb"/>
276 <glob pattern="*.mb"/>
277 </mime-type>
278 <mime-type type="application/mathml+xml">
279 <glob pattern="*.mathml"/>
280 </mime-type>
281 <mime-type type="application/mbms-associated-procedure-description+xml"/>
282 <mime-type type="application/mbms-deregister+xml"/>
283 <mime-type type="application/mbms-envelope+xml"/>
284 <mime-type type="application/mbms-msk+xml"/>
285 <mime-type type="application/mbms-msk-response+xml"/>
286 <mime-type type="application/mbms-protection-description+xml"/>
287 <mime-type type="application/mbms-reception-report+xml"/>
288 <mime-type type="application/mbms-register+xml"/>
289 <mime-type type="application/mbms-register-response+xml"/>
290 <mime-type type="application/mbms-user-service-description+xml"/>
291 <mime-type type="application/mbox">
292 <sub-class-of type="text/plain"/>
293 <glob pattern="*.mbox"/>
294 </mime-type>
295 <mime-type type="application/media_control+xml"/>
296 <mime-type type="application/mediaservercontrol+xml">
297 <glob pattern="*.mscml"/>
298 </mime-type>
299 <mime-type type="application/mikey"/>
300 <mime-type type="application/moss-keys"/>
301 <mime-type type="application/moss-signature"/>
302 <mime-type type="application/mosskey-data"/>
303 <mime-type type="application/mosskey-request"/>
304 <mime-type type="application/mp4">
305 <glob pattern="*.mp4s"/>
306 </mime-type>
307 <mime-type type="application/mpeg4-generic"/>
308 <mime-type type="application/mpeg4-iod"/>
309 <mime-type type="application/mpeg4-iod-xmt"/>
310
311 <!-- http://www.iana.org/assignments/media-types/application/msword -->
312 <mime-type type="application/msword">
313 <!-- Use DefaultDetector / org.apache.tika.parser.microsoft.POIFSContainerDetector for more reliable detection of OLE2 documents -->
314 <alias type="application/vnd.ms-word"/>
315 <_comment>Microsoft Word Document</_comment>
316 <tika:link>http://en.wikipedia.org/wiki/.doc</tika:link>
317 <tika:uti>com.microsoft.word.doc</tika:uti>
318 <magic priority="50">
319 <match value="Microsoft\ Word\ 6.0\ Document" type="string" offset="2080"/>
320 <match value="Documento\ Microsoft\ Word\ 6" type="string" offset="2080"/>
321 <match value="MSWordDoc" type="string" offset="2112"/>
322 <match value="0x31be0000" type="big32" offset="0"/>
323 <match value="PO^Q`" type="string" offset="0"/>
324 <match value="\376\067\0\043" type="string" offset="0"/>
325 <match value="\333\245-\0\0\0" type="string" offset="0"/>
326 <match value="\354\245\301" type="string" offset="512"/>
327 <match value="\320\317\021\340\241\261\032\341" type="string" offset="0"/>
328 <match value="\224\246\056" type="string" offset="0"/>
329 <match value="0xd0cf11e0a1b11ae1" type="string" offset="0:8">
330 <match value="W\x00o\x00r\x00d\x00D\x00o\x00c\x00u\x00m\x00e\x00n\x00t" type="string" offset="1152:4096" />
331 </match>
332 </magic>
333 <glob pattern="*.doc"/>
334 <glob pattern="*.dot"/>
335 <sub-class-of type="application/x-tika-msoffice"/>
336 </mime-type>
337
338 <mime-type type="application/msword2">
339 <!-- Pre-OLE2, not a subtype of application/x-tika-msoffice -->
340 <_comment>Microsoft Word 2 Document</_comment>
341 <magic priority="50">
342 <match value="0x9ba5" type="string" />
343 <match value="0xdba5" type="string" />
344 </magic>
345 </mime-type>
346 <mime-type type="application/msword5">
347 <!-- Pre-OLE2, not a subtype of application/x-tika-msoffice -->
348 <_comment>Microsoft Word 5 Document</_comment>
349 <magic priority="50">
350 <match value="0xfe37" type="string" />
351 </magic>
352 </mime-type>
353
354 <mime-type type="application/mxf">
355 <glob pattern="*.mxf"/>
356 </mime-type>
357 <mime-type type="application/nasdata"/>
358 <mime-type type="application/news-checkgroups"/>
359 <mime-type type="application/news-groupinfo"/>
360 <mime-type type="application/news-transmission"/>
361 <mime-type type="application/nss"/>
362 <mime-type type="application/ocsp-request"/>
363 <mime-type type="application/ocsp-response"/>
364
365 <mime-type type="application/octet-stream">
366 <magic priority="50">
367 <match value="#\ This\ is\ a\ shell\ archive" type="string" offset="10"/>
368 <match value="\037\036" type="string" offset="0"/>
369 <match value="017437" type="host16" offset="0"/>
370 <match value="0x1fff" type="host16" offset="0"/>
371 <match value="\377\037" type="string" offset="0"/>
372 <match value="0145405" type="host16" offset="0"/>
373 </magic>
374 <glob pattern="*.bin"/>
375 <glob pattern="*.dms"/>
376 <glob pattern="*.lha"/>
377 <glob pattern="*.lrf"/>
378 <glob pattern="*.lzh"/>
379 <glob pattern="*.so"/>
380 <glob pattern="*.iso"/>
381 <glob pattern="*.dmg"/>
382 <glob pattern="*.dist"/>
383 <glob pattern="*.distz"/>
384 <glob pattern="*.pkg"/>
385 <glob pattern="*.bpk"/>
386 <glob pattern="*.dump"/>
387 <glob pattern="*.elc"/>
388 <glob pattern="*.deploy"/>
389 </mime-type>
390
391 <mime-type type="application/oda">
392 <glob pattern="*.oda"/>
393 </mime-type>
394 <mime-type type="application/oebps-package+xml">
395 <glob pattern="*.opf"/>
396 </mime-type>
397
398 <mime-type type="application/ogg">
399 <alias type="application/x-ogg"/>
400 <magic priority="50">
401 <match value="OggS" type="string" offset="0"/>
402 </magic>
403 <glob pattern="*.ogx"/>
404 </mime-type>
405
406 <mime-type type="application/onenote">
407 <glob pattern="*.onetoc"/>
408 <glob pattern="*.onetoc2"/>
409 <glob pattern="*.onetmp"/>
410 <glob pattern="*.onepkg"/>
411 </mime-type>
412 <mime-type type="application/parityfec"/>
413 <mime-type type="application/patch-ops-error+xml">
414 <glob pattern="*.xer"/>
415 </mime-type>
416
417 <mime-type type="application/pdf">
418 <alias type="application/x-pdf"/>
419 <acronym>PDF</acronym>
420 <_comment>Portable Document Format</_comment>
421 <tika:link>http://en.wikipedia.org/wiki/PDF</tika:link>
422 <tika:link>http://www.adobe.com/devnet/pdf/pdf_reference_archive.html</tika:link>
423 <tika:uti>com.adobe.pdf</tika:uti>
424 <magic priority="50">
425 <match value="%PDF-" type="string" offset="0"/>
426 </magic>
427 <glob pattern="*.pdf"/>
428 </mime-type>
429
430 <mime-type type="application/pgp-encrypted">
431 <alias type="application/pgp"/>
432 <glob pattern="*.pgp"/>
433 </mime-type>
434
435 <mime-type type="application/pgp-keys"/>
436
437 <mime-type type="application/pgp-signature">
438 <glob pattern="*.asc"/>
439 <glob pattern="*.sig"/>
440 </mime-type>
441
442 <mime-type type="application/pics-rules">
443 <glob pattern="*.prf"/>
444 </mime-type>
445 <mime-type type="application/pidf+xml"/>
446 <mime-type type="application/pidf-diff+xml"/>
447 <mime-type type="application/pkcs10">
448 <glob pattern="*.p10"/>
449 </mime-type>
450
451 <mime-type type="application/pkcs7-mime">
452 <glob pattern="*.p7m"/>
453 <glob pattern="*.p7c"/>
454 </mime-type>
455
456 <mime-type type="application/pkcs7-signature">
457 <glob pattern="*.p7s"/>
458 <magic priority="50">
459 <match value="-----BEGIN PKCS7" type="string" offset="0"/>
460 <match value="0x3082FFFF06092a864886f70d0107FFa0" type="string"
461 mask="0xFFFF0000FFFFFFFFFFFFFFFFFFFF00FF" offset="0"/>
462 <match value="0x308006092a864886f70d0107FFa0" type="string"
463 mask="0xFFFFFFFFFFFFFFFFFFFFFFFF00FF" offset="0"/>
464 </magic>
465 </mime-type>
466
467 <mime-type type="application/pkix-cert">
468 <glob pattern="*.cer"/>
469 </mime-type>
470 <mime-type type="application/pkix-crl">
471 <glob pattern="*.crl"/>
472 </mime-type>
473 <mime-type type="application/pkix-pkipath">
474 <glob pattern="*.pkipath"/>
475 </mime-type>
476 <mime-type type="application/pkixcmp">
477 <glob pattern="*.pki"/>
478 </mime-type>
479 <mime-type type="application/pls+xml">
480 <glob pattern="*.pls"/>
481 </mime-type>
482 <mime-type type="application/poc-settings+xml"/>
483
484 <mime-type type="application/postscript">
485 <_comment>PostScript</_comment>
486 <magic priority="50">
487 <match value="%!" type="string" offset="0" />
488 <match value="\004%!" type="string" offset="0" />
489 <!-- Windows format EPS -->
490 <match value="0xc5d0d3c6" type="string" offset="0"/>
491 </magic>
492 <glob pattern="*.ps"/>
493 <glob pattern="*.eps"/>
494 <glob pattern="*.epsf"/>
495 <glob pattern="*.epsi"/>
496 </mime-type>
497
498 <mime-type type="application/prs.alvestrand.titrax-sheet"/>
499 <mime-type type="application/prs.cww">
500 <glob pattern="*.cww"/>
501 </mime-type>
502 <mime-type type="application/prs.nprend"/>
503 <mime-type type="application/prs.plucker"/>
504 <mime-type type="application/qsig"/>
505
506 <mime-type type="application/rdf+xml">
507 <root-XML localName="RDF"/>
508 <root-XML localName="RDF" namespaceURI="http://www.w3.org/1999/02/22-rdf-syntax-ns#"/>
509 <sub-class-of type="application/xml"/>
510 <acronym>RDF/XML</acronym>
511 <_comment>XML syntax for RDF graphs</_comment>
512 <glob pattern="*.rdf"/>
513 <glob pattern="*.owl"/>
514 <glob pattern="^rdf$" isregex="true"/>
515 <glob pattern="^owl$" isregex="true"/>
516 <glob pattern="*.xmp"/>
517 </mime-type>
518
519 <mime-type type="application/reginfo+xml">
520 <glob pattern="*.rif"/>
521 </mime-type>
522 <mime-type type="application/relax-ng-compact-syntax">
523 <sub-class-of type="text/plain"/>
524 <glob pattern="*.rnc"/>
525 </mime-type>
526 <mime-type type="application/remote-printing"/>
527 <mime-type type="application/resource-lists+xml">
528 <glob pattern="*.rl"/>
529 </mime-type>
530 <mime-type type="application/resource-lists-diff+xml">
531 <glob pattern="*.rld"/>
532 </mime-type>
533 <mime-type type="application/riscos"/>
534 <mime-type type="application/rlmi+xml"/>
535 <mime-type type="application/rls-services+xml">
536 <glob pattern="*.rs"/>
537 </mime-type>
538 <mime-type type="application/rsd+xml">
539 <glob pattern="*.rsd"/>
540 </mime-type>
541
542 <mime-type type="application/rss+xml">
543 <alias type="text/rss"/>
544 <root-XML localName="rss"/>
545 <root-XML namespaceURI="http://purl.org/rss/1.0/"/>
546 <glob pattern="*.rss"/>
547 </mime-type>
548
549 <mime-type type="application/rtf">
550 <_comment>Rich Text Format File</_comment>
551 <alias type="text/rtf"/>
552 <magic priority="50">
553 <match value="{\\rtf" type="string" offset="0"/>
554 </magic>
555 <glob pattern="*.rtf"/>
556 <sub-class-of type="text/plain"/>
557 </mime-type>
558
559 <mime-type type="application/rtx"/>
560 <mime-type type="application/samlassertion+xml"/>
561 <mime-type type="application/samlmetadata+xml"/>
562 <mime-type type="application/sbml+xml">
563 <glob pattern="*.sbml"/>
564 </mime-type>
565 <mime-type type="application/scvp-cv-request">
566 <glob pattern="*.scq"/>
567 </mime-type>
568 <mime-type type="application/scvp-cv-response">
569 <glob pattern="*.scs"/>
570 </mime-type>
571 <mime-type type="application/scvp-vp-request">
572 <glob pattern="*.spq"/>
573 </mime-type>
574 <mime-type type="application/scvp-vp-response">
575 <glob pattern="*.spp"/>
576 </mime-type>
577 <mime-type type="application/sdp">
578 <glob pattern="*.sdp"/>
579 </mime-type>
580 <mime-type type="application/set-payment"/>
581 <mime-type type="application/set-payment-initiation">
582 <glob pattern="*.setpay"/>
583 </mime-type>
584 <mime-type type="application/set-registration"/>
585 <mime-type type="application/set-registration-initiation">
586 <glob pattern="*.setreg"/>
587 </mime-type>
588 <mime-type type="application/sgml"/>
589 <mime-type type="application/sgml-open-catalog"/>
590 <mime-type type="application/shf+xml">
591 <glob pattern="*.shf"/>
592 </mime-type>
593 <mime-type type="application/sieve"/>
594 <mime-type type="application/simple-filter+xml"/>
595 <mime-type type="application/simple-message-summary"/>
596 <mime-type type="application/simplesymbolcontainer"/>
597 <mime-type type="application/slate"/>
598
599 <mime-type type="application/smil+xml">
600 <alias type="application/smil"/>
601 <_comment>SMIL Multimedia</_comment>
602 <glob pattern="*.smi"/>
603 <glob pattern="*.smil"/>
604 <glob pattern="*.sml"/>
605 </mime-type>
606
607 <mime-type type="application/soap+fastinfoset"/>
608 <mime-type type="application/soap+xml"/>
609
610 <mime-type type="application/sldworks">
611 <_comment>SolidWorks CAD program</_comment>
612 <glob pattern="*.sldprt" />
613 <glob pattern="*.sldasm" />
614 <glob pattern="*.slddrw" />
615 <sub-class-of type="application/x-tika-msoffice" />
616 </mime-type>
617
618 <mime-type type="application/sparql-query">
619 <glob pattern="*.rq"/>
620 </mime-type>
621 <mime-type type="application/sparql-results+xml">
622 <glob pattern="*.srx"/>
623 </mime-type>
624 <mime-type type="application/spirits-event+xml"/>
625 <mime-type type="application/srgs">
626 <glob pattern="*.gram"/>
627 </mime-type>
628 <mime-type type="application/srgs+xml">
629 <glob pattern="*.grxml"/>
630 </mime-type>
631 <mime-type type="application/ssml+xml">
632 <glob pattern="*.ssml"/>
633 </mime-type>
634 <mime-type type="application/timestamp-query"/>
635 <mime-type type="application/timestamp-reply"/>
636 <mime-type type="application/tve-trigger"/>
637 <mime-type type="application/ulpfec"/>
638 <mime-type type="application/vemmi"/>
639 <mime-type type="application/vividence.scriptfile"/>
640 <mime-type type="application/vnd.3gpp.bsf+xml"/>
641 <mime-type type="application/vnd.3gpp.pic-bw-large">
642 <glob pattern="*.plb"/>
643 </mime-type>
644 <mime-type type="application/vnd.3gpp.pic-bw-small">
645 <glob pattern="*.psb"/>
646 </mime-type>
647 <mime-type type="application/vnd.3gpp.pic-bw-var">
648 <glob pattern="*.pvb"/>
649 </mime-type>
650 <mime-type type="application/vnd.3gpp.sms"/>
651 <mime-type type="application/vnd.3gpp2.bcmcsinfo+xml"/>
652 <mime-type type="application/vnd.3gpp2.sms"/>
653 <mime-type type="application/vnd.3gpp2.tcap">
654 <glob pattern="*.tcap"/>
655 </mime-type>
656 <mime-type type="application/vnd.3m.post-it-notes">
657 <glob pattern="*.pwn"/>
658 </mime-type>
659 <mime-type type="application/vnd.accpac.simply.aso">
660 <glob pattern="*.aso"/>
661 </mime-type>
662 <mime-type type="application/vnd.accpac.simply.imp">
663 <glob pattern="*.imp"/>
664 </mime-type>
665 <mime-type type="application/vnd.acucobol">
666 <glob pattern="*.acu"/>
667 </mime-type>
668 <mime-type type="application/vnd.acucorp">
669 <glob pattern="*.atc"/>
670 <glob pattern="*.acutc"/>
671 </mime-type>
672 <mime-type type="application/vnd.adobe.air-application-installer-package+zip">
673 <glob pattern="*.air"/>
674 </mime-type>
675 <mime-type type="application/vnd.adobe.aftereffects.project">
676 <glob pattern="*.aep"/>
677 </mime-type>
678 <mime-type type="application/vnd.adobe.aftereffects.template">
679 <glob pattern="*.aet"/>
680 </mime-type>
681 <mime-type type="application/vnd.adobe.xdp+xml">
682 <glob pattern="*.xdp"/>
683 </mime-type>
684 <mime-type type="application/vnd.adobe.xfdf">
685 <glob pattern="*.xfdf"/>
686 </mime-type>
687 <mime-type type="application/vnd.aether.imp"/>
688 <mime-type type="application/vnd.airzip.filesecure.azf">
689 <glob pattern="*.azf"/>
690 </mime-type>
691 <mime-type type="application/vnd.airzip.filesecure.azs">
692 <glob pattern="*.azs"/>
693 </mime-type>
694 <mime-type type="application/vnd.amazon.ebook">
695 <glob pattern="*.azw"/>
696 </mime-type>
697 <mime-type type="application/vnd.americandynamics.acc">
698 <glob pattern="*.acc"/>
699 </mime-type>
700 <mime-type type="application/vnd.amiga.ami">
701 <glob pattern="*.ami"/>
702 </mime-type>
703 <mime-type type="application/vnd.anser-web-certificate-issue-initiation">
704 <glob pattern="*.cii"/>
705 </mime-type>
706 <mime-type type="application/vnd.anser-web-funds-transfer-initiation">
707 <glob pattern="*.fti"/>
708 </mime-type>
709 <mime-type type="application/vnd.antix.game-component">
710 <glob pattern="*.atx"/>
711 </mime-type>
712 <mime-type type="application/vnd.apple.installer+xml">
713 <glob pattern="*.mpkg"/>
714 </mime-type>
715
716 <mime-type type="application/vnd.apple.iwork">
717 <sub-class-of type="application/zip"/>
718 </mime-type>
719 <mime-type type="application/vnd.apple.keynote">
720 <root-XML localName="presentation" namespaceURI="http://developer.apple.com/namespaces/keynote2" />
721 <sub-class-of type="application/vnd.apple.iwork" />
722 <glob pattern="*.key"/>
723 </mime-type>
724 <mime-type type="application/vnd.apple.pages">
725 <root-XML localName="document" namespaceURI="http://developer.apple.com/namespaces/sl" />
726 <sub-class-of type="application/vnd.apple.iwork" />
727 <glob pattern="*.pages"/>
728 </mime-type>
729 <mime-type type="application/vnd.apple.numbers">
730 <root-XML localName="document" namespaceURI="http://developer.apple.com/namespaces/ls" />
731 <sub-class-of type="application/vnd.apple.iwork" />
732 <glob pattern="*.numbers"/>
733 </mime-type>
734 <mime-type type="application/x-tika-iworks-protected">
735 <sub-class-of type="application/vnd.apple.iwork" />
736 <_comment>Password Protected iWorks File</_comment>
737 </mime-type>
738
739 <mime-type type="application/vnd.arastra.swi">
740 <glob pattern="*.swi"/>
741 </mime-type>
742 <mime-type type="application/vnd.audiograph"/>
743 <mime-type type="application/vnd.autopackage"/>
744 <mime-type type="application/vnd.avistar+xml"/>
745 <mime-type type="application/vnd.blueice.multipass">
746 <glob pattern="*.mpm"/>
747 </mime-type>
748 <mime-type type="application/vnd.bluetooth.ep.oob"/>
749 <mime-type type="application/vnd.bmi">
750 <glob pattern="*.bmi"/>
751 </mime-type>
752 <mime-type type="application/vnd.businessobjects">
753 <glob pattern="*.rep"/>
754 </mime-type>
755 <mime-type type="application/vnd.cab-jscript"/>
756 <mime-type type="application/vnd.canon-cpdl"/>
757 <mime-type type="application/vnd.canon-lips"/>
758 <mime-type type="application/vnd.cendio.thinlinc.clientconf"/>
759 <mime-type type="application/vnd.chemdraw+xml">
760 <glob pattern="*.cdxml"/>
761 </mime-type>
762 <mime-type type="application/vnd.chipnuts.karaoke-mmd">
763 <glob pattern="*.mmd"/>
764 </mime-type>
765 <mime-type type="application/vnd.cinderella">
766 <glob pattern="*.cdy"/>
767 </mime-type>
768 <mime-type type="application/vnd.cirpack.isdn-ext"/>
769 <mime-type type="application/vnd.claymore">
770 <glob pattern="*.cla"/>
771 </mime-type>
772 <mime-type type="application/vnd.clonk.c4group">
773 <glob pattern="*.c4g"/>
774 <glob pattern="*.c4d"/>
775 <glob pattern="*.c4f"/>
776 <glob pattern="*.c4p"/>
777 <glob pattern="*.c4u"/>
778 </mime-type>
779 <mime-type type="application/vnd.commerce-battelle"/>
780 <mime-type type="application/vnd.commonspace">
781 <glob pattern="*.csp"/>
782 </mime-type>
783 <mime-type type="application/vnd.contact.cmsg">
784 <glob pattern="*.cdbcmsg"/>
785 </mime-type>
786 <mime-type type="application/vnd.cosmocaller">
787 <glob pattern="*.cmc"/>
788 </mime-type>
789 <mime-type type="application/vnd.crick.clicker">
790 <glob pattern="*.clkx"/>
791 </mime-type>
792 <mime-type type="application/vnd.crick.clicker.keyboard">
793 <glob pattern="*.clkk"/>
794 </mime-type>
795 <mime-type type="application/vnd.crick.clicker.palette">
796 <glob pattern="*.clkp"/>
797 </mime-type>
798 <mime-type type="application/vnd.crick.clicker.template">
799 <glob pattern="*.clkt"/>
800 </mime-type>
801 <mime-type type="application/vnd.crick.clicker.wordbank">
802 <glob pattern="*.clkw"/>
803 </mime-type>
804 <mime-type type="application/vnd.criticaltools.wbs+xml">
805 <glob pattern="*.wbs"/>
806 </mime-type>
807 <mime-type type="application/vnd.ctc-posml">
808 <glob pattern="*.pml"/>
809 </mime-type>
810 <mime-type type="application/vnd.ctct.ws+xml"/>
811 <mime-type type="application/vnd.cups-pdf"/>
812 <mime-type type="application/vnd.cups-postscript"/>
813 <mime-type type="application/vnd.cups-ppd">
814 <glob pattern="*.ppd"/>
815 </mime-type>
816 <mime-type type="application/vnd.cups-raster"/>
817 <mime-type type="application/vnd.cups-raw"/>
818 <mime-type type="application/vnd.curl.car">
819 <glob pattern="*.car"/>
820 </mime-type>
821 <mime-type type="application/vnd.curl.pcurl">
822 <glob pattern="*.pcurl"/>
823 </mime-type>
824 <mime-type type="application/vnd.cybank"/>
825 <mime-type type="application/vnd.data-vision.rdz">
826 <glob pattern="*.rdz"/>
827 </mime-type>
828 <mime-type type="application/vnd.denovo.fcselayout-link">
829 <glob pattern="*.fe_launch"/>
830 </mime-type>
831 <mime-type type="application/vnd.dir-bi.plate-dl-nosuffix"/>
832 <mime-type type="application/vnd.dna">
833 <glob pattern="*.dna"/>
834 </mime-type>
835 <mime-type type="application/vnd.dolby.mlp">
836 <glob pattern="*.mlp"/>
837 </mime-type>
838 <mime-type type="application/vnd.dolby.mobile.1"/>
839 <mime-type type="application/vnd.dolby.mobile.2"/>
840 <mime-type type="application/vnd.dpgraph">
841 <glob pattern="*.dpg"/>
842 </mime-type>
843 <mime-type type="application/vnd.dreamfactory">
844 <glob pattern="*.dfac"/>
845 </mime-type>
846 <mime-type type="application/vnd.dvb.esgcontainer"/>
847 <mime-type type="application/vnd.dvb.ipdcdftnotifaccess"/>
848 <mime-type type="application/vnd.dvb.ipdcesgaccess"/>
849 <mime-type type="application/vnd.dvb.ipdcroaming"/>
850 <mime-type type="application/vnd.dvb.iptv.alfec-base"/>
851 <mime-type type="application/vnd.dvb.iptv.alfec-enhancement"/>
852 <mime-type type="application/vnd.dvb.notif-aggregate-root+xml"/>
853 <mime-type type="application/vnd.dvb.notif-container+xml"/>
854 <mime-type type="application/vnd.dvb.notif-generic+xml"/>
855 <mime-type type="application/vnd.dvb.notif-ia-msglist+xml"/>
856 <mime-type type="application/vnd.dvb.notif-ia-registration-request+xml"/>
857 <mime-type type="application/vnd.dvb.notif-ia-registration-response+xml"/>
858 <mime-type type="application/vnd.dvb.notif-init+xml"/>
859 <mime-type type="application/vnd.dxr"/>
860 <mime-type type="application/vnd.dynageo">
861 <glob pattern="*.geo"/>
862 </mime-type>
863 <mime-type type="application/vnd.ecdis-update"/>
864 <mime-type type="application/vnd.ecowin.chart">
865 <glob pattern="*.mag"/>
866 </mime-type>
867 <mime-type type="application/vnd.ecowin.filerequest"/>
868 <mime-type type="application/vnd.ecowin.fileupdate"/>
869 <mime-type type="application/vnd.ecowin.series"/>
870 <mime-type type="application/vnd.ecowin.seriesrequest"/>
871 <mime-type type="application/vnd.ecowin.seriesupdate"/>
872 <mime-type type="application/vnd.emclient.accessrequest+xml"/>
873 <mime-type type="application/vnd.enliven">
874 <glob pattern="*.nml"/>
875 </mime-type>
876 <mime-type type="application/vnd.epson.esf">
877 <glob pattern="*.esf"/>
878 </mime-type>
879 <mime-type type="application/vnd.epson.msf">
880 <glob pattern="*.msf"/>
881 </mime-type>
882 <mime-type type="application/vnd.epson.quickanime">
883 <glob pattern="*.qam"/>
884 </mime-type>
885 <mime-type type="application/vnd.epson.salt">
886 <glob pattern="*.slt"/>
887 </mime-type>
888 <mime-type type="application/vnd.epson.ssf">
889 <glob pattern="*.ssf"/>
890 </mime-type>
891 <mime-type type="application/vnd.ericsson.quickcall"/>
892 <mime-type type="application/vnd.eszigno3+xml">
893 <glob pattern="*.es3"/>
894 <glob pattern="*.et3"/>
895 </mime-type>
896 <mime-type type="application/vnd.etsi.aoc+xml"/>
897 <mime-type type="application/vnd.etsi.cug+xml"/>
898 <mime-type type="application/vnd.etsi.iptvcommand+xml"/>
899 <mime-type type="application/vnd.etsi.iptvdiscovery+xml"/>
900 <mime-type type="application/vnd.etsi.iptvprofile+xml"/>
901 <mime-type type="application/vnd.etsi.iptvsad-bc+xml"/>
902 <mime-type type="application/vnd.etsi.iptvsad-cod+xml"/>
903 <mime-type type="application/vnd.etsi.iptvsad-npvr+xml"/>
904 <mime-type type="application/vnd.etsi.iptvueprofile+xml"/>
905 <mime-type type="application/vnd.etsi.mcid+xml"/>
906 <mime-type type="application/vnd.etsi.sci+xml"/>
907 <mime-type type="application/vnd.etsi.simservs+xml"/>
908 <mime-type type="application/vnd.eudora.data"/>
909 <mime-type type="application/vnd.ezpix-album">
910 <glob pattern="*.ez2"/>
911 </mime-type>
912 <mime-type type="application/vnd.ezpix-package">
913 <glob pattern="*.ez3"/>
914 </mime-type>
915 <mime-type type="application/vnd.f-secure.mobile"/>
916 <mime-type type="application/vnd.fdf">
917 <glob pattern="*.fdf"/>
918 </mime-type>
919 <mime-type type="application/vnd.fdsn.mseed">
920 <glob pattern="*.mseed"/>
921 </mime-type>
922 <mime-type type="application/vnd.fdsn.seed">
923 <glob pattern="*.seed"/>
924 <glob pattern="*.dataless"/>
925 </mime-type>
926 <mime-type type="application/vnd.ffsns"/>
927 <mime-type type="application/vnd.fints"/>
928 <mime-type type="application/vnd.flographit">
929 <glob pattern="*.gph"/>
930 </mime-type>
931 <mime-type type="application/vnd.fluxtime.clip">
932 <glob pattern="*.ftc"/>
933 </mime-type>
934 <mime-type type="application/vnd.font-fontforge-sfd"/>
935 <mime-type type="application/vnd.framemaker">
936 <glob pattern="*.fm"/>
937 <glob pattern="*.frame"/>
938 <glob pattern="*.maker"/>
939 <glob pattern="*.book"/>
940 </mime-type>
941 <mime-type type="application/vnd.frogans.fnc">
942 <glob pattern="*.fnc"/>
943 </mime-type>
944 <mime-type type="application/vnd.frogans.ltf">
945 <glob pattern="*.ltf"/>
946 </mime-type>
947 <mime-type type="application/vnd.fsc.weblaunch">
948 <glob pattern="*.fsc"/>
949 </mime-type>
950 <mime-type type="application/vnd.fujitsu.oasys">
951 <glob pattern="*.oas"/>
952 </mime-type>
953 <mime-type type="application/vnd.fujitsu.oasys2">
954 <glob pattern="*.oa2"/>
955 </mime-type>
956 <mime-type type="application/vnd.fujitsu.oasys3">
957 <glob pattern="*.oa3"/>
958 </mime-type>
959 <mime-type type="application/vnd.fujitsu.oasysgp">
960 <glob pattern="*.fg5"/>
961 </mime-type>
962 <mime-type type="application/vnd.fujitsu.oasysprs">
963 <glob pattern="*.bh2"/>
964 </mime-type>
965 <mime-type type="application/vnd.fujixerox.art-ex"/>
966 <mime-type type="application/vnd.fujixerox.art4"/>
967 <mime-type type="application/vnd.fujixerox.hbpl"/>
968 <mime-type type="application/vnd.fujixerox.ddd">
969 <glob pattern="*.ddd"/>
970 </mime-type>
971 <mime-type type="application/vnd.fujixerox.docuworks">
972 <glob pattern="*.xdw"/>
973 </mime-type>
974 <mime-type type="application/vnd.fujixerox.docuworks.binder">
975 <glob pattern="*.xbd"/>
976 </mime-type>
977 <mime-type type="application/vnd.fut-misnet"/>
978 <mime-type type="application/vnd.fuzzysheet">
979 <glob pattern="*.fzs"/>
980 </mime-type>
981 <mime-type type="application/vnd.genomatix.tuxedo">
982 <glob pattern="*.txd"/>
983 </mime-type>
984 <mime-type type="application/vnd.geogebra.file">
985 <glob pattern="*.ggb"/>
986 </mime-type>
987 <mime-type type="application/vnd.geogebra.tool">
988 <glob pattern="*.ggt"/>
989 </mime-type>
990 <mime-type type="application/vnd.geometry-explorer">
991 <glob pattern="*.gex"/>
992 <glob pattern="*.gre"/>
993 </mime-type>
994 <mime-type type="application/vnd.gmx">
995 <glob pattern="*.gmx"/>
996 </mime-type>
997
998 <mime-type type="application/vnd.google-earth.kml+xml">
999 <root-XML localName="kml"/>
1000 <root-XML namespaceURI="http://www.opengis.net/kml/2.2" localName="kml"/>
1001 <acronym>KML</acronym>
1002 <_comment>Keyhole Markup Language</_comment>
1003 <glob pattern="*.kml"/>
1004 <sub-class-of type="application/xml"/>
1005 </mime-type>
1006
1007 <mime-type type="application/vnd.google-earth.kmz">
1008 <sub-class-of type="application/zip"/>
1009 <glob pattern="*.kmz"/>
1010 </mime-type>
1011 <mime-type type="application/vnd.grafeq">
1012 <glob pattern="*.gqf"/>
1013 <glob pattern="*.gqs"/>
1014 </mime-type>
1015 <mime-type type="application/vnd.gridmp"/>
1016 <mime-type type="application/vnd.groove-account">
1017 <glob pattern="*.gac"/>
1018 </mime-type>
1019 <mime-type type="application/vnd.groove-help">
1020 <glob pattern="*.ghf"/>
1021 </mime-type>
1022 <mime-type type="application/vnd.groove-identity-message">
1023 <glob pattern="*.gim"/>
1024 </mime-type>
1025 <mime-type type="application/vnd.groove-injector">
1026 <glob pattern="*.grv"/>
1027 </mime-type>
1028 <mime-type type="application/vnd.groove-tool-message">
1029 <glob pattern="*.gtm"/>
1030 </mime-type>
1031 <mime-type type="application/vnd.groove-tool-template">
1032 <glob pattern="*.tpl"/>
1033 </mime-type>
1034 <mime-type type="application/vnd.groove-vcard">
1035 <glob pattern="*.vcg"/>
1036 </mime-type>
1037 <mime-type type="application/vnd.handheld-entertainment+xml">
1038 <glob pattern="*.zmm"/>
1039 </mime-type>
1040 <mime-type type="application/vnd.hbci">
1041 <glob pattern="*.hbci"/>
1042 </mime-type>
1043 <mime-type type="application/vnd.hcl-bireports"/>
1044 <mime-type type="application/vnd.hhe.lesson-player">
1045 <glob pattern="*.les"/>
1046 </mime-type>
1047 <mime-type type="application/vnd.hp-hpgl">
1048 <glob pattern="*.hpgl"/>
1049 </mime-type>
1050 <mime-type type="application/vnd.hp-hpid">
1051 <glob pattern="*.hpid"/>
1052 </mime-type>
1053 <mime-type type="application/vnd.hp-hps">
1054 <glob pattern="*.hps"/>
1055 </mime-type>
1056 <mime-type type="application/vnd.hp-jlyt">
1057 <glob pattern="*.jlt"/>
1058 </mime-type>
1059 <mime-type type="application/vnd.hp-pcl">
1060 <glob pattern="*.pcl"/>
1061 </mime-type>
1062 <mime-type type="application/vnd.hp-pclxl">
1063 <glob pattern="*.pclxl"/>
1064 </mime-type>
1065 <mime-type type="application/vnd.httphone"/>
1066 <mime-type type="application/vnd.hydrostatix.sof-data">
1067 <glob pattern="*.sfd-hdstx"/>
1068 </mime-type>
1069 <mime-type type="application/vnd.hzn-3d-crossword">
1070 <glob pattern="*.x3d"/>
1071 </mime-type>
1072 <mime-type type="application/vnd.ibm.afplinedata"/>
1073 <mime-type type="application/vnd.ibm.electronic-media"/>
1074 <mime-type type="application/vnd.ibm.minipay">
1075 <glob pattern="*.mpy"/>
1076 </mime-type>
1077 <mime-type type="application/vnd.ibm.modcap">
1078 <glob pattern="*.afp"/>
1079 <glob pattern="*.listafp"/>
1080 <glob pattern="*.list3820"/>
1081 </mime-type>
1082 <mime-type type="application/vnd.ibm.rights-management">
1083 <glob pattern="*.irm"/>
1084 </mime-type>
1085 <mime-type type="application/vnd.ibm.secure-container">
1086 <glob pattern="*.sc"/>
1087 </mime-type>
1088 <mime-type type="application/vnd.iccprofile">
1089 <glob pattern="*.icc"/>
1090 <glob pattern="*.icm"/>
1091 </mime-type>
1092 <mime-type type="application/vnd.igloader">
1093 <glob pattern="*.igl"/>
1094 </mime-type>
1095 <mime-type type="application/vnd.immervision-ivp">
1096 <glob pattern="*.ivp"/>
1097 </mime-type>
1098 <mime-type type="application/vnd.immervision-ivu">
1099 <glob pattern="*.ivu"/>
1100 </mime-type>
1101 <mime-type type="application/vnd.informedcontrol.rms+xml"/>
1102 <mime-type type="application/vnd.informix-visionary"/>
1103 <mime-type type="application/vnd.intercon.formnet">
1104 <glob pattern="*.xpw"/>
1105 <glob pattern="*.xpx"/>
1106 </mime-type>
1107 <mime-type type="application/vnd.intertrust.digibox"/>
1108 <mime-type type="application/vnd.intertrust.nncp"/>
1109 <mime-type type="application/vnd.intu.qbo">
1110 <glob pattern="*.qbo"/>
1111 </mime-type>
1112 <mime-type type="application/vnd.intu.qfx">
1113 <glob pattern="*.qfx"/>
1114 </mime-type>
1115 <mime-type type="application/vnd.iptc.g2.conceptitem+xml"/>
1116 <mime-type type="application/vnd.iptc.g2.knowledgeitem+xml"/>
1117 <mime-type type="application/vnd.iptc.g2.newsitem+xml"/>
1118 <mime-type type="application/vnd.iptc.g2.packageitem+xml"/>
1119 <mime-type type="application/vnd.ipunplugged.rcprofile">
1120 <glob pattern="*.rcprofile"/>
1121 </mime-type>
1122 <mime-type type="application/vnd.irepository.package+xml">
1123 <glob pattern="*.irp"/>
1124 </mime-type>
1125 <mime-type type="application/vnd.is-xpr">
1126 <glob pattern="*.xpr"/>
1127 </mime-type>
1128 <mime-type type="application/vnd.jam">
1129 <glob pattern="*.jam"/>
1130 </mime-type>
1131 <mime-type type="application/vnd.japannet-directory-service"/>
1132 <mime-type type="application/vnd.japannet-jpnstore-wakeup"/>
1133 <mime-type type="application/vnd.japannet-payment-wakeup"/>
1134 <mime-type type="application/vnd.japannet-registration"/>
1135 <mime-type type="application/vnd.japannet-registration-wakeup"/>
1136 <mime-type type="application/vnd.japannet-setstore-wakeup"/>
1137 <mime-type type="application/vnd.japannet-verification"/>
1138 <mime-type type="application/vnd.japannet-verification-wakeup"/>
1139 <mime-type type="application/vnd.jcp.javame.midlet-rms">
1140 <glob pattern="*.rms"/>
1141 </mime-type>
1142 <mime-type type="application/vnd.jisp">
1143 <glob pattern="*.jisp"/>
1144 </mime-type>
1145 <mime-type type="application/vnd.joost.joda-archive">
1146 <glob pattern="*.joda"/>
1147 </mime-type>
1148 <mime-type type="application/vnd.kahootz">
1149 <glob pattern="*.ktz"/>
1150 <glob pattern="*.ktr"/>
1151 </mime-type>
1152 <mime-type type="application/vnd.kde.karbon">
1153 <glob pattern="*.karbon"/>
1154 </mime-type>
1155
1156 <mime-type type="application/vnd.kde.kchart">
1157 <alias type="application/x-kchart"/>
1158 <_comment>KChart File</_comment>
1159 <glob pattern="*.chrt"/>
1160 </mime-type>
1161
1162 <mime-type type="application/vnd.kde.kformula">
1163 <glob pattern="*.kfo"/>
1164 </mime-type>
1165
1166 <mime-type type="application/vnd.kde.kivio">
1167 <glob pattern="*.flw"/>
1168 </mime-type>
1169
1170 <mime-type type="application/vnd.kde.kontour">
1171 <glob pattern="*.kon"/>
1172 </mime-type>
1173
1174 <mime-type type="application/vnd.kde.kpresenter">
1175 <alias type="application/x-kpresenter"/>
1176 <_comment>KPresenter File</_comment>
1177 <glob pattern="*.kpr"/>
1178 <glob pattern="*.kpt"/>
1179 </mime-type>
1180
1181 <mime-type type="application/vnd.kde.kspread">
1182 <alias type="application/x-kspread"/>
1183 <_comment>KSpread File</_comment>
1184 <glob pattern="*.ksp"/>
1185 </mime-type>
1186
1187 <mime-type type="application/vnd.kde.kword">
1188 <alias type="application/x-kword"/>
1189 <_comment>KWord File</_comment>
1190 <glob pattern="*.kwd"/>
1191 <glob pattern="*.kwt"/>
1192 </mime-type>
1193
1194 <mime-type type="application/vnd.kenameaapp">
1195 <glob pattern="*.htke"/>
1196 </mime-type>
1197 <mime-type type="application/vnd.kidspiration">
1198 <glob pattern="*.kia"/>
1199 </mime-type>
1200 <mime-type type="application/vnd.kinar">
1201 <glob pattern="*.kne"/>
1202 <glob pattern="*.knp"/>
1203 </mime-type>
1204 <mime-type type="application/vnd.koan">
1205 <alias type="application/x-koan"/>
1206 <_comment>SSEYO Koan File</_comment>
1207 <glob pattern="*.skp"/>
1208 <glob pattern="*.skd"/>
1209 <glob pattern="*.skt"/>
1210 <glob pattern="*.skm"/>
1211 </mime-type>
1212 <mime-type type="application/vnd.kodak-descriptor">
1213 <glob pattern="*.sse"/>
1214 </mime-type>
1215 <mime-type type="application/vnd.liberty-request+xml"/>
1216 <mime-type type="application/vnd.llamagraphics.life-balance.desktop">
1217 <glob pattern="*.lbd"/>
1218 </mime-type>
1219 <mime-type type="application/vnd.llamagraphics.life-balance.exchange+xml">
1220 <glob pattern="*.lbe"/>
1221 </mime-type>
1222 <mime-type type="application/vnd.lotus-1-2-3">
1223 <glob pattern="*.123"/>
1224 </mime-type>
1225 <mime-type type="application/vnd.lotus-approach">
1226 <glob pattern="*.apr"/>
1227 </mime-type>
1228 <mime-type type="application/vnd.lotus-freelance">
1229 <glob pattern="*.pre"/>
1230 </mime-type>
1231 <mime-type type="application/vnd.lotus-notes">
1232 <glob pattern="*.nsf"/>
1233 </mime-type>
1234 <mime-type type="application/vnd.lotus-organizer">
1235 <glob pattern="*.org"/>
1236 </mime-type>
1237
1238 <mime-type type="application/vnd.lotus-screencam">
1239 <!-- <glob pattern="*.scm"/> - conflicts with text/x-scheme -->
1240 </mime-type>
1241
1242 <mime-type type="application/vnd.lotus-wordpro">
1243 <magic priority="50">
1244 <match value="WordPro\0" type="string" offset="0" />
1245 <match value="WordPro\r\373" type="string" offset="0" />
1246 </magic>
1247 <glob pattern="*.lwp"/>
1248 </mime-type>
1249
1250 <mime-type type="application/vnd.macports.portpkg">
1251 <glob pattern="*.portpkg"/>
1252 </mime-type>
1253 <mime-type type="application/vnd.marlin.drm.actiontoken+xml"/>
1254 <mime-type type="application/vnd.marlin.drm.conftoken+xml"/>
1255 <mime-type type="application/vnd.marlin.drm.license+xml"/>
1256 <mime-type type="application/vnd.marlin.drm.mdcf"/>
1257 <mime-type type="application/vnd.mcd">
1258 <glob pattern="*.mcd"/>
1259 </mime-type>
1260 <mime-type type="application/vnd.medcalcdata">
1261 <glob pattern="*.mc1"/>
1262 </mime-type>
1263 <mime-type type="application/vnd.mediastation.cdkey">
1264 <glob pattern="*.cdkey"/>
1265 </mime-type>
1266 <mime-type type="application/vnd.meridian-slingshot"/>
1267 <mime-type type="application/vnd.mfer">
1268 <glob pattern="*.mwf"/>
1269 </mime-type>
1270 <mime-type type="application/vnd.mfmp">
1271 <glob pattern="*.mfm"/>
1272 </mime-type>
1273 <mime-type type="application/vnd.micrografx.flo">
1274 <glob pattern="*.flo"/>
1275 </mime-type>
1276 <mime-type type="application/vnd.micrografx.igx">
1277 <glob pattern="*.igx"/>
1278 </mime-type>
1279
1280 <mime-type type="application/vnd.mif">
1281 <_comment>FrameMaker Interchange Format</_comment>
1282 <alias type="application/x-mif"/>
1283 <alias type="application/x-frame"/>
1284 <magic priority="50">
1285 <match value="\&lt;MakerFile" type="string" offset="0" />
1286 <match value="\&lt;MIFFile" type="string" offset="0" />
1287 <match value="\&lt;MakerDictionary" type="string" offset="0" />
1288 <match value="\&lt;MakerScreenFont" type="string" offset="0" />
1289 <match value="\&lt;MML" type="string" offset="0" />
1290 <match value="\&lt;Book" type="string" offset="0" />
1291 <match value="\&lt;Maker" type="string" offset="0" />
1292 <match value="\x3c\x4d\x49\x46\x46\x69\x6c\x65\x20" type="string" offset="0" />
1293 </magic>
1294 <glob pattern="*.mif"/>
1295 </mime-type>
1296
1297 <mime-type type="application/vnd.mindjet.mindmanager">
1298 <_comment>MindManager</_comment>
1299 <sub-class-of type="application/zip"/>
1300 <glob pattern="*.mmp"/>
1301 <glob pattern="*.mmap"/>
1302 <glob pattern="*.mmpt"/>
1303 <glob pattern="*.mmat"/>
1304 <glob pattern="*.mmmp"/>
1305 <glob pattern="*.mmas"/>
1306 </mime-type>
1307
1308 <mime-type type="application/vnd.minisoft-hp3000-save"/>
1309 <mime-type type="application/vnd.mitsubishi.misty-guard.trustweb"/>
1310 <mime-type type="application/vnd.mobius.daf">
1311 <glob pattern="*.daf"/>
1312 </mime-type>
1313 <mime-type type="application/vnd.mobius.dis">
1314 <glob pattern="*.dis"/>
1315 </mime-type>
1316 <mime-type type="application/vnd.mobius.mbk">
1317 <glob pattern="*.mbk"/>
1318 </mime-type>
1319 <mime-type type="application/vnd.mobius.mqy">
1320 <glob pattern="*.mqy"/>
1321 </mime-type>
1322 <mime-type type="application/vnd.mobius.msl">
1323 <glob pattern="*.msl"/>
1324 </mime-type>
1325 <mime-type type="application/vnd.mobius.plc">
1326 <glob pattern="*.plc"/>
1327 </mime-type>
1328 <mime-type type="application/vnd.mobius.txf">
1329 <glob pattern="*.txf"/>
1330 </mime-type>
1331 <mime-type type="application/vnd.mophun.application">
1332 <glob pattern="*.mpn"/>
1333 </mime-type>
1334 <mime-type type="application/vnd.mophun.certificate">
1335 <glob pattern="*.mpc"/>
1336 </mime-type>
1337 <mime-type type="application/vnd.motorola.flexsuite"/>
1338 <mime-type type="application/vnd.motorola.flexsuite.adsi"/>
1339 <mime-type type="application/vnd.motorola.flexsuite.fis"/>
1340 <mime-type type="application/vnd.motorola.flexsuite.gotap"/>
1341 <mime-type type="application/vnd.motorola.flexsuite.kmr"/>
1342 <mime-type type="application/vnd.motorola.flexsuite.ttc"/>
1343 <mime-type type="application/vnd.motorola.flexsuite.wem"/>
1344 <mime-type type="application/vnd.motorola.iprm"/>
1345 <mime-type type="application/vnd.mozilla.xul+xml">
1346 <glob pattern="*.xul"/>
1347 </mime-type>
1348 <mime-type type="application/vnd.ms-artgalry">
1349 <glob pattern="*.cil"/>
1350 </mime-type>
1351 <mime-type type="application/vnd.ms-asf"/>
1352 <mime-type type="application/vnd.ms-cab-compressed">
1353 <glob pattern="*.cab"/>
1354 </mime-type>
1355
1356 <!-- http://www.iana.org/assignments/media-types/application/vnd.ms-excel -->
1357 <mime-type type="application/vnd.ms-excel">
1358 <!-- Use DefaultDetector / org.apache.tika.parser.microsoft.POIFSContainerDetector for more reliable detection of OLE2 documents -->
1359 <alias type="application/msexcel" />
1360 <_comment>Microsoft Excel Spreadsheet</_comment>
1361 <magic priority="50">
1362 <match value="Microsoft\ Excel\ 5.0\ Worksheet" type="string" offset="2080"/>
1363 <match value="Foglio\ di\ lavoro\ Microsoft\ Exce" type="string" offset="2080"/>
1364 <match value="Biff5" type="string" offset="2114"/>
1365 <match value="Biff5" type="string" offset="2121"/>
1366 <match value="\x09\x04\x06\x00\x00\x00\x10\x00" type="string" offset="0"/>
1367 <match value="0xd0cf11e0a1b11ae1" type="string" offset="0:8">
1368 <match value="W\x00o\x00r\x00k\x00b\x00o\x00o\x00k" type="string" offset="1152:4096" />
1369 </match>
1370 </magic>
1371 <glob pattern="*.xls"/>
1372 <glob pattern="*.xlm"/>
1373 <glob pattern="*.xla"/>
1374 <glob pattern="*.xlc"/>
1375 <glob pattern="*.xlt"/>
1376 <glob pattern="*.xlw"/>
1377 <glob pattern="*.xll"/>
1378 <glob pattern="*.xld"/>
1379 <sub-class-of type="application/x-tika-msoffice"/>
1380 </mime-type>
1381
1382 <mime-type type="application/vnd.ms-excel.addin.macroenabled.12">
1383 <_comment>Office Open XML Workbook Add-in (macro-enabled)</_comment>
1384 <glob pattern="*.xlam"/>
1385 <sub-class-of type="application/x-tika-ooxml"/>
1386 </mime-type>
1387
1388 <mime-type type="application/vnd.ms-excel.sheet.macroenabled.12">
1389 <_comment>Office Open XML Workbook (macro-enabled)</_comment>
1390 <glob pattern="*.xlsm"/>
1391 <sub-class-of type="application/x-tika-ooxml"/>
1392 </mime-type>
1393
1394 <mime-type type="application/vnd.ms-excel.sheet.binary.macroenabled.12">
1395 <_comment>Microsoft Excel 2007 Binary Spreadsheet</_comment>
1396 <glob pattern="*.xlsb"/>
1397 <sub-class-of type="application/x-tika-ooxml"/>
1398 </mime-type>
1399
1400 <mime-type type="application/vnd.ms-fontobject">
1401 <glob pattern="*.eot"/>
1402 </mime-type>
1403 <mime-type type="application/vnd.ms-htmlhelp">
1404 <glob pattern="*.chm"/>
1405 <magic priority="50">
1406 <match value="ITSF" type="string" offset="0"/>
1407 </magic>
1408 </mime-type>
1409 <mime-type type="application/vnd.ms-ims">
1410 <glob pattern="*.ims"/>
1411 </mime-type>
1412 <mime-type type="application/vnd.ms-lrm">
1413 <glob pattern="*.lrm"/>
1414 </mime-type>
1415
1416 <mime-type type="application/vnd.ms-outlook">
1417 <_comment>Microsoft Outlook Message</_comment>
1418 <glob pattern="*.msg" />
1419 <sub-class-of type="application/x-tika-msoffice"/>
1420 </mime-type>
1421
1422 <mime-type type="application/vnd.ms-pki.seccat">
1423 <glob pattern="*.cat"/>
1424 </mime-type>
1425 <mime-type type="application/vnd.ms-pki.stl">
1426 <glob pattern="*.stl"/>
1427 </mime-type>
1428 <mime-type type="application/vnd.ms-playready.initiator+xml"/>
1429
1430 <!-- http://www.iana.org/assignments/media-types/application/vnd.ms-powerpoint -->
1431 <mime-type type="application/vnd.ms-powerpoint">
1432 <!-- Use DefaultDetector / org.apache.tika.parser.microsoft.POIFSContainerDetector for more reliable detection of OLE2 documents -->
1433 <alias type="application/mspowerpoint"/>
1434 <_comment>Microsoft Powerpoint Presentation</_comment>
1435 <magic priority="50">
1436 <match value="0xd0cf11e0a1b11ae1" type="string" offset="0:8">
1437 <match value="P\x00o\x00w\x00e\x00r\x00P\x00o\x00i\x00n\x00t\x00 D\x00o\x00c\x00u\x00m\x00e\x00n\x00t" type="string" offset="1152:4096" />
1438 </match>
1439 </magic>
1440 <glob pattern="*.ppt"/>
1441 <glob pattern="*.ppz"/>
1442 <glob pattern="*.pps"/>
1443 <glob pattern="*.pot"/>
1444 <glob pattern="*.ppa"/>
1445 <sub-class-of type="application/x-tika-msoffice"/>
1446 </mime-type>
1447
1448 <mime-type type="application/vnd.ms-powerpoint.addin.macroenabled.12">
1449 <_comment>Office Open XML Presentation Add-in (macro-enabled)</_comment>
1450 <glob pattern="*.ppam"/>
1451 <sub-class-of type="application/x-tika-ooxml"/>
1452 </mime-type>
1453
1454 <mime-type type="application/vnd.ms-powerpoint.presentation.macroenabled.12">
1455 <_comment>Office Open XML Presentation (macro-enabled)</_comment>
1456 <glob pattern="*.pptm"/>
1457 <sub-class-of type="application/x-tika-ooxml"/>
1458 </mime-type>
1459
1460 <mime-type type="application/vnd.ms-powerpoint.slide.macroenabled.12">
1461 <glob pattern="*.sldm"/>
1462 <sub-class-of type="application/x-tika-ooxml"/>
1463 </mime-type>
1464
1465 <mime-type type="application/vnd.ms-powerpoint.slideshow.macroenabled.12">
1466 <_comment>Office Open XML Presentation Slideshow (macro-enabled)</_comment>
1467 <glob pattern="*.ppsm"/>
1468 <sub-class-of type="application/x-tika-ooxml"/>
1469 </mime-type>
1470
1471 <mime-type type="application/vnd.ms-powerpoint.template.macroenabled.12">
1472 <glob pattern="*.potm"/>
1473 <sub-class-of type="application/x-tika-ooxml"/>
1474 </mime-type>
1475
1476 <mime-type type="application/vnd.ms-project">
1477 <glob pattern="*.mpp"/>
1478 <glob pattern="*.mpt"/>
1479 <sub-class-of type="application/x-tika-msoffice"/>
1480 </mime-type>
1481
1482 <mime-type type="application/x-project">
1483 <glob pattern="*.mpx"/>
1484 <magic priority="50">
1485 <match value="MPX,Microsoft Project for Windows," type="string" offset="0"/>
1486 </magic>
1487 <sub-class-of type="text/plain"/>
1488 </mime-type>
1489
1490 <mime-type type="application/vnd.ms-tnef">
1491 <alias type="application/ms-tnef" />
1492 <magic priority="50">
1493 <match value="0x223e9f78" type="little32" offset="0" />
1494 </magic>
1495 </mime-type>
1496
1497 <mime-type type="application/vnd.ms-wmdrm.lic-chlg-req"/>
1498 <mime-type type="application/vnd.ms-wmdrm.lic-resp"/>
1499 <mime-type type="application/vnd.ms-wmdrm.meter-chlg-req"/>
1500 <mime-type type="application/vnd.ms-wmdrm.meter-resp"/>
1501
1502 <mime-type type="application/vnd.ms-word.document.macroenabled.12">
1503 <_comment>Office Open XML Document (macro-enabled)</_comment>
1504 <glob pattern="*.docm"/>
1505 <sub-class-of type="application/x-tika-ooxml"/>
1506 </mime-type>
1507
1508 <mime-type type="application/vnd.ms-word.template.macroenabled.12">
1509 <_comment>Office Open XML Document Template (macro-enabled)</_comment>
1510 <glob pattern="*.dotm"/>
1511 <sub-class-of type="application/x-tika-ooxml"/>
1512 </mime-type>
1513
1514 <mime-type type="application/vnd.ms-works">
1515 <magic priority="50">
1516 <match value="0xd0cf11e0a1b11ae1" type="string" offset="0:8">
1517 <match value="M\x00a\x00t\x00O\x00S\x00T" type="string" offset="1152:4096" />
1518 </match>
1519 </magic>
1520 <glob pattern="*.wps"/>
1521 <glob pattern="*.wks"/>
1522 <glob pattern="*.wcm"/>
1523 <glob pattern="*.wdb"/>
1524 <sub-class-of type="application/x-tika-msoffice"/>
1525 </mime-type>
1526
1527 <mime-type type="application/vnd.ms-wpl">
1528 <glob pattern="*.wpl"/>
1529 </mime-type>
1530 <mime-type type="application/vnd.ms-xpsdocument">
1531 <glob pattern="*.xps"/>
1532 <sub-class-of type="application/x-tika-ooxml"/>
1533 </mime-type>
1534 <mime-type type="application/vnd.mseq">
1535 <glob pattern="*.mseq"/>
1536 </mime-type>
1537 <mime-type type="application/vnd.msign"/>
1538 <mime-type type="application/vnd.multiad.creator"/>
1539 <mime-type type="application/vnd.multiad.creator.cif"/>
1540 <mime-type type="application/vnd.music-niff"/>
1541 <mime-type type="application/vnd.musician">
1542 <glob pattern="*.mus"/>
1543 </mime-type>
1544 <mime-type type="application/vnd.muvee.style">
1545 <glob pattern="*.msty"/>
1546 </mime-type>
1547 <mime-type type="application/vnd.ncd.control"/>
1548 <mime-type type="application/vnd.ncd.reference"/>
1549 <mime-type type="application/vnd.nervana"/>
1550 <mime-type type="application/vnd.netfpx"/>
1551 <mime-type type="application/vnd.neurolanguage.nlu">
1552 <glob pattern="*.nlu"/>
1553 </mime-type>
1554 <mime-type type="application/vnd.noblenet-directory">
1555 <glob pattern="*.nnd"/>
1556 </mime-type>
1557 <mime-type type="application/vnd.noblenet-sealer">
1558 <glob pattern="*.nns"/>
1559 </mime-type>
1560 <mime-type type="application/vnd.noblenet-web">
1561 <glob pattern="*.nnw"/>
1562 </mime-type>
1563 <mime-type type="application/vnd.nokia.catalogs"/>
1564 <mime-type type="application/vnd.nokia.conml+wbxml"/>
1565 <mime-type type="application/vnd.nokia.conml+xml"/>
1566 <mime-type type="application/vnd.nokia.isds-radio-presets"/>
1567 <mime-type type="application/vnd.nokia.iptv.config+xml"/>
1568 <mime-type type="application/vnd.nokia.landmark+wbxml"/>
1569 <mime-type type="application/vnd.nokia.landmark+xml"/>
1570 <mime-type type="application/vnd.nokia.landmarkcollection+xml"/>
1571 <mime-type type="application/vnd.nokia.n-gage.ac+xml"/>
1572 <mime-type type="application/vnd.nokia.n-gage.data">
1573 <glob pattern="*.ngdat"/>
1574 </mime-type>
1575 <mime-type type="application/vnd.nokia.n-gage.symbian.install">
1576 <glob pattern="*.n-gage"/>
1577 </mime-type>
1578 <mime-type type="application/vnd.nokia.ncd"/>
1579 <mime-type type="application/vnd.nokia.pcd+wbxml"/>
1580 <mime-type type="application/vnd.nokia.pcd+xml"/>
1581 <mime-type type="application/vnd.nokia.radio-preset">
1582 <glob pattern="*.rpst"/>
1583 </mime-type>
1584 <mime-type type="application/vnd.nokia.radio-presets">
1585 <glob pattern="*.rpss"/>
1586 </mime-type>
1587 <mime-type type="application/vnd.novadigm.edm">
1588 <glob pattern="*.edm"/>
1589 </mime-type>
1590 <mime-type type="application/vnd.novadigm.edx">
1591 <glob pattern="*.edx"/>
1592 </mime-type>
1593 <mime-type type="application/vnd.novadigm.ext">
1594 <glob pattern="*.ext"/>
1595 </mime-type>
1596
1597 <!-- =================================================================== -->
1598 <!-- Open Document Format for Office Applications (OpenDocument) v1.0 -->
1599 <!-- http://www.oasis-open.org/specs/index.php#opendocumentv1.0 -->
1600 <!-- =================================================================== -->
1601
1602 <mime-type type="application/vnd.oasis.opendocument.chart">
1603 <alias type="application/x-vnd.oasis.opendocument.chart"/>
1604 <_comment>OpenDocument v1.0: Chart document</_comment>
1605 <magic>
1606 <match type="string" offset="0" value="PK">
1607 <match type="string" offset="30"
1608 value="mimetypeapplication/vnd.oasis.opendocument.chart"/>
1609 </match>
1610 </magic>
1611 <glob pattern="*.odc"/>
1612 </mime-type>
1613
1614 <mime-type type="application/vnd.oasis.opendocument.chart-template">
1615 <alias type="application/x-vnd.oasis.opendocument.chart-template"/>
1616 <_comment>OpenDocument v1.0: Chart document used as template</_comment>
1617 <magic>
1618 <match type="string" offset="0" value="PK">
1619 <match type="string" offset="30"
1620 value="mimetypeapplication/vnd.oasis.opendocument.chart-template"/>
1621 </match>
1622 </magic>
1623 <glob pattern="*.otc"/>
1624 </mime-type>
1625
1626 <mime-type type="application/vnd.oasis.opendocument.database">
1627 <glob pattern="*.odb"/>
1628 </mime-type>
1629
1630 <mime-type type="application/vnd.oasis.opendocument.formula">
1631 <alias type="application/x-vnd.oasis.opendocument.formula"/>
1632 <_comment>OpenDocument v1.0: Formula document</_comment>
1633 <magic>
1634 <match type="string" offset="0" value="PK">
1635 <match type="string" offset="30"
1636 value="mimetypeapplication/vnd.oasis.opendocument.formula" />
1637 </match>
1638 </magic>
1639 <glob pattern="*.odf"/>
1640 <sub-class-of type="application/zip"/>
1641 </mime-type>
1642
1643 <mime-type type="application/vnd.oasis.opendocument.formula-template">
1644 <alias type="application/x-vnd.oasis.opendocument.formula-template"/>
1645 <_comment>OpenDocument v1.0: Formula document used as template</_comment>
1646 <magic>
1647 <match type="string" offset="0" value="PK">
1648 <match type="string" offset="30"
1649 value="mimetypeapplication/vnd.oasis.opendocument.formula-template"/>
1650 </match>
1651 </magic>
1652 <glob pattern="*.odft"/>
1653 </mime-type>
1654
1655 <mime-type type="application/vnd.oasis.opendocument.graphics">
1656 <alias type="application/x-vnd.oasis.opendocument.graphics"/>
1657 <_comment>OpenDocument v1.0: Graphics document (Drawing)</_comment>
1658 <magic>
1659 <match type="string" offset="0" value="PK">
1660 <match type="string" offset="30"
1661 value="mimetypeapplication/vnd.oasis.opendocument.graphics"/>
1662 </match>
1663 </magic>
1664 <glob pattern="*.odg"/>
1665 </mime-type>
1666
1667 <mime-type type="application/vnd.oasis.opendocument.graphics-template">
1668 <alias type="application/x-vnd.oasis.opendocument.graphics-template"/>
1669 <_comment>OpenDocument v1.0: Graphics document used as template</_comment>
1670 <magic>
1671 <match type="string" offset="0" value="PK">
1672 <match type="string" offset="30"
1673 value="mimetypeapplication/vnd.oasis.opendocument.graphics-template"/>
1674 </match>
1675 </magic>
1676 <glob pattern="*.otg"/>
1677 </mime-type>
1678
1679 <mime-type type="application/vnd.oasis.opendocument.image">
1680 <alias type="application/x-vnd.oasis.opendocument.image"/>
1681 <_comment>OpenDocument v1.0: Image document</_comment>
1682 <magic>
1683 <match type="string" offset="0" value="PK">
1684 <match type="string" offset="30"
1685 value="mimetypeapplication/vnd.oasis.opendocument.image"/>
1686 </match>
1687 </magic>
1688 <glob pattern="*.odi"/>
1689 </mime-type>
1690
1691 <mime-type type="application/vnd.oasis.opendocument.image-template">
1692 <alias type="application/x-vnd.oasis.opendocument.image-template"/>
1693 <_comment>OpenDocument v1.0: Image document used as template</_comment>
1694 <magic>
1695 <match type="string" offset="0" value="PK">
1696 <match type="string" offset="30"
1697 value="mimetypeapplication/vnd.oasis.opendocument.image-template"/>
1698 </match>
1699 </magic>
1700 <glob pattern="*.oti"/>
1701 </mime-type>
1702
1703 <mime-type type="application/vnd.oasis.opendocument.presentation">
1704 <alias type="application/x-vnd.oasis.opendocument.presentation"/>
1705 <_comment>OpenDocument v1.0: Presentation document</_comment>
1706 <magic>
1707 <match type="string" offset="0" value="PK">
1708 <match type="string" offset="30"
1709 value="mimetypeapplication/vnd.oasis.opendocument.presentation"/>
1710 </match>
1711 </magic>
1712 <glob pattern="*.odp"/>
1713 </mime-type>
1714
1715 <mime-type type="application/vnd.oasis.opendocument.presentation-template">
1716 <alias type="application/x-vnd.oasis.opendocument.presentation-template"/>
1717 <_comment>OpenDocument v1.0: Presentation document used as template</_comment>
1718 <magic>
1719 <match type="string" offset="0" value="PK">
1720 <match type="string" offset="30"
1721 value="mimetypeapplication/vnd.oasis.opendocument.presentation-template"/>
1722 </match>
1723 </magic>
1724 <glob pattern="*.otp"/>
1725 </mime-type>
1726
1727 <mime-type type="application/vnd.oasis.opendocument.spreadsheet">
1728 <alias type="application/x-vnd.oasis.opendocument.spreadsheet"/>
1729 <_comment>OpenDocument v1.0: Spreadsheet document</_comment>
1730 <magic>
1731 <match type="string" offset="0" value="PK">
1732 <match type="string" offset="30"
1733 value="mimetypeapplication/vnd.oasis.opendocument.spreadsheet"/>
1734 </match>
1735 </magic>
1736 <glob pattern="*.ods"/>
1737 </mime-type>
1738
1739 <mime-type type="application/vnd.oasis.opendocument.spreadsheet-template">
1740 <alias type="application/x-vnd.oasis.opendocument.spreadsheet-template"/>
1741 <_comment>OpenDocument v1.0: Spreadsheet document used as template</_comment>
1742 <magic>
1743 <match type="string" offset="0" value="PK">
1744 <match type="string" offset="30"
1745 value="mimetypeapplication/vnd.oasis.opendocument.spreadsheet-template"/>
1746 </match>
1747 </magic>
1748 <glob pattern="*.ots"/>
1749 </mime-type>
1750
1751 <mime-type type="application/vnd.oasis.opendocument.text">
1752 <alias type="application/x-vnd.oasis.opendocument.text"/>
1753 <_comment>OpenDocument v1.0: Text document</_comment>
1754 <magic>
1755 <match type="string" offset="0" value="PK">
1756 <match type="string" offset="30"
1757 value="mimetypeapplication/vnd.oasis.opendocument.text"/>
1758 </match>
1759 </magic>
1760 <glob pattern="*.odt"/>
1761 </mime-type>
1762
1763 <mime-type type="application/vnd.oasis.opendocument.text-master">
1764 <alias type="application/x-vnd.oasis.opendocument.text-master"/>
1765 <_comment>OpenDocument v1.0: Global Text document</_comment>
1766 <magic>
1767 <match type="string" offset="0" value="PK">
1768 <match type="string" offset="30"
1769 value="mimetypeapplication/vnd.oasis.opendocument.text-master"/>
1770 </match>
1771 </magic>
1772 <glob pattern="*.otm"/>
1773 </mime-type>
1774
1775 <mime-type type="application/vnd.oasis.opendocument.text-template">
1776 <alias type="application/x-vnd.oasis.opendocument.text-template"/>
1777 <_comment>OpenDocument v1.0: Text document used as template</_comment>
1778 <magic>
1779 <match type="string" offset="0" value="PK">
1780 <match type="string" offset="30"
1781 value="mimetypeapplication/vnd.oasis.opendocument.text-template"/>
1782 </match>
1783 </magic>
1784 <glob pattern="*.ott"/>
1785 </mime-type>
1786
1787 <mime-type type="application/vnd.oasis.opendocument.text-web">
1788 <alias type="application/x-vnd.oasis.opendocument.text-web"/>
1789 <_comment>OpenDocument v1.0: Text document used as template for HTML documents</_comment>
1790 <magic>
1791 <match type="string" offset="0" value="PK">
1792 <match type="string" offset="30"
1793 value="mimetypeapplication/vnd.oasis.opendocument.text-web"/>
1794 </match>
1795 </magic>
1796 <glob pattern="*.oth"/>
1797 </mime-type>
1798
1799 <mime-type type="application/vnd.obn"/>
1800 <mime-type type="application/vnd.olpc-sugar">
1801 <glob pattern="*.xo"/>
1802 </mime-type>
1803 <mime-type type="application/vnd.oma-scws-config"/>
1804 <mime-type type="application/vnd.oma-scws-http-request"/>
1805 <mime-type type="application/vnd.oma-scws-http-response"/>
1806 <mime-type type="application/vnd.oma.bcast.associated-procedure-parameter+xml"/>
1807 <mime-type type="application/vnd.oma.bcast.drm-trigger+xml"/>
1808 <mime-type type="application/vnd.oma.bcast.imd+xml"/>
1809 <mime-type type="application/vnd.oma.bcast.ltkm"/>
1810 <mime-type type="application/vnd.oma.bcast.notification+xml"/>
1811 <mime-type type="application/vnd.oma.bcast.provisioningtrigger"/>
1812 <mime-type type="application/vnd.oma.bcast.sgboot"/>
1813 <mime-type type="application/vnd.oma.bcast.sgdd+xml"/>
1814 <mime-type type="application/vnd.oma.bcast.sgdu"/>
1815 <mime-type type="application/vnd.oma.bcast.simple-symbol-container"/>
1816 <mime-type type="application/vnd.oma.bcast.smartcard-trigger+xml"/>
1817 <mime-type type="application/vnd.oma.bcast.sprov+xml"/>
1818 <mime-type type="application/vnd.oma.bcast.stkm"/>
1819 <mime-type type="application/vnd.oma.dcd"/>
1820 <mime-type type="application/vnd.oma.dcdc"/>
1821 <mime-type type="application/vnd.oma.dd2+xml">
1822 <glob pattern="*.dd2"/>
1823 </mime-type>
1824 <mime-type type="application/vnd.oma.drm.risd+xml"/>
1825 <mime-type type="application/vnd.oma.group-usage-list+xml"/>
1826 <mime-type type="application/vnd.oma.poc.detailed-progress-report+xml"/>
1827 <mime-type type="application/vnd.oma.poc.final-report+xml"/>
1828 <mime-type type="application/vnd.oma.poc.groups+xml"/>
1829 <mime-type type="application/vnd.oma.poc.invocation-descriptor+xml"/>
1830 <mime-type type="application/vnd.oma.poc.optimized-progress-report+xml"/>
1831 <mime-type type="application/vnd.oma.xcap-directory+xml"/>
1832 <mime-type type="application/vnd.omads-email+xml"/>
1833 <mime-type type="application/vnd.omads-file+xml"/>
1834 <mime-type type="application/vnd.omads-folder+xml"/>
1835 <mime-type type="application/vnd.omaloc-supl-init"/>
1836
1837 <mime-type type="application/vnd.openofficeorg.extension">
1838 <glob pattern="*.oxt"/>
1839 </mime-type>
1840
1841 <mime-type type="application/vnd.openxmlformats-officedocument.presentationml.presentation">
1842 <_comment>Office Open XML Presentation</_comment>
1843 <glob pattern="*.pptx"/>
1844 <glob pattern="*.thmx"/>
1845 <sub-class-of type="application/x-tika-ooxml"/>
1846 </mime-type>
1847
1848 <mime-type type="application/vnd.openxmlformats-officedocument.presentationml.slide">
1849 <glob pattern="*.sldx"/>
1850 <sub-class-of type="application/x-tika-ooxml"/>
1851 </mime-type>
1852
1853 <mime-type type="application/vnd.openxmlformats-officedocument.presentationml.template">
1854 <_comment>Office Open XML Presentation Template</_comment>
1855 <glob pattern="*.potx"/>
1856 <sub-class-of type="application/x-tika-ooxml"/>
1857 </mime-type>
1858
1859 <mime-type type="application/vnd.openxmlformats-officedocument.presentationml.slideshow">
1860 <_comment>Office Open XML Presentation Slideshow</_comment>
1861 <glob pattern="*.ppsx"/>
1862 <sub-class-of type="application/x-tika-ooxml"/>
1863 </mime-type>
1864
1865 <mime-type type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet">
1866 <_comment>Office Open XML Workbook</_comment>
1867 <glob pattern="*.xlsx"/>
1868 <sub-class-of type="application/x-tika-ooxml"/>
1869 </mime-type>
1870
1871 <mime-type type="application/vnd.openxmlformats-officedocument.spreadsheetml.template">
1872 <_comment>Office Open XML Workbook Template</_comment>
1873 <glob pattern="*.xltx"/>
1874 <sub-class-of type="application/x-tika-ooxml"/>
1875 </mime-type>
1876
1877 <mime-type type="application/vnd.ms-excel.template.macroenabled.12">
1878 <_comment>Office Open XML Workbook Template (macro-enabled)</_comment>
1879 <glob pattern="*.xltm"/>
1880 <sub-class-of type="application/x-tika-ooxml"/>
1881 </mime-type>
1882
1883 <mime-type type="application/vnd.openxmlformats-officedocument.wordprocessingml.document">
1884 <_comment>Office Open XML Document</_comment>
1885 <glob pattern="*.docx"/>
1886 <sub-class-of type="application/x-tika-ooxml"/>
1887 </mime-type>
1888
1889 <mime-type type="application/vnd.openxmlformats-officedocument.wordprocessingml.template">
1890 <_comment>Office Open XML Document Template</_comment>
1891 <glob pattern="*.dotx"/>
1892 <sub-class-of type="application/x-tika-ooxml"/>
1893 </mime-type>
1894
1895 <mime-type type="application/vnd.osa.netdeploy"/>
1896 <mime-type type="application/vnd.osgi.bundle"/>
1897 <mime-type type="application/vnd.osgi.dp">
1898 <glob pattern="*.dp"/>
1899 </mime-type>
1900 <mime-type type="application/vnd.otps.ct-kip+xml"/>
1901
1902 <mime-type type="application/vnd.palm">
1903 <!-- <glob pattern="*.pdb"/> - conflicts with chemical/x-pdb -->
1904 <glob pattern="*.pqa"/>
1905 <glob pattern="*.oprc"/>
1906 </mime-type>
1907
1908 <mime-type type="application/vnd.paos.xml"/>
1909 <mime-type type="application/vnd.pg.format">
1910 <glob pattern="*.str"/>
1911 </mime-type>
1912 <mime-type type="application/vnd.pg.osasli">
1913 <glob pattern="*.ei6"/>
1914 </mime-type>
1915 <mime-type type="application/vnd.piaccess.application-licence"/>
1916 <mime-type type="application/vnd.picsel">
1917 <glob pattern="*.efif"/>
1918 </mime-type>
1919 <mime-type type="application/vnd.poc.group-advertisement+xml"/>
1920 <mime-type type="application/vnd.pocketlearn">
1921 <glob pattern="*.plf"/>
1922 </mime-type>
1923 <mime-type type="application/vnd.powerbuilder6">
1924 <glob pattern="*.pbd"/>
1925 </mime-type>
1926 <mime-type type="application/vnd.powerbuilder6-s"/>
1927 <mime-type type="application/vnd.powerbuilder7"/>
1928 <mime-type type="application/vnd.powerbuilder7-s"/>
1929 <mime-type type="application/vnd.powerbuilder75"/>
1930 <mime-type type="application/vnd.powerbuilder75-s"/>
1931 <mime-type type="application/vnd.preminet"/>
1932 <mime-type type="application/vnd.previewsystems.box">
1933 <glob pattern="*.box"/>
1934 </mime-type>
1935 <mime-type type="application/vnd.proteus.magazine">
1936 <glob pattern="*.mgz"/>
1937 </mime-type>
1938 <mime-type type="application/vnd.publishare-delta-tree">
1939 <glob pattern="*.qps"/>
1940 </mime-type>
1941 <mime-type type="application/vnd.pvi.ptid1">
1942 <glob pattern="*.ptid"/>
1943 </mime-type>
1944 <mime-type type="application/vnd.pwg-multiplexed"/>
1945 <mime-type type="application/vnd.pwg-xhtml-print+xml"/>
1946 <mime-type type="application/vnd.qualcomm.brew-app-res"/>
1947 <mime-type type="application/vnd.quark.quarkxpress">
1948 <glob pattern="*.qxd"/>
1949 <glob pattern="*.qxt"/>
1950 <glob pattern="*.qwd"/>
1951 <glob pattern="*.qwt"/>
1952 <glob pattern="*.qxl"/>
1953 <glob pattern="*.qxb"/>
1954 </mime-type>
1955 <mime-type type="application/vnd.rapid"/>
1956 <mime-type type="application/vnd.recordare.musicxml">
1957 <glob pattern="*.mxl"/>
1958 </mime-type>
1959 <mime-type type="application/vnd.recordare.musicxml+xml">
1960 <glob pattern="*.musicxml"/>
1961 </mime-type>
1962 <mime-type type="application/vnd.renlearn.rlprint"/>
1963 <mime-type type="application/vnd.rim.cod">
1964 <glob pattern="*.cod"/>
1965 </mime-type>
1966
1967 <mime-type type="application/vnd.rn-realmedia">
1968 <magic priority="50">
1969 <match value=".RMF" type="string" offset="0" />
1970 </magic>
1971 <glob pattern="*.rm"/>
1972 </mime-type>
1973
1974 <mime-type type="application/vnd.route66.link66+xml">
1975 <glob pattern="*.link66"/>
1976 </mime-type>
1977 <mime-type type="application/vnd.ruckus.download"/>
1978 <mime-type type="application/vnd.s3sms"/>
1979 <mime-type type="application/vnd.sbm.cid"/>
1980 <mime-type type="application/vnd.sbm.mid2"/>
1981 <mime-type type="application/vnd.scribus"/>
1982 <mime-type type="application/vnd.sealed.3df"/>
1983 <mime-type type="application/vnd.sealed.csf"/>
1984 <mime-type type="application/vnd.sealed.doc"/>
1985 <mime-type type="application/vnd.sealed.eml"/>
1986 <mime-type type="application/vnd.sealed.mht"/>
1987 <mime-type type="application/vnd.sealed.net"/>
1988 <mime-type type="application/vnd.sealed.ppt"/>
1989 <mime-type type="application/vnd.sealed.tiff"/>
1990 <mime-type type="application/vnd.sealed.xls"/>
1991 <mime-type type="application/vnd.sealedmedia.softseal.html"/>
1992 <mime-type type="application/vnd.sealedmedia.softseal.pdf"/>
1993 <mime-type type="application/vnd.seemail">
1994 <glob pattern="*.see"/>
1995 </mime-type>
1996 <mime-type type="application/vnd.sema">
1997 <glob pattern="*.sema"/>
1998 </mime-type>
1999 <mime-type type="application/vnd.semd">
2000 <glob pattern="*.semd"/>
2001 </mime-type>
2002 <mime-type type="application/vnd.semf">
2003 <glob pattern="*.semf"/>
2004 </mime-type>
2005 <mime-type type="application/vnd.shana.informed.formdata">
2006 <glob pattern="*.ifm"/>
2007 </mime-type>
2008 <mime-type type="application/vnd.shana.informed.formtemplate">
2009 <glob pattern="*.itp"/>
2010 </mime-type>
2011 <mime-type type="application/vnd.shana.informed.interchange">
2012 <glob pattern="*.iif"/>
2013 </mime-type>
2014 <mime-type type="application/vnd.shana.informed.package">
2015 <glob pattern="*.ipk"/>
2016 </mime-type>
2017 <mime-type type="application/vnd.simtech-mindmapper">
2018 <glob pattern="*.twd"/>
2019 <glob pattern="*.twds"/>
2020 </mime-type>
2021 <mime-type type="application/vnd.smaf">
2022 <glob pattern="*.mmf"/>
2023 </mime-type>
2024 <mime-type type="application/vnd.smart.teacher">
2025 <glob pattern="*.teacher"/>
2026 </mime-type>
2027 <mime-type type="application/vnd.software602.filler.form+xml"/>
2028 <mime-type type="application/vnd.software602.filler.form-xml-zip"/>
2029 <mime-type type="application/vnd.solent.sdkm+xml">
2030 <glob pattern="*.sdkm"/>
2031 <glob pattern="*.sdkd"/>
2032 </mime-type>
2033 <mime-type type="application/vnd.spotfire.dxp">
2034 <glob pattern="*.dxp"/>
2035 </mime-type>
2036 <mime-type type="application/vnd.spotfire.sfs">
2037 <glob pattern="*.sfs"/>
2038 </mime-type>
2039 <mime-type type="application/vnd.sss-cod"/>
2040 <mime-type type="application/vnd.sss-dtf"/>
2041 <mime-type type="application/vnd.sss-ntf"/>
2042 <mime-type type="application/vnd.stardivision.calc">
2043 <sub-class-of type="application/x-tika-msoffice"/>
2044 <magic priority="50">
2045 <match value="0xd0cf11e0a1b11ae1" type="string" offset="0:8">
2046 <match value="StarCalc" type="string" offset="2048:2207" />
2047 </match>
2048 </magic>
2049 <glob pattern="*.sdc"/>
2050 </mime-type>
2051 <mime-type type="application/vnd.stardivision.draw">
2052 <sub-class-of type="application/x-tika-msoffice"/>
2053 <magic priority="50">
2054 <match value="0xd0cf11e0a1b11ae1" type="string" offset="0:8">
2055 <match value="StarDraw" type="string" offset="2048:2207" />
2056 </match>
2057 </magic>
2058 <glob pattern="*.sda"/>
2059 </mime-type>
2060 <mime-type type="application/vnd.stardivision.impress">
2061 <sub-class-of type="application/x-tika-msoffice"/>
2062 <magic priority="50">
2063 <match value="0xd0cf11e0a1b11ae1" type="string" offset="0:8">
2064 <match value="StarImpress" type="string" offset="2048:2207" />
2065 </match>
2066 </magic>
2067 <glob pattern="*.sdd"/>
2068 </mime-type>
2069 <mime-type type="application/vnd.stardivision.math">
2070 <glob pattern="*.smf"/>
2071 </mime-type>
2072 <mime-type type="application/vnd.stardivision.writer">
2073 <sub-class-of type="application/x-tika-msoffice"/>
2074 <magic priority="50">
2075 <match value="0xd0cf11e0a1b11ae1" type="string" offset="0:8">
2076 <match value="StarWriter" type="string" offset="2048:2207" />
2077 </match>
2078 </magic>
2079 <glob pattern="*.sdw"/>
2080 </mime-type>
2081 <mime-type type="application/x-staroffice-template">
2082 <sub-class-of type="application/x-tika-msoffice"/>
2083 <glob pattern="*.vor"/>
2084 </mime-type>
2085 <mime-type type="application/vnd.stardivision.writer-global">
2086 <glob pattern="*.sgl"/>
2087 </mime-type>
2088 <mime-type type="application/vnd.street-stream"/>
2089 <mime-type type="application/vnd.sun.xml.calc">
2090 <glob pattern="*.sxc"/>
2091 </mime-type>
2092 <mime-type type="application/vnd.sun.xml.calc.template">
2093 <glob pattern="*.stc"/>
2094 </mime-type>
2095 <mime-type type="application/vnd.sun.xml.draw">
2096 <glob pattern="*.sxd"/>
2097 </mime-type>
2098 <mime-type type="application/vnd.sun.xml.draw.template">
2099 <glob pattern="*.std"/>
2100 </mime-type>
2101 <mime-type type="application/vnd.sun.xml.impress">
2102 <glob pattern="*.sxi"/>
2103 </mime-type>
2104 <mime-type type="application/vnd.sun.xml.impress.template">
2105 <glob pattern="*.sti"/>
2106 </mime-type>
2107 <mime-type type="application/vnd.sun.xml.math">
2108 <glob pattern="*.sxm"/>
2109 </mime-type>
2110
2111 <mime-type type="application/vnd.sun.xml.writer">
2112 <alias type="application/x-vnd.sun.xml.writer"/>
2113 <_comment>OpenOffice v1.0: Writer Document</_comment>
2114 <magic>
2115 <match type="string" offset="0" value="PK">
2116 <match type="string" offset="30"
2117 value="mimetypeapplication/vnd.sun.xml.writer"/>
2118 </match>
2119 </magic>
2120 <glob pattern="*.sxw"/>
2121 </mime-type>
2122
2123 <mime-type type="application/vnd.sun.xml.writer.global">
2124 <glob pattern="*.sxg"/>
2125 </mime-type>
2126 <mime-type type="application/vnd.sun.xml.writer.template">
2127 <glob pattern="*.stw"/>
2128 </mime-type>
2129 <mime-type type="application/vnd.sun.wadl+xml"/>
2130 <mime-type type="application/vnd.sus-calendar">
2131 <glob pattern="*.sus"/>
2132 <glob pattern="*.susp"/>
2133 </mime-type>
2134 <mime-type type="application/vnd.svd">
2135 <glob pattern="*.svd"/>
2136 </mime-type>
2137 <mime-type type="application/vnd.swiftview-ics"/>
2138
2139 <mime-type type="application/vnd.symbian.install">
2140 <magic priority="50">
2141 <match value="0x10000419" type="little32" offset="8" />
2142 </magic>
2143 <glob pattern="*.sis"/>
2144 <glob pattern="*.sisx"/>
2145 </mime-type>
2146
2147 <mime-type type="application/vnd.syncml+xml">
2148 <glob pattern="*.xsm"/>
2149 </mime-type>
2150 <mime-type type="application/vnd.syncml.dm+wbxml">
2151 <glob pattern="*.bdm"/>
2152 </mime-type>
2153 <mime-type type="application/vnd.syncml.dm+xml">
2154 <glob pattern="*.xdm"/>
2155 </mime-type>
2156 <mime-type type="application/vnd.syncml.dm.notification"/>
2157 <mime-type type="application/vnd.syncml.ds.notification"/>
2158 <mime-type type="application/vnd.tao.intent-module-archive">
2159 <glob pattern="*.tao"/>
2160 </mime-type>
2161
2162 <mime-type type="application/vnd.tcpdump.pcap">
2163 <_comment>TCPDump pcap packet capture</_comment>
2164 <magic priority="50">
2165 <match value="0xa1b2c3d4" type="big32" offset="0" />
2166 <match value="0xd4c3b2a1" type="big32" offset="0" />
2167 </magic>
2168 <glob pattern="*.pcap"/>
2169 <glob pattern="*.cap"/>
2170 <glob pattern="*.dmp"/>
2171 </mime-type>
2172
2173 <mime-type type="application/vnd.tmobile-livetv">
2174 <glob pattern="*.tmo"/>
2175 </mime-type>
2176 <mime-type type="application/vnd.trid.tpt">
2177 <glob pattern="*.tpt"/>
2178 </mime-type>
2179 <mime-type type="application/vnd.triscape.mxs">
2180 <glob pattern="*.mxs"/>
2181 </mime-type>
2182 <mime-type type="application/vnd.trueapp">
2183 <glob pattern="*.tra"/>
2184 </mime-type>
2185 <mime-type type="application/vnd.truedoc"/>
2186 <mime-type type="application/vnd.ufdl">
2187 <glob pattern="*.ufd"/>
2188 <glob pattern="*.ufdl"/>
2189 </mime-type>
2190 <mime-type type="application/vnd.uiq.theme">
2191 <glob pattern="*.utz"/>
2192 </mime-type>
2193 <mime-type type="application/vnd.umajin">
2194 <glob pattern="*.umj"/>
2195 </mime-type>
2196 <mime-type type="application/vnd.unity">
2197 <glob pattern="*.unityweb"/>
2198 </mime-type>
2199 <mime-type type="application/vnd.uoml+xml">
2200 <glob pattern="*.uoml"/>
2201 </mime-type>
2202 <mime-type type="application/vnd.uplanet.alert"/>
2203 <mime-type type="application/vnd.uplanet.alert-wbxml"/>
2204 <mime-type type="application/vnd.uplanet.bearer-choice"/>
2205 <mime-type type="application/vnd.uplanet.bearer-choice-wbxml"/>
2206 <mime-type type="application/vnd.uplanet.cacheop"/>
2207 <mime-type type="application/vnd.uplanet.cacheop-wbxml"/>
2208 <mime-type type="application/vnd.uplanet.channel"/>
2209 <mime-type type="application/vnd.uplanet.channel-wbxml"/>
2210 <mime-type type="application/vnd.uplanet.list"/>
2211 <mime-type type="application/vnd.uplanet.list-wbxml"/>
2212 <mime-type type="application/vnd.uplanet.listcmd"/>
2213 <mime-type type="application/vnd.uplanet.listcmd-wbxml"/>
2214 <mime-type type="application/vnd.uplanet.signal"/>
2215 <mime-type type="application/vnd.vcx">
2216 <glob pattern="*.vcx"/>
2217 </mime-type>
2218 <mime-type type="application/vnd.vd-study"/>
2219 <mime-type type="application/vnd.vectorworks"/>
2220 <mime-type type="application/vnd.vidsoft.vidconference"/>
2221
2222 <!-- http://www.iana.org/assignments/media-types/application/vnd.visio -->
2223 <mime-type type="application/vnd.visio">
2224 <_comment>Microsoft Visio Diagram</_comment>
2225 <glob pattern="*.vsd"/>
2226 <glob pattern="*.vst"/>
2227 <glob pattern="*.vss"/>
2228 <glob pattern="*.vsw"/>
2229 <sub-class-of type="application/x-tika-msoffice"/>
2230 </mime-type>
2231
2232 <mime-type type="application/vnd.visionary">
2233 <glob pattern="*.vis"/>
2234 </mime-type>
2235 <mime-type type="application/vnd.vividence.scriptfile"/>
2236 <mime-type type="application/vnd.vsf">
2237 <glob pattern="*.vsf"/>
2238 </mime-type>
2239 <mime-type type="application/vnd.wap.sic"/>
2240 <mime-type type="application/vnd.wap.slc"/>
2241
2242 <mime-type type="application/vnd.wap.wbxml">
2243 <glob pattern="*.wbxml"/>
2244 </mime-type>
2245
2246 <mime-type type="application/vnd.wap.wmlc">
2247 <_comment>Compiled WML Document</_comment>
2248 <glob pattern="*.wmlc"/>
2249 </mime-type>
2250
2251 <mime-type type="application/vnd.wap.wmlscriptc">
2252 <_comment>Compiled WML Script</_comment>
2253 <glob pattern="*.wmlsc"/>
2254 </mime-type>
2255
2256 <mime-type type="application/vnd.webturbo">
2257 <glob pattern="*.wtb"/>
2258 </mime-type>
2259 <mime-type type="application/vnd.wfa.wsc"/>
2260 <mime-type type="application/vnd.wmc"/>
2261 <mime-type type="application/vnd.wmf.bootstrap"/>
2262 <mime-type type="application/vnd.wordperfect">
2263 <alias type="application/wordperfect"/>
2264 <glob pattern="*.wpd"/>
2265 </mime-type>
2266 <mime-type type="application/vnd.wqd">
2267 <glob pattern="*.wqd"/>
2268 </mime-type>
2269 <mime-type type="application/vnd.wrq-hp3000-labelled"/>
2270 <mime-type type="application/vnd.wt.stf">
2271 <glob pattern="*.stf"/>
2272 </mime-type>
2273 <mime-type type="application/vnd.wv.csp+wbxml"/>
2274 <mime-type type="application/vnd.wv.csp+xml"/>
2275 <mime-type type="application/vnd.wv.ssp+xml"/>
2276 <mime-type type="application/vnd.xara">
2277 <glob pattern="*.xar"/>
2278 </mime-type>
2279 <mime-type type="application/vnd.xfdl">
2280 <glob pattern="*.xfdl"/>
2281 </mime-type>
2282 <mime-type type="application/vnd.xfdl.webform"/>
2283 <mime-type type="application/vnd.xmi+xml"/>
2284 <mime-type type="application/vnd.xmpie.cpkg"/>
2285 <mime-type type="application/vnd.xmpie.dpkg"/>
2286 <mime-type type="application/vnd.xmpie.plan"/>
2287 <mime-type type="application/vnd.xmpie.ppkg"/>
2288 <mime-type type="application/vnd.xmpie.xlim"/>
2289 <mime-type type="application/vnd.yamaha.hv-dic">
2290 <glob pattern="*.hvd"/>
2291 </mime-type>
2292 <mime-type type="application/vnd.yamaha.hv-script">
2293 <glob pattern="*.hvs"/>
2294 </mime-type>
2295 <mime-type type="application/vnd.yamaha.hv-voice">
2296 <glob pattern="*.hvp"/>
2297 </mime-type>
2298 <mime-type type="application/vnd.yamaha.openscoreformat">
2299 <glob pattern="*.osf"/>
2300 </mime-type>
2301 <mime-type type="application/vnd.yamaha.openscoreformat.osfpvg+xml">
2302 <glob pattern="*.osfpvg"/>
2303 </mime-type>
2304 <mime-type type="application/vnd.yamaha.smaf-audio">
2305 <glob pattern="*.saf"/>
2306 </mime-type>
2307 <mime-type type="application/vnd.yamaha.smaf-phrase">
2308 <glob pattern="*.spf"/>
2309 </mime-type>
2310 <mime-type type="application/vnd.yellowriver-custom-menu">
2311 <glob pattern="*.cmp"/>
2312 </mime-type>
2313 <mime-type type="application/vnd.zul">
2314 <glob pattern="*.zir"/>
2315 <glob pattern="*.zirz"/>
2316 </mime-type>
2317 <mime-type type="application/vnd.zzazz.deck+xml">
2318 <glob pattern="*.zaz"/>
2319 </mime-type>
2320 <mime-type type="application/voicexml+xml">
2321 <glob pattern="*.vxml"/>
2322 </mime-type>
2323 <mime-type type="application/watcherinfo+xml"/>
2324 <mime-type type="application/whoispp-query"/>
2325 <mime-type type="application/whoispp-response"/>
2326 <mime-type type="application/winhlp">
2327 <glob pattern="*.hlp"/>
2328 </mime-type>
2329 <mime-type type="application/wita"/>
2330 <mime-type type="application/wordperfect5.1"/>
2331 <mime-type type="application/wsdl+xml">
2332 <glob pattern="*.wsdl"/>
2333 </mime-type>
2334 <mime-type type="application/wspolicy+xml">
2335 <glob pattern="*.wspolicy"/>
2336 </mime-type>
2337
2338 <mime-type type="application/x-123">
2339 <magic priority="50">
2340 <match value="0x00001a00" type="big32" offset="0" />
2341 <match value="0x00000200" type="big32" offset="0" />
2342 </magic>
2343 </mime-type>
2344
2345 <mime-type type="application/x-abiword">
2346 <glob pattern="*.abw"/>
2347 </mime-type>
2348 <mime-type type="application/x-ace-compressed">
2349 <glob pattern="*.ace"/>
2350 </mime-type>
2351
2352 <mime-type type="application/x-adobe-indesign">
2353 <acronym>INDD</acronym>
2354 <_comment>Adobe InDesign document</_comment>
2355 <glob pattern="*.indd"/>
2356 <magic priority="50">
2357 <match value="0x0606edf5d81d46e5bd31efe7fe74b71d" type="string" offset="0" />
2358 </magic>
2359 </mime-type>
2360
2361 <mime-type type="application/x-adobe-indesign-interchange">
2362 <acronym>INX</acronym>
2363 <_comment>Adobe InDesign Interchange format</_comment>
2364 <magic priority="50">
2365 <match value="&lt;?aid" type="string" offset="0:100"/>
2366 </magic>
2367 <glob pattern="*.inx"/>
2368 <sub-class-of type="application/xml"/>
2369 </mime-type>
2370
2371 <mime-type type="application/x-archive"> <!-- ar archive: detected by header magic at offset 0 or by the .ar / .a extensions -->
2372 <alias type="application/x-unix-archive"/>
2373 <magic priority="50">
2374 <match value="=&lt;ar&gt;" type="string" offset="0"/> <!-- NOTE(review): the leading "=" looks like a file(1) comparison operator carried over into the value; presumably it is compared literally here, so this pattern may never match a bare "<ar>" header - confirm -->
2375 <match value="!&lt;arch&gt;\n" type="string" offset="0"/> <!-- "!<arch>" followed by a newline -->
2376 </magic>
2377 <glob pattern="*.ar"/>
2378 <glob pattern="*.a"/>
2379 </mime-type>
2380
2381 <mime-type type="application/x-arj">
2382 <alias type="application/x-arj-compressed"/>
2383 <magic priority="50">
2384 <match value="0x60ea" type="string" offset="0" />
2385 </magic>
2386 <glob pattern="*.arj"/>
2387 </mime-type>
2388
2389 <mime-type type="application/x-authorware-bin">
2390 <glob pattern="*.aab"/>
2391 <glob pattern="*.x32"/>
2392 <glob pattern="*.u32"/>
2393 <glob pattern="*.vox"/>
2394 </mime-type>
2395 <mime-type type="application/x-authorware-map">
2396 <glob pattern="*.aam"/>
2397 </mime-type>
2398 <mime-type type="application/x-authorware-seg">
2399 <glob pattern="*.aas"/>
2400 </mime-type>
2401
2402 <mime-type type="application/x-bcpio">
2403 <glob pattern="*.bcpio"/>
2404 </mime-type>
2405
2406 <mime-type type="application/x-berkeley-db">
2407 <magic priority="50">
2408 <match value="0x00061561" type="big32" offset="0"/>
2409 <match value="0x00061561" type="host32" offset="12"/>
2410 <match value="0x00061561" type="big32" offset="12"/>
2411 <match value="0x00061561" type="little32" offset="12"/>
2412 <match value="0x00053162" type="host32" offset="12"/>
2413 <match value="0x00053162" type="big32" offset="12"/>
2414 <match value="0x00053162" type="little32" offset="12"/>
2415 <match value="0x00042253" type="host32" offset="12"/>
2416 <match value="0x00042253" type="big32" offset="12"/>
2417 <match value="0x00042253" type="little32" offset="12"/>
2418 <match value="0x00040988" type="host32" offset="12"/>
2419 <match value="0x00040988" type="little32" offset="12"/>
2420 <match value="0x00040988" type="big32" offset="12"/>
2421 <match value="0x00053162" type="host32" offset="0"/>
2422 <match value="0x00053162" type="big32" offset="0"/>
2423 <match value="0x00053162" type="little32" offset="0"/>
2424 </magic>
2425 </mime-type>
2426
2427 <mime-type type="application/x-bibtex-text-file">
2428 <magic priority="50">
2429 <match value="%\ BibTeX\ `" type="string" offset="0"/>
2430 <match value="%%%\ \ " type="string" offset="73"/>
2431 <match value="%\ BibTeX\ standard\ bibliography\ " type="string" offset="0"/>
2432 <match value="%%%\ \ @BibTeX-style-file{" type="string" offset="73"/>
2433 <match value="@article{" type="string" offset="0"/>
2434 <match value="@book{" type="string" offset="0"/>
2435 <match value="@inbook{" type="string" offset="0"/>
2436 <match value="@incollection{" type="string" offset="0"/>
2437 <match value="@inproceedings{" type="string" offset="0"/>
2438 <match value="@manual{" type="string" offset="0"/>
2439 <match value="@misc{" type="string" offset="0"/>
2440 <match value="@preamble{" type="string" offset="0"/>
2441 <match value="@phdthesis{" type="string" offset="0"/>
2442 <match value="@techreport{" type="string" offset="0"/>
2443 <match value="@unpublished{" type="string" offset="0"/>
2444 </magic>
2445 <glob pattern="*.bib"/>
2446 <glob pattern="*.bibtex"/>
2447 </mime-type>
2448
2449 <mime-type type="application/x-bittorrent">
2450 <magic priority="50">
2451 <match value="d8:announce" type="string" offset="0"/>
2452 </magic>
2453 <glob pattern="*.torrent"/>
2454 </mime-type>
2455
2456 <mime-type type="application/x-bplist">
2457 <!-- The priority is 60, as .webarchive files often contain
2458 (X)HTML content. The bplist magic must trump the XHTML
2459 magics further within the file. This must also be
2460 independent of the internal ordering of patterns within
2461 MimeTypes -->
2462 <magic priority="60">
2463 <match value="bplist" type="string" offset="0"/>
2464 </magic>
2465 </mime-type>
2466
2467 <mime-type type="application/x-bzip">
2468 <magic priority="40">
2469 <match value="BZh" type="string" offset="0"/>
2470 </magic>
2471 <glob pattern="*.bz"/>
2472 <glob pattern="*.tbz"/>
2473 </mime-type>
2474
2475 <mime-type type="application/x-bzip2"> <!-- specialisation of application/x-bzip, which matches the shorter "BZh" prefix at the same priority -->
2476 <sub-class-of type="application/x-bzip"/>
2477 <_comment>Bzip 2 UNIX Compressed File</_comment>
2478 <magic priority="40">
2479 <match value="\x42\x5a\x68\x39\x31" type="string" offset="0"/> <!-- these bytes spell "BZh91"; NOTE(review): the fourth byte is fixed to '9', so headers with a different digit there presumably fall back to application/x-bzip - confirm whether that is intended -->
2480 </magic>
2481 <glob pattern="*.bz2"/>
2482 <glob pattern="*.tbz2"/>
2483 <glob pattern="*.boz"/>
2484 </mime-type>
2485
2486 <mime-type type="application/x-cdlink">
2487 <_comment>Virtual CD-ROM CD Image File</_comment>
2488 <glob pattern="*.vcd"/>
2489 </mime-type>
2490
2491 <mime-type type="application/x-chat">
2492 <glob pattern="*.chat"/>
2493 </mime-type>
2494 <mime-type type="application/x-chess-pgn">
2495 <glob pattern="*.pgn"/>
2496 </mime-type>
2497
2498 <mime-type type="application/x-compress">
2499 <magic priority="50">
2500 <match value="\037\235" type="string" offset="0"/>
2501 </magic>
2502 <glob pattern="*.z"/>
2503 </mime-type>
2504
2505 <mime-type type="application/x-corelpresentations">
2506 <glob pattern="*.shw"/>
2507 <sub-class-of type="application/x-tika-msoffice"/>
2508 </mime-type>
2509
2510 <mime-type type="application/x-cpio">
2511 <_comment>UNIX CPIO Archive</_comment>
2512 <magic priority="50">
2513 <match value="070707" type="little16" offset="0"/>
2514 <match value="070707" type="big16" offset="0"/>
2515 <match value="070707" type="string" offset="0"/>
2516 <match value="070701" type="string" offset="0"/>
2517 <match value="070702" type="string" offset="0"/>
2518 </magic>
2519 <glob pattern="*.cpio"/>
2520 </mime-type>
2521
2522 <mime-type type="application/x-csh">
2523 <glob pattern="*.csh"/>
2524 <glob pattern="*.tcsh"/>
2525 </mime-type>
2526
2527 <mime-type type="application/x-debian-package">
2528 <sub-class-of type="application/x-archive"/>
2529 <magic priority="60">
2530 <match value="!&lt;arch&gt;\ndebian-binary" type="string" offset="0"/>
2531 </magic>
2532 <glob pattern="*.deb"/>
2533 <glob pattern="*.udeb"/>
2534 </mime-type>
2535
2536 <mime-type type="application/x-director">
2537 <_comment>Shockwave Movie</_comment>
2538 <glob pattern="*.dir"/>
2539 <glob pattern="*.dcr"/>
2540 <glob pattern="*.dxr"/>
2541 <glob pattern="*.cst"/>
2542 <glob pattern="*.cct"/>
2543 <glob pattern="*.cxt"/>
2544 <glob pattern="*.w3d"/>
2545 <glob pattern="*.fgd"/>
2546 <glob pattern="*.swa"/>
2547 </mime-type>
2548
2549 <mime-type type="application/x-doom">
2550 <glob pattern="*.wad"/>
2551 </mime-type>
2552 <mime-type type="application/x-dtbncx+xml">
2553 <glob pattern="*.ncx"/>
2554 </mime-type>
2555 <mime-type type="application/x-dtbook+xml">
2556 <glob pattern="*.dtb"/>
2557 </mime-type>
2558 <mime-type type="application/x-dtbresource+xml">
2559 <glob pattern="*.res"/>
2560 </mime-type>
2561
2562 <mime-type type="application/x-dvi">
2563 <_comment>TeX Device Independent Document</_comment>
2564 <magic priority="50">
2565 <match value="\367\002" type="string" offset="0"/>
2566 <match value="0x02f7" type="little16" offset="0"/>
2567 <match value="\x1b\x20\x54\x65\x58\x20\x6f\x75\x74\x70\x75\x74\x20"
2568 type="string" offset="14"/>
2569 </magic>
2570 <glob pattern="*.dvi"/>
2571 </mime-type>
2572
2573 <mime-type type="application/x-elc">
2574 <_comment>Emacs Lisp bytecode</_comment>
2575 <magic priority="50">
2576 <!-- Emacs 18 -->
2577 <match value="\012(" type="string" offset="0" />
2578 <!-- Emacs 19 -->
2579 <match value=";ELC\023\000\000\000" type="string" offset="0" />
2580 </magic>
2581 <glob pattern="*.elc"/>
2582 </mime-type>
2583
2584 <mime-type type="application/x-elf">
2585 <magic priority="50">
2586 <match value="\177ELF" type="string" offset="0" />
2587 </magic>
2588 </mime-type>
2589
2590 <mime-type type="message/x-emlx">
2591 <magic priority="70">
2592 <match value="\nRelay-Version:" type="string" offset="2:9"/>
2593 <match value="\n#!\ rnews" type="string" offset="2:9"/>
2594 <match value="\nN#!\ rnews" type="string" offset="2:9"/>
2595 <match value="\nForward\ to" type="string" offset="2:9"/>
2596 <match value="\nPipe\ to" type="string" offset="2:9"/>
2597 <match value="\nReturn-Path:" type="string" offset="2:9"/>
2598 <match value="\nFrom:" type="string" offset="2:9"/>
2599 <match value="\nReceived:" type="string" offset="2:9"/>
2600 <match value="\nMessage-ID:" type="string" offset="2:9"/>
2601 <match value="\nDate:" type="string" offset="2:9"/>
2602 </magic>
2603 <glob pattern="*.emlx"/>
2604 </mime-type>
2605
2606 <mime-type type="application/x-killustrator">
2607 <_comment>KIllustrator File</_comment>
2608 <glob pattern="*.kil"/>
2609 </mime-type>
2610
2611 <mime-type type="application/x-object">
2612 <sub-class-of type="application/x-elf"/>
2613 <magic priority="50">
2614 <match value="\177ELF" type="string" offset="0">
2615 <match value="0x0100" type="string" offset="16"/>
2616 <match value="0x0001" type="string" offset="16"/>
2617 </match>
2618 </magic>
2619 </mime-type>
2620 <mime-type type="application/x-executable">
2621 <sub-class-of type="application/x-elf"/>
2622 <magic priority="50">
2623 <match value="\177ELF" type="string" offset="0">
2624 <match value="0x0200" type="string" offset="16"/>
2625 <match value="0x0002" type="string" offset="16"/>
2626 </match>
2627 </magic>
2628 </mime-type>
2629 <mime-type type="application/x-sharedlib">
2630 <sub-class-of type="application/x-elf"/>
2631 <magic priority="50">
2632 <match value="\177ELF" type="string" offset="0">
2633 <match value="0x0300" type="string" offset="16"/>
2634 <match value="0x0003" type="string" offset="16"/>
2635 </match>
2636 </magic>
2637 </mime-type>
2638 <mime-type type="application/x-coredump">
2639 <sub-class-of type="application/x-elf"/>
2640 <magic priority="50">
2641 <match value="\177ELF" type="string" offset="0">
2642 <match value="0x0400" type="string" offset="16"/>
2643 <match value="0x0004" type="string" offset="16"/>
2644 </match>
2645 </magic>
2646 </mime-type>
2647
2648 <mime-type type="application/x-dosexec">
2649 <_comment>DOS/Windows executable (EXE)</_comment>
2650 <sub-class-of type="application/x-msdownload"/>
2651 <glob pattern="*.exe"/>
2652 </mime-type>
2653
2654 <mime-type type="application/x-emf">
2655 <acronym>EMF</acronym>
2656 <_comment>Extended Metafile</_comment>
2657 <glob pattern="*.emf"/>
2658 <magic priority="50">
2659 <match value="0x01000000" type="string" offset="0"/>
2660 </magic>
2661 </mime-type>
2662
2663 <mime-type type="application/x-filemaker">
2664 <acronym>FP7</acronym>
2665 <_comment>FileMaker Pro 7</_comment>
2666 <magic priority="50">
2667 <match value="0xC04842414D37" type="string" offset="14" >
2668 <match value="0x4842414D323130314F43543939C102480750726F20372E30C0C0" type="string" offset="525" />
2669 </match>
2670 </magic>
2671 <glob pattern="*.fp7" />
2672 </mime-type>
2673
2674 <mime-type type="application/x-font-bdf">
2675 <glob pattern="*.bdf"/>
2676 </mime-type>
2677 <mime-type type="application/x-font-dos"/>
2678 <mime-type type="application/x-font-framemaker"/>
2679 <mime-type type="application/x-font-ghostscript">
2680 <glob pattern="*.gsf"/>
2681 </mime-type>
2682 <mime-type type="application/x-font-libgrx"/>
2683 <mime-type type="application/x-font-linux-psf">
2684 <glob pattern="*.psf"/>
2685 </mime-type>
2686
2687 <mime-type type="application/x-font-otf">
2688 <acronym>OTF</acronym>
2689 <_comment>OpenType Font</_comment>
2690 <glob pattern="*.otf"/>
2691 </mime-type>
2692
2693 <mime-type type="application/x-font-pcf">
2694 <glob pattern="*.pcf"/>
2695 </mime-type>
2696 <mime-type type="application/x-font-snf">
2697 <glob pattern="*.snf"/>
2698 </mime-type>
2699 <mime-type type="application/x-font-speedo"/>
2700 <mime-type type="application/x-font-sunos-news"/>
2701
2702 <mime-type type="application/x-font-ttf">
2703 <acronym>TTF</acronym>
2704 <_comment>TrueType Font</_comment>
2705 <glob pattern="*.ttf"/>
2706 <glob pattern="*.ttc"/>
2707 <magic priority="40">
2708 <match value="0x00010000" type="string" offset="0"/>
2709 </magic>
2710 </mime-type>
2711
2712 <mime-type type="application/x-font-type1">
2713 <glob pattern="*.pfa"/>
2714 <glob pattern="*.pfb"/>
2715 <magic priority="60">
2716 <!-- Match for PFB, the binary format -->
2717 <match value="\x80\x01\xFF\xFF\x00\x00%!PS-AdobeFont" type="string"
2718 mask="0xFFFF0000FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF" offset="0"/>
2719 <!-- Match for PFA, the text format -->
2720 <match value="%!PS-AdobeFont-1.0" type="string" offset="0" />
2721 </magic>
2722 </mime-type>
2723
2724 <mime-type type="application/x-font-adobe-metric">
2725 <_comment>Adobe Font Metric</_comment>
2726 <glob pattern="*.afm"/>
2727 <glob pattern="*.acfm"/>
2728 <glob pattern="*.amfm"/>
2729 <magic priority="40">
2730 <match value="StartFontMetrics" type="string" offset="0"/>
2731 </magic>
2732 </mime-type>
2733
2734 <mime-type type="application/x-font-printer-metric">
2735 <_comment>Printer Font Metric</_comment>
2736 <glob pattern="*.pfm"/>
2737 <magic priority="40">
2738 <match value="0x0001FFFF0000436f707972" type="string" offset="0"
2739 mask="0xFFFF0000FFFFFFFFFFFFFF" />
2740 </magic>
2741 </mime-type>
2742
2743 <mime-type type="application/x-font-vfont"/>
2744
2745 <mime-type type="application/x-foxmail">
2746 <_comment>Foxmail Email File</_comment>
2747 <magic>
2748 <match value="0x1010101010101011111111111153" type="string" offset="0"/>
2749 </magic>
2750 </mime-type>
2751
2752 <mime-type type="application/x-futuresplash">
2753 <_comment>Macromedia FutureSplash File</_comment>
2754 <glob pattern="*.spl"/>
2755 </mime-type>
2756
2757 <mime-type type="application/x-gnucash">
2758 <glob pattern="*.gnucash" />
2759 </mime-type>
2760
2761 <mime-type type="application/x-gnumeric"> <!-- detected by the magic at offset 39 or by the .gnumeric extension -->
2762 <alias type="application/x-Gnumeric-spreadsheet"/>
2763 <magic priority="50">
2764 <match value="=&lt;gmr:Workbook" type="string" offset="39" /> <!-- NOTE(review): the leading "=" looks like a file(1) comparison operator carried over into the value; presumably it is compared literally here, so the text "<gmr:Workbook" alone would not match - confirm -->
2765 </magic>
2766 <glob pattern="*.gnumeric"/>
2767 </mime-type>
2768
2769 <mime-type type="application/x-gtar">
2770 <_comment>GNU tar Compressed File Archive (GNU Tape Archive)</_comment>
2771 <magic priority="40">
2772 <!-- GNU tar archive -->
2773 <match value="ustar \0" type="string" offset="257" />
2774 </magic>
2775 <glob pattern="*.gtar"/>
2776 <sub-class-of type="application/x-tar"/>
2777 </mime-type>
2778
2779 <mime-type type="application/x-gzip"> <!-- detected by a two-byte magic at offset 0 or by the globs below -->
2780 <_comment>Gzip Compressed Archive</_comment>
2781 <magic priority="40">
2782 <match value="\037\213" type="string" offset="0" /> <!-- octal escapes for bytes 0x1f 0x8b -->
2783 <match value="\x1f\x8b" type="string" offset="0" /> <!-- the same two bytes as the line above, written as hex escapes -->
2784 </magic>
2785 <glob pattern="*.tgz" />
2786 <glob pattern="*.gz" />
2787 <glob pattern="*-gz" />
2788 <glob pattern="*.emz" />
2789 </mime-type>
2790
2791 <mime-type type="application/x-hdf">
2792 <_comment>Hierarchical Data Format File</_comment>
2793 <magic priority="50">
2794 <!-- HDF4 -->
2795 <match value="0x0e031301" type="big32" offset="0"/>
2796 <!-- HDF5 -->
2797 <match value="\211HDF\r\n\032" type="string" offset="0"/>
2798 </magic>
2799 <glob pattern="*.hdf"/>
2800 <glob pattern="*.he5"/>
2801 <glob pattern="*.h5"/>
2802 </mime-type>
2803
2804 <mime-type type="application/x-hwp">
2805 <magic priority="50">
2806 <!--
2807 TIKA-330: Detection pattern based on signature strings from
2808 the hwpfilter/source/hwpfile.cpp file in OpenOffice.org.
2809 -->
2810 <match value="HWP Document File V" type="string" offset="0"/>
2811 </magic>
2812 </mime-type>
2813
2814 <mime-type type="application/x-ibooks+zip">
2815 <sub-class-of type="application/epub+zip" />
2816 <acronym>iBooks</acronym>
2817 <_comment>Apple iBooks Author publication format</_comment>
2818 <magic priority="50">
2819 <match value="PK\003\004" type="string" offset="0">
2820 <match value="mimetypeapplication/x-ibooks+zip" type="string" offset="30"/>
2821 </match>
2822 </magic>
2823 <glob pattern="*.ibooks"/>
2824 </mime-type>
2825
2826 <mime-type type="application/x-iso9660-image">
2827 <acronym>ISO</acronym>
2828 <_comment>ISO 9660 CD-ROM filesystem data</_comment>
2829 <magic priority="50">
2830 <match value="CD001" type="string" offset="32769"/>
2831 </magic>
2832 <glob pattern="*.iso"/>
2833 </mime-type>
2834
2835 <mime-type type="application/x-itunes-ipa">
2836 <sub-class-of type="application/zip"/>
2837 <_comment>Apple iOS IPA AppStore file</_comment>
2838 <glob pattern="*.ipa"/>
2839 </mime-type>
2840
2841 <mime-type type="application/x-java-jnlp-file">
2842 <glob pattern="*.jnlp"/>
2843 </mime-type>
2844
2845 <mime-type type="application/x-java-pack200">
2846 <glob pattern="*.pack"/>
2847 </mime-type>
2848
2849 <mime-type type="application/x-kdelnk">
2850 <magic priority="50">
2851 <match value="[KDE\ Desktop\ Entry]" type="string" offset="0"/>
2852 <match value="#\ KDE\ Config\ File" type="string" offset="0"/>
2853 </magic>
2854 </mime-type>
2855
2856 <mime-type type="application/x-latex">
2857 <_comment>LaTeX Source Document</_comment>
2858 <magic priority="50">
2859 <match value="%\ -*-latex-*-" type="string" offset="0"/>
2860 </magic>
2861 <glob pattern="*.latex"/>
2862 <sub-class-of type="application/x-tex"/>
2863 </mime-type>
2864
2865 <mime-type type="application/x-lha">
2866 <magic priority="50">
2867 <match value="-lzs-" type="string" offset="2"/>
2868 <match value="-lh\40-" type="string" offset="2"/>
2869 <match value="-lhd-" type="string" offset="2"/>
2870 <match value="-lh2-" type="string" offset="2"/>
2871 <match value="-lh3-" type="string" offset="2"/>
2872 <match value="-lh4-" type="string" offset="2"/>
2873 <match value="-lh5-" type="string" offset="2"/>
2874 <match value="-lh6-" type="string" offset="2"/>
2875 <match value="-lh7-" type="string" offset="2"/>
2876 </magic>
2877 </mime-type>
2878
2879 <mime-type type="application/x-lharc">
2880 <magic priority="50">
2881 <match value="-lh0-" type="string" offset="2"/>
2882 <match value="-lh1-" type="string" offset="2"/>
2883 <match value="-lz4-" type="string" offset="2"/>
2884 <match value="-lz5-" type="string" offset="2"/>
2885 </magic>
2886 </mime-type>
2887
2888 <mime-type type="application/x-mobipocket-ebook">
2889 <glob pattern="*.prc"/>
2890 <glob pattern="*.mobi"/>
2891 </mime-type>
2892 <mime-type type="application/x-ms-application">
2893 <glob pattern="*.application"/>
2894 </mime-type>
2895 <mime-type type="application/x-ms-wmd">
2896 <glob pattern="*.wmd"/>
2897 </mime-type>
2898 <mime-type type="application/x-ms-wmz">
2899 <sub-class-of type="application/x-gzip"/>
2900 <glob pattern="*.wmz"/>
2901 </mime-type>
2902 <mime-type type="application/x-ms-xbap">
2903 <glob pattern="*.xbap"/>
2904 </mime-type>
2905 <mime-type type="application/x-msaccess">
2906 <glob pattern="*.mdb"/>
2907 <magic priority="60">
2908 <match value="0x000100005374616e" type="string" offset="0"/>
2909 </magic>
2910 </mime-type>
2911 <mime-type type="application/x-msbinder">
2912 <glob pattern="*.obd"/>
2913 </mime-type>
2914 <mime-type type="application/x-mscardfile">
2915 <glob pattern="*.crd"/>
2916 </mime-type>
2917 <mime-type type="application/x-msclip">
2918 <glob pattern="*.clp"/>
2919 </mime-type>
2920
2921 <mime-type type="application/x-msdownload">
2922 <glob pattern="*.dll"/>
2923 <glob pattern="*.com"/>
2924 <glob pattern="*.bat"/>
2925 <glob pattern="*.msi"/>
2926 <magic priority="50">
2927 <match value="MZ" type="string" offset="0"/>
2928 </magic>
2929 </mime-type>
2930
2931 <mime-type type="application/x-msdownload;format=pe">
2932 <sub-class-of type="application/x-msdownload"/>
2933 <magic priority="55">
2934 <!-- Technically the header offset is stored at 0x3c, and isn't a -->
2935 <!-- constant, but it's almost always set to start at 0x80 or 0xf0 -->
2936 <match value="PE\000\000" type="string" offset="128"/>
2937 <match value="PE\000\000" type="string" offset="240"/>
2938 </magic>
2939 </mime-type>
2940 <!-- The PE header should be "PE\000\000" followed by a two-byte machine type -->
2941 <mime-type type="application/x-msdownload;format=pe32">
2942 <sub-class-of type="application/x-msdownload;format=pe"/>
2943 <magic priority="60">
2944 <match value="PE\000\000" type="string" offset="128">
2945 <match value="0x014c" type="little16" offset="132"/>
2946 </match>
2947 <match value="PE\000\000" type="string" offset="240">
2948 <match value="0x014c" type="little16" offset="244"/>
2949 </match>
2950 </magic>
2951 </mime-type>
2952 <mime-type type="application/x-msdownload;format=pe64">
2953 <sub-class-of type="application/x-msdownload;format=pe"/>
2954 <magic priority="60">
2955 <match value="PE\000\000" type="string" offset="128">
2956 <match value="0x8664" type="little16" offset="132"/>
2957 </match>
2958 <match value="PE\000\000" type="string" offset="240">
2959 <match value="0x8664" type="little16" offset="244"/>
2960 </match>
2961 </magic>
2962 </mime-type>
2963 <mime-type type="application/x-msdownload;format=pe-itanium">
2964 <sub-class-of type="application/x-msdownload;format=pe"/>
2965 <magic priority="60">
2966 <match value="PE\000\000" type="string" offset="128">
2967 <match value="0x0200" type="little16" offset="132"/>
2968 </match>
2969 <match value="PE\000\000" type="string" offset="240">
2970 <match value="0x0200" type="little16" offset="244"/>
2971 </match>
2972 </magic>
2973 </mime-type>
2974 <mime-type type="application/x-msdownload;format=pe-armLE"> <!-- PE image whose two-byte machine type is 0x01c0 -->
2975 <sub-class-of type="application/x-msdownload;format=pe"/>
2976 <magic priority="60">
2977 <match value="PE\000\000" type="string" offset="128"> <!-- signature is upper-case "PE", as in the parent format=pe entry; the lower-case form could never match -->
2978 <match value="0x01c0" type="little16" offset="132"/> <!-- machine type sits right after the 4-byte signature -->
2979 </match>
2980 <match value="PE\000\000" type="string" offset="240"> <!-- same check for the other common header position -->
2981 <match value="0x01c0" type="little16" offset="244"/>
2982 </match>
2983 </magic>
2984 </mime-type>
2985 <mime-type type="application/x-msdownload;format=pe-arm7"> <!-- PE image whose two-byte machine type is 0x01c4 -->
2986 <sub-class-of type="application/x-msdownload;format=pe"/>
2987 <magic priority="60">
2988 <match value="PE\000\000" type="string" offset="128"> <!-- signature is upper-case "PE", as in the parent format=pe entry; the lower-case form could never match -->
2989 <match value="0x01c4" type="little16" offset="132"/> <!-- machine type sits right after the 4-byte signature -->
2990 </match>
2991 <match value="PE\000\000" type="string" offset="240"> <!-- same check for the other common header position -->
2992 <match value="0x01c4" type="little16" offset="244"/>
2993 </match>
2994 </magic>
2995 </mime-type>
2996
2997 <mime-type type="application/x-msmediaview">
2998 <glob pattern="*.mvb"/>
2999 <glob pattern="*.m13"/>
3000 <glob pattern="*.m14"/>
3001 </mime-type>
3002 <mime-type type="application/x-msmetafile">
3003 <alias type="image/x-wmf"/>
3004 <acronym>WMF</acronym>
3005 <_comment>Windows Metafile</_comment>
3006 <glob pattern="*.wmf"/>
3007 <magic priority="50">
3008 <match value="0xd7cdc69a0000" type="string" offset="0"/>
3009 <match value="0x010009000003" type="string" offset="0"/>
3010 </magic>
3011 </mime-type>
3012 <mime-type type="application/x-msmoney">
3013 <glob pattern="*.mny"/>
3014 </mime-type>
3015 <mime-type type="application/x-mspublisher">
3016 <glob pattern="*.pub"/>
3017 </mime-type>
3018 <mime-type type="application/x-msschedule">
3019 <glob pattern="*.scd"/>
3020 </mime-type>
3021 <mime-type type="application/x-msterminal">
3022 <glob pattern="*.trm"/>
3023 </mime-type>
3024 <mime-type type="application/x-mswrite">
3025 <glob pattern="*.wri"/>
3026 </mime-type>
3027 <mime-type type="application/x-netcdf">
3028 <glob pattern="*.nc"/>
3029 <glob pattern="*.cdf"/>
3030 </mime-type>
3031 <mime-type type="application/x-pkcs12">
3032 <glob pattern="*.p12"/>
3033 <glob pattern="*.pfx"/>
3034 </mime-type>
3035 <mime-type type="application/x-pkcs7-certificates">
3036 <glob pattern="*.p7b"/>
3037 <glob pattern="*.spc"/>
3038 </mime-type>
3039 <mime-type type="application/x-pkcs7-certreqresp">
3040 <glob pattern="*.p7r"/>
3041 </mime-type>
3042
3043 <mime-type type="application/x-prt">
3044 <glob pattern="*.prt"/>
3045 <magic priority="50">
3046 <match value="0M3C" type="string" offset="8" />
3047 </magic>
3048 </mime-type>
3049
3050 <mime-type type="application/x-quattro-pro">
3051 <glob pattern="*.qpw"/>
3052 <glob pattern="*.wb1"/>
3053 <glob pattern="*.wb2"/>
3054 <glob pattern="*.wb3"/>
3055 <sub-class-of type="application/x-tika-msoffice"/>
3056 </mime-type>
3057
3058 <mime-type type="application/x-rar-compressed"> <!-- detected by magic at offset 0 or by the .rar extension -->
3059 <_comment>RAR archive</_comment>
3060 <alias type="application/x-rar"/>
3061 <magic priority="50">
3062 <match value="Rar!" type="string" offset="0"/>
3063 <match value="\x52\x61\x72\x21\x1a" type="string" offset="0"/> <!-- hex for "Rar!" plus a trailing 0x1a byte; anything matching this also matches the shorter pattern above -->
3064 </magic>
3065 <glob pattern="*.rar"/>
3066 </mime-type>
3067
3068 <mime-type type="application/x-rpm">
3069 <_comment>RedHat Package Manager</_comment>
3070 <glob pattern="*.rpm"/>
3071 <magic priority="50">
3072 <match value="\xed\xab\xee\xdb" type="string" offset="0"/>
3073 </magic>
3074 </mime-type>
3075
3076 <mime-type type="application/x-sas">
3077 <_comment>SAS Program</_comment>
3078 <glob pattern="*.sas"/>
3079 <sub-class-of type="text/plain"/>
3080 </mime-type>
3081 <mime-type type="application/x-sas-program-data">
3082 <_comment>SAS Stored Program (DATA Step)</_comment>
3083 <glob pattern="*.ss7"/>
3084 <glob pattern="*.sas7bpgm"/>
3085 </mime-type>
3086 <mime-type type="application/x-sas-audit">
3087 <_comment>SAS Audit</_comment>
3088 <glob pattern="*.st7"/>
3089 <glob pattern="*.sas7baud"/>
3090 </mime-type>
3091 <mime-type type="application/x-sas-data">
3092 <_comment>SAS Data Set</_comment>
3093 <glob pattern="*.sd7"/>
3094 <glob pattern="*.sas7bdat"/>
3095 <magic priority="40">
3096 <match value="SAS FILE" type="string" offset="84" />
3097 </magic>
3098 </mime-type>
3099 <mime-type type="application/x-sas-view">
3100 <_comment>SAS Data Set View</_comment>
3101 <glob pattern="*.sv7"/>
3102 <glob pattern="*.sas7bvew"/>
3103 </mime-type>
3104 <mime-type type="application/x-sas-data-index">
3105 <_comment>SAS Data Set Index</_comment>
3106 <glob pattern="*.si7"/>
3107 <glob pattern="*.sas7bndx"/>
3108 </mime-type>
3109 <mime-type type="application/x-sas-catalog">
3110 <_comment>SAS Catalog</_comment>
3111 <glob pattern="*.sc7"/>
3112 <glob pattern="*.sas7bcat"/>
3113 </mime-type>
3114 <mime-type type="application/x-sas-access">
3115 <_comment>SAS Access Descriptor</_comment>
3116 <glob pattern="*.sa7"/>
3117 <glob pattern="*.sas7bacs"/>
3118 </mime-type>
3119 <mime-type type="application/x-sas-fdb">
3120 <_comment>SAS FDB Consolidation Database File</_comment>
3121 <glob pattern="*.sf7"/>
3122 <glob pattern="*.sas7bfdb"/>
3123 </mime-type>
3124 <mime-type type="application/x-sas-mddb">
3125 <_comment>SAS MDDB Multi-Dimensional Database File</_comment>
3126 <glob pattern="*.sm7"/>
3127 <glob pattern="*.sas7bmdb"/>
3128 </mime-type>
3129 <mime-type type="application/x-sas-dmdb">
3130 <_comment>SAS DMDB Data Mining Database File</_comment>
3131 <glob pattern="*.s7m"/>
3132 <glob pattern="*.sas7bdmd"/>
3133 </mime-type>
3134 <mime-type type="application/x-sas-itemstor">
3135 <_comment>SAS Item Store (ItemStor) File</_comment>
3136 <glob pattern="*.sr7"/>
3137 <glob pattern="*.sas7bitm"/>
3138 </mime-type>
3139 <mime-type type="application/x-sas-utility">
3140 <_comment>SAS Utility</_comment>
3141 <glob pattern="*.su7"/>
3142 <glob pattern="*.sas7butl"/>
3143 </mime-type>
3144 <mime-type type="application/x-sas-putility">
3145 <_comment>SAS Permanent Utility</_comment>
3146 <glob pattern="*.sp7"/>
3147 <glob pattern="*.sas7bput"/>
3148 </mime-type>
3149 <mime-type type="application/x-sas-transport">
3150 <_comment>SAS Transport File</_comment>
3151 <glob pattern="*.stx"/>
3152 </mime-type>
3153 <mime-type type="application/x-sas-backup">
3154 <_comment>SAS Backup</_comment>
3155 <glob pattern="*.sas7bbak"/>
3156 </mime-type>
3157
3158 <mime-type type="application/x-sc">
3159 <magic priority="50">
3160 <match value="Spreadsheet" type="string" offset="38"/>
3161 </magic>
3162 </mime-type>
3163
3164 <mime-type type="application/x-sh">
3165 <_comment>UNIX/LINUX Shell Script</_comment>
3166 <magic priority="50">
3167 <match value="#!/" type="string" offset="0"/>
3168 <match value="#!\ /" type="string" offset="0"/>
3169 <match value="#!\t/" type="string" offset="0"/>
3170 <match value="eval &quot;exec" type="string" offset="0"/>
3171 </magic>
3172 <glob pattern="*.sh"/>
3173 <glob pattern="*.bash"/>
3174 <sub-class-of type="text/plain"/>
3175 </mime-type>
3176
3177 <mime-type type="application/x-shar">
3178 <glob pattern="*.shar"/>
3179 </mime-type>
3180
3181 <mime-type type="application/x-shockwave-flash">
3182 <acronym>Flash</acronym>
3183 <_comment>Adobe Flash</_comment>
3184 <magic priority="50">
3185 <match value="FWS" type="string" offset="0"/> <!-- F = Uncompressed -->
3186 <match value="CWS" type="string" offset="0"/> <!-- C = Compressed -->
3187 </magic>
3188 <glob pattern="*.swf"/>
3189 </mime-type>
3190
3191 <mime-type type="application/x-silverlight-app">
3192 <glob pattern="*.xap"/>
3193 </mime-type>
3194
3195 <mime-type type="application/x-stuffit">
3196 <magic priority="50">
3197 <match value="StuffIt" type="string" offset="0"/>
3198 </magic>
3199 <glob pattern="*.sit"/>
3200 </mime-type>
3201
3202 <mime-type type="application/x-stuffitx">
3203 <glob pattern="*.sitx"/>
3204 </mime-type>
3205 <mime-type type="application/x-sv4cpio">
3206 <glob pattern="*.sv4cpio"/>
3207 </mime-type>
3208 <mime-type type="application/x-sv4crc">
3209 <glob pattern="*.sv4crc"/>
3210 </mime-type>
3211
3212 <mime-type type="application/x-tar">
3213 <magic priority="40">
3214 <!-- POSIX tar archive -->
3215 <match value="ustar\0" type="string" offset="257" />
3216 </magic>
3217 <glob pattern="*.tar"/>
3218 </mime-type>
3219
3220 <mime-type type="application/x-tex">
3221 <_comment>TeX Source</_comment>
3222 <alias type="text/x-tex"/>
3223 <magic priority="50">
3224 <match value="\\input" type="string" offset="0"/>
3225 <match value="\\section" type="string" offset="0"/>
3226 <match value="\\setlength" type="string" offset="0"/>
3227 <match value="\\documentstyle" type="string" offset="0"/>
3228 <match value="\\chapter" type="string" offset="0"/>
3229 <match value="\\documentclass" type="string" offset="0"/>
3230 <match value="\\relax" type="string" offset="0"/>
3231 <match value="\\contentsline" type="string" offset="0"/>
3232 </magic>
3233 <glob pattern="*.tex"/>
3234 </mime-type>
3235
3236 <mime-type type="application/x-tex-tfm">
3237 <glob pattern="*.tfm"/>
3238 </mime-type>
3239
3240 <mime-type type="application/x-texinfo">
3241 <alias type="text/x-texinfo" />
3242 <magic priority="50">
3243 <match value="\\input\ texinfo" type="string" offset="0"/>
3244 </magic>
3245 <glob pattern="*.texinfo"/>
3246 <glob pattern="*.texi"/>
3247 </mime-type>
3248
3249 <!-- =================================================================== -->
3250 <!-- Microsoft Office binary file formats -->
3251 <!-- http://www.microsoft.com/interop/docs/OfficeBinaryFormats.mspx -->
3252 <!-- =================================================================== -->
3253 <mime-type type="application/x-tika-msoffice">
3254 <magic>
3255 <match value="0xd0cf11e0a1b11ae1" type="string" offset="0:8"/>
3256 </magic>
3257 </mime-type>
3258
3259 <mime-type type="application/x-tika-msoffice-embedded">
3260 <sub-class-of type="application/x-tika-msoffice"/>
3261 </mime-type>
3262 <mime-type type="application/x-tika-msoffice-embedded;format=ole10_native">
3263 <sub-class-of type="application/x-tika-msoffice-embedded"/>
3264 <_comment>OLE10 Native Embedded Document</_comment>
3265 </mime-type>
3266 <mime-type type="application/x-tika-msoffice-embedded;format=comp_obj">
3267 <sub-class-of type="application/x-tika-msoffice-embedded"/>
3268 <_comment>CompObj OLE2 Embedded Document</_comment>
3269 </mime-type>
3270
3271 <mime-type type="application/x-tika-msworks-spreadsheet">
3272 <glob pattern="*.xlr"/>
3273 <sub-class-of type="application/vnd.ms-excel"/>
3274 <!-- this has to be higher than the Excel match -->
3275 <magic priority="60">
3276 <match value="0xd0cf11e0a1b11ae1" type="string" offset="0:8">
3277 <match value="W\x00k\x00s\x00S\x00S\x00W\x00o\x00r\x00k\x00B\x00o\x00o\x00k" type="string" offset="1152:4096" />
3278 </match>
3279 </magic>
3280 </mime-type>
3281
3282 <!-- =================================================================== -->
3283 <!-- Office Open XML file formats -->
3284 <!-- http://www.ecma-international.org/publications/standards/Ecma-376.htm -->
3285 <!-- =================================================================== -->
3286 <mime-type type="application/x-tika-ooxml">
3287 <sub-class-of type="application/zip"/>
3288 <magic priority="50">
3289 <match value="PK\003\004" type="string" offset="0">
3290 <match value="[Content_Types].xml" type="string" offset="30"/>
3291 </match>
3292 </magic>
3293 </mime-type>
3294
3295 <!-- Note - password protected OOXML files are actually stored in -->
3296 <!-- an OLE2 (application/x-tika-msoffice) container -->
3297 <mime-type type="application/x-tika-ooxml-protected">
3298 <sub-class-of type="application/x-tika-ooxml"/>
3299 <_comment>Password Protected OOXML File</_comment>
3300 </mime-type>
3301
3302 <mime-type type="application/x-uc2-compressed">
3303 <magic priority="50">
3304 <match value="UC2\x1a" type="string" offset="0" />
3305 </magic>
3306 <glob pattern="*.uc2"/>
3307 </mime-type>
3308 <mime-type type="application/x-ustar">
3309 <glob pattern="*.ustar"/>
3310 </mime-type>
3311 <mime-type type="application/x-wais-source">
3312 <glob pattern="*.src"/>
3313 </mime-type>
3314 <mime-type type="application/x-webarchive">
3315 <sub-class-of type="application/x-bplist"/>
3316 <glob pattern="*.webarchive"/>
3317 </mime-type>
3318 <mime-type type="application/x-x509-ca-cert">
3319 <glob pattern="*.der"/>
3320 <glob pattern="*.crt"/>
3321 </mime-type>
3322 <mime-type type="application/x-xfig">
3323 <glob pattern="*.fig"/>
3324 </mime-type>
3325 <mime-type type="application/x-xpinstall">
3326 <glob pattern="*.xpi"/>
3327 </mime-type>
3328
3329 <mime-type type="application/x-xmind">
3330 <_comment>XMind Pro</_comment>
3331 <sub-class-of type="application/zip"/>
3332 <glob pattern="*.xmind"/>
3333 <!-- .xmap is also used, but that extension is more common elsewhere -->
3334 <!-- <glob pattern="*.xmap"/> -->
3335 </mime-type>
3336
3337 <mime-type type="application/x-xz">
3338 <glob pattern="*.xz"/>
3339 <magic priority="50">
3340 <match value="\3757zXZ\000" type="string" offset="0"/>
3341 </magic>
3342 </mime-type>
3343
3344 <mime-type type="application/x-zoo">
3345 <magic priority="50">
3346 <match value="0xfdc4a7dc" type="little32" offset="20"/>
3347 </magic>
3348 <glob pattern="*.zoo"/>
3349 </mime-type>
3350
3351 <mime-type type="application/x400-bp"/>
3352 <mime-type type="application/xcap-att+xml"/>
3353 <mime-type type="application/xcap-caps+xml"/>
3354 <mime-type type="application/xcap-el+xml"/>
3355 <mime-type type="application/xcap-error+xml"/>
3356 <mime-type type="application/xcap-ns+xml"/>
3357 <mime-type type="application/xcon-conference-info-diff+xml"/>
3358 <mime-type type="application/xcon-conference-info+xml"/>
3359 <mime-type type="application/xenc+xml">
3360 <glob pattern="*.xenc"/>
3361 </mime-type>
3362
3363 <mime-type type="application/xhtml+xml">
3364 <magic priority="50">
3365 <match value="&lt;html xmlns=" type="string" offset="0:8192"/>
3366 </magic>
3367 <root-XML namespaceURI="http://www.w3.org/1999/xhtml" localName="html"/>
3368 <glob pattern="*.xhtml"/>
3369 <glob pattern="*.xht"/>
3370 </mime-type>
3371
3372 <mime-type type="application/xhtml-voice+xml"/>
3373
3374 <mime-type type="application/xml">
3375 <acronym>XML</acronym>
3376 <_comment>Extensible Markup Language</_comment>
3377 <tika:link>http://en.wikipedia.org/wiki/Xml</tika:link>
3378 <tika:uti>public.xml</tika:uti>
3379 <alias type="text/xml"/>
3380 <magic priority="50">
3381 <match value="&lt;?xml" type="string" offset="0"/>
3382 <match value="&lt;?XML" type="string" offset="0"/>
3383 <match value="&lt;!--" type="string" offset="0"/>
3384 <!-- UTF-8 BOM -->
3385 <match value="0xEFBBBF3C3F786D6C" type="string" offset="0"/>
3386 <!-- UTF-16 LE/BE -->
3387 <match value="0xFFFE3C003F0078006D006C00" type="string" offset="0"/>
3388 <match value="0xFEFF003C003F0078006D006C" type="string" offset="0"/>
3389 <!-- TODO: Add matches for the other possible XML encoding schemes -->
3390 </magic>
3391 <glob pattern="*.xml"/>
3392 <glob pattern="*.xsl"/>
3393 <glob pattern="*.xsd"/>
3394 <sub-class-of type="text/plain" />
3395 </mime-type>
3396
3397 <mime-type type="application/xml-dtd">
3398 <_comment>XML Document Type Definition</_comment>
3399 <sub-class-of type="text/plain"/>
3400 <alias type="text/x-dtd"/>
3401 <glob pattern="*.dtd"/>
3402 </mime-type>
3403
3404 <mime-type type="application/xml-external-parsed-entity">
3405 <alias type="text/xml-external-parsed-entity"/>
3406 </mime-type>
3407
3408 <mime-type type="application/xmpp+xml"/>
3409 <mime-type type="application/xop+xml">
3410 <glob pattern="*.xop"/>
3411 </mime-type>
3412
3413 <mime-type type="application/xslt+xml">
3414 <alias type="text/xsl"/>
3415 <acronym>XSLT</acronym>
3416 <_comment>XSL Transformations</_comment>
3417 <root-XML localName="stylesheet"
3418 namespaceURI="http://www.w3.org/1999/XSL/Transform"/>
3419 <glob pattern="*.xslt"/>
3420 </mime-type>
3421
3422 <mime-type type="application/xspf+xml">
3423 <glob pattern="*.xspf"/>
3424 </mime-type>
3425 <mime-type type="application/xv+xml">
3426 <glob pattern="*.mxml"/>
3427 <glob pattern="*.xhvml"/>
3428 <glob pattern="*.xvml"/>
3429 <glob pattern="*.xvm"/>
3430 </mime-type>
3431
3432 <mime-type type="application/zip">
3433 <_comment>Compressed Archive File</_comment>
3434 <tika:link>http://en.wikipedia.org/wiki/ZIP_(file_format)</tika:link>
3435 <tika:uti>com.pkware.zip-archive</tika:uti>
3436 <alias type="application/x-zip-compressed"/>
3437 <magic priority="40">
3438 <match value="PK\003\004" type="string" offset="0"/>
3439 </magic>
3440 <glob pattern="*.zip"/>
3441 </mime-type>
3442
3443 <mime-type type="application/x-7z-compressed">
3444 <acronym>7zip</acronym>
3445 <_comment>7-zip archive</_comment>
3446 <magic priority="50">
3447 <!-- Magic: '7', 'z', 0xBC, 0xAF, 0x27, 0x1C -->
3448 <match value="7z" type="string" offset="0:1" >
3449 <match value="0xBCAF271C" type="string" offset="2:5" />
3450 </match>
3451 </magic>
3452 <glob pattern="*.7z" />
3453 </mime-type>
3454
3455 <mime-type type="audio/32kadpcm"/>
3456 <mime-type type="audio/3gpp"/>
3457 <mime-type type="audio/3gpp2"/>
3458 <mime-type type="audio/ac3"/>
3459 <mime-type type="audio/adpcm">
3460 <glob pattern="*.adp"/>
3461 </mime-type>
3462
3463 <mime-type type="audio/amr">
3464 <glob pattern="*.amr"/>
3465 <magic priority="40">
3466 <!-- Specific match for the original AMR format -->
3467 <match value="#!AMR\n" type="string" offset="0"/>
3468 <!-- General match for AMR subtypes we don't have entries for -->
3469 <match value="#!AMR" type="string" offset="0"/>
3470 </magic>
3471 </mime-type>
3472 <mime-type type="audio/amr-wb">
3473 <sub-class-of type="audio/amr"/>
3474 <magic priority="50">
3475 <match value="#!AMR-WB\n" type="string" offset="0"/>
3476 </magic>
3477 </mime-type>
3478 <mime-type type="audio/amr-wb+">
3479 <sub-class-of type="audio/amr"/>
3480 <!-- TIKA-1156 sample needed - might be "#!AMR-WB+\n" ? -->
3481 </mime-type>
3482
3483 <mime-type type="audio/asc"/>
3484
3485 <mime-type type="audio/basic">
3486 <_comment>uLaw/AU Audio File</_comment>
3487 <magic priority="20">
3488 <match value=".snd" type="string" offset="0">
3489 <match value="1" type="big32" offset="12"/>
3490 <match value="2" type="big32" offset="12"/>
3491 <match value="3" type="big32" offset="12"/>
3492 <match value="4" type="big32" offset="12"/>
3493 <match value="5" type="big32" offset="12"/>
3494 <match value="6" type="big32" offset="12"/>
3495 <match value="7" type="big32" offset="12"/>
3496 </match>
3497 <match offset="0" type="string" value="\x2e\x73\x6e\x64\x00\x00\x00"/>
3498 </magic>
3499 <glob pattern="*.au"/>
3500 <glob pattern="*.snd"/>
3501 </mime-type>
3502
3503 <mime-type type="audio/bv16"/>
3504 <mime-type type="audio/bv32"/>
3505 <mime-type type="audio/clearmode"/>
3506 <mime-type type="audio/cn"/>
3507 <mime-type type="audio/dat12"/>
3508 <mime-type type="audio/dls"/>
3509 <mime-type type="audio/dsr-es201108"/>
3510 <mime-type type="audio/dsr-es202050"/>
3511 <mime-type type="audio/dsr-es202211"/>
3512 <mime-type type="audio/dsr-es202212"/>
3513 <mime-type type="audio/dvi4"/>
3514 <mime-type type="audio/eac3"/>
3515 <mime-type type="audio/evrc"/>
3516 <mime-type type="audio/evrc-qcp"/>
3517 <mime-type type="audio/evrc0"/>
3518 <mime-type type="audio/evrc1"/>
3519 <mime-type type="audio/evrcb"/>
3520 <mime-type type="audio/evrcb0"/>
3521 <mime-type type="audio/evrcb1"/>
3522 <mime-type type="audio/evrcwb"/>
3523 <mime-type type="audio/evrcwb0"/>
3524 <mime-type type="audio/evrcwb1"/>
3525 <mime-type type="audio/example"/>
3526 <mime-type type="audio/g719"/>
3527 <mime-type type="audio/g722"/>
3528 <mime-type type="audio/g7221"/>
3529 <mime-type type="audio/g723"/>
3530 <mime-type type="audio/g726-16"/>
3531 <mime-type type="audio/g726-24"/>
3532 <mime-type type="audio/g726-32"/>
3533 <mime-type type="audio/g726-40"/>
3534 <mime-type type="audio/g728"/>
3535 <mime-type type="audio/g729"/>
3536 <mime-type type="audio/g7291"/>
3537 <mime-type type="audio/g729d"/>
3538 <mime-type type="audio/g729e"/>
3539 <mime-type type="audio/gsm"/>
3540 <mime-type type="audio/gsm-efr"/>
3541 <mime-type type="audio/ilbc"/>
3542 <mime-type type="audio/l16"/>
3543 <mime-type type="audio/l20"/>
3544 <mime-type type="audio/l24"/>
3545 <mime-type type="audio/l8"/>
3546 <mime-type type="audio/lpc"/>
3547
3548 <mime-type type="audio/midi">
3549 <acronym>MIDI</acronym>
3550 <_comment>Musical Instrument Digital Interface</_comment>
3551 <magic priority ="20">
3552 <match type="string" value="MThd" offset="0"/>
3553 </magic>
3554 <glob pattern="*.mid"/>
3555 <glob pattern="*.midi"/>
3556 <glob pattern="*.kar"/>
3557 <glob pattern="*.rmi"/>
3558 </mime-type>
3559
3560 <mime-type type="audio/mobile-xmf"/>
3561 <mime-type type="audio/mp4">
3562 <alias type="audio/x-m4a"/>
3563 <alias type="audio/x-mp4a"/>
3564 <magic priority="60">
3565 <match value="ftypM4A " type="string" offset="4"/>
3566 <match value="ftypM4B " type="string" offset="4"/>
3567 <match value="ftypF4A " type="string" offset="4"/>
3568 <match value="ftypF4B " type="string" offset="4"/>
3569 </magic>
3570 <glob pattern="*.mp4a"/>
3571 <glob pattern="*.m4a"/>
3572 <glob pattern="*.m4b"/>
3573 </mime-type>
3574 <mime-type type="audio/mp4a-latm"/>
3575 <mime-type type="audio/mpa"/>
3576 <mime-type type="audio/mpa-robust"/>
3577
3578 <mime-type type="audio/mpeg">
3579 <alias type="audio/x-mpeg"/>
3580 <acronym>MP3</acronym>
3581 <_comment>MPEG-1 Audio Layer 3</_comment>
3582 <magic priority="20">
3583 <!-- http://mpgedit.org/mpgedit/mpeg_format/MP3Format.html -->
3584 <!-- Bit pattern for first two bytes: 11111111 111VVLLC -->
3585 <!-- VV = MPEG Audio Version ID; 10 = V2, 11 = V1 -->
3586 <!-- LL = Layer description; 01 = L3, 10 = L2, 11 = L1 -->
3587 <!-- C = Protection bit; 0 = CRC, 1 = no CRC -->
3588 <match value="0xfff2" type="string" offset="0"/> <!-- V2, L3, CRC -->
3589 <match value="0xfff3" type="string" offset="0"/> <!-- V2, L3 -->
3590 <match value="0xfff4" type="string" offset="0"/> <!-- V2, L2, CRC -->
3591 <match value="0xfff5" type="string" offset="0"/> <!-- V2, L2 -->
3592 <match value="0xfff6" type="string" offset="0"/> <!-- V2, L1, CRC -->
3593 <match value="0xfff7" type="string" offset="0"/> <!-- V2, L1 -->
3594 <match value="0xfffa" type="string" offset="0"/> <!-- V1, L3, CRC -->
3595 <match value="0xfffb" type="string" offset="0"/> <!-- V1, L3 -->
3596 <match value="0xfffc" type="string" offset="0"/> <!-- V1, L2, CRC -->
3597 <match value="0xfffd" type="string" offset="0"/> <!-- V1, L2 -->
3598 <!-- TIKA-417: This is the UTF-16 LE byte order mark! -->
3599 <!-- match value="0xfffe" type="string" offset="0"/ --> <!-- V1, L1, CRC -->
3600 <match value="0xffff" type="string" offset="0"/> <!-- V1, L1 -->
3601 <match value="ID3" type="string" offset="0"/>
3602 </magic>
3603 <glob pattern="*.mpga"/>
3604 <glob pattern="*.mp2"/>
3605 <glob pattern="*.mp2a"/>
3606 <glob pattern="*.mp3"/>
3607 <glob pattern="*.m2a"/>
3608 <glob pattern="*.m3a"/>
3609 </mime-type>
3610
3611 <mime-type type="audio/mpeg4-generic"/>
3612
3613 <mime-type type="audio/ogg">
3614 <_comment>Ogg Vorbis Codec Compressed WAV File</_comment>
3615 <alias type="application/x-ogg"/>
3616 <magic priority="60">
3617 <!-- For a single stream file -->
3618 <match value="OggS\000.......................\001vorbis" type="string"
3619 mask="0xFFFFFFFF00000000000000000000000000000000000000000000000000FFFFFFFFFFFF"
3620 offset="0"/>
3621 <match value="\x4f\x67\x67\x53\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00"
3622 type="string" offset="0"/>
3623 </magic>
3624 <glob pattern="*.oga"/>
3625 <glob pattern="*.ogg"/>
3626 <glob pattern="*.spx"/>
3627 <sub-class-of type="application/ogg"/>
3628 </mime-type>
3629
3630 <mime-type type="audio/parityfec"/>
3631 <mime-type type="audio/pcma"/>
3632 <mime-type type="audio/pcma-wb"/>
3633 <mime-type type="audio/pcmu-wb"/>
3634 <mime-type type="audio/pcmu"/>
3635
3636 <mime-type type="audio/prs.sid">
3637 <magic priority="50">
3638 <match value="PSID" type="string" offset="0"/>
3639 </magic>
3640 </mime-type>
3641
3642 <mime-type type="audio/qcelp"/>
3643 <mime-type type="audio/red"/>
3644 <mime-type type="audio/rtp-enc-aescm128"/>
3645 <mime-type type="audio/rtp-midi"/>
3646 <mime-type type="audio/rtx"/>
3647 <mime-type type="audio/smv"/>
3648 <mime-type type="audio/smv0"/>
3649 <mime-type type="audio/smv-qcp"/>
3650 <mime-type type="audio/sp-midi"/>
3651 <mime-type type="audio/t140c"/>
3652 <mime-type type="audio/t38"/>
3653 <mime-type type="audio/telephone-event"/>
3654 <mime-type type="audio/tone"/>
3655 <mime-type type="audio/ulpfec"/>
3656 <mime-type type="audio/vdvi"/>
3657 <mime-type type="audio/vmr-wb"/>
3658 <mime-type type="audio/vnd.3gpp.iufp"/>
3659 <mime-type type="audio/vnd.4sb"/>
3660 <mime-type type="audio/vnd.audiokoz"/>
3661 <mime-type type="audio/vnd.adobe.soundbooth">
3662 <glob pattern="*.asnd"/>
3663 </mime-type>
3664 <mime-type type="audio/vnd.celp"/>
3665 <mime-type type="audio/vnd.cisco.nse"/>
3666 <mime-type type="audio/vnd.cmles.radio-events"/>
3667 <mime-type type="audio/vnd.cns.anp1"/>
3668 <mime-type type="audio/vnd.cns.inf1"/>
3669 <mime-type type="audio/vnd.digital-winds">
3670 <glob pattern="*.eol"/>
3671 </mime-type>
3672 <mime-type type="audio/vnd.dlna.adts"/>
3673 <mime-type type="audio/vnd.dolby.heaac.1"/>
3674 <mime-type type="audio/vnd.dolby.heaac.2"/>
3675 <mime-type type="audio/vnd.dolby.mlp"/>
3676 <mime-type type="audio/vnd.dolby.mps"/>
3677 <mime-type type="audio/vnd.dolby.pl2"/>
3678 <mime-type type="audio/vnd.dolby.pl2x"/>
3679 <mime-type type="audio/vnd.dolby.pl2z"/>
3680 <mime-type type="audio/vnd.dts">
3681 <glob pattern="*.dts"/>
3682 </mime-type>
3683 <mime-type type="audio/vnd.dts.hd">
3684 <glob pattern="*.dtshd"/>
3685 </mime-type>
3686 <mime-type type="audio/vnd.everad.plj"/>
3687 <mime-type type="audio/vnd.hns.audio"/>
3688 <mime-type type="audio/vnd.lucent.voice">
3689 <glob pattern="*.lvp"/>
3690 </mime-type>
3691 <mime-type type="audio/vnd.ms-playready.media.pya">
3692 <glob pattern="*.pya"/>
3693 </mime-type>
3694 <mime-type type="audio/vnd.nokia.mobile-xmf"/>
3695 <mime-type type="audio/vnd.nortel.vbk"/>
3696 <mime-type type="audio/vnd.nuera.ecelp4800">
3697 <glob pattern="*.ecelp4800"/>
3698 </mime-type>
3699 <mime-type type="audio/vnd.nuera.ecelp7470">
3700 <glob pattern="*.ecelp7470"/>
3701 </mime-type>
3702 <mime-type type="audio/vnd.nuera.ecelp9600">
3703 <glob pattern="*.ecelp9600"/>
3704 </mime-type>
3705 <mime-type type="audio/vnd.octel.sbc"/>
3706 <mime-type type="audio/vnd.qcelp"/>
3707 <mime-type type="audio/vnd.rhetorex.32kadpcm"/>
3708 <mime-type type="audio/vnd.sealedmedia.softseal.mpeg"/>
3709 <mime-type type="audio/vnd.vmx.cvsd"/>
3710 <mime-type type="audio/vorbis"/>
3711 <mime-type type="audio/vorbis-config"/>
3712 <mime-type type="audio/x-aac">
3713 <glob pattern="*.aac"/>
3714 </mime-type>
3715
3716 <mime-type type="audio/x-adbcm">
3717 <magic priority="20">
3718 <match value=".snd" type="string" offset="0">
3719 <match value="23" type="big32" offset="12"/>
3720 </match>
3721 </magic>
3722 </mime-type>
3723
3724 <mime-type type="audio/x-aiff">
3725 <alias type="audio/aiff"/>
3726 <acronym>AIFF</acronym>
3727 <_comment>Audio Interchange File Format</_comment>
3728 <magic priority="20">
3729 <match value="FORM....AIFF" type="string" offset="0"
3730 mask="0xFFFFFFFF00000000FFFFFFFF"/>
3731 <match value="FORM....AIFC" type="string" offset="0"
3732 mask="0xFFFFFFFF00000000FFFFFFFF"/>
3733 <!-- Amiga IFF sound sample, somewhat like the more modern AIFF -->
3734 <match value="FORM....8SVX" type="string" offset="0"
3735 mask="0xFFFFFFFF00000000FFFFFFFF"/>
3736 <match offset="0" type="string" value="\x46\x4f\x52\x4d\x00"/>
3737 </magic>
3738 <glob pattern="*.aif"/>
3739 <glob pattern="*.aiff"/>
3740 <glob pattern="*.aifc"/>
3741 </mime-type>
3742
3743 <mime-type type="audio/x-dec-basic">
3744 <magic priority="20">
3745 <match value="0x0064732E" type="big32" offset="0">
3746 <match value="1" type="big32" offset="12"/>
3747 <match value="2" type="big32" offset="12"/>
3748 <match value="3" type="big32" offset="12"/>
3749 <match value="4" type="big32" offset="12"/>
3750 <match value="5" type="big32" offset="12"/>
3751 <match value="6" type="big32" offset="12"/>
3752 <match value="7" type="big32" offset="12"/>
3753 </match>
3754 </magic>
3755 </mime-type>
3756
3757 <mime-type type="audio/x-dec-adbcm">
3758 <magic priority="20">
3759 <match value="0x0064732E" type="big32" offset="0">
3760 <match value="23" type="big32" offset="12"/>
3761 </match>
3762 </magic>
3763 </mime-type>
3764
3765 <mime-type type="audio/x-flac">
3766 <acronym>FLAC</acronym>
3767 <_comment>Free Lossless Audio Codec</_comment>
3768 <magic priority="50">
3769 <match value="fLaC" type="string" offset="0"/>
3770 </magic>
3771 <glob pattern="*.flac"/>
3772 </mime-type>
3773
3774 <mime-type type="audio/x-mod">
3775 <acronym>MOD</acronym>
3776 <magic priority="50">
3777 <match value="Extended\ Module:" type="string" offset="0"/>
3778 <match value="BMOD2STM" type="string" offset="21"/>
3779 <match value="M.K." type="string" offset="1080"/>
3780 <match value="M!K!" type="string" offset="1080"/>
3781 <match value="FLT4" type="string" offset="1080"/>
3782 <match value="FLT8" type="string" offset="1080"/>
3783 <match value="4CHN" type="string" offset="1080"/>
3784 <match value="6CHN" type="string" offset="1080"/>
3785 <match value="8CHN" type="string" offset="1080"/>
3786 <match value="CD81" type="string" offset="1080"/>
3787 <match value="OKTA" type="string" offset="1080"/>
3788 <match value="16CN" type="string" offset="1080"/>
3789 <match value="32CN" type="string" offset="1080"/>
3790 <match value="IMPM" type="string" offset="0"/>
3791 </magic>
3792 <glob pattern="*.mod"/>
3793 </mime-type>
3794
3795 <mime-type type="audio/x-mpegurl">
3796 <_comment>MP3 Playlist File</_comment>
3797 <magic priority="50">
3798 <match offset="0" type="string" value="\x23\x45\x58\x54\x4d\x33\x55\x0d\x0a"/>
3799 </magic>
3800 <glob pattern="*.m3u"/>
3801 </mime-type>
3802
3803 <mime-type type="audio/x-ms-wax">
3804 <glob pattern="*.wax"/>
3805 </mime-type>
3806 <mime-type type="audio/x-ms-wma">
3807 <sub-class-of type="video/x-ms-asf" />
3808 <glob pattern="*.wma"/>
3809 <magic priority="50">
3810 <match value="Windows Media Audio" type="unicodeLE" offset="0:8192" />
3811 </magic>
3812 </mime-type>
3813
3814 <mime-type type="audio/x-pn-realaudio">
3815 <_comment>Real Audio</_comment>
3816 <alias type="audio/x-realaudio" />
3817 <magic priority="50">
3818 <match value="0x2e7261fd" type="big32" offset="0"/>
3819 </magic>
3820 <glob pattern="*.ram"/>
3821 <glob pattern="*.ra"/>
3822 </mime-type>
3823
3824 <mime-type type="audio/x-pn-realaudio-plugin">
3825 <_comment>RealMedia Player Plug-in</_comment>
3826 <glob pattern="*.rmp"/>
3827 <!-- <glob pattern="*.rpm"/> - conflicts with application/x-rpm -->
3828 </mime-type>
3829
3830 <mime-type type="audio/x-wav">
3831 <acronym>WAV</acronym>
3832 <magic priority="20">
3833 <match value="RIFF....WAVE" type="string" offset="0"
3834 mask="0xFFFFFFFF00000000FFFFFFFF"/>
3835 </magic>
3836 <glob pattern="*.wav"/>
3837 </mime-type>
3838
3839 <mime-type type="chemical/x-cdx">
3840 <magic priority="50">
3841 <match value="VjCD0100" type="string" offset="0"/>
3842 </magic>
3843 <glob pattern="*.cdx"/>
3844 </mime-type>
3845 <mime-type type="chemical/x-cif">
3846 <glob pattern="*.cif"/>
3847 </mime-type>
3848 <mime-type type="chemical/x-cmdf">
3849 <glob pattern="*.cmdf"/>
3850 </mime-type>
3851 <mime-type type="chemical/x-cml">
3852 <glob pattern="*.cml"/>
3853 </mime-type>
3854 <mime-type type="chemical/x-csml">
3855 <glob pattern="*.csml"/>
3856 </mime-type>
3857
3858 <mime-type type="chemical/x-pdb">
3859 <_comment>Brookhaven Protein Databank File</_comment>
3860 <glob pattern="*.pdb"/>
3861 </mime-type>
3862
3863 <mime-type type="chemical/x-xyz">
3864 <glob pattern="*.xyz"/>
3865 </mime-type>
3866
3867 <mime-type type="image/x-ms-bmp">
3868 <alias type="image/bmp"/>
3869 <acronym>BMP</acronym>
3870 <_comment>Windows bitmap</_comment>
3871 <tika:link>http://en.wikipedia.org/wiki/BMP_file_format</tika:link>
3872 <tika:uti>com.microsoft.bmp</tika:uti>
3873 <magic priority="50">
3874 <match value="BM" type="string" offset="0">
3875 <match value="0x0100" type="string" offset="26">
3876 <match value="0x0000" type="string" offset="28"/>
3877 <match value="0x0100" type="string" offset="28"/>
3878 <match value="0x0400" type="string" offset="28"/>
3879 <match value="0x0800" type="string" offset="28"/>
3880 <match value="0x1000" type="string" offset="28"/>
3881 <match value="0x1800" type="string" offset="28"/>
3882 <match value="0x2000" type="string" offset="28"/>
3883 </match>
3884 </match>
3885 </magic>
3886 <glob pattern="*.bmp"/>
3887 <glob pattern="*.dib"/>
3888 </mime-type>
3889
3890 <mime-type type="image/cgm">
3891 <acronym>CGM</acronym>
3892 <_comment>Computer Graphics Metafile</_comment>
3893 <magic priority="50">
3894 <match value="BEGMF" type="string" offset="0"/>
3895 <match value="0x0020" mask="0xffe0" type="string" offset="0">
3896 <match value="0x10220001" type="string" offset="2:64"/>
3897 <match value="0x10220002" type="string" offset="2:64"/>
3898 <match value="0x10220003" type="string" offset="2:64"/>
3899 <match value="0x10220004" type="string" offset="2:64"/>
3900 </match>
3901 </magic>
3902 <glob pattern="*.cgm"/>
3903 </mime-type>
3904
3905 <mime-type type="image/example"/>
3906 <mime-type type="image/fits"/>
3907 <mime-type type="image/g3fax">
3908 <glob pattern="*.g3"/>
3909 </mime-type>
3910
3911 <mime-type type="image/gif">
3912 <acronym>GIF</acronym>
3913 <_comment>Graphics Interchange Format</_comment>
3914 <tika:link>http://en.wikipedia.org/wiki/Gif</tika:link>
3915 <tika:uti>com.compuserve.gif</tika:uti>
3916 <magic priority="50">
3917 <match value="GIF87a" type="string" offset="0"/>
3918 <match value="GIF89a" type="string" offset="0"/>
3919 </magic>
3920 <glob pattern="*.gif"/>
3921 </mime-type>
3922
3923 <mime-type type="image/ief">
3924 <glob pattern="*.ief"/>
3925 </mime-type>
3926
3927 <mime-type type="image/jp2">
3928 <sub-class-of type="image/x-jp2-container" />
3929 <acronym>JP2</acronym>
3930 <_comment>JPEG 2000 Part 1 (JP2)</_comment>
3931 <magic priority="50">
3932 <match value="0x0000000C6A5020200D0A870A" type="string" offset="0">
3933 <match value="0x6a703220" type="string" offset="20"/>
3934 </match>
3935 </magic>
3936 <glob pattern="*.jp2"/>
3937 </mime-type>
3938
3939 <mime-type type="image/jpeg">
3940 <acronym>JPEG</acronym>
3941 <_comment>Joint Photographic Experts Group</_comment>
3942 <tika:link>http://en.wikipedia.org/wiki/Jpeg</tika:link>
3943 <tika:uti>public.jpeg</tika:uti>
3944 <magic priority="50">
3945 <!-- FFD8 is the SOI (Start Of Image) marker. -->
3946 <!-- It is followed by another marker that starts with FF. -->
3947 <match value="0xffd8ff" type="string" offset="0"/>
3948 </magic>
3949 <glob pattern="*.jpg"/>
3950 <glob pattern="*.jpeg"/>
3951 <glob pattern="*.jpe"/>
3952 <glob pattern="*.jif"/>
3953 <glob pattern="*.jfif"/>
3954 <glob pattern="*.jfi"/>
3955 </mime-type>
3956
3957 <mime-type type="image/jpm">
3958 <alias type="video/jpm"/>
3959 <sub-class-of type="image/x-jp2-container" />
3960 <acronym>JP2</acronym>
3961 <_comment>JPEG 2000 Part 6 (JPM)</_comment>
3962 <magic priority="50">
3963 <match value="0x0000000C6A5020200D0A870A" type="string" offset="0">
3964 <match value="0x6a706d20" type="string" offset="20"/>
3965 </match>
3966 </magic>
3967 <glob pattern="*.jpm"/>
3968 <glob pattern="*.jpgm"/>
3969 </mime-type>
3970
3971 <mime-type type="image/jpx">
3972 <sub-class-of type="image/x-jp2-container" />
3973 <acronym>JP2</acronym>
3974 <_comment>JPEG 2000 Part 2 (JPX)</_comment>
3975 <magic priority="50">
3976 <match value="0x0000000C6A5020200D0A870A" type="string" offset="0">
3977 <match value="0x6a707820" type="string" offset="20"/>
3978 </match>
3979 </magic>
3980 <glob pattern="*.jpf"/>
3981 </mime-type>
3982
3983 <mime-type type="image/naplps"/>
3984
3985 <mime-type type="image/nitf">
3986 <alias type="image/ntf"/>
3987 <magic priority="50">
3988 <match value="NITF01.10" type="string" offset="0"/>
3989 <match value="NITF02.000" type="string" offset="0"/>
3990 <match value="NITF02.100" type="string" offset="0"/>
3991 </magic>
3992 <glob pattern="*.ntf"/>
3993 <glob pattern="*.nitf"/>
3994 </mime-type>
3995
3996 <mime-type type="image/png">
3997 <acronym>PNG</acronym>
3998 <_comment>Portable Network Graphics</_comment>
3999 <magic priority="50">
4000 <match value="\x89PNG\x0d\x0a\x1a\x0a" type="string" offset="0"/>
4001 </magic>
4002 <glob pattern="*.png"/>
4003 </mime-type>
4004
4005 <mime-type type="image/prs.btif">
4006 <glob pattern="*.btif"/>
4007 </mime-type>
4008 <mime-type type="image/prs.pti"/>
4009
4010 <mime-type type="image/svg+xml">
4011 <sub-class-of type="application/xml"/>
4012 <acronym>SVG</acronym>
4013 <_comment>Scalable Vector Graphics</_comment>
4014 <root-XML localName="svg" namespaceURI="http://www.w3.org/2000/svg"/>
4015 <glob pattern="*.svg"/>
4016 <glob pattern="*.svgz"/>
4017 </mime-type>
4018
4019 <mime-type type="image/t38"/>
4020
4021 <mime-type type="image/tiff">
4022 <acronym>TIFF</acronym>
4023 <_comment>Tagged Image File Format</_comment>
4024 <magic priority="50">
4025 <!-- MM.* = Big endian (M=Motorola) and 0x002a in big endian -->
4026 <match value="MM\x00\x2a" type="string" offset="0"/>
4027 <!-- II*. = Little endian (I=Intel) and 0x002a in little endian -->
4028 <match value="II\x2a\x00" type="string" offset="0"/>
4029 </magic>
4030 <glob pattern="*.tiff"/>
4031 <glob pattern="*.tif"/>
4032 </mime-type>
4033
4034 <mime-type type="image/tiff-fx"/>
4035
4036 <mime-type type="image/vnd.adobe.photoshop">
4037 <_comment>Photoshop Image</_comment>
4038 <alias type="image/x-psd"/>
4039 <alias type="application/photoshop"/>
4040 <glob pattern="*.psd"/>
4041 <magic priority="50">
4042 <!-- Version of 0x0001 is PSD -->
4043 <match value="8BPS\x00\x01" type="string" offset="0"/>
4044 <!-- Version of 0x0002 is PSB -->
4045 <match value="8BPS\x00\x02" type="string" offset="0"/>
4046 </magic>
4047 </mime-type>
4048
4049 <mime-type type="image/vnd.adobe.premiere">
4050 <glob pattern="*.ppj"/>
4051 <root-XML localName="PremiereData"/>
4052 <sub-class-of type="application/xml"/>
4053 </mime-type>
4054
4055 <mime-type type="image/vnd.cns.inf2"/>
4056 <mime-type type="image/vnd.djvu">
4057 <glob pattern="*.djvu"/>
4058 <glob pattern="*.djv"/>
4059 </mime-type>
4060
4061 <mime-type type="image/vnd.dwg">
4062 <acronym>DWG</acronym>
4063 <_comment>AutoCad Drawing</_comment>
4064 <tika:link>http://en.wikipedia.org/wiki/.dwg</tika:link>
4065 <alias type="image/x-dwg"/>
4066 <alias type="application/acad"/>
4067 <alias type="application/x-acad"/>
4068 <alias type="application/autocad_dwg"/>
4069 <alias type="application/dwg"/>
4070 <alias type="application/x-dwg"/>
4071 <alias type="application/x-autocad"/>
4072 <alias type="image/vnd.dwg"/>
4073 <alias type="drawing/dwg"/>
4074 <glob pattern="*.dwg"/>
4075 <magic priority="50">
4076 <match value="MC0.0" type="string" offset="0"/>
4077 <match value="AC1.2" type="string" offset="0"/>
4078 <match value="AC1.40" type="string" offset="0"/>
4079 <match value="AC1.50" type="string" offset="0"/>
4080 <match value="AC2.10" type="string" offset="0"/>
4081 <match value="AC2.21" type="string" offset="0"/>
4082 <match value="AC2.22" type="string" offset="0"/>
4083 <!-- "AC" followed by four numbers -->
4084 <match value="AC0000" type="string" offset="0"
4085 mask="0xFFFFF0F0F0F0"/>
4086 </magic>
4087 </mime-type>
4088
4089 <mime-type type="image/vnd.dxf">
4090 <acronym>DXF</acronym>
4091 <_comment>AutoCAD DXF</_comment>
4092 <tika:link>http://en.wikipedia.org/wiki/AutoCAD_DXF</tika:link>
4093 <glob pattern="*.dxf"/>
4094 </mime-type>
4095 <mime-type type="image/vnd.fastbidsheet">
4096 <glob pattern="*.fbs"/>
4097 </mime-type>
4098 <mime-type type="image/vnd.fpx">
4099 <glob pattern="*.fpx"/>
4100 </mime-type>
4101 <mime-type type="image/vnd.fst">
4102 <glob pattern="*.fst"/>
4103 </mime-type>
4104 <mime-type type="image/vnd.fujixerox.edmics-mmr">
4105 <glob pattern="*.mmr"/>
4106 </mime-type>
4107 <mime-type type="image/vnd.fujixerox.edmics-rlc">
4108 <glob pattern="*.rlc"/>
4109 </mime-type>
4110 <mime-type type="image/vnd.globalgraphics.pgb"/>
4111
4112 <mime-type type="image/vnd.microsoft.icon">
4113 <acronym>ICO</acronym>
4114 <tika:link>http://en.wikipedia.org/wiki/.ico</tika:link>
4115 <tika:uti>com.microsoft.ico</tika:uti>
4116 <alias type="image/x-icon" />
4117 <magic priority="50">
4118 <match value="\102\101\050\000\000\000\056\000\000\000\000\000\000\000"
4119 type="string" offset="0"/>
4120 <match value="\000\000\001\000" type="string" offset="0"/>
4121 </magic>
4122 <glob pattern="*.ico"/>
4123 </mime-type>
4124
4125 <mime-type type="image/vnd.mix"/>
4126 <mime-type type="image/vnd.ms-modi">
4127 <glob pattern="*.mdi"/>
4128 </mime-type>
4129 <mime-type type="image/vnd.net-fpx">
4130 <glob pattern="*.npx"/>
4131 </mime-type>
4132 <mime-type type="image/vnd.radiance"/>
4133 <mime-type type="image/vnd.sealed.png"/>
4134 <mime-type type="image/vnd.sealedmedia.softseal.gif"/>
4135 <mime-type type="image/vnd.sealedmedia.softseal.jpg"/>
4136 <mime-type type="image/vnd.svf"/>
4137
4138 <mime-type type="image/vnd.wap.wbmp">
4139 <_comment>Wireless Bitmap File Format</_comment>
4140 <glob pattern="*.wbmp"/>
4141 </mime-type>
4142
4143 <mime-type type="image/vnd.xiff">
4144 <glob pattern="*.xif"/>
4145 </mime-type>
4146 <mime-type type="image/x-cmu-raster">
4147 <glob pattern="*.ras"/>
4148 </mime-type>
4149 <mime-type type="image/x-cmx">
4150 <glob pattern="*.cmx"/>
4151 </mime-type>
4152 <mime-type type="image/x-freehand">
4153 <glob pattern="*.fh"/>
4154 <glob pattern="*.fhc"/>
4155 <glob pattern="*.fh4"/>
4156 <glob pattern="*.fh5"/>
4157 <glob pattern="*.fh7"/>
4158 </mime-type>
4159
4160 <mime-type type="image/x-jp2-codestream">
4161 <_comment>JPEG 2000 Codestream</_comment>
4162 <magic priority="25">
4163 <match value="0xff4fff51" type="string" offset="0"/>
4164 </magic>
4165 <glob pattern="*.j2c"/>
4166 </mime-type>
4167
4168 <mime-type type="image/x-jp2-container">
4169 <_comment>JPEG 2000 Container Format</_comment>
4170 <magic priority="50">
4171 <match value="0x0000000C6A5020200D0A870A" type="string" offset="0"/>
4172 </magic>
4173 </mime-type>
4174
4175 <mime-type type="image/x-niff">
4176 <_comment>Navy Interchange File Format</_comment>
4177 <magic priority="50">
4178 <match value="IIN1" type="string" offset="0"/>
4179 </magic>
4180 </mime-type>
4181
4182 <mime-type type="image/x-pcx">
4183 <glob pattern="*.pcx"/>
4184 </mime-type>
4185 <mime-type type="image/x-pict">
4186 <_comment>Apple Macintosh QuickDraw/PICT Format</_comment>
4187 <magic priority="50">
4188 <match value="0x001102FF0C00" type="string" offset="522"/>
4189 </magic>
4190 <glob pattern="*.pic"/>
4191 <glob pattern="*.pct"/>
4192 <glob pattern="*.pict"/>
4193 </mime-type>
4194
4195 <mime-type type="image/x-portable-anymap">
4196 <acronym>PNM</acronym>
4197 <_comment>Portable Any Map</_comment>
4198 <glob pattern="*.pnm" />
4199 </mime-type>
4200
4201 <mime-type type="image/x-portable-bitmap">
4202 <sub-class-of type="image/x-portable-anymap"/>
4203 <acronym>PBM</acronym>
4204 <_comment>Portable Bit Map</_comment>
4205 <magic priority="50">
4206 <match value="P1" type="string" offset="0"/>
4207 <match value="P4" type="string" offset="0"/>
4208 </magic>
4209 <glob pattern="*.pbm"/>
4210 </mime-type>
4211
4212 <mime-type type="image/x-portable-graymap">
4213 <sub-class-of type="image/x-portable-anymap"/>
4214 <acronym>PGM</acronym>
4215 <_comment>Portable Graymap Graphic</_comment>
4216 <magic priority="50">
4217 <match value="P2" type="string" offset="0"/>
4218 <match value="P5" type="string" offset="0"/>
4219 <match offset="0" type="string" value="\x50\x35\x0a"/>
4220 </magic>
4221 <glob pattern="*.pgm"/>
4222 </mime-type>
4223
4224 <mime-type type="image/x-portable-pixmap">
4225 <sub-class-of type="image/x-portable-anymap"/>
4226 <acronym>PPM</acronym>
4227 <_comment>Portable Pixel Map</_comment>
4228 <magic priority="50">
4229 <match value="P3" type="string" offset="0"/> <!-- Netpbm plain (ASCII) PPM signature -->
4230 <match value="P6" type="string" offset="0"/> <!-- Netpbm raw (binary) PPM signature -->
4231 <match value="P7" type="string" offset="0"/> <!-- NOTE(review): Netpbm documents P7 as PAM, not PPM; kept as in the original, confirm -->
4232 <match offset="0" type="string" value="\x50\x36\x0a"/> <!-- "P6" + LF; was "P4" + LF, which is the raw PBM signature -->
4233 </magic>
4234 <glob pattern="*.ppm"/>
4235 </mime-type>
4236
4237 <mime-type type="image/x-raw-adobe">
4238 <acronym>DNG</acronym>
4239 <_comment>Adobe Digital Negative</_comment>
4240 <glob pattern="*.dng"/>
4241 </mime-type>
4242
4243 <mime-type type="image/x-raw-hasselblad">
4244 <_comment>Hasselblad raw image</_comment>
4245 <glob pattern="*.3fr"/>
4246 </mime-type>
4247
4248 <mime-type type="image/x-raw-fuji">
4249 <_comment>Fuji raw image</_comment>
4250 <glob pattern="*.raf"/>
4251 </mime-type>
4252
4253 <mime-type type="image/x-raw-canon">
4254 <_comment>Canon raw image</_comment>
4255 <glob pattern="*.crw"/>
4256 <glob pattern="*.cr2"/>
4257 </mime-type>
4258
4259 <mime-type type="image/x-raw-kodak">
4260 <_comment>Kodak raw image</_comment>
4261 <glob pattern="*.k25"/>
4262 <glob pattern="*.kdc"/>
4263 <glob pattern="*.dcs"/>
4264 <glob pattern="*.drf"/>
4265 </mime-type>
4266
4267 <mime-type type="image/x-raw-minolta">
4268 <_comment>Minolta raw image</_comment>
4269 <glob pattern="*.mrw"/>
4270 </mime-type>
4271
4272 <mime-type type="image/x-raw-nikon">
4273 <_comment>Nikon raw image</_comment>
4274 <glob pattern="*.nef"/>
4275 <glob pattern="*.nrw"/>
4276 </mime-type>
4277
4278 <mime-type type="image/x-raw-olympus">
4279 <_comment>Olympus raw image</_comment>
4280 <glob pattern="*.orf"/>
4281 </mime-type>
4282
4283 <mime-type type="image/x-raw-pentax">
4284 <_comment>Pentax raw image</_comment>
4285 <glob pattern="*.ptx"/>
4286 <glob pattern="*.pef"/>
4287 </mime-type>
4288
4289 <mime-type type="image/x-raw-sony">
4290 <_comment>Sony raw image</_comment>
4291 <glob pattern="*.arw"/>
4292 <glob pattern="*.srf"/>
4293 <glob pattern="*.sr2"/>
4294 </mime-type>
4295
4296 <mime-type type="image/x-raw-sigma">
4297 <_comment>Sigma raw image</_comment>
4298 <glob pattern="*.x3f"/>
4299 </mime-type>
4300
4301 <mime-type type="image/x-raw-epson">
4302 <_comment>Epson raw image</_comment>
4303 <glob pattern="*.erf"/>
4304 </mime-type>
4305
4306 <mime-type type="image/x-raw-mamiya">
4307 <_comment>Mamiya raw image</_comment>
4308 <glob pattern="*.mef"/>
4309 </mime-type>
4310
4311 <mime-type type="image/x-raw-leaf">
4312 <_comment>Leaf raw image</_comment>
4313 <glob pattern="*.mos"/>
4314 </mime-type>
4315
4316 <mime-type type="image/x-raw-panasonic">
4317 <_comment>Panasonic raw image</_comment>
4318 <glob pattern="*.raw"/>
4319 <glob pattern="*.rw2"/>
4320 </mime-type>
4321
4322 <mime-type type="image/x-raw-phaseone">
4323 <_comment>Phase One raw image</_comment>
4324 <glob pattern="*.iiq"/>
4325 </mime-type>
4326
4327 <mime-type type="image/x-raw-red">
4328 <_comment>Red raw image</_comment>
4329 <glob pattern="*.r3d"/>
4330 </mime-type>
4331
4332 <mime-type type="image/x-raw-imacon">
4333 <_comment>Imacon raw image</_comment>
4334 <glob pattern="*.fff"/>
4335 </mime-type>
4336
4337 <mime-type type="image/x-raw-logitech">
4338 <_comment>Logitech raw image</_comment>
4339 <glob pattern="*.pxn"/>
4340 </mime-type>
4341
4342 <mime-type type="image/x-raw-casio">
4343 <_comment>Casio raw image</_comment>
4344 <glob pattern="*.bay"/>
4345 </mime-type>
4346
4347 <mime-type type="image/x-raw-rawzor">
4348 <_comment>Rawzor raw image</_comment>
4349 <glob pattern="*.rwz"/>
4350 </mime-type>
4351
4352 <mime-type type="image/x-rgb">
4353 <_comment>Silicon Graphics RGB Bitmap</_comment>
4354 <magic priority="50">
4355 <match offset="0" type="string" value="\x01\xda\x01\x01\x00\x03"/>
4356 </magic>
4357 <glob pattern="*.rgb"/>
4358 </mime-type>
4359
4360 <mime-type type="image/x-xbitmap">
4361 <magic priority="50">
4362 <match value="/* XPM" type="string" offset="0"/> <!-- NOTE(review): "/* XPM" looks like the X PixMap (*.xpm) header, while this entry globs *.xbm; confirm whether this magic belongs on image/x-xpixmap -->
4363 </magic>
4364 <glob pattern="*.xbm"/>
4365 <sub-class-of type="text/x-c"/>
4366 </mime-type>
4367
4368 <mime-type type="image/x-xcf">
4369 <_comment>GIMP Image File</_comment>
4370 <alias type="image/xcf"/>
4371 <magic priority="50">
4372 <match type="string" value="gimp xcf " offset="0"/>
4373 </magic>
4374 <glob pattern="*.xcf"/>
4375 </mime-type>
4376
4377 <mime-type type="image/x-xpixmap">
4378 <glob pattern="*.xpm"/>
4379 </mime-type>
4380
4381 <mime-type type="image/x-xwindowdump">
4382 <_comment>X Windows Dump</_comment>
4383 <glob pattern="*.xwd"/>
4384 </mime-type>
4385
4386 <mime-type type="message/cpim"/>
4387 <mime-type type="message/delivery-status"/>
4388 <mime-type type="message/disposition-notification"/>
4389 <mime-type type="message/example"/>
4390 <mime-type type="message/external-body"/>
4391 <mime-type type="message/global"/>
4392 <mime-type type="message/global-delivery-status"/>
4393 <mime-type type="message/global-disposition-notification"/>
4394 <mime-type type="message/global-headers"/>
4395 <mime-type type="message/http"/>
4396 <mime-type type="message/imdn+xml"/>
4397
4398 <mime-type type="message/news">
4399 <magic priority="50">
4400 <match value="Path:" type="string" offset="0" />
4401 <match value="Xref:" type="string" offset="0" />
4402 <match value="Article" type="string" offset="0" />
4403 </magic>
4404 </mime-type>
4405
4406 <mime-type type="message/partial"/>
4407
4408 <mime-type type="message/rfc822">
4409 <!-- Detects mail/news messages by a header name (or legacy rnews/forward marker) at the very start of the stream. -->
4410 <magic priority="50">
4411 <match value="Relay-Version:" type="stringignorecase" offset="0"/>
4412 <match value="#!\ rnews" type="string" offset="0"/> <!-- NOTE(review): "\ " is file(1)-style space escaping; confirm the magic decoder reads it as a plain space (applies to the next three entries too) -->
4413 <match value="N#!\ rnews" type="string" offset="0"/>
4414 <match value="Forward\ to" type="string" offset="0"/>
4415 <match value="Pipe\ to" type="string" offset="0"/>
4416 <match value="Return-Path:" type="stringignorecase" offset="0"/>
4417 <match value="From:" type="stringignorecase" offset="0"/>
4418 <match value="Received:" type="stringignorecase" offset="0"/>
4419 <match value="Message-ID:" type="stringignorecase" offset="0"/>
4420 <match value="Date:" type="string" offset="0"/>
4421 <match value="MIME-Version:" type="stringignorecase" offset="0"/>
4422 <match value="X-Notes-Item:" type="string" offset="0"> <!-- nested match: X-Notes-Item: at offset 0 plus Message-ID: within offset range 0:8192 -->
4423 <match value="Message-ID:" type="string" offset="0:8192"/>
4424 </match>
4425 </magic>
4426 <glob pattern="*.eml"/>
4427 <glob pattern="*.mime"/>
4428 <glob pattern="*.mht"/>
4429 <glob pattern="*.mhtml"/>
4430 </mime-type>
4430
4431 <mime-type type="message/s-http"/>
4432 <mime-type type="message/sip"/>
4433 <mime-type type="message/sipfrag"/>
4434 <mime-type type="message/tracking-status"/>
4435 <mime-type type="message/vnd.si.simp"/>
4436
4437 <mime-type type="model/example"/>
4438
4439 <mime-type type="model/iges">
4440 <_comment>Initial Graphics Exchange Specification Format</_comment>
4441 <glob pattern="*.igs"/>
4442 <glob pattern="*.iges"/>
4443 </mime-type>
4444
4445 <mime-type type="model/mesh">
4446 <glob pattern="*.msh"/>
4447 <glob pattern="*.mesh"/>
4448 <glob pattern="*.silo"/>
4449 </mime-type>
4450
4451 <mime-type type="model/vnd.dwf">
4452 <glob pattern="*.dwf"/>
4453 </mime-type>
4454 <mime-type type="model/vnd.flatland.3dml"/>
4455 <mime-type type="model/vnd.gdl">
4456 <glob pattern="*.gdl"/>
4457 </mime-type>
4458 <mime-type type="model/vnd.gs-gdl"/>
4459 <mime-type type="model/vnd.gs.gdl"/>
4460 <mime-type type="model/vnd.gtw">
4461 <glob pattern="*.gtw"/>
4462 </mime-type>
4463 <mime-type type="model/vnd.moml+xml"/>
4464 <mime-type type="model/vnd.mts">
4465 <glob pattern="*.mts"/>
4466 </mime-type>
4467 <mime-type type="model/vnd.parasolid.transmit.binary"/>
4468 <mime-type type="model/vnd.parasolid.transmit.text"/>
4469 <mime-type type="model/vnd.vtu">
4470 <glob pattern="*.vtu"/>
4471 </mime-type>
4472
4473 <mime-type type="model/vrml">
4474 <glob pattern="*.wrl"/>
4475 <glob pattern="*.vrml"/>
4476 </mime-type>
4477
4478 <mime-type type="multipart/alternative"/>
4479 <mime-type type="multipart/appledouble"/>
4480 <mime-type type="multipart/byteranges"/>
4481 <mime-type type="multipart/digest"/>
4482 <mime-type type="multipart/encrypted"/>
4483 <mime-type type="multipart/example"/>
4484 <mime-type type="multipart/form-data"/>
4485 <mime-type type="multipart/header-set"/>
4486 <mime-type type="multipart/mixed"/>
4487 <mime-type type="multipart/parallel"/>
4488 <mime-type type="multipart/related"/>
4489 <mime-type type="multipart/report"/>
4490 <mime-type type="multipart/signed"/>
4491 <mime-type type="multipart/voice-message"/>
4492
4493 <mime-type type="text/x-actionscript">
4494 <_comment>ActionScript source code</_comment>
4495 <glob pattern="*.as"/>
4496 <sub-class-of type="text/plain"/>
4497 </mime-type>
4498
4499 <mime-type type="text/x-ada">
4500 <_comment>Ada source code</_comment>
4501 <glob pattern="*.ada"/>
4502 <glob pattern="*.adb"/>
4503 <glob pattern="*.ads"/>
4504 <sub-class-of type="text/plain"/>
4505 </mime-type>
4506
4507 <mime-type type="text/x-applescript">
4508 <_comment>AppleScript source code</_comment>
4509 <glob pattern="*.applescript"/>
4510 <sub-class-of type="text/plain"/>
4511 </mime-type>
4512
4513 <mime-type type="text/asp">
4514 <_comment>Active Server Page</_comment>
4515 <glob pattern="*.asp"/>
4516 <sub-class-of type="text/plain"/>
4517 </mime-type>
4518
4519 <mime-type type="text/aspdotnet">
4520 <_comment>ASP .NET</_comment>
4521 <glob pattern="*.aspx"/>
4522 <sub-class-of type="text/plain"/>
4523 </mime-type>
4524
4525 <mime-type type="text/x-aspectj">
4526 <_comment>AspectJ source code</_comment>
4527 <glob pattern="*.aj"/>
4528 <sub-class-of type="text/plain"/>
4529 </mime-type>
4530
4531 <mime-type type="text/x-assembly">
4532 <alias type="text/x-asm"/>
4533 <_comment>Assembler source code</_comment>
4534 <glob pattern="*.s"/>
4535 <glob pattern="*.S"/>
4536 <glob pattern="*.asm"/>
4537 <sub-class-of type="text/plain"/>
4538 </mime-type>
4539
4540 <mime-type type="text/calendar">
4541 <glob pattern="*.ics"/>
4542 <glob pattern="*.ifb"/>
4543 </mime-type>
4544
4545 <mime-type type="text/css">
4546 <_comment>Cascading Style Sheet</_comment>
4547 <glob pattern="*.css"/>
4548 <sub-class-of type="text/plain"/>
4549 </mime-type>
4550
4551 <mime-type type="text/csv">
4552 <glob pattern="*.csv"/>
4553 </mime-type>
4554
4555 <mime-type type="text/directory"/>
4556 <mime-type type="text/dns"/>
4557 <mime-type type="text/ecmascript"/>
4558 <mime-type type="text/enriched"/>
4559 <mime-type type="text/example"/>
4560
4561 <mime-type type="text/html">
4562 <_comment>HyperText Markup Language</_comment>
4563 <acronym>HTML</acronym>
4564 <tika:uti>public.html</tika:uti>
4565 <!-- TIKA-327: if you encounter tags in the HTML
4566 with no declared namespace, it's not XHTML, it's just
4567 bad HTML, unfortunately.
4568 -->
4569 <root-XML localName="html"/>
4570 <root-XML localName="HTML"/>
4571 <root-XML localName="link"/>
4572 <root-XML localName="LINK"/>
4573 <root-XML localName="body"/>
4574 <root-XML localName="BODY"/>
4575 <root-XML localName="p"/>
4576 <root-XML localName="P"/>
4577 <root-XML localName="script"/>
4578 <root-XML localName="SCRIPT"/>
4579 <root-XML localName="frameset"/>
4580 <root-XML localName="FRAMESET"/>
4581 <!-- The magic priority needs to be lower than that of -->
4582 <!-- files which contain HTML within them, eg mime emails -->
4583 <magic priority="40">
4584 <match value="&lt;!DOCTYPE HTML" type="string" offset="0:64"/>
4585 <match value="&lt;!doctype html" type="string" offset="0:64"/>
4586 <match value="&lt;HEAD" type="string" offset="0:64"/>
4587 <match value="&lt;head" type="string" offset="0:64"/>
4588 <match value="&lt;TITLE" type="string" offset="0:64"/>
4589 <match value="&lt;title" type="string" offset="0:64"/>
4590 <!-- note on the offset value here: this can only be as big as
4591 MimeTypes#getMinLength(). If you set the offset value to larger
4592 than that size, the magic will only be compared to up to
4593 MimeTypes#getMinLength() bytes.
4594 -->
4595 <match value="&lt;html" type="string" offset="0:8192"/>
4596 <match value="&lt;HTML" type="string" offset="0:64"/>
4597 <match value="&lt;BODY" type="string" offset="0"/>
4598 <match value="&lt;body" type="string" offset="0"/>
4599 <match value="&lt;DIV" type="string" offset="0"/>
4600 <match value="&lt;div" type="string" offset="0"/>
4601 <match value="&lt;TITLE" type="string" offset="0"/>
4602 <match value="&lt;title" type="string" offset="0"/>
4603 <match value="&lt;h1" type="string" offset="0"/>
4604 <match value="&lt;H1" type="string" offset="0"/>
4605 <match value="&lt;!doctype HTML" type="string" offset="0"/>
4606 <match value="&lt;!DOCTYPE html" type="string" offset="0"/>
4607 </magic>
4608 <glob pattern="*.html"/>
4609 <glob pattern="*.htm"/>
4610 </mime-type>
4611
4612 <mime-type type="text/parityfec"/>
4613
4614 <mime-type type="text/plain">
4615 <magic priority="20">
4616 <match value="This is TeX," type="string" offset="0"/>
4617 <match value="This is METAFONT," type="string" offset="0"/>
4618 <match value="/*" type="string" offset="0"/>
4619 <match value="//" type="string" offset="0"/>
4620 <match value=";;" type="string" offset="0"/>
4621 <!-- UTF-16BE BOM -->
4622 <match value="0xfeff" type="string" offset="0"/>
4623 <!-- UTF-16LE BOM -->
4624 <match value="0xfffe" type="string" offset="0"/>
4625 <!-- UTF-8 BOM -->
4626 <match value="0xefbbbf" type="string" offset="0"/>
4627 </magic>
4628
4629 <glob pattern="*.txt"/>
4630 <glob pattern="*.text"/>
4631 <glob pattern="*.conf"/>
4632 <glob pattern="*.cfg"/>
4633 <glob pattern="*.def"/>
4634 <glob pattern="*.list"/>
4635 <glob pattern="*.in"/>
4636
4637 <!-- TIKA-85: http://www.apache.org/dev/svn-eol-style.txt -->
4638 <glob pattern="INSTALL"/>
4639 <glob pattern="KEYS"/>
4640 <glob pattern="Makefile"/>
4641 <glob pattern="README"/>
4642 <glob pattern="abs-linkmap"/>
4643 <glob pattern="abs-menulinks"/>
4644 <glob pattern="*.aart"/>
4645 <glob pattern="*.ac"/>
4646 <glob pattern="*.am"/>
4647 <glob pattern="*.classpath"/>
4648 <glob pattern="*.cmd"/>
4649 <glob pattern="*.config"/>
4650 <glob pattern="*.cwiki"/>
4651 <glob pattern="*.data"/>
4652 <glob pattern="*.dcl"/>
4653 <glob pattern="*.egrm"/>
4654 <glob pattern="*.ent"/>
4655 <glob pattern="*.ft"/>
4656 <glob pattern="*.fn"/>
4657 <glob pattern="*.fv"/>
4658 <glob pattern="*.grm"/>
4659 <glob pattern="*.g"/>
4660 <glob pattern=".htaccess"/>
4661 <glob pattern="*.ihtml"/>
4662 <glob pattern="*.jmx"/>
4663 <glob pattern="*.junit"/>
4664 <glob pattern="*.jx"/>
4665 <glob pattern="*.manifest"/>
4666 <glob pattern="*.m4"/>
4667 <glob pattern="*.mf"/>
4668 <glob pattern="*.MF"/>
4669 <glob pattern="*.meta"/>
4670 <glob pattern="*.n3"/>
4671 <glob pattern="*.pen"/>
4672 <glob pattern="*.pod"/>
4673 <glob pattern="*.pom"/>
4674 <glob pattern="*.project"/>
4675 <glob pattern="*.properties"/>
4676 <glob pattern="*.rng"/>
4677 <glob pattern="*.rnx"/>
4678 <glob pattern="*.roles"/>
4679 <glob pattern="*.tld"/>
4680 <glob pattern="*.types"/>
4681 <glob pattern="*.vm"/>
4682 <glob pattern="*.vsl"/>
4683 <glob pattern="*.wsdd"/>
4684 <glob pattern="*.xargs"/>
4685 <glob pattern="*.xcat"/>
4686 <glob pattern="*.xconf"/>
4687 <glob pattern="*.xegrm"/>
4688 <glob pattern="*.xgrm"/>
4689 <glob pattern="*.xlex"/>
4690 <glob pattern="*.xlog"/>
4691 <glob pattern="*.xmap"/>
4692 <glob pattern="*.xroles"/>
4693 <glob pattern="*.xsamples"/>
4694 <glob pattern="*.xsp"/>
4695 <glob pattern="*.xweb"/>
4696 <glob pattern="*.xwelcome"/>
4697 </mime-type>
4698
4699 <mime-type type="text/prs.fallenstein.rst"/>
4700 <mime-type type="text/prs.lines.tag">
4701 <glob pattern="*.dsc"/>
4702 </mime-type>
4703 <mime-type type="text/red"/>
4704 <mime-type type="text/rfc822-headers"/>
4705 <mime-type type="text/richtext">
4706 <glob pattern="*.rtx"/>
4707 </mime-type>
4708
4709 <mime-type type="text/rtp-enc-aescm128"/>
4710 <mime-type type="text/rtx"/>
4711 <mime-type type="text/sgml">
4712 <glob pattern="*.sgml"/>
4713 <glob pattern="*.sgm"/>
4714 </mime-type>
4715 <mime-type type="text/t140"/>
4716 <mime-type type="text/tab-separated-values">
4717 <glob pattern="*.tsv"/>
4718 </mime-type>
4719
4720 <mime-type type="text/troff">
4721 <_comment>Roff/nroff/troff/groff Unformatted Manual Page (UNIX)</_comment>
4722 <alias type="application/x-troff"/>
4723 <alias type="application/x-troff-man"/>
4724 <alias type="application/x-troff-me"/>
4725 <alias type="application/x-troff-ms"/>
4726 <magic priority="50">
4727 <match value=".\\&quot;" type="string" offset="0"/>
4728 <match value="'\\&quot;" type="string" offset="0"/>
4729 <match value="'.\\&quot;" type="string" offset="0"/>
4730 <match value="\\&quot;" type="string" offset="0"/>
4731 <match value="'''" type="string" offset="0"/>
4732 </magic>
4733 <glob pattern="*.t"/>
4734 <glob pattern="*.tr"/>
4735 <glob pattern="*.roff"/>
4736 <glob pattern="*.man"/>
4737 <glob pattern="*.me"/>
4738 <glob pattern="*.ms"/>
4739 </mime-type>
4740
4741 <mime-type type="text/ulpfec"/>
4742 <mime-type type="text/uri-list">
4743 <glob pattern="*.uri"/>
4744 <glob pattern="*.uris"/>
4745 <glob pattern="*.urls"/>
4746 </mime-type>
4747 <mime-type type="text/vnd.abc"/>
4748 <mime-type type="text/vnd.curl">
4749 <glob pattern="*.curl"/>
4750 </mime-type>
4751 <mime-type type="text/vnd.curl.dcurl">
4752 <glob pattern="*.dcurl"/>
4753 </mime-type>
4754 <mime-type type="text/vnd.curl.scurl">
4755 <glob pattern="*.scurl"/>
4756 </mime-type>
4757 <mime-type type="text/vnd.curl.mcurl">
4758 <glob pattern="*.mcurl"/>
4759 </mime-type>
4760 <mime-type type="text/vnd.dmclientscript"/>
4761 <mime-type type="text/vnd.esmertec.theme-descriptor"/>
4762 <mime-type type="text/vnd.fly">
4763 <glob pattern="*.fly"/>
4764 </mime-type>
4765 <mime-type type="text/vnd.fmi.flexstor">
4766 <glob pattern="*.flx"/>
4767 </mime-type>
4768 <mime-type type="text/vnd.graphviz">
4769 <glob pattern="*.gv"/>
4770 </mime-type>
4771 <mime-type type="text/vnd.in3d.3dml">
4772 <glob pattern="*.3dml"/>
4773 </mime-type>
4774 <mime-type type="text/vnd.in3d.spot">
4775 <glob pattern="*.spot"/>
4776 </mime-type>
4777 <mime-type type="text/vnd.iptc.anpa">
4778 <acronym>ANPA</acronym>
4779 <_comment>American Newspaper Publishers Association Wire Feeds</_comment>
4780 <glob pattern="*.anpa"/>
4781 <magic priority="50">
4782 <match value="\x16\x16\x01" type="string" offset="0"/>
4783 </magic>
4784 </mime-type>
4785 <mime-type type="text/vnd.iptc.newsml"/>
4786 <mime-type type="text/vnd.iptc.nitf"/>
4787 <mime-type type="text/vnd.latex-z"/>
4788 <mime-type type="text/vnd.motorola.reflex"/>
4789 <mime-type type="text/vnd.ms-mediapackage"/>
4790 <mime-type type="text/vnd.net2phone.commcenter.command"/>
4791 <mime-type type="text/vnd.si.uricatalogue"/>
4792 <mime-type type="text/vnd.sun.j2me.app-descriptor">
4793 <glob pattern="*.jad"/>
4794 </mime-type>
4795 <mime-type type="text/vnd.trolltech.linguist"/>
4796 <mime-type type="text/vnd.wap.si"/>
4797 <mime-type type="text/vnd.wap.sl"/>
4798 <mime-type type="text/vnd.wap.wml">
4799 <glob pattern="*.wml"/>
4800 </mime-type>
4801
4802 <mime-type type="text/vnd.wap.wmlscript">
4803 <_comment>WML Script</_comment>
4804 <glob pattern="*.wmls"/>
4805 </mime-type>
4806
4807 <mime-type type="text/x-awk">
4808 <_comment>AWK script</_comment>
4809 <magic priority="50">
4810 <match value="#!/bin/gawk" type="string" offset="0"/>
4811 <match value="#! /bin/gawk" type="string" offset="0"/>
4812 <match value="#!/usr/bin/gawk" type="string" offset="0"/>
4813 <match value="#! /usr/bin/gawk" type="string" offset="0"/>
4814 <match value="#!/usr/local/bin/gawk" type="string" offset="0"/>
4815 <match value="#! /usr/local/bin/gawk" type="string" offset="0"/>
4816 <match value="#!/bin/awk" type="string" offset="0"/>
4817 <match value="#! /bin/awk" type="string" offset="0"/>
4818 <match value="#!/usr/bin/awk" type="string" offset="0"/>
4819 <match value="#! /usr/bin/awk" type="string" offset="0"/>
4820 </magic>
4821 <glob pattern="*.awk"/>
4822 <sub-class-of type="text/plain"/>
4823 </mime-type>
4824
4825 <mime-type type="text/x-basic">
4826 <_comment>Basic source code</_comment>
4827 <glob pattern="*.bas"/>
4828 <glob pattern="*.Bas"/>
4829 <glob pattern="*.BAS"/>
4830 <sub-class-of type="text/plain"/>
4831 </mime-type>
4832
4833 <mime-type type="text/x-c++hdr">
4834 <_comment>C++ source code header</_comment>
4835 <glob pattern="*.hpp"/>
4836 <glob pattern="*.hxx"/>
4837 <glob pattern="*.hh"/>
4838 <glob pattern="*.H"/>
4839 <glob pattern="*.h++"/>
4840 <glob pattern="*.hp"/>
4841 <glob pattern="*.HPP"/>
4842 <sub-class-of type="text/plain"/>
4843 </mime-type>
4844
4845 <mime-type type="text/x-c++src">
4846 <_comment>C++ source code</_comment>
4847 <glob pattern="*.cpp"/>
4848 <glob pattern="*.cxx"/>
4849 <glob pattern="*.cc"/>
4850 <glob pattern="*.C"/>
4851 <glob pattern="*.c++"/>
4852 <glob pattern="*.CPP"/>
4853 <sub-class-of type="text/plain"/>
4854 </mime-type>
4855
4856 <mime-type type="text/x-cgi">
4857 <_comment>CGI script</_comment>
4858 <glob pattern="*.cgi"/>
4859 <sub-class-of type="text/plain"/>
4860 </mime-type>
4861
4862 <mime-type type="text/x-chdr">
4863 <_comment>C source code header</_comment>
4864 <glob pattern="*.h"/>
4865 <sub-class-of type="text/plain"/>
4866 </mime-type>
4867
4868 <mime-type type="text/x-clojure">
4869 <_comment>Clojure source code</_comment>
4870 <glob pattern="*.clj"/>
4871 <sub-class-of type="text/plain"/>
4872 </mime-type>
4873
4874 <mime-type type="text/x-coffeescript">
4875 <_comment>CoffeeScript source code</_comment>
4876 <glob pattern="*.coffee"/>
4877 <sub-class-of type="text/plain"/>
4878 </mime-type>
4879
4880 <mime-type type="text/x-csrc">
4881 <alias type="text/x-c"/>
4882 <_comment>C source code</_comment>
4883 <glob pattern="*.c"/>
4884 <sub-class-of type="text/plain"/>
4885 </mime-type>
4886
4887 <mime-type type="text/x-csharp">
4888 <_comment>C# source code</_comment>
4889 <glob pattern="*.cs"/>
4890 <sub-class-of type="text/plain"/>
4891 </mime-type>
4892
4893 <mime-type type="text/x-cobol">
4894 <_comment>COBOL source code</_comment>
4895 <glob pattern="*.cbl"/>
4896 <glob pattern="*.Cbl"/>
4897 <glob pattern="*.CBL"/>
4898 <glob pattern="*.cob"/>
4899 <glob pattern="*.Cob"/>
4900 <glob pattern="*.COB"/>
4901 <sub-class-of type="text/plain"/>
4902 </mime-type>
4903
4904 <mime-type type="text/x-coldfusion">
4905 <_comment>ColdFusion source code</_comment>
4906 <glob pattern="*.cfm"/>
4907 <glob pattern="*.cfml"/>
4908 <glob pattern="*.cfc"/>
4909 <sub-class-of type="text/plain"/>
4910 </mime-type>
4911
4912 <mime-type type="text/x-common-lisp">
4913 <_comment>Common Lisp source code</_comment>
4914 <glob pattern="*.cl"/>
4915 <glob pattern="*.jl"/>
4916 <glob pattern="*.lisp"/>
4917 <glob pattern="*.lsp"/>
4918 <sub-class-of type="text/plain"/>
4919 </mime-type>
4920
4921 <mime-type type="text/x-diff">
4922 <!-- Diff/patch output, recognised by the leading keyword of common diff headers. -->
4923 <magic priority="50">
4924 <match value="diff\ " type="string" offset="0"/> <!-- NOTE(review): "\ " is file(1)-style space escaping; confirm the magic decoder reads it as a plain space (applies to the next three entries too) -->
4925 <match value="***\ " type="string" offset="0"/>
4926 <match value="Only\ in\ " type="string" offset="0"/>
4927 <match value="Common\ subdirectories:\ " type="string" offset="0"/>
4928 <match value="Index:" type="string" offset="0"/>
4929 </magic>
4930 <glob pattern="*.diff"/>
4931 <glob pattern="*.patch"/>
4932 <sub-class-of type="text/plain"/>
4933 </mime-type>
4933
4934 <mime-type type="text/x-eiffel">
4935 <_comment>Eiffel source code</_comment>
4936 <glob pattern="*.e"/>
4937 <sub-class-of type="text/plain"/>
4938 </mime-type>
4939
4940 <mime-type type="text/x-emacs-lisp">
4941 <_comment>Emacs Lisp source code</_comment>
4942 <glob pattern="*.el"/>
4943 <sub-class-of type="text/plain"/>
4944 </mime-type>
4945
4946 <mime-type type="text/x-erlang">
4947 <_comment>Erlang source code</_comment>
4948 <glob pattern="*.erl"/>
4949 <sub-class-of type="text/plain"/>
4950 </mime-type>
4951
4952 <mime-type type="text/x-expect">
4953 <_comment>Expect Script</_comment>
4954 <glob pattern="*.exp"/>
4955 <sub-class-of type="text/plain"/>
4956 </mime-type>
4957
4958 <mime-type type="text/x-forth">
4959 <_comment>Forth source code</_comment>
4960 <glob pattern="*.4th"/>
4961 <sub-class-of type="text/plain"/>
4962 </mime-type>
4963
4964 <mime-type type="text/x-fortran">
4965 <_comment>Fortran source code</_comment>
4966 <glob pattern="*.f"/>
4967 <glob pattern="*.F"/>
4968 <glob pattern="*.for"/>
4969 <glob pattern="*.f77"/>
4970 <glob pattern="*.f90"/>
4971 <sub-class-of type="text/plain"/>
4972 </mime-type>
4973
4974 <mime-type type="text/x-go">
4975 <_comment>Go source code</_comment>
4976 <glob pattern="*.go"/>
4977 <sub-class-of type="text/plain"/>
4978 </mime-type>
4979
4980 <mime-type type="text/x-groovy">
4981 <_comment>Groovy source code</_comment>
4982 <glob pattern="*.groovy"/>
4983 <sub-class-of type="text/plain"/>
4984 </mime-type>
4985
4986 <mime-type type="text/x-haskell">
4987 <_comment>Haskell source code</_comment>
4988 <glob pattern="*.hs"/>
4989 <glob pattern="*.lhs"/>
4990 <sub-class-of type="text/plain"/>
4991 </mime-type>
4992
4993 <mime-type type="text/x-idl">
4994 <_comment>Interface Definition Language</_comment>
4995 <glob pattern="*.idl"/>
4996 <sub-class-of type="text/plain"/>
4997 </mime-type>
4998
4999 <mime-type type="text/x-ini">
5000 <_comment>Configuration file</_comment>
5001 <glob pattern="*.ini"/>
5002 <sub-class-of type="text/plain"/>
5003 </mime-type>
5004
5005 <mime-type type="text/x-java-source">
5006 <_comment>Java source code</_comment>
5007 <alias type="text/x-java" />
5008 <glob pattern="*.java"/>
5009 <sub-class-of type="text/plain"/>
5010 </mime-type>
5011
5012 <mime-type type="text/x-jsp">
5013 <_comment>Java Server Page</_comment>
5014 <alias type="application/x-httpd-jsp"/>
5015 <sub-class-of type="text/plain"/>
5016 <magic priority="50">
5017 <match value="&lt;%@" type="string" offset="0"/>
5018 <match value="&lt;%--" type="string" offset="0"/>
5019 </magic>
5020 <glob pattern="*.jsp"/>
5021 </mime-type>
5022
5023 <mime-type type="text/x-less">
5024 <_comment>LESS source code</_comment>
5025 <glob pattern="*.less"/>
5026 <sub-class-of type="text/plain"/>
5027 </mime-type>
5028
5029 <mime-type type="text/x-lex">
5030 <_comment>Lex/Flex source code</_comment>
5031 <glob pattern="*.l"/>
5032 <sub-class-of type="text/plain"/>
5033 </mime-type>
5034
5035 <mime-type type="text/x-log">
5036 <_comment>application log</_comment>
5037 <glob pattern="*.log"/>
5038 <sub-class-of type="text/plain"/>
5039 </mime-type>
5040
5041 <mime-type type="text/x-lua">
5042 <_comment>Lua source code</_comment>
5043 <glob pattern="*.lua"/>
5044 <sub-class-of type="text/plain"/>
5045 </mime-type>
5046
5047 <mime-type type="text/x-ml">
5048 <_comment>ML source code</_comment>
5049 <glob pattern="*.ml"/>
5050 <sub-class-of type="text/plain"/>
5051 </mime-type>
5052
5053 <mime-type type="text/x-matlab">
5054 <_comment>Matlab source code</_comment>
5055 <magic priority="50">
5056 <match value="function [" type="string" offset="0"/>
5057 </magic>
5058 <!-- <glob pattern="*.m"/> - conflicts with text/x-objcsrc -->
5059 <sub-class-of type="text/plain"/>
5060 </mime-type>
5061
5062 <mime-type type="text/x-modula">
5063 <_comment>Modula source code</_comment>
5064 <glob pattern="*.m3"/>
5065 <glob pattern="*.i3"/>
5066 <glob pattern="*.mg"/>
5067 <glob pattern="*.ig"/>
5068 <sub-class-of type="text/plain"/>
5069 </mime-type>
5070
5071 <mime-type type="text/x-objcsrc">
5072 <_comment>Objective-C source code</_comment>
5073 <glob pattern="*.m"/>
5074 <sub-class-of type="text/plain"/>
5075 </mime-type>
5076
5077 <mime-type type="text/x-ocaml">
5078 <_comment>Ocaml source code</_comment>
5079 <glob pattern="*.ocaml"/>
5080 <glob pattern="*.mli"/>
5081 <sub-class-of type="text/plain"/>
5082 </mime-type>
5083
5084 <mime-type type="text/x-pascal">
5085 <_comment>Pascal source code</_comment>
5086 <glob pattern="*.p"/>
5087 <glob pattern="*.pp"/>
5088 <glob pattern="*.pas"/>
5089 <glob pattern="*.PAS"/>
5090 <glob pattern="*.dpr"/>
5091 <sub-class-of type="text/plain"/>
5092 </mime-type>
5093
5094 <mime-type type="text/x-perl">
5095 <_comment>Perl script</_comment>
5096 <magic priority="50"> <!-- shebang lines and the eval/exec wrapper idiom, all anchored at offset 0 -->
5097 <match value="eval &quot;exec /usr/local/bin/perl" type="string" offset="0"/> <!-- stray backslash before the quote removed; now matches the form used by text/x-python -->
5098 <match value="#!/bin/perl" type="string" offset="0"/>
5099 <match value="#!/bin/env perl" type="string" offset="0"/>
5100 <match value="#!/usr/bin/perl" type="string" offset="0"/>
5101 <match value="#!/usr/local/bin/perl" type="string" offset="0"/>
5102 </magic>
5103 <glob pattern="*.pl"/>
5104 <glob pattern="*.pm"/>
5105 <glob pattern="*.al"/>
5106 <glob pattern="*.perl"/>
5107 <sub-class-of type="text/plain"/>
5108 </mime-type>
5109
5110 <mime-type type="text/x-php">
5111 <_comment>PHP script</_comment>
5112 <magic priority="50">
5113 <match value="&lt;?php" type="string" offset="0"/>
5114 </magic>
5115 <glob pattern="*.php"/>
5116 <glob pattern="*.php3"/>
5117 <glob pattern="*.php4"/>
5118 <sub-class-of type="text/plain"/>
5119 </mime-type>
5120
5121 <mime-type type="text/x-prolog">
5122 <_comment>Prolog source code</_comment>
5123 <glob pattern="*.pro"/>
5124 <!-- <glob pattern="*.pl"/> - conflicts with text/x-perl -->
5125 <sub-class-of type="text/plain"/>
5126 </mime-type>
5127
5128 <mime-type type="text/x-python">
5129 <_comment>Python script</_comment>
5130 <magic priority="50"> <!-- shebang lines and the eval/exec wrapper idiom for the usual interpreter locations -->
5131 <match value="#!/bin/python" type="string" offset="0"/>
5132 <match value="#! /bin/python" type="string" offset="0"/>
5133 <match value="eval &quot;exec /bin/python" type="string" offset="0"/>
5134 <match value="#!/usr/bin/python" type="string" offset="0"/>
5135 <match value="#! /usr/bin/python" type="string" offset="0"/>
5136 <match value="eval &quot;exec /usr/bin/python" type="string" offset="0"/>
5137 <match value="#!/usr/local/bin/python" type="string" offset="0"/>
5138 <match value="#! /usr/local/bin/python" type="string" offset="0"/>
5139 <match value="eval &quot;exec /usr/local/bin/python" type="string" offset="0"/>
5140 <match value="/bin/env python" type="string" offset="2:16"/> <!-- env shebangs: the text starts at byte 2 for "#!/bin/env" and byte 6 for "#!/usr/bin/env"; the old fixed offset 1 could match neither -->
5141 </magic>
5142 <glob pattern="*.py"/>
5143 <sub-class-of type="text/plain"/>
5144 </mime-type>
5145
5146 <mime-type type="text/x-rst">
5147 <_comment>reStructuredText source code</_comment>
5148 <glob pattern="*.rest"/>
5149 <glob pattern="*.rst"/>
5150 <glob pattern="*.restx"/>
5151 <sub-class-of type="text/plain"/>
5152 </mime-type>
5153
5154 <mime-type type="text/x-rexx">
5155 <_comment>Rexx source code</_comment>
5156 <glob pattern="*.rexx"/>
5157 <sub-class-of type="text/plain"/>
5158 </mime-type>
5159
5160 <mime-type type="text/x-ruby">
5161 <_comment>Ruby source code</_comment>
5162 <glob pattern="*.rb"/>
5163 <sub-class-of type="text/plain"/>
5164 </mime-type>
5165
5166 <mime-type type="text/x-scala">
5167 <_comment>Scala source code</_comment>
5168 <glob pattern="*.scala"/>
5169 <sub-class-of type="text/plain"/>
5170 </mime-type>
5171
5172 <mime-type type="text/x-scheme">
5173 <_comment>Scheme source code</_comment>
5174 <glob pattern="*.scm"/>
5175 <sub-class-of type="text/plain"/>
5176 </mime-type>
5177
5178 <mime-type type="text/x-sed">
5179 <_comment>Sed code</_comment>
5180 <glob pattern="*.sed"/>
5181 <sub-class-of type="text/plain"/>
5182 </mime-type>
5183
5184 <mime-type type="text/x-sql">
5185 <_comment>SQL code</_comment>
5186 <glob pattern="*.sql"/>
5187 <sub-class-of type="text/plain"/>
5188 </mime-type>
5189
5190 <mime-type type="text/x-setext">
5191 <glob pattern="*.etx"/>
5192 <sub-class-of type="text/plain"/>
5193 </mime-type>
5194
5195 <mime-type type="text/x-stsrc">
5196 <_comment>Smalltalk source code</_comment>
5197 <glob pattern="*.st"/>
5198 <sub-class-of type="text/plain"/>
5199 </mime-type>
5200
5201 <mime-type type="text/x-tcl">
5202 <alias type="application/x-tcl"/>
5203 <_comment>Tcl script</_comment>
5204 <glob pattern="*.itk"/>
5205 <glob pattern="*.tcl"/>
5206 <glob pattern="*.tk"/>
5207 <sub-class-of type="text/plain"/>
5208 </mime-type>
5209
5210 <mime-type type="text/x-uuencode">
5211 <glob pattern="*.uu"/>
5212 </mime-type>
5213
5214 <mime-type type="text/x-vbasic">
5215 <_comment>Visual basic source code</_comment>
5216 <glob pattern="*.cls"/>
5217 <glob pattern="*.Cls"/>
5218 <glob pattern="*.CLS"/>
5219 <glob pattern="*.frm"/>
5220 <glob pattern="*.Frm"/>
5221 <glob pattern="*.FRM"/>
5222 <sub-class-of type="text/x-basic"/>
5223 </mime-type>
5224
5225 <mime-type type="text/x-vbdotnet">
5226 <_comment>VB.NET source code</_comment>
5227 <glob pattern="*.vb"/>
5228 <sub-class-of type="text/x-vbasic"/>
5229 </mime-type>
5230
5231 <mime-type type="text/x-vbscript">
5232 <_comment>VBScript source code</_comment>
5233 <glob pattern="*.vbs"/>
5234 <sub-class-of type="text/x-vbasic"/>
5235 </mime-type>
5236
5237 <mime-type type="text/x-vcalendar">
5238 <glob pattern="*.vcs"/>
5239 <sub-class-of type="text/plain"/>
5240 </mime-type>
5241
5242 <mime-type type="text/x-vcard">
5243 <glob pattern="*.vcf"/>
5244 <sub-class-of type="text/plain"/>
5245 </mime-type>
5246
5247 <mime-type type="text/x-verilog">
5248 <_comment>Verilog source code</_comment>
5249 <glob pattern="*.v"/>
5250 <sub-class-of type="text/plain"/>
5251 </mime-type>
5252
5253 <mime-type type="text/x-vhdl">
5254 <_comment>VHDL source code</_comment>
5255 <glob pattern="*.vhd"/>
5256 <glob pattern="*.vhdl"/>
5257 <sub-class-of type="text/plain"/>
5258 </mime-type>
5259
5260 <mime-type type="text/x-web-markdown">
5261 <_comment>Markdown source code</_comment>
5262 <glob pattern="*.md"/>
5263 <glob pattern="*.mdtext"/>
5264 <glob pattern="*.mkd"/>
5265 <glob pattern="*.markdown"/>
5266 <sub-class-of type="text/plain"/>
5267 </mime-type>
5268
5269 <mime-type type="text/x-yacc">
5270 <_comment>Yacc/Bison source code</_comment>
5271 <glob pattern="*.y"/>
5272 <sub-class-of type="text/plain"/>
5273 </mime-type>
5274
<mime-type type="text/x-yaml">
  <_comment>YAML source code</_comment>
  <glob pattern="*.yaml"/>
  <!-- Short form of the extension, in widespread use alongside *.yaml -->
  <glob pattern="*.yml"/>
  <sub-class-of type="text/plain"/>
</mime-type>
5280
5281 <mime-type type="video/3gpp">
5282 <magic priority="60">
5283 <match value="ftyp3ge6" type="string" offset="4"/>
5284 <match value="ftyp3ge7" type="string" offset="4"/>
5285 <match value="ftyp3gg6" type="string" offset="4"/>
5286 <match value="ftyp3gp1" type="string" offset="4"/>
5287 <match value="ftyp3gp2" type="string" offset="4"/>
5288 <match value="ftyp3gp3" type="string" offset="4"/>
5289 <match value="ftyp3gp4" type="string" offset="4"/>
5290 <match value="ftyp3gp5" type="string" offset="4"/>
5291 <match value="ftyp3gp6" type="string" offset="4"/>
5292 <match value="ftyp3gs7" type="string" offset="4"/>
5293 </magic>
5294 <glob pattern="*.3gp"/>
5295 </mime-type>
5296 <mime-type type="video/3gpp-tt"/>
5297 <mime-type type="video/3gpp2">
5298 <magic priority="60">
5299 <match value="ftyp3g2a" type="string" offset="4"/>
5300 <match value="ftyp3g2b" type="string" offset="4"/>
5301 <match value="ftyp3g2c" type="string" offset="4"/>
5302 </magic>
5303 <glob pattern="*.3g2"/>
5304 </mime-type>
5305 <mime-type type="video/bmpeg"/>
5306 <mime-type type="video/bt656"/>
5307 <mime-type type="video/celb"/>
5308 <mime-type type="video/dv"/>
5309 <mime-type type="video/example"/>
5310 <mime-type type="video/h261">
5311 <glob pattern="*.h261"/>
5312 </mime-type>
5313 <mime-type type="video/h263">
5314 <glob pattern="*.h263"/>
5315 </mime-type>
5316 <mime-type type="video/h263-1998"/>
5317 <mime-type type="video/h263-2000"/>
5318 <mime-type type="video/h264">
5319 <glob pattern="*.h264"/>
5320 </mime-type>
5321 <mime-type type="video/jpeg">
5322 <glob pattern="*.jpgv"/>
5323 </mime-type>
5324 <mime-type type="video/jpeg2000"/>
5325
5326 <mime-type type="video/mj2">
5327 <sub-class-of type="image/x-jp2-container" />
5328 <acronym>MJ2</acronym>
5329 <_comment>JPEG 2000 Part 3 (Motion JPEG, MJ2)</_comment>
5330 <magic priority="50">
5331 <match value="0x0000000C6A5020200D0A870A" type="string" offset="0">
5332 <match value="0x6d6a7032" type="string" offset="20"/>
5333 </match>
5334 </magic>
5335 <glob pattern="*.mj2"/>
5336 <glob pattern="*.mjp2"/>
5337 </mime-type>
5338
5339 <mime-type type="video/mp1s"/>
5340 <mime-type type="video/mp2p"/>
5341 <mime-type type="video/mp2t"/>
5342
5343 <mime-type type="video/mp4">
5344 <magic priority="60">
5345 <match value="ftypmp41" type="string" offset="4"/>
5346 <match value="ftypmp42" type="string" offset="4"/>
5347 </magic>
5348 <glob pattern="*.mp4"/>
5349 <glob pattern="*.mp4v"/>
5350 <glob pattern="*.mpg4"/>
5351 <sub-class-of type="video/quicktime" />
5352 </mime-type>
5353 <mime-type type="video/mp4v-es"/>
5354
5355 <mime-type type="video/mpeg">
5356 <_comment>MPEG Movie Clip</_comment>
5357 <magic priority="50">
5358 <match value="\000\000\001\263" type="string" offset="0"/>
5359 <match value="\000\000\001\272" type="string" offset="0"/>
5360 </magic>
5361 <glob pattern="*.mpeg"/>
5362 <glob pattern="*.mpg"/>
5363 <glob pattern="*.mpe"/>
5364 <glob pattern="*.m1v"/>
5365 <glob pattern="*.m2v"/>
5366 </mime-type>
5367
5368 <mime-type type="video/mpeg4-generic"/>
5369 <mime-type type="video/mpv"/>
5370 <mime-type type="video/nv"/>
5371
5372 <mime-type type="video/ogg">
5373 <glob pattern="*.ogv"/>
5374 <sub-class-of type="application/ogg"/>
5375 </mime-type>
5376
5377 <mime-type type="video/parityfec"/>
5378 <mime-type type="video/pointer"/>
5379
5380 <mime-type type="video/quicktime">
5381 <magic priority="50">
5382 <match value="moov" type="string" offset="4"/>
5383 <match value="mdat" type="string" offset="4"/>
5384 <!-- General match, specific ftypXXX ones present for subtypes -->
5385 <match value="ftyp" type="string" offset="4"/>
5386 </magic>
5387 <glob pattern="*.qt"/>
5388 <glob pattern="*.mov"/>
5389 </mime-type>
5390
5391 <mime-type type="video/raw"/>
5392 <mime-type type="video/rtp-enc-aescm128"/>
5393 <mime-type type="video/rtx"/>
5394 <mime-type type="video/smpte292m"/>
5395 <mime-type type="video/ulpfec"/>
5396 <mime-type type="video/vc1"/>
5397 <mime-type type="video/vnd.cctv"/>
5398 <mime-type type="video/vnd.dlna.mpeg-tts"/>
5399 <mime-type type="video/vnd.fvt">
5400 <glob pattern="*.fvt"/>
5401 </mime-type>
5402 <mime-type type="video/vnd.hns.video"/>
5403 <mime-type type="video/vnd.iptvforum.1dparityfec-1010"/>
5404 <mime-type type="video/vnd.iptvforum.1dparityfec-2005"/>
5405 <mime-type type="video/vnd.iptvforum.2dparityfec-1010"/>
5406 <mime-type type="video/vnd.iptvforum.2dparityfec-2005"/>
5407 <mime-type type="video/vnd.iptvforum.ttsavc"/>
5408 <mime-type type="video/vnd.iptvforum.ttsmpeg2"/>
5409 <mime-type type="video/vnd.motorola.video"/>
5410 <mime-type type="video/vnd.motorola.videop"/>
5411 <mime-type type="video/vnd.mpegurl">
5412 <glob pattern="*.mxu"/>
5413 <glob pattern="*.m4u"/>
5414 </mime-type>
5415 <mime-type type="video/vnd.ms-playready.media.pyv">
5416 <glob pattern="*.pyv"/>
5417 </mime-type>
5418 <mime-type type="video/vnd.nokia.interleaved-multimedia"/>
5419 <mime-type type="video/vnd.nokia.videovoip"/>
5420 <mime-type type="video/vnd.objectvideo"/>
5421 <mime-type type="video/vnd.sealed.mpeg1"/>
5422 <mime-type type="video/vnd.sealed.mpeg4"/>
5423 <mime-type type="video/vnd.sealed.swf"/>
5424 <mime-type type="video/vnd.sealedmedia.softseal.mov"/>
5425 <mime-type type="video/vnd.vivo">
5426 <glob pattern="*.viv"/>
5427 </mime-type>
5428 <mime-type type="video/x-f4v">
5429 <glob pattern="*.f4v"/>
5430 </mime-type>
5431
5432 <mime-type type="video/x-flc">
5433 <glob pattern="*.flc"/>
5434 </mime-type>
5435
5436 <mime-type type="video/x-fli">
5437 <glob pattern="*.fli"/>
5438 </mime-type>
5439
5440 <mime-type type="video/x-flv">
5441 <magic priority="50">
5442 <match value="FLV" type="string" offset="0"/>
5443 </magic>
5444 <glob pattern="*.flv"/>
5445 </mime-type>
5446
5447 <mime-type type="video/x-jng">
5448 <magic priority="50">
5449 <match value="\x8bJNG" type="string" offset="0"/>
5450 </magic>
5451 <glob pattern="*.jng"/>
5452 </mime-type>
5453
5454 <mime-type type="video/x-m4v">
5455 <magic priority="60">
5456 <match value="ftypM4V " type="string" offset="4"/>
5457 <match value="ftypM4VH" type="string" offset="4"/>
5458 <match value="ftypM4VP" type="string" offset="4"/>
5459 </magic>
5460 <glob pattern="*.m4v"/>
5461 <sub-class-of type="video/mp4" />
5462 </mime-type>
5463
5464 <mime-type type="video/x-mng">
5465 <magic priority="50">
5466 <match value="\x8aMNG" type="string" offset="0"/>
5467 </magic>
5468 <glob pattern="*.mng"/>
5469 </mime-type>
5470
5471 <mime-type type="video/x-ms-asf">
5472 <glob pattern="*.asf"/>
5473 <glob pattern="*.asx"/>
5474 <magic>
5475 <match value="0x3026b275" type="big32" offset="0" />
5476 </magic>
5477 </mime-type>
5478 <mime-type type="video/x-ms-wm">
5479 <glob pattern="*.wm"/>
5480 </mime-type>
5481 <mime-type type="video/x-ms-wmv">
5482 <sub-class-of type="video/x-ms-asf" />
5483 <glob pattern="*.wmv"/>
5484 <magic priority="60">
5485 <match value="Windows Media Video" type="unicodeLE" offset="0:8192" />
5486 <match value="VC-1 Advanced Profile" type="unicodeLE" offset="0:8192" />
5487 </magic>
5488 </mime-type>
5489 <mime-type type="video/x-ms-wmx">
5490 <glob pattern="*.wmx"/>
5491 </mime-type>
5492 <mime-type type="video/x-ms-wvx">
5493 <glob pattern="*.wvx"/>
5494 </mime-type>
5495
5496 <mime-type type="video/x-msvideo">
5497 <_comment>Audio Video Interleave File</_comment>
5498 <alias type="video/avi"/>
5499 <alias type="video/msvideo"/>
5500 <magic priority="50">
5501 <match value="RIFF....AVI " type="string" offset="0"
5502 mask="0xFFFFFFFF00000000FFFFFFFF"/>
5503 <match offset="8" type="string" value="\x41\x56\x49\x20"/>
5504 </magic>
5505 <glob pattern="*.avi"/>
5506 </mime-type>
5507
5508 <mime-type type="video/x-sgi-movie">
5509 <magic priority="50">
5510 <match value="MOVI" type="string" offset="0"/>
5511 </magic>
5512 <glob pattern="*.movie"/>
5513 </mime-type>
5514
<mime-type type="application/x-matroska">
  <_comment>Matroska Media Container</_comment>
  <!-- Common magic across all Matroska variants; the video/x-matroska,
       audio/x-matroska and video/webm types are declared as sub-classes
       of this type. -->
  <!-- For full detection, we need a custom Detector, see TIKA-1180 -->
  <magic priority="40">
    <match value="0x1A45DFA3" type="string" offset="0" />
  </magic>
</mime-type>
5523
5524 <mime-type type="video/x-matroska">
5525 <sub-class-of type="application/x-matroska"/>
5526 <glob pattern="*.mkv" />
5527 <!-- Note: The magic value below isn't present in all MKV files -->
5528 <magic priority="50">
5529 <match value="0x1A45DFA3934282886D6174726F736B61" type="string" offset="0" />
5530 </magic>
5531 </mime-type>
5532 <mime-type type="audio/x-matroska">
5533 <sub-class-of type="application/x-matroska"/>
5534 <glob pattern="*.mka" />
5535 </mime-type>
5536
5537 <mime-type type="video/webm">
5538 <sub-class-of type="application/x-matroska"/>
5539 <glob pattern="*.webm" />
5540 </mime-type>
5541
5542 <mime-type type="x-conference/x-cooltalk">
5543 <_comment>Cooltalk Audio</_comment>
5544 <glob pattern="*.ice"/>
5545 </mime-type>
5546
5547 <mime-type type="application/x-fictionbook+xml">
5548 <_comment>FictionBook document</_comment>
5549 <sub-class-of type="application/xml"/>
5550 <root-XML namespaceURI="http://www.gribuser.ru/xml/fictionbook/2.0" localName="FictionBook"/>
5551 <glob pattern="*.fb2"/>
5552 </mime-type>
5553
5554 <mime-type type="text/x-asciidoc">
5555 <_comment>Asciidoc source code</_comment>
5556 <glob pattern="*.asciidoc"/>
5557 <glob pattern="*.adoc"/>
5558 <glob pattern="*.ad"/>
5559 <glob pattern="*.ad.txt"/>
5560 <glob pattern="*.adoc.txt"/>
5561 <sub-class-of type="text/plain"/>
5562 </mime-type>
5563
5564 <mime-type type="text/x-d">
5565 <_comment>D source code</_comment>
5566 <glob pattern="*.d"/>
5567 <sub-class-of type="text/plain"/>
5568 </mime-type>
5569
5570 <mime-type type="text/x-haml">
5571 <_comment>HAML source code</_comment>
5572 <glob pattern="*.haml"/>
5573 <sub-class-of type="text/plain"/>
5574 </mime-type>
5575
5576 <mime-type type="text/x-haxe">
5577 <_comment>Haxe source code</_comment>
5578 <glob pattern="*.hx"/>
5579 <sub-class-of type="text/plain"/>
5580 </mime-type>
5581
5582 <mime-type type="text/x-rsrc">
5583 <_comment>R source code</_comment>
5584 <glob pattern="*.r"/>
5585 <sub-class-of type="text/plain"/>
5586 </mime-type>
5587
5588 <mime-type type="application/xquery">
5589 <_comment>XQuery source code</_comment>
5590 <glob pattern="*.xq"/>
5591 <glob pattern="*.xquery"/>
5592 <sub-class-of type="text/plain"/>
5593 </mime-type>
5594
5595 </mime-info>
0 /**
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika;
17
18 import java.io.IOException;
19 import java.net.URL;
20 import java.util.ArrayList;
21 import java.util.Collections;
22 import java.util.Enumeration;
23 import java.util.HashMap;
24 import java.util.List;
25 import java.util.Map;
26
/**
 * A wrapper around a {@link ClassLoader} that logs all
 *  the Resources loaded through it.
 * Used to check that a specific ClassLoader was used
 *  when unit testing.
 * <p>
 * Every lookup made through {@link #getResource(String)} or
 *  {@link #getResources(String)} is recorded under the requested name.
 *  Only URLs that were actually found are stored, so a lookup that
 *  found nothing leaves an empty list for that name rather than a
 *  <code>null</code> entry.
 */
public class ResourceLoggingClassLoader extends ClassLoader {
    /** Requested resource name, mapped to the URLs returned for it so far. */
    private final Map<String, List<URL>> loadedResources = new HashMap<String, List<URL>>();

    /**
     * @param realClassloader the ClassLoader that lookups are delegated to
     */
    public ResourceLoggingClassLoader(ClassLoader realClassloader) {
        super(realClassloader);
    }

    /**
     * Returns the list of URLs recorded for the given name, creating
     *  (and registering) an empty list on the first request for it.
     */
    private List<URL> fetchRecord(String name) {
        List<URL> alreadyLoaded = loadedResources.get(name);
        if (alreadyLoaded == null) {
            alreadyLoaded = new ArrayList<URL>();
            loadedResources.put(name, alreadyLoaded);
        }
        return alreadyLoaded;
    }

    /**
     * Looks the resource up via the parent implementation and records it.
     *
     * @return the resource URL, or <code>null</code> if it was not found
     */
    @Override
    public URL getResource(String name) {
        URL resource = super.getResource(name);
        // Always register the name, so the attempt itself is visible...
        List<URL> alreadyLoaded = fetchRecord(name);
        // ...but never store a null URL for a resource that wasn't found
        if (resource != null) {
            alreadyLoaded.add(resource);
        }
        return resource;
    }

    /**
     * Looks all matching resources up via the parent implementation
     *  and records every one of them.
     *
     * @return an enumeration over the URLs found, possibly empty
     */
    @Override
    public Enumeration<URL> getResources(String name) throws IOException {
        Enumeration<URL> resources = super.getResources(name);
        List<URL> alreadyLoaded = fetchRecord(name);

        // Need to copy as we record, an Enumeration can only be walked once
        List<URL> these = Collections.list(resources);
        alreadyLoaded.addAll(these);

        // Return our copy
        return Collections.enumeration(these);
    }

    /**
     * @return a read-only view of the URLs recorded for the given name,
     *  empty if the name was never requested
     */
    public List<URL> getLoadedResources(String resourceName) {
        List<URL> resources = loadedResources.get(resourceName);
        if (resources == null) {
            return Collections.emptyList();
        }
        return Collections.unmodifiableList(resources);
    }

    /**
     * @return a read-only view of everything recorded so far, keyed
     *  by the requested resource name
     */
    public Map<String, List<URL>> getLoadedResources() {
        return Collections.unmodifiableMap(loadedResources);
    }

    /**
     * Forgets everything recorded so far.
     */
    public void resetLoadedResources() {
        loadedResources.clear();
    }
}
0 /**
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika;
17
18 import java.io.BufferedInputStream;
19 import java.io.File;
20 import java.io.FileInputStream;
21 import java.io.FileOutputStream;
22 import java.io.IOException;
23 import java.io.InputStream;
24
25 import org.apache.tika.utils.RereadableInputStream;
26 import org.junit.Test;
27
28 import static org.junit.Assert.assertEquals;
29
30 public class TestRereadableInputStream {
31
32 private final int TEST_SIZE = 3;
33
34 private final int MEMORY_THRESHOLD = 1;
35
36 private final int NUM_PASSES = 4;
37
38 @Test
39 public void test() throws IOException {
40
41 InputStream is = createTestInputStream();
42 RereadableInputStream ris = new RereadableInputStream(is,
43 MEMORY_THRESHOLD, true, true);
44 try {
45 for (int pass = 0; pass < NUM_PASSES; pass++) {
46 for (int byteNum = 0; byteNum < TEST_SIZE; byteNum++) {
47 int byteRead = ris.read();
48 assertEquals("Pass = " + pass + ", byte num should be "
49 + byteNum + " but is " + byteRead + ".", byteNum,
50 byteRead);
51 }
52 ris.rewind();
53 }
54 } finally {
55 // The RereadableInputStream should close the original input
56 // stream (if it hasn't already).
57 ris.close();
58 }
59 }
60
61 /**
62 * Test that the constructor's readToEndOfStreamOnFirstRewind parameter
63 * correctly determines the behavior.
64 *
65 * @throws IOException
66 */
67 @Test
68 public void testRewind() throws IOException {
69 doTestRewind(true);
70 doTestRewind(false);
71 }
72
73 private void doTestRewind(boolean readToEndOnRewind) throws IOException {
74
75 RereadableInputStream ris = null;
76
77 try {
78 InputStream s1 = createTestInputStream();
79 ris = new RereadableInputStream(s1, 5, readToEndOnRewind, true);
80 ris.read();
81 assertEquals(1, ris.getSize());
82 ris.rewind();
83 boolean moreBytesWereRead = (ris.getSize() > 1);
84 assertEquals(readToEndOnRewind, moreBytesWereRead);
85 } finally {
86 if (ris != null) {
87 ris.close();
88 }
89 }
90
91 }
92
93 private TestInputStream createTestInputStream() throws IOException {
94 return new TestInputStream(
95 new BufferedInputStream(
96 new FileInputStream(createTestFile())));
97 }
98
99 private File createTestFile() throws IOException {
100 File testfile = File.createTempFile("TIKA_ris_test", ".tmp");
101 testfile.deleteOnExit();
102 FileOutputStream fos = new FileOutputStream(testfile);
103 for (int i = 0; i < TEST_SIZE; i++) {
104 fos.write(i);
105 }
106 fos.close();
107 return testfile;
108 }
109
110 @Test
111 public void testCloseBehavior() throws IOException {
112 doACloseBehaviorTest(true);
113 doACloseBehaviorTest(false);
114 }
115
116 private void doACloseBehaviorTest(boolean wantToClose) throws IOException {
117
118 TestInputStream tis = createTestInputStream();
119 RereadableInputStream ris =
120 new RereadableInputStream(tis, 5, true, wantToClose);
121 ris.close();
122 assertEquals(wantToClose, tis.isClosed());
123
124 if (! tis.isClosed()) {
125 tis.close();
126 }
127 }
128
129
130 /**
131 * Adds isClosed() to a BufferedInputStream.
132 */
133 class TestInputStream extends BufferedInputStream {
134
135 private boolean closed;
136
137 public TestInputStream(InputStream inputStream) {
138 super(inputStream);
139 }
140
141 public void close() throws IOException {
142 super.close();
143 closed = true;
144 }
145
146 public boolean isClosed() {
147 return closed;
148 }
149 }
150
151 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika;
17
18 import static org.junit.Assert.assertEquals;
19
20 import org.junit.Test;
21
22 public class TikaDetectionTest {
23
24 private final Tika tika = new Tika();
25
26 /**
27 * This test checks that Tika correctly detects all the file extensions
28 * defined in the mime.types file (revision 819245) of the Apache HTTP
29 * Server project. The tests were created with:
30 * <pre>
31 * cat docs/conf/mime.types | grep -v '#' | perl -lne '/\S\s+\S/ and do {
32 * my ($type, @ext) = split /\s+/;
33 * for my $ext (@ext) {
34 * print "assertEquals(\"$type\", tika.detect(\"x.$ext\"));";
35 * }
36 * }'
37 * </pre>
38 */
39 @Test
40 public void testHttpServerFileExtensions() {
41 assertEquals("application/andrew-inset", tika.detect("x.ez"));
42 assertEquals("application/applixware", tika.detect("x.aw"));
43 assertEquals("application/atom+xml", tika.detect("x.atom"));
44 assertEquals("application/atomcat+xml", tika.detect("x.atomcat"));
45 assertEquals("application/atomsvc+xml", tika.detect("x.atomsvc"));
46 assertEquals("application/ccxml+xml", tika.detect("x.ccxml"));
47 assertEquals("application/cu-seeme", tika.detect("x.cu"));
48 assertEquals("application/davmount+xml", tika.detect("x.davmount"));
49 assertEquals("application/ecmascript", tika.detect("x.ecma"));
50 assertEquals("application/emma+xml", tika.detect("x.emma"));
51 assertEquals("application/epub+zip", tika.detect("x.epub"));
52 assertEquals("application/font-tdpfr", tika.detect("x.pfr"));
53 assertEquals("application/hyperstudio", tika.detect("x.stk"));
54 assertEquals("application/java-archive", tika.detect("x.jar"));
55 assertEquals("application/java-serialized-object", tika.detect("x.ser"));
56 assertEquals("application/java-vm", tika.detect("x.class"));
57 assertEquals("application/javascript", tika.detect("x.js"));
58 assertEquals("application/json", tika.detect("x.json"));
59 assertEquals("application/lost+xml", tika.detect("x.lostxml"));
60 assertEquals("application/mac-binhex40", tika.detect("x.hqx"));
61 assertEquals("application/mac-compactpro", tika.detect("x.cpt"));
62 assertEquals("application/marc", tika.detect("x.mrc"));
63 assertEquals("application/mathematica", tika.detect("x.ma"));
64 assertEquals("application/mathematica", tika.detect("x.nb"));
65 assertEquals("application/mathematica", tika.detect("x.mb"));
66 assertEquals("application/mathml+xml", tika.detect("x.mathml"));
67 assertEquals("application/mbox", tika.detect("x.mbox"));
68 assertEquals("application/mediaservercontrol+xml", tika.detect("x.mscml"));
69 assertEquals("application/mp4", tika.detect("x.mp4s"));
70 assertEquals("application/msword", tika.detect("x.doc"));
71 assertEquals("application/msword", tika.detect("x.dot"));
72 assertEquals("application/mxf", tika.detect("x.mxf"));
73 assertEquals("application/octet-stream", tika.detect("x.bin"));
74 assertEquals("application/octet-stream", tika.detect("x.dms"));
75 assertEquals("application/octet-stream", tika.detect("x.lha"));
76 assertEquals("application/octet-stream", tika.detect("x.lrf"));
77 assertEquals("application/octet-stream", tika.detect("x.lzh"));
78 assertEquals("application/octet-stream", tika.detect("x.so"));
79 assertEquals("application/octet-stream", tika.detect("x.iso"));
80 assertEquals("application/octet-stream", tika.detect("x.dmg"));
81 assertEquals("application/octet-stream", tika.detect("x.dist"));
82 assertEquals("application/octet-stream", tika.detect("x.distz"));
83 assertEquals("application/octet-stream", tika.detect("x.pkg"));
84 assertEquals("application/octet-stream", tika.detect("x.bpk"));
85 assertEquals("application/octet-stream", tika.detect("x.dump"));
86 assertEquals("application/octet-stream", tika.detect("x.elc"));
87 assertEquals("application/octet-stream", tika.detect("x.deploy"));
88 assertEquals("application/oda", tika.detect("x.oda"));
89 assertEquals("application/oebps-package+xml", tika.detect("x.opf"));
90 assertEquals("application/ogg", tika.detect("x.ogx"));
91 assertEquals("application/onenote", tika.detect("x.onetoc"));
92 assertEquals("application/onenote", tika.detect("x.onetoc2"));
93 assertEquals("application/onenote", tika.detect("x.onetmp"));
94 assertEquals("application/onenote", tika.detect("x.onepkg"));
95 assertEquals("application/patch-ops-error+xml", tika.detect("x.xer"));
96 assertEquals("application/pdf", tika.detect("x.pdf"));
97 assertEquals("application/pgp-encrypted", tika.detect("x.pgp"));
98 assertEquals("application/pgp-signature", tika.detect("x.asc"));
99 assertEquals("application/pgp-signature", tika.detect("x.sig"));
100 assertEquals("application/pics-rules", tika.detect("x.prf"));
101 assertEquals("application/pkcs10", tika.detect("x.p10"));
102 assertEquals("application/pkcs7-mime", tika.detect("x.p7m"));
103 assertEquals("application/pkcs7-mime", tika.detect("x.p7c"));
104 assertEquals("application/pkcs7-signature", tika.detect("x.p7s"));
105 assertEquals("application/pkix-cert", tika.detect("x.cer"));
106 assertEquals("application/pkix-crl", tika.detect("x.crl"));
107 assertEquals("application/pkix-pkipath", tika.detect("x.pkipath"));
108 assertEquals("application/pkixcmp", tika.detect("x.pki"));
109 assertEquals("application/pls+xml", tika.detect("x.pls"));
110 assertEquals("application/illustrator", tika.detect("x.ai"));
111 assertEquals("application/postscript", tika.detect("x.eps"));
112 assertEquals("application/postscript", tika.detect("x.ps"));
113 assertEquals("application/prs.cww", tika.detect("x.cww"));
114 assertEquals("application/rdf+xml", tika.detect("x.rdf"));
115 assertEquals("application/reginfo+xml", tika.detect("x.rif"));
116 assertEquals("application/relax-ng-compact-syntax", tika.detect("x.rnc"));
117 assertEquals("application/resource-lists+xml", tika.detect("x.rl"));
118 assertEquals("application/resource-lists-diff+xml", tika.detect("x.rld"));
119 assertEquals("application/rls-services+xml", tika.detect("x.rs"));
120 assertEquals("application/rsd+xml", tika.detect("x.rsd"));
121 assertEquals("application/rss+xml", tika.detect("x.rss"));
122 assertEquals("application/rtf", tika.detect("x.rtf"));
123 assertEquals("application/sbml+xml", tika.detect("x.sbml"));
124 assertEquals("application/scvp-cv-request", tika.detect("x.scq"));
125 assertEquals("application/scvp-cv-response", tika.detect("x.scs"));
126 assertEquals("application/scvp-vp-request", tika.detect("x.spq"));
127 assertEquals("application/scvp-vp-response", tika.detect("x.spp"));
128 assertEquals("application/sdp", tika.detect("x.sdp"));
129 assertEquals("application/set-payment-initiation", tika.detect("x.setpay"));
130 assertEquals("application/set-registration-initiation", tika.detect("x.setreg"));
131 assertEquals("application/sldworks", tika.detect("x.sldprt"));
132 assertEquals("application/sldworks", tika.detect("x.sldasm"));
133 assertEquals("application/sldworks", tika.detect("x.slddrw"));
134 assertEquals("application/shf+xml", tika.detect("x.shf"));
135 assertEquals("application/smil+xml", tika.detect("x.smi"));
136 assertEquals("application/smil+xml", tika.detect("x.smil"));
137 assertEquals("application/sparql-query", tika.detect("x.rq"));
138 assertEquals("application/sparql-results+xml", tika.detect("x.srx"));
139 assertEquals("application/srgs", tika.detect("x.gram"));
140 assertEquals("application/srgs+xml", tika.detect("x.grxml"));
141 assertEquals("application/ssml+xml", tika.detect("x.ssml"));
142 assertEquals("application/vnd.3gpp.pic-bw-large", tika.detect("x.plb"));
143 assertEquals("application/vnd.3gpp.pic-bw-small", tika.detect("x.psb"));
144 assertEquals("application/vnd.3gpp.pic-bw-var", tika.detect("x.pvb"));
145 assertEquals("application/vnd.3gpp2.tcap", tika.detect("x.tcap"));
146 assertEquals("application/vnd.3m.post-it-notes", tika.detect("x.pwn"));
147 assertEquals("application/vnd.accpac.simply.aso", tika.detect("x.aso"));
148 assertEquals("application/vnd.accpac.simply.imp", tika.detect("x.imp"));
149 assertEquals("application/vnd.acucobol", tika.detect("x.acu"));
150 assertEquals("application/vnd.acucorp", tika.detect("x.atc"));
151 assertEquals("application/vnd.acucorp", tika.detect("x.acutc"));
152 assertEquals("application/vnd.adobe.air-application-installer-package+zip", tika.detect("x.air"));
153 assertEquals("application/vnd.adobe.xdp+xml", tika.detect("x.xdp"));
154 assertEquals("application/vnd.adobe.xfdf", tika.detect("x.xfdf"));
155 assertEquals("application/vnd.airzip.filesecure.azf", tika.detect("x.azf"));
156 assertEquals("application/vnd.airzip.filesecure.azs", tika.detect("x.azs"));
157 assertEquals("application/vnd.amazon.ebook", tika.detect("x.azw"));
158 assertEquals("application/vnd.americandynamics.acc", tika.detect("x.acc"));
159 assertEquals("application/vnd.amiga.ami", tika.detect("x.ami"));
160 assertEquals("application/vnd.android.package-archive", tika.detect("x.apk"));
161 assertEquals("application/vnd.anser-web-certificate-issue-initiation", tika.detect("x.cii"));
162 assertEquals("application/vnd.anser-web-funds-transfer-initiation", tika.detect("x.fti"));
163 assertEquals("application/vnd.antix.game-component", tika.detect("x.atx"));
164 assertEquals("application/vnd.apple.installer+xml", tika.detect("x.mpkg"));
165 assertEquals("application/vnd.arastra.swi", tika.detect("x.swi"));
166 // Differ from httpd - Adobe After Effects is a much more common user of .AEP these days
167 //assertEquals("application/vnd.audiograph", tika.detect("x.aep"));
168 assertEquals("application/vnd.blueice.multipass", tika.detect("x.mpm"));
169 assertEquals("application/vnd.bmi", tika.detect("x.bmi"));
170 assertEquals("application/vnd.businessobjects", tika.detect("x.rep"));
171 assertEquals("application/vnd.chemdraw+xml", tika.detect("x.cdxml"));
172 assertEquals("application/vnd.chipnuts.karaoke-mmd", tika.detect("x.mmd"));
173 assertEquals("application/vnd.cinderella", tika.detect("x.cdy"));
174 assertEquals("application/vnd.claymore", tika.detect("x.cla"));
175 assertEquals("application/vnd.clonk.c4group", tika.detect("x.c4g"));
176 assertEquals("application/vnd.clonk.c4group", tika.detect("x.c4d"));
177 assertEquals("application/vnd.clonk.c4group", tika.detect("x.c4f"));
178 assertEquals("application/vnd.clonk.c4group", tika.detect("x.c4p"));
179 assertEquals("application/vnd.clonk.c4group", tika.detect("x.c4u"));
180 assertEquals("application/vnd.commonspace", tika.detect("x.csp"));
181 assertEquals("application/vnd.contact.cmsg", tika.detect("x.cdbcmsg"));
182 assertEquals("application/vnd.cosmocaller", tika.detect("x.cmc"));
183 assertEquals("application/vnd.crick.clicker", tika.detect("x.clkx"));
184 assertEquals("application/vnd.crick.clicker.keyboard", tika.detect("x.clkk"));
185 assertEquals("application/vnd.crick.clicker.palette", tika.detect("x.clkp"));
186 assertEquals("application/vnd.crick.clicker.template", tika.detect("x.clkt"));
187 assertEquals("application/vnd.crick.clicker.wordbank", tika.detect("x.clkw"));
188 assertEquals("application/vnd.criticaltools.wbs+xml", tika.detect("x.wbs"));
189 assertEquals("application/vnd.ctc-posml", tika.detect("x.pml"));
190 assertEquals("application/vnd.cups-ppd", tika.detect("x.ppd"));
191 assertEquals("application/vnd.curl.car", tika.detect("x.car"));
192 assertEquals("application/vnd.curl.pcurl", tika.detect("x.pcurl"));
193 assertEquals("application/vnd.data-vision.rdz", tika.detect("x.rdz"));
194 assertEquals("application/vnd.denovo.fcselayout-link", tika.detect("x.fe_launch"));
195 assertEquals("application/vnd.dna", tika.detect("x.dna"));
196 assertEquals("application/vnd.dolby.mlp", tika.detect("x.mlp"));
197 assertEquals("application/vnd.dpgraph", tika.detect("x.dpg"));
198 assertEquals("application/vnd.dreamfactory", tika.detect("x.dfac"));
199 assertEquals("application/vnd.dynageo", tika.detect("x.geo"));
200 assertEquals("application/vnd.ecowin.chart", tika.detect("x.mag"));
201 assertEquals("application/vnd.enliven", tika.detect("x.nml"));
202 assertEquals("application/vnd.epson.esf", tika.detect("x.esf"));
203 assertEquals("application/vnd.epson.msf", tika.detect("x.msf"));
204 assertEquals("application/vnd.epson.quickanime", tika.detect("x.qam"));
205 assertEquals("application/vnd.epson.salt", tika.detect("x.slt"));
206 assertEquals("application/vnd.epson.ssf", tika.detect("x.ssf"));
207 assertEquals("application/vnd.eszigno3+xml", tika.detect("x.es3"));
208 assertEquals("application/vnd.eszigno3+xml", tika.detect("x.et3"));
209 assertEquals("application/vnd.ezpix-album", tika.detect("x.ez2"));
210 assertEquals("application/vnd.ezpix-package", tika.detect("x.ez3"));
211 assertEquals("application/vnd.fdf", tika.detect("x.fdf"));
212 assertEquals("application/vnd.fdsn.mseed", tika.detect("x.mseed"));
213 assertEquals("application/vnd.fdsn.seed", tika.detect("x.seed"));
214 assertEquals("application/vnd.fdsn.seed", tika.detect("x.dataless"));
215 assertEquals("application/vnd.flographit", tika.detect("x.gph"));
216 assertEquals("application/vnd.fluxtime.clip", tika.detect("x.ftc"));
217 assertEquals("application/vnd.framemaker", tika.detect("x.fm"));
218 assertEquals("application/vnd.framemaker", tika.detect("x.frame"));
219 assertEquals("application/vnd.framemaker", tika.detect("x.maker"));
220 assertEquals("application/vnd.framemaker", tika.detect("x.book"));
221 assertEquals("application/vnd.frogans.fnc", tika.detect("x.fnc"));
222 assertEquals("application/vnd.frogans.ltf", tika.detect("x.ltf"));
223 assertEquals("application/vnd.fsc.weblaunch", tika.detect("x.fsc"));
224 assertEquals("application/vnd.fujitsu.oasys", tika.detect("x.oas"));
225 assertEquals("application/vnd.fujitsu.oasys2", tika.detect("x.oa2"));
226 assertEquals("application/vnd.fujitsu.oasys3", tika.detect("x.oa3"));
227 assertEquals("application/vnd.fujitsu.oasysgp", tika.detect("x.fg5"));
228 assertEquals("application/vnd.fujitsu.oasysprs", tika.detect("x.bh2"));
229 assertEquals("application/vnd.fujixerox.ddd", tika.detect("x.ddd"));
230 assertEquals("application/vnd.fujixerox.docuworks", tika.detect("x.xdw"));
231 assertEquals("application/vnd.fujixerox.docuworks.binder", tika.detect("x.xbd"));
232 assertEquals("application/vnd.fuzzysheet", tika.detect("x.fzs"));
233 assertEquals("application/vnd.genomatix.tuxedo", tika.detect("x.txd"));
234 assertEquals("application/vnd.geogebra.file", tika.detect("x.ggb"));
235 assertEquals("application/vnd.geogebra.tool", tika.detect("x.ggt"));
236 assertEquals("application/vnd.geometry-explorer", tika.detect("x.gex"));
237 assertEquals("application/vnd.geometry-explorer", tika.detect("x.gre"));
238 assertEquals("application/vnd.gmx", tika.detect("x.gmx"));
239 assertEquals("application/vnd.google-earth.kml+xml", tika.detect("x.kml"));
240 assertEquals("application/vnd.google-earth.kmz", tika.detect("x.kmz"));
241 assertEquals("application/vnd.grafeq", tika.detect("x.gqf"));
242 assertEquals("application/vnd.grafeq", tika.detect("x.gqs"));
243 assertEquals("application/vnd.groove-account", tika.detect("x.gac"));
244 assertEquals("application/vnd.groove-help", tika.detect("x.ghf"));
245 assertEquals("application/vnd.groove-identity-message", tika.detect("x.gim"));
246 assertEquals("application/vnd.groove-injector", tika.detect("x.grv"));
247 assertEquals("application/vnd.groove-tool-message", tika.detect("x.gtm"));
248 assertEquals("application/vnd.groove-tool-template", tika.detect("x.tpl"));
249 assertEquals("application/vnd.groove-vcard", tika.detect("x.vcg"));
250 assertEquals("application/vnd.handheld-entertainment+xml", tika.detect("x.zmm"));
251 assertEquals("application/vnd.hbci", tika.detect("x.hbci"));
252 assertEquals("application/vnd.hhe.lesson-player", tika.detect("x.les"));
253 assertEquals("application/vnd.hp-hpgl", tika.detect("x.hpgl"));
254 assertEquals("application/vnd.hp-hpid", tika.detect("x.hpid"));
255 assertEquals("application/vnd.hp-hps", tika.detect("x.hps"));
256 assertEquals("application/vnd.hp-jlyt", tika.detect("x.jlt"));
257 assertEquals("application/vnd.hp-pcl", tika.detect("x.pcl"));
258 assertEquals("application/vnd.hp-pclxl", tika.detect("x.pclxl"));
259 assertEquals("application/vnd.hydrostatix.sof-data", tika.detect("x.sfd-hdstx"));
260 assertEquals("application/vnd.hzn-3d-crossword", tika.detect("x.x3d"));
261 assertEquals("application/vnd.ibm.minipay", tika.detect("x.mpy"));
262 assertEquals("application/vnd.ibm.modcap", tika.detect("x.afp"));
263 assertEquals("application/vnd.ibm.modcap", tika.detect("x.listafp"));
264 assertEquals("application/vnd.ibm.modcap", tika.detect("x.list3820"));
265 assertEquals("application/vnd.ibm.rights-management", tika.detect("x.irm"));
266 assertEquals("application/vnd.ibm.secure-container", tika.detect("x.sc"));
267 assertEquals("application/vnd.iccprofile", tika.detect("x.icc"));
268 assertEquals("application/vnd.iccprofile", tika.detect("x.icm"));
269 assertEquals("application/vnd.igloader", tika.detect("x.igl"));
270 assertEquals("application/vnd.immervision-ivp", tika.detect("x.ivp"));
271 assertEquals("application/vnd.immervision-ivu", tika.detect("x.ivu"));
272 assertEquals("application/vnd.intercon.formnet", tika.detect("x.xpw"));
273 assertEquals("application/vnd.intercon.formnet", tika.detect("x.xpx"));
274 assertEquals("application/vnd.intu.qbo", tika.detect("x.qbo"));
275 assertEquals("application/vnd.intu.qfx", tika.detect("x.qfx"));
276 assertEquals("application/vnd.ipunplugged.rcprofile", tika.detect("x.rcprofile"));
277 assertEquals("application/vnd.irepository.package+xml", tika.detect("x.irp"));
278 assertEquals("application/vnd.is-xpr", tika.detect("x.xpr"));
279 assertEquals("application/vnd.jam", tika.detect("x.jam"));
280 assertEquals("application/vnd.jcp.javame.midlet-rms", tika.detect("x.rms"));
281 assertEquals("application/vnd.jisp", tika.detect("x.jisp"));
282 assertEquals("application/vnd.joost.joda-archive", tika.detect("x.joda"));
283 assertEquals("application/vnd.kahootz", tika.detect("x.ktz"));
284 assertEquals("application/vnd.kahootz", tika.detect("x.ktr"));
285 assertEquals("application/vnd.kde.karbon", tika.detect("x.karbon"));
286 assertEquals("application/vnd.kde.kchart", tika.detect("x.chrt"));
287 assertEquals("application/vnd.kde.kformula", tika.detect("x.kfo"));
288 assertEquals("application/vnd.kde.kivio", tika.detect("x.flw"));
289 assertEquals("application/vnd.kde.kontour", tika.detect("x.kon"));
290 assertEquals("application/vnd.kde.kpresenter", tika.detect("x.kpr"));
291 assertEquals("application/vnd.kde.kpresenter", tika.detect("x.kpt"));
292 assertEquals("application/vnd.kde.kspread", tika.detect("x.ksp"));
293 assertEquals("application/vnd.kde.kword", tika.detect("x.kwd"));
294 assertEquals("application/vnd.kde.kword", tika.detect("x.kwt"));
295 assertEquals("application/vnd.kenameaapp", tika.detect("x.htke"));
296 assertEquals("application/vnd.kidspiration", tika.detect("x.kia"));
297 assertEquals("application/vnd.kinar", tika.detect("x.kne"));
298 assertEquals("application/vnd.kinar", tika.detect("x.knp"));
299 assertEquals("application/vnd.koan", tika.detect("x.skp"));
300 assertEquals("application/vnd.koan", tika.detect("x.skd"));
301 assertEquals("application/vnd.koan", tika.detect("x.skt"));
302 assertEquals("application/vnd.koan", tika.detect("x.skm"));
303 assertEquals("application/vnd.kodak-descriptor", tika.detect("x.sse"));
304 assertEquals("application/vnd.llamagraphics.life-balance.desktop", tika.detect("x.lbd"));
305 assertEquals("application/vnd.llamagraphics.life-balance.exchange+xml", tika.detect("x.lbe"));
306 assertEquals("application/vnd.lotus-1-2-3", tika.detect("x.123"));
307 assertEquals("application/vnd.lotus-approach", tika.detect("x.apr"));
308 assertEquals("application/vnd.lotus-freelance", tika.detect("x.pre"));
309 assertEquals("application/vnd.lotus-notes", tika.detect("x.nsf"));
310 assertEquals("application/vnd.lotus-organizer", tika.detect("x.org"));
311 assertEquals("text/x-scheme", tika.detect("x.scm"));
312 assertEquals("application/vnd.lotus-wordpro", tika.detect("x.lwp"));
313 assertEquals("application/vnd.macports.portpkg", tika.detect("x.portpkg"));
314 assertEquals("application/vnd.mcd", tika.detect("x.mcd"));
315 assertEquals("application/vnd.medcalcdata", tika.detect("x.mc1"));
316 assertEquals("application/vnd.mediastation.cdkey", tika.detect("x.cdkey"));
317 assertEquals("application/vnd.mfer", tika.detect("x.mwf"));
318 assertEquals("application/vnd.mfmp", tika.detect("x.mfm"));
319 assertEquals("application/vnd.micrografx.flo", tika.detect("x.flo"));
320 assertEquals("application/vnd.micrografx.igx", tika.detect("x.igx"));
321 assertEquals("application/vnd.mif", tika.detect("x.mif"));
322 assertEquals("application/vnd.mobius.daf", tika.detect("x.daf"));
323 assertEquals("application/vnd.mobius.dis", tika.detect("x.dis"));
324 assertEquals("application/vnd.mobius.mbk", tika.detect("x.mbk"));
325 assertEquals("application/vnd.mobius.mqy", tika.detect("x.mqy"));
326 assertEquals("application/vnd.mobius.msl", tika.detect("x.msl"));
327 assertEquals("application/vnd.mobius.plc", tika.detect("x.plc"));
328 assertEquals("application/vnd.mobius.txf", tika.detect("x.txf"));
329 assertEquals("application/vnd.mophun.application", tika.detect("x.mpn"));
330 assertEquals("application/vnd.mophun.certificate", tika.detect("x.mpc"));
331 assertEquals("application/vnd.mozilla.xul+xml", tika.detect("x.xul"));
332 assertEquals("application/vnd.ms-artgalry", tika.detect("x.cil"));
333 assertEquals("application/vnd.ms-cab-compressed", tika.detect("x.cab"));
334 assertEquals("application/vnd.ms-excel", tika.detect("x.xls"));
335 assertEquals("application/vnd.ms-excel", tika.detect("x.xlm"));
336 assertEquals("application/vnd.ms-excel", tika.detect("x.xla"));
337 assertEquals("application/vnd.ms-excel", tika.detect("x.xlc"));
338 assertEquals("application/vnd.ms-excel", tika.detect("x.xlt"));
339 assertEquals("application/vnd.ms-excel", tika.detect("x.xlw"));
340 assertEquals("application/vnd.ms-excel.addin.macroenabled.12", tika.detect("x.xlam"));
341 assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", tika.detect("x.xlsb"));
342 assertEquals("application/vnd.ms-excel.sheet.macroenabled.12", tika.detect("x.xlsm"));
343 assertEquals("application/vnd.ms-excel.template.macroenabled.12", tika.detect("x.xltm"));
344 assertEquals("application/vnd.ms-fontobject", tika.detect("x.eot"));
345 assertEquals("application/vnd.ms-htmlhelp", tika.detect("x.chm"));
346 assertEquals("application/vnd.ms-ims", tika.detect("x.ims"));
347 assertEquals("application/vnd.ms-lrm", tika.detect("x.lrm"));
348 assertEquals("application/vnd.ms-pki.seccat", tika.detect("x.cat"));
349 assertEquals("application/vnd.ms-pki.stl", tika.detect("x.stl"));
350 assertEquals("application/vnd.ms-powerpoint", tika.detect("x.ppt"));
351 assertEquals("application/vnd.ms-powerpoint", tika.detect("x.pps"));
352 assertEquals("application/vnd.ms-powerpoint", tika.detect("x.pot"));
353 assertEquals("application/vnd.ms-powerpoint.addin.macroenabled.12", tika.detect("x.ppam"));
354 assertEquals("application/vnd.ms-powerpoint.presentation.macroenabled.12", tika.detect("x.pptm"));
355 assertEquals("application/vnd.ms-powerpoint.slide.macroenabled.12", tika.detect("x.sldm"));
356 assertEquals("application/vnd.ms-powerpoint.slideshow.macroenabled.12", tika.detect("x.ppsm"));
357 assertEquals("application/vnd.ms-powerpoint.template.macroenabled.12", tika.detect("x.potm"));
358 assertEquals("application/vnd.ms-project", tika.detect("x.mpp"));
359 assertEquals("application/vnd.ms-project", tika.detect("x.mpt"));
360 assertEquals("application/vnd.ms-word.document.macroenabled.12", tika.detect("x.docm"));
361 assertEquals("application/vnd.ms-word.template.macroenabled.12", tika.detect("x.dotm"));
362 assertEquals("application/vnd.ms-works", tika.detect("x.wps"));
363 assertEquals("application/vnd.ms-works", tika.detect("x.wks"));
364 assertEquals("application/vnd.ms-works", tika.detect("x.wcm"));
365 assertEquals("application/vnd.ms-works", tika.detect("x.wdb"));
366 assertEquals("application/vnd.ms-wpl", tika.detect("x.wpl"));
367 assertEquals("application/vnd.ms-xpsdocument", tika.detect("x.xps"));
368 assertEquals("application/vnd.mseq", tika.detect("x.mseq"));
369 assertEquals("application/vnd.musician", tika.detect("x.mus"));
370 assertEquals("application/vnd.muvee.style", tika.detect("x.msty"));
371 assertEquals("application/vnd.neurolanguage.nlu", tika.detect("x.nlu"));
372 assertEquals("application/vnd.noblenet-directory", tika.detect("x.nnd"));
373 assertEquals("application/vnd.noblenet-sealer", tika.detect("x.nns"));
374 assertEquals("application/vnd.noblenet-web", tika.detect("x.nnw"));
375 assertEquals("application/vnd.nokia.n-gage.data", tika.detect("x.ngdat"));
376 assertEquals("application/vnd.nokia.n-gage.symbian.install", tika.detect("x.n-gage"));
377 assertEquals("application/vnd.nokia.radio-preset", tika.detect("x.rpst"));
378 assertEquals("application/vnd.nokia.radio-presets", tika.detect("x.rpss"));
379 assertEquals("application/vnd.novadigm.edm", tika.detect("x.edm"));
380 assertEquals("application/vnd.novadigm.edx", tika.detect("x.edx"));
381 assertEquals("application/vnd.novadigm.ext", tika.detect("x.ext"));
382 assertEquals("application/vnd.oasis.opendocument.chart", tika.detect("x.odc"));
383 assertEquals("application/vnd.oasis.opendocument.chart-template", tika.detect("x.otc"));
384 assertEquals("application/vnd.oasis.opendocument.database", tika.detect("x.odb"));
385 assertEquals("application/vnd.oasis.opendocument.formula", tika.detect("x.odf"));
386 assertEquals("application/vnd.oasis.opendocument.formula-template", tika.detect("x.odft"));
387 assertEquals("application/vnd.oasis.opendocument.graphics", tika.detect("x.odg"));
388 assertEquals("application/vnd.oasis.opendocument.graphics-template", tika.detect("x.otg"));
389 assertEquals("application/vnd.oasis.opendocument.image", tika.detect("x.odi"));
390 assertEquals("application/vnd.oasis.opendocument.image-template", tika.detect("x.oti"));
391 assertEquals("application/vnd.oasis.opendocument.presentation", tika.detect("x.odp"));
392 assertEquals("application/vnd.oasis.opendocument.presentation-template", tika.detect("x.otp"));
393 assertEquals("application/vnd.oasis.opendocument.spreadsheet", tika.detect("x.ods"));
394 assertEquals("application/vnd.oasis.opendocument.spreadsheet-template", tika.detect("x.ots"));
395 assertEquals("application/vnd.oasis.opendocument.text", tika.detect("x.odt"));
396 assertEquals("application/vnd.oasis.opendocument.text-master", tika.detect("x.otm"));
397 assertEquals("application/vnd.oasis.opendocument.text-template", tika.detect("x.ott"));
398 assertEquals("application/vnd.oasis.opendocument.text-web", tika.detect("x.oth"));
399 assertEquals("application/vnd.olpc-sugar", tika.detect("x.xo"));
400 assertEquals("application/vnd.oma.dd2+xml", tika.detect("x.dd2"));
401 assertEquals("application/vnd.openofficeorg.extension", tika.detect("x.oxt"));
402 assertEquals("application/vnd.openxmlformats-officedocument.presentationml.presentation", tika.detect("x.pptx"));
403 assertEquals("application/vnd.openxmlformats-officedocument.presentationml.slide", tika.detect("x.sldx"));
404 assertEquals("application/vnd.openxmlformats-officedocument.presentationml.slideshow", tika.detect("x.ppsx"));
405 assertEquals("application/vnd.openxmlformats-officedocument.presentationml.template", tika.detect("x.potx"));
406 assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", tika.detect("x.xlsx"));
407 assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.template", tika.detect("x.xltx"));
408 assertEquals("application/vnd.openxmlformats-officedocument.wordprocessingml.document", tika.detect("x.docx"));
409 assertEquals("application/vnd.openxmlformats-officedocument.wordprocessingml.template", tika.detect("x.dotx"));
410 assertEquals("application/vnd.osgi.dp", tika.detect("x.dp"));
411 assertEquals("chemical/x-pdb", tika.detect("x.pdb"));
412 assertEquals("application/vnd.palm", tika.detect("x.pqa"));
413 assertEquals("application/vnd.palm", tika.detect("x.oprc"));
414 assertEquals("application/vnd.pg.format", tika.detect("x.str"));
415 assertEquals("application/vnd.pg.osasli", tika.detect("x.ei6"));
416 assertEquals("application/vnd.picsel", tika.detect("x.efif"));
417 assertEquals("application/vnd.pocketlearn", tika.detect("x.plf"));
418 assertEquals("application/vnd.powerbuilder6", tika.detect("x.pbd"));
419 assertEquals("application/vnd.previewsystems.box", tika.detect("x.box"));
420 assertEquals("application/vnd.proteus.magazine", tika.detect("x.mgz"));
421 assertEquals("application/vnd.publishare-delta-tree", tika.detect("x.qps"));
422 assertEquals("application/vnd.pvi.ptid1", tika.detect("x.ptid"));
423 assertEquals("application/vnd.quark.quarkxpress", tika.detect("x.qxd"));
424 assertEquals("application/vnd.quark.quarkxpress", tika.detect("x.qxt"));
425 assertEquals("application/vnd.quark.quarkxpress", tika.detect("x.qwd"));
426 assertEquals("application/vnd.quark.quarkxpress", tika.detect("x.qwt"));
427 assertEquals("application/vnd.quark.quarkxpress", tika.detect("x.qxl"));
428 assertEquals("application/vnd.quark.quarkxpress", tika.detect("x.qxb"));
429 assertEquals("application/vnd.recordare.musicxml", tika.detect("x.mxl"));
430 assertEquals("application/vnd.recordare.musicxml+xml", tika.detect("x.musicxml"));
431 assertEquals("application/vnd.rim.cod", tika.detect("x.cod"));
432 assertEquals("application/vnd.rn-realmedia", tika.detect("x.rm"));
433 assertEquals("application/vnd.route66.link66+xml", tika.detect("x.link66"));
434 assertEquals("application/vnd.seemail", tika.detect("x.see"));
435 assertEquals("application/vnd.sema", tika.detect("x.sema"));
436 assertEquals("application/vnd.semd", tika.detect("x.semd"));
437 assertEquals("application/vnd.semf", tika.detect("x.semf"));
438 assertEquals("application/vnd.shana.informed.formdata", tika.detect("x.ifm"));
439 assertEquals("application/vnd.shana.informed.formtemplate", tika.detect("x.itp"));
440 assertEquals("application/vnd.shana.informed.interchange", tika.detect("x.iif"));
441 assertEquals("application/vnd.shana.informed.package", tika.detect("x.ipk"));
442 assertEquals("application/vnd.simtech-mindmapper", tika.detect("x.twd"));
443 assertEquals("application/vnd.simtech-mindmapper", tika.detect("x.twds"));
444 assertEquals("application/vnd.smaf", tika.detect("x.mmf"));
445 assertEquals("application/vnd.smart.teacher", tika.detect("x.teacher"));
446 assertEquals("application/vnd.solent.sdkm+xml", tika.detect("x.sdkm"));
447 assertEquals("application/vnd.solent.sdkm+xml", tika.detect("x.sdkd"));
448 assertEquals("application/vnd.spotfire.dxp", tika.detect("x.dxp"));
449 assertEquals("application/vnd.spotfire.sfs", tika.detect("x.sfs"));
450 assertEquals("application/vnd.stardivision.calc", tika.detect("x.sdc"));
451 assertEquals("application/vnd.stardivision.draw", tika.detect("x.sda"));
452 assertEquals("application/vnd.stardivision.impress", tika.detect("x.sdd"));
453 assertEquals("application/vnd.stardivision.math", tika.detect("x.smf"));
454 assertEquals("application/vnd.stardivision.writer", tika.detect("x.sdw"));
455 assertEquals("application/x-staroffice-template", tika.detect("x.vor"));
456 assertEquals("application/vnd.stardivision.writer-global", tika.detect("x.sgl"));
457 assertEquals("application/vnd.sun.xml.calc", tika.detect("x.sxc"));
458 assertEquals("application/vnd.sun.xml.calc.template", tika.detect("x.stc"));
459 assertEquals("application/vnd.sun.xml.draw", tika.detect("x.sxd"));
460 assertEquals("application/vnd.sun.xml.draw.template", tika.detect("x.std"));
461 assertEquals("application/vnd.sun.xml.impress", tika.detect("x.sxi"));
462 assertEquals("application/vnd.sun.xml.impress.template", tika.detect("x.sti"));
463 assertEquals("application/vnd.sun.xml.math", tika.detect("x.sxm"));
464 assertEquals("application/vnd.sun.xml.writer", tika.detect("x.sxw"));
465 assertEquals("application/vnd.sun.xml.writer.global", tika.detect("x.sxg"));
466 assertEquals("application/vnd.sun.xml.writer.template", tika.detect("x.stw"));
467 assertEquals("application/vnd.sus-calendar", tika.detect("x.sus"));
468 assertEquals("application/vnd.sus-calendar", tika.detect("x.susp"));
469 assertEquals("application/vnd.svd", tika.detect("x.svd"));
470 assertEquals("application/vnd.symbian.install", tika.detect("x.sis"));
471 assertEquals("application/vnd.symbian.install", tika.detect("x.sisx"));
472 assertEquals("application/vnd.syncml+xml", tika.detect("x.xsm"));
473 assertEquals("application/vnd.syncml.dm+wbxml", tika.detect("x.bdm"));
474 assertEquals("application/vnd.syncml.dm+xml", tika.detect("x.xdm"));
475 assertEquals("application/vnd.tao.intent-module-archive", tika.detect("x.tao"));
476 assertEquals("application/vnd.tmobile-livetv", tika.detect("x.tmo"));
477 assertEquals("application/vnd.trid.tpt", tika.detect("x.tpt"));
478 assertEquals("application/vnd.triscape.mxs", tika.detect("x.mxs"));
479 assertEquals("application/vnd.trueapp", tika.detect("x.tra"));
480 assertEquals("application/vnd.ufdl", tika.detect("x.ufd"));
481 assertEquals("application/vnd.ufdl", tika.detect("x.ufdl"));
482 assertEquals("application/vnd.uiq.theme", tika.detect("x.utz"));
483 assertEquals("application/vnd.umajin", tika.detect("x.umj"));
484 assertEquals("application/vnd.unity", tika.detect("x.unityweb"));
485 assertEquals("application/vnd.uoml+xml", tika.detect("x.uoml"));
486 assertEquals("application/vnd.vcx", tika.detect("x.vcx"));
487 assertEquals("application/vnd.visio", tika.detect("x.vsd"));
488 assertEquals("application/vnd.visio", tika.detect("x.vst"));
489 assertEquals("application/vnd.visio", tika.detect("x.vss"));
490 assertEquals("application/vnd.visio", tika.detect("x.vsw"));
491 assertEquals("application/vnd.visionary", tika.detect("x.vis"));
492 assertEquals("application/vnd.vsf", tika.detect("x.vsf"));
493 assertEquals("application/vnd.wap.wbxml", tika.detect("x.wbxml"));
494 assertEquals("application/vnd.wap.wmlc", tika.detect("x.wmlc"));
495 assertEquals("application/vnd.wap.wmlscriptc", tika.detect("x.wmlsc"));
496 assertEquals("application/vnd.webturbo", tika.detect("x.wtb"));
497 assertEquals("application/vnd.wordperfect", tika.detect("x.wpd"));
498 assertEquals("application/vnd.wqd", tika.detect("x.wqd"));
499 assertEquals("application/vnd.wt.stf", tika.detect("x.stf"));
500 assertEquals("application/vnd.xara", tika.detect("x.xar"));
501 assertEquals("application/vnd.xfdl", tika.detect("x.xfdl"));
502 assertEquals("application/vnd.yamaha.hv-dic", tika.detect("x.hvd"));
503 assertEquals("application/vnd.yamaha.hv-script", tika.detect("x.hvs"));
504 assertEquals("application/vnd.yamaha.hv-voice", tika.detect("x.hvp"));
505 assertEquals("application/vnd.yamaha.openscoreformat", tika.detect("x.osf"));
506 assertEquals("application/vnd.yamaha.openscoreformat.osfpvg+xml", tika.detect("x.osfpvg"));
507 assertEquals("application/vnd.yamaha.smaf-audio", tika.detect("x.saf"));
508 assertEquals("application/vnd.yamaha.smaf-phrase", tika.detect("x.spf"));
509 assertEquals("application/vnd.yellowriver-custom-menu", tika.detect("x.cmp"));
510 assertEquals("application/vnd.zul", tika.detect("x.zir"));
511 assertEquals("application/vnd.zul", tika.detect("x.zirz"));
512 assertEquals("application/vnd.zzazz.deck+xml", tika.detect("x.zaz"));
513 assertEquals("application/voicexml+xml", tika.detect("x.vxml"));
514 assertEquals("application/winhlp", tika.detect("x.hlp"));
515 assertEquals("application/wsdl+xml", tika.detect("x.wsdl"));
516 assertEquals("application/wspolicy+xml", tika.detect("x.wspolicy"));
517 assertEquals("application/x-abiword", tika.detect("x.abw"));
518 assertEquals("application/x-ace-compressed", tika.detect("x.ace"));
519 assertEquals("application/x-authorware-bin", tika.detect("x.aab"));
520 assertEquals("application/x-authorware-bin", tika.detect("x.x32"));
521 assertEquals("application/x-authorware-bin", tika.detect("x.u32"));
522 assertEquals("application/x-authorware-bin", tika.detect("x.vox"));
523 assertEquals("application/x-authorware-map", tika.detect("x.aam"));
524 assertEquals("application/x-authorware-seg", tika.detect("x.aas"));
525 assertEquals("application/x-bcpio", tika.detect("x.bcpio"));
526 assertEquals("application/x-bittorrent", tika.detect("x.torrent"));
527 assertEquals("application/x-bzip", tika.detect("x.bz"));
528 assertEquals("application/x-bzip2", tika.detect("x.bz2"));
529 assertEquals("application/x-bzip2", tika.detect("x.boz"));
530 assertEquals("application/x-cdlink", tika.detect("x.vcd"));
531 assertEquals("application/x-chat", tika.detect("x.chat"));
532 assertEquals("application/x-chess-pgn", tika.detect("x.pgn"));
533 assertEquals("application/x-cpio", tika.detect("x.cpio"));
534 assertEquals("application/x-csh", tika.detect("x.csh"));
535 assertEquals("application/x-debian-package", tika.detect("x.deb"));
536 assertEquals("application/x-debian-package", tika.detect("x.udeb"));
537 assertEquals("application/x-director", tika.detect("x.dir"));
538 assertEquals("application/x-director", tika.detect("x.dcr"));
539 assertEquals("application/x-director", tika.detect("x.dxr"));
540 assertEquals("application/x-director", tika.detect("x.cst"));
541 assertEquals("application/x-director", tika.detect("x.cct"));
542 assertEquals("application/x-director", tika.detect("x.cxt"));
543 assertEquals("application/x-director", tika.detect("x.w3d"));
544 assertEquals("application/x-director", tika.detect("x.fgd"));
545 assertEquals("application/x-director", tika.detect("x.swa"));
546 assertEquals("application/x-doom", tika.detect("x.wad"));
547 assertEquals("application/x-dtbncx+xml", tika.detect("x.ncx"));
548 assertEquals("application/x-dtbook+xml", tika.detect("x.dtb"));
549 assertEquals("application/x-dtbresource+xml", tika.detect("x.res"));
550 assertEquals("application/x-dvi", tika.detect("x.dvi"));
551 assertEquals("application/x-font-bdf", tika.detect("x.bdf"));
552 assertEquals("application/x-font-ghostscript", tika.detect("x.gsf"));
553 assertEquals("application/x-font-linux-psf", tika.detect("x.psf"));
554 assertEquals("application/x-font-otf", tika.detect("x.otf"));
555 assertEquals("application/x-font-pcf", tika.detect("x.pcf"));
556 assertEquals("application/x-font-snf", tika.detect("x.snf"));
557 assertEquals("application/x-font-ttf", tika.detect("x.ttf"));
558 assertEquals("application/x-font-ttf", tika.detect("x.ttc"));
559 assertEquals("application/x-font-type1", tika.detect("x.pfa"));
560 assertEquals("application/x-font-type1", tika.detect("x.pfb"));
561 // TODO Get these fixed upstream too
562 //assertEquals("application/x-font-type1", tika.detect("x.pfm"));
563 //assertEquals("application/x-font-type1", tika.detect("x.afm"));
564 assertEquals("application/x-font-printer-metric", tika.detect("x.pfm"));
565 assertEquals("application/x-font-adobe-metric", tika.detect("x.afm"));
566 assertEquals("application/x-futuresplash", tika.detect("x.spl"));
567 assertEquals("application/x-gnumeric", tika.detect("x.gnumeric"));
568 assertEquals("application/x-gtar", tika.detect("x.gtar"));
569 assertEquals("application/x-hdf", tika.detect("x.hdf"));
570 assertEquals("application/x-java-jnlp-file", tika.detect("x.jnlp"));
571 assertEquals("application/x-latex", tika.detect("x.latex"));
572 assertEquals("application/x-mobipocket-ebook", tika.detect("x.prc"));
573 assertEquals("application/x-mobipocket-ebook", tika.detect("x.mobi"));
574 assertEquals("application/x-ms-application", tika.detect("x.application"));
575 assertEquals("application/x-ms-wmd", tika.detect("x.wmd"));
576 assertEquals("application/x-ms-wmz", tika.detect("x.wmz"));
577 assertEquals("application/x-ms-xbap", tika.detect("x.xbap"));
578 assertEquals("application/x-msaccess", tika.detect("x.mdb"));
579 assertEquals("application/x-msbinder", tika.detect("x.obd"));
580 assertEquals("application/x-mscardfile", tika.detect("x.crd"));
581 assertEquals("application/x-msclip", tika.detect("x.clp"));
582 assertEquals("application/x-dosexec", tika.detect("x.exe"));
583 assertEquals("application/x-msdownload", tika.detect("x.dll"));
584 assertEquals("application/x-msdownload", tika.detect("x.com"));
585 assertEquals("application/x-msdownload", tika.detect("x.bat"));
586 assertEquals("application/x-msdownload", tika.detect("x.msi"));
587 assertEquals("application/x-msmediaview", tika.detect("x.mvb"));
588 assertEquals("application/x-msmediaview", tika.detect("x.m13"));
589 assertEquals("application/x-msmediaview", tika.detect("x.m14"));
590 assertEquals("application/x-msmetafile", tika.detect("x.wmf"));
591 assertEquals("application/x-msmoney", tika.detect("x.mny"));
592 assertEquals("application/x-mspublisher", tika.detect("x.pub"));
593 assertEquals("application/x-msschedule", tika.detect("x.scd"));
594 assertEquals("application/x-msterminal", tika.detect("x.trm"));
595 assertEquals("application/x-mswrite", tika.detect("x.wri"));
596 assertEquals("application/x-netcdf", tika.detect("x.nc"));
597 assertEquals("application/x-netcdf", tika.detect("x.cdf"));
598 assertEquals("application/x-pkcs12", tika.detect("x.p12"));
599 assertEquals("application/x-pkcs12", tika.detect("x.pfx"));
600 assertEquals("application/x-pkcs7-certificates", tika.detect("x.p7b"));
601 assertEquals("application/x-pkcs7-certificates", tika.detect("x.spc"));
602 assertEquals("application/x-pkcs7-certreqresp", tika.detect("x.p7r"));
603 assertEquals("application/x-rar-compressed", tika.detect("x.rar"));
604 assertEquals("application/x-sh", tika.detect("x.sh"));
605 assertEquals("application/x-shar", tika.detect("x.shar"));
606 assertEquals("application/x-shockwave-flash", tika.detect("x.swf"));
607 assertEquals("application/x-silverlight-app", tika.detect("x.xap"));
608 assertEquals("application/x-stuffit", tika.detect("x.sit"));
609 assertEquals("application/x-stuffitx", tika.detect("x.sitx"));
610 assertEquals("application/x-sv4cpio", tika.detect("x.sv4cpio"));
611 assertEquals("application/x-sv4crc", tika.detect("x.sv4crc"));
612 assertEquals("application/x-tar", tika.detect("x.tar"));
613 assertEquals("text/x-tcl", tika.detect("x.tcl"));
614 assertEquals("application/x-tex", tika.detect("x.tex"));
615 assertEquals("application/x-tex-tfm", tika.detect("x.tfm"));
616 assertEquals("application/x-texinfo", tika.detect("x.texinfo"));
617 assertEquals("application/x-texinfo", tika.detect("x.texi"));
618 assertEquals("application/x-ustar", tika.detect("x.ustar"));
619 assertEquals("application/x-wais-source", tika.detect("x.src"));
620 assertEquals("application/x-x509-ca-cert", tika.detect("x.der"));
621 assertEquals("application/x-x509-ca-cert", tika.detect("x.crt"));
622 assertEquals("application/x-xfig", tika.detect("x.fig"));
623 assertEquals("application/x-xpinstall", tika.detect("x.xpi"));
624 assertEquals("application/xenc+xml", tika.detect("x.xenc"));
625 assertEquals("application/xhtml+xml", tika.detect("x.xhtml"));
626 assertEquals("application/xhtml+xml", tika.detect("x.xht"));
627 assertEquals("application/xml", tika.detect("x.xml"));
628 assertEquals("application/xml", tika.detect("x.xsl"));
629 assertEquals("application/xml-dtd", tika.detect("x.dtd"));
630 assertEquals("application/xop+xml", tika.detect("x.xop"));
631 assertEquals("application/xslt+xml", tika.detect("x.xslt"));
632 assertEquals("application/xspf+xml", tika.detect("x.xspf"));
633 assertEquals("application/xv+xml", tika.detect("x.mxml"));
634 assertEquals("application/xv+xml", tika.detect("x.xhvml"));
635 assertEquals("application/xv+xml", tika.detect("x.xvml"));
636 assertEquals("application/xv+xml", tika.detect("x.xvm"));
637 assertEquals("application/zip", tika.detect("x.zip"));
638 assertEquals("audio/adpcm", tika.detect("x.adp"));
639 assertEquals("audio/basic", tika.detect("x.au"));
640 assertEquals("audio/basic", tika.detect("x.snd"));
641 assertEquals("audio/midi", tika.detect("x.mid"));
642 assertEquals("audio/midi", tika.detect("x.midi"));
643 assertEquals("audio/midi", tika.detect("x.kar"));
644 assertEquals("audio/midi", tika.detect("x.rmi"));
645 assertEquals("audio/mp4", tika.detect("x.mp4a"));
646 assertEquals("audio/mpeg", tika.detect("x.mpga"));
647 assertEquals("audio/mpeg", tika.detect("x.mp2"));
648 assertEquals("audio/mpeg", tika.detect("x.mp2a"));
649 assertEquals("audio/mpeg", tika.detect("x.mp3"));
650 assertEquals("audio/mpeg", tika.detect("x.m2a"));
651 assertEquals("audio/mpeg", tika.detect("x.m3a"));
652 assertEquals("audio/ogg", tika.detect("x.oga"));
653 assertEquals("audio/ogg", tika.detect("x.ogg"));
654 assertEquals("audio/ogg", tika.detect("x.spx"));
655 assertEquals("audio/vnd.digital-winds", tika.detect("x.eol"));
656 assertEquals("audio/vnd.dts", tika.detect("x.dts"));
657 assertEquals("audio/vnd.dts.hd", tika.detect("x.dtshd"));
658 assertEquals("audio/vnd.lucent.voice", tika.detect("x.lvp"));
659 assertEquals("audio/vnd.ms-playready.media.pya", tika.detect("x.pya"));
660 assertEquals("audio/vnd.nuera.ecelp4800", tika.detect("x.ecelp4800"));
661 assertEquals("audio/vnd.nuera.ecelp7470", tika.detect("x.ecelp7470"));
662 assertEquals("audio/vnd.nuera.ecelp9600", tika.detect("x.ecelp9600"));
663 assertEquals("audio/x-aac", tika.detect("x.aac"));
664 assertEquals("audio/x-aiff", tika.detect("x.aif"));
665 assertEquals("audio/x-aiff", tika.detect("x.aiff"));
666 assertEquals("audio/x-aiff", tika.detect("x.aifc"));
667 assertEquals("audio/x-mpegurl", tika.detect("x.m3u"));
668 assertEquals("audio/x-ms-wax", tika.detect("x.wax"));
669 assertEquals("audio/x-ms-wma", tika.detect("x.wma"));
670 assertEquals("audio/x-pn-realaudio", tika.detect("x.ram"));
671 assertEquals("audio/x-pn-realaudio", tika.detect("x.ra"));
672 assertEquals("audio/x-pn-realaudio-plugin", tika.detect("x.rmp"));
673 assertEquals("audio/x-wav", tika.detect("x.wav"));
674 assertEquals("chemical/x-cdx", tika.detect("x.cdx"));
675 assertEquals("chemical/x-cif", tika.detect("x.cif"));
676 assertEquals("chemical/x-cmdf", tika.detect("x.cmdf"));
677 assertEquals("chemical/x-cml", tika.detect("x.cml"));
678 assertEquals("chemical/x-csml", tika.detect("x.csml"));
679 assertEquals("chemical/x-xyz", tika.detect("x.xyz"));
680 assertEquals("image/x-ms-bmp", tika.detect("x.bmp"));
681 assertEquals("image/cgm", tika.detect("x.cgm"));
682 assertEquals("image/g3fax", tika.detect("x.g3"));
683 assertEquals("image/gif", tika.detect("x.gif"));
684 assertEquals("image/ief", tika.detect("x.ief"));
685 assertEquals("image/jpeg", tika.detect("x.jpeg"));
686 assertEquals("image/jpeg", tika.detect("x.jpg"));
687 assertEquals("image/jpeg", tika.detect("x.jpe"));
688 assertEquals("image/jpm", tika.detect("x.jpm"));
689 assertEquals("image/jpm", tika.detect("x.jpgm"));
690 assertEquals("image/png", tika.detect("x.png"));
691 assertEquals("image/prs.btif", tika.detect("x.btif"));
692 assertEquals("image/svg+xml", tika.detect("x.svg"));
693 assertEquals("image/svg+xml", tika.detect("x.svgz"));
694 assertEquals("image/tiff", tika.detect("x.tiff"));
695 assertEquals("image/tiff", tika.detect("x.tif"));
696 assertEquals("image/vnd.adobe.photoshop", tika.detect("x.psd"));
697 assertEquals("image/vnd.djvu", tika.detect("x.djvu"));
698 assertEquals("image/vnd.djvu", tika.detect("x.djv"));
699 assertEquals("image/vnd.dwg", tika.detect("x.dwg"));
700 assertEquals("image/vnd.dxf", tika.detect("x.dxf"));
701 assertEquals("image/vnd.fastbidsheet", tika.detect("x.fbs"));
702 assertEquals("image/vnd.fpx", tika.detect("x.fpx"));
703 assertEquals("image/vnd.fst", tika.detect("x.fst"));
704 assertEquals("image/vnd.fujixerox.edmics-mmr", tika.detect("x.mmr"));
705 assertEquals("image/vnd.fujixerox.edmics-rlc", tika.detect("x.rlc"));
706 assertEquals("image/vnd.ms-modi", tika.detect("x.mdi"));
707 assertEquals("image/vnd.net-fpx", tika.detect("x.npx"));
708 assertEquals("image/vnd.wap.wbmp", tika.detect("x.wbmp"));
709 assertEquals("image/vnd.xiff", tika.detect("x.xif"));
710 assertEquals("image/x-cmu-raster", tika.detect("x.ras"));
711 assertEquals("image/x-cmx", tika.detect("x.cmx"));
712 assertEquals("image/x-freehand", tika.detect("x.fh"));
713 assertEquals("image/x-freehand", tika.detect("x.fhc"));
714 assertEquals("image/x-freehand", tika.detect("x.fh4"));
715 assertEquals("image/x-freehand", tika.detect("x.fh5"));
716 assertEquals("image/x-freehand", tika.detect("x.fh7"));
717 // Differ from httpd - An official mimetype has subsequently been issued
718 // favicon.ico +friends should now be image/vnd.microsoft.icon
719 //assertEquals("image/x-icon", tika.detect("x.ico"));
720 assertEquals("image/x-pcx", tika.detect("x.pcx"));
721 assertEquals("image/x-pict", tika.detect("x.pic"));
722 assertEquals("image/x-pict", tika.detect("x.pct"));
723 assertEquals("image/x-portable-anymap", tika.detect("x.pnm"));
724 assertEquals("image/x-portable-bitmap", tika.detect("x.pbm"));
725 assertEquals("image/x-portable-graymap", tika.detect("x.pgm"));
726 assertEquals("image/x-portable-pixmap", tika.detect("x.ppm"));
727 assertEquals("image/x-rgb", tika.detect("x.rgb"));
728 assertEquals("image/x-xbitmap", tika.detect("x.xbm"));
729 assertEquals("image/x-xpixmap", tika.detect("x.xpm"));
730 assertEquals("image/x-xwindowdump", tika.detect("x.xwd"));
731 assertEquals("message/rfc822", tika.detect("x.eml"));
732 assertEquals("message/rfc822", tika.detect("x.mime"));
733 assertEquals("model/iges", tika.detect("x.igs"));
734 assertEquals("model/iges", tika.detect("x.iges"));
735 assertEquals("model/mesh", tika.detect("x.msh"));
736 assertEquals("model/mesh", tika.detect("x.mesh"));
737 assertEquals("model/mesh", tika.detect("x.silo"));
738 assertEquals("model/vnd.dwf", tika.detect("x.dwf"));
739 assertEquals("model/vnd.gdl", tika.detect("x.gdl"));
740 assertEquals("model/vnd.gtw", tika.detect("x.gtw"));
741 assertEquals("model/vnd.mts", tika.detect("x.mts"));
742 assertEquals("model/vnd.vtu", tika.detect("x.vtu"));
743 assertEquals("model/vrml", tika.detect("x.wrl"));
744 assertEquals("model/vrml", tika.detect("x.vrml"));
745 assertEquals("text/calendar", tika.detect("x.ics"));
746 assertEquals("text/calendar", tika.detect("x.ifb"));
747 assertEquals("text/css", tika.detect("x.css"));
748 assertEquals("text/csv", tika.detect("x.csv"));
749 assertEquals("text/html", tika.detect("x.html"));
750 assertEquals("text/html", tika.detect("x.htm"));
751 assertEquals("text/plain", tika.detect("x.txt"));
752 assertEquals("text/plain", tika.detect("x.text"));
753 assertEquals("text/plain", tika.detect("x.conf"));
754 assertEquals("text/plain", tika.detect("x.def"));
755 assertEquals("text/plain", tika.detect("x.list"));
756 assertEquals("text/x-log", tika.detect("x.log"));
757 assertEquals("text/plain", tika.detect("x.in"));
758 assertEquals("text/prs.lines.tag", tika.detect("x.dsc"));
759 assertEquals("text/richtext", tika.detect("x.rtx"));
760 assertEquals("text/sgml", tika.detect("x.sgml"));
761 assertEquals("text/sgml", tika.detect("x.sgm"));
762 assertEquals("text/tab-separated-values", tika.detect("x.tsv"));
763 assertEquals("text/troff", tika.detect("x.t"));
764 assertEquals("text/troff", tika.detect("x.tr"));
765 assertEquals("text/troff", tika.detect("x.roff"));
766 assertEquals("text/troff", tika.detect("x.man"));
767 assertEquals("text/troff", tika.detect("x.me"));
768 assertEquals("text/troff", tika.detect("x.ms"));
769 assertEquals("text/uri-list", tika.detect("x.uri"));
770 assertEquals("text/uri-list", tika.detect("x.uris"));
771 assertEquals("text/uri-list", tika.detect("x.urls"));
772 assertEquals("text/vnd.curl", tika.detect("x.curl"));
773 assertEquals("text/vnd.curl.dcurl", tika.detect("x.dcurl"));
774 assertEquals("text/vnd.curl.scurl", tika.detect("x.scurl"));
775 assertEquals("text/vnd.curl.mcurl", tika.detect("x.mcurl"));
776 assertEquals("text/vnd.fly", tika.detect("x.fly"));
777 assertEquals("text/vnd.fmi.flexstor", tika.detect("x.flx"));
778 assertEquals("text/vnd.graphviz", tika.detect("x.gv"));
779 assertEquals("text/vnd.in3d.3dml", tika.detect("x.3dml"));
780 assertEquals("text/vnd.in3d.spot", tika.detect("x.spot"));
781 assertEquals("text/vnd.sun.j2me.app-descriptor", tika.detect("x.jad"));
782 assertEquals("text/vnd.wap.wml", tika.detect("x.wml"));
783 assertEquals("text/vnd.wap.wmlscript", tika.detect("x.wmls"));
784 assertEquals("text/x-assembly", tika.detect("x.s"));
785 assertEquals("text/x-assembly", tika.detect("x.asm"));
786 assertEquals("text/x-csrc", tika.detect("x.c"));
787 assertEquals("text/x-c++src", tika.detect("x.cc"));
788 assertEquals("text/x-c++src", tika.detect("x.cxx"));
789 assertEquals("text/x-c++src", tika.detect("x.cpp"));
790 assertEquals("text/x-chdr", tika.detect("x.h"));
791 assertEquals("text/x-c++hdr", tika.detect("x.hh"));
792 assertEquals("text/x-fortran", tika.detect("x.f"));
793 assertEquals("text/x-fortran", tika.detect("x.for"));
794 assertEquals("text/x-fortran", tika.detect("x.f77"));
795 assertEquals("text/x-fortran", tika.detect("x.f90"));
796 assertEquals("text/x-pascal", tika.detect("x.p"));
797 assertEquals("text/x-pascal", tika.detect("x.pas"));
798 assertEquals("text/x-java-source", tika.detect("x.java"));
799 assertEquals("text/x-setext", tika.detect("x.etx"));
800 assertEquals("text/x-uuencode", tika.detect("x.uu"));
801 assertEquals("text/x-vcalendar", tika.detect("x.vcs"));
802 assertEquals("text/x-vcard", tika.detect("x.vcf"));
803 assertEquals("video/3gpp", tika.detect("x.3gp"));
804 assertEquals("video/3gpp2", tika.detect("x.3g2"));
805 assertEquals("video/h261", tika.detect("x.h261"));
806 assertEquals("video/h263", tika.detect("x.h263"));
807 assertEquals("video/h264", tika.detect("x.h264"));
808 assertEquals("video/jpeg", tika.detect("x.jpgv"));
809 assertEquals("video/mj2", tika.detect("x.mj2"));
810 assertEquals("video/mj2", tika.detect("x.mjp2"));
811 assertEquals("video/mp4", tika.detect("x.mp4"));
812 assertEquals("video/mp4", tika.detect("x.mp4v"));
813 assertEquals("video/mp4", tika.detect("x.mpg4"));
814 assertEquals("video/mpeg", tika.detect("x.mpeg"));
815 assertEquals("video/mpeg", tika.detect("x.mpg"));
816 assertEquals("video/mpeg", tika.detect("x.mpe"));
817 assertEquals("video/mpeg", tika.detect("x.m1v"));
818 assertEquals("video/mpeg", tika.detect("x.m2v"));
819 assertEquals("video/ogg", tika.detect("x.ogv"));
820 assertEquals("video/quicktime", tika.detect("x.qt"));
821 assertEquals("video/quicktime", tika.detect("x.mov"));
822 assertEquals("video/vnd.fvt", tika.detect("x.fvt"));
823 assertEquals("video/vnd.mpegurl", tika.detect("x.mxu"));
824 assertEquals("video/vnd.mpegurl", tika.detect("x.m4u"));
825 assertEquals("video/vnd.ms-playready.media.pyv", tika.detect("x.pyv"));
826 assertEquals("video/vnd.vivo", tika.detect("x.viv"));
827 assertEquals("video/x-f4v", tika.detect("x.f4v"));
828 assertEquals("video/x-fli", tika.detect("x.fli"));
829 assertEquals("video/x-flv", tika.detect("x.flv"));
830 assertEquals("video/x-m4v", tika.detect("x.m4v"));
831 assertEquals("video/x-ms-asf", tika.detect("x.asf"));
832 assertEquals("video/x-ms-asf", tika.detect("x.asx"));
833 assertEquals("video/x-ms-wm", tika.detect("x.wm"));
834 assertEquals("video/x-ms-wmv", tika.detect("x.wmv"));
835 assertEquals("video/x-ms-wmx", tika.detect("x.wmx"));
836 assertEquals("video/x-ms-wvx", tika.detect("x.wvx"));
837 assertEquals("video/x-msvideo", tika.detect("x.avi"));
838 assertEquals("video/x-sgi-movie", tika.detect("x.movie"));
839 assertEquals("x-conference/x-cooltalk", tika.detect("x.ice"));
840 }
841
842 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika;
17
18 import static org.junit.Assert.assertNotNull;
19 import static org.junit.Assert.assertTrue;
20 import org.junit.Test;
21
22
23 public class TikaIT {
24
25 @Test
26 public void testToString() {
27 String version = new Tika().toString();
28 assertNotNull(version);
29 assertTrue(version.matches(
30 "Apache Tika \\d+\\.\\d+(\\.\\d+)?(-SNAPSHOT)?"));
31 }
32
33 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika;
17
18 import java.io.ByteArrayInputStream;
19 import java.io.File;
20 import java.io.FileInputStream;
21 import java.io.InputStream;
22
23 import org.apache.tika.io.IOUtils;
24
25 public class TypeDetectionBenchmark {
26
27 private static final Tika tika = new Tika();
28
29 public static void main(String[] args) throws Exception {
30 long start = System.currentTimeMillis();
31 if (args.length > 0) {
32 for (String arg : args) {
33 benchmark(new File(arg));
34 }
35 } else {
36 benchmark(new File(
37 "../tika-parsers/src/test/resources/test-documents"));
38 }
39 System.out.println(
40 "Total benchmark time: "
41 + (System.currentTimeMillis() - start) + "ms");
42 }
43
44 private static void benchmark(File file) throws Exception {
45 if (file.isHidden()) {
46 // ignore
47 } else if (file.isFile()) {
48 InputStream input = new FileInputStream(file);
49 try {
50 byte[] content = IOUtils.toByteArray(input);
51 String type =
52 tika.detect(new ByteArrayInputStream(content));
53 long start = System.currentTimeMillis();
54 for (int i = 0; i < 1000; i++) {
55 tika.detect(new ByteArrayInputStream(content));
56 }
57 System.out.printf(
58 "%6dns per Tika.detect(%s) = %s%n",
59 System.currentTimeMillis() - start, file, type);
60 } finally {
61 input.close();
62 }
63 } else if (file.isDirectory()) {
64 for (File child : file.listFiles()) {
65 benchmark(child);
66 }
67 }
68 }
69
70 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.config;
17
18 import java.net.URL;
19 import java.util.List;
20 import java.util.Map;
21
22 import org.apache.tika.ResourceLoggingClassLoader;
23 import org.apache.tika.exception.TikaException;
24 import org.apache.tika.parser.AutoDetectParser;
25 import org.apache.tika.parser.DefaultParser;
26 import org.junit.Test;
27
28 import static org.junit.Assert.assertEquals;
29 import static org.junit.Assert.assertNotNull;
30 import static org.junit.Assert.assertTrue;
31 import static org.junit.Assert.fail;
32
33 public class TikaConfigTest {
34
35 /**
36 * Make sure that a configuration file can't reference the
37 * {@link AutoDetectParser} class a &lt;parser&gt; configuration element.
38 *
39 * @see <a href="https://issues.apache.org/jira/browse/TIKA-866">TIKA-866</a>
40 */
41 @Test
42 public void testInvalidParser() throws Exception {
43 URL url = TikaConfigTest.class.getResource("TIKA-866-invalid.xml");
44 System.setProperty("tika.config", url.toExternalForm());
45 try {
46 new TikaConfig();
47 fail("AutoDetectParser allowed in a <parser> element");
48 } catch (TikaException expected) {
49 } finally {
50 System.clearProperty("tika.config");
51 }
52 }
53
54 /**
55 * Make sure that a configuration file can reference also a composite
56 * parser class like {@link DefaultParser} in a &lt;parser&gt;
57 * configuration element.
58 *
59 * @see <a href="https://issues.apache.org/jira/browse/TIKA-866">TIKA-866</a>
60 */
61 public void testCompositeParser() throws Exception {
62 URL url = TikaConfigTest.class.getResource("TIKA-866-composite.xml");
63 System.setProperty("tika.config", url.toExternalForm());
64 try {
65 new TikaConfig();
66 } catch (TikaException e) {
67 fail("Unexpected TikaException: " + e);
68 } finally {
69 System.clearProperty("tika.config");
70 }
71 }
72
73 /**
74 * Make sure that a valid configuration file without mimetypes or
75 * detector entries can be loaded without problems.
76 *
77 * @see <a href="https://issues.apache.org/jira/browse/TIKA-866">TIKA-866</a>
78 */
79 public void testValidParser() throws Exception {
80 URL url = TikaConfigTest.class.getResource("TIKA-866-valid.xml");
81 System.setProperty("tika.config", url.toExternalForm());
82 try {
83 new TikaConfig();
84 } catch (TikaException e) {
85 fail("Unexpected TikaException: " + e);
86 } finally {
87 System.clearProperty("tika.config");
88 }
89 }
90
91 /**
92 * TIKA-1145 If the TikaConfig has a ClassLoader set on it,
93 * that should be used when loading the mimetypes and when
94 * discovering services
95 */
96 public void testClassLoaderUsedEverywhere() throws Exception {
97 ResourceLoggingClassLoader customLoader =
98 new ResourceLoggingClassLoader(getClass().getClassLoader());
99 TikaConfig config;
100
101 // Without a classloader set, normal one will be used
102 config = new TikaConfig();
103 config.getMediaTypeRegistry();
104 config.getParser();
105 assertEquals(0, customLoader.getLoadedResources().size());
106
107 // With a classloader set, resources will come through it
108 config = new TikaConfig(customLoader);
109 config.getMediaTypeRegistry();
110 config.getParser();
111
112 Map<String,List<URL>> resources = customLoader.getLoadedResources();
113 int resourcesCount = resources.size();
114 assertTrue(
115 "Not enough things used the classloader, found only " + resourcesCount,
116 resourcesCount > 3
117 );
118
119 // Ensure everything that should do, did use it
120 // - Parsers
121 assertNotNull(resources.get("META-INF/services/org.apache.tika.parser.Parser"));
122 // - Detectors
123 assertNotNull(resources.get("META-INF/services/org.apache.tika.detect.Detector"));
124 // - Built-In Mimetypes
125 assertNotNull(resources.get("org/apache/tika/mime/tika-mimetypes.xml"));
126 // - Custom Mimetypes
127 assertNotNull(resources.get("org/apache/tika/mime/custom-mimetypes.xml"));
128 }
129 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.detect;
17
18 import java.io.ByteArrayInputStream;
19 import java.io.IOException;
20 import java.io.InputStream;
21
22 import org.apache.tika.metadata.Metadata;
23 import org.apache.tika.mime.MediaType;
24 import org.junit.Test;
25
26 import static org.junit.Assert.assertEquals;
27 import static org.junit.Assert.fail;
28
29 /**
30 * Test cases for the {@link MagicDetector} class.
31 */
32 public class MagicDetectorTest {
33
34 @Test
35 public void testDetectNull() throws Exception {
36 MediaType html = new MediaType("text", "html");
37 Detector detector = new MagicDetector(html, "<html".getBytes("ASCII"));
38 assertEquals(
39 MediaType.OCTET_STREAM,
40 detector.detect(null, new Metadata()));
41 }
42
43 @Test
44 public void testDetectSimple() throws Exception {
45 MediaType html = new MediaType("text", "html");
46 Detector detector = new MagicDetector(html, "<html".getBytes("ASCII"));
47
48 assertDetect(detector, html, "<html");
49 assertDetect(detector, html, "<html><head/><body/></html>");
50 assertDetect(detector, MediaType.OCTET_STREAM, "<HTML");
51 assertDetect(detector, MediaType.OCTET_STREAM, "<?xml?><html");
52 assertDetect(detector, MediaType.OCTET_STREAM, " <html");
53 assertDetect(detector, MediaType.OCTET_STREAM, "");
54 }
55
56 @Test
57 public void testDetectOffsetRange() throws Exception {
58 MediaType html = new MediaType("text", "html");
59 Detector detector = new MagicDetector(
60 html, "<html".getBytes("ASCII"), null, 0, 64);
61
62 assertDetect(detector, html, "<html");
63 assertDetect(detector, html, "<html><head/><body/></html>");
64 assertDetect(detector, html, "<?xml?><html/>");
65 assertDetect(detector, html, "\n <html");
66 assertDetect(detector, html, "\u0000<html");
67 assertDetect(detector, MediaType.OCTET_STREAM, "<htm");
68 assertDetect(detector, MediaType.OCTET_STREAM, " html");
69 assertDetect(detector, MediaType.OCTET_STREAM, "<HTML");
70
71 assertDetect(detector, html,
72 "0........1.........2.........3.........4.........5.........6"
73 + "1234<html");
74 assertDetect(detector, MediaType.OCTET_STREAM,
75 "0........1.........2.........3.........4.........5.........6"
76 + "12345<html");
77
78 assertDetect(detector, MediaType.OCTET_STREAM, "");
79 }
80
81 @Test
82 public void testDetectMask() throws Exception {
83 MediaType html = new MediaType("text", "html");
84 byte up = (byte) 0xdf;
85 Detector detector = new MagicDetector(
86 html,
87 new byte[] { '<', 'H', 'T', 'M', 'L' },
88 new byte[] { (byte) 0xff, up, up, up, up },
89 0, 64);
90
91 assertDetect(detector, html, "<html");
92 assertDetect(detector, html, "<HTML><head/><body/></html>");
93 assertDetect(detector, html, "<?xml?><HtMl/>");
94 assertDetect(detector, html, "\n <html");
95 assertDetect(detector, html, "\u0000<HTML");
96 assertDetect(detector, MediaType.OCTET_STREAM, "<htm");
97 assertDetect(detector, MediaType.OCTET_STREAM, " html");
98
99 assertDetect(detector, html,
100 "0 1 2 3 4 5 6"
101 + "1234<html");
102 assertDetect(detector, MediaType.OCTET_STREAM,
103 "0 1 2 3 4 5 6"
104 + "12345<html");
105
106 assertDetect(detector, MediaType.OCTET_STREAM, "");
107 }
108
109 @Test
110 public void testDetectRegExPDF() throws Exception {
111 MediaType pdf = new MediaType("application", "pdf");
112 Detector detector = new MagicDetector(
113 pdf, "(?s)\\A.{0,144}%PDF-".getBytes("ASCII"), null, true, 0, 0);
114
115 assertDetect(detector, pdf, "%PDF-1.0");
116 assertDetect(
117 detector, pdf,
118 "0 10 20 30 40 50 6"
119 + "0 70 80 90 100 110 1"
120 + "20 130 140"
121 + "34%PDF-1.0");
122 assertDetect(
123 detector, MediaType.OCTET_STREAM,
124 "0 10 20 30 40 50 6"
125 + "0 70 80 90 100 110 1"
126 + "20 130 140"
127 + "345%PDF-1.0");
128 assertDetect(detector, MediaType.OCTET_STREAM, "");
129 }
130
131 @Test
132 public void testDetectRegExGreedy() throws Exception {
133 String pattern =
134 "(?s)\\x3chtml xmlns=\"http://www\\.w3\\.org/1999/xhtml"
135 + "\".*\\x3ctitle\\x3e.*\\x3c/title\\x3e";
136 MediaType xhtml = new MediaType("application", "xhtml+xml");
137 Detector detector = new MagicDetector(xhtml,
138 pattern.getBytes("ASCII"), null,
139 true, 0, 8192);
140
141 assertDetect(detector, xhtml,
142 "<html xmlns=\"http://www.w3.org/1999/xhtml\">"
143 + "<head><title>XHTML test document</title></head>");
144 }
145
146 @Test
147 public void testDetectRegExOptions() throws Exception {
148 String pattern =
149 "(?s)\\A.{0,1024}\\x3c\\!(?:DOCTYPE|doctype) (?:HTML|html) "
150 + "(?:PUBLIC|public) \"-//.{1,16}//(?:DTD|dtd) .{0,64}"
151 + "(?:HTML|html) 4\\.01";
152
153 String data =
154 "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\""
155 + "\"http://www.w3.org/TR/html4/strict.dtd\"><HTML>"
156 + "<HEAD><TITLE>HTML document</TITLE></HEAD>"
157 + "<BODY><P>Hello world!</BODY></HTML>";
158
159 String data1 =
160 "<!DOCTYPE html PUBLIC \"-//W3C//dtd html 4.01//EN\""
161 + "\"http://www.w3.org/TR/html4/strict.dtd\"><HTML>"
162 + "<HEAD><TITLE>HTML document</TITLE></HEAD>"
163 + "<BODY><P>Hello world!</BODY></HTML>";
164
165 String data2 =
166 "<!DoCtYpE hTmL pUbLiC \"-//W3C//dTd HtMl 4.01//EN\""
167 + "\"http://www.w3.org/TR/html4/strict.dtd\"><HTML>"
168 + "<HEAD><TITLE>HTML document</TITLE></HEAD>"
169 + "<BODY><P>Hello world!</BODY></HTML>";
170
171 MediaType html = new MediaType("text", "html");
172 Detector detector = new MagicDetector(
173 html, pattern.getBytes("ASCII"), null, true, 0, 0);
174
175 assertDetect(detector, html, data);
176 assertDetect(detector, html, data1);
177 assertDetect(detector, MediaType.OCTET_STREAM, data2);
178 }
179
180 @Test
181 public void testDetectStreamReadProblems() throws Exception {
182 byte[] data = "abcdefghijklmnopqrstuvwxyz0123456789".getBytes("ASCII");
183 MediaType testMT = new MediaType("application", "test");
184 Detector detector = new MagicDetector(testMT, data, null, false, 0, 0);
185 // Deliberately prevent InputStream.read(...) from reading the entire
186 // buffer in one go
187 InputStream stream = new RestrictiveInputStream(data);
188 assertEquals(testMT, detector.detect(stream, new Metadata()));
189 }
190
191 @Test
192 public void testDetectString() throws Exception {
193 String data = "abcdEFGhijklmnoPQRstuvwxyz0123456789";
194 MediaType testMT = new MediaType("application", "test");
195 Detector detector;
196
197 // Check regular String matching
198 detector = MagicDetector.parse(testMT, "string", "0:20", "abcd", null);
199 assertDetect(detector, testMT, data.getBytes("ASCII"));
200 detector = MagicDetector.parse(testMT, "string", "0:20", "cdEFGh", null);
201 assertDetect(detector, testMT, data.getBytes("ASCII"));
202
203 // Check Little Endian and Big Endian utf-16 strings
204 detector = MagicDetector.parse(testMT, "unicodeLE", "0:20", "cdEFGh", null);
205 assertDetect(detector, testMT, data.getBytes("UTF-16LE"));
206 detector = MagicDetector.parse(testMT, "unicodeBE", "0:20", "cdEFGh", null);
207 assertDetect(detector, testMT, data.getBytes("UTF-16BE"));
208
209 // Check case ignoring String matching
210 detector = MagicDetector.parse(testMT, "stringignorecase", "0:20", "BcDeFgHiJKlm", null);
211 assertDetect(detector, testMT, data.getBytes("ASCII"));
212 }
213
214 private void assertDetect(Detector detector, MediaType type, String data) {
215 try {
216 byte[] bytes = data.getBytes("ASCII");
217 assertDetect(detector, type, bytes);
218 } catch (IOException e) {
219 fail("Unexpected exception from MagicDetector");
220 }
221 }
222 private void assertDetect(Detector detector, MediaType type, byte[] bytes) {
223 try {
224 InputStream stream = new ByteArrayInputStream(bytes);
225 assertEquals(type, detector.detect(stream, new Metadata()));
226
227 // Test that the stream has been reset
228 for (int i = 0; i < bytes.length; i++) {
229 assertEquals(bytes[i], (byte) stream.read());
230 }
231 assertEquals(-1, stream.read());
232 } catch (IOException e) {
233 fail("Unexpected exception from MagicDetector");
234 }
235 }
236
237 /**
238 * InputStream class that does not read in all available bytes in
239 * one go.
240 */
241 private class RestrictiveInputStream extends ByteArrayInputStream {
242 public RestrictiveInputStream(byte[] buf) {
243 super(buf);
244 }
245
246 /**
247 * Prevent reading the entire len of bytes if requesting more
248 * than 10 bytes.
249 */
250 public int read(byte[] b, int off, int len) {
251 if (len > 10) {
252 return super.read(b, off, len-10);
253 } else {
254 return super.read(b, off, len);
255 }
256 }
257 }
258
259 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.detect;
17
18 import java.io.IOException;
19 import java.util.HashMap;
20 import java.util.Map;
21 import java.util.regex.Pattern;
22
23 import org.apache.tika.metadata.Metadata;
24 import org.apache.tika.mime.MediaType;
25
26 import static org.junit.Assert.assertEquals;
27 import static org.junit.Assert.fail;
28 import org.junit.Before;
29 import org.junit.Test;
30
31 /**
32 * Test cases for the {@link NameDetector} class.
33 */
34 public class NameDetectorTest {
35
36 private Detector detector;
37
38 @Before
39 public void setUp() {
40 Map<Pattern, MediaType> patterns = new HashMap<Pattern, MediaType>();
41 patterns.put(
42 Pattern.compile(".*\\.txt", Pattern.CASE_INSENSITIVE),
43 MediaType.TEXT_PLAIN);
44 patterns.put(Pattern.compile("README"), MediaType.TEXT_PLAIN);
45 detector = new NameDetector(patterns);
46 }
47
48 @Test
49 public void testDetect() {
50 assertDetect(MediaType.TEXT_PLAIN, "text.txt");
51 assertDetect(MediaType.TEXT_PLAIN, "text.txt "); // trailing space
52 assertDetect(MediaType.TEXT_PLAIN, "text.txt\n"); // trailing newline
53 assertDetect(MediaType.TEXT_PLAIN, "text.txt?a=b"); // URL query
54 assertDetect(MediaType.TEXT_PLAIN, "text.txt#abc"); // URL fragment
55 assertDetect(MediaType.TEXT_PLAIN, "text%2Etxt"); // URL encoded
56 assertDetect(MediaType.TEXT_PLAIN, "text.TXT"); // case insensitive
57 assertDetect(MediaType.OCTET_STREAM, "text.txt.gz");
58
59 assertDetect(MediaType.TEXT_PLAIN, "README");
60 assertDetect(MediaType.TEXT_PLAIN, " README "); // space around
61 assertDetect(MediaType.TEXT_PLAIN, "\tREADME\n"); // other whitespace
62 assertDetect(MediaType.TEXT_PLAIN, "/a/README"); // leading path
63 assertDetect(MediaType.TEXT_PLAIN, "\\b\\README"); // windows path
64 assertDetect(MediaType.OCTET_STREAM, "ReadMe"); // case sensitive
65 assertDetect(MediaType.OCTET_STREAM, "README.NOW");
66
67 // tough one
68 assertDetect(
69 MediaType.TEXT_PLAIN,
70 " See http://www.example.com:1234/README.txt?a=b#c \n");
71 assertDetect(MediaType.TEXT_PLAIN, "See README.txt"); // even this!
72 assertDetect(MediaType.OCTET_STREAM, "See README"); // but not this
73
74 // test also the zero input cases
75 assertDetect(MediaType.OCTET_STREAM, "");
76 assertDetect(MediaType.OCTET_STREAM, null);
77 try {
78 assertEquals(
79 MediaType.OCTET_STREAM,
80 detector.detect(null, new Metadata()));
81 } catch (IOException e) {
82 fail("NameDetector should never throw an IOException");
83 }
84 }
85
86 private void assertDetect(MediaType type, String name){
87 Metadata metadata = new Metadata();
88 metadata.set(Metadata.RESOURCE_NAME_KEY, name);
89 try {
90 assertEquals(type, detector.detect(null, metadata));
91 } catch (IOException e) {
92 fail("NameDetector should never throw an IOException");
93 }
94 }
95
96 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.detect;
17
18 import java.io.ByteArrayInputStream;
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.util.Arrays;
22
23 import org.apache.tika.metadata.Metadata;
24 import org.apache.tika.mime.MediaType;
25 import org.junit.Test;
26
27 import static org.junit.Assert.assertEquals;
28 import static org.junit.Assert.fail;
29
30 /**
31 * Test cases for the {@link TextDetector} class.
32 */
33 public class TextDetectorTest {
34
35 private final Detector detector = new TextDetector();
36
37 @Test
38 public void testDetectNull() throws Exception {
39 assertEquals(
40 MediaType.OCTET_STREAM,
41 detector.detect(null, new Metadata()));
42 }
43
44 /**
45 * Test for type detection of empty documents.
46 *
47 * @see <a href="https://issues.apache.org/jira/browse/TIKA-483">TIKA-483</a>
48 */
49 @Test
50 public void testDetectEmpty() throws Exception {
51 assertNotText(new byte[0]);
52 }
53
54 @Test
55 public void testDetectText() throws Exception {
56 assertText("Hello, World!".getBytes("UTF-8"));
57 assertText(" \t\r\n".getBytes("UTF-8"));
58 assertNotText(new byte[] { -1, -2, -3, 0x09, 0x0A, 0x0C, 0x0D, 0x1B });
59 assertNotText(new byte[] { 0 });
60 assertNotText(new byte[] { 'H', 'e', 'l', 'l', 'o', 0 });
61
62 byte[] data = new byte[512];
63 Arrays.fill(data, (byte) '.');
64 assertText(data);
65 Arrays.fill(data, 100, 110, (byte) 0x1f);
66 assertText(data); // almost text
67 Arrays.fill(data, 100, 111, (byte) 0x1f);
68 assertNotText(data); // no longer almost text, too many control chars
69 Arrays.fill(data, (byte) 0x1f);
70 assertNotText(data);
71
72 data = new byte[513];
73 Arrays.fill(data, (byte) '.');
74 data[0] = 0x1f;
75 assertText(data);
76 Arrays.fill(data, 100, 150, (byte) 0x83);
77 assertText(data); // almost text
78 Arrays.fill(data, 100, 200, (byte) 0x83);
79 assertNotText(data); // no longer almost text, too many non-ASCII
80 Arrays.fill(data, (byte) 0x1f);
81 assertNotText(data);
82 }
83
84 private void assertText(byte[] data) {
85 try {
86 InputStream stream = new ByteArrayInputStream(data);
87 assertEquals(
88 MediaType.TEXT_PLAIN,
89 detector.detect(stream, new Metadata()));
90
91 // Test that the stream has been reset
92 for (int i = 0; i < data.length; i++) {
93 assertEquals(data[i], (byte) stream.read());
94 }
95 assertEquals(-1, stream.read());
96 } catch (IOException e) {
97 fail("Unexpected exception from TextDetector");
98 }
99 }
100
101 private void assertNotText(byte[] data) {
102 try {
103 assertEquals(
104 MediaType.OCTET_STREAM,
105 detector.detect(
106 new ByteArrayInputStream(data), new Metadata()));
107 } catch (IOException e) {
108 fail("Unexpected exception from TextDetector");
109 }
110 }
111
112 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.detect;
17
18 import java.io.IOException;
19 import java.util.Map;
20 import java.util.TreeMap;
21
22 import org.apache.tika.metadata.Metadata;
23 import org.apache.tika.mime.MediaType;
24 import org.junit.Test;
25
26 import static org.junit.Assert.assertEquals;
27 import static org.junit.Assert.fail;
28
29 /**
30 * Test cases for the {@link TypeDetector} class.
31 */
32 public class TypeDetectorTest {
33
34 private Detector detector = new TypeDetector();
35
36 private static final Map<String, String> params = new
37 TreeMap<String, String>();
38 static{
39 params.put("a", "b");
40 }
41
42 private static final MediaType TEXT_PLAIN_A_EQ_B =
43 new MediaType("text", "plain", params);
44
45 @Test
46 public void testDetect() {
47 assertDetect(MediaType.TEXT_PLAIN, "text/plain");
48 assertDetect(MediaType.TEXT_PLAIN, "TEXT/PLAIN");
49 assertDetect(MediaType.TEXT_PLAIN, " text/\tplain\n");
50 assertDetect(TEXT_PLAIN_A_EQ_B, "text/plain; a=b");
51 assertDetect(TEXT_PLAIN_A_EQ_B, "\ttext/plain; a=b\n");
52
53 assertDetect(MediaType.OCTET_STREAM, "text\\plain");
54
55 // test also the zero input cases
56 assertDetect(MediaType.OCTET_STREAM, "");
57 assertDetect(MediaType.OCTET_STREAM, null);
58 try {
59 assertEquals(
60 MediaType.OCTET_STREAM,
61 detector.detect(null, new Metadata()));
62 } catch (IOException e) {
63 fail("TypeDetector should never throw an IOException");
64 }
65 }
66
67 private void assertDetect(MediaType type, String name){
68 Metadata metadata = new Metadata();
69 metadata.set(Metadata.CONTENT_TYPE, name);
70 try {
71 assertEquals(type, detector.detect(null, metadata));
72 } catch (IOException e) {
73 fail("TypeDetector should never throw an IOException");
74 }
75 }
76
77 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.fork;
17
18 import java.io.ByteArrayInputStream;
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.io.PipedInputStream;
22 import java.io.PipedOutputStream;
23 import java.util.concurrent.Semaphore;
24
25 import org.apache.tika.metadata.Metadata;
26 import org.apache.tika.parser.ParseContext;
27 import org.apache.tika.sax.BodyContentHandler;
28 import org.junit.Test;
29 import org.xml.sax.ContentHandler;
30 import org.xml.sax.helpers.DefaultHandler;
31
32 import static org.junit.Assert.assertEquals;
33
/**
 * Tests for {@link ForkParser}, driven by the {@link ForkTestParser}
 * helper parser that reads one byte from its input and then emits the
 * text "Hello, World!".
 */
public class ForkParserTest {

    /**
     * Parses an empty stream once and checks both the extracted text and
     * the content type stored in the metadata.
     */
    @Test
    public void testHelloWorld() throws Exception {
        ForkParser parser = new ForkParser(
                ForkParserTest.class.getClassLoader(),
                new ForkTestParser());
        try {
            Metadata metadata = new Metadata();
            ContentHandler output = new BodyContentHandler();
            InputStream stream = new ByteArrayInputStream(new byte[0]);
            ParseContext context = new ParseContext();
            parser.parse(stream, output, metadata, context);
            assertEquals("Hello, World!", output.toString().trim());
            assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
        } finally {
            parser.close();
        }
    }

    /**
     * Runs ten parse calls one after another on the same parser instance
     * and checks the output of each.
     */
    @Test
    public void testSerialParsing() throws Exception {
        ForkParser parser = new ForkParser(
                ForkParserTest.class.getClassLoader(),
                new ForkTestParser());
        try {
            ParseContext context = new ParseContext();
            for (int i = 0; i < 10; i++) {
                ContentHandler output = new BodyContentHandler();
                InputStream stream = new ByteArrayInputStream(new byte[0]);
                parser.parse(stream, output, new Metadata(), context);
                assertEquals("Hello, World!", output.toString().trim());
            }
        } finally {
            parser.close();
        }
    }

    /**
     * Starts ten threads that all parse through the same parser instance
     * and checks the output collected by each thread.
     */
    @Test
    public void testParallelParsing() throws Exception {
        final ForkParser parser = new ForkParser(
                ForkParserTest.class.getClassLoader(),
                new ForkTestParser());
        try {
            final ParseContext context = new ParseContext();

            Thread[] threads = new Thread[10];
            ContentHandler[] output = new ContentHandler[threads.length];
            for (int i = 0; i < threads.length; i++) {
                final ContentHandler o = new BodyContentHandler();
                output[i] = o;
                threads[i] = new Thread() {
                    public void run() {
                        try {
                            InputStream stream =
                                new ByteArrayInputStream(new byte[0]);
                            parser.parse(stream, o, new Metadata(), context);
                        } catch (Exception e) {
                            // Only logged here; a failed parse leaves the
                            // handler without the expected text, which the
                            // assertion after join() then reports.
                            e.printStackTrace();
                        }
                    }
                };
                threads[i].start();
            }

            // Wait for each worker and verify what it extracted
            for (int i = 0; i < threads.length; i++) {
                threads[i].join();
                assertEquals("Hello, World!", output[i].toString().trim());
            }
        } finally {
            parser.close();
        }
    }

    /**
     * Keeps getPoolSize() parse calls busy on piped inputs that have no
     * data yet, then starts one more parse call and checks that it
     * produces no output until the pipes are closed.
     * NOTE(review): presumably the extra call is held back because the
     * parser's pool is exhausted -- confirm against ForkParser.
     */
    @Test
    public void testPoolSizeReached() throws Exception {
        final ForkParser parser = new ForkParser(
                ForkParserTest.class.getClassLoader(),
                new ForkTestParser());
        try {
            // Starts with no permits; used to signal progress to this thread
            final Semaphore barrier = new Semaphore(0);

            Thread[] threads = new Thread[parser.getPoolSize()];
            PipedOutputStream[] pipes = new PipedOutputStream[threads.length];
            final ParseContext context = new ParseContext();
            for (int i = 0; i < threads.length; i++) {
                final PipedInputStream input = new PipedInputStream() {
                    @Override
                    public synchronized int read() throws IOException {
                        // Signal that a read was attempted, then wait in
                        // super.read() until the writing end is closed
                        barrier.release();
                        return super.read();
                    }
                };
                pipes[i] = new PipedOutputStream(input);
                threads[i] = new Thread() {
                    public void run() {
                        try {
                            ContentHandler o = new DefaultHandler();
                            parser.parse(input, o, new Metadata(), context);
                        } catch (Exception e) {
                            e.printStackTrace();
                        }
                    }
                };
                threads[i].start();
            }

            // Wait until all the background parsers have been started
            barrier.acquire(parser.getPoolSize());

            final ContentHandler o = new BodyContentHandler();
            Thread blocked = new Thread() {
                public void run() {
                    try {
                        // Tell the main thread that this thread is running
                        barrier.release();
                        InputStream stream =
                            new ByteArrayInputStream(new byte[0]);
                        parser.parse(stream, o, new Metadata(), context);
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }
            };
            blocked.start();

            // Wait until the last thread is started, and then some to
            // make sure that it would have had a chance to start processing
            // data had it not been blocked.
            barrier.acquire();
            Thread.sleep(1000);

            // Nothing may have been extracted by the extra parse call yet
            assertEquals("", o.toString());

            // Closing the pipes ends the pending reads so the background
            // parse calls can finish
            for (int i = 0; i < threads.length; i++) {
                pipes[i].close();
                threads[i].join();
            }

            // Now the extra parse call is expected to complete as well
            blocked.join();
            assertEquals("Hello, World!", o.toString().trim());
        } finally {
            parser.close();
        }
    }

}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.fork;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.util.Collections;
21 import java.util.Set;
22
23 import org.apache.tika.exception.TikaException;
24 import org.apache.tika.metadata.Metadata;
25 import org.apache.tika.mime.MediaType;
26 import org.apache.tika.parser.AbstractParser;
27 import org.apache.tika.parser.ParseContext;
28 import org.apache.tika.sax.XHTMLContentHandler;
29 import org.xml.sax.ContentHandler;
30 import org.xml.sax.SAXException;
31
32 class ForkTestParser extends AbstractParser {
33
34 /** Serial version UID */
35 private static final long serialVersionUID = -5492269783593452319L;
36
37 public Set<MediaType> getSupportedTypes(ParseContext context) {
38 return Collections.singleton(MediaType.TEXT_PLAIN);
39 }
40
41 public void parse(
42 InputStream stream, ContentHandler handler,
43 Metadata metadata, ParseContext context)
44 throws IOException, SAXException, TikaException {
45 stream.read();
46
47 metadata.set(Metadata.CONTENT_TYPE, "text/plain");
48
49 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
50 xhtml.startDocument();
51 char[] ch = "Hello, World!".toCharArray();
52 xhtml.characters(ch, 0, ch.length);
53 xhtml.endDocument();
54 }
55
56 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.io;
18
19
20 import org.junit.Test;
21 import static org.junit.Assert.*;
22
23 public class FilenameUtilsTest {
24
25 /**
26 * Different filesystems and operating systems have different restrictions
27 * on the name that can be used for files and directories.
28 * FilenameUtils.normalize() returns a cross platform file name that turns
29 * special characters in a HEX based code convention. This is %<code>.
30 * For example why?.zip will be converted into why%3F.zip
31 *
32 * @see http://en.wikipedia.org/wiki/Filename#Comparison_of_filename_limitations
33 *
34 * Reserved chars are the ones in FilenameUtils.RESERVED_FILENAME_CHARACTERS:
35 */
36 @Test
37 public void normalizeNothingTodo() throws Exception {
38 final String TEST_NAME = "test.zip";
39
40 assertEquals(TEST_NAME, FilenameUtils.normalize(TEST_NAME));
41 }
42
43 @Test
44 public void normalizeWithNull() throws Exception {
45 try {
46 FilenameUtils.normalize(null);
47 fail("missing check for null parameters");
48 } catch (IllegalArgumentException x) {
49 assertTrue(x.getMessage().contains("name"));
50 assertTrue(x.getMessage().contains("not be null"));
51 }
52 }
53
54 @Test
55 public void normalizeWithReservedChar() throws Exception {
56 final String[] TEST_NAMES = {
57 "test?.txt", "?test.txt", "test.txt?", "?test?txt?"
58 };
59 final String[] EXPECTED_NAMES = {
60 "test%3F.txt", "%3Ftest.txt", "test.txt%3F", "%3Ftest%3Ftxt%3F"
61 };
62
63 for (int i=0; i<TEST_NAMES.length; ++i) {
64 //System.out.println("checking " + TEST_NAMES[i]);
65 assertEquals(EXPECTED_NAMES[i], FilenameUtils.normalize(TEST_NAMES[i]));
66 }
67 }
68
69 @Test
70 public void normalizeWithReservedChars() throws Exception {
71 final String TEST_NAME =
72 "?a/b\nc\td\re*f\\g:h<i>j.txt|";
73 final String EXPECTED_NAME =
74 "%3Fa/b%0Ac%09d%0De%2Af\\g%3Ah%3Ci%3Ej.txt%7C";
75
76 assertEquals(EXPECTED_NAME, FilenameUtils.normalize(TEST_NAME));
77 }
78
79 @Test
80 public void normalizeWithNotPrintableChars() throws Exception {
81 final String TEST_NAME = new String(
82 new char[] {
83 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
84 '.',
85 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
86 }
87 );
88 final String EXPECTED_NAME =
89 "%00%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F" +
90 "." +
91 "%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F";
92
93 assertEquals(EXPECTED_NAME, FilenameUtils.normalize(TEST_NAME));
94 }
95
96
97 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.io;
17
18 import java.io.ByteArrayInputStream;
19 import java.io.IOException;
20 import java.io.InputStream;
21
22 import org.junit.Test;
23
24 import static org.junit.Assert.assertEquals;;
25
26 /**
27 * Test cases for the {@link LookaheadInputStream} class.
28 */
29 public class LookaheadInputStreamTest {
30
31 @Test
32 public void testNullStream() throws IOException {
33 InputStream lookahead = new LookaheadInputStream(null, 100);
34 assertEquals(-1, lookahead.read());
35 }
36
37 @Test
38 public void testEmptyStream() throws IOException {
39 InputStream stream = new ByteArrayInputStream(new byte[0]);
40 InputStream lookahead = new LookaheadInputStream(stream, 100);
41 assertEquals(-1, lookahead.read());
42 lookahead.close();
43 assertEquals(-1, stream.read());
44 }
45
46 @Test
47 public void testBasicLookahead() throws IOException {
48 InputStream stream =
49 new ByteArrayInputStream(new byte[] { 'a', 'b', 'c' });
50 InputStream lookahead = new LookaheadInputStream(stream, 2);
51 assertEquals('a', lookahead.read());
52 assertEquals('b', lookahead.read());
53 assertEquals(-1, lookahead.read());
54 lookahead.close();
55 assertEquals('a', stream.read());
56 assertEquals('b', stream.read());
57 assertEquals('c', stream.read());
58 assertEquals(-1, stream.read());
59 }
60
61 @Test
62 public void testZeroLookahead() throws IOException {
63 InputStream stream =
64 new ByteArrayInputStream(new byte[] { 'a', 'b', 'c' });
65 InputStream lookahead = new LookaheadInputStream(stream, 0);
66 assertEquals(-1, lookahead.read());
67 lookahead.close();
68 assertEquals('a', stream.read());
69 assertEquals('b', stream.read());
70 assertEquals('c', stream.read());
71 assertEquals(-1, stream.read());
72 }
73
74 @Test
75 public void testMarkLookahead() throws IOException {
76 InputStream stream =
77 new ByteArrayInputStream(new byte[] { 'a', 'b', 'c' });
78 InputStream lookahead = new LookaheadInputStream(stream, 2);
79 lookahead.mark(1);
80 assertEquals('a', lookahead.read());
81 lookahead.reset();
82 assertEquals('a', lookahead.read());
83 lookahead.mark(2);
84 assertEquals('b', lookahead.read());
85 assertEquals(-1, lookahead.read());
86 lookahead.reset();
87 assertEquals('b', lookahead.read());
88 assertEquals(-1, lookahead.read());
89 lookahead.close();
90 assertEquals('a', stream.read());
91 assertEquals('b', stream.read());
92 assertEquals('c', stream.read());
93 assertEquals(-1, stream.read());
94 }
95
96 @Test
97 public void testSkipLookahead() throws IOException {
98 InputStream stream =
99 new ByteArrayInputStream(new byte[] { 'a', 'b', 'c' });
100 InputStream lookahead = new LookaheadInputStream(stream, 2);
101 assertEquals(1, lookahead.skip(1));
102 assertEquals('b', lookahead.read());
103 assertEquals(0, lookahead.skip(1));
104 assertEquals(-1, lookahead.read());
105 lookahead.close();
106 assertEquals('a', stream.read());
107 assertEquals('b', stream.read());
108 assertEquals('c', stream.read());
109 assertEquals(-1, stream.read());
110 }
111
112 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.io;
17
18 import org.junit.Test;
19
20 import java.io.ByteArrayOutputStream;
21 import java.io.IOException;
22 import java.io.ObjectOutputStream;
23
24 import static org.junit.Assert.fail;
25
26 public class TaggedInputStreamTest {
27
28 @Test
29 public void createdIOExceptionIsSerializable() {
30 try {
31 new TaggedInputStream(null).handleIOException(new IOException("Dummy"));
32 } catch (IOException e) {
33 assertCanSerialize(e);
34 }
35 }
36
37 private static void assertCanSerialize(Object e) {
38 ByteArrayOutputStream out = new ByteArrayOutputStream();
39 ObjectOutputStream oos = null;
40 try {
41 oos = new ObjectOutputStream(out);
42 oos.writeObject(e);
43 } catch (IOException e1) {
44 fail(e1.getMessage());
45 } finally {
46 if (oos != null)
47 try {
48 oos.close();
49 } catch (IOException ignore) {
50 }
51 }
52 }
53
54 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.io;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertTrue;
20
21 import java.io.ByteArrayInputStream;
22 import java.io.ByteArrayOutputStream;
23 import java.io.IOException;
24 import java.io.InputStream;
25 import java.util.Arrays;
26
27 import org.junit.Test;
28
29 /**
30 * Test class for {@code TailStream}.
31 */
32 public class TailStreamTest
33 {
34 /** Constant for generating test text. */
35 private static final String TEXT =
36 "Lorem ipsum dolor sit amet, consetetur "
37 + "sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut "
38 + "labore et dolore magna aliquyam erat, sed diam voluptua. At vero"
39 + " eos et accusam et justo duo dolores et ea rebum. Stet clita "
40 + "kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor "
41 + "sit amet.";
42
43 /**
44 * Generates a test text using the specified parameters.
45 *
46 * @param from the start index of the text
47 * @param length the length of the text
48 * @return the generated test text
49 */
50 private static String generateText(int from, int length)
51 {
52 int count = from + length;
53 StringBuilder buf = new StringBuilder(count);
54 while (buf.length() < count)
55 {
56 buf.append(TEXT);
57 }
58 return buf.substring(from, from + length);
59 }
60
61 /**
62 * Generates a stream which contains a test text.
63 *
64 * @param from the start index of the text
65 * @param length the length of the generated stream
66 * @return the stream with the test text
67 */
68 private static InputStream generateStream(int from, int length)
69 {
70 return new ByteArrayInputStream(generateText(from, length).getBytes());
71 }
72
73 /**
74 * Helper method for reading the content of an input stream.
75 *
76 * @param in the stream to be read
77 * @return an array with the content of the stream
78 * @throws IOException if an error occurs
79 */
80 private static byte[] readStream(InputStream in) throws IOException
81 {
82 ByteArrayOutputStream bos = new ByteArrayOutputStream();
83 int c;
84 while ((c = in.read()) != -1)
85 {
86 bos.write(c);
87 }
88 return bos.toByteArray();
89 }
90
91 /**
92 * Tests whether the tail buffer can be obtained before data was read.
93 */
94 @Test
95 public void testTailBeforeRead() throws IOException
96 {
97 TailStream stream = new TailStream(generateStream(0, 100), 50);
98 assertEquals("Wrong buffer length", 0, stream.getTail().length);
99 stream.close();
100 }
101
102 /**
103 * Tests the content of the tail buffer if it is only partly filled.
104 */
105 @Test
106 public void testTailBufferPartlyRead() throws IOException
107 {
108 final int count = 64;
109 TailStream stream = new TailStream(generateStream(0, count), 2 * count);
110 byte[] data = readStream(stream);
111 assertTrue("Wrong content", Arrays.equals(data, stream.getTail()));
112 stream.close();
113 }
114
115 /**
116 * Tests the content of the tail buffer if only single bytes were read.
117 */
118 @Test
119 public void testTailSingleByteReads() throws IOException
120 {
121 final int count = 128;
122 TailStream stream = new TailStream(generateStream(0, 2 * count), count);
123 readStream(stream);
124 assertEquals("Wrong buffer", generateText(count, count), new String(
125 stream.getTail()));
126 }
127
128 /**
129 * Tests the content of the tail buffer if larger chunks are read.
130 */
131 @Test
132 public void testTailChunkReads() throws IOException
133 {
134 final int count = 16384;
135 final int tailSize = 61;
136 final int bufSize = 100;
137 TailStream stream = new TailStream(generateStream(0, count), tailSize);
138 byte[] buf = new byte[bufSize];
139 int read = stream.read(buf, 10, 8);
140 assertEquals("Wrong number of bytes read", 8, read);
141 while (read != -1)
142 {
143 read = stream.read(buf);
144 }
145 assertEquals("Wrong buffer", generateText(count - tailSize, tailSize),
146 new String(stream.getTail()));
147 stream.close();
148 }
149
150 /**
151 * Tests whether mark() and reset() work as expected.
152 */
153 @Test
154 public void testReadWithMarkAndReset() throws IOException
155 {
156 final int tailSize = 64;
157 TailStream stream =
158 new TailStream(generateStream(0, 2 * tailSize), tailSize);
159 byte[] buf = new byte[tailSize / 2];
160 stream.read(buf);
161 stream.mark(tailSize);
162 stream.read(buf);
163 stream.reset();
164 readStream(stream);
165 assertEquals("Wrong buffer", generateText(tailSize, tailSize),
166 new String(stream.getTail()));
167 }
168
169 /**
170 * Tests whether a reset() operation without a mark is simply ignored.
171 */
172 @Test
173 public void testResetWithoutMark() throws IOException
174 {
175 final int tailSize = 75;
176 final int count = 128;
177 TailStream stream = new TailStream(generateStream(0, count), tailSize);
178 stream.reset();
179 byte[] buf = new byte[count];
180 stream.read(buf);
181 assertEquals("Wrong buffer", generateText(count - tailSize, tailSize),
182 new String(stream.getTail()));
183 stream.close();
184 }
185
186 /**
187 * Tests whether skip() also fills the tail buffer.
188 */
189 @Test
190 public void testSkip() throws IOException
191 {
192 final int tailSize = 128;
193 final int count = 1024;
194 final int skipCount = 512;
195 TailStream stream = new TailStream(generateStream(0, count), tailSize);
196 assertEquals("Wrong skip result", skipCount, stream.skip(skipCount));
197 assertEquals("Wrong buffer",
198 generateText(skipCount - tailSize, tailSize),
199 new String(stream.getTail()));
200 stream.close();
201 }
202
203 /**
204 * Tests a skip operation at the end of the stream.
205 */
206 @Test
207 public void testSkipEOS() throws IOException
208 {
209 final int count = 128;
210 TailStream stream = new TailStream(generateStream(0, count), 2 * count);
211 assertEquals("Wrong skip result", count, stream.skip(2 * count));
212 assertEquals("Wrong buffer", generateText(0, count),
213 new String(stream.getTail()));
214 stream.close();
215 }
216
217 /**
218 * Tests skip() if read reaches the end of the stream and returns -1.
219 */
220 @Test
221 public void testSkipReadEnd() throws IOException
222 {
223 final int count = 128;
224 TailStream stream = new TailStream(generateStream(0, count), 2 * count);
225 readStream(stream);
226 assertEquals("Wrong result", -1, stream.skip(1));
227 }
228 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.io;
17
18 import java.io.ByteArrayInputStream;
19 import java.io.ByteArrayOutputStream;
20 import java.io.File;
21 import java.io.FileInputStream;
22 import java.io.FileOutputStream;
23 import java.io.IOException;
24 import java.io.InputStream;
25 import java.io.OutputStream;
26 import java.net.URL;
27
28 import org.apache.tika.metadata.Metadata;
29
30 import org.junit.Test;
31 import static org.junit.Assert.assertEquals;
32 import static org.junit.Assert.assertFalse;
33 import static org.junit.Assert.assertTrue;
34
35 public class TikaInputStreamTest {
36
37 @Test
38 public void testFileBased() throws IOException {
39 File file = createTempFile("Hello, World!");
40 InputStream stream = TikaInputStream.get(file);
41
42 assertEquals(
43 "The file returned by the getFile() method should"
44 + " be the file used to instantiate a TikaInputStream",
45 file, TikaInputStream.get(stream).getFile());
46
47 assertEquals(
48 "The contents of the TikaInputStream should equal the"
49 + " contents of the underlying file",
50 "Hello, World!", readStream(stream));
51
52 stream.close();
53 assertTrue(
54 "The close() method must not remove the file used to"
55 + " instantiate a TikaInputStream",
56 file.exists());
57
58 file.delete();
59 }
60
61 @Test
62 public void testStreamBased() throws IOException {
63 InputStream input =
64 new ByteArrayInputStream("Hello, World!".getBytes("UTF-8"));
65 InputStream stream = TikaInputStream.get(input);
66
67 File file = TikaInputStream.get(stream).getFile();
68 assertTrue(file != null && file.isFile());
69
70 assertEquals(
71 "The contents of the file returned by the getFile method"
72 + " should equal the contents of the TikaInputStream",
73 "Hello, World!", readFile(file));
74
75 assertEquals(
76 "The contents of the TikaInputStream should not get modified"
77 + " by reading the file first",
78 "Hello, World!", readStream(stream));
79
80 stream.close();
81 assertFalse(
82 "The close() method must remove the temporary file created"
83 + " by a TikaInputStream",
84 file.exists());
85 }
86
87 private File createTempFile(String data) throws IOException {
88 File file = File.createTempFile("tika-", ".tmp");
89 OutputStream stream = new FileOutputStream(file);
90 try {
91 stream.write(data.getBytes("UTF-8"));
92 } finally {
93 stream.close();
94 }
95 return file;
96 }
97
98 private String readFile(File file) throws IOException {
99 InputStream stream = new FileInputStream(file);
100 try {
101 return readStream(stream);
102 } finally {
103 stream.close();
104 }
105 }
106
107 private String readStream(InputStream stream) throws IOException {
108 ByteArrayOutputStream buffer = new ByteArrayOutputStream();
109 IOUtils.copy(stream, buffer);
110 return buffer.toString("UTF-8");
111 }
112
113 @Test
114 public void testGetMetadata() throws Exception {
115 URL url = TikaInputStreamTest.class.getResource("test.txt");
116 Metadata metadata = new Metadata();
117 TikaInputStream.get(url, metadata).close();
118 assertEquals("test.txt", metadata.get(Metadata.RESOURCE_NAME_KEY));
119 assertEquals(
120 Long.toString(new File(url.toURI()).length()),
121 metadata.get(Metadata.CONTENT_LENGTH));
122 }
123
124 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.language;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.io.InputStreamReader;
21 import java.io.Writer;
22 import java.util.HashMap;
23
24 import static org.junit.Assert.assertEquals;
25 import static org.junit.Assert.assertFalse;
26 import static org.junit.Assert.assertTrue;
27
28 import org.apache.tika.io.IOUtils;
29 import org.junit.Before;
30 import org.junit.Test;
31
32 /**
33 * JUnit based test of class {@link LanguageIdentifier}.
34 *
35 * @author Sami Siren
36 * @author Jerome Charron - http://frutch.free.fr/
37 */
38 public class LanguageIdentifierTest {
39
40 private static final String[] languages = new String[] {
41 // TODO - currently Estonian and Greek fail these tests.
42 // Enable when language detection works better.
43 "da", "de", /* "et", "el", */ "en", "es", "fi", "fr", "it",
44 "lt", "nl", "pt", "sv"
45 };
46
47 @Before
48 public void setUp() {
49 LanguageIdentifier.initProfiles();
50 }
51
52 @Test
53 public void testLanguageDetection() throws IOException {
54 for (String language : languages) {
55 ProfilingWriter writer = new ProfilingWriter();
56 writeTo(language, writer);
57 LanguageIdentifier identifier = null;
58 identifier = new LanguageIdentifier(writer.getProfile());
59 assertEquals(language, identifier.getLanguage());
60 // Lithuanian is detected but isn't reasonably certain:
61 if (!language.equals("lt")) {
62 assertTrue(identifier.toString(), identifier.isReasonablyCertain());
63 }
64 }
65 }
66
67 @Test
68 public void testClearAddAndInitProfiles() throws IOException {
69 // Prepare english and german language profiles
70 ProfilingWriter enWriter = new ProfilingWriter();
71 writeTo("en", enWriter);
72 LanguageProfile enProfile = enWriter.getProfile();
73 ProfilingWriter deWriter = new ProfilingWriter();
74 writeTo("de", deWriter);
75 LanguageProfile deProfile = deWriter.getProfile();
76
77 // Out of the box profiles
78 LanguageIdentifier identifier = null;
79 identifier = new LanguageIdentifier(enProfile);
80 assertEquals("en", identifier.getLanguage());
81 assertTrue(identifier.isReasonablyCertain());
82
83 // No profiles
84 LanguageIdentifier.clearProfiles();
85 identifier = new LanguageIdentifier(enProfile);
86 assertFalse(identifier.isReasonablyCertain());
87
88 // Only English profile
89 LanguageIdentifier.addProfile("en", enProfile);
90 identifier = new LanguageIdentifier(enProfile);
91 assertEquals("en", identifier.getLanguage());
92 assertTrue(identifier.isReasonablyCertain());
93
94 // English and German profiles loaded explicitly from initProfiles method
95 HashMap<String, LanguageProfile> profilesMap = new HashMap<String, LanguageProfile>();
96 profilesMap.put("en", enProfile);
97 profilesMap.put("de", deProfile);
98 LanguageIdentifier.initProfiles(profilesMap);
99 identifier = new LanguageIdentifier(enProfile);
100 assertEquals("en", identifier.getLanguage());
101 assertTrue(identifier.isReasonablyCertain());
102 identifier = new LanguageIdentifier(deProfile);
103 assertEquals("de", identifier.getLanguage());
104 assertTrue(identifier.isReasonablyCertain());
105 }
106
107 @Test
108 public void testMixedLanguages() throws IOException {
109 for (String language : languages) {
110 for (String other : languages) {
111 if (!language.equals(other)) {
112 if (language.equals("lt") || other.equals("lt")) {
113 continue;
114 }
115 ProfilingWriter writer = new ProfilingWriter();
116 writeTo(language, writer);
117 writeTo(other, writer);
118 LanguageIdentifier identifier = null;
119 identifier = new LanguageIdentifier(writer.getProfile());
120 assertFalse("mix of " + language + " and " + other + " incorrectly detected as " + identifier, identifier.isReasonablyCertain());
121 }
122 }
123 }
124 }
125
126 // TIKA-453: Fix up language identifier used for Estonian
127 @Test
128 public void testEstonia() throws Exception {
129 final String estonian = "et";
130 ProfilingWriter writer = new ProfilingWriter();
131 writeTo(estonian, writer);
132 LanguageIdentifier identifier =
133 new LanguageIdentifier(writer.getProfile());
134 assertEquals(estonian, identifier.getLanguage());
135 }
136
137 private void writeTo(String language, Writer writer) throws IOException {
138 InputStream stream =
139 LanguageIdentifierTest.class.getResourceAsStream(language + ".test");
140 try {
141 IOUtils.copy(new InputStreamReader(stream, "UTF-8"), writer);
142 } finally {
143 stream.close();
144 }
145 }
146
147 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.language;
17
18 import java.io.IOException;
19
20 import org.junit.Test;
21
22 import static org.junit.Assert.assertEquals;
23 import static org.junit.Assert.assertTrue;
24
25 public class LanguageProfileTest {
26
27 @Test
28 public void testLanguageProfile() throws IOException {
29 LanguageProfile foo = new LanguageProfile();
30 assertEquals(0, foo.getCount("foo"));
31
32 foo.add("foo");
33 assertEquals(1, foo.getCount("foo"));
34
35 foo.add("foo", 3);
36 assertEquals(4, foo.getCount("foo"));
37
38 LanguageProfile bar = new LanguageProfile();
39 assertEquals(1.0, foo.distance(bar), 1e-8);
40
41 bar.add("bar");
42 assertEquals(Math.sqrt(2.0), foo.distance(bar), 1e-8);
43
44 bar.add("bar", 3);
45 assertEquals(Math.sqrt(2.0), foo.distance(bar), 1e-8);
46
47 LanguageProfile foobar = new LanguageProfile();
48 assertTrue(foo.distance(foobar) == bar.distance(foobar));
49
50 foobar.add("foo");
51 assertTrue( foo.distance(foobar) < bar.distance(foobar));
52
53 foobar.add("bar");
54 assertTrue(foo.distance(foobar) == bar.distance(foobar));
55 }
56
57 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.language;
18
19 import java.io.BufferedReader;
20 import java.io.File;
21 import java.io.FileInputStream;
22 import java.io.FileOutputStream;
23 import java.io.IOException;
24 import java.io.InputStream;
25 import java.io.InputStreamReader;
26 import java.net.URISyntaxException;
27
28 import org.apache.tika.exception.TikaException;
29 import org.junit.After;
30 import org.junit.Test;
31
32 import static org.junit.Assert.assertEquals;
33 import static org.junit.Assert.assertTrue;
34
35 public class LanguageProfilerBuilderTest {
36 /* Test members */
37 private LanguageProfilerBuilder ngramProfile = null;
38 private LanguageProfile langProfile = null;
39 private final String profileName = "../tika-core/src/test/resources/org/apache/tika/language/langbuilder/"
40 + LanguageProfilerBuilderTest.class.getName();
41 private final String corpusName = "langbuilder/welsh_corpus.txt";
42 private final String encoding = "UTF-8";
43 private final String FILE_EXTENSION = "ngp";
44 private final String LANGUAGE = "welsh";
45 private final int maxlen = 1000;
46
47 @Test
48 public void testCreateProfile() throws TikaException, IOException, URISyntaxException {
49 InputStream is =
50 LanguageProfilerBuilderTest.class.getResourceAsStream(corpusName);
51 try {
52 ngramProfile = LanguageProfilerBuilder.create(profileName, is , encoding);
53 } finally {
54 is.close();
55 }
56
57 File f = new File(profileName + "." + FILE_EXTENSION);
58 FileOutputStream fos = new FileOutputStream(f);
59 ngramProfile.save(fos);
60 fos.close();
61 assertEquals(maxlen, ngramProfile.getSorted().size());
62 }
63
64 @Test
65 public void testNGramProfile() throws IOException, TikaException, URISyntaxException {
66 createLanguageProfile();
67 LanguageIdentifier.addProfile(LANGUAGE, langProfile);
68 LanguageIdentifier identifier = new LanguageIdentifier(langProfile);
69 assertEquals(LANGUAGE, identifier.getLanguage());
70 assertTrue(identifier.isReasonablyCertain());
71 }
72
73 private void createLanguageProfile() throws IOException, TikaException, URISyntaxException {
74 // Sort of dependency injection
75 if (ngramProfile == null)
76 testCreateProfile();
77
78 langProfile = new LanguageProfile();
79
80 InputStream stream = new FileInputStream(new File(profileName + "."
81 + FILE_EXTENSION));
82 try {
83 BufferedReader reader = new BufferedReader(new InputStreamReader(
84 stream, encoding));
85 String line = reader.readLine();
86 while (line != null) {
87 if (line.length() > 0 && !line.startsWith("#")) {// skips the
88 // ngp
89 // header/comment
90 int space = line.indexOf(' ');
91 langProfile.add(line.substring(0, space),
92 Long.parseLong(line.substring(space + 1)));
93 }
94 line = reader.readLine();
95 }
96 } finally {
97 stream.close();
98 }
99 }
100
101 @After
102 public void tearDown() throws Exception {
103 File profile = new File(profileName + "." + FILE_EXTENSION);
104 if (profile.exists())
105 profile.delete();
106 }
107 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.language;
17
18 import java.io.IOException;
19
20 import org.junit.Test;
21
22 import static org.junit.Assert.assertEquals;
23
24 public class ProfilingWriterTest {
25
26 @Test
27 public void testProfilingWriter() throws IOException {
28 ProfilingWriter writer = new ProfilingWriter();
29 writer.write(" foo+BAR FooBar\n");
30 writer.close();
31
32 LanguageProfile profile = writer.getProfile();
33 assertEquals(2, profile.getCount("_fo"));
34 assertEquals(2, profile.getCount("foo"));
35 assertEquals(1, profile.getCount("oo_"));
36 assertEquals(1, profile.getCount("oob"));
37 assertEquals(1, profile.getCount("oba"));
38 assertEquals(1, profile.getCount("_ba"));
39 assertEquals(2, profile.getCount("bar"));
40 assertEquals(2, profile.getCount("ar_"));
41 }
42
43 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.metadata;
17
18 //JDK imports
19 import java.util.Date;
20 import java.util.Properties;
21
22
23 import org.junit.Test;
24
25 //Junit imports
26 import static org.junit.Assert.assertEquals;
27 import static org.junit.Assert.assertFalse;
28 import static org.junit.Assert.assertNotNull;
29 import static org.junit.Assert.assertNull;
30 import static org.junit.Assert.assertTrue;
31 import static org.junit.Assert.fail;
32
33 /**
34 * JUnit based tests of class {@link org.apache.tika.metadata.Metadata}.
35 */
36 public class TestMetadata {
37
38 private static final String CONTENTTYPE = "contenttype";
39
40 /** Test for the <code>add(String, String)</code> method. */
41 @Test
42 public void testAdd() {
43 String[] values = null;
44 Metadata meta = new Metadata();
45
46 values = meta.getValues(CONTENTTYPE);
47 assertEquals(0, values.length);
48
49 meta.add(CONTENTTYPE, "value1");
50 values = meta.getValues(CONTENTTYPE);
51 assertEquals(1, values.length);
52 assertEquals("value1", values[0]);
53
54 meta.add(CONTENTTYPE, "value2");
55 values = meta.getValues(CONTENTTYPE);
56 assertEquals(2, values.length);
57 assertEquals("value1", values[0]);
58 assertEquals("value2", values[1]);
59
60 // NOTE : For now, the same value can be added many times.
61 // Should it be changed?
62 meta.add(CONTENTTYPE, "value1");
63 values = meta.getValues(CONTENTTYPE);
64 assertEquals(3, values.length);
65 assertEquals("value1", values[0]);
66 assertEquals("value2", values[1]);
67 assertEquals("value1", values[2]);
68
69 Property nonMultiValued = Property.internalText("nonMultiValued");
70 meta.add(nonMultiValued, "value1");
71 try {
72 meta.add(nonMultiValued, "value2");
73 fail("add should fail on the second call of a non-multi valued item");
74 } catch (PropertyTypeException e) {
75 }
76 }
77
78 /** Test for the <code>set(String, String)</code> method. */
79 @Test
80 public void testSet() {
81 String[] values = null;
82 Metadata meta = new Metadata();
83
84 values = meta.getValues(CONTENTTYPE);
85 assertEquals(0, values.length);
86
87 meta.set(CONTENTTYPE, "value1");
88 values = meta.getValues(CONTENTTYPE);
89 assertEquals(1, values.length);
90 assertEquals("value1", values[0]);
91
92 meta.set(CONTENTTYPE, "value2");
93 values = meta.getValues(CONTENTTYPE);
94 assertEquals(1, values.length);
95 assertEquals("value2", values[0]);
96
97 meta.set(CONTENTTYPE, "new value 1");
98 meta.add("contenttype", "new value 2");
99 values = meta.getValues(CONTENTTYPE);
100 assertEquals(2, values.length);
101 assertEquals("new value 1", values[0]);
102 assertEquals("new value 2", values[1]);
103 }
104
105 /** Test for <code>setAll(Properties)</code> method. */
106 @Test
107 public void testSetProperties() {
108 String[] values = null;
109 Metadata meta = new Metadata();
110 Properties props = new Properties();
111
112 meta.setAll(props);
113 assertEquals(0, meta.size());
114
115 props.setProperty("name-one", "value1.1");
116 meta.setAll(props);
117 assertEquals(1, meta.size());
118 values = meta.getValues("name-one");
119 assertEquals(1, values.length);
120 assertEquals("value1.1", values[0]);
121
122 props.setProperty("name-two", "value2.1");
123 meta.setAll(props);
124 assertEquals(2, meta.size());
125 values = meta.getValues("name-one");
126 assertEquals(1, values.length);
127 assertEquals("value1.1", values[0]);
128 values = meta.getValues("name-two");
129 assertEquals(1, values.length);
130 assertEquals("value2.1", values[0]);
131 }
132
133 /** Test for <code>get(String)</code> method. */
134 @Test
135 public void testGet() {
136 Metadata meta = new Metadata();
137 assertNull(meta.get("a-name"));
138 meta.add("a-name", "value-1");
139 assertEquals("value-1", meta.get("a-name"));
140 meta.add("a-name", "value-2");
141 assertEquals("value-1", meta.get("a-name"));
142 }
143
144 /** Test for <code>isMultiValued()</code> method. */
145 @Test
146 public void testIsMultiValued() {
147 Metadata meta = new Metadata();
148 assertFalse(meta.isMultiValued("key"));
149 meta.add("key", "value1");
150 assertFalse(meta.isMultiValued("key"));
151 meta.add("key", "value2");
152 assertTrue(meta.isMultiValued("key"));
153 }
154
155 /** Test for <code>names</code> method. */
156 @Test
157 public void testNames() {
158 String[] names = null;
159 Metadata meta = new Metadata();
160 names = meta.names();
161 assertEquals(0, names.length);
162
163 meta.add("name-one", "value");
164 names = meta.names();
165 assertEquals(1, names.length);
166 assertEquals("name-one", names[0]);
167 meta.add("name-two", "value");
168 names = meta.names();
169 assertEquals(2, names.length);
170 }
171
172 /** Test for <code>remove(String)</code> method. */
173 @Test
174 public void testRemove() {
175 Metadata meta = new Metadata();
176 meta.remove("name-one");
177 assertEquals(0, meta.size());
178 meta.add("name-one", "value-1.1");
179 meta.add("name-one", "value-1.2");
180 meta.add("name-two", "value-2.2");
181 assertEquals(2, meta.size());
182 assertNotNull(meta.get("name-one"));
183 assertNotNull(meta.get("name-two"));
184 meta.remove("name-one");
185 assertEquals(1, meta.size());
186 assertNull(meta.get("name-one"));
187 assertNotNull(meta.get("name-two"));
188 meta.remove("name-two");
189 assertEquals(0, meta.size());
190 assertNull(meta.get("name-one"));
191 assertNull(meta.get("name-two"));
192 }
193
194 /** Test for <code>equals(Object)</code> method. */
195 @Test
196 public void testObject() {
197 Metadata meta1 = new Metadata();
198 Metadata meta2 = new Metadata();
199 assertFalse(meta1.equals(null));
200 assertFalse(meta1.equals("String"));
201 assertTrue(meta1.equals(meta2));
202 meta1.add("name-one", "value-1.1");
203 assertFalse(meta1.equals(meta2));
204 meta2.add("name-one", "value-1.1");
205 assertTrue(meta1.equals(meta2));
206 meta1.add("name-one", "value-1.2");
207 assertFalse(meta1.equals(meta2));
208 meta2.add("name-one", "value-1.2");
209 assertTrue(meta1.equals(meta2));
210 meta1.add("name-two", "value-2.1");
211 assertFalse(meta1.equals(meta2));
212 meta2.add("name-two", "value-2.1");
213 assertTrue(meta1.equals(meta2));
214 meta1.add("name-two", "value-2.2");
215 assertFalse(meta1.equals(meta2));
216 meta2.add("name-two", "value-2.x");
217 assertFalse(meta1.equals(meta2));
218 }
219
220 /**
221 * Tests for getting and setting integer
222 * based properties
223 */
224 @Test
225 public void testGetSetInt() {
226 Metadata meta = new Metadata();
227
228 // Isn't initially set, will get null back
229 assertEquals(null, meta.get(Metadata.IMAGE_WIDTH));
230 assertEquals(null, meta.getInt(Metadata.IMAGE_WIDTH));
231
232 // Can only set as a single valued int
233 try {
234 meta.set(Metadata.BITS_PER_SAMPLE, 1);
235 fail("Shouldn't be able to set a multi valued property as an int");
236 } catch(PropertyTypeException e) {}
237 try {
238 meta.set(TikaCoreProperties.CREATED, 1);
239 fail("Shouldn't be able to set a date property as an int");
240 } catch(PropertyTypeException e) {}
241
242 // Can set it and retrieve it
243 meta.set(Metadata.IMAGE_WIDTH, 22);
244 assertEquals("22", meta.get(Metadata.IMAGE_WIDTH));
245 assertEquals(22, meta.getInt(Metadata.IMAGE_WIDTH).intValue());
246
247 // If you save a non int value, you get null
248 meta.set(Metadata.IMAGE_WIDTH, "INVALID");
249 assertEquals("INVALID", meta.get(Metadata.IMAGE_WIDTH));
250 assertEquals(null, meta.getInt(Metadata.IMAGE_WIDTH));
251
252 // If you try to retrieve a non simple int value, you get null
253 meta.set(Metadata.IMAGE_WIDTH, 22);
254 assertEquals(22, meta.getInt(Metadata.IMAGE_WIDTH).intValue());
255 assertEquals(null, meta.getInt(Metadata.BITS_PER_SAMPLE));
256 assertEquals(null, meta.getInt(TikaCoreProperties.CREATED));
257 }
258
259 /**
260 * Tests for getting and setting date
261 * based properties
262 */
263 @Test
264 public void testGetSetDate() {
265 Metadata meta = new Metadata();
266 long hour = 60 * 60 * 1000;
267
268 // Isn't initially set, will get null back
269 assertEquals(null, meta.get(TikaCoreProperties.CREATED));
270 assertEquals(null, meta.getInt(TikaCoreProperties.CREATED));
271
272 // Can only set as a single valued date
273 try {
274 meta.set(Metadata.BITS_PER_SAMPLE, new Date(1000));
275 fail("Shouldn't be able to set a multi valued property as a date");
276 } catch(PropertyTypeException e) {}
277 try {
278 meta.set(Metadata.IMAGE_WIDTH, new Date(1000));
279 fail("Shouldn't be able to set an int property as an date");
280 } catch(PropertyTypeException e) {}
281
282 // Can set it and retrieve it
283 meta.set(TikaCoreProperties.CREATED, new Date(1000));
284 assertEquals("1970-01-01T00:00:01Z", meta.get(TikaCoreProperties.CREATED));
285 assertEquals(1000, meta.getDate(TikaCoreProperties.CREATED).getTime());
286
287 // If you save a non date value, you get null
288 meta.set(TikaCoreProperties.CREATED, "INVALID");
289 assertEquals("INVALID", meta.get(TikaCoreProperties.CREATED));
290 assertEquals(null, meta.getDate(TikaCoreProperties.CREATED));
291
292 // If you try to retrieve a non simple date value, you get null
293 meta.set(TikaCoreProperties.CREATED, new Date(1000));
294 assertEquals(1000, meta.getDate(TikaCoreProperties.CREATED).getTime());
295 assertEquals(null, meta.getInt(Metadata.BITS_PER_SAMPLE));
296 assertEquals(null, meta.getInt(TikaCoreProperties.CREATED));
297
298 // Our format doesn't include milliseconds
299 // This means things get rounded
300 meta.set(TikaCoreProperties.CREATED, new Date(1050));
301 assertEquals("1970-01-01T00:00:01Z", meta.get(TikaCoreProperties.CREATED));
302 assertEquals(1000, meta.getDate(TikaCoreProperties.CREATED).getTime());
303
304 // We can accept a number of different ISO-8601 variants
305 meta.set(TikaCoreProperties.CREATED, "1970-01-01T00:00:01Z");
306 assertEquals(1000, meta.getDate(TikaCoreProperties.CREATED).getTime());
307
308 meta.set(TikaCoreProperties.CREATED, "1970-01-01 00:00:01Z");
309 assertEquals(1000, meta.getDate(TikaCoreProperties.CREATED).getTime());
310
311 meta.set(TikaCoreProperties.CREATED, "1970-01-01T01:00:01+01:00");
312 assertEquals(1000, meta.getDate(TikaCoreProperties.CREATED).getTime());
313
314 meta.set(TikaCoreProperties.CREATED, "1970-01-01 01:00:01+01:00");
315 assertEquals(1000, meta.getDate(TikaCoreProperties.CREATED).getTime());
316
317 meta.set(TikaCoreProperties.CREATED, "1970-01-01T12:00:01+12:00");
318 assertEquals(1000, meta.getDate(TikaCoreProperties.CREATED).getTime());
319
320 meta.set(TikaCoreProperties.CREATED, "1969-12-31T12:00:01-12:00");
321 assertEquals(1000, meta.getDate(TikaCoreProperties.CREATED).getTime());
322
323 // Dates without times, come in at midday UTC
324 meta.set(TikaCoreProperties.CREATED, "1970-01-01");
325 assertEquals(12*hour, meta.getDate(TikaCoreProperties.CREATED).getTime());
326
327 meta.set(TikaCoreProperties.CREATED, "1970:01:01");
328 assertEquals(12*hour, meta.getDate(TikaCoreProperties.CREATED).getTime());
329 }
330
331 /**
332 * Some documents, like jpegs, might have date in unspecified time zone
333 * which should be handled like strings but verified to have parseable ISO 8601 format
334 */
335 @Test
336 public void testGetSetDateUnspecifiedTimezone() {
337 Metadata meta = new Metadata();
338
339 meta.set(TikaCoreProperties.CREATED, "1970-01-01T00:00:01");
340 assertEquals("should return string without time zone specifier because zone is not known",
341 "1970-01-01T00:00:01", meta.get(TikaCoreProperties.CREATED));
342 }
343
344 /**
345 * Defines a composite property, then checks that when set as the
346 * composite the value can be retrieved with the property or the aliases
347 */
348 @SuppressWarnings("deprecation")
349 @Test
350 public void testCompositeProperty() {
351 Metadata meta = new Metadata();
352 Property compositeProperty = Property.composite(
353 DublinCore.DESCRIPTION, new Property[] {
354 Property.internalText(Metadata.DESCRIPTION),
355 Property.internalText("testDescriptionAlt")
356 });
357 String message = "composite description";
358 meta.set(compositeProperty, message);
359
360 // Fetch as the composite
361 assertEquals(message, meta.get(compositeProperty));
362 // Fetch as the primary property on the composite
363 assertEquals(message, meta.get(DublinCore.DESCRIPTION));
364 // Fetch as the aliases
365 assertEquals(message, meta.get(Metadata.DESCRIPTION));
366 assertEquals(message, meta.get("testDescriptionAlt"));
367 }
368 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.mime;
17
18 import java.util.ArrayList;
19 import java.util.HashMap;
20 import java.util.List;
21 import java.util.Map;
22
23 import org.junit.Test;
24 import static org.junit.Assert.assertEquals;
25
26 import org.xml.sax.Attributes;
27 import org.xml.sax.SAXException;
28
29
30 public class CustomReaderTest {
31
32 static class CustomMimeTypesReader extends MimeTypesReader {
33 public Map<String, String> values = new HashMap<String, String>();
34 public List<String> ignorePatterns = new ArrayList<String>();
35
36 CustomMimeTypesReader(MimeTypes types) {
37 super(types);
38 }
39
40
41 @Override
42 public void startElement(
43 String uri, String localName, String qName,
44 Attributes attributes) throws SAXException {
45 super.startElement(uri, localName, qName, attributes);
46 if ("hello".equals(qName)) {
47 characters = new StringBuilder();
48 }
49 }
50
51 @Override
52 public void endElement(String uri, String localName, String qName) {
53 super.endElement(uri, localName, qName);
54 if (type != null) {
55 if("hello".equals(qName)) {
56 values.put(type.toString(), characters.toString().trim());
57 characters = null;
58 }
59 }
60 }
61
62 @Override
63 protected void handleGlobError(MimeType type, String pattern, MimeTypeException ex,
64 String qName, Attributes attributes) throws SAXException {
65 ignorePatterns.add( type.toString() + ">>" + pattern);
66 }
67 }
68
69 @Test
70 public void testCustomReader() throws Exception {
71 MimeTypes mimeTypes = new MimeTypes();
72 CustomMimeTypesReader reader = new CustomMimeTypesReader(mimeTypes);
73 reader.read(getClass().getResourceAsStream("custom-mimetypes.xml"));
74
75 String key = "hello/world-file";
76
77 MimeType hello = mimeTypes.forName(key);
78 assertEquals("A \"Hello World\" file", hello.getDescription());
79 assertEquals("world", reader.values.get(key));
80 assertEquals(0, reader.ignorePatterns.size());
81
82 // Now add another resource with conflicting regex
83 reader.read(getClass().getResourceAsStream("custom-mimetypes2.xml"));
84
85 key = "another/world-file";
86 MimeType another = mimeTypes.forName(key);
87 assertEquals("kittens", reader.values.get(key));
88 assertEquals(1, reader.ignorePatterns.size());
89 assertEquals(another.toString()+">>*"+hello.getExtension(),
90 reader.ignorePatterns.get(0));
91
92 //System.out.println( mimeTypes.getMediaTypeRegistry().getTypes() );
93 }
94 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.mime;
17
18 import java.util.HashMap;
19 import java.util.Map;
20
21 import static java.util.Collections.singletonMap;
22
23 import static org.junit.Assert.assertEquals;
24 import static org.junit.Assert.assertNotNull;
25 import static org.junit.Assert.assertTrue;
26 import org.junit.Test;
27
28 public class MediaTypeTest {
29
30 @Test
31 public void testBasics() {
32 assertEquals(
33 "application/octet-stream",
34 new MediaType("application", "octet-stream").toString());
35
36 assertEquals(
37 "text/plain",
38 new MediaType("text", "plain").toString());
39
40 Map<String, String> parameters = new HashMap<String, String>();
41 assertEquals(
42 "text/plain",
43 new MediaType("text", "plain", parameters).toString());
44
45 parameters.put("charset", "UTF-8");
46 assertEquals(
47 "text/plain; charset=UTF-8",
48 new MediaType("text", "plain", parameters).toString());
49
50 parameters.put("x-eol-style", "crlf");
51 assertEquals(
52 "text/plain; charset=UTF-8; x-eol-style=crlf",
53 new MediaType("text", "plain", parameters).toString());
54 }
55
56 @Test
57 public void testLowerCase() {
58 assertEquals(
59 "text/plain",
60 new MediaType("TEXT", "PLAIN").toString());
61 assertEquals(
62 "text/plain",
63 new MediaType("Text", "Plain").toString());
64
65 Map<String, String> parameters = new HashMap<String, String>();
66 assertEquals(
67 "text/plain",
68 new MediaType("text", "PLAIN", parameters).toString());
69
70 parameters.put("CHARSET", "UTF-8");
71 assertEquals(
72 "text/plain; charset=UTF-8",
73 new MediaType("TEXT", "plain", parameters).toString());
74
75 parameters.put("X-Eol-Style", "crlf");
76 assertEquals(
77 "text/plain; charset=UTF-8; x-eol-style=crlf",
78 new MediaType("TeXt", "PlAiN", parameters).toString());
79 }
80
81 @Test
82 public void testTrim() {
83 assertEquals(
84 "text/plain",
85 new MediaType(" text ", " plain ").toString());
86 assertEquals(
87 "text/plain",
88 new MediaType("\ttext", "plain\t").toString());
89
90 Map<String, String> parameters = new HashMap<String, String>();
91 assertEquals(
92 "text/plain",
93 new MediaType("text\r\n", " \tplain", parameters).toString());
94
95 parameters.put(" charset", "UTF-8");
96 assertEquals(
97 "text/plain; charset=UTF-8",
98 new MediaType("\n\ntext", "plain \r", parameters).toString());
99
100 parameters.put("\r\n\tx-eol-style \t", "crlf");
101 assertEquals(
102 "text/plain; charset=UTF-8; x-eol-style=crlf",
103 new MediaType(" text", "\tplain ", parameters).toString());
104 }
105
106 @Test
107 public void testQuote() {
108 Map<String, String> parameters = new HashMap<String, String>();
109 parameters.put("a", " value with spaces ");
110 parameters.put("b", "text/plain");
111 parameters.put("c", "()<>@,;:\\\"/[]?=");
112 assertEquals(
113 "text/plain; a=\" value with spaces \"; b=\"text\\/plain\""
114 + "; c=\"\\(\\)\\<\\>\\@\\,\\;\\:\\\\\\\"\\/\\[\\]\\?\\=\"",
115 new MediaType("text", "plain", parameters).toString());
116 }
117
118 /**
119 * @since TIKA-121
120 */
121 @Test
122 public void testParseWithParams() {
123 String mimeStringWithParams = "text/html;charset=UTF-8;foo=bar;foo2=bar2";
124
125 MediaType type = MediaType.parse(mimeStringWithParams);
126 assertNotNull(type);
127 assertNotNull(type.getParameters());
128 assertNotNull(type.getParameters().keySet());
129 assertEquals(3, type.getParameters().keySet().size());
130 boolean gotCharset = false, gotFoo = false, gotFoo2 = false;
131 for (String param : type.getParameters().keySet()) {
132 if (param.equals("charset")) {
133 gotCharset = true;
134 } else if (param.equals("foo")) {
135 gotFoo = true;
136 } else if (param.equals("foo2")) {
137 gotFoo2 = true;
138 }
139 }
140 assertTrue(gotCharset && gotFoo && gotFoo2);
141 }
142
143 /**
144 * Per http://tools.ietf.org/html/rfc2045#section-5.1, charset can be in quotes
145 */
146 @Test
147 public void testParseWithParamsAndQuotedCharset() {
148 // Typical case, with a quoted charset
149 String mimeStringWithParams = "text/html;charset=\"UTF-8\"";
150
151 MediaType type = MediaType.parse(mimeStringWithParams);
152 assertNotNull(type);
153 assertEquals(singletonMap("charset", "UTF-8"), type.getParameters());
154
155 // Complex case, with various different quoted and un-quoted forms
156 mimeStringWithParams = "text/html;charset=\'UTF-8\';test=\"true\";unquoted=here";
157
158 type = MediaType.parse(mimeStringWithParams);
159 assertNotNull(type);
160 assertEquals(3, type.getParameters().size());
161 assertEquals("UTF-8", type.getParameters().get("charset"));
162 assertEquals("true", type.getParameters().get("test"));
163 assertEquals("here", type.getParameters().get("unquoted"));
164 }
165
166 /**
167 * @since TIKA-121
168 */
169 @Test
170 public void testParseNoParams() {
171 String mimeStringNoParams = "text/html";
172
173 MediaType type = MediaType.parse(mimeStringNoParams);
174 assertNotNull(type);
175 assertNotNull(type.getParameters());
176 assertNotNull(type.getParameters().keySet());
177 assertEquals(0, type.getParameters().keySet().size());
178 }
179
180 /**
181 * @since TIKA-121
182 */
183 @Test
184 public void testParseNoParamsWithSemi() {
185 String mimeStringNoParamsWithSemi = "text/html;";
186 MediaType type = MediaType.parse(mimeStringNoParamsWithSemi);
187 assertNotNull(type);
188 assertNotNull(type.getParameters());
189 assertNotNull(type.getParameters().keySet());
190 assertEquals(0, type.getParameters().keySet().size());
191 }
192
193 /**
194 * TIKA-349
195 */
196 @Test
197 public void testOddParameters() {
198 assertEquals(
199 "text/html; charset=UTF-8",
200 MediaType.parse("text/html;; charset=UTF-8").toString());
201 assertEquals(
202 "text/html; charset=UTF-8",
203 MediaType.parse("text/html;; charset=UTF-8").toString());
204 assertEquals(
205 "text/html; charset=UTF-8",
206 MediaType.parse("text/html;; charset=\"UTF-8\"").toString());
207 assertEquals(
208 "text/html; charset=UTF-8",
209 MediaType.parse("text/html;; charset=\"UTF-8").toString());
210 }
211
212 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.mime;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertTrue;
20
21 import java.io.ByteArrayInputStream;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.net.URL;
25
26 import org.apache.tika.config.TikaConfig;
27 import org.apache.tika.metadata.Metadata;
28
29 import org.junit.Before;
30 import org.junit.Test;
31
32 public class MimeDetectionTest {
33
34 private MimeTypes mimeTypes;
35
36 private MediaTypeRegistry registry;
37
38 /** @inheritDoc */
39 @Before
40 public void setUp() {
41 this.mimeTypes = TikaConfig.getDefaultConfig().getMimeRepository();
42 this.registry = mimeTypes.getMediaTypeRegistry();
43 }
44
45 @Test
46 public void testDetection() throws Exception {
47 testFile("image/svg+xml", "circles.svg");
48 testFile("image/svg+xml", "circles-with-prefix.svg");
49 testFile("image/png", "datamatrix.png");
50 testFile("text/html", "test.html");
51 testFile("application/xml", "test-iso-8859-1.xml");
52 testFile("application/xml", "test-utf8.xml");
53 testFile("application/xml", "test-utf8-bom.xml");
54 testFile("application/xml", "test-utf16le.xml");
55 testFile("application/xml", "test-utf16be.xml");
56 testFile("application/xml", "test-long-comment.xml");
57 testFile("application/xslt+xml", "stylesheet.xsl");
58 testUrl(
59 "application/rdf+xml",
60 "http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl",
61 "test-difficult-rdf1.xml");
62 testUrl(
63 "application/rdf+xml",
64 "http://www.w3.org/2002/07/owl#",
65 "test-difficult-rdf2.xml");
66 // add evil test from TIKA-327
67 testFile("text/html", "test-tika-327.html");
68 // add another evil html test from TIKA-357
69 testFile("text/html", "testlargerbuffer.html");
70 // test fragment of HTML with <div> (TIKA-1102)
71 testFile("text/html", "htmlfragment");
72 // test binary CGM detection (TIKA-1170)
73 testFile("image/cgm", "plotutils-bin-cgm-v3.cgm");
74 // test HTML detection of malformed file, previously identified as image/cgm (TIKA-1170)
75 testFile("text/html", "test-malformed-header.html.bin");
76 }
77
78 @Test
79 public void testByteOrderMark() throws Exception {
80 assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect(
81 new ByteArrayInputStream("\ufefftest".getBytes("UTF-16LE")),
82 new Metadata()));
83 assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect(
84 new ByteArrayInputStream("\ufefftest".getBytes("UTF-16BE")),
85 new Metadata()));
86 assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect(
87 new ByteArrayInputStream("\ufefftest".getBytes("UTF-8")),
88 new Metadata()));
89 }
90
91 @Test
92 public void testSuperTypes() {
93 assertTrue(registry.isSpecializationOf(
94 MediaType.parse("text/something; charset=UTF-8"),
95 MediaType.parse("text/something")));
96
97 assertTrue(registry.isSpecializationOf(
98 MediaType.parse("text/something; charset=UTF-8"),
99 MediaType.TEXT_PLAIN));
100
101 assertTrue(registry.isSpecializationOf(
102 MediaType.parse("text/something; charset=UTF-8"),
103 MediaType.OCTET_STREAM));
104
105 assertTrue(registry.isSpecializationOf(
106 MediaType.parse("text/something"),
107 MediaType.TEXT_PLAIN));
108
109 assertTrue(registry.isSpecializationOf(
110 MediaType.parse("application/something+xml"),
111 MediaType.APPLICATION_XML));
112
113 assertTrue(registry.isSpecializationOf(
114 MediaType.parse("application/something+zip"),
115 MediaType.APPLICATION_ZIP));
116
117 assertTrue(registry.isSpecializationOf(
118 MediaType.APPLICATION_XML,
119 MediaType.TEXT_PLAIN));
120
121 assertTrue(registry.isSpecializationOf(
122 MediaType.parse("application/vnd.apple.iwork"),
123 MediaType.APPLICATION_ZIP));
124 }
125
126 @SuppressWarnings("unused")
127 private void testUrlOnly(String expected, String url) throws IOException{
128 InputStream in = new URL(url).openStream();
129 testStream(expected, url, in);
130 }
131
132 private void testUrl(String expected, String url, String file) throws IOException{
133 InputStream in = getClass().getResourceAsStream(file);
134 testStream(expected, url, in);
135 }
136
137 private void testFile(String expected, String filename) throws IOException {
138 InputStream in = getClass().getResourceAsStream(filename);
139 testStream(expected, filename, in);
140 }
141
142 private void testStream(String expected, String urlOrFileName, InputStream in) throws IOException{
143 assertNotNull("Test stream: ["+urlOrFileName+"] is null!", in);
144 if (!in.markSupported()) {
145 in = new java.io.BufferedInputStream(in);
146 }
147 try {
148 Metadata metadata = new Metadata();
149 String mime = this.mimeTypes.detect(in, metadata).toString();
150 assertEquals(urlOrFileName + " is not properly detected: detected.", expected, mime);
151
152 //Add resource name and test again
153 metadata.set(Metadata.RESOURCE_NAME_KEY, urlOrFileName);
154 mime = this.mimeTypes.detect(in, metadata).toString();
155 assertEquals(urlOrFileName + " is not properly detected after adding resource name.", expected, mime);
156 } finally {
157 in.close();
158 }
159 }
160
161 private void assertNotNull(String string, InputStream in) {
162 // TODO Auto-generated method stub
163
164 }
165
166 /**
167 * Test for type detection of empty documents.
168 *
169 * @see <a href="https://issues.apache.org/jira/browse/TIKA-483">TIKA-483</a>
170 */
171 @Test
172 public void testEmptyDocument() throws IOException {
173 assertEquals(MediaType.OCTET_STREAM, mimeTypes.detect(
174 new ByteArrayInputStream(new byte[0]), new Metadata()));
175
176 Metadata namehint = new Metadata();
177 namehint.set(Metadata.RESOURCE_NAME_KEY, "test.txt");
178 assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect(
179 new ByteArrayInputStream(new byte[0]), namehint));
180
181 Metadata typehint = new Metadata();
182 typehint.set(Metadata.CONTENT_TYPE, "text/plain");
183 assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect(
184 new ByteArrayInputStream(new byte[0]), typehint));
185
186 }
187
188 /**
189 * Test for things like javascript files whose content is enclosed in XML
190 * comment delimiters, but that aren't actually XML.
191 *
192 * @see <a href="https://issues.apache.org/jira/browse/TIKA-426">TIKA-426</a>
193 */
194 @Test
195 public void testNotXML() throws IOException {
196 assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect(
197 new ByteArrayInputStream("<!-- test -->".getBytes("UTF-8")),
198 new Metadata()));
199 }
200
201 /**
202 * Tests that when we repeatedly test the detection of a document
203 * that can be detected with Mime Magic, that we consistently
204 * detect it correctly. See TIKA-391 for more details.
205 */
206 @Test
207 public void testMimeMagicStability() throws IOException {
208 for(int i=0; i<100; i++) {
209 testFile("application/vnd.ms-excel", "test.xls");
210 }
211 }
212
213 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.mime;
17
18 import java.io.ByteArrayInputStream;
19 import java.lang.reflect.Field;
20 import java.util.ArrayList;
21 import java.util.List;
22
23 import org.apache.tika.config.TikaConfig;
24 import org.apache.tika.metadata.Metadata;
25
26 import org.junit.Before;
27 import org.junit.Test;
28 import static org.junit.Assert.assertEquals;
29 import static org.junit.Assert.assertNotNull;
30 import static org.junit.Assert.assertTrue;
31 import static org.junit.Assert.fail;
32
33 /**
34 * These tests try to ensure that the MimeTypesReader
35 * has correctly processed the mime-types.xml file.
36 * To do this, it tests that various aspects of the
37 * mime-types.xml file have ended up correctly as
38 * globs, matches, magics etc.
39 *
40 * If you make updates to mime-types.xml, then the
41 * checks in this test may no longer hold true.
42 * As such, if tests here start failing after your
43 * changes, please review the test details, and
44 * update it to match the new state of the file!
45 */
46 public class MimeTypesReaderTest {
47
48 private MimeTypes mimeTypes;
49 private List<Magic> magics;
50
51 @SuppressWarnings("unchecked")
52 @Before
53 public void setUp() throws NoSuchFieldException, SecurityException, IllegalArgumentException, IllegalAccessException{
54 this.mimeTypes = TikaConfig.getDefaultConfig().getMimeRepository();
55
56 Field magicsField = mimeTypes.getClass().getDeclaredField("magics");
57 magicsField.setAccessible(true);
58 magics = (List<Magic>)magicsField.get(mimeTypes);
59 }
60
61 @Test
62 public void testHtmlMatches() throws Exception {
63 int minMatches = 10;
64
65 // Check on the type
66 MimeType html = mimeTypes.forName("text/html");
67 assertTrue(html.hasMagic());
68 assertTrue(
69 "There should be at least "+minMatches+" HTML matches, found " + html.getMagics().size(),
70 html.getMagics().size() >= minMatches
71 );
72
73 // Check on the overall magics
74 List<Magic> htmlMagics = new ArrayList<Magic>();
75 for(Magic magic : magics) {
76 if(magic.getType().toString().equals("text/html")) {
77 htmlMagics.add(magic);
78 }
79 }
80
81 assertTrue(
82 "There should be at least "+minMatches+" HTML matches, found " + htmlMagics.size(),
83 htmlMagics.size() >= minMatches
84 );
85 }
86
87 @Test
88 public void testExcelMatches() throws Exception {
89 int minMatches = 4;
90
91 // Check on the type
92 MimeType excel = mimeTypes.forName("application/vnd.ms-excel");
93 assertTrue(excel.hasMagic());
94 assertTrue(
95 "There should be at least "+minMatches+" Excel matches, found " + excel.getMagics().size(),
96 excel.getMagics().size() >= minMatches
97 );
98
99 // Check on the overall magics
100 List<Magic> excelMagics = new ArrayList<Magic>();
101 for(Magic magic : magics) {
102 if(magic.getType().toString().equals("application/vnd.ms-excel")) {
103 excelMagics.add(magic);
104 }
105 }
106
107 assertTrue(
108 "There should be at least "+minMatches+" Excel matches, found " + excelMagics.size(),
109 excelMagics.size() >= minMatches
110 );
111 }
112
113 /**
114 * @since TIKA-515
115 */
116 @Test
117 public void testReadComment() {
118 try {
119 assertNotNull(this.mimeTypes.forName("application/msword")
120 .getDescription());
121 } catch (Exception e) {
122 fail(e.getMessage());
123 }
124 }
125
126 /**
127 * @since TIKA-1012
128 */
129 @Test
130 public void testReadExtendedMetadata() throws Exception {
131 MimeType mime = this.mimeTypes.forName("image/x-ms-bmp");
132 assertEquals("BMP", mime.getAcronym());
133 assertEquals("com.microsoft.bmp", mime.getUniformTypeIdentifier());
134 assertEquals("http://en.wikipedia.org/wiki/BMP_file_format",
135 mime.getLinks().get(0).toString());
136
137 mime = this.mimeTypes.forName("application/xml");
138 assertEquals("XML", mime.getAcronym());
139 assertEquals("public.xml", mime.getUniformTypeIdentifier());
140 assertEquals("http://en.wikipedia.org/wiki/Xml",
141 mime.getLinks().get(0).toString());
142 }
143
144 /**
145 * TIKA-746 Ensures that the custom mimetype maps were also
146 * loaded and used
147 */
148 @Test
149 public void testCustomMimeTypes() {
150 // Check that it knows about our two special ones
151 String helloWorld = "hello/world";
152 String helloWorldFile = "hello/world-file";
153 try {
154 assertNotNull(this.mimeTypes.forName(helloWorld));
155 assertNotNull(this.mimeTypes.forName(helloWorldFile));
156 } catch (Exception e) {
157 fail(e.getMessage());
158 }
159
160 // Check that the details come through as expected
161 try {
162 MimeType hw = this.mimeTypes.forName(helloWorld);
163 MimeType hwf = this.mimeTypes.forName(helloWorldFile);
164
165 // The parent has no comments, globs etc
166 assertEquals("", hw.getDescription());
167 assertEquals("", hw.getExtension());
168 assertEquals(0, hw.getExtensions().size());
169
170 // The file one does
171 assertEquals("A \"Hello World\" file", hwf.getDescription());
172 assertEquals(".hello.world", hwf.getExtension());
173
174 // Check that we can correct detect with the file one:
175 // By name
176 Metadata m = new Metadata();
177 m.add(Metadata.RESOURCE_NAME_KEY, "test.hello.world");
178 assertEquals(hwf.toString(), this.mimeTypes.detect(null, m).toString());
179
180 // By contents
181 m = new Metadata();
182 ByteArrayInputStream s = new ByteArrayInputStream(
183 "Hello, World!".getBytes("ASCII"));
184 assertEquals(hwf.toString(), this.mimeTypes.detect(s, m).toString());
185 } catch (Exception e) {
186 fail(e.getMessage());
187 }
188 }
189
190 @Test
191 public void testGetExtensionForPowerPoint() throws Exception {
192 MimeType mt = this.mimeTypes.forName("application/vnd.ms-powerpoint");
193 String ext = mt.getExtension();
194 assertEquals(".ppt",ext);
195 assertEquals(".ppt",mt.getExtensions().get(0));
196 }
197 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.mime;
17
18 import java.util.List;
19
20 import static org.junit.Assert.assertEquals;
21 import static org.junit.Assert.assertTrue;
22 import static org.junit.Assert.fail;
23
24 import org.junit.Before;
25 import org.junit.Test;
26
27 public class PatternsTest {
28 private MimeTypes fullTypes = MimeTypes.getDefaultMimeTypes();
29
30 private Patterns patterns;
31 private MimeTypes types;
32 private MimeType text;
33
34 @Before
35 public void setUp() throws MimeTypeException {
36 patterns = new Patterns(new MediaTypeRegistry());
37 types = new MimeTypes();
38 text = types.forName("text/plain");
39 }
40
41 /** Test add() */
42 @Test
43 public void testAdd() throws MimeTypeException {
44 try {
45 patterns.add(null, text);
46 fail("Expected IllegalArgumentException");
47 } catch (IllegalArgumentException e) {
48 // expected result
49 }
50 try {
51 patterns.add("", null);
52 fail("Expected IllegalArgumentException");
53 } catch (IllegalArgumentException e) {
54 // expected result
55 }
56 try {
57 patterns.add(null, null);
58 fail("Expected IllegalArgumentException");
59 } catch (IllegalArgumentException e) {
60 // expected result
61 }
62 }
63
64 /** Test matches() */
65 @Test
66 public void testMatches() {
67 try {
68 patterns.matches(null);
69 fail("Expected IllegalArgumentException");
70 } catch (IllegalArgumentException e) {
71 // expected result
72 }
73 }
74
75 @Test
76 public void testExtension() throws MimeTypeException {
77 MimeType doc = types.forName("application/vnd.ms-word");
78 patterns.add("*.doc", doc);
79
80 assertEquals(".doc", doc.getExtension());
81 }
82
83 @Test
84 public void testExtensions() throws Exception{
85 MimeType jpeg = fullTypes.forName("image/jpeg");
86
87 assertEquals(".jpg", jpeg.getExtension());
88
89 List<String> extensions = jpeg.getExtensions();
90 assertTrue(extensions.size() > 1);
91 assertTrue(extensions.contains(".jpg"));
92 assertTrue(extensions.contains(".jpeg"));
93 }
94
95 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertNotNull;
20
21 import java.io.ByteArrayInputStream;
22 import java.util.Arrays;
23 import java.util.Collections;
24 import java.util.HashMap;
25 import java.util.HashSet;
26 import java.util.List;
27 import java.util.Map;
28 import java.util.Set;
29
30 import org.apache.tika.config.TikaConfig;
31 import org.apache.tika.metadata.Metadata;
32 import org.apache.tika.mime.MediaType;
33 import org.apache.tika.mime.MediaTypeRegistry;
34 import org.apache.tika.sax.BodyContentHandler;
35 import org.junit.Test;
36 import org.xml.sax.ContentHandler;
37
38 public class CompositeParserTest {
39
40 @Test
41 public void testFindDuplicateParsers() {
42 Parser a = new EmptyParser() {
43 public Set<MediaType> getSupportedTypes(ParseContext context) {
44 return Collections.singleton(MediaType.TEXT_PLAIN);
45 }
46 };
47 Parser b = new EmptyParser() {
48 public Set<MediaType> getSupportedTypes(ParseContext context) {
49 return Collections.singleton(MediaType.TEXT_PLAIN);
50 }
51 };
52 Parser c = new EmptyParser() {
53 public Set<MediaType> getSupportedTypes(ParseContext context) {
54 return Collections.singleton(MediaType.OCTET_STREAM);
55 }
56 };
57
58 CompositeParser composite = new CompositeParser(
59 MediaTypeRegistry.getDefaultRegistry(), a, b, c);
60 Map<MediaType, List<Parser>> duplicates =
61 composite.findDuplicateParsers(new ParseContext());
62 assertEquals(1, duplicates.size());
63 List<Parser> parsers = duplicates.get(MediaType.TEXT_PLAIN);
64 assertNotNull(parsers);
65 assertEquals(2, parsers.size());
66 assertEquals(a, parsers.get(0));
67 assertEquals(b, parsers.get(1));
68 }
69
70 @Test
71 public void testDefaultParser() throws Exception {
72 TikaConfig config = TikaConfig.getDefaultConfig();
73
74 CompositeParser parser = (CompositeParser) config.getParser();
75
76 // Check it has the full registry
77 assertEquals(config.getMediaTypeRegistry(), parser.getMediaTypeRegistry());
78 }
79
80 @Test
81 public void testMimeTypeAliases() throws Exception {
82 MediaType bmpCanonical = MediaType.image("x-ms-bmp");
83 Map<String,String> bmpCanonicalMetadata = new HashMap<String, String>();
84 bmpCanonicalMetadata.put("BMP", "True");
85 bmpCanonicalMetadata.put("Canonical", "True");
86 Parser bmpCanonicalParser = new DummyParser(
87 new HashSet<MediaType>(Arrays.asList(bmpCanonical)),
88 bmpCanonicalMetadata, null
89 );
90
91 MediaType bmpAlias = MediaType.image("bmp");
92 Map<String,String> bmpAliasMetadata = new HashMap<String, String>();
93 bmpAliasMetadata.put("BMP", "True");
94 bmpAliasMetadata.put("Alias", "True");
95 Parser bmpAliasParser = new DummyParser(
96 new HashSet<MediaType>(Arrays.asList(bmpAlias)),
97 bmpAliasMetadata, null
98 );
99
100 TikaConfig config = TikaConfig.getDefaultConfig();
101 CompositeParser canonical = new CompositeParser(
102 config.getMediaTypeRegistry(), bmpCanonicalParser
103 );
104 CompositeParser alias = new CompositeParser(
105 config.getMediaTypeRegistry(), bmpAliasParser
106 );
107 CompositeParser both = new CompositeParser(
108 config.getMediaTypeRegistry(), bmpCanonicalParser, bmpAliasParser
109 );
110
111 ContentHandler handler = new BodyContentHandler();
112 Metadata metadata;
113
114 // Canonical and Canonical
115 metadata = new Metadata();
116 metadata.add(Metadata.CONTENT_TYPE, bmpCanonical.toString());
117 canonical.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
118 assertEquals("True", metadata.get("BMP"));
119 assertEquals("True", metadata.get("Canonical"));
120
121
122 // Alias and Alias
123 metadata = new Metadata();
124 metadata.add(Metadata.CONTENT_TYPE, bmpAlias.toString());
125 alias.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
126 assertEquals("True", metadata.get("BMP"));
127 assertEquals("True", metadata.get("Alias"));
128
129
130 // Alias type and Canonical parser
131 metadata = new Metadata();
132 metadata.add(Metadata.CONTENT_TYPE, bmpAlias.toString());
133 canonical.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
134 assertEquals("True", metadata.get("BMP"));
135 assertEquals("True", metadata.get("Canonical"));
136
137
138 // Canonical type and Alias parser
139 metadata = new Metadata();
140 metadata.add(Metadata.CONTENT_TYPE, bmpCanonical.toString());
141 alias.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
142 assertEquals("True", metadata.get("BMP"));
143 assertEquals("True", metadata.get("Alias"));
144
145
146 // And when both are there, will go for the last one
147 // to be registered (which is the alias one)
148 metadata = new Metadata();
149 metadata.add(Metadata.CONTENT_TYPE, bmpCanonical.toString());
150 both.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
151 assertEquals("True", metadata.get("BMP"));
152 assertEquals("True", metadata.get("Alias"));
153 }
154 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.util.Map;
21 import java.util.Set;
22 import java.util.Map.Entry;
23
24 import org.apache.tika.exception.TikaException;
25 import org.apache.tika.metadata.Metadata;
26 import org.apache.tika.mime.MediaType;
27 import org.xml.sax.ContentHandler;
28 import org.xml.sax.SAXException;
29
30 /**
31 * A Dummy Parser for use with unit tests.
32 */
33 public class DummyParser extends AbstractParser {
34 private Set<MediaType> types;
35 private Map<String,String> metadata;
36 private String xmlText;
37
38 public DummyParser(Set<MediaType> types, Map<String, String> metadata,
39 String xmlText) {
40 this.types = types;
41 this.metadata = metadata;
42 this.xmlText = xmlText;
43 }
44
45 public Set<MediaType> getSupportedTypes(ParseContext context) {
46 return types;
47 }
48
49 public void parse(InputStream stream, ContentHandler handler,
50 Metadata metadata, ParseContext context) throws IOException,
51 SAXException, TikaException {
52 for (Entry<String,String> m : this.metadata.entrySet()) {
53 metadata.add(m.getKey(), m.getValue());
54 }
55
56 handler.startDocument();
57 if (xmlText != null) {
58 handler.characters(xmlText.toCharArray(), 0, xmlText.length());
59 }
60 handler.endDocument();
61 }
62
63 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax;
17
18 import static org.junit.Assert.assertEquals;
19
20 import java.io.ByteArrayOutputStream;
21 import java.io.OutputStream;
22
23 import org.apache.tika.metadata.Metadata;
24 import org.junit.Test;
25
26 /**
27 * Test cases for the {@link BodyContentHandler} class.
28 */
29 public class BodyContentHandlerTest {
30
31 /**
32 * Test that the conversion to an {@link OutputStream} doesn't leave
33 * characters unflushed in an internal buffer.
34 *
35 * @see <a href="https://issues.apache.org/jira/browse/TIKA-179">TIKA-179</a>
36 */
37 @Test
38 public void testOutputStream() throws Exception {
39 ByteArrayOutputStream buffer = new ByteArrayOutputStream();
40
41 XHTMLContentHandler xhtml = new XHTMLContentHandler(
42 new BodyContentHandler(buffer), new Metadata());
43 xhtml.startDocument();
44 xhtml.element("p", "Test text");
45 xhtml.endDocument();
46
47 assertEquals("Test text\n", buffer.toString());
48 }
49
50 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax;
17
18 import static org.junit.Assert.assertEquals;
19
20 import org.junit.Test;
21 import org.xml.sax.helpers.AttributesImpl;
22
23 /**
24 * Test cases for the {@link LinkContentHandler} class.
25 */
26 public class LinkContentHandlerTest {
27
28 /**
29 * @see <a href="https://issues.apache.org/jira/browse/TIKA-975">TIKA-975</a>
30 */
31 @Test
32 public void testWhitespaceCollapsing() throws Exception {
33 LinkContentHandler linkContentHandler = new LinkContentHandler(true);
34
35 linkContentHandler.startElement(XHTMLContentHandler.XHTML, "a", "", new AttributesImpl());
36 char[] anchorText = {'\n', 'N', 'o', ' ', 'w', 'h', 'i', 't', 'e', '\n', '\t', '\t', 's', 'p', 'a', 'c', 'e'};
37 linkContentHandler.characters(anchorText, 1, anchorText.length - 1);
38 linkContentHandler.endElement(XHTMLContentHandler.XHTML, "a", "");
39
40 assertEquals("No white space", linkContentHandler.getLinks().get(0).getText());
41 }
42
43 /**
44 * @see <a href="https://issues.apache.org/jira/browse/TIKA-975">TIKA-975</a>
45 */
46 @Test
47 public void testDefaultBehavior() throws Exception {
48 LinkContentHandler linkContentHandler = new LinkContentHandler();
49
50 linkContentHandler.startElement(XHTMLContentHandler.XHTML, "a", "", new AttributesImpl());
51 char[] anchorText = {' ', 'a', 'n', 'c', 'h', 'o', 'r', ' '};
52 linkContentHandler.characters(anchorText, 0, anchorText.length);
53 linkContentHandler.endElement(XHTMLContentHandler.XHTML, "a", "");
54
55 assertEquals(" anchor ", linkContentHandler.getLinks().get(0).getText());
56 }
57
58 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax;
17
18 import static org.junit.Assert.fail;
19
20 import java.io.StringReader;
21 import java.net.ConnectException;
22
23 import javax.xml.parsers.SAXParser;
24 import javax.xml.parsers.SAXParserFactory;
25
26 import org.junit.Before;
27 import org.junit.Test;
28 import org.xml.sax.InputSource;
29 import org.xml.sax.helpers.DefaultHandler;
30
31 /**
32 * Unit tests for the {@link OfflineContentHandler} class.
33 */
34 public class OfflineContentHandlerTest {
35
36 private SAXParser parser;
37
38 private DefaultHandler offline;
39
40 @Before
41 public void setUp() throws Exception {
42 parser = SAXParserFactory.newInstance().newSAXParser();
43 offline = new OfflineContentHandler(new DefaultHandler());
44 }
45
46 @Test
47 public void testExternalDTD() throws Exception {
48 String xml =
49 "<!DOCTYPE foo SYSTEM \"http://127.234.172.38:7845/bar\"><foo/>";
50 try {
51 parser.parse(new InputSource(new StringReader(xml)), offline);
52 } catch (ConnectException e) {
53 fail("Parser tried to access the external DTD:" + e);
54 }
55 }
56
57 @Test
58 public void testExternalEntity() throws Exception {
59 String xml =
60 "<!DOCTYPE foo ["
61 + " <!ENTITY bar SYSTEM \"http://127.234.172.38:7845/bar\">"
62 + " ]><foo>&bar;</foo>";
63 try {
64 parser.parse(new InputSource(new StringReader(xml)), offline);
65 } catch (ConnectException e) {
66 fail("Parser tried to access the external DTD:" + e);
67 }
68 }
69
70 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax;
17
18 import static org.junit.Assert.assertEquals;
19
20 import org.junit.Before;
21 import org.junit.Test;
22 import org.xml.sax.ContentHandler;
23 import org.xml.sax.SAXException;
24
25 /**
26 * Unit tests for the {@link SafeContentHandler} class.
27 */
28 public class SafeContentHandlerTest {
29
30 private ContentHandler output;
31
32 private ContentHandler safe;
33
34 @Before
35 public void setUp() {
36 output = new WriteOutContentHandler();
37 safe = new SafeContentHandler(output);
38 }
39
40 @Test
41 public void testEmptyInput() throws SAXException {
42 safe.characters(new char[0], 0, 0);
43 safe.ignorableWhitespace(new char[0], 0, 0);
44 assertEquals("", output.toString());
45 }
46
47 @Test
48 public void testNormalCharacters() throws SAXException {
49 safe.characters("abc".toCharArray(), 0, 3);
50 assertEquals("abc", output.toString());
51 }
52
53 @Test
54 public void testNormalWhitespace() throws SAXException {
55 safe.ignorableWhitespace("abc".toCharArray(), 0, 3);
56 assertEquals("abc", output.toString());
57 }
58
59 @Test
60 public void testInvalidCharacters() throws SAXException {
61 safe.characters("ab\u0007".toCharArray(), 0, 3);
62 safe.characters("a\u000Bc".toCharArray(), 0, 3);
63 safe.characters("\u0019bc".toCharArray(), 0, 3);
64 assertEquals("ab\ufffda\ufffdc\ufffdbc", output.toString());
65 }
66
67 @Test
68 public void testInvalidWhitespace() throws SAXException {
69 safe.ignorableWhitespace("ab\u0000".toCharArray(), 0, 3);
70 safe.ignorableWhitespace("a\u0001c".toCharArray(), 0, 3);
71 safe.ignorableWhitespace("\u0002bc".toCharArray(), 0, 3);
72 assertEquals("ab\ufffda\ufffdc\ufffdbc", output.toString());
73 }
74
75 @Test
76 public void testInvalidSurrogates() throws SAXException {
77 safe.ignorableWhitespace("\udb00\ubfff".toCharArray(), 0, 2);
78 assertEquals("\ufffd\ubfff", output.toString());
79 }
80
81 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax;
17
18 import static org.junit.Assert.fail;
19
20 import java.io.IOException;
21
22 import org.apache.tika.exception.TikaException;
23 import org.apache.tika.io.NullInputStream;
24 import org.apache.tika.io.TikaInputStream;
25 import org.junit.Before;
26 import org.junit.Test;
27 import org.xml.sax.SAXException;
28 import org.xml.sax.helpers.AttributesImpl;
29 import org.xml.sax.helpers.DefaultHandler;
30
31 /**
32 * Tests for the {@link SecureContentHandler} class.
33 */
34 public class SecureContentHandlerTest {
35
36 private static final int MANY_BYTES = 2000000;
37
38 private TikaInputStream stream;
39
40 private SecureContentHandler handler;
41
42 @Before
43 public void setUp() {
44 stream = TikaInputStream.get(new NullInputStream(MANY_BYTES));
45 handler = new SecureContentHandler(new DefaultHandler(), stream);
46 }
47
48 @Test
49 public void testZeroCharactersPerByte() throws IOException {
50 try {
51 char[] ch = new char[] { 'x' };
52 for (int i = 0; i < MANY_BYTES; i++) {
53 stream.read();
54 }
55 handler.characters(ch, 0, 1);
56 } catch (SAXException e) {
57 fail("Unexpected SAXException");
58 }
59 }
60
61 @Test
62 public void testOneCharacterPerByte() throws IOException {
63 try {
64 char[] ch = new char[1];
65 for (int i = 0; i < MANY_BYTES; i++) {
66 stream.read();
67 handler.characters(ch, 0, ch.length);
68 }
69 } catch (SAXException e) {
70 fail("Unexpected SAXException");
71 }
72 }
73
74 @Test
75 public void testTenCharactersPerByte() throws IOException {
76 try {
77 char[] ch = new char[10];
78 for (int i = 0; i < MANY_BYTES; i++) {
79 stream.read();
80 handler.characters(ch, 0, ch.length);
81 }
82 } catch (SAXException e) {
83 fail("Unexpected SAXException");
84 }
85 }
86
87 @Test
88 public void testManyCharactersPerByte() throws IOException {
89 try {
90 char[] ch = new char[1000];
91 for (int i = 0; i < MANY_BYTES; i++) {
92 stream.read();
93 handler.characters(ch, 0, ch.length);
94 }
95 fail("Expected SAXException not thrown");
96 } catch (SAXException e) {
97 // expected
98 }
99 }
100
101 @Test
102 public void testSomeCharactersWithoutInput() throws IOException {
103 try {
104 char[] ch = new char[100];
105 for (int i = 0; i < 100; i++) {
106 handler.characters(ch, 0, ch.length);
107 }
108 } catch (SAXException e) {
109 fail("Unexpected SAXException");
110 }
111 }
112
113 @Test
114 public void testManyCharactersWithoutInput() throws IOException {
115 try {
116 char[] ch = new char[100];
117 for (int i = 0; i < 20000; i++) {
118 handler.characters(ch, 0, ch.length);
119 }
120 fail("Expected SAXException not thrown");
121 } catch (SAXException e) {
122 // expected
123 }
124 }
125
126 @Test
127 public void testNestedElements() throws SAXException {
128 for (int i = 1; i < handler.getMaximumDepth(); i++) {
129 handler.startElement("", "x", "x", new AttributesImpl());
130 }
131 try {
132 handler.startElement("", "x", "x", new AttributesImpl());
133 fail("Nested XML element limit exceeded");
134 } catch (SAXException e) {
135 try {
136 handler.throwIfCauseOf(e);
137 throw e;
138 } catch (TikaException expected) {
139 }
140 }
141 }
142
143 @Test
144 public void testNestedEntries() throws SAXException {
145 AttributesImpl atts = new AttributesImpl();
146 atts.addAttribute("", "class", "class", "CDATA", "package-entry");
147 for (int i = 1; i < handler.getMaximumPackageEntryDepth(); i++) {
148 handler.startElement("", "div", "div", atts);
149 }
150 try {
151 handler.startElement("", "div", "div", atts);
152 fail("Nested XML element limit exceeded");
153 } catch (SAXException e) {
154 try {
155 handler.throwIfCauseOf(e);
156 throw e;
157 } catch (TikaException expected) {
158 }
159 }
160 }
161
162 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax;
17
18 import static org.junit.Assert.assertEquals;
19 import org.junit.Test;
20 import org.xml.sax.ContentHandler;
21 import org.xml.sax.helpers.AttributesImpl;
22
23 public class SerializerTest {
24
25 @Test
26 public void testToTextContentHandler() throws Exception {
27 assertStartDocument("", new ToTextContentHandler());
28 assertCharacters("content", new ToTextContentHandler());
29 assertCharacterEscaping("<&\">", new ToTextContentHandler());
30 assertIgnorableWhitespace(" \t\r\n", new ToTextContentHandler());
31 assertEmptyElement("", new ToTextContentHandler());
32 assertEmptyElementWithAttributes("", new ToTextContentHandler());
33 assertEmptyElementWithAttributeEscaping("", new ToTextContentHandler());
34 assertElement("content", new ToTextContentHandler());
35 assertElementWithAttributes("content", new ToTextContentHandler());
36 }
37
38 @Test
39 public void testToXMLContentHandler() throws Exception {
40 assertStartDocument("", new ToXMLContentHandler());
41 assertStartDocument(
42 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n",
43 new ToXMLContentHandler("UTF-8"));
44 assertCharacters("content", new ToXMLContentHandler());
45 assertCharacterEscaping("&lt;&amp;\"&gt;", new ToXMLContentHandler());
46 assertIgnorableWhitespace(" \t\r\n", new ToXMLContentHandler());
47 assertEmptyElement("<br />", new ToXMLContentHandler());
48 assertEmptyElementWithAttributes(
49 "<meta name=\"foo\" value=\"bar\" />",
50 new ToXMLContentHandler());
51 assertEmptyElementWithAttributeEscaping(
52 "<p class=\"&lt;&amp;&quot;&gt;\" />",
53 new ToXMLContentHandler());
54 assertElement("<p>content</p>", new ToXMLContentHandler());
55 assertElementWithAttributes(
56 "<p class=\"test\">content</p>",
57 new ToXMLContentHandler());
58 }
59
60 @Test
61 public void testToHTMLContentHandler() throws Exception {
62 assertStartDocument("", new ToHTMLContentHandler());
63 assertCharacters("content", new ToHTMLContentHandler());
64 assertCharacterEscaping("&lt;&amp;\"&gt;", new ToHTMLContentHandler());
65 assertIgnorableWhitespace(" \t\r\n", new ToHTMLContentHandler());
66 assertEmptyElement("<br>", new ToHTMLContentHandler());
67 assertEmptyElementWithAttributes(
68 "<meta name=\"foo\" value=\"bar\">",
69 new ToHTMLContentHandler());
70 assertEmptyElementWithAttributeEscaping(
71 "<p class=\"&lt;&amp;&quot;&gt;\"></p>",
72 new ToHTMLContentHandler());
73 assertElement("<p>content</p>", new ToHTMLContentHandler());
74 assertElementWithAttributes(
75 "<p class=\"test\">content</p>",
76 new ToHTMLContentHandler());
77 }
78
79 private void assertStartDocument(String expected, ContentHandler handler)
80 throws Exception {
81 handler.startDocument();
82 assertEquals(expected, handler.toString());
83 }
84
85 private void assertCharacters(String expected, ContentHandler handler)
86 throws Exception {
87 handler.characters("content".toCharArray(), 0, 7);
88 assertEquals(expected, handler.toString());
89 }
90
91 private void assertCharacterEscaping(
92 String expected, ContentHandler handler) throws Exception {
93 handler.characters("<&\">".toCharArray(), 0, 4);
94 assertEquals(expected, handler.toString());
95 }
96
97 private void assertIgnorableWhitespace(
98 String expected, ContentHandler handler) throws Exception {
99 handler.ignorableWhitespace(" \t\r\n".toCharArray(), 0, 4);
100 assertEquals(expected, handler.toString());
101 }
102
103 private void assertEmptyElement(String expected, ContentHandler handler)
104 throws Exception {
105 AttributesImpl attributes = new AttributesImpl();
106 handler.startElement("", "br", "br", attributes);
107 handler.endElement("", "br", "br");
108 assertEquals(expected, handler.toString());
109 }
110
111 private void assertEmptyElementWithAttributes(
112 String expected, ContentHandler handler) throws Exception {
113 AttributesImpl attributes = new AttributesImpl();
114 attributes.addAttribute("", "name", "name", "CDATA", "foo");
115 attributes.addAttribute("", "value", "value", "CDATA", "bar");
116 handler.startElement("", "meta", "meta", attributes);
117 handler.endElement("", "meta", "meta");
118 assertEquals(expected, handler.toString());
119 }
120
121 private void assertEmptyElementWithAttributeEscaping(
122 String expected, ContentHandler handler) throws Exception {
123 AttributesImpl attributes = new AttributesImpl();
124 attributes.addAttribute("", "class", "class", "CDATA", "<&\">");
125 handler.startElement("", "p", "p", attributes);
126 handler.endElement("", "p", "p");
127 assertEquals(expected, handler.toString());
128 }
129
130 private void assertElement(
131 String expected, ContentHandler handler) throws Exception {
132 AttributesImpl attributes = new AttributesImpl();
133 handler.startElement("", "p", "p", attributes);
134 handler.characters("content".toCharArray(), 0, 7);
135 handler.endElement("", "p", "p");
136 assertEquals(expected, handler.toString());
137 }
138
139 private void assertElementWithAttributes(
140 String expected, ContentHandler handler) throws Exception {
141 AttributesImpl attributes = new AttributesImpl();
142 attributes.addAttribute("", "class", "class", "CDATA", "test");
143 handler.startElement("", "p", "p", attributes);
144 handler.characters("content".toCharArray(), 0, 7);
145 handler.endElement("", "p", "p");
146 assertEquals(expected, handler.toString());
147 }
148
149 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax;
17
18 import static org.junit.Assert.assertEquals;
19
20 import java.util.ArrayList;
21 import java.util.List;
22
23 import org.apache.tika.metadata.Metadata;
24
25 import org.junit.Before;
26 import org.junit.Test;
27
28 import org.xml.sax.ContentHandler;
29 import org.xml.sax.SAXException;
30
31 /**
32 * Unit tests for the {@link XHTMLContentHandler} class.
33 */
34 public class XHTMLContentHandlerTest {
35
36 private ContentHandler output;
37
38 private XHTMLContentHandler xhtml;
39
40 @Before
41 public void setUp() {
42 output = new BodyContentHandler();
43 xhtml = new XHTMLContentHandler(output, new Metadata());
44 }
45
46 /**
47 * Test that content in block elements are properly separated in text
48 * output.
49 *
50 * @see <a href="https://issues.apache.org/jira/browse/TIKA-188">TIKA-188</a>
51 */
52 @Test
53 public void testExtraWhitespace() throws SAXException {
54 xhtml.startDocument();
55
56 xhtml.element("p", "foo");
57 xhtml.startElement("p");
58 xhtml.characters("b");
59 xhtml.element("b", "a"); // inlines should not cause extra whitespace
60 xhtml.characters("r");
61 xhtml.endElement("p");
62
63 xhtml.startElement("table");
64 xhtml.startElement("tr");
65 xhtml.element("th", "x");
66 xhtml.element("th", "y");
67 xhtml.endElement("tr");
68 xhtml.startElement("tr");
69 xhtml.element("td", "a");
70 xhtml.element("td", "b");
71 xhtml.endElement("tr");
72 xhtml.endElement("table");
73 xhtml.endDocument();
74
75 String[] words = output.toString().split("\\s+");
76 assertEquals(6, words.length);
77 assertEquals("foo", words[0]);
78 assertEquals("bar", words[1]);
79 assertEquals("x", words[2]);
80 assertEquals("y", words[3]);
81 assertEquals("a", words[4]);
82 assertEquals("b", words[5]);
83 }
84
85 /**
86 * Test that content in option elements are properly separated in text
87 * output.
88 *
89 * @see <a href="https://issues.apache.org/jira/browse/TIKA-394">TIKA-394</a>
90 */
91 @Test
92 public void testWhitespaceWithOptions() throws Exception {
93 xhtml.startDocument();
94 xhtml.startElement("form");
95 xhtml.startElement("select");
96 xhtml.element("option", "opt1");
97 xhtml.element("option", "opt2");
98 xhtml.endElement("select");
99 xhtml.endElement("form");
100 xhtml.endDocument();
101
102 String[] words = output.toString().split("\\s+");
103 assertEquals(2, words.length);
104 assertEquals("opt1", words[0]);
105 assertEquals("opt2", words[1]);
106 }
107
108 @Test
109 public void testWhitespaceWithMenus() throws Exception {
110 xhtml.startDocument();
111 xhtml.startElement("menu");
112 xhtml.element("li", "one");
113 xhtml.element("li", "two");
114 xhtml.endElement("menu");
115 xhtml.endDocument();
116
117 String[] words = getRealWords(output.toString());
118 assertEquals(2, words.length);
119 assertEquals("one", words[0]);
120 assertEquals("two", words[1]);
121 }
122
123 /**
124 * Return array of non-zerolength words. Splitting on whitespace will get us
125 * empty words for emptylines.
126 *
127 * @param string some mix of newlines and real words
128 * @return array of real words.
129 */
130 private static String[] getRealWords(String string) {
131 String[] possibleWords = string.split("\\s+");
132 List<String> words = new ArrayList<String>(possibleWords.length);
133 for (String word : possibleWords) {
134 if (word.length() > 0) {
135 words.add(word);
136 }
137 }
138
139 return words.toArray(new String[words.size()]);
140 }
141
142 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax.xpath;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertFalse;
20 import static org.junit.Assert.assertTrue;
21
22 import org.junit.Before;
23 import org.junit.Test;
24
25 public class XPathParserTest {
26
27 private static final String NS = "test namespace";
28
29 private XPathParser parser;
30
31 @Before
32 public void setUp() {
33 parser = new XPathParser();
34 parser.addPrefix(null, null);
35 parser.addPrefix("prefix", NS);
36 }
37
38 @Test
39 public void testText() {
40 Matcher matcher = parser.parse("/text()");
41 assertTrue(matcher.matchesText());
42 assertFalse(matcher.matchesElement());
43 assertFalse(matcher.matchesAttribute(NS, "name"));
44 assertEquals(Matcher.FAIL, matcher.descend(NS, "name"));
45 }
46
47 @Test
48 public void testAnyAttribute() {
49 Matcher matcher = parser.parse("/@*");
50 assertFalse(matcher.matchesText());
51 assertFalse(matcher.matchesElement());
52 assertTrue(matcher.matchesAttribute(null, "name"));
53 assertTrue(matcher.matchesAttribute(NS, "name"));
54 assertTrue(matcher.matchesAttribute(NS, "eman"));
55 assertEquals(Matcher.FAIL, matcher.descend(NS, "name"));
56 }
57
58 @Test
59 public void testNamedAttribute() {
60 Matcher matcher = parser.parse("/@name");
61 assertFalse(matcher.matchesText());
62 assertFalse(matcher.matchesElement());
63 assertTrue(matcher.matchesAttribute(null, "name"));
64 assertFalse(matcher.matchesAttribute(NS, "name"));
65 assertFalse(matcher.matchesAttribute(NS, "eman"));
66 assertEquals(Matcher.FAIL, matcher.descend(NS, "name"));
67 }
68
69 @Test
70 public void testPrefixedAttribute() {
71 Matcher matcher = parser.parse("/@prefix:name");
72 assertFalse(matcher.matchesText());
73 assertFalse(matcher.matchesElement());
74 assertFalse(matcher.matchesAttribute(null, "name"));
75 assertTrue(matcher.matchesAttribute(NS, "name"));
76 assertFalse(matcher.matchesAttribute(NS, "eman"));
77 assertEquals(Matcher.FAIL, matcher.descend(NS, "name"));
78 }
79
80 @Test
81 public void testAnyElement() {
82 Matcher matcher = parser.parse("/*");
83 assertFalse(matcher.matchesText());
84 assertFalse(matcher.matchesElement());
85 assertFalse(matcher.matchesAttribute(null, "name"));
86 assertFalse(matcher.matchesAttribute(NS, "name"));
87 assertFalse(matcher.matchesAttribute(NS, "eman"));
88 matcher = matcher.descend(NS, "name");
89 assertFalse(matcher.matchesText());
90 assertTrue(matcher.matchesElement());
91 assertFalse(matcher.matchesAttribute(null, "name"));
92 assertFalse(matcher.matchesAttribute(NS, "name"));
93 assertFalse(matcher.matchesAttribute(NS, "eman"));
94 assertEquals(Matcher.FAIL, matcher.descend(NS, "name"));
95 }
96
97 @Test
98 public void testNamedElement() {
99 Matcher matcher = parser.parse("/name");
100 assertFalse(matcher.matchesText());
101 assertFalse(matcher.matchesElement());
102 assertFalse(matcher.matchesAttribute(null, "name"));
103 assertFalse(matcher.matchesAttribute(NS, "name"));
104 assertFalse(matcher.matchesAttribute(NS, "eman"));
105 assertEquals(Matcher.FAIL, matcher.descend(NS, "name"));
106 assertEquals(Matcher.FAIL, matcher.descend(null, "enam"));
107 matcher = matcher.descend(null, "name");
108 assertFalse(matcher.matchesText());
109 assertTrue(matcher.matchesElement());
110 assertFalse(matcher.matchesAttribute(null, "name"));
111 assertFalse(matcher.matchesAttribute(NS, "name"));
112 assertFalse(matcher.matchesAttribute(NS, "eman"));
113 }
114
115 @Test
116 public void testPrefixedElement() {
117 Matcher matcher = parser.parse("/prefix:name");
118 assertFalse(matcher.matchesText());
119 assertFalse(matcher.matchesElement());
120 assertFalse(matcher.matchesAttribute(null, "name"));
121 assertFalse(matcher.matchesAttribute(NS, "name"));
122 assertFalse(matcher.matchesAttribute(NS, "eman"));
123 assertEquals(Matcher.FAIL, matcher.descend(null, "name"));
124 assertEquals(Matcher.FAIL, matcher.descend(NS, "enam"));
125 matcher = matcher.descend(NS, "name");
126 assertFalse(matcher.matchesText());
127 assertTrue(matcher.matchesElement());
128 assertFalse(matcher.matchesAttribute(null, "name"));
129 assertFalse(matcher.matchesAttribute(NS, "name"));
130 assertFalse(matcher.matchesAttribute(NS, "eman"));
131 }
132
133 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.utils;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertFalse;
20 import static org.junit.Assert.assertTrue;
21
22 import org.junit.Test;
23
24 public class CharsetUtilsTest {
25
26 @Test
27 public void testInvalidCharset() {
28 assertFalse(CharsetUtils.isSupported(" utf-8"));
29 assertFalse(CharsetUtils.isSupported("my charset name"));
30 assertFalse(CharsetUtils.isSupported("charset1; charset2"));
31 assertFalse(CharsetUtils.isSupported(null));
32 assertFalse(CharsetUtils.isSupported(""));
33 }
34
35 @Test
36 public void testValidCharset() {
37 assertTrue(CharsetUtils.isSupported("UTF-8"));
38 assertFalse(CharsetUtils.isSupported("bogus"));
39 }
40
41 @Test
42 public void testCleaningCharsetName() {
43 assertEquals("UTF-8", CharsetUtils.clean("utf-8"));
44 assertEquals(null, CharsetUtils.clean(""));
45 assertEquals(null, CharsetUtils.clean(null));
46 assertEquals("US-ASCII", CharsetUtils.clean(" us-ascii "));
47 assertEquals("UTF-8", CharsetUtils.clean("\"utf-8\""));
48 assertEquals("ISO-8859-1", CharsetUtils.clean("ISO-8859-1, latin1"));
49 }
50
51 @Test
52 public void testFunkyNames() {
53 assertEquals(null, CharsetUtils.clean("none"));
54 assertEquals(null, CharsetUtils.clean("no"));
55
56 assertEquals("UTF-8", CharsetUtils.clean("utf-8>"));
57
58 assertEquals("ISO-8859-1", CharsetUtils.clean("iso-8851-1"));
59 assertEquals("ISO-8859-15", CharsetUtils.clean("8859-15"));
60
61 assertEquals("windows-1251", CharsetUtils.clean("cp-1251"));
62 assertEquals("windows-1251", CharsetUtils.clean("win1251"));
63 assertEquals("windows-1251", CharsetUtils.clean("WIN-1251"));
64 assertEquals("windows-1251", CharsetUtils.clean("win-1251"));
65 assertEquals("windows-1252", CharsetUtils.clean("Windows"));
66
67 assertEquals("KOI8-R", CharsetUtils.clean("koi8r"));
68 }
69
70 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.utils;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertNotNull;
20 import static org.junit.Assert.assertTrue;
21
22 import java.util.List;
23
24 import org.junit.Test;
25
26 /**
27 * Test case for {@link RegexUtils}.
28 *
29 * @version $Revision$ $Date$
30 */
31 public class RegexUtilsTest {
32
33 /**
34 * Test {@link RegexUtils#extractLinks(String)} with no links.
35 */
36
37 @Test
38 public void testExtractLinksNone() {
39 List<String> links = null;
40
41 links = RegexUtils.extractLinks(null);
42 assertNotNull(links);
43 assertEquals(0, links.size());
44
45 links = RegexUtils.extractLinks("");
46 assertNotNull(links);
47 assertEquals(0, links.size());
48
49 links = RegexUtils.extractLinks(
50 "Test with no links " +
51 "What about www.google.com");
52 assertNotNull(links);
53 assertEquals(0, links.size());
54 }
55
56
57 /**
58 * Test {@link RegexUtils#extractLinks(String)} for http.
59 */
60 @Test
61 public void testExtractLinksHttp() {
62 List<String> links = RegexUtils.extractLinks(
63 "Test with http://www.nutch.org/index.html is it found? " +
64 "What about www.google.com at http://www.google.de " +
65 "A longer URL could be http://www.sybit.com/solutions/portals.html");
66
67 assertTrue("Url not found!", links.size() == 3);
68 assertEquals("Wrong URL", "http://www.nutch.org/index.html", links.get(0));
69 assertEquals("Wrong URL", "http://www.google.de", links.get(1));
70 assertEquals("Wrong URL", "http://www.sybit.com/solutions/portals.html", links.get(2));
71 }
72
73 /**
74 * Test {@link RegexUtils#extractLinks(String)} for ftp.
75 */
76 @Test
77 public void testExtractLinksFtp() {
78 List<String> links = RegexUtils.extractLinks(
79 "Test with ftp://www.nutch.org is it found? " +
80 "What about www.google.com at ftp://www.google.de");
81
82 assertTrue("Url not found!", links.size() == 2);
83 assertEquals("Wrong URL", "ftp://www.nutch.org", links.get(0));
84 assertEquals("Wrong URL", "ftp://www.google.de", links.get(1));
85 }
86 }
0 <?xml version="1.0" encoding="UTF-8"?>
1
2 <!--
3 Licensed to the Apache Software Foundation (ASF) under one
4 or more contributor license agreements. See the NOTICE file
5 distributed with this work for additional information
6 regarding copyright ownership. The ASF licenses this file
7 to you under the Apache License, Version 2.0 (the
8 "License"); you may not use this file except in compliance
9 with the License. You may obtain a copy of the License at
10
11 http://www.apache.org/licenses/LICENSE-2.0
12
13 Unless required by applicable law or agreed to in writing,
14 software distributed under the License is distributed on an
15 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 KIND, either express or implied. See the License for the
17 specific language governing permissions and limitations
18 under the License.
19 -->
20
21 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
22 <modelVersion>4.0.0</modelVersion>
23
24 <parent>
25 <groupId>org.apache.tika</groupId>
26 <artifactId>tika-parent</artifactId>
27 <version>1.5</version>
28 <relativePath>../tika-parent/pom.xml</relativePath>
29 </parent>
30
31 <artifactId>tika-dotnet</artifactId>
32 <name>Apache Tika for .NET</name>
33 <url>http://tika.apache.org/</url>
34
35 <properties>
36 <ikvm>C:\ikvm-7.0.4335.0</ikvm>
37 <mscorlib.jar>${ikvm}\mscorlib.jar</mscorlib.jar>
38 <System.jar>${ikvm}\System.jar</System.jar>
39 </properties>
40
41 <dependencies>
42 <dependency>
43 <groupId>org.apache.tika</groupId>
44 <artifactId>tika-app</artifactId>
45 <version>${project.version}</version>
46 </dependency>
47 <dependency>
48 <groupId>ikvm</groupId>
49 <artifactId>mscorlib</artifactId>
50 <version>2.0</version>
51 <scope>system</scope>
52 <systemPath>${mscorlib.jar}</systemPath>
53 </dependency>
54 <dependency>
55 <groupId>ikvm</groupId>
56 <artifactId>System</artifactId>
57 <version>2.0</version>
58 <scope>system</scope>
59 <systemPath>${System.jar}</systemPath>
60 </dependency>
61 </dependencies>
62
63 <build>
64 <plugins>
65 <plugin>
66 <artifactId>maven-dependency-plugin</artifactId>
67 <version>2.4</version>
68 <executions>
69 <execution>
70 <id>copy-dependencies</id>
71 <phase>package</phase>
72 <goals>
73 <goal>copy-dependencies</goal>
74 </goals>
75 <configuration>
76 <stripVersion>true</stripVersion>
77 <excludeTransitive>true</excludeTransitive>
78 <excludeScope>system</excludeScope>
79 </configuration>
80 </execution>
81 </executions>
82 </plugin>
83 <plugin>
84 <artifactId>maven-antrun-plugin</artifactId>
85 <executions>
86 <execution>
87 <phase>package</phase>
88 <goals>
89 <goal>run</goal>
90 </goals>
91 <configuration>
92 <target>
93 <exec executable="${ikvm}/bin/ikvmc.exe">
94 <arg value="-nowarn:0100" />
95 <arg value="-nowarn:0105" />
96 <arg value="-nowarn:0109" />
97 <arg value="-nowarn:0111" />
98 <arg value="-nowarn:0112" />
99 <arg value="-reference:${ikvm}/bin/IKVM.OpenJDK.Charsets.dll" />
100 <arg value="-reference:${ikvm}/bin/IKVM.OpenJDK.Core.dll" />
101 <arg value="-reference:${ikvm}/bin/IKVM.OpenJDK.Text.dll" />
102 <arg value="-reference:${ikvm}/bin/IKVM.OpenJDK.Util.dll" />
103 <arg value="-reference:${ikvm}/bin/IKVM.OpenJDK.XML.API.dll" />
104 <arg value="-reference:${ikvm}/bin/IKVM.OpenJDK.XML.Transform.dll" />
105 <arg value="-target:library" />
106 <arg value="-compressresources" />
107 <arg value="-out:${project.build.directory}/${project.build.finalName}.dll" />
108 <arg value="-recurse:${project.build.directory}\*.class" />
109 <arg value="${project.build.directory}/dependency/tika-app.jar" />
110 </exec>
111 </target>
112 </configuration>
113 </execution>
114 </executions>
115 </plugin>
<plugin>
  <!-- Attaches the DLL produced by the ikvmc step as an additional project artifact -->
  <groupId>org.codehaus.mojo</groupId>
  <artifactId>build-helper-maven-plugin</artifactId>
  <version>1.7</version>
  <executions>
    <execution>
      <phase>package</phase>
      <goals>
        <goal>attach-artifact</goal>
      </goals>
      <configuration>
        <artifacts>
          <!-- Each entry of the list is a single <artifact> element -->
          <artifact>
            <file>${project.build.directory}/${project.build.finalName}.dll</file>
            <type>dll</type>
          </artifact>
        </artifacts>
      </configuration>
    </execution>
  </executions>
</plugin>
137 </plugins>
138 <pluginManagement>
139 <plugins>
140 <!-- This plugin's configuration is used to store Eclipse m2e settings
141 only. It has no influence on the Maven build itself. -->
142 <plugin>
143 <groupId>org.eclipse.m2e</groupId>
144 <artifactId>lifecycle-mapping</artifactId>
145 <version>1.0.0</version>
146 <configuration>
147 <lifecycleMappingMetadata>
148 <pluginExecutions>
149 <pluginExecution>
150 <pluginExecutionFilter>
151 <groupId>org.apache.maven.plugins</groupId>
152 <artifactId>maven-dependency-plugin</artifactId>
153 <versionRange>[2.4,)</versionRange>
154 <goals>
155 <goal>copy-dependencies</goal>
156 </goals>
157 </pluginExecutionFilter>
158 <action>
159 <ignore></ignore>
160 </action>
161 </pluginExecution>
162 </pluginExecutions>
163 </lifecycleMappingMetadata>
164 </configuration>
165 </plugin>
166 </plugins>
167 </pluginManagement>
168 </build>
169
170 <description>A .NET port of Tika functionality.</description>
171 <organization>
172 <name>The Apache Software Foundation</name>
173 <url>http://www.apache.org</url>
174 </organization>
175 <scm>
176 <url>http://svn.apache.org/viewvc/tika/tags/1.5/tika-dotnet</url>
177 <connection>scm:svn:http://svn.apache.org/repos/asf/tika/tags/1.5/tika-dotnet</connection>
178 <developerConnection>scm:svn:https://svn.apache.org/repos/asf/tika/tags/1.5/tika-dotnet</developerConnection>
179 </scm>
180 <issueManagement>
181 <system>JIRA</system>
182 <url>https://issues.apache.org/jira/browse/TIKA</url>
183 </issueManagement>
184 <ciManagement>
185 <system>Jenkins</system>
186 <url>https://builds.apache.org/job/Tika-trunk/</url>
187 </ciManagement>
188 </project>
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package Tika;
17
18 import java.io.File;
19 import java.io.IOException;
20 import java.net.URL;
21
22 import org.apache.tika.exception.TikaException;
23
24 public class Tika {
25
26 private final org.apache.tika.Tika tika = new org.apache.tika.Tika();
27
28 public cli.System.String detect(cli.System.String name) {
29 return toCliString(tika.detect(toJvmString(name)));
30 }
31
32 public cli.System.String detect(cli.System.IO.FileInfo file)
33 throws cli.System.IO.IOException {
34 try {
35 return toCliString(tika.detect(new File(file.get_FullName())));
36 } catch (IOException e) {
37 throw new cli.System.IO.IOException(e.getMessage(), e);
38 }
39 }
40
41 public cli.System.String detect(cli.System.Uri uri)
42 throws cli.System.IO.IOException {
43 try {
44 return toCliString(tika.detect(new URL(uri.get_AbsolutePath())));
45 } catch (IOException e) {
46 throw new cli.System.IO.IOException(e.getMessage(), e);
47 }
48 }
49
50 public cli.System.String parseToString(cli.System.IO.FileInfo file)
51 throws cli.System.IO.IOException, TikaException {
52 try {
53 return toCliString(tika.parseToString(new File(file.get_FullName())));
54 } catch (IOException e) {
55 throw new cli.System.IO.IOException(e.getMessage(), e);
56 }
57 }
58
59 public cli.System.String parseToString(cli.System.Uri uri)
60 throws cli.System.IO.IOException, TikaException {
61 try {
62 return toCliString(tika.parseToString(new URL(uri.get_AbsoluteUri())));
63 } catch (IOException e) {
64 throw new cli.System.IO.IOException(e.getMessage(), e);
65 }
66 }
67
68 private static cli.System.String toCliString(String string) {
69 return new cli.System.String(string.toCharArray());
70 }
71
72 private static String toJvmString(cli.System.String string) {
73 return new String(string.ToCharArray());
74 }
75
76 }
0 <?xml version="1.0" encoding="UTF-8"?>
1
2 <!--
3 Licensed to the Apache Software Foundation (ASF) under one
4 or more contributor license agreements. See the NOTICE file
5 distributed with this work for additional information
6 regarding copyright ownership. The ASF licenses this file
7 to you under the Apache License, Version 2.0 (the
8 "License"); you may not use this file except in compliance
9 with the License. You may obtain a copy of the License at
10
11 http://www.apache.org/licenses/LICENSE-2.0
12
13 Unless required by applicable law or agreed to in writing,
14 software distributed under the License is distributed on an
15 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 KIND, either express or implied. See the License for the
17 specific language governing permissions and limitations
18 under the License.
19 -->
20
21 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
22 <modelVersion>4.0.0</modelVersion>
23
24 <parent>
25 <groupId>org.apache.tika</groupId>
26 <artifactId>tika-parent</artifactId>
27 <version>1.5</version>
28 <relativePath>../tika-parent/pom.xml</relativePath>
29 </parent>
30
31 <artifactId>tika-java7</artifactId>
32 <packaging>bundle</packaging>
33
34 <name>Apache Tika Java-7 Components</name>
35 <description>Java-7 reliant components, including FileTypeDetector implementations</description>
36
37 <build>
38 <plugins>
39 <plugin>
40 <groupId>org.apache.felix</groupId>
41 <artifactId>maven-scr-plugin</artifactId>
42 <version>1.7.4</version>
43 </plugin>
44 <plugin>
45 <!-- builds the bundle -->
46 <groupId>org.apache.felix</groupId>
47 <artifactId>maven-bundle-plugin</artifactId>
48 <extensions>true</extensions>
49 <configuration>
50 <instructions>
51 <Export-Package>
52 org.apache.tika.filetypedetector
53 </Export-Package>
54 <Private-Package />
55 </instructions>
56 </configuration>
57 </plugin>
58 <plugin>
59 <groupId>org.apache.maven.plugins</groupId>
60 <artifactId>maven-compiler-plugin</artifactId>
61 <version>3.1</version>
62 <configuration>
63 <source>1.7</source>
64 <target>1.7</target>
65 </configuration>
66 </plugin>
67 </plugins>
68 </build>
69
70 <dependencies>
71 <dependency>
72 <groupId>${project.groupId}</groupId>
73 <artifactId>tika-core</artifactId>
74 <version>${project.version}</version>
75 </dependency>
76 <dependency>
77 <groupId>${project.groupId}</groupId>
78 <artifactId>tika-parsers</artifactId>
79 <version>${project.version}</version>
80 </dependency>
81 <dependency>
82 <groupId>biz.aQute</groupId>
83 <artifactId>bndlib</artifactId>
84 <scope>provided</scope>
85 </dependency>
86 <dependency>
87 <groupId>junit</groupId>
88 <artifactId>junit</artifactId>
89 <scope>test</scope>
90 <version>4.11</version>
91 </dependency>
92 </dependencies>
93
94 <url>http://tika.apache.org/</url>
95 <organization>
96 <name>The Apache Software Foundation</name>
97 <url>http://www.apache.org</url>
98 </organization>
99 <scm>
100 <url>http://svn.apache.org/viewvc/tika/tags/1.5/tika-java7</url>
101 <connection>scm:svn:http://svn.apache.org/repos/asf/tika/tags/1.5/tika-java7</connection>
102 <developerConnection>scm:svn:https://svn.apache.org/repos/asf/tika/tags/1.5/tika-java7</developerConnection>
103 </scm>
104 <issueManagement>
105 <system>JIRA</system>
106 <url>https://issues.apache.org/jira/browse/TIKA</url>
107 </issueManagement>
108 <ciManagement>
109 <system>Jenkins</system>
110 <url>https://builds.apache.org/job/Tika-trunk/</url>
111 </ciManagement>
112 </project>
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.filetypedetector;
18
19 import java.io.IOException;
20 import java.nio.file.Files;
21 import java.nio.file.Path;
22 import java.nio.file.spi.FileTypeDetector;
23
24 import org.apache.tika.Tika;
25 import org.apache.tika.mime.MimeTypes;
26
27 public class TikaFileTypeDetector extends FileTypeDetector {
28 private final Tika tika = new Tika();
29
30 public TikaFileTypeDetector() {
31 super();
32 }
33
34 @Override
35 public String probeContentType(Path path) throws IOException {
36 // Try to detect based on the file name only for efficiency
37 String fileNameDetect = tika.detect(path.toString());
38 if(!fileNameDetect.equals(MimeTypes.OCTET_STREAM)) {
39 return fileNameDetect;
40 }
41
42 // Then check the file content if necessary
43 String fileContentDetect = tika.detect(path.toFile());
44 if(!fileContentDetect.equals(MimeTypes.OCTET_STREAM)) {
45 return fileContentDetect;
46 }
47
48 // Specification says to return null if we could not
49 // conclusively determine the file type
50 return null;
51 }
52
53 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 /**
18 * Tika Java-7 FileTypeDetector implementations.
19 */
20 @aQute.bnd.annotation.Version("1.0.0")
21 package org.apache.tika.filetypedetector;
0 org.apache.tika.filetypedetector.TikaFileTypeDetector
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.filetypedetector;
17
18 import static org.junit.Assert.*;
19
20 import java.io.IOException;
21 import java.nio.file.Files;
22 import java.nio.file.Path;
23 import java.nio.file.spi.FileTypeDetector;
24 import java.util.Iterator;
25 import java.util.ServiceLoader;
26
27 import org.junit.After;
28 import org.junit.Before;
29 import org.junit.Rule;
30 import org.junit.Test;
31 import org.junit.rules.TemporaryFolder;
32
33 public class TikaFileTypeDetectorTest {
34
35 @Rule
36 public TemporaryFolder tempDir = new TemporaryFolder();
37
38 private Path testDirectory = null;
39
40 private static final String TEST_CLASSPATH = "/test-documents/test.html";
41 private static final String TEST_HTML = "test.html";
42 private static final String TEST_UNRECOGNISED_EXTENSION = "test.unrecognisedextension";
43
44 @Before
45 public void setUp() throws Exception {
46 testDirectory = tempDir.newFolder().toPath();
47 Files.copy(this.getClass().getResourceAsStream(TEST_CLASSPATH),
48 testDirectory.resolve(TEST_HTML));
49 Files.copy(this.getClass().getResourceAsStream(TEST_CLASSPATH),
50 testDirectory.resolve(TEST_UNRECOGNISED_EXTENSION));
51 }
52
53 @After
54 public void tearDown() throws Exception {
55 }
56
57 @Test
58 public final void testDirectAccess() throws Exception {
59 String contentType = new TikaFileTypeDetector().probeContentType(testDirectory.resolve(TEST_HTML));
60 assertNotNull(contentType);
61 assertEquals("text/html", contentType);
62 }
63
64 @Test
65 public final void testFilesProbeContentTypePathExtension() throws Exception {
66 String contentType = Files.probeContentType(testDirectory.resolve(TEST_HTML));
67 assertNotNull(contentType);
68 assertEquals("text/html", contentType);
69 }
70
71 @Test
72 public final void testFilesProbeContentTypePathUnrecognised() throws Exception {
73 String contentType = Files.probeContentType(testDirectory.resolve(TEST_UNRECOGNISED_EXTENSION));
74 assertNotNull(contentType);
75 assertEquals("text/html", contentType);
76 }
77
78 @Test
79 public final void testMetaInfServicesLoad() throws Exception {
80 ServiceLoader<FileTypeDetector> serviceLoader = ServiceLoader.load(FileTypeDetector.class);
81
82 Iterator<FileTypeDetector> iterator = serviceLoader.iterator();
83 assertTrue(iterator.hasNext());
84
85 while(iterator.hasNext()) {
86 FileTypeDetector fileTypeDetector = iterator.next();
87 assertNotNull(fileTypeDetector);
88 assertTrue(fileTypeDetector instanceof TikaFileTypeDetector);
89 }
90 }
91 }
0 <?xml version="1.0" encoding="UTF-8"?>
1
2 <!--
3 Licensed to the Apache Software Foundation (ASF) under one
4 or more contributor license agreements. See the NOTICE file
5 distributed with this work for additional information
6 regarding copyright ownership. The ASF licenses this file
7 to you under the Apache License, Version 2.0 (the
8 "License"); you may not use this file except in compliance
9 with the License. You may obtain a copy of the License at
10
11 http://www.apache.org/licenses/LICENSE-2.0
12
13 Unless required by applicable law or agreed to in writing,
14 software distributed under the License is distributed on an
15 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 KIND, either express or implied. See the License for the
17 specific language governing permissions and limitations
18 under the License.
19 -->
20
21 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
22 <modelVersion>4.0.0</modelVersion>
23
24 <parent>
25 <groupId>org.apache</groupId>
26 <artifactId>apache</artifactId>
27 <version>10</version>
28 <relativePath />
29 </parent>
30
31 <groupId>org.apache.tika</groupId>
32 <artifactId>tika-parent</artifactId>
33 <version>1.5</version>
34 <packaging>pom</packaging>
35
36 <name>Apache Tika parent</name>
37 <description>
38 Apache Tika is a toolkit for detecting and extracting metadata and
39 structured text content from various documents using existing parser
40 libraries.
41 </description>
42 <inceptionYear>2007</inceptionYear>
43
44 <url>http://tika.apache.org/</url>
45
46 <issueManagement>
47 <system>JIRA</system>
48 <url>https://issues.apache.org/jira/browse/TIKA</url>
49 </issueManagement>
50
51 <mailingLists>
52 <mailingList>
53 <name>Development mailing list</name>
54 <subscribe>dev-subscribe@tika.apache.org</subscribe>
55 <unsubscribe>dev-unsubscribe@tika.apache.org</unsubscribe>
56 <post>dev@tika.apache.org</post>
57 <archive>http://mail-archives.apache.org/mod_mbox/tika-dev/</archive>
58 <otherArchives>
59 <otherArchive>http://mail-archives.apache.org/mod_mbox/lucene-tika-dev</otherArchive>
60 <otherArchive>http://www.mail-archive.com/dev@tika.apache.org</otherArchive>
61 <otherArchive>http://www.mail-archive.com/tika-dev@lucene.apache.org/</otherArchive>
62 <otherArchive>http://www.mail-archive.com/tika-dev@incubator.apache.org/</otherArchive>
63 <otherArchive>http://www.nabble.com/Apache-Tika---Development-f20913.html</otherArchive>
64 <otherArchive>http://news.gmane.org/gmane.comp.apache.tika.devel</otherArchive>
65 <otherArchive>http://tika.markmail.org/</otherArchive>
66 </otherArchives>
67 </mailingList>
68 <mailingList>
69 <name>Commit mailing list</name>
70 <subscribe>commits-subscribe@tika.apache.org</subscribe>
71 <unsubscribe>commits-unsubscribe@tika.apache.org</unsubscribe>
72 <post>commits@tika.apache.org</post>
73 <archive>http://mail-archives.apache.org/mod_mbox/tika-commits/</archive>
74 <otherArchives>
75 <otherArchive>http://mail-archives.apache.org/mod_mbox/lucene-tika-commits/</otherArchive>
76 <otherArchive>http://www.mail-archive.com/tika-commits@lucene.apache.org/</otherArchive>
77 <otherArchive>http://www.mail-archive.com/tika-commits@incubator.apache.org/</otherArchive>
78 </otherArchives>
79 </mailingList>
80 <mailingList>
81 <name>User mailing list</name>
82 <subscribe>user-subscribe@tika.apache.org</subscribe>
83 <unsubscribe>user-unsubscribe@tika.apache.org</unsubscribe>
84 <post>user@tika.apache.org</post>
85 <archive>http://mail-archives.apache.org/mod_mbox/tika-user/</archive>
86 <otherArchives>
87 <otherArchive>http://mail-archives.apache.org/mod_mbox/lucene-tika-user/</otherArchive>
88 <otherArchive>http://www.mail-archive.com/tika-user@lucene.apache.org/</otherArchive>
89 </otherArchives>
90 </mailingList>
91 </mailingLists>
92
93 <developers>
94 <developer>
95 <name>Rida Benjelloun</name>
96 <id>ridabenjelloun</id>
97 <email>ridabenjelloun@apache.org</email>
98 <roles>
99 <role>committer</role>
100 </roles>
101 </developer>
102 <developer>
103 <name>Keith Bennett</name>
104 <id>kbennett</id>
105 <roles>
106 <role>committer</role>
107 </roles>
108 </developer>
109 <developer>
110 <name>Mark Harwood</name>
111 <id>mharwood</id>
112 <roles>
113 <role>committer</role>
114 </roles>
115 </developer>
116 <developer>
117 <name>Ken Krugler</name>
118 <id>kkrugler</id>
119 <email>kkrugler@apache.org</email>
120 <url>http://ken-blog.krugler.org</url>
121 <organization>Bixo Labs</organization>
122 <organizationUrl>http://bixolabs.com</organizationUrl>
123 <roles>
124 <role>committer</role>
125 </roles>
126 </developer>
127 <developer>
128 <name>Chris A. Mattmann</name>
129 <id>mattmann</id>
130 <email>mattmann@apache.org</email>
131 <url>http://people.apache.org/~mattmann/</url>
132 <organization>NASA Jet Propulsion Laboratory</organization>
133 <organizationUrl>http://www.jpl.nasa.gov</organizationUrl>
134 <timezone>-8</timezone>
135 <properties />
136 <roles>
137 <role>committer</role>
138 </roles>
139 </developer>
140 <developer>
141 <name>Michael McCandless</name>
142 <id>mikemccand</id>
143 <email>mikemccand@apache.org</email>
144 <organization>IBM</organization>
145 <properties />
146 <roles>
147 <role>committer</role>
148 </roles>
149 </developer>
150 <developer>
151 <name>Dave Meikle</name>
152 <id>dmeikle</id>
153 <roles>
154 <role>committer</role>
155 </roles>
156 </developer>
157 <developer>
158 <name>Sami Siren</name>
159 <id>siren</id>
160 <roles>
161 <role>committer</role>
162 </roles>
163 </developer>
164 <developer>
165 <name>Jukka Zitting</name>
166 <id>jukka</id>
167 <roles>
168 <role>committer</role>
169 </roles>
170 </developer>
171 <developer>
172 <name>Nick Burch</name>
173 <id>nick</id>
174 <organization>Alfresco</organization>
175 <organizationUrl>http://alfresco.com</organizationUrl>
176 <roles>
177 <role>committer</role>
178 </roles>
179 </developer>
180 <developer>
181 <name>Maxim Valyanskiy</name>
182 <id>maxcom</id>
183 <organization>Jet Infosystems</organization>
184 <roles>
185 <role>committer</role>
186 </roles>
187 <timezone>+3</timezone>
188 </developer>
189 <developer>
190 <name>Oleg Tikhonov</name>
191 <id>oleg</id>
192 <roles>
193 <role>committer</role>
194 </roles>
195 <timezone>+2</timezone>
196 </developer>
197 <developer>
198 <name>Ray Gauss II</name>
199 <id>rgauss</id>
200 <organization>Alfresco</organization>
201 <organizationUrl>http://alfresco.com</organizationUrl>
202 <timezone>-5</timezone>
203 <roles>
204 <role>committer</role>
205 </roles>
206 </developer>
207 </developers>
208 <contributors>
209 <contributor>
210 <name>Doug Cutting</name>
211 <roles>
212 <role>mentor</role>
213 </roles>
214 </contributor>
215 <contributor>
216 <name>Bertrand Delacretaz</name>
217 <roles>
218 <role>mentor</role>
219 </roles>
220 </contributor>
221 <contributor>
222 <name>Niall Pemberton</name>
223 <roles>
224 <role>emeritus</role>
225 </roles>
226 </contributor>
227 </contributors>
228
229 <dependencyManagement>
230 <dependencies>
231 <dependency>
232 <groupId>biz.aQute</groupId>
233 <artifactId>bndlib</artifactId>
234 <version>1.43.0</version>
235 </dependency>
236 <dependency>
237 <groupId>org.apache.felix</groupId>
238 <artifactId>org.apache.felix.scr.annotations</artifactId>
239 <version>1.6.0</version>
240 </dependency>
241 <dependency>
242 <groupId>junit</groupId>
243 <artifactId>junit</artifactId>
244 <version>4.10</version>
245 <scope>test</scope>
246 </dependency>
247 </dependencies>
248 </dependencyManagement>
249
250 <properties>
251 <maven.compile.source>1.6</maven.compile.source>
252 <maven.compile.target>1.6</maven.compile.target>
253 <project.reporting.outputEncoding>${project.build.sourceEncoding}</project.reporting.outputEncoding>
254 </properties>
255
256 <build>
257 <plugins>
258 <plugin>
259 <artifactId>maven-compiler-plugin</artifactId>
260 <configuration>
261 <source>${maven.compile.source}</source>
262 <target>${maven.compile.target}</target>
263 </configuration>
264 </plugin>
265 </plugins>
266 <pluginManagement>
267 <plugins>
268 <plugin>
269 <groupId>org.apache.felix</groupId>
270 <artifactId>maven-bundle-plugin</artifactId>
271 <version>2.3.4</version>
272 </plugin>
273 <plugin>
274 <groupId>org.apache.maven.plugins</groupId>
275 <artifactId>maven-surefire-plugin</artifactId>
276 <version>2.12</version>
277 </plugin>
278 <plugin>
279 <groupId>org.apache.maven.plugins</groupId>
280 <artifactId>maven-shade-plugin</artifactId>
281 <version>1.6</version>
282 </plugin>
283 </plugins>
284 </pluginManagement>
285 </build>
286
287 <profiles>
288 <profile>
289 <id>pedantic</id>
290 <build>
291 <plugins>
292 <plugin>
293 <groupId>org.apache.rat</groupId>
294 <artifactId>apache-rat-plugin</artifactId>
295 <executions>
296 <execution>
297 <phase>verify</phase>
298 <goals>
299 <goal>check</goal>
300 </goals>
301 </execution>
302 </executions>
303 </plugin>
304 </plugins>
305 </build>
306 </profile>
307 <profile>
308 <id>sonar</id>
309 <build>
310 <plugins>
311 <plugin>
312 <groupId>org.apache.maven.plugins</groupId>
313 <artifactId>maven-surefire-plugin</artifactId>
314 <configuration>
315 <excludes>
316 <exclude>**/ForkParser*Test.java</exclude>
317 </excludes>
318 </configuration>
319 </plugin>
320 </plugins>
321 </build>
322 </profile>
323 </profiles>
324 </project>
0 <?xml version="1.0" encoding="UTF-8"?>
1
2 <!--
3 Licensed to the Apache Software Foundation (ASF) under one
4 or more contributor license agreements. See the NOTICE file
5 distributed with this work for additional information
6 regarding copyright ownership. The ASF licenses this file
7 to you under the Apache License, Version 2.0 (the
8 "License"); you may not use this file except in compliance
9 with the License. You may obtain a copy of the License at
10
11 http://www.apache.org/licenses/LICENSE-2.0
12
13 Unless required by applicable law or agreed to in writing,
14 software distributed under the License is distributed on an
15 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 KIND, either express or implied. See the License for the
17 specific language governing permissions and limitations
18 under the License.
19 -->
20
21 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
22 <modelVersion>4.0.0</modelVersion>
23
24 <parent>
25 <groupId>org.apache.tika</groupId>
26 <artifactId>tika-parent</artifactId>
27 <version>1.5</version>
28 <relativePath>../tika-parent/pom.xml</relativePath>
29 </parent>
30
31 <artifactId>tika-parsers</artifactId>
32 <packaging>bundle</packaging>
33 <name>Apache Tika parsers</name>
34 <url>http://tika.apache.org/</url>
35
36 <properties>
37 <poi.version>3.10-beta2</poi.version>
38 <codec.version>1.5</codec.version> <!-- NOTE: sync with POI -->
39 <mime4j.version>0.7.2</mime4j.version>
40 <vorbis.version>0.1</vorbis.version>
41 </properties>
42
43 <dependencies>
44 <!-- Optional OSGi dependency, used only when running within OSGi -->
45 <dependency>
46 <groupId>org.osgi</groupId>
47 <artifactId>org.osgi.core</artifactId>
48 <version>4.0.0</version>
49 <scope>provided</scope>
50 <optional>true</optional>
51 </dependency>
52
53 <dependency>
54 <groupId>${project.groupId}</groupId>
55 <artifactId>tika-core</artifactId>
56 <version>${project.version}</version>
57 </dependency>
58
59 <!-- Externally Maintained Parsers -->
60 <dependency>
61 <groupId>org.gagravarr</groupId>
62 <artifactId>vorbis-java-tika</artifactId>
63 <version>${vorbis.version}</version>
64 </dependency>
65
66 <!-- Optional OSGi dependencies, used only when running within OSGi -->
67 <dependency>
68 <groupId>org.apache.felix</groupId>
69 <artifactId>org.apache.felix.scr.annotations</artifactId>
70 <scope>provided</scope>
71 </dependency>
72
73 <!-- Upstream parser libraries -->
74 <dependency>
75 <groupId>edu.ucar</groupId>
76 <artifactId>netcdf</artifactId>
77 <version>4.2-min</version>
78 </dependency>
79 <dependency>
80 <groupId>org.apache.james</groupId>
81 <artifactId>apache-mime4j-core</artifactId>
82 <version>${mime4j.version}</version>
83 </dependency>
84 <dependency>
85 <groupId>org.apache.james</groupId>
86 <artifactId>apache-mime4j-dom</artifactId>
87 <version>${mime4j.version}</version>
88 </dependency>
89 <dependency>
90 <groupId>org.apache.commons</groupId>
91 <artifactId>commons-compress</artifactId>
92 <version>1.5</version>
93 </dependency>
94 <dependency>
95 <groupId>commons-codec</groupId>
96 <artifactId>commons-codec</artifactId>
97 <version>${codec.version}</version>
98 </dependency>
99 <dependency>
100 <groupId>org.apache.pdfbox</groupId>
101 <artifactId>pdfbox</artifactId>
102 <version>1.8.4</version>
103 </dependency>
104 <!-- TIKA-370: PDFBox declares the Bouncy Castle dependencies
105 as optional, but we prefer to have them always to avoid
106 problems with encrypted PDFs. -->
107 <dependency>
108 <groupId>org.bouncycastle</groupId>
109 <artifactId>bcmail-jdk15</artifactId>
110 <version>1.45</version>
111 </dependency>
112 <dependency>
113 <groupId>org.bouncycastle</groupId>
114 <artifactId>bcprov-jdk15</artifactId>
115 <version>1.45</version>
116 </dependency>
117 <dependency>
118 <groupId>org.apache.poi</groupId>
119 <artifactId>poi</artifactId>
120 <version>${poi.version}</version>
121 </dependency>
122 <dependency>
123 <groupId>org.apache.poi</groupId>
124 <artifactId>poi-scratchpad</artifactId>
125 <version>${poi.version}</version>
126 </dependency>
127 <dependency>
128 <groupId>org.apache.poi</groupId>
129 <artifactId>poi-ooxml</artifactId>
130 <version>${poi.version}</version>
131 <exclusions>
132 <exclusion>
133 <groupId>stax</groupId>
134 <artifactId>stax-api</artifactId>
135 </exclusion>
136 <exclusion>
137 <groupId>xml-apis</groupId>
138 <artifactId>xml-apis</artifactId>
139 </exclusion>
140 </exclusions>
141 </dependency>
142 <dependency>
143 <groupId>org.apache.geronimo.specs</groupId>
144 <artifactId>geronimo-stax-api_1.0_spec</artifactId>
145 <version>1.0.1</version>
146 </dependency>
147 <dependency>
148 <groupId>org.ccil.cowan.tagsoup</groupId>
149 <artifactId>tagsoup</artifactId>
150 <version>1.2.1</version>
151 </dependency>
152 <dependency>
153 <groupId>org.ow2.asm</groupId>
154 <artifactId>asm-debug-all</artifactId>
155 <version>4.1</version>
156 </dependency>
157 <dependency>
158 <groupId>com.googlecode.mp4parser</groupId>
159 <artifactId>isoparser</artifactId>
160 <version>1.0-RC-1</version>
161 </dependency>
162 <dependency>
163 <groupId>com.drewnoakes</groupId>
164 <artifactId>metadata-extractor</artifactId>
165 <version>2.6.2</version>
166 </dependency>
167 <dependency>
168 <groupId>de.l3s.boilerpipe</groupId>
169 <artifactId>boilerpipe</artifactId>
170 <version>1.1.0</version>
171 </dependency>
172 <dependency>
173 <groupId>rome</groupId>
174 <artifactId>rome</artifactId>
175 <version>0.9</version>
176 </dependency>
177 <dependency>
178 <groupId>org.gagravarr</groupId>
179 <artifactId>vorbis-java-core</artifactId>
180 <version>${vorbis.version}</version>
181 </dependency>
182 <dependency>
183 <groupId>com.googlecode.juniversalchardet</groupId>
184 <artifactId>juniversalchardet</artifactId>
185 <version>1.0.3</version>
186 </dependency>
187 <dependency>
188 <groupId>com.uwyn</groupId>
189 <artifactId>jhighlight</artifactId>
190 <version>1.0</version>
191 <exclusions>
192 <exclusion>
193 <groupId>javax.servlet</groupId>
194 <artifactId>servlet-api</artifactId>
195 </exclusion>
196 </exclusions>
197 </dependency>
198
199 <!-- Test dependencies -->
200 <dependency>
201 <groupId>junit</groupId>
202 <artifactId>junit</artifactId>
203 <scope>test</scope>
204 </dependency>
205 <dependency>
206 <groupId>org.mockito</groupId>
207 <artifactId>mockito-core</artifactId>
208 <version>1.7</version>
209 <scope>test</scope>
210 </dependency>
211 <dependency>
212 <groupId>org.slf4j</groupId>
213 <artifactId>slf4j-log4j12</artifactId>
214 <version>1.5.6</version>
215 <scope>test</scope>
216 </dependency>
217 </dependencies>
218
219 <build>
220 <plugins>
221 <plugin>
222 <groupId>org.apache.felix</groupId>
223 <artifactId>maven-bundle-plugin</artifactId>
224 <extensions>true</extensions>
225 <configuration>
226 <instructions>
227 <Bundle-DocURL>${project.url}</Bundle-DocURL>
228 <Bundle-Activator>
229 org.apache.tika.parser.internal.Activator
230 </Bundle-Activator>
231 <Import-Package>
232 org.w3c.dom,
233 org.apache.tika.*,
234 *;resolution:=optional
235 </Import-Package>
236 </instructions>
237 </configuration>
238 </plugin>
239 <plugin>
240 <groupId>org.apache.rat</groupId>
241 <artifactId>apache-rat-plugin</artifactId>
242 <configuration>
243 <excludes>
244 <exclude>src/main/java/org/apache/tika/parser/txt/Charset*.java</exclude>
245 <exclude>src/test/resources/test-documents/**</exclude>
246 </excludes>
247 </configuration>
248 </plugin>
249 </plugins>
250
251 <pluginManagement>
252 <plugins>
253 <!-- This plugin's configuration is used to store Eclipse m2e -->
254 <!-- settings only. It has no influence on the Maven build itself. -->
255 <plugin>
256 <groupId>org.eclipse.m2e</groupId>
257 <artifactId>lifecycle-mapping</artifactId>
258 <version>1.0.0</version>
259 <configuration>
260 <lifecycleMappingMetadata>
261 <pluginExecutions>
262 <pluginExecution>
263 <pluginExecutionFilter>
264 <groupId>org.apache.felix</groupId>
265 <artifactId>maven-scr-plugin</artifactId>
266 <versionRange>[1.7.2,)</versionRange>
267 <goals>
268 <goal>scr</goal>
269 </goals>
270 </pluginExecutionFilter>
271 <action>
272 <execute />
273 </action>
274 </pluginExecution>
275 </pluginExecutions>
276 </lifecycleMappingMetadata>
277 </configuration>
278 </plugin>
279 </plugins>
280 </pluginManagement>
281 </build>
282
283 <organization>
284 <name>The Apache Software Foundation</name>
285 <url>http://www.apache.org</url>
286 </organization>
287 <scm>
288 <url>http://svn.apache.org/viewvc/tika/tags/1.5/tika-parsers</url>
289 <connection>scm:svn:http://svn.apache.org/repos/asf/tika/tags/1.5/tika-parsers</connection>
290 <developerConnection>scm:svn:https://svn.apache.org/repos/asf/tika/tags/1.5/tika-parsers</developerConnection>
291 </scm>
292 <issueManagement>
293 <system>JIRA</system>
294 <url>https://issues.apache.org/jira/browse/TIKA</url>
295 </issueManagement>
296 <ciManagement>
297 <system>Jenkins</system>
298 <url>https://builds.apache.org/job/Tika-trunk/</url>
299 </ciManagement>
300 </project>
0 APACHE TIKA SUBCOMPONENTS
1
2 Apache Tika includes a number of subcomponents with separate copyright notices
3 and license terms. Your use of these subcomponents is subject to the terms and
4 conditions of the following licenses.
5
6 Charset detection code from ICU4J (http://site.icu-project.org/)
7
8 Copyright (c) 1995-2009 International Business Machines Corporation
9 and others
10
11 All rights reserved.
12
13 Permission is hereby granted, free of charge, to any person obtaining
14 a copy of this software and associated documentation files (the
15 "Software"), to deal in the Software without restriction, including
16 without limitation the rights to use, copy, modify, merge, publish,
17 distribute, and/or sell copies of the Software, and to permit persons
18 to whom the Software is furnished to do so, provided that the above
19 copyright notice(s) and this permission notice appear in all copies
20 of the Software and that both the above copyright notice(s) and this
21 permission notice appear in supporting documentation.
22
23 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
24 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
26 IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
27 BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
28 OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29 WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
30 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
31 SOFTWARE.
32
33 Except as contained in this notice, the name of a copyright holder shall
34 not be used in advertising or otherwise to promote the sale, use or other
35 dealings in this Software without prior written authorization of the
36 copyright holder.
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.asm;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.util.Collections;
21 import java.util.Set;
22
23 import org.apache.tika.exception.TikaException;
24 import org.apache.tika.metadata.Metadata;
25 import org.apache.tika.mime.MediaType;
26 import org.apache.tika.parser.AbstractParser;
27 import org.apache.tika.parser.ParseContext;
28 import org.xml.sax.ContentHandler;
29 import org.xml.sax.SAXException;
30
31 /**
32 * Parser for Java .class files.
33 */
34 public class ClassParser extends AbstractParser {
35
36 /** Serial version UID */
37 private static final long serialVersionUID = -3531388963354454357L;
38
39 private static final Set<MediaType> SUPPORTED_TYPES =
40 Collections.singleton(MediaType.application("java-vm"));
41
42 public Set<MediaType> getSupportedTypes(ParseContext context) {
43 return SUPPORTED_TYPES;
44 }
45
46 public void parse(
47 InputStream stream, ContentHandler handler,
48 Metadata metadata, ParseContext context)
49 throws IOException, SAXException, TikaException {
50 new XHTMLClassVisitor(handler, metadata).parse(stream);
51 }
52
53 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.asm;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20
21 import org.apache.tika.exception.TikaException;
22 import org.apache.tika.metadata.Metadata;
23 import org.apache.tika.metadata.TikaCoreProperties;
24 import org.apache.tika.sax.XHTMLContentHandler;
25 import org.objectweb.asm.AnnotationVisitor;
26 import org.objectweb.asm.Attribute;
27 import org.objectweb.asm.ClassReader;
28 import org.objectweb.asm.ClassVisitor;
29 import org.objectweb.asm.FieldVisitor;
30 import org.objectweb.asm.MethodVisitor;
31 import org.objectweb.asm.Opcodes;
32 import org.objectweb.asm.Type;
33 import org.xml.sax.ContentHandler;
34 import org.xml.sax.SAXException;
35
36 /**
37 * Class visitor that generates XHTML SAX events to describe the
38 * contents of the visited class.
39 */
40 class XHTMLClassVisitor extends ClassVisitor {
41
42 private final XHTMLContentHandler xhtml;
43
44 private final Metadata metadata;
45
46 private Type type;
47
48 private String packageName;
49
50 public XHTMLClassVisitor(ContentHandler handler, Metadata metadata) {
51 super(Opcodes.ASM4);
52 this.xhtml = new XHTMLContentHandler(handler, metadata);
53 this.metadata = metadata;
54 }
55
56 public void parse(InputStream stream)
57 throws TikaException, SAXException, IOException {
58 try {
59 ClassReader reader = new ClassReader(stream);
60 reader.accept(this, ClassReader.SKIP_FRAMES | ClassReader.SKIP_CODE);
61 } catch (RuntimeException e) {
62 if (e.getCause() instanceof SAXException) {
63 throw (SAXException) e.getCause();
64 } else {
65 throw new TikaException("Failed to parse a Java class", e);
66 }
67 }
68 }
69
70 public void visit(
71 int version, int access, String name, String signature,
72 String superName, String[] interfaces) {
73 type = Type.getObjectType(name);
74
75 String className = type.getClassName();
76 int dot = className.lastIndexOf('.');
77 if (dot != -1) {
78 packageName = className.substring(0, dot);
79 className = className.substring(dot + 1);
80 }
81
82 metadata.set(TikaCoreProperties.TITLE, className);
83 metadata.set(Metadata.RESOURCE_NAME_KEY, className + ".class");
84
85 try {
86 xhtml.startDocument();
87 xhtml.startElement("pre");
88
89 if (packageName != null) {
90 writeKeyword("package");
91 xhtml.characters(" " + packageName + ";\n");
92 }
93
94 writeAccess(access);
95 if (isSet(access, Opcodes.ACC_INTERFACE)) {
96 writeKeyword("interface");
97 writeSpace();
98 writeType(type);
99 writeSpace();
100 writeInterfaces("extends", interfaces);
101 } else if (isSet(access, Opcodes.ACC_ENUM)) {
102 writeKeyword("enum");
103 writeSpace();
104 writeType(type);
105 writeSpace();
106 } else {
107 writeKeyword("class");
108 writeSpace();
109 writeType(type);
110 writeSpace();
111 if (superName != null) {
112 Type superType = Type.getObjectType(superName);
113 if (!superType.getClassName().equals("java.lang.Object")) {
114 writeKeyword("extends");
115 writeSpace();
116 writeType(superType);
117 writeSpace();
118 }
119 }
120 writeInterfaces("implements", interfaces);
121 }
122 xhtml.characters("{\n");
123 } catch (SAXException e) {
124 throw new RuntimeException(e);
125 }
126 }
127
128 private void writeInterfaces(String keyword, String[] interfaces)
129 throws SAXException {
130 if (interfaces != null && interfaces.length > 0) {
131 writeKeyword(keyword);
132 String separator = " ";
133 for (String iface : interfaces) {
134 xhtml.characters(separator);
135 writeType(Type.getObjectType(iface));
136 separator = ", ";
137 }
138 writeSpace();
139 }
140 }
141
142 public void visitEnd() {
143 try {
144 xhtml.characters("}\n");
145 xhtml.endElement("pre");
146 xhtml.endDocument();
147 } catch (SAXException e) {
148 throw new RuntimeException(e);
149 }
150 }
151
152 /**
153 * Ignored.
154 */
155 public void visitOuterClass(String owner, String name, String desc) {
156 }
157
158 /**
159 * Ignored.
160 */
161 public void visitSource(String source, String debug) {
162 }
163
164
165 /**
166 * Ignored.
167 */
168 public AnnotationVisitor visitAnnotation(String desc, boolean visible) {
169 return null;
170 }
171
172 /**
173 * Ignored.
174 */
175 public void visitAttribute(Attribute attr) {
176 }
177
178 /**
179 * Ignored.
180 */
181 public void visitInnerClass(
182 String name, String outerName, String innerName, int access) {
183 }
184
185 /**
186 * Visits a field.
187 */
188 public FieldVisitor visitField(
189 int access, String name, String desc, String signature,
190 Object value) {
191 if (!isSet(access, Opcodes.ACC_SYNTHETIC)) {
192 try {
193 xhtml.characters(" ");
194 writeAccess(access);
195 writeType(Type.getType(desc));
196 writeSpace();
197 writeIdentifier(name);
198
199 if (isSet(access, Opcodes.ACC_STATIC) && value != null) {
200 xhtml.characters(" = ");
201 xhtml.characters(value.toString());
202 }
203
204 writeSemicolon();
205 writeNewline();
206 } catch (SAXException e) {
207 throw new RuntimeException(e);
208 }
209 }
210
211 return null;
212 }
213
214 /**
215 * Visits a method.
216 */
217 public MethodVisitor visitMethod(
218 int access, String name, String desc, String signature,
219 String[] exceptions) {
220 if (!isSet(access, Opcodes.ACC_SYNTHETIC)) {
221 try {
222 xhtml.characters(" ");
223 writeAccess(access);
224 writeType(Type.getReturnType(desc));
225 writeSpace();
226 if ("<init>".equals(name)) {
227 writeType(type);
228 } else {
229 writeIdentifier(name);
230 }
231
232 xhtml.characters("(");
233 String separator = "";
234 for (Type arg : Type.getArgumentTypes(desc)) {
235 xhtml.characters(separator);
236 writeType(arg);
237 separator = ", ";
238 }
239 xhtml.characters(")");
240
241 if (exceptions != null && exceptions.length > 0) {
242 writeSpace();
243 writeKeyword("throws");
244 separator = " ";
245 for (String exception : exceptions) {
246 xhtml.characters(separator);
247 writeType(Type.getObjectType(exception));
248 separator = ", ";
249 }
250 }
251
252 writeSemicolon();
253 writeNewline();
254 } catch (SAXException e) {
255 throw new RuntimeException(e);
256 }
257 }
258
259 return null;
260 }
261
262 private void writeIdentifier(String identifier) throws SAXException {
263 xhtml.startElement("span", "class", "java-identifier");
264 xhtml.characters(identifier);
265 xhtml.endElement("span");
266 }
267
268 private void writeKeyword(String keyword) throws SAXException {
269 xhtml.startElement("span", "class", "java-keyword");
270 xhtml.characters(keyword);
271 xhtml.endElement("span");
272 }
273
274 private void writeSemicolon() throws SAXException {
275 xhtml.characters(";");
276 }
277
278 private void writeSpace() throws SAXException {
279 xhtml.characters(" ");
280 }
281
282 private void writeNewline() throws SAXException {
283 xhtml.characters("\n");
284 }
285
286 private void writeAccess(int access) throws SAXException {
287 writeAccess(access, Opcodes.ACC_PRIVATE, "private");
288 writeAccess(access, Opcodes.ACC_PROTECTED, "protected");
289 writeAccess(access, Opcodes.ACC_PUBLIC, "public");
290 writeAccess(access, Opcodes.ACC_STATIC, "static");
291 writeAccess(access, Opcodes.ACC_FINAL, "final");
292 writeAccess(access, Opcodes.ACC_ABSTRACT, "abstract");
293 writeAccess(access, Opcodes.ACC_SYNCHRONIZED, "synchronized");
294 writeAccess(access, Opcodes.ACC_TRANSIENT, "transient");
295 writeAccess(access, Opcodes.ACC_VOLATILE, "volatile");
296 writeAccess(access, Opcodes.ACC_NATIVE, "native");
297 }
298
299 private void writeAccess(int access, int code, String keyword)
300 throws SAXException {
301 if (isSet(access, code)) {
302 writeKeyword(keyword);
303 xhtml.characters(" ");
304 }
305 }
306
307 private void writeType(Type type) throws SAXException {
308 String name = type.getClassName();
309 if (name.startsWith(packageName + ".")) {
310 xhtml.characters(name.substring(packageName.length() + 1));
311 } else if (name.startsWith("java.lang.")) {
312 xhtml.characters(name.substring("java.lang.".length()));
313 } else {
314 xhtml.characters(name);
315 }
316 }
317
318 private static boolean isSet(int value, int flag) {
319 return (value & flag) != 0;
320 }
321
322 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.audio;
17
18 import java.io.BufferedInputStream;
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.util.Arrays;
22 import java.util.Collections;
23 import java.util.HashSet;
24 import java.util.Map;
25 import java.util.Map.Entry;
26 import java.util.Set;
27
28 import javax.sound.sampled.AudioFileFormat;
29 import javax.sound.sampled.AudioFileFormat.Type;
30 import javax.sound.sampled.AudioFormat;
31 import javax.sound.sampled.AudioSystem;
32 import javax.sound.sampled.UnsupportedAudioFileException;
33
34 import org.apache.tika.exception.TikaException;
35 import org.apache.tika.metadata.Metadata;
36 import org.apache.tika.metadata.XMPDM;
37 import org.apache.tika.mime.MediaType;
38 import org.apache.tika.parser.AbstractParser;
39 import org.apache.tika.parser.ParseContext;
40 import org.apache.tika.sax.XHTMLContentHandler;
41 import org.xml.sax.ContentHandler;
42 import org.xml.sax.SAXException;
43
44 public class AudioParser extends AbstractParser {
45
46 /** Serial version UID */
47 private static final long serialVersionUID = -6015684081240882695L;
48
49 private static final Set<MediaType> SUPPORTED_TYPES =
50 Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
51 MediaType.audio("basic"),
52 MediaType.audio("x-wav"),
53 MediaType.audio("x-aiff"))));
54
55 public Set<MediaType> getSupportedTypes(ParseContext context) {
56 return SUPPORTED_TYPES;
57 }
58
59 public void parse(
60 InputStream stream, ContentHandler handler,
61 Metadata metadata, ParseContext context)
62 throws IOException, SAXException, TikaException {
63 // AudioSystem expects the stream to support the mark feature
64 if (!stream.markSupported()) {
65 stream = new BufferedInputStream(stream);
66 }
67 try {
68 AudioFileFormat fileFormat = AudioSystem.getAudioFileFormat(stream);
69 Type type = fileFormat.getType();
70 if (type == Type.AIFC || type == Type.AIFF) {
71 metadata.set(Metadata.CONTENT_TYPE, "audio/x-aiff");
72 } else if (type == Type.AU || type == Type.SND) {
73 metadata.set(Metadata.CONTENT_TYPE, "audio/basic");
74 } else if (type == Type.WAVE) {
75 metadata.set(Metadata.CONTENT_TYPE, "audio/x-wav");
76 }
77
78 AudioFormat audioFormat = fileFormat.getFormat();
79 int channels = audioFormat.getChannels();
80 if (channels != AudioSystem.NOT_SPECIFIED) {
81 metadata.set("channels", String.valueOf(channels));
82 // TODO: Use XMPDM.TRACKS? (see also frame rate in AudioFormat)
83 }
84 float rate = audioFormat.getSampleRate();
85 if (rate != AudioSystem.NOT_SPECIFIED) {
86 metadata.set("samplerate", String.valueOf(rate));
87 metadata.set(
88 XMPDM.AUDIO_SAMPLE_RATE,
89 Integer.toString((int) rate));
90 }
91 int bits = audioFormat.getSampleSizeInBits();
92 if (bits != AudioSystem.NOT_SPECIFIED) {
93 metadata.set("bits", String.valueOf(bits));
94 if (bits == 8) {
95 metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, "8Int");
96 } else if (bits == 16) {
97 metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, "16Int");
98 } else if (bits == 32) {
99 metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, "32Int");
100 }
101 }
102 metadata.set("encoding", audioFormat.getEncoding().toString());
103
104 // Javadoc suggests that some of the following properties might
105 // be available, but I had no success in finding any:
106
107 // "duration" Long playback duration of the file in microseconds
108 // "author" String name of the author of this file
109 // "title" String title of this file
110 // "copyright" String copyright message
111 // "date" Date date of the recording or release
112 // "comment" String an arbitrary text
113
114 addMetadata(metadata, fileFormat.properties());
115 addMetadata(metadata, audioFormat.properties());
116 } catch (UnsupportedAudioFileException e) {
117 // There is no way to know whether this exception was
118 // caused by the document being corrupted or by the format
119 // just being unsupported. So we do nothing.
120 }
121
122 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
123 xhtml.startDocument();
124 xhtml.endDocument();
125 }
126
127 private void addMetadata(Metadata metadata, Map<String, Object> properties) {
128 if (properties != null) {
129 for (Entry<String, Object> entry : properties.entrySet()) {
130 Object value = entry.getValue();
131 if (value != null) {
132 metadata.set(entry.getKey(), value.toString());
133 }
134 }
135 }
136 }
137
138 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.audio;
17
18 import java.io.BufferedInputStream;
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.util.Arrays;
22 import java.util.Collections;
23 import java.util.HashSet;
24 import java.util.Set;
25
26 import javax.sound.midi.InvalidMidiDataException;
27 import javax.sound.midi.MetaMessage;
28 import javax.sound.midi.MidiMessage;
29 import javax.sound.midi.MidiSystem;
30 import javax.sound.midi.Patch;
31 import javax.sound.midi.Sequence;
32 import javax.sound.midi.Track;
33
34 import org.apache.tika.exception.TikaException;
35 import org.apache.tika.metadata.Metadata;
36 import org.apache.tika.mime.MediaType;
37 import org.apache.tika.parser.AbstractParser;
38 import org.apache.tika.parser.ParseContext;
39 import org.apache.tika.sax.XHTMLContentHandler;
40 import org.xml.sax.ContentHandler;
41 import org.xml.sax.SAXException;
42
43 public class MidiParser extends AbstractParser {
44
45 /** Serial version UID */
46 private static final long serialVersionUID = 6343278584336189432L;
47
48 private static final Set<MediaType> SUPPORTED_TYPES =
49 Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
50 MediaType.application("x-midi"),
51 MediaType.audio("midi"))));
52
53 public Set<MediaType> getSupportedTypes(ParseContext context) {
54 return SUPPORTED_TYPES;
55 }
56
57 public void parse(
58 InputStream stream, ContentHandler handler,
59 Metadata metadata, ParseContext context)
60 throws IOException, SAXException, TikaException {
61 metadata.set(Metadata.CONTENT_TYPE, "audio/midi");
62
63 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
64 xhtml.startDocument();
65
66 // MidiSystem expects the stream to support the mark feature
67 InputStream buffered = new BufferedInputStream(stream);
68 try {
69 Sequence sequence = MidiSystem.getSequence(buffered);
70
71 Track[] tracks = sequence.getTracks();
72 metadata.set("tracks", String.valueOf(tracks.length));
73 // TODO: Use XMPDM.TRACKS?
74
75 Patch[] patches = sequence.getPatchList();
76 metadata.set("patches", String.valueOf(patches.length));
77
78 float type = sequence.getDivisionType();
79 if (type == Sequence.PPQ) {
80 metadata.set("divisionType", "PPQ");
81 } else if (type == Sequence.SMPTE_24) {
82 metadata.set("divisionType", "SMPTE_24");
83 } else if (type == Sequence.SMPTE_25) {
84 metadata.set("divisionType", "SMPTE_25");
85 } else if (type == Sequence.SMPTE_30) {
86 metadata.set("divisionType", "SMPTE_30");
87 } else if (type == Sequence.SMPTE_30DROP) {
88 metadata.set("divisionType", "SMPTE_30DROP");
89 } else if (type == Sequence.SMPTE_24) {
90 metadata.set("divisionType", String.valueOf(type));
91 }
92
93 for (Track track : tracks) {
94 xhtml.startElement("p");
95 for (int i = 0; i < track.size(); i++) {
96 MidiMessage message = track.get(i).getMessage();
97 if (message instanceof MetaMessage) {
98 MetaMessage meta = (MetaMessage) message;
99 // Types 1-15 are reserved for text events
100 if (meta.getType() >= 1 && meta.getType() <= 15) {
101 // FIXME: What's the encoding?
102 xhtml.characters(
103 new String(meta.getData(), "ISO-8859-1"));
104 }
105 }
106 }
107 xhtml.endElement("p");
108 }
109 } catch (InvalidMidiDataException ignore) {
110 // There is no way to know whether this exception was
111 // caused by the document being corrupted or by the format
112 // just being unsupported. So we do nothing.
113 }
114
115 xhtml.endDocument();
116 }
117
118 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.chm;
17
18 import java.io.ByteArrayInputStream;
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.util.Arrays;
22 import java.util.Collections;
23 import java.util.HashSet;
24 import java.util.Iterator;
25 import java.util.Set;
26
27 import org.apache.tika.exception.TikaException;
28 import org.apache.tika.metadata.Metadata;
29 import org.apache.tika.mime.MediaType;
30 import org.apache.tika.parser.AbstractParser;
31 import org.apache.tika.parser.ParseContext;
32 import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
33 import org.apache.tika.parser.chm.core.ChmExtractor;
34 import org.apache.tika.parser.html.HtmlParser;
35 import org.apache.tika.sax.BodyContentHandler;
36 import org.apache.tika.sax.XHTMLContentHandler;
37 import org.xml.sax.ContentHandler;
38 import org.xml.sax.SAXException;
39
40 public class ChmParser extends AbstractParser {
41
42 /** Serial version UID */
43 private static final long serialVersionUID = 5938777307516469802L;
44
45 private static final Set<MediaType> SUPPORTED_TYPES =
46 Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
47 MediaType.application("vnd.ms-htmlhelp"),
48 MediaType.application("chm"),
49 MediaType.application("x-chm"))));
50
51 public Set<MediaType> getSupportedTypes(ParseContext context) {
52 return SUPPORTED_TYPES;
53 }
54
55 public void parse(InputStream stream, ContentHandler handler,
56 Metadata metadata, ParseContext context) throws IOException,
57 SAXException, TikaException {
58 ChmExtractor chmExtractor = new ChmExtractor(stream);
59
60 // metadata
61 metadata.set(Metadata.CONTENT_TYPE, "application/vnd.ms-htmlhelp");
62
63 // content
64 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
65 xhtml.startDocument();
66
67 Iterator<DirectoryListingEntry> it =
68 chmExtractor.getChmDirList().getDirectoryListingEntryList().iterator();
69 while (it.hasNext()) {
70 DirectoryListingEntry entry = it.next();
71 if (entry.getName().endsWith(".html") || entry.getName().endsWith(".htm")) {
72 xhtml.characters(extract(chmExtractor.extractChmEntry(entry)));
73 }
74 }
75
76 xhtml.endDocument();
77 }
78
79 /**
80 * Extracts data from byte[]
81 */
82 private String extract(byte[] byteObject) throws TikaException {// throws IOException
83 StringBuilder wBuf = new StringBuilder();
84 InputStream stream = null;
85 Metadata metadata = new Metadata();
86 HtmlParser htmlParser = new HtmlParser();
87 BodyContentHandler handler = new BodyContentHandler(-1);// -1
88 ParseContext parser = new ParseContext();
89 try {
90 stream = new ByteArrayInputStream(byteObject);
91 htmlParser.parse(stream, handler, metadata, parser);
92 wBuf.append(handler.toString()
93 + System.getProperty("line.separator"));
94 } catch (SAXException e) {
95 throw new RuntimeException(e);
96 } catch (IOException e) {
97 // Pushback overflow from tagsoup
98 }
99 return wBuf.toString();
100 }
101
102
103 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.chm.accessor;
17
18 import java.io.Serializable;
19
20 import org.apache.tika.exception.TikaException;
21
22 /**
23 *
24 * Defines an accessor interface
25 *
26 * @param <T>
27 */
28 public interface ChmAccessor<T> extends Serializable {
29 /**
30 * Parses chm accessor
31 *
32 * @param data
33 * chm file
34 * @param chmAccessor
35 * @throws TikaException
36 */
37 void parse(byte[] data, T chmAccessor) throws TikaException;
38 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.chm.accessor;
17
18 import java.math.BigInteger;
19 import java.util.ArrayList;
20 import java.util.List;
21
22 import org.apache.tika.exception.TikaException;
23 import org.apache.tika.parser.chm.core.ChmCommons;
24 import org.apache.tika.parser.chm.core.ChmConstants;
25
26 /**
27 * Holds chm listing entries
28 */
29 public class ChmDirectoryListingSet {
30 private List<DirectoryListingEntry> dlel;
31 private byte[] data;
32 private int placeHolder = -1;
33 private long dataOffset = -1;
34 private int controlDataIndex = -1;
35 private int resetTableIndex = -1;
36
37 private boolean isNotControlDataFound = true;
38 private boolean isNotResetTableFound = true;
39
40 /**
41 * Constructs chm directory listing set
42 *
43 * @param data
44 * byte[]
45 * @param chmItsHeader
46 * @param chmItspHeader
47 * @throws TikaException
48 */
49 public ChmDirectoryListingSet(byte[] data, ChmItsfHeader chmItsHeader,
50 ChmItspHeader chmItspHeader) throws TikaException {
51 setDirectoryListingEntryList(new ArrayList<DirectoryListingEntry>());
52 ChmCommons.assertByteArrayNotNull(data);
53 setData(data);
54 enumerateChmDirectoryListingList(chmItsHeader, chmItspHeader);
55 }
56
57 public String toString() {
58 StringBuilder sb = new StringBuilder();
59 sb.append("list:=" + getDirectoryListingEntryList().toString()
60 + System.getProperty("line.separator"));
61 sb.append("number of list items:="
62 + getDirectoryListingEntryList().size());
63 return sb.toString();
64 }
65
66 /**
67 * Returns control data index that located in List
68 *
69 * @return control data index
70 */
71 public int getControlDataIndex() {
72 return controlDataIndex;
73 }
74
75 /**
76 * Sets control data index
77 *
78 * @param controlDataIndex
79 */
80 protected void setControlDataIndex(int controlDataIndex) {
81 this.controlDataIndex = controlDataIndex;
82 }
83
84 /**
85 * Return index of reset table
86 *
87 * @return reset table index
88 */
89 public int getResetTableIndex() {
90 return resetTableIndex;
91 }
92
93 /**
94 * Sets reset table index
95 *
96 * @param resetTableIndex
97 */
98 protected void setResetTableIndex(int resetTableIndex) {
99 this.resetTableIndex = resetTableIndex;
100 }
101
102 /**
103 * Gets place holder
104 *
105 * @return place holder
106 */
107 private int getPlaceHolder() {
108 return placeHolder;
109 }
110
111 /**
112 * Sets place holder
113 *
114 * @param placeHolder
115 */
116 private void setPlaceHolder(int placeHolder) {
117 this.placeHolder = placeHolder;
118 }
119
120 /**
121 * Enumerates chm directory listing entries
122 *
123 * @param chmItsHeader
124 * chm itsf header
125 * @param chmItspHeader
126 * chm itsp header
127 */
128 private void enumerateChmDirectoryListingList(ChmItsfHeader chmItsHeader,
129 ChmItspHeader chmItspHeader) {
130 try {
131 int startPmgl = chmItspHeader.getIndex_head();
132 int stopPmgl = chmItspHeader.getUnknown_0024();
133 int dir_offset = (int) (chmItsHeader.getDirOffset() + chmItspHeader
134 .getHeader_len());
135 setDataOffset(chmItsHeader.getDataOffset());
136
137 /* loops over all pmgls */
138 int previous_index = 0;
139 byte[] dir_chunk = null;
140 for (int i = startPmgl; i <= stopPmgl; i++) {
141 int data_copied = ((1 + i) * (int) chmItspHeader.getBlock_len())
142 + dir_offset;
143 if (i == 0) {
144 dir_chunk = new byte[(int) chmItspHeader.getBlock_len()];
145 // dir_chunk = Arrays.copyOfRange(getData(), dir_offset,
146 // (((1+i) * (int)chmItspHeader.getBlock_len()) +
147 // dir_offset));
148 dir_chunk = ChmCommons
149 .copyOfRange(getData(), dir_offset,
150 (((1 + i) * (int) chmItspHeader
151 .getBlock_len()) + dir_offset));
152 previous_index = data_copied;
153 } else {
154 dir_chunk = new byte[(int) chmItspHeader.getBlock_len()];
155 // dir_chunk = Arrays.copyOfRange(getData(), previous_index,
156 // (((1+i) * (int)chmItspHeader.getBlock_len()) +
157 // dir_offset));
158 dir_chunk = ChmCommons
159 .copyOfRange(getData(), previous_index,
160 (((1 + i) * (int) chmItspHeader
161 .getBlock_len()) + dir_offset));
162 previous_index = data_copied;
163 }
164 enumerateOneSegment(dir_chunk);
165 dir_chunk = null;
166 }
167 } catch (Exception e) {
168 e.printStackTrace();
169 } finally {
170 setData(null);
171 }
172 }
173
174 /**
175 * Checks control data
176 *
177 * @param dle
178 * chm directory listing entry
179 */
180 private void checkControlData(DirectoryListingEntry dle) {
181 if (isNotControlDataFound) {
182 if (dle.getName().contains(ChmConstants.CONTROL_DATA)) {
183 setControlDataIndex(getDirectoryListingEntryList().size());
184 isNotControlDataFound = false;
185 }
186 }
187 }
188
189 /**
190 * Checks reset table
191 *
192 * @param dle
193 * chm directory listing entry
194 */
195 private void checkResetTable(DirectoryListingEntry dle) {
196 if (isNotResetTableFound) {
197 if (dle.getName().contains(ChmConstants.RESET_TABLE)) {
198 setResetTableIndex(getDirectoryListingEntryList().size());
199 isNotResetTableFound = false;
200 }
201 }
202 }
203
204 /**
205 * Enumerates chm directory listing entries in single chm segment
206 *
207 * @param dir_chunk
208 */
209 private void enumerateOneSegment(byte[] dir_chunk) {
210 try {
211 if (dir_chunk != null) {
212
213 int indexWorkData = ChmCommons.indexOf(dir_chunk,
214 "::".getBytes());
215 int indexUserData = ChmCommons.indexOf(dir_chunk,
216 "/".getBytes());
217
218 if (indexUserData < indexWorkData)
219 setPlaceHolder(indexUserData);
220 else
221 setPlaceHolder(indexWorkData);
222
223 if (getPlaceHolder() > 0
224 && dir_chunk[getPlaceHolder() - 1] != 115) {// #{
225 do {
226 if (dir_chunk[getPlaceHolder() - 1] > 0) {
227 DirectoryListingEntry dle = new DirectoryListingEntry();
228
229 // two cases: 1. when dir_chunk[getPlaceHolder() -
230 // 1] == 0x73
231 // 2. when dir_chunk[getPlaceHolder() + 1] == 0x2f
232 doNameCheck(dir_chunk, dle);
233
234 // dle.setName(new
235 // String(Arrays.copyOfRange(dir_chunk,
236 // getPlaceHolder(), (getPlaceHolder() +
237 // dle.getNameLength()))));
238 dle.setName(new String(ChmCommons.copyOfRange(
239 dir_chunk, getPlaceHolder(),
240 (getPlaceHolder() + dle.getNameLength()))));
241 checkControlData(dle);
242 checkResetTable(dle);
243 setPlaceHolder(getPlaceHolder()
244 + dle.getNameLength());
245
246 /* Sets entry type */
247 if (getPlaceHolder() < dir_chunk.length
248 && dir_chunk[getPlaceHolder()] == 0)
249 dle.setEntryType(ChmCommons.EntryType.UNCOMPRESSED);
250 else
251 dle.setEntryType(ChmCommons.EntryType.COMPRESSED);
252
253 setPlaceHolder(getPlaceHolder() + 1);
254 dle.setOffset(getEncint(dir_chunk));
255 dle.setLength(getEncint(dir_chunk));
256 getDirectoryListingEntryList().add(dle);
257 } else
258 setPlaceHolder(getPlaceHolder() + 1);
259
260 } while (hasNext(dir_chunk));
261 }
262 }
263
264 } catch (Exception e) {
265 e.printStackTrace();
266 }
267 }
268
269 /**
270 * Checks if a name and name length are correct. If not then handles it as
271 * follows: 1. when dir_chunk[getPlaceHolder() - 1] == 0x73 ('/') 2. when
272 * dir_chunk[getPlaceHolder() + 1] == 0x2f ('s')
273 *
274 * @param dir_chunk
275 * @param dle
276 */
277 private void doNameCheck(byte[] dir_chunk, DirectoryListingEntry dle) {
278 if (dir_chunk[getPlaceHolder() - 1] == 0x73) {
279 dle.setNameLength(dir_chunk[getPlaceHolder() - 1] & 0x21);
280 } else if (dir_chunk[getPlaceHolder() + 1] == 0x2f) {
281 dle.setNameLength(dir_chunk[getPlaceHolder()]);
282 setPlaceHolder(getPlaceHolder() + 1);
283 } else {
284 dle.setNameLength(dir_chunk[getPlaceHolder() - 1]);
285 }
286 }
287
288 /**
289 * Checks if it's possible move further on byte[]
290 *
291 * @param dir_chunk
292 *
293 * @return boolean
294 */
295 private boolean hasNext(byte[] dir_chunk) {
296 while (getPlaceHolder() < dir_chunk.length) {
297 if (dir_chunk[getPlaceHolder()] == 47
298 && dir_chunk[getPlaceHolder() + 1] != ':') {
299 setPlaceHolder(getPlaceHolder());
300 return true;
301 } else if (dir_chunk[getPlaceHolder()] == ':'
302 && dir_chunk[getPlaceHolder() + 1] == ':') {
303 setPlaceHolder(getPlaceHolder());
304 return true;
305 } else
306 setPlaceHolder(getPlaceHolder() + 1);
307 }
308 return false;
309 }
310
311 /**
312 * Returns encrypted integer
313 *
314 * @param data_chunk
315 *
316 * @return
317 */
318 private int getEncint(byte[] data_chunk) {
319 byte ob;
320 BigInteger bi = BigInteger.ZERO;
321 byte[] nb = new byte[1];
322
323 if (getPlaceHolder() < data_chunk.length) {
324 while ((ob = data_chunk[getPlaceHolder()]) < 0) {
325 nb[0] = (byte) ((ob & 0x7f));
326 bi = bi.shiftLeft(7).add(new BigInteger(nb));
327 setPlaceHolder(getPlaceHolder() + 1);
328 }
329 nb[0] = (byte) ((ob & 0x7f));
330 bi = bi.shiftLeft(7).add(new BigInteger(nb));
331 setPlaceHolder(getPlaceHolder() + 1);
332 }
333 return bi.intValue();
334 }
335
336 /**
337 * @param args
338 */
339 public static void main(String[] args) {
340 }
341
342 /**
343 * Sets chm directory listing entry list
344 *
345 * @param dlel
346 * chm directory listing entry list
347 */
348 public void setDirectoryListingEntryList(List<DirectoryListingEntry> dlel) {
349 this.dlel = dlel;
350 }
351
352 /**
353 * Returns chm directory listing entry list
354 *
355 * @return List<DirectoryListingEntry>
356 */
357 public List<DirectoryListingEntry> getDirectoryListingEntryList() {
358 return dlel;
359 }
360
361 /**
362 * Sets data
363 *
364 * @param data
365 */
366 private void setData(byte[] data) {
367 this.data = data;
368 }
369
370 /**
371 * Returns data
372 *
373 * @return
374 */
375 private byte[] getData() {
376 return data;
377 }
378
379 /**
380 * Sets data offset
381 *
382 * @param dataOffset
383 */
384 private void setDataOffset(long dataOffset) {
385 this.dataOffset = dataOffset;
386 }
387
388 /**
389 * Returns data offset
390 *
391 * @return dataOffset
392 */
393 public long getDataOffset() {
394 return dataOffset;
395 }
396 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.chm.accessor;
17
18 import java.math.BigInteger;
19
20 import org.apache.tika.exception.TikaException;
21 import org.apache.tika.parser.chm.assertion.ChmAssert;
22 import org.apache.tika.parser.chm.core.ChmConstants;
23 import org.apache.tika.parser.chm.exception.ChmParsingException;
24
25 /**
26 * The Header 0000: char[4] 'ITSF' 0004: DWORD 3 (Version number) 0008: DWORD
27 * Total header length, including header section table and following data. 000C:
28 * DWORD 1 (unknown) 0010: DWORD a timestamp 0014: DWORD Windows Language ID
29 * 0018: GUID {7C01FD10-7BAA-11D0-9E0C-00A0-C922-E6EC} 0028: GUID
30 * {7C01FD11-7BAA-11D0-9E0C-00A0-C922-E6EC} Note: a GUID is $10 bytes, arranged
31 * as 1 DWORD, 2 WORDs, and 8 BYTEs. 0000: QWORD Offset of section from
32 * beginning of file 0008: QWORD Length of section Following the header section
33 * table is 8 bytes of additional header data. In Version 2 files, this data is
34 * not there and the content section starts immediately after the directory.
35 *
36 * {@link http
37 * ://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original
38 * /?show-translation-form=1}
39 *
40 */
41 /* structure of ITSF headers */
42 public class ChmItsfHeader implements ChmAccessor<ChmItsfHeader> {
43 private static final long serialVersionUID = 2215291838533213826L;
44 private byte[] signature = new String("ITSF").getBytes(); /* 0 (ITSF) */
45 private int version; /* 4 */
46 private int header_len; /* 8 */
47 private int unknown_000c; /* c */
48 private long last_modified; /* 10 */
49 private long lang_id; /* 14 */
50 private byte[] dir_uuid = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* 18 */
51 private byte[] stream_uuid = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* 28 */
52 private long unknown_offset; /* 38 */
53 private long unknown_len; /* 40 */
54 private long dir_offset; /* 48 */
55 private long dir_len; /* 50 */
56 private long data_offset; /* 58 (Not present before V3) */
57
58 /* local usage */
59 private int dataRemained;
60 private int currentPlace = 0;
61
62 /**
63 * Prints the values of ChmfHeader
64 */
65 public String toString() {
66 StringBuilder sb = new StringBuilder();
67 sb.append(new String(getSignature()) + " ");
68 sb.append(getVersion() + " ");
69 sb.append(getHeaderLen() + " ");
70 sb.append(getUnknown_000c() + " ");
71 sb.append(getLastModified() + " ");
72 sb.append(getLangId() + " ");
73 sb.append(getDir_uuid() + " ");
74 sb.append(getStream_uuid() + " ");
75 sb.append(getUnknownOffset() + " ");
76 sb.append(getUnknownLen() + " ");
77 sb.append(getDirOffset() + " ");
78 sb.append(getDirLen() + " ");
79 sb.append(getDataOffset() + " ");
80 return sb.toString();
81 }
82
83 /**
84 * Returns a signature of itsf header
85 *
86 * @return itsf header
87 */
88 public byte[] getSignature() {
89 return signature;
90 }
91
92 /**
93 * Sets itsf header signature
94 *
95 * @param signature
96 */
97 protected void setSignature(byte[] signature) {
98 this.signature = signature;
99 }
100
101 /**
102 * Returns itsf header version
103 *
104 * @return itsf version
105 */
106 public int getVersion() {
107 return version;
108 }
109
110 /**
111 * Sets itsf version
112 *
113 * @param version
114 */
115 protected void setVersion(int version) {
116 this.version = version;
117 }
118
119 /**
120 * Returns itsf header length
121 *
122 * @return length
123 */
124 public int getHeaderLen() {
125 return header_len;
126 }
127
128 /**
129 * Sets itsf header length
130 *
131 * @param header_len
132 */
133 protected void setHeaderLen(int header_len) {
134 this.header_len = header_len;
135 }
136
137 /**
138 * Returns unknown_00c value
139 *
140 * @return unknown_00c
141 */
142 public int getUnknown_000c() {
143 return unknown_000c;
144 }
145
146 /**
147 * Sets unknown_00c
148 *
149 * @param unknown_000c
150 */
151 protected void setUnknown_000c(int unknown_000c) {
152 this.unknown_000c = unknown_000c;
153 }
154
155 /**
156 * Returns last modified date of the chm file
157 *
158 * @return last modified date as long
159 */
160 public long getLastModified() {
161 return last_modified;
162 }
163
164 /**
165 * Sets last modified date of the chm file
166 *
167 * @param last_modified
168 */
169 protected void setLastModified(long last_modified) {
170 this.last_modified = last_modified;
171 }
172
173 /**
174 * Returns language ID
175 *
176 * @return language_id
177 */
178 public long getLangId() {
179 return lang_id;
180 }
181
182 /**
183 * Sets language_id
184 *
185 * @param lang_id
186 */
187 protected void setLangId(long lang_id) {
188 this.lang_id = lang_id;
189 }
190
191 /**
192 * Returns directory uuid
193 *
194 * @return dir_uuid
195 */
196 public byte[] getDir_uuid() {
197 return dir_uuid;
198 }
199
200 /**
201 * Sets directory uuid
202 *
203 * @param dir_uuid
204 */
205 protected void setDir_uuid(byte[] dir_uuid) {
206 this.dir_uuid = dir_uuid;
207 }
208
209 /**
210 * Returns stream uuid
211 *
212 * @return stream_uuid
213 */
214 public byte[] getStream_uuid() {
215 return stream_uuid;
216 }
217
218 /**
219 * Sets stream uuid
220 *
221 * @param stream_uuid
222 */
223 protected void setStream_uuid(byte[] stream_uuid) {
224 this.stream_uuid = stream_uuid;
225 }
226
227 /**
228 * Returns unknown offset
229 *
230 * @return unknown_offset
231 */
232 public long getUnknownOffset() {
233 return unknown_offset;
234 }
235
236 /**
237 * Sets unknown offset
238 *
239 * @param unknown_offset
240 */
241 protected void setUnknownOffset(long unknown_offset) {
242 this.unknown_offset = unknown_offset;
243 }
244
245 /**
246 * Returns unknown length
247 *
248 * @return unknown_length
249 */
250 public long getUnknownLen() {
251 return unknown_len;
252 }
253
254 /**
255 * Sets unknown length
256 *
257 * @param unknown_len
258 */
259 protected void setUnknownLen(long unknown_len) {
260 this.unknown_len = unknown_len;
261 }
262
263 /**
264 * Returns directory offset
265 *
266 * @return directory_offset
267 */
268 public long getDirOffset() {
269 return dir_offset;
270 }
271
272 /**
273 * Sets directory offset
274 *
275 * @param dir_offset
276 */
277 protected void setDirOffset(long dir_offset) {
278 this.dir_offset = dir_offset;
279 }
280
281 /**
282 * Returns directory length
283 *
284 * @return directory_offset
285 */
286 public long getDirLen() {
287 return dir_len;
288 }
289
290 /**
291 * Sets directory length
292 *
293 * @param dir_len
294 */
295 protected void setDirLen(long dir_len) {
296 this.dir_len = dir_len;
297 }
298
299 /**
300 * Returns data offset
301 *
302 * @return data_offset
303 */
304 public long getDataOffset() {
305 return data_offset;
306 }
307
308 /**
309 * Sets data offset
310 *
311 * @param data_offset
312 */
313 protected void setDataOffset(long data_offset) {
314 this.data_offset = data_offset;
315 }
316
317 /**
318 * Copies 4 first bytes of the byte[]
319 *
320 * @param data
321 * @param chmItsfHeader
322 * @param count
323 * @throws TikaException
324 */
325 private void unmarshalCharArray(byte[] data, ChmItsfHeader chmItsfHeader,
326 int count) throws TikaException {
327 ChmAssert.assertChmAccessorParameters(data, chmItsfHeader, count);
328 System.arraycopy(data, 0, chmItsfHeader.signature, 0, count);
329 this.setCurrentPlace(this.getCurrentPlace() + count);
330 this.setDataRemained(this.getDataRemained() - count);
331 }
332
333 /**
334 * Copies X bytes of source byte[] to the dest byte[]
335 *
336 * @param data
337 * @param dest
338 * @param count
339 * @return
340 */
341 private byte[] unmarshalUuid(byte[] data, byte[] dest, int count) {
342 System.arraycopy(data, this.getCurrentPlace(), dest, 0, count);
343 this.setCurrentPlace(this.getCurrentPlace() + count);
344 this.setDataRemained(this.getDataRemained() - count);
345 return dest;
346 }
347
348 /**
349 * Takes 8 bytes and reverses them
350 *
351 * @param data
352 * @param dest
353 * @return
354 * @throws TikaException
355 */
356 private long unmarshalUint64(byte[] data, long dest) throws TikaException{
357 byte[] temp = new byte[8];
358 int i, j;
359
360 if (8 > this.getDataRemained())
361 throw new TikaException("8 > this.getDataRemained()");
362
363 for (i = 8, j = 7; i > 0; i--) {
364 temp[j--] = data[this.getCurrentPlace()];
365 this.setCurrentPlace(this.getCurrentPlace() + 1);
366 }
367
368 dest = new BigInteger(temp).longValue();
369 this.setDataRemained(this.getDataRemained() - 8);
370 return dest;
371 }
372
373 private int unmarshalInt32(byte[] data, int dest) throws TikaException{
374 ChmAssert.assertByteArrayNotNull(data);
375
376 if (4 > this.getDataRemained())
377 throw new TikaException("4 > dataLenght");
378 dest = data[this.getCurrentPlace()]
379 | data[this.getCurrentPlace() + 1] << 8
380 | data[this.getCurrentPlace() + 2] << 16
381 | data[this.getCurrentPlace() + 3] << 24;
382
383 this.setCurrentPlace(this.getCurrentPlace() + 4);
384 this.setDataRemained(this.getDataRemained() - 4);
385 return dest;
386 }
387
388 private long unmarshalUInt32(byte[] data, long dest) throws TikaException{
389 ChmAssert.assertByteArrayNotNull(data);
390 if (4 > getDataRemained())
391 throw new TikaException("4 > dataLenght");
392 dest = data[this.getCurrentPlace()]
393 | data[this.getCurrentPlace() + 1] << 8
394 | data[this.getCurrentPlace() + 2] << 16
395 | data[this.getCurrentPlace() + 3] << 24;
396
397 setDataRemained(this.getDataRemained() - 4);
398 this.setCurrentPlace(this.getCurrentPlace() + 4);
399 return dest;
400 }
401
402 public static void main(String[] args) {
403 }
404
405 /**
406 * Sets data remained to be processed
407 *
408 * @param dataRemained
409 */
410 private void setDataRemained(int dataRemained) {
411 this.dataRemained = dataRemained;
412 }
413
414 /**
415 * Returns data remained
416 *
417 * @return data_remainned
418 */
419 private int getDataRemained() {
420 return dataRemained;
421 }
422
423 /**
424 * Sets current place in the byte[]
425 *
426 * @param currentPlace
427 */
428 private void setCurrentPlace(int currentPlace) {
429 this.currentPlace = currentPlace;
430 }
431
432 /**
433 * Returns current place in the byte[]
434 *
435 * @return current place
436 */
437 private int getCurrentPlace() {
438 return currentPlace;
439 }
440
441 // @Override
442 public void parse(byte[] data, ChmItsfHeader chmItsfHeader) throws TikaException {
443 if (data.length < ChmConstants.CHM_ITSF_V2_LEN
444 || data.length > ChmConstants.CHM_ITSF_V3_LEN)
445 throw new TikaException("we only know how to deal with the 0x58 and 0x60 byte structures");
446
447 chmItsfHeader.setDataRemained(data.length);
448 chmItsfHeader.unmarshalCharArray(data, chmItsfHeader, ChmConstants.CHM_SIGNATURE_LEN);
449 chmItsfHeader.setVersion(chmItsfHeader.unmarshalInt32(data, chmItsfHeader.getVersion()));
450 chmItsfHeader.setHeaderLen(chmItsfHeader.unmarshalInt32(data, chmItsfHeader.getHeaderLen()));
451 chmItsfHeader.setUnknown_000c(chmItsfHeader.unmarshalInt32(data, chmItsfHeader.getUnknown_000c()));
452 chmItsfHeader.setLastModified(chmItsfHeader.unmarshalUInt32(data, chmItsfHeader.getLastModified()));
453 chmItsfHeader.setLangId(chmItsfHeader.unmarshalUInt32(data, chmItsfHeader.getLangId()));
454 chmItsfHeader.setDir_uuid(chmItsfHeader.unmarshalUuid(data, chmItsfHeader.getDir_uuid(), 16));
455 chmItsfHeader.setStream_uuid(chmItsfHeader.unmarshalUuid(data, chmItsfHeader.getStream_uuid(), 16));
456 chmItsfHeader.setUnknownOffset(chmItsfHeader.unmarshalUint64(data, chmItsfHeader.getUnknownOffset()));
457 chmItsfHeader.setUnknownLen(chmItsfHeader.unmarshalUint64(data, chmItsfHeader.getUnknownLen()));
458 chmItsfHeader.setDirOffset(chmItsfHeader.unmarshalUint64(data, chmItsfHeader.getDirOffset()));
459 chmItsfHeader.setDirLen(chmItsfHeader.unmarshalUint64(data, chmItsfHeader.getDirLen()));
460
461 if (!new String(chmItsfHeader.getSignature()).equals(ChmConstants.ITSF))
462 throw new TikaException("seems not valid file");
463 if (chmItsfHeader.getVersion() == ChmConstants.CHM_VER_2) {
464 if (chmItsfHeader.getHeaderLen() < ChmConstants.CHM_ITSF_V2_LEN)
465 throw new TikaException("something wrong with header");
466 } else if (chmItsfHeader.getVersion() == ChmConstants.CHM_VER_3) {
467 if (chmItsfHeader.getHeaderLen() < ChmConstants.CHM_ITSF_V3_LEN)
468 throw new TikaException("unknown v3 header lenght");
469 } else
470 throw new ChmParsingException("unsupported chm format");
471
472 /*
473 * now, if we have a V3 structure, unmarshal the rest, otherwise,
474 * compute it
475 */
476 if (chmItsfHeader.getVersion() == ChmConstants.CHM_VER_3) {
477 if (chmItsfHeader.getDataRemained() >= 0)
478 chmItsfHeader.setDataOffset(chmItsfHeader.getDirOffset()
479 + chmItsfHeader.getDirLen());
480 else
481 throw new TikaException("cannot set data offset, no data remained");
482 } else
483 chmItsfHeader.setDataOffset(chmItsfHeader.getDirOffset()
484 + chmItsfHeader.getDirLen());
485 }
486 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.chm.accessor;
17
18 import org.apache.tika.exception.TikaException;
19 import org.apache.tika.parser.chm.assertion.ChmAssert;
20 import org.apache.tika.parser.chm.core.ChmCommons;
21 import org.apache.tika.parser.chm.core.ChmConstants;
22 import org.apache.tika.parser.chm.exception.ChmParsingException;
23
24 /**
25 * Directory header The directory starts with a header; its format is as
26 * follows: 0000: char[4] 'ITSP' 0004: DWORD Version number 1 0008: DWORD Length
27 * of the directory header 000C: DWORD $0a (unknown) 0010: DWORD $1000 Directory
28 * chunk size 0014: DWORD "Density" of quickref section, usually 2 0018: DWORD
29 * Depth of the index tree - 1 there is no index, 2 if there is one level of
30 * PMGI chunks 001C: DWORD Chunk number of root index chunk, -1 if there is none
31 * (though at least one file has 0 despite there being no index chunk, probably
32 * a bug) 0020: DWORD Chunk number of first PMGL (listing) chunk 0024: DWORD
33 * Chunk number of last PMGL (listing) chunk 0028: DWORD -1 (unknown) 002C:
34 * DWORD Number of directory chunks (total) 0030: DWORD Windows language ID
35 * 0034: GUID {5D02926A-212E-11D0-9DF9-00A0C922E6EC} 0044: DWORD $54 (This is
36 * the length again) 0048: DWORD -1 (unknown) 004C: DWORD -1 (unknown) 0050:
37 * DWORD -1 (unknown)
38 *
39 * {@link http
40 * ://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original
41 * /?show-translation-form=1}
42 *
43 */
44 public class ChmItspHeader implements ChmAccessor<ChmItspHeader> {
45 // TODO: refactor all unmarshals
46 private static final long serialVersionUID = 1962394421998181341L;
47 private byte[] signature = new String(ChmConstants.ITSP).getBytes(); /*
48 * 0
49 * (ITSP
50 * )
51 */
52 private int version; /* 4 */
53 private int header_len; /* 8 */
54 private int unknown_000c; /* c */
55 private long block_len; /* 10 */
56 private int blockidx_intvl; /* 14 */
57 private int index_depth; /* 18 */
58 private int index_root; /* 1c */
59 private int index_head; /* 20 */
60 private int unknown_0024; /* 24 */
61 private long num_blocks; /* 28 */
62 private int unknown_002c; /* 2c */
63 private long lang_id; /* 30 */
64 private byte[] system_uuid = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* 34 */
65 private byte[] unknown_0044 = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* 44 */
66
67 /* local usage */
68 private int dataRemained;
69 private int currentPlace = 0;
70
71 public String toString() {
72 StringBuilder sb = new StringBuilder();
73 sb.append("[ signature:=" + new String(getSignature())
74 + System.getProperty("line.separator"));
75 sb.append("version:=\t" + getVersion()
76 + System.getProperty("line.separator"));
77 sb.append("header_len:=\t" + getHeader_len()
78 + System.getProperty("line.separator"));
79 sb.append("unknown_00c:=\t" + getUnknown_000c()
80 + System.getProperty("line.separator"));
81 sb.append("block_len:=\t" + getBlock_len() + " [directory chunk size]"
82 + System.getProperty("line.separator"));
83 sb.append("blockidx_intvl:=" + getBlockidx_intvl()
84 + ", density of quickref section, usually 2"
85 + System.getProperty("line.separator"));
86 sb.append("index_depth:=\t"
87 + getIndex_depth()
88 + ", depth of the index tree - 1 there is no index, 2 if there is one level of PMGI chunk"
89 + System.getProperty("line.separator"));
90 sb.append("index_root:=\t" + getIndex_root()
91 + ", chunk number of root index chunk, -1 if there is none"
92 + System.getProperty("line.separator"));
93 sb.append("index_head:=\t" + getIndex_head()
94 + ", chunk number of first PMGL (listing) chunk"
95 + System.getProperty("line.separator"));
96 sb.append("unknown_0024:=\t" + getUnknown_0024()
97 + ", chunk number of last PMGL (listing) chunk"
98 + System.getProperty("line.separator"));
99 sb.append("num_blocks:=\t" + getNum_blocks() + ", -1 (unknown)"
100 + System.getProperty("line.separator"));
101 sb.append("unknown_002c:=\t" + getUnknown_002c()
102 + ", number of directory chunks (total)"
103 + System.getProperty("line.separator"));
104 sb.append("lang_id:=\t" + getLang_id() + " - "
105 + ChmCommons.getLanguage(getLang_id())
106 + System.getProperty("line.separator"));
107 sb.append("system_uuid:=" + getSystem_uuid()
108 + System.getProperty("line.separator"));
109 sb.append("unknown_0044:=" + getUnknown_0044() + " ]");
110 return sb.toString();
111 }
112
113 /**
114 * Copies 4 bits from data[]
115 *
116 * @param data
117 * @param chmItspHeader
118 * @param count
119 * @throws TikaException
120 */
121 private void unmarshalCharArray(byte[] data, ChmItspHeader chmItspHeader,
122 int count) throws TikaException {
123 ChmAssert.assertByteArrayNotNull(data);
124 ChmAssert.assertChmAccessorNotNull(chmItspHeader);
125 this.setDataRemained(data.length);
126 System.arraycopy(data, 0, chmItspHeader.signature, 0, count);
127 this.setCurrentPlace(this.getCurrentPlace() + count);
128 this.setDataRemained(this.getDataRemained() - count);
129 }
130
131 private int unmarshalInt32(byte[] data, int dataLenght, int dest) throws TikaException {
132 ChmAssert.assertByteArrayNotNull(data);
133 if (4 > this.getDataRemained())
134 throw new TikaException("4 > dataLenght");
135 dest = data[this.getCurrentPlace()]
136 | data[this.getCurrentPlace() + 1] << 8
137 | data[this.getCurrentPlace() + 2] << 16
138 | data[this.getCurrentPlace() + 3] << 24;
139
140 this.setCurrentPlace(this.getCurrentPlace() + 4);
141 this.setDataRemained(this.getDataRemained() - 4);
142 return dest;
143 }
144
145 private long unmarshalUInt32(byte[] data, int dataLenght, long dest) throws TikaException {
146 ChmAssert.assertByteArrayNotNull(data);
147 if (4 > dataLenght)
148 throw new TikaException("4 > dataLenght");
149 dest = data[this.getCurrentPlace()]
150 | data[this.getCurrentPlace() + 1] << 8
151 | data[this.getCurrentPlace() + 2] << 16
152 | data[this.getCurrentPlace() + 3] << 24;
153
154 setDataRemained(this.getDataRemained() - 4);
155 this.setCurrentPlace(this.getCurrentPlace() + 4);
156 return dest;
157 }
158
159 private byte[] unmarshalUuid(byte[] data, int dataLenght, byte[] dest,
160 int count) {
161 System.arraycopy(data, this.getCurrentPlace(), dest, 0, count);
162 this.setCurrentPlace(this.getCurrentPlace() + count);
163 this.setDataRemained(this.getDataRemained() - count);
164 return dest;
165 }
166
167 /**
168 * Returns how many bytes remained
169 *
170 * @return int
171 */
172 private int getDataRemained() {
173 return dataRemained;
174 }
175
176 /**
177 * Sets how many bytes remained
178 *
179 * @param dataRemained
180 */
181 private void setDataRemained(int dataRemained) {
182 this.dataRemained = dataRemained;
183 }
184
185 /**
186 * Returns a place holder
187 *
188 * @return current place
189 */
190 private int getCurrentPlace() {
191 return currentPlace;
192 }
193
194 /**
195 * Sets current place
196 *
197 * @param currentPlace
198 */
199 private void setCurrentPlace(int currentPlace) {
200 this.currentPlace = currentPlace;
201 }
202
203 /**
204 * Returns a signature of the header
205 *
206 * @return itsp signature
207 */
208 public byte[] getSignature() {
209 return signature;
210 }
211
212 /**
213 * Sets itsp signature
214 *
215 * @param signature
216 */
217 protected void setSignature(byte[] signature) {
218 this.signature = signature;
219 }
220
221 /**
222 * Returns version of itsp header
223 *
224 * @return version
225 */
226 public int getVersion() {
227 return version;
228 }
229
230 /**
231 * Sets a version of itsp header
232 *
233 * @param version
234 */
235 protected void setVersion(int version) {
236 this.version = version;
237 }
238
239 /**
240 * Returns header length
241 *
242 * @return header length
243 */
244 public int getHeader_len() {
245 return header_len;
246 }
247
248 /**
249 * Sets itsp header length
250 *
251 * @param header_len
252 */
253 protected void setHeader_len(int header_len) {
254 this.header_len = header_len;
255 }
256
257 /**
258 * Returns 000c unknown bytes
259 */
260 public int getUnknown_000c() {
261 return unknown_000c;
262 }
263
264 /**
265 * Sets 000c unknown bytes Unknown means here that those guys who cracked
266 * the chm format do not know what's it purposes for
267 *
268 * @param unknown_000c
269 */
270 protected void setUnknown_000c(int unknown_000c) {
271 this.unknown_000c = unknown_000c;
272 }
273
274 /**
275 * Returns block's length
276 *
277 * @return block_length
278 */
279 public long getBlock_len() {
280 return block_len;
281 }
282
283 /**
284 * Sets block length
285 *
286 * @param block_len
287 */
288 protected void setBlock_len(long block_len) {
289 this.block_len = block_len;
290 }
291
292 /**
293 * Returns block index interval
294 *
295 * @return blockidx_intvl
296 */
297 public int getBlockidx_intvl() {
298 return blockidx_intvl;
299 }
300
301 /**
302 * Sets block index interval
303 *
304 * @param blockidx_intvl
305 */
306 protected void setBlockidx_intvl(int blockidx_intvl) {
307 this.blockidx_intvl = blockidx_intvl;
308 }
309
310 /**
311 * Returns an index depth
312 *
313 * @return index_depth
314 */
315 public int getIndex_depth() {
316 return index_depth;
317 }
318
319 /**
320 * Sets an index depth
321 *
322 * @param index_depth
323 */
324 protected void setIndex_depth(int index_depth) {
325 this.index_depth = index_depth;
326 }
327
328 /**
329 * Returns index root
330 *
331 * @return index_root
332 */
333 public int getIndex_root() {
334 return index_root;
335 }
336
337 /**
338 * Sets an index root
339 *
340 * @param index_root
341 */
342 protected void setIndex_root(int index_root) {
343 this.index_root = index_root;
344 }
345
346 /**
347 * Returns an index head
348 *
349 * @return index_head
350 */
351 public int getIndex_head() {
352 return index_head;
353 }
354
355 /**
356 * Sets an index head
357 *
358 * @param index_head
359 */
360 protected void setIndex_head(int index_head) {
361 this.index_head = index_head;
362 }
363
364 /**
365 * Returns 0024 unknown bytes
366 *
367 * @return unknown_0024
368 */
369 public int getUnknown_0024() {
370 return unknown_0024;
371 }
372
373 /**
374 * Sets 0024 unknown bytes
375 *
376 * @param unknown_0024
377 */
378 protected void setUnknown_0024(int unknown_0024) {
379 this.unknown_0024 = unknown_0024;
380 }
381
382 /**
383 * Returns number of blocks
384 *
385 * @return num_blocks
386 */
387 public long getNum_blocks() {
388 return num_blocks;
389 }
390
391 /**
392 * Sets number of blocks containing in the chm file
393 *
394 * @param num_blocks
395 */
396 protected void setNum_blocks(long num_blocks) {
397 this.num_blocks = num_blocks;
398 }
399
400 /**
401 * Returns 002c unknown bytes
402 *
403 * @return unknown_002c
404 */
405 public int getUnknown_002c() {
406 return unknown_002c;
407 }
408
409 /**
410 * Sets 002c unknown bytes
411 *
412 * @param unknown_002c
413 */
414 protected void setUnknown_002c(int unknown_002c) {
415 this.unknown_002c = unknown_002c;
416 }
417
418 /**
419 * Returns language id
420 *
421 * @return lang_id
422 */
423 public long getLang_id() {
424 return lang_id;
425 }
426
427 /**
428 * Sets language id
429 *
430 * @param lang_id
431 */
432 protected void setLang_id(long lang_id) {
433 this.lang_id = lang_id;
434 }
435
436 /**
437 * Returns system uuid
438 *
439 * @return system_uuid
440 */
441 public byte[] getSystem_uuid() {
442 return system_uuid;
443 }
444
445 /**
446 * Sets system uuid
447 *
448 * @param system_uuid
449 */
450 protected void setSystem_uuid(byte[] system_uuid) {
451 this.system_uuid = system_uuid;
452 }
453
454 /**
455 * Returns 0044 unknown bytes
456 *
457 * @return unknown_0044
458 */
459 public byte[] getUnknown_0044() {
460 return unknown_0044;
461 }
462
463 /**
464 * Sets 0044 unknown bytes
465 *
466 * @param unknown_0044
467 */
468 protected void setUnknown_0044(byte[] unknown_0044) {
469 this.unknown_0044 = unknown_0044;
470 }
471
472 // @Override
473 public void parse(byte[] data, ChmItspHeader chmItspHeader) throws TikaException {
474 /* we only know how to deal with the 0x58 and 0x60 byte structures */
475 if (data.length != ChmConstants.CHM_ITSP_V1_LEN)
476 throw new ChmParsingException("we only know how to deal with the 0x58 and 0x60 byte structures");
477
478 /* unmarshal common fields */
479 chmItspHeader.unmarshalCharArray(data, chmItspHeader, ChmConstants.CHM_SIGNATURE_LEN);
480 // ChmCommons.unmarshalCharArray(data, chmItspHeader,
481 // ChmConstants.CHM_SIGNATURE_LEN);
482 chmItspHeader.setVersion(chmItspHeader.unmarshalInt32(data,
483 chmItspHeader.getDataRemained(), chmItspHeader.getVersion()));
484 chmItspHeader
485 .setHeader_len(chmItspHeader.unmarshalInt32(data,
486 chmItspHeader.getDataRemained(),
487 chmItspHeader.getHeader_len()));
488 chmItspHeader.setUnknown_000c(chmItspHeader.unmarshalInt32(data,
489 chmItspHeader.getDataRemained(),
490 chmItspHeader.getUnknown_000c()));
491 chmItspHeader.setBlock_len(chmItspHeader.unmarshalUInt32(data,
492 chmItspHeader.getDataRemained(), chmItspHeader.getBlock_len()));
493 chmItspHeader.setBlockidx_intvl(chmItspHeader.unmarshalInt32(data,
494 chmItspHeader.getDataRemained(),
495 chmItspHeader.getBlockidx_intvl()));
496 chmItspHeader
497 .setIndex_depth(chmItspHeader.unmarshalInt32(data,
498 chmItspHeader.getDataRemained(),
499 chmItspHeader.getIndex_depth()));
500 chmItspHeader
501 .setIndex_root(chmItspHeader.unmarshalInt32(data,
502 chmItspHeader.getDataRemained(),
503 chmItspHeader.getIndex_root()));
504 chmItspHeader
505 .setIndex_head(chmItspHeader.unmarshalInt32(data,
506 chmItspHeader.getDataRemained(),
507 chmItspHeader.getIndex_head()));
508 chmItspHeader.setUnknown_0024(chmItspHeader.unmarshalInt32(data,
509 chmItspHeader.getDataRemained(),
510 chmItspHeader.getUnknown_0024()));
511 chmItspHeader
512 .setNum_blocks(chmItspHeader.unmarshalUInt32(data,
513 chmItspHeader.getDataRemained(),
514 chmItspHeader.getNum_blocks()));
515 chmItspHeader.setUnknown_002c((chmItspHeader.unmarshalInt32(data,
516 chmItspHeader.getDataRemained(),
517 chmItspHeader.getUnknown_002c())));
518 chmItspHeader.setLang_id(chmItspHeader.unmarshalUInt32(data,
519 chmItspHeader.getDataRemained(), chmItspHeader.getLang_id()));
520 chmItspHeader
521 .setSystem_uuid(chmItspHeader.unmarshalUuid(data,
522 chmItspHeader.getDataRemained(),
523 chmItspHeader.getSystem_uuid(),
524 ChmConstants.BYTE_ARRAY_LENGHT));
525 chmItspHeader
526 .setUnknown_0044(chmItspHeader.unmarshalUuid(data,
527 chmItspHeader.getDataRemained(),
528 chmItspHeader.getUnknown_0044(),
529 ChmConstants.BYTE_ARRAY_LENGHT));
530
531 /* Checks validity of the itsp header */
532 if (!new String(chmItspHeader.getSignature()).equals(ChmConstants.ITSP))
533 throw new ChmParsingException("seems not valid signature");
534
535 if (chmItspHeader.getVersion() != ChmConstants.CHM_VER_1)
536 throw new ChmParsingException("!=ChmConstants.CHM_VER_1");
537
538 if (chmItspHeader.getHeader_len() != ChmConstants.CHM_ITSP_V1_LEN)
539 throw new ChmParsingException("!= ChmConstants.CHM_ITSP_V1_LEN");
540 }
541
542 /**
543 * @param args
544 */
545 public static void main(String[] args) {
546 }
547 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.chm.accessor;
17
18 import org.apache.tika.exception.TikaException;
19 import org.apache.tika.parser.chm.assertion.ChmAssert;
20 import org.apache.tika.parser.chm.core.ChmConstants;
21 import org.apache.tika.parser.chm.exception.ChmParsingException;
22
23 /**
24 *
25 * ::DataSpace/Storage/<SectionName>/ControlData This file contains $20 bytes of
26 * information on the compression. The information is partially known: 0000:
27 * DWORD 6 (unknown) 0004: ASCII 'LZXC' Compression type identifier 0008: DWORD
28 * 2 (Possibly numeric code for LZX) 000C: DWORD The Huffman reset interval in
29 * $8000-byte blocks 0010: DWORD The window size in $8000-byte blocks 0014:
30 * DWORD unknown (sometimes 2, sometimes 1, sometimes 0) 0018: DWORD 0 (unknown)
31 * 001C: DWORD 0 (unknown)
32 *
33 * {@link http
34 * ://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original
35 * /?page=2 }
36 *
37 */
38 public class ChmLzxcControlData implements ChmAccessor<ChmLzxcControlData> {
39 private static final long serialVersionUID = -7897854774939631565L;
40 /* class' members */
41 private long size; /* 0 */
42 private byte[] signature = new String(ChmConstants.LZXC).getBytes(); /*
43 * 4
44 * (LZXC
45 * )
46 */
47 private long version; /* 8 */
48 private long resetInterval; /* c */
49 private long windowSize; /* 10 */
50 private long windowsPerReset; /* 14 */
51 private long unknown_18; /* 18 */
52
53 /* local usage */
54 private int dataRemained;
55 private int currentPlace = 0;
56
57 /**
58 * Returns a remained data
59 *
60 * @return dataRemained
61 */
62 private int getDataRemained() {
63 return dataRemained;
64 }
65
66 /**
67 * Sets a remained data
68 *
69 * @param dataRemained
70 */
71 private void setDataRemained(int dataRemained) {
72 this.dataRemained = dataRemained;
73 }
74
75 /**
76 * Returns a place holder
77 *
78 * @return current_place
79 */
80 private int getCurrentPlace() {
81 return currentPlace;
82 }
83
84 /**
85 * Sets a place holder
86 *
87 * @param current_place
88 */
89 private void setCurrentPlace(int currentPlace) {
90 this.currentPlace = currentPlace;
91 }
92
93 /**
94 * Returns a size of control data
95 *
96 * @return size
97 */
98 public long getSize() {
99 return size;
100 }
101
102 /**
103 * Sets a size of control data
104 *
105 * @param size
106 */
107 protected void setSize(long size) {
108 this.size = size;
109 }
110
111 /**
112 * Returns a signature of control data block
113 *
114 * @return signature
115 */
116 public byte[] getSignature() {
117 return signature;
118 }
119
120 /**
121 * Sets a signature of control data block
122 *
123 * @param signature
124 */
125 protected void setSignature(byte[] signature) {
126 this.signature = signature;
127 }
128
129 /**
130 * Returns a version of control data block
131 *
132 * @return version
133 */
134 public long getVersion() {
135 return version;
136 }
137
138 /**
139 * Sets version of control data block
140 *
141 * @param version
142 */
143 protected void setVersion(long version) {
144 this.version = version;
145 }
146
147 /**
148 * Returns reset interval
149 *
150 * @return reset_interval
151 */
152 public long getResetInterval() {
153 return resetInterval;
154 }
155
156 /**
157 * Sets a reset interval
158 *
159 * @param resetInterval
160 */
161 protected void setResetInterval(long resetInterval) {
162 this.resetInterval = resetInterval;
163 }
164
165 /**
166 * Returns a window size
167 *
168 * @return window_size
169 */
170 public long getWindowSize() {
171 return windowSize;
172 }
173
174 /**
175 * Sets a window size
176 *
177 * @param window_size
178 */
179 protected void setWindowSize(long windowSize) {
180 this.windowSize = windowSize;
181 }
182
183 /**
184 * Returns windows per reset
185 *
186 * @return
187 */
188 public long getWindowsPerReset() {
189 return windowsPerReset;
190 }
191
192 /**
193 * Sets windows per reset
194 *
195 * @param windows_per_reset
196 */
197 protected void setWindowsPerReset(long windowsPerReset) {
198 this.windowsPerReset = windowsPerReset;
199 }
200
201 /**
202 * Returns unknown 18 bytes
203 *
204 * @return unknown_18
205 */
206 public long getUnknown_18() {
207 return unknown_18;
208 }
209
210 /**
211 * Sets unknown 18 bytes
212 *
213 * @param unknown_18
214 */
215 protected void setUnknown_18(long unknown_18) {
216 this.unknown_18 = unknown_18;
217 }
218
219 private long unmarshalUInt32(byte[] data, long dest) throws ChmParsingException {
220 assert (data != null && data.length > 0);
221 if (4 > getDataRemained())
222 throw new ChmParsingException("4 > dataLenght");
223 dest = data[this.getCurrentPlace()]
224 | data[this.getCurrentPlace() + 1] << 8
225 | data[this.getCurrentPlace() + 2] << 16
226 | data[this.getCurrentPlace() + 3] << 24;
227
228 setDataRemained(this.getDataRemained() - 4);
229 this.setCurrentPlace(this.getCurrentPlace() + 4);
230 return dest;
231 }
232
233 private void unmarshalCharArray(byte[] data,
234 ChmLzxcControlData chmLzxcControlData, int count) throws TikaException {
235 ChmAssert.assertByteArrayNotNull(data);
236 ChmAssert.assertChmAccessorNotNull(chmLzxcControlData);
237 ChmAssert.assertPositiveInt(count);
238 System.arraycopy(data, 4, chmLzxcControlData.getSignature(), 0, count);
239 this.setCurrentPlace(this.getCurrentPlace() + count);
240 this.setDataRemained(this.getDataRemained() - count);
241 }
242
243 /**
244 * Returns textual representation of ChmLzxcControlData
245 */
246 public String toString() {
247 StringBuilder sb = new StringBuilder();
248 sb.append("size(unknown):=" + this.getSize() + ", ");
249 sb.append("signature(Compression type identifier):="
250 + new String(this.getSignature()) + ", ");
251 sb.append("version(Possibly numeric code for LZX):="
252 + this.getVersion() + System.getProperty("line.separator"));
253 sb.append("resetInterval(The Huffman reset interval):="
254 + this.getResetInterval() + ", ");
255 sb.append("windowSize:=" + this.getWindowSize() + ", ");
256 sb.append("windowsPerReset(unknown (sometimes 2, sometimes 1, sometimes 0):="
257 + this.getWindowsPerReset() + ", ");
258 sb.append("unknown_18:=" + this.getUnknown_18()
259 + System.getProperty("line.separator"));
260 return sb.toString();
261 }
262
263 // @Override
264 public void parse(byte[] data, ChmLzxcControlData chmLzxcControlData) throws TikaException {
265 if (data == null || (data.length < ChmConstants.CHM_LZXC_MIN_LEN))
266 throw new ChmParsingException("we want at least 0x18 bytes");
267 chmLzxcControlData.setDataRemained(data.length);
268 chmLzxcControlData.setSize(unmarshalUInt32(data, chmLzxcControlData.getSize()));
269 chmLzxcControlData.unmarshalCharArray(data, chmLzxcControlData,
270 ChmConstants.CHM_SIGNATURE_LEN);
271 chmLzxcControlData.setVersion(unmarshalUInt32(data,
272 chmLzxcControlData.getVersion()));
273 chmLzxcControlData.setResetInterval(unmarshalUInt32(data,
274 chmLzxcControlData.getResetInterval()));
275 chmLzxcControlData.setWindowSize(unmarshalUInt32(data,
276 chmLzxcControlData.getWindowSize()));
277 chmLzxcControlData.setWindowsPerReset(unmarshalUInt32(data,
278 chmLzxcControlData.getWindowsPerReset()));
279
280 if (data.length >= ChmConstants.CHM_LZXC_V2_LEN)
281 chmLzxcControlData.setUnknown_18(unmarshalUInt32(data,
282 chmLzxcControlData.getUnknown_18()));
283 else
284 chmLzxcControlData.setUnknown_18(0);
285
286 if (chmLzxcControlData.getVersion() == 2) {
287 chmLzxcControlData.setWindowSize(getWindowSize()
288 * ChmConstants.CHM_WINDOW_SIZE_BLOCK);
289 }
290
291 if (chmLzxcControlData.getWindowSize() == 0
292 || chmLzxcControlData.getResetInterval() == 0)
293 throw new ChmParsingException(
294 "window size / resetInterval should be more than zero");
295
296 if (chmLzxcControlData.getWindowSize() == 1)
297 throw new ChmParsingException(
298 "window size / resetInterval should be more than 1");
299
300 /* checks a signature */
301 if (!new String(chmLzxcControlData.getSignature())
302 .equals(ChmConstants.LZXC))
303 throw new ChmParsingException(
304 "the signature does not seem to be correct");
305 }
306
307 /**
308 * @param args
309 */
310 public static void main(String[] args) {
311 }
312 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.chm.accessor;
17
18 import java.math.BigInteger;
19 import java.util.Arrays;
20
21 import org.apache.tika.exception.TikaException;
22 import org.apache.tika.parser.chm.assertion.ChmAssert;
23 import org.apache.tika.parser.chm.core.ChmConstants;
24 import org.apache.tika.parser.chm.exception.ChmParsingException;
25
26 /**
27 * LZXC reset table For ensuring a decompression. Reads the block named
28 * "::DataSpace/Storage/<SectionName>/Transform/{7FC28940-9D31-11D0-9B27-00A0C91E9C7C}/InstanceData/ResetTable"
29 * .
30 *
31 * {@link http
32 * ://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original
33 * /?page=2 }
34 *
35 */
36 public class ChmLzxcResetTable implements ChmAccessor<ChmLzxcResetTable> {
37 private static final long serialVersionUID = -8209574429411707460L;
38 /* class members */
39 private long version; // 0000: DWORD 2 unknown (possibly a version number)
40 private long block_count; // 0004: DWORD Number of entries in reset table
41 private long unknown; // 0008: DWORD 8 unknown
42 private long table_offset; // 000C: DWORD $28 Length of table header (area
43 // before table entries)
44 private long uncompressed_len; // 0010: QWORD Uncompressed Length
45 private long compressed_len; // 0018: QWORD Compressed Length
46 private long block_len; // 0020: QWORD 0x8000 block size for locations below
47 private long[] block_address;
48
49 /* local usage */
50 private int dataRemained;
51 private int currentPlace = 0;
52
53 private int getDataRemained() {
54 return dataRemained;
55 }
56
57 private void setDataRemained(int dataRemained) {
58 this.dataRemained = dataRemained;
59 }
60
61 /**
62 * Returns block addresses
63 *
64 * @return block addresses
65 */
66 public long[] getBlockAddress() {
67 return block_address;
68 }
69
70 /**
71 * Sets block addresses
72 *
73 * @param block_address
74 */
75 public void setBlockAddress(long[] block_address) {
76 this.block_address = block_address;
77 }
78
79 private int getCurrentPlace() {
80 return currentPlace;
81 }
82
83 private void setCurrentPlace(int currentPlace) {
84 this.currentPlace = currentPlace;
85 }
86
87 @Override
88 public String toString() {
89 StringBuilder sb = new StringBuilder();
90 sb.append("version:=" + getVersion()
91 + System.getProperty("line.separator"));
92 sb.append("block_count:=" + getBlockCount()
93 + System.getProperty("line.separator"));
94 sb.append("unknown:=" + getUnknown()
95 + System.getProperty("line.separator"));
96 sb.append("table_offset:=" + getTableOffset()
97 + System.getProperty("line.separator"));
98 sb.append("uncompressed_len:=" + getUncompressedLen()
99 + System.getProperty("line.separator"));
100 sb.append("compressed_len:=" + getCompressedLen()
101 + System.getProperty("line.separator"));
102 sb.append("block_len:=" + getBlockLen()
103 + System.getProperty("line.separator"));
104 sb.append("block_addresses:=" + Arrays.toString(getBlockAddress()));
105 return sb.toString();
106 }
107
108 /**
109 * Enumerates chm block addresses
110 *
111 * @param data
112 *
113 * @return byte[] of addresses
114 * @throws TikaException
115 */
116 private long[] enumerateBlockAddresses(byte[] data) throws TikaException {
117 ChmAssert.assertByteArrayNotNull(data);
118 /* we have limit of number of blocks to be extracted */
119 if (getBlockCount() > 5000)
120 setBlockCount(5000);
121
122 if (getBlockCount() < 0 && (getDataRemained() / 8) > 0)
123 setBlockCount(getDataRemained() / 8);
124
125 long[] addresses = new long[(int) getBlockCount()];
126 int rem = getDataRemained() / 8;
127 for (int i = 0; i < rem; i++) {
128 long num = -1;
129
130 try {
131 addresses[i] = unmarshalUint64(data, num);
132 } catch (Exception e) {
133 throw new TikaException(e.getMessage());
134 }
135 }
136 return addresses;
137 }
138
139 /**
140 * Validates parameters such as byte[] and chm lzxc reset table
141 *
142 * @param data
143 * @param chmLzxcResetTable
144 *
145 * @return boolean
146 * @throws TikaException
147 */
148 private boolean validateParamaters(byte[] data,
149 ChmLzxcResetTable chmLzxcResetTable) throws TikaException {
150 int goodParameter = 0;
151 ChmAssert.assertByteArrayNotNull(data);
152 ++goodParameter;
153 ChmAssert.assertChmAccessorNotNull(chmLzxcResetTable);
154 ++goodParameter;
155 return (goodParameter == 2);
156 }
157
158 private long unmarshalUInt32(byte[] data, long dest) throws TikaException {
159 ChmAssert.assertByteArrayNotNull(data);
160 dest = data[this.getCurrentPlace()]
161 | data[this.getCurrentPlace() + 1] << 8
162 | data[this.getCurrentPlace() + 2] << 16
163 | data[this.getCurrentPlace() + 3] << 24;
164
165 setDataRemained(this.getDataRemained() - 4);
166 this.setCurrentPlace(this.getCurrentPlace() + 4);
167 return dest;
168 }
169
170 private long unmarshalUint64(byte[] data, long dest) throws TikaException {
171 ChmAssert.assertByteArrayNotNull(data);
172 byte[] temp = new byte[8];
173 int i, j;// counters
174
175 for (i = 8, j = 7; i > 0; i--) {
176 if (data.length > this.getCurrentPlace()) {
177 temp[j--] = data[this.getCurrentPlace()];
178 this.setCurrentPlace(this.getCurrentPlace() + 1);
179 } else
180 throw new TikaException("data is too small to calculate address block");
181 }
182 dest = new BigInteger(temp).longValue();
183 this.setDataRemained(this.getDataRemained() - 8);
184 return dest;
185 }
186
187 /**
188 * Returns the version
189 *
190 * @return - long
191 */
192 public long getVersion() {
193 return version;
194 }
195
196 /**
197 * Sets the version
198 *
199 * @param version
200 * - long
201 */
202 public void setVersion(long version) {
203 this.version = version;
204 }
205
206 /**
207 * Gets a block count
208 *
209 * @return - int
210 */
211 public long getBlockCount() {
212 return block_count;
213 }
214
215 /**
216 * Sets a block count
217 *
218 * @param block_count
219 * - long
220 */
221 public void setBlockCount(long block_count) {
222 this.block_count = block_count;
223 }
224
225 /**
226 * Gets unknown
227 *
228 * @return - long
229 */
230 public long getUnknown() {
231 return unknown;
232 }
233
234 /**
235 * Sets an unknown
236 *
237 * @param unknown
238 * - long
239 */
240 public void setUnknown(long unknown) {
241 this.unknown = unknown;
242 }
243
244 /**
245 * Gets a table offset
246 *
247 * @return - long
248 */
249 public long getTableOffset() {
250 return table_offset;
251 }
252
253 /**
254 * Sets a table offset
255 *
256 * @param table_offset
257 * - long
258 */
259 public void setTableOffset(long table_offset) {
260 this.table_offset = table_offset;
261 }
262
263 /**
264 * Gets uncompressed length
265 *
266 * @return - {@link BigInteger }
267 */
268 public long getUncompressedLen() {
269 return uncompressed_len;
270 }
271
272 /**
273 * Sets uncompressed length
274 *
275 * @param uncompressed_len
276 * - {@link BigInteger}
277 */
278 public void setUncompressedLen(long uncompressed_len) {
279 this.uncompressed_len = uncompressed_len;
280 }
281
282 /**
283 * Gets compressed length
284 *
285 * @return - {@link BigInteger}
286 */
287 public long getCompressedLen() {
288 return compressed_len;
289 }
290
291 /**
292 * Sets compressed length
293 *
294 * @param compressed_len
295 * - {@link BigInteger}
296 */
297 public void setCompressedLen(long compressed_len) {
298 this.compressed_len = compressed_len;
299 }
300
301 /**
302 * Gets a block length
303 *
304 * @return - {@link BigInteger}
305 */
306 public long getBlockLen() {
307 return block_len;
308 }
309
310 /**
311 * Sets a block length
312 *
313 * @param block_len
314 * - {@link BigInteger}
315 */
316 public void setBlockLlen(long block_len) {
317 this.block_len = block_len;
318 }
319
320 /**
321 * @param args
322 */
323 public static void main(String[] args) {
324
325 }
326
327 // @Override
328 public void parse(byte[] data, ChmLzxcResetTable chmLzxcResetTable) throws TikaException {
329 setDataRemained(data.length);
330 if (validateParamaters(data, chmLzxcResetTable)) {
331 /* unmarshal fields */
332 chmLzxcResetTable.setVersion(unmarshalUInt32(data, chmLzxcResetTable.getVersion()));
333 chmLzxcResetTable.setBlockCount(unmarshalUInt32(data, chmLzxcResetTable.getBlockCount()));
334 chmLzxcResetTable.setUnknown(unmarshalUInt32(data, chmLzxcResetTable.getUnknown()));
335 chmLzxcResetTable.setTableOffset(unmarshalUInt32(data, chmLzxcResetTable.getTableOffset()));
336 chmLzxcResetTable.setUncompressedLen(unmarshalUint64(data, chmLzxcResetTable.getUncompressedLen()));
337 chmLzxcResetTable.setCompressedLen(unmarshalUint64(data, chmLzxcResetTable.getCompressedLen()));
338 chmLzxcResetTable.setBlockLlen(unmarshalUint64(data, chmLzxcResetTable.getBlockLen()));
339 chmLzxcResetTable.setBlockAddress(enumerateBlockAddresses(data));
340 }
341
342 /* checks chmLzxcResetTable */
343 if (chmLzxcResetTable.getVersion() != ChmConstants.CHM_VER_2)
344 throw new ChmParsingException(
345 "does not seem currect version of chmLzxcResetTable");
346 }
347 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.chm.accessor;
17
18 import java.util.Arrays;
19
20 import org.apache.tika.exception.TikaException;
21 import org.apache.tika.parser.chm.assertion.ChmAssert;
22 import org.apache.tika.parser.chm.core.ChmCommons;
23 import org.apache.tika.parser.chm.core.ChmConstants;
24 import org.apache.tika.parser.chm.exception.ChmParsingException;
25
26 /**
27 * Description Note: not always exists An index chunk has the following format:
28 * 0000: char[4] 'PMGI' 0004: DWORD Length of quickref/free area at end of
29 * directory chunk 0008: Directory index entries (to quickref/free area) The
30 * quickref area in an PMGI is the same as in an PMGL The format of a directory
31 * index entry is as follows: BYTE: length of name BYTEs: name (UTF-8 encoded)
32 * ENCINT: directory listing chunk which starts with name Encoded Integers aka
33 * ENCINT An ENCINT is a variable-length integer. The high bit of each byte
34 * indicates "continued to the next byte". Bytes are stored most significant to
35 * least significant. So, for example, $EA $15 is (((0xEA&0x7F)<<7)|0x15) =
36 * 0x3515.
37 *
38 * <p>
39 * Note: This class is not in use
40 *
41 * {@link http
42 * ://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original
43 * /?show-translation-form=1 }
44 *
45 *
46 */
47 public class ChmPmgiHeader implements ChmAccessor<ChmPmgiHeader> {
48 private static final long serialVersionUID = -2092282339894303701L;
49 private byte[] signature = new String(ChmConstants.CHM_PMGI_MARKER).getBytes(); /* 0 (PMGI) */
50 private long free_space; /* 4 */
51
52 /* local usage */
53 private int dataRemained;
54 private int currentPlace = 0;
55
56 private int getDataRemained() {
57 return dataRemained;
58 }
59
60 private void setDataRemained(int dataRemained) {
61 this.dataRemained = dataRemained;
62 }
63
64 private int getCurrentPlace() {
65 return currentPlace;
66 }
67
68 private void setCurrentPlace(int currentPlace) {
69 this.currentPlace = currentPlace;
70 }
71
72 private void unmarshalCharArray(byte[] data, ChmPmgiHeader chmPmgiHeader,
73 int count) throws ChmParsingException {
74 int index = -1;
75 ChmAssert.assertByteArrayNotNull(data);
76 ChmAssert.assertChmAccessorNotNull(chmPmgiHeader);
77 ChmAssert.assertPositiveInt(count);
78 this.setDataRemained(data.length);
79 index = ChmCommons.indexOf(data,
80 ChmConstants.CHM_PMGI_MARKER.getBytes());
81 if (index >= 0)
82 System.arraycopy(data, index, chmPmgiHeader.getSignature(), 0, count);
83 else{
84 //Some chm documents (actually most of them) do not contain
85 //PMGI header, in this case, we just notice about it.
86 }
87 this.setCurrentPlace(this.getCurrentPlace() + count);
88 this.setDataRemained(this.getDataRemained() - count);
89 }
90
91 private long unmarshalUInt32(byte[] data, long dest) throws ChmParsingException {
92 ChmAssert.assertByteArrayNotNull(data);
93
94 if (4 > getDataRemained())
95 throw new ChmParsingException("4 > dataLenght");
96 dest = data[this.getCurrentPlace()]
97 | data[this.getCurrentPlace() + 1] << 8
98 | data[this.getCurrentPlace() + 2] << 16
99 | data[this.getCurrentPlace() + 3] << 24;
100
101 setDataRemained(this.getDataRemained() - 4);
102 this.setCurrentPlace(this.getCurrentPlace() + 4);
103 return dest;
104 }
105
106 /**
107 * Returns pmgi signature if exists
108 *
109 * @return signature
110 */
111 public byte[] getSignature() {
112 return signature;
113 }
114
115 /**
116 * Sets pmgi signature
117 *
118 * @param signature
119 */
120 protected void setSignature(byte[] signature) {
121 this.signature = signature;
122 }
123
124 /**
125 * Returns pmgi free space
126 *
127 * @return free_space
128 */
129 public long getFreeSpace() {
130 return free_space;
131 }
132
133 /**
134 * Sets pmgi free space
135 *
136 * @param free_space
137 */
138 protected void setFreeSpace(long free_space) {
139 this.free_space = free_space;
140 }
141
142 /**
143 * Returns textual representation of the pmgi header
144 */
145 public String toString() {
146 StringBuilder sb = new StringBuilder();
147 sb.append("signature:=" + new String(getSignature()) + ", ");
148 sb.append("free space:=" + getFreeSpace()
149 + System.getProperty("line.separator"));
150 return sb.toString();
151 }
152
153 // @Override
154 public void parse(byte[] data, ChmPmgiHeader chmPmgiHeader) throws TikaException {
155 /* we only know how to deal with a 0x8 byte structures */
156 if (data.length < ChmConstants.CHM_PMGI_LEN)
157 throw new TikaException("we only know how to deal with a 0x8 byte structures");
158
159 /* unmarshal fields */
160 chmPmgiHeader.unmarshalCharArray(data, chmPmgiHeader, ChmConstants.CHM_SIGNATURE_LEN);
161 chmPmgiHeader.setFreeSpace(chmPmgiHeader.unmarshalUInt32(data, chmPmgiHeader.getFreeSpace()));
162
163 /* check structure */
164 if (!Arrays.equals(chmPmgiHeader.getSignature(),
165 ChmConstants.CHM_PMGI_MARKER.getBytes()))
166 throw new TikaException(
167 "it does not seem to be valid a PMGI signature, check ChmItsp index_root if it was -1, means no PMGI, use PMGL insted");
168
169 }
170
171 /**
172 * @param args
173 */
174 public static void main(String[] args) {
175
176 }
177 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.chm.accessor;
17
18 import org.apache.tika.exception.TikaException;
19 import org.apache.tika.parser.chm.assertion.ChmAssert;
20 import org.apache.tika.parser.chm.core.ChmConstants;
21 import org.apache.tika.parser.chm.exception.ChmParsingException;
22
23 /**
24 * Description There are two types of directory chunks -- index chunks, and
25 * listing chunks. The index chunk will be omitted if there is only one listing
26 * chunk. A listing chunk has the following format: 0000: char[4] 'PMGL' 0004:
27 * DWORD Length of free space and/or quickref area at end of directory chunk
28 * 0008: DWORD Always 0 000C: DWORD Chunk number of previous listing chunk when
29 * reading directory in sequence (-1 if this is the first listing chunk) 0010:
30 * DWORD Chunk number of next listing chunk when reading directory in sequence
31 * (-1 if this is the last listing chunk) 0014: Directory listing entries (to
32 * quickref area) Sorted by filename; the sort is case-insensitive The quickref
33 * area is written backwards from the end of the chunk. One quickref entry
34 * exists for every n entries in the file, where n is calculated as 1 + (1 <<
35 * quickref density). So for density = 2, n = 5 Chunklen-0002: WORD Number of
36 * entries in the chunk Chunklen-0004: WORD Offset of entry n from entry 0
37 * Chunklen-0008: WORD Offset of entry 2n from entry 0 Chunklen-000C: WORD
38 * Offset of entry 3n from entry 0 ... The format of a directory listing entry
39 * is as follows BYTE: length of name BYTEs: name (UTF-8 encoded) ENCINT:
40 * content section ENCINT: offset ENCINT: length The offset is from the
41 * beginning of the content section the file is in, after the section has been
42 * decompressed (if appropriate). The length also refers to length of the file
43 * in the section after decompression. There are two kinds of file represented
44 * in the directory: user data and format related files. The files which are
45 * format-related have names which begin with '::', the user data files have
46 * names which begin with "/".
47 *
48 * {@link http
49 * ://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original
50 * /?show-translation-form=1 }
51 *
52 * @author olegt
53 *
54 */
55 public class ChmPmglHeader implements ChmAccessor<ChmPmglHeader> {
56 private static final long serialVersionUID = -6139486487475923593L;
57 private byte[] signature = new String(ChmConstants.PMGL).getBytes(); /*
58 * 0
59 * (PMGL
60 * )
61 */
62 private long free_space; /* 4 */
63 private long unknown_0008; /* 8 */
64 private int block_prev; /* c */
65 private int block_next; /* 10 */
66
67 /* local usage */
68 private int dataRemained;
69 private int currentPlace = 0;
70
71 private int getDataRemained() {
72 return dataRemained;
73 }
74
75 private void setDataRemained(int dataRemained) {
76 this.dataRemained = dataRemained;
77 }
78
79 private int getCurrentPlace() {
80 return currentPlace;
81 }
82
83 private void setCurrentPlace(int currentPlace) {
84 this.currentPlace = currentPlace;
85 }
86
87 public long getFreeSpace() {
88 return free_space;
89 }
90
91 public void setFreeSpace(long free_space) {
92 this.free_space = free_space;
93 }
94
95 public String toString() {
96 StringBuilder sb = new StringBuilder();
97 sb.append("signatute:=" + new String(getSignature()) + ", ");
98 sb.append("free space:=" + getFreeSpace() + ", ");
99 sb.append("unknown0008:=" + getUnknown0008() + ", ");
100 sb.append("prev block:=" + getBlockPrev() + ", ");
101 sb.append("next block:=" + getBlockNext()
102 + System.getProperty("line.separator"));
103 return sb.toString();
104 }
105
106 protected void unmarshalCharArray(byte[] data, ChmPmglHeader chmPmglHeader,
107 int count) throws TikaException {
108 ChmAssert.assertByteArrayNotNull(data);
109 this.setDataRemained(data.length);
110 System.arraycopy(data, 0, chmPmglHeader.signature, 0, count);
111 this.setCurrentPlace(this.getCurrentPlace() + count);
112 this.setDataRemained(this.getDataRemained() - count);
113 }
114
115 private int unmarshalInt32(byte[] data, int dest) throws TikaException {
116 ChmAssert.assertByteArrayNotNull(data);
117 if (4 > this.getDataRemained())
118 throw new TikaException("4 > dataLenght");
119 dest = data[this.getCurrentPlace()]
120 | data[this.getCurrentPlace() + 1] << 8
121 | data[this.getCurrentPlace() + 2] << 16
122 | data[this.getCurrentPlace() + 3] << 24;
123
124 this.setCurrentPlace(this.getCurrentPlace() + 4);
125 this.setDataRemained(this.getDataRemained() - 4);
126 return dest;
127 }
128
129 private long unmarshalUInt32(byte[] data, long dest) throws ChmParsingException {
130 ChmAssert.assertByteArrayNotNull(data);
131 if (4 > getDataRemained())
132 throw new ChmParsingException("4 > dataLenght");
133 dest = data[this.getCurrentPlace()]
134 | data[this.getCurrentPlace() + 1] << 8
135 | data[this.getCurrentPlace() + 2] << 16
136 | data[this.getCurrentPlace() + 3] << 24;
137
138 setDataRemained(this.getDataRemained() - 4);
139 this.setCurrentPlace(this.getCurrentPlace() + 4);
140 return dest;
141 }
142
143 // @Override
144 public void parse(byte[] data, ChmPmglHeader chmPmglHeader) throws TikaException {
145 if (data.length < ChmConstants.CHM_PMGL_LEN)
146 throw new TikaException(ChmPmglHeader.class.getName()
147 + " we only know how to deal with a 0x14 byte structures");
148
149 /* unmarshal fields */
150 chmPmglHeader.unmarshalCharArray(data, chmPmglHeader,
151 ChmConstants.CHM_SIGNATURE_LEN);
152 chmPmglHeader.setFreeSpace(chmPmglHeader.unmarshalUInt32(data,
153 chmPmglHeader.getFreeSpace()));
154 chmPmglHeader.setUnknown0008(chmPmglHeader.unmarshalUInt32(data,
155 chmPmglHeader.getUnknown0008()));
156 chmPmglHeader.setBlockPrev(chmPmglHeader.unmarshalInt32(data,
157 chmPmglHeader.getBlockPrev()));
158 chmPmglHeader.setBlockNext(chmPmglHeader.unmarshalInt32(data,
159 chmPmglHeader.getBlockNext()));
160
161 /* check structure */
162 if (!new String(chmPmglHeader.getSignature()).equals(ChmConstants.PMGL))
163 throw new ChmParsingException(ChmPmglHeader.class.getName()
164 + " pmgl != pmgl.signature");
165
166 }
167
168 public byte[] getSignature() {
169 return signature;
170 }
171
172 protected void setSignature(byte[] signature) {
173 this.signature = signature;
174 }
175
176 public long getUnknown0008() {
177 return unknown_0008;
178 }
179
180 protected void setUnknown0008(long unknown_0008) {
181 this.unknown_0008 = unknown_0008;
182 }
183
184 public int getBlockPrev() {
185 return block_prev;
186 }
187
188 protected void setBlockPrev(int block_prev) {
189 this.block_prev = block_prev;
190 }
191
192 public int getBlockNext() {
193 return block_next;
194 }
195
196 protected void setBlockNext(int block_next) {
197 this.block_next = block_next;
198 }
199
200 /**
201 * @param args
202 */
203 public static void main(String[] args) {
204
205 }
206 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.chm.accessor;
17
18 import org.apache.tika.exception.TikaException;
19 import org.apache.tika.parser.chm.assertion.ChmAssert;
20 import org.apache.tika.parser.chm.core.ChmCommons;
21
22 /**
23 * The format of a directory listing entry is as follows: BYTE: length of name
24 * BYTEs: name (UTF-8 encoded) ENCINT: content section ENCINT: offset ENCINT:
25 * length The offset is from the beginning of the content section the file is
26 * in, after the section has been decompressed (if appropriate). The length also
27 * refers to length of the file in the section after decompression. There are
28 * two kinds of file represented in the directory: user data and format related
29 * files. The files which are format-related have names which begin with '::',
30 * the user data files have names which begin with "/".
31 *
32 */
33 public class DirectoryListingEntry {
34 /* Length of the entry name */
35 private int name_length;
36 /* Entry name or directory name */
37 private String name;
38 /* Entry type */
39 private ChmCommons.EntryType entryType;
40 /* Entry offset */
41 private int offset;
42 /* Entry size */
43 private int length;
44
45 public DirectoryListingEntry() {
46
47 }
48
49 /**
50 * Constructs directoryListingEntry
51 *
52 * @param name_length
53 * int
54 * @param name
55 * String
56 * @param isCompressed
57 * ChmCommons.EntryType
58 * @param offset
59 * int
60 * @param length
61 * int
62 * @throws TikaException
63 */
64 public DirectoryListingEntry(int name_length, String name,
65 ChmCommons.EntryType isCompressed, int offset, int length) throws TikaException {
66 ChmAssert.assertDirectoryListingEntry(name_length, name, isCompressed, offset, length);
67 setNameLength(name_length);
68 setName(name);
69 setEntryType(isCompressed);
70 setOffset(offset);
71 setLength(length);
72 }
73
74 public String toString() {
75 StringBuilder sb = new StringBuilder();
76 sb.append("name_length:=" + getNameLength() + System.getProperty("line.separator"));
77 sb.append("name:=" + getName() + System.getProperty("line.separator"));
78 sb.append("entryType:=" + getEntryType() + System.getProperty("line.separator"));
79 sb.append("offset:=" + getOffset() + System.getProperty("line.separator"));
80 sb.append("length:=" + getLength());
81 return sb.toString();
82 }
83
84 /**
85 * Returns an entry name length
86 *
87 * @return int
88 */
89 public int getNameLength() {
90 return name_length;
91 }
92
93 /**
94 * Sets an entry name length
95 *
96 * @param name_length
97 * int
98 */
99 protected void setNameLength(int name_length) {
100 this.name_length = name_length;
101 }
102
103 /**
104 * Returns an entry name
105 *
106 * @return String
107 */
108 public String getName() {
109 return name;
110 }
111
112 /**
113 * Sets entry name
114 *
115 * @param name
116 * String
117 */
118 protected void setName(String name) {
119 this.name = name;
120 }
121
122 /**
123 * Returns ChmCommons.EntryType (COMPRESSED or UNCOMPRESSED)
124 *
125 * @return ChmCommons.EntryType
126 */
127 public ChmCommons.EntryType getEntryType() {
128 return entryType;
129 }
130
131 protected void setEntryType(ChmCommons.EntryType entryType) {
132 this.entryType = entryType;
133 }
134
135 public int getOffset() {
136 return offset;
137 }
138
139 protected void setOffset(int offset) {
140 this.offset = offset;
141 }
142
143 public int getLength() {
144 return length;
145 }
146
147 protected void setLength(int length) {
148 this.length = length;
149 }
150
151 public static void main(String[] args) {
152 }
153 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.chm.assertion;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20
21 import org.apache.tika.exception.TikaException;
22 import org.apache.tika.parser.chm.accessor.ChmAccessor;
23 import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
24 import org.apache.tika.parser.chm.core.ChmCommons;
25 import org.apache.tika.parser.chm.exception.ChmParsingException;
26
27 /**
28 * Contains chm extractor assertions
29 */
30 public class ChmAssert {
31 /**
32 * Checks a validity of the chmBlockSegment parameters
33 *
34 * @param data
35 * byte[]
36 * @param resetTable
37 * ChmLzxcResetTable
38 * @param blockNumber
39 * int
40 * @param lzxcBlockOffset
41 * int
42 * @param lzxcBlockLength
43 * int
44 * @throws TikaException
45 */
46 public static final void assertChmBlockSegment(byte[] data,
47 ChmLzxcResetTable resetTable, int blockNumber, int lzxcBlockOffset,
48 int lzxcBlockLength) throws TikaException {
49 if ((data == null))
50 throw new TikaException("data[] is null");
51
52 if ((data.length <= 0))
53 throw new TikaException("data[] length should be greater than zero");
54
55 if (resetTable == null)
56 throw new TikaException("resetTable is null");
57
58 if (resetTable.getBlockAddress().length <= 1)
59 throw new TikaException("resetTable.getBlockAddress().length should be greater than zero");
60
61 if (blockNumber < 0)
62 throw new TikaException("blockNumber should be positive number");
63
64 if (lzxcBlockOffset < 0)
65 throw new TikaException("lzxcBlockOffset should be positive number");
66
67 if (lzxcBlockLength < 0)
68 throw new TikaException("lzxcBlockLength should be positive number");
69 }
70
71 /**
72 * Checks if InputStream is not null
73 *
74 * @param is
75 * InputStream
76 * @throws ChmParsingException
77 * @throws IOException
78 */
79 public static final void assertInputStreamNotNull(InputStream is) throws IOException {
80 if (is == null)
81 throw new IOException("input sream is null");
82 }
83
84 /**
85 * Checks validity of ChmAccessor parameters
86 *
87 * @param data
88 * @param chmItsfHeader
89 * @param count
90 * @throws ChmParsingException
91 */
92 public static final void assertChmAccessorParameters(byte[] data,
93 ChmAccessor<?> chmAccessor, int count) throws ChmParsingException {
94 assertByteArrayNotNull(data);
95 assertChmAccessorNotNull(chmAccessor);
96 }
97
98 /**
99 * Checks if byte[] is not null
100 *
101 * @param data
102 * @throws ChmParsingException
103 */
104 public static final void assertByteArrayNotNull(byte[] data) throws ChmParsingException {
105 if (data == null)
106 throw new ChmParsingException("byte[] data is null");
107 }
108
109 /**
110 * Checks if ChmAccessor is not null In case of null throws exception
111 *
112 * @param ChmAccessor
113 * @throws ChmParsingException
114 */
115 public static final void assertChmAccessorNotNull(ChmAccessor<?> chmAccessor) throws ChmParsingException {
116 if (chmAccessor == null)
117 throw new ChmParsingException("chm header is null");
118 }
119
120 /**
121 * Checks validity of the DirectoryListingEntry's parameters In case of
122 * invalid parameter(s) throws an exception
123 *
124 * @param name_length
125 * length of the chm entry name
126 * @param name
127 * chm entry name
128 * @param entryType
129 * EntryType
130 * @param offset
131 * @param length
132 * @throws ChmParsingException
133 */
134 public static final void assertDirectoryListingEntry(int name_length,
135 String name, ChmCommons.EntryType entryType, int offset, int length) throws ChmParsingException {
136 if (name_length < 0)
137 throw new ChmParsingException("invalid name length");
138 if (name == null)
139 throw new ChmParsingException("invalid name");
140
141 if ((entryType != ChmCommons.EntryType.COMPRESSED)
142 && (entryType != ChmCommons.EntryType.UNCOMPRESSED))
143 throw new ChmParsingException("invalid compressed type, should be EntryType.COMPRESSED | EntryType.UNCOMPRESSED");
144
145 if (offset < 0)
146 throw new ChmParsingException("invalid offset");
147
148 if (length < 0)
149 throw new ChmParsingException("invalid length");
150 }
151
152 public static void assertCopyingDataIndex(int index, int dataLength) throws ChmParsingException {
153 if (index >= dataLength)
154 throw new ChmParsingException("cannot parse chm file index > data.length");
155 }
156
157 /**
158 * Checks if int param is greater than zero In case param <=0 throws an
159 * exception
160 *
161 * @param param
162 * @throws ChmParsingException
163 */
164 public static void assertPositiveInt(int param) throws ChmParsingException {
165 if (param <= 0)
166 throw new ChmParsingException("resetTable.getBlockAddress().length should be greater than zero");
167 }
168 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.chm.core;
17
18 import java.io.FileNotFoundException;
19 import java.io.FileOutputStream;
20 import java.io.IOException;
21 import java.util.Iterator;
22 import java.util.List;
23
24 import org.apache.tika.exception.TikaException;
25 import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
26 import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
27 import org.apache.tika.parser.chm.assertion.ChmAssert;
28 import org.apache.tika.parser.chm.exception.ChmParsingException;
29
30 public class ChmCommons {
31 /* Prevents initialization */
32 private ChmCommons() {
33 }
34
35 public static void assertByteArrayNotNull(byte[] data) throws TikaException {
36 if (data == null)
37 throw new TikaException("byte[] is null");
38 }
39
40 /**
41 * Represents entry types: uncompressed, compressed
42 */
43 public enum EntryType {
44 UNCOMPRESSED, COMPRESSED
45 }
46
47 /**
48 * Represents lzx states: started decoding, not started decoding
49 */
50 public enum LzxState {
51 STARTED_DECODING, NOT_STARTED_DECODING
52 }
53
54 /**
55 * Represents intel file states during decompression
56 */
57 public enum IntelState {
58 STARTED, NOT_STARTED
59 }
60
61 /**
62 * Represents lzx block types in order to decompress differently
63 */
64 public final static int UNDEFINED = 0;
65 public final static int VERBATIM = 1;
66 public final static int ALIGNED_OFFSET = 2;
67 public final static int UNCOMPRESSED = 3;
68
69 /**
70 * LZX supports window sizes of 2^15 (32Kb) through 2^21 (2Mb) Returns X,
71 * i.e 2^X
72 *
73 * @param window
74 * chmLzxControlData.getWindowSize()
75 *
76 * @return window size
77 */
78 public static int getWindowSize(int window) {
79 int win = 0;
80 while (window > 1) {
81 window >>>= 1;
82 win++;
83 }
84 return win;
85 }
86
87 public static byte[] getChmBlockSegment(byte[] data,
88 ChmLzxcResetTable resetTable, int blockNumber, int lzxcBlockOffset,
89 int lzxcBlockLength) throws TikaException {
90 ChmAssert.assertChmBlockSegment(data, resetTable, blockNumber,
91 lzxcBlockOffset, lzxcBlockLength);
92 int blockLength = -1;
93 // TODO add int_max_value checking
94 if (blockNumber < (resetTable.getBlockAddress().length - 1)) {
95 blockLength = (int) (resetTable.getBlockAddress()[blockNumber + 1] - resetTable
96 .getBlockAddress()[blockNumber]);
97 } else {
98 /* new code */
99 if (blockNumber >= resetTable.getBlockAddress().length)
100 blockLength = 0;
101 else
102 /* end new code */
103 blockLength = (int) (lzxcBlockLength - resetTable
104 .getBlockAddress()[blockNumber]);
105 }
106 byte[] t = ChmCommons
107 .copyOfRange(
108 data,
109 (int) (lzxcBlockOffset + resetTable.getBlockAddress()[blockNumber]),
110 (int) (lzxcBlockOffset
111 + resetTable.getBlockAddress()[blockNumber] + blockLength));
112 return (t != null) ? t : new byte[1];
113 }
114
115 /**
116 * Returns textual representation of LangID
117 *
118 * @param langID
119 *
120 * @return language name
121 */
122 public static String getLanguage(long langID) {
123 /* Potential problem with casting */
124 switch ((int) langID) {
125 case 1025:
126 return "Arabic";
127 case 1069:
128 return "Basque";
129 case 1027:
130 return "Catalan";
131 case 2052:
132 return "Chinese (Simplified)";
133 case 1028:
134 return "Chinese (Traditional)";
135 case 1029:
136 return "Czech";
137 case 1030:
138 return "Danish";
139 case 1043:
140 return "Dutch";
141 case 1033:
142 return "English (United States)";
143 case 1035:
144 return "Finnish";
145 case 1036:
146 return "French";
147 case 1031:
148 return "German";
149 case 1032:
150 return "Greek";
151 case 1037:
152 return "Hebrew";
153 case 1038:
154 return "Hungarian";
155 case 1040:
156 return "Italian";
157 case 1041:
158 return "Japanese";
159 case 1042:
160 return "Korean";
161 case 1044:
162 return "Norwegian";
163 case 1045:
164 return "Polish";
165 case 2070:
166 return "Portuguese";
167 case 1046:
168 return "Portuguese (Brazil)";
169 case 1049:
170 return "Russian";
171 case 1051:
172 return "Slovakian";
173 case 1060:
174 return "Slovenian";
175 case 3082:
176 return "Spanish";
177 case 1053:
178 return "Swedish";
179 case 1055:
180 return "Turkish";
181 default:
182 return "unknown - http://msdn.microsoft.com/en-us/library/bb165625%28VS.80%29.aspx";
183 }
184 }
185
186 /**
187 * Checks skippable patterns
188 *
189 * @param directoryListingEntry
190 *
191 * @return boolean
192 */
193 public static boolean hasSkip(DirectoryListingEntry directoryListingEntry) {
194 return (directoryListingEntry.getName().startsWith("/$")
195 || directoryListingEntry.getName().startsWith("/#") || directoryListingEntry
196 .getName().startsWith("::")) ? true : false;
197 }
198
199 /**
200 * Writes byte[][] to the file
201 *
202 * @param buffer
203 * @param fileToBeSaved
204 * file name
205 * @throws TikaException
206 */
207 public static void writeFile(byte[][] buffer, String fileToBeSaved) throws TikaException {
208 FileOutputStream output = null;
209 if (buffer != null && fileToBeSaved != null
210 && !ChmCommons.isEmpty(fileToBeSaved)) {
211 try {
212 output = new FileOutputStream(fileToBeSaved);
213 if (output != null)
214 for (int i = 0; i < buffer.length; i++) {
215 output.write(buffer[i]);
216 }
217 } catch (FileNotFoundException e) {
218 throw new TikaException(e.getMessage());
219 } catch (IOException e) {
220 e.printStackTrace();
221 } finally {
222 if (output != null)
223 try {
224 output.flush();
225 output.close();
226 } catch (IOException e) {
227 e.printStackTrace();
228 }
229 }
230 }
231 }
232
233 /**
234 * Reverses the order of given array
235 *
236 * @param array
237 */
238 public static void reverse(byte[] array) {
239 if (array == null) {
240 return;
241 }
242 int i = 0;
243 int j = array.length - 1;
244 byte tmp;
245 while (j > i) {
246 tmp = array[j];
247 array[j] = array[i];
248 array[i] = tmp;
249 j--;
250 i++;
251 }
252 }
253
254 /**
255 * Returns an index of the reset table
256 *
257 * @param text
258 * @param pattern
259 * @return index of the reset table
260 * @throws ChmParsingException
261 */
262 public static final int indexOfResetTableBlock(byte[] text, byte[] pattern) throws ChmParsingException {
263 return (indexOf(text, pattern)) - 4;
264 }
265
266 /**
267 * Searches some pattern in byte[]
268 *
269 * @param text
270 * byte[]
271 * @param pattern
272 * byte[]
273 * @return an index, if nothing found returns -1
274 * @throws ChmParsingException
275 */
276 public static int indexOf(byte[] text, byte[] pattern) throws ChmParsingException {
277 int[] next = null;
278 int i = 0, j = -1;
279
280 /* Preprocessing */
281 if (pattern != null && text != null) {
282 next = new int[pattern.length];
283 next[0] = -1;
284 } else
285 throw new ChmParsingException("pattern and/or text should not be null");
286
287 /* Computes a failure function */
288 while (i < pattern.length - 1) {
289 if (j == -1 || pattern[i] == pattern[j]) {
290 i++;
291 j++;
292 if (pattern[i] != pattern[j])
293 next[i] = j;
294 else
295 next[i] = next[j];
296 } else
297 j = next[j];
298 }
299
300 /* Reinitializes local variables */
301 i = j = 0;
302
303 /* Matching */
304 while (i < text.length && j < pattern.length) {
305 if (j == -1 || pattern[j] == text[i]) {
306 i++;
307 j++;
308 } else
309 j = next[j];
310 }
311 if (j == pattern.length)
312 return (i - j); // match found at offset i - M
313 else
314 return -1; // not found
315 }
316
317 /**
318 * Searches for some pattern in the directory listing entry list
319 *
320 * @param list
321 * @param pattern
322 * @return an index, if nothing found returns -1
323 */
324 public static int indexOf(List<DirectoryListingEntry> list, String pattern) {
325 int place = 0;
326 for (Iterator<DirectoryListingEntry> iterator = list.iterator(); iterator.hasNext();) {
327 DirectoryListingEntry directoryListingEntry = iterator.next();
328 if (directoryListingEntry.toString().contains(pattern)) {
329 return place;
330 } else
331 ++place;
332 }
333 return -1;// not found
334 }
335
336 /*
337 * This method is added because of supporting of Java 5
338 */
339 public static byte[] copyOfRange(byte[] original, int from, int to) {
340 checkCopyOfRangeParams(original, from, to);
341 int newLength = to - from;
342 if (newLength < 0)
343 throw new IllegalArgumentException(from + " > " + to);
344 byte[] copy = new byte[newLength];
345 System.arraycopy(original, from, copy, 0, Math.min(original.length - from, newLength));
346 return copy;
347 }
348
349 private static void checkCopyOfRangeParams(byte[] original, int from, int to) {
350 if (original == null)
351 throw new NullPointerException("array is null");
352 if (from < 0)
353 throw new IllegalArgumentException(from + " should be > 0");
354 if (to < 0)
355 throw new IllegalArgumentException(to + " should be > 0");
356 }
357
358 /*
359 * This method is added because of supporting of Java 5
360 */
361 public static boolean isEmpty(String str) {
362 return str == null || str.length() == 0;
363 }
364
365 /**
366 * @param args
367 */
368 public static void main(String[] args) {
369 }
370
371 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.chm.core;
17
/**
 * Constants shared by the chm parser: format signatures, header lengths and
 * the LZX decoder parameters and lookup tables.
 */
public final class ChmConstants {
    /* Prevents instantiation */
    private ChmConstants() {
    }

    public static final String DEFAULT_CHARSET = "UTF-8";

    /* Four-character signatures of the chm structures */
    public static final String ITSF = "ITSF";
    public static final String ITSP = "ITSP";
    public static final String PMGL = "PMGL";
    public static final String LZXC = "LZXC";
    public static final String CHM_PMGI_MARKER = "PMGI";

    /* Name keeps its historical spelling because callers reference it */
    public static final int BYTE_ARRAY_LENGHT = 16;

    /* Structure lengths in bytes */
    public static final int CHM_ITSF_V2_LEN = 0x58;
    public static final int CHM_ITSF_V3_LEN = 0x60;
    public static final int CHM_ITSP_V1_LEN = 0x54;
    public static final int CHM_PMGL_LEN = 0x14;
    public static final int CHM_PMGI_LEN = 0x08;
    public static final int CHM_LZXC_RESETTABLE_V1_LEN = 0x28;
    public static final int CHM_LZXC_MIN_LEN = 0x18;
    public static final int CHM_LZXC_V2_LEN = 0x1c;
    public static final int CHM_SIGNATURE_LEN = 4;

    public static final int CHM_VER_2 = 2;
    public static final int CHM_VER_3 = 3;
    public static final int CHM_VER_1 = 1;
    public static final int CHM_WINDOW_SIZE_BLOCK = 0x8000;

    /* my hacking */
    public static final int START_PMGL = 0xCC;
    public static final String CONTROL_DATA = "ControlData";
    public static final String RESET_TABLE = "ResetTable";
    public static final String CONTENT = "Content";

    /* some constants defined by the LZX specification */
    public static final int LZX_MIN_MATCH = 2;
    public static final int LZX_MAX_MATCH = 257;
    public static final int LZX_NUM_CHARS = 256;
    /* also blocktypes 4-7 invalid */
    public static final int LZX_BLOCKTYPE_INVALID = 0;
    public static final int LZX_BLOCKTYPE_VERBATIM = 1;
    public static final int LZX_BLOCKTYPE_ALIGNED = 2;
    public static final int LZX_BLOCKTYPE_UNCOMPRESSED = 3;
    public static final int LZX_PRETREE_NUM_ELEMENTS_BITS = 4; /* ??? */
    public static final int LZX_PRETREE_NUM_ELEMENTS = 20;
    /* aligned offset tree #elements */
    public static final int LZX_ALIGNED_NUM_ELEMENTS = 8;
    /* this one missing from spec! */
    public static final int LZX_NUM_PRIMARY_LENGTHS = 7;
    /* length tree #elements */
    public static final int LZX_NUM_SECONDARY_LENGTHS = 249;

    /* LZX huffman defines: tweak tablebits as desired */
    public static final int LZX_PRETREE_MAXSYMBOLS = LZX_PRETREE_NUM_ELEMENTS;
    public static final int LZX_PRETREE_TABLEBITS = 6;
    public static final int LZX_MAINTREE_MAXSYMBOLS = LZX_NUM_CHARS + 50 * 8;
    public static final int LZX_MAIN_MAXSYMBOLS = LZX_NUM_CHARS * 2;
    public static final int LZX_MAINTREE_TABLEBITS = 12;
    public static final int LZX_LENGTH_MAXSYMBOLS = LZX_NUM_SECONDARY_LENGTHS + 1;
    public static final int LZX_LENGTH_TABLEBITS = 12;
    public static final int LZX_ALIGNED_MAXSYMBOLS = LZX_ALIGNED_NUM_ELEMENTS;
    public static final int LZX_ALIGNED_TABLEBITS = 7;
    public static final int LZX_LENTABLE_SAFETY = 64;

    /*
     * Lookup tables. The references are final so the tables cannot be
     * swapped out; the array contents are still mutable and must be treated
     * as read-only by callers.
     */
    public static final short[] EXTRA_BITS = { 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5,
            5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14,
            15, 15, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
            17, 17 };

    public static final int[] POSITION_BASE = { 0, 1, 2, 3, 4, 6, 8, 12, 16, 24, 32,
            48, 64, 96, 128, 192, 256, 384, 512, 768, 1024, 1536, 2048, 3072,
            4096, 6144, 8192, 12288, 16384, 24576, 32768, 49152, 65536, 98304,
            131072, 196608, 262144, 393216, 524288, 655360, 786432, 917504,
            1048576, 1179648, 1310720, 1441792, 1572864, 1703936, 1835008,
            1966080, 2097152 };
}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.chm.core;
17
18 import java.io.ByteArrayOutputStream;
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.util.ArrayList;
22 import java.util.Iterator;
23 import java.util.List;
24
25 import org.apache.tika.exception.TikaException;
26 import org.apache.tika.io.IOUtils;
27 import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
28 import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
29 import org.apache.tika.parser.chm.accessor.ChmItspHeader;
30 import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
31 import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
32 import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
33 import org.apache.tika.parser.chm.assertion.ChmAssert;
34 import org.apache.tika.parser.chm.core.ChmCommons.EntryType;
35 import org.apache.tika.parser.chm.lzx.ChmBlockInfo;
36 import org.apache.tika.parser.chm.lzx.ChmLzxBlock;
37
38 /**
39 * Extracts text from chm file. Enumerates chm entries.
40 */
41 public class ChmExtractor {
42 private List<ChmLzxBlock> lzxBlocksCache = null;
43 private ChmDirectoryListingSet chmDirList = null;
44 private ChmItsfHeader chmItsfHeader = null;
45 private ChmItspHeader chmItspHeader = null;
46 private ChmLzxcResetTable chmLzxcResetTable = null;
47 private ChmLzxcControlData chmLzxcControlData = null;
48 private byte[] data = null;
49 private int indexOfContent;
50 private long lzxBlockOffset;
51 private long lzxBlockLength;
52
53 /**
54 * Returns lzxc control data.
55 *
56 * @return ChmLzxcControlData
57 */
58 private ChmLzxcControlData getChmLzxcControlData() {
59 return chmLzxcControlData;
60 }
61
62 /**
63 * Sets lzxc control data
64 *
65 * @param chmLzxcControlData
66 */
67 private void setChmLzxcControlData(ChmLzxcControlData chmLzxcControlData) {
68 this.chmLzxcControlData = chmLzxcControlData;
69 }
70
71 private ChmItspHeader getChmItspHeader() {
72 return chmItspHeader;
73 }
74
75 private void setChmItspHeader(ChmItspHeader chmItspHeader) {
76 this.chmItspHeader = chmItspHeader;
77 }
78
79 /**
80 * Returns lzxc reset table
81 *
82 * @return ChmLzxcResetTable
83 */
84 private ChmLzxcResetTable getChmLzxcResetTable() {
85 return chmLzxcResetTable;
86 }
87
88 /**
89 * Sets lzxc reset table
90 *
91 * @param chmLzxcResetTable
92 */
93 private void setChmLzxcResetTable(ChmLzxcResetTable chmLzxcResetTable) {
94 this.chmLzxcResetTable = chmLzxcResetTable;
95 }
96
97 /**
98 * Returns lzxc block length
99 *
100 * @return lzxBlockLength
101 */
102 private long getLzxBlockLength() {
103 return lzxBlockLength;
104 }
105
106 /**
107 * Sets lzxc block length
108 *
109 * @param lzxBlockLength
110 */
111 private void setLzxBlockLength(long lzxBlockLength) {
112 this.lzxBlockLength = lzxBlockLength;
113 }
114
115 /**
116 * Returns lzxc block offset
117 *
118 * @return lzxBlockOffset
119 */
120 private long getLzxBlockOffset() {
121 return lzxBlockOffset;
122 }
123
124 /**
125 * Sets lzxc block offset
126 */
127 private void setLzxBlockOffset(long lzxBlockOffset) {
128 this.lzxBlockOffset = lzxBlockOffset;
129 }
130
131 private int getIndexOfContent() {
132 return indexOfContent;
133 }
134
135 private void setIndexOfContent(int indexOfContent) {
136 this.indexOfContent = indexOfContent;
137 }
138
139 private byte[] getData() {
140 return data;
141 }
142
143 private void setData(byte[] data) {
144 this.data = data;
145 }
146
147 public ChmExtractor(InputStream is) throws TikaException, IOException {
148 ChmAssert.assertInputStreamNotNull(is);
149 try {
150 setData(IOUtils.toByteArray(is));
151
152 /* Creates and parses chm itsf header */
153 setChmItsfHeader(new ChmItsfHeader());
154 // getChmItsfHeader().parse(Arrays.copyOfRange(getData(), 0,
155 // ChmConstants.CHM_ITSF_V3_LEN - 1), getChmItsfHeader());
156 getChmItsfHeader().parse(ChmCommons.copyOfRange(getData(), 0,
157 ChmConstants.CHM_ITSF_V3_LEN - 1), getChmItsfHeader());
158
159 /* Creates and parses chm itsp header */
160 setChmItspHeader(new ChmItspHeader());
161 // getChmItspHeader().parse(Arrays.copyOfRange( getData(), (int)
162 // getChmItsfHeader().getDirOffset(),
163 // (int) getChmItsfHeader().getDirOffset() +
164 // ChmConstants.CHM_ITSP_V1_LEN), getChmItspHeader());
165 getChmItspHeader().parse(
166 ChmCommons.copyOfRange(getData(), (int) getChmItsfHeader()
167 .getDirOffset(), (int) getChmItsfHeader().getDirOffset() +
168 ChmConstants.CHM_ITSP_V1_LEN), getChmItspHeader());
169
170 /* Creates instance of ChmDirListingContainer */
171 setChmDirList(new ChmDirectoryListingSet(getData(),
172 getChmItsfHeader(), getChmItspHeader()));
173
174 int indexOfControlData = getChmDirList().getControlDataIndex();
175 int indexOfResetData = ChmCommons.indexOfResetTableBlock(getData(),
176 ChmConstants.LZXC.getBytes());
177 byte[] dir_chunk = null;
178 if (indexOfResetData > 0)
179 dir_chunk = ChmCommons.copyOfRange( getData(), indexOfResetData, indexOfResetData
180 + getChmDirList().getDirectoryListingEntryList().get(indexOfControlData).getLength());
181 // dir_chunk = Arrays.copyOfRange(getData(), indexOfResetData,
182 // indexOfResetData
183 // +
184 // getChmDirList().getDirectoryListingEntryList().get(indexOfControlData).getLength());
185
186 /* Creates and parses chm control data */
187 setChmLzxcControlData(new ChmLzxcControlData());
188 getChmLzxcControlData().parse(dir_chunk, getChmLzxcControlData());
189
190 int indexOfResetTable = getChmDirList().getResetTableIndex();
191 setChmLzxcResetTable(new ChmLzxcResetTable());
192
193 int startIndex = (int) getChmDirList().getDataOffset()
194 + getChmDirList().getDirectoryListingEntryList()
195 .get(indexOfResetTable).getOffset();
196
197 // assert startIndex < data.length
198 ChmAssert.assertCopyingDataIndex(startIndex, getData().length);
199
200 // dir_chunk = Arrays.copyOfRange(getData(), startIndex, startIndex
201 // +
202 // getChmDirList().getDirectoryListingEntryList().get(indexOfResetTable).getLength());
203 dir_chunk = ChmCommons.copyOfRange(getData(), startIndex, startIndex
204 + getChmDirList().getDirectoryListingEntryList().get(indexOfResetTable).getLength());
205
206 getChmLzxcResetTable().parse(dir_chunk, getChmLzxcResetTable());
207
208 setIndexOfContent(ChmCommons.indexOf(getChmDirList().getDirectoryListingEntryList(),
209 ChmConstants.CONTENT));
210 setLzxBlockOffset((getChmDirList().getDirectoryListingEntryList().get(getIndexOfContent()).getOffset()
211 + getChmItsfHeader().getDataOffset()));
212 setLzxBlockLength(getChmDirList().getDirectoryListingEntryList().get(getIndexOfContent()).getLength());
213
214 setLzxBlocksCache(new ArrayList<ChmLzxBlock>());
215
216 } catch (IOException e) {
217 // ignore
218 }
219 }
220
221 /**
222 * Enumerates chm entities
223 *
224 * @return list of chm entities
225 */
226 public List<String> enumerateChm() {
227 List<String> listOfEntries = new ArrayList<String>();
228 for (Iterator<DirectoryListingEntry> it = getChmDirList().getDirectoryListingEntryList().iterator(); it.hasNext();) {
229 listOfEntries.add(it.next().getName());
230 }
231 return listOfEntries;
232 }
233
234 /**
235 * Decompresses a chm entry
236 *
237 * @param directoryListingEntry
238 *
239 * @return decompressed data
240 * @throws TikaException
241 */
242 public byte[] extractChmEntry(DirectoryListingEntry directoryListingEntry) throws TikaException {
243 ByteArrayOutputStream buffer = new ByteArrayOutputStream();
244 ChmLzxBlock lzxBlock = null;
245 try {
246 /* UNCOMPRESSED type is easiest one */
247 if (directoryListingEntry.getEntryType() == EntryType.UNCOMPRESSED
248 && directoryListingEntry.getLength() > 0
249 && !ChmCommons.hasSkip(directoryListingEntry)) {
250 int dataOffset = (int) (getChmItsfHeader().getDataOffset() + directoryListingEntry
251 .getOffset());
252 // dataSegment = Arrays.copyOfRange(getData(), dataOffset,
253 // dataOffset + directoryListingEntry.getLength());
254 buffer.write(ChmCommons.copyOfRange(
255 getData(), dataOffset,
256 dataOffset + directoryListingEntry.getLength()));
257 } else if (directoryListingEntry.getEntryType() == EntryType.COMPRESSED
258 && !ChmCommons.hasSkip(directoryListingEntry)) {
259 /* Gets a chm block info */
260 ChmBlockInfo bb = ChmBlockInfo.getChmBlockInfoInstance(
261 directoryListingEntry, (int) getChmLzxcResetTable()
262 .getBlockLen(), getChmLzxcControlData());
263
264 int i = 0, start = 0, block = 0;
265
266 if ((getLzxBlockLength() < Integer.MAX_VALUE)
267 && (getLzxBlockOffset() < Integer.MAX_VALUE)) {
268 // TODO: Improve the caching
269 // caching ... = O(n^2) - depends on startBlock and endBlock
270 if (getLzxBlocksCache().size() != 0) {
271 for (i = 0; i < getLzxBlocksCache().size(); i++) {
272 lzxBlock = getLzxBlocksCache().get(i);
273 for (int j = bb.getIniBlock(); j <= bb
274 .getStartBlock(); j++) {
275 if (lzxBlock.getBlockNumber() == j)
276 if (j > start) {
277 start = j;
278 block = i;
279 }
280 if (start == bb.getStartBlock())
281 break;
282 }
283 }
284 }
285
286 if (i == getLzxBlocksCache().size() && i == 0) {
287 start = bb.getIniBlock();
288
289 byte[] dataSegment = ChmCommons.getChmBlockSegment(
290 getData(),
291 getChmLzxcResetTable(), start,
292 (int) getLzxBlockOffset(),
293 (int) getLzxBlockLength());
294
295 lzxBlock = new ChmLzxBlock(start, dataSegment,
296 getChmLzxcResetTable().getBlockLen(), null);
297
298 getLzxBlocksCache().add(lzxBlock);
299 } else {
300 lzxBlock = getLzxBlocksCache().get(block);
301 }
302
303 for (i = start; i <= bb.getEndBlock();) {
304 if (i == bb.getStartBlock() && i == bb.getEndBlock()) {
305 buffer.write(lzxBlock.getContent(
306 bb.getStartOffset(), bb.getEndOffset()));
307 break;
308 }
309
310 if (i == bb.getStartBlock()) {
311 buffer.write(lzxBlock.getContent(
312 bb.getStartOffset()));
313 }
314
315 if (i > bb.getStartBlock() && i < bb.getEndBlock()) {
316 buffer.write(lzxBlock.getContent());
317 }
318
319 if (i == bb.getEndBlock()) {
320 buffer.write(lzxBlock.getContent(
321 0, bb.getEndOffset()));
322 break;
323 }
324
325 i++;
326
327 if (i % getChmLzxcControlData().getResetInterval() == 0) {
328 lzxBlock = new ChmLzxBlock(i,
329 ChmCommons.getChmBlockSegment(getData(),
330 getChmLzxcResetTable(), i,
331 (int) getLzxBlockOffset(),
332 (int) getLzxBlockLength()),
333 getChmLzxcResetTable().getBlockLen(), null);
334 } else {
335 lzxBlock = new ChmLzxBlock(i,
336 ChmCommons.getChmBlockSegment(getData(),
337 getChmLzxcResetTable(), i,
338 (int) getLzxBlockOffset(),
339 (int) getLzxBlockLength()),
340 getChmLzxcResetTable().getBlockLen(),
341 lzxBlock);
342 }
343
344 getLzxBlocksCache().add(lzxBlock);
345 }
346
347 if (getLzxBlocksCache().size() > getChmLzxcResetTable()
348 .getBlockCount()) {
349 getLzxBlocksCache().clear();
350 }
351 }
352 }
353 } catch (Exception e) {
354 throw new TikaException(e.getMessage());
355 }
356
357 return buffer.toByteArray();
358 }
359
360 private void setLzxBlocksCache(List<ChmLzxBlock> lzxBlocksCache) {
361 this.lzxBlocksCache = lzxBlocksCache;
362 }
363
364 private List<ChmLzxBlock> getLzxBlocksCache() {
365 return lzxBlocksCache;
366 }
367
368 private void setChmDirList(ChmDirectoryListingSet chmDirList) {
369 this.chmDirList = chmDirList;
370 }
371
372 public ChmDirectoryListingSet getChmDirList() {
373 return chmDirList;
374 }
375
376 private void setChmItsfHeader(ChmItsfHeader chmItsfHeader) {
377 this.chmItsfHeader = chmItsfHeader;
378 }
379
380 private ChmItsfHeader getChmItsfHeader() {
381 return chmItsfHeader;
382 }
383 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.parser.chm.core;
18
19 import java.util.List;
20
21 import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
22 import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
23 import org.apache.tika.parser.chm.accessor.ChmItspHeader;
24 import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
25 import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
26 import org.apache.tika.parser.chm.lzx.ChmLzxBlock;
27
28 public class ChmWrapper {
29 private List<ChmLzxBlock> lzxBlocksCache = null;
30 private ChmDirectoryListingSet chmDirList = null;
31 private ChmItsfHeader chmItsfHeader = null;
32 private ChmItspHeader chmItspHeader = null;
33 private ChmLzxcResetTable chmLzxcResetTable = null;
34 private ChmLzxcControlData chmLzxcControlData = null;
35 private byte[] data = null;
36 private int indexOfContent;
37 private long lzxBlockOffset;
38 private long lzxBlockLength;
39 private int indexOfResetData;
40 private int indexOfResetTable;
41 private int startIndex;
42
43 protected int getStartIndex() {
44 return startIndex;
45 }
46
47 protected void setStartIndex(int startIndex) {
48 this.startIndex = startIndex;
49 }
50
51 protected int getIndexOfResetTable() {
52 return indexOfResetTable;
53 }
54
55 protected void setIndexOfResetTable(int indexOfResetTable) {
56 this.indexOfResetTable = indexOfResetTable;
57 }
58
59 protected List<ChmLzxBlock> getLzxBlocksCache() {
60 return lzxBlocksCache;
61 }
62
63 protected void setLzxBlocksCache(List<ChmLzxBlock> lzxBlocksCache) {
64 this.lzxBlocksCache = lzxBlocksCache;
65 }
66
67 protected ChmDirectoryListingSet getChmDirList() {
68 return chmDirList;
69 }
70
71 protected void setChmDirList(ChmDirectoryListingSet chmDirList) {
72 this.chmDirList = chmDirList;
73 }
74
75 protected ChmItsfHeader getChmItsfHeader() {
76 return chmItsfHeader;
77 }
78
79 protected void setChmItsfHeader(ChmItsfHeader chmItsfHeader) {
80 this.chmItsfHeader = chmItsfHeader;
81 }
82
83 protected ChmLzxcResetTable getChmLzxcResetTable() {
84 return chmLzxcResetTable;
85 }
86
87 protected void setChmLzxcResetTable(ChmLzxcResetTable chmLzxcResetTable) {
88 this.chmLzxcResetTable = chmLzxcResetTable;
89 }
90
91 protected ChmLzxcControlData getChmLzxcControlData() {
92 return chmLzxcControlData;
93 }
94
95 protected void setChmLzxcControlData(ChmLzxcControlData chmLzxcControlData) {
96 this.chmLzxcControlData = chmLzxcControlData;
97 }
98
99 protected byte[] getData() {
100 return data;
101 }
102
103 protected void setData(byte[] data) {
104 this.data = data;
105 }
106
107 protected int getIndexOfContent() {
108 return indexOfContent;
109 }
110
111 protected void setIndexOfContent(int indexOfContent) {
112 this.indexOfContent = indexOfContent;
113 }
114
115 protected long getLzxBlockOffset() {
116 return lzxBlockOffset;
117 }
118
119 protected void setLzxBlockOffset(long lzxBlockOffset) {
120 this.lzxBlockOffset = lzxBlockOffset;
121 }
122
123 protected long getLzxBlockLength() {
124 return lzxBlockLength;
125 }
126
127 protected void setLzxBlockLength(long lzxBlockLength) {
128 this.lzxBlockLength = lzxBlockLength;
129 }
130
131 protected void setChmItspHeader(ChmItspHeader chmItspHeader) {
132 this.chmItspHeader = chmItspHeader;
133 }
134
135 protected ChmItspHeader getChmItspHeader() {
136 return chmItspHeader;
137 }
138
139 protected void setIndexOfResetData(int indexOfResetData) {
140 this.indexOfResetData = indexOfResetData;
141 }
142
143 protected int getIndexOfResetData() {
144 return indexOfResetData;
145 }
146 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.chm.exception;
17
18 import org.apache.tika.exception.TikaException;
19
20 public class ChmParsingException extends TikaException {
21 private static final long serialVersionUID = 6497936044733665210L;
22
23 public ChmParsingException(String description) {
24 super(description);
25 }
26 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.chm.lzx;
17
18 import org.apache.tika.exception.TikaException;
19 import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
20 import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
21 import org.apache.tika.parser.chm.exception.ChmParsingException;
22
23 /**
24 * A container that contains chm block information such as: i. initial block is
25 * using to reset main tree ii. start block is using for knowing where to start
26 * iii. end block is using for knowing where to stop iv. start offset is using
27 * for knowing where to start reading v. end offset is using for knowing where
28 * to stop reading
29 *
30 */
31 public class ChmBlockInfo {
32 /* class members */
33 private int iniBlock;
34 private int startBlock;
35 private int endBlock;
36 private int startOffset;
37 private int endOffset;
38
39 private static ChmBlockInfo chmBlockInfo = null;
40
41 private ChmBlockInfo() {
42
43 }
44
45 /**
46 * Returns an information related to the chmBlockInfo
47 *
48 * @param dle
49 * - DirectoryListingEntry
50 * @param bytesPerBlock
51 * - int, = chmLzxcResetTable.block_length
52 * @param clcd
53 * - ChmLzxcControlData
54 * @param chmBlockInfo
55 * - ChmBlockInfo
56 *
57 * @return ChmBlockInfo
58 * @throws TikaException
59 */
60 protected ChmBlockInfo getChmBlockInfo(DirectoryListingEntry dle,
61 int bytesPerBlock, ChmLzxcControlData clcd,
62 ChmBlockInfo chmBlockInfo) throws TikaException {
63 if (!validateParameters(dle, bytesPerBlock, clcd, chmBlockInfo))
64 throw new ChmParsingException("Please check you parameters");
65
66 chmBlockInfo.setStartBlock(dle.getOffset() / bytesPerBlock);
67 chmBlockInfo.setEndBlock((dle.getOffset() + dle.getLength())
68 / bytesPerBlock);
69 chmBlockInfo.setStartOffset(dle.getOffset() % bytesPerBlock);
70 chmBlockInfo.setEndOffset((dle.getOffset() + dle.getLength())
71 % bytesPerBlock);
72 // potential problem with casting long to int
73 chmBlockInfo
74 .setIniBlock((chmBlockInfo.startBlock - chmBlockInfo.startBlock)
75 % (int) clcd.getResetInterval());
76 return chmBlockInfo;
77 }
78
79 public static ChmBlockInfo getChmBlockInfoInstance(
80 DirectoryListingEntry dle, int bytesPerBlock,
81 ChmLzxcControlData clcd) {
82 setChmBlockInfo(new ChmBlockInfo());
83 getChmBlockInfo().setStartBlock(dle.getOffset() / bytesPerBlock);
84 getChmBlockInfo().setEndBlock(
85 (dle.getOffset() + dle.getLength()) / bytesPerBlock);
86 getChmBlockInfo().setStartOffset(dle.getOffset() % bytesPerBlock);
87 getChmBlockInfo().setEndOffset(
88 (dle.getOffset() + dle.getLength()) % bytesPerBlock);
89 // potential problem with casting long to int
90 getChmBlockInfo().setIniBlock(
91 (getChmBlockInfo().startBlock - getChmBlockInfo().startBlock)
92 % (int) clcd.getResetInterval());
93 return getChmBlockInfo();
94 }
95
96 /**
97 * Returns textual representation of ChmBlockInfo
98 */
99 public String toString() {
100 StringBuilder sb = new StringBuilder();
101 sb.append("iniBlock:=" + getIniBlock() + ", ");
102 sb.append("startBlock:=" + getStartBlock() + ", ");
103 sb.append("endBlock:=" + getEndBlock() + ", ");
104 sb.append("startOffset:=" + getStartOffset() + ", ");
105 sb.append("endOffset:=" + getEndOffset()
106 + System.getProperty("line.separator"));
107 return sb.toString();
108 }
109
110 private boolean validateParameters(DirectoryListingEntry dle,
111 int bytesPerBlock, ChmLzxcControlData clcd,
112 ChmBlockInfo chmBlockInfo) {
113 int goodParameter = 0;
114 if (dle != null)
115 ++goodParameter;
116 if (bytesPerBlock > 0)
117 ++goodParameter;
118 if (clcd != null)
119 ++goodParameter;
120 if (chmBlockInfo != null)
121 ++goodParameter;
122 return (goodParameter == 4);
123 }
124
125 public static void main(String[] args) {
126 }
127
128 /**
129 * Returns an initial block index
130 *
131 * @return int
132 */
133 public int getIniBlock() {
134 return iniBlock;
135 }
136
137 /**
138 * Sets the initial block index
139 *
140 * @param iniBlock
141 * - int
142 */
143 private void setIniBlock(int iniBlock) {
144 this.iniBlock = iniBlock;
145 }
146
147 /**
148 * Returns the start block index
149 *
150 * @return int
151 */
152 public int getStartBlock() {
153 return startBlock;
154 }
155
156 /**
157 * Sets the start block index
158 *
159 * @param startBlock
160 * - int
161 */
162 private void setStartBlock(int startBlock) {
163 this.startBlock = startBlock;
164 }
165
166 /**
167 * Returns the end block index
168 *
169 * @return - int
170 */
171 public int getEndBlock() {
172 return endBlock;
173 }
174
175 /**
176 * Sets the end block index
177 *
178 * @param endBlock
179 * - int
180 */
181 private void setEndBlock(int endBlock) {
182 this.endBlock = endBlock;
183 }
184
185 /**
186 * Returns the start offset index
187 *
188 * @return - int
189 */
190 public int getStartOffset() {
191 return startOffset;
192 }
193
194 /**
195 * Sets the start offset index
196 *
197 * @param startOffset
198 * - int
199 */
200 private void setStartOffset(int startOffset) {
201 this.startOffset = startOffset;
202 }
203
204 /**
205 * Returns the end offset index
206 *
207 * @return - int
208 */
209 public int getEndOffset() {
210 return endOffset;
211 }
212
213 /**
214 * Sets the end offset index
215 *
216 * @param endOffset
217 * - int
218 */
219 private void setEndOffset(int endOffset) {
220 this.endOffset = endOffset;
221 }
222
223 public static void setChmBlockInfo(ChmBlockInfo chmBlockInfo) {
224 ChmBlockInfo.chmBlockInfo = chmBlockInfo;
225 }
226
227 public static ChmBlockInfo getChmBlockInfo() {
228 return chmBlockInfo;
229 }
230 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.chm.lzx;
17
18 import java.math.BigInteger;
19
20 import org.apache.tika.exception.TikaException;
21 import org.apache.tika.parser.chm.core.ChmCommons;
22 import org.apache.tika.parser.chm.core.ChmCommons.IntelState;
23 import org.apache.tika.parser.chm.core.ChmCommons.LzxState;
24 import org.apache.tika.parser.chm.core.ChmConstants;
25 import org.apache.tika.parser.chm.exception.ChmParsingException;
26
27 /**
28 * Decompresses a chm block. Depending on chm block type chooses most relevant
29 * decompressing method. A chm block type can be as follows:</br> <li>UNDEFINED
30 * - no action taken, i.e. skipping the block <li>VERBATIM <li>ALIGNED_OFFSET
31 * <li>UNCOMPRESSED the most simplest In addition there are unknown types (4-7).
32 * Currently relying on previous chm block these types changing according to the
33 * previous chm block type. We need to invent more appropriate way to handle
34 * such types.
35 *
36 */
37 public class ChmLzxBlock {
38 private int block_number;
39 private long block_length;
40 private ChmLzxState state;
41 private byte[] content = null;
42 private ChmSection chmSection = null;
43 private int contentLength = 0;
44
45 // trying to find solution for bad blocks ...
46 private int previousBlockType = -1;
47
48 public ChmLzxBlock(int blockNumber, byte[] dataSegment, long blockLength,
49 ChmLzxBlock prevBlock) {
50 try {
51 if (validateConstructorParams(blockNumber, dataSegment, blockLength)) {
52 setBlockNumber(blockNumber);
53
54 if (prevBlock != null
55 && prevBlock.getState().getBlockLength() > prevBlock
56 .getState().getBlockRemaining())
57 setChmSection(new ChmSection(prevBlock.getContent()));
58 else
59 setChmSection(new ChmSection(dataSegment));
60
61 setBlockLength(blockLength);
62
63 // ============================================
64 // we need to take care of previous context
65 // ============================================
66 checkLzxBlock(prevBlock);
67 setContent((int) blockLength);
68 if (prevBlock == null
69 || getContent().length < (int) getBlockLength()) {
70 setContent((int) getBlockLength());
71 }
72
73 if (prevBlock != null && prevBlock.getState() != null)
74 previousBlockType = prevBlock.getState().getBlockType();
75
76 extractContent();
77 } else
78 throw new TikaException("Check your chm lzx block parameters");
79 } catch (Exception e) {
80 // TODO: handle exception
81 }
82 }
83
84 protected int getContentLength() {
85 return contentLength;
86 }
87
88 protected void setContentLength(int contentLength) {
89 this.contentLength = contentLength;
90 }
91
92 private ChmSection getChmSection() {
93 return chmSection;
94 }
95
96 private void setChmSection(ChmSection chmSection) {
97 this.chmSection = chmSection;
98 }
99
100 private void assertStateNotNull() throws TikaException {
101 if (getState() == null)
102 throw new ChmParsingException("state is null");
103 }
104
/**
 * Decodes the section data into {@code content}.
 * <p>
 * Loops until {@code getContentLength()} reaches {@code getBlockLength()} or
 * an iteration adds no bytes. Whenever the state reports zero remaining
 * bytes, a new block header (type and length) is read from the section and
 * the tables needed by that block type are rebuilt before decoding resumes.
 *
 * @throws TikaException if the state is null or a table/decoding step fails
 */
105 private void extractContent() throws TikaException {
106 assertStateNotNull();
107 if (getChmSection().getData() != null) {
108 boolean continueLoop = true;
109 while (continueLoop && getContentLength() < getBlockLength()) {
// A block header is read only once the previous block is used up.
110 if (getState() != null && getState().getBlockRemaining() == 0) {
111 if (getState().getHadStarted() == LzxState.NOT_STARTED_DECODING) {
112 getState().setHadStarted(LzxState.STARTED_DECODING);
// A set flag bit is followed by the Intel file size as two 16-bit
// halves; a negative result is stored as 0.
113 if (getChmSection().getSyncBits(1) == 1) {
114 int intelSizeTemp = (getChmSection()
115 .getSyncBits(16) << 16)
116 + getChmSection().getSyncBits(16);
117 if (intelSizeTemp >= 0)
118 getState().setIntelFileSize(intelSizeTemp);
119 else
120 getState().setIntelFileSize(0);
121 }
122 }
// Header: 3 bits of block type, then the length as a 16-bit value
// shifted left by 8 plus an 8-bit value.
123 getState().setBlockType(getChmSection().getSyncBits(3));
124 getState().setBlockLength(
125 (getChmSection().getSyncBits(16) << 8)
126 + getChmSection().getSyncBits(8));
127 getState().setBlockRemaining(getState().getBlockLength());
128
129 // ----------------------------------------
130 // Trying to handle 3 - 7 block types
131 // ----------------------------------------
// Types above 3 are replaced by the previous block's type when that
// type is in the range 0..2.
132 if (getState().getBlockType() > 3) {
133 if (previousBlockType >= 0 && previousBlockType < 3)
134 getState().setBlockType(previousBlockType);
135 }
136
137 switch (getState().getBlockType()) {
138 case ChmCommons.ALIGNED_OFFSET:
// No break: falls through so the main and length trees are built too.
139 createAlignedTreeTable();
140 case ChmCommons.VERBATIM:
141 /* Creates mainTreeTable */
142 createMainTreeTable();
143 createLengthTreeTable();
144 if (getState().getMainTreeLengtsTable()[0xe8] != 0)
145 getState().setIntelState(IntelState.STARTED);
146 break;
147 case ChmCommons.UNCOMPRESSED:
148 getState().setIntelState(IntelState.STARTED);
149 if (getChmSection().getTotal() > 16)
150 getChmSection().setSwath(
151 getChmSection().getSwath() - 1);
// R0, R1 and R2 are each read as 4 bytes whose order is reversed
// before conversion.
152 getState().setR0(
153 (new BigInteger(getChmSection()
154 .reverseByteOrder(
155 getChmSection().unmarshalBytes(
156 4))).longValue()));
157 getState().setR1(
158 (new BigInteger(getChmSection()
159 .reverseByteOrder(
160 getChmSection().unmarshalBytes(
161 4))).longValue()));
162 getState().setR2(
163 (new BigInteger(getChmSection()
164 .reverseByteOrder(
165 getChmSection().unmarshalBytes(
166 4))).longValue()));
167 break;
168 default:
169 break;
170 }
171 }
172
173 int tempLen;
174
// Clamp this pass to the end of the output buffer; whatever does not
// fit stays recorded in blockRemaining.
175 if (getContentLength() + getState().getBlockRemaining() > getBlockLength()) {
176 getState().setBlockRemaining(
177 getContentLength() + getState().getBlockRemaining()
178 - (int) getBlockLength());
179 tempLen = (int) getBlockLength();
180 } else {
181 tempLen = getContentLength()
182 + getState().getBlockRemaining();
183 getState().setBlockRemaining(0);
184 }
185
186 int lastLength = getContentLength();
187 switch (getState().getBlockType()) {
188 case ChmCommons.ALIGNED_OFFSET:
189 // if(prevblock.lzxState.length>prevblock.lzxState.remaining)
190 decompressAlignedBlock(tempLen, getChmSection().getData());// prevcontext
191 break;
192 case ChmCommons.VERBATIM:
193 decompressVerbatimBlock(tempLen, getChmSection().getData());
194 break;
195 case ChmCommons.UNCOMPRESSED:
196 decompressUncompressedBlock(tempLen, getChmSection()
197 .getData());
198 break;
199 }
200 getState().increaseFramesRead();
201 if ((getState().getFramesRead() < 32768)
202 && getState().getIntelFileSize() != 0)
203 intelE8Decoding();
204
// Stop when a pass produced no new bytes.
205 continueLoop = getContentLength() > lastLength;
206 }
207 }
208 }
209
/**
 * Post-processes the decoded content for Intel E8 (x86 call) translation.
 * <p>
 * In both branches the block length is subtracted from the state's remaining
 * byte count. The scan for 0xe8 bytes only happens when the block is longer
 * than {@code LZX_PRETREE_TABLEBITS} and the Intel state is not NOT_STARTED.
 */
210 protected void intelE8Decoding() {
211 if (getBlockLength() <= ChmConstants.LZX_PRETREE_TABLEBITS
212 || (getState().getIntelState() == IntelState.NOT_STARTED)) {
213 getState().setBlockRemaining(
214 getState().getBlockRemaining() - (int) getBlockLength());
215 } else {
216 long curpos = getState().getBlockRemaining();
217 getState().setBlockRemaining(
218 getState().getBlockRemaining() - (int) getBlockLength());
219 int i = 0;
220 while (i < getBlockLength() - 10) {
// NOTE(review): content[i] is a signed byte (-128..127) while 0xe8 is the
// int 232, so this comparison is always true and the translation below
// never runs. Confirm whether (content[i] & 0xff) was intended; the
// offsets used below should be re-checked before enabling it.
221 if (content[i] != 0xe8) {
222 i++;
223 continue;
224 }
// Assemble a big-endian array from the four bytes starting at i.
225 byte[] b = new byte[4];
226 b[0] = getContent()[i + 3];
227 b[1] = getContent()[i + 2];
228 b[2] = getContent()[i + 1];
229 b[3] = getContent()[i + 0];
230 long absoff = (new BigInteger(b)).longValue();
231 if ((absoff >= -curpos)
232 && (absoff < getState().getIntelFileSize())) {
233 long reloff = (absoff >= 0) ? absoff - curpos : absoff
234 + getState().getIntelFileSize();
// Write the relative offset back, least significant byte first.
235 getContent()[i + 0] = (byte) reloff;
236 getContent()[i + 1] = (byte) (reloff >>> 8);
237 getContent()[i + 2] = (byte) (reloff >>> 16);
238 getContent()[i + 3] = (byte) (reloff >>> 24);
239 }
240 i += 4;
241 curpos += 5;
242 }
243 }
244 }
245
246 private short[] createPreLenTable() {
247 short[] tmp = new short[ChmConstants.LZX_PRETREE_MAXSYMBOLS];
248 for (int i = 0; i < ChmConstants.LZX_PRETREE_MAXSYMBOLS; i++) {
249 tmp[i] = (short) getChmSection().getSyncBits(
250 ChmConstants.LZX_PRETREE_NUM_ELEMENTS_BITS);
251 }
252 return tmp;
253 }
254
255 private void createLengthTreeTable() throws TikaException {
256 short[] prelentable = createPreLenTable();
257
258 if (prelentable == null) {
259 throw new ChmParsingException("pretreetable is null");
260 }
261
262 short[] pretreetable = createTreeTable2(prelentable,
263 (1 << ChmConstants.LZX_PRETREE_TABLEBITS)
264 + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1),
265 ChmConstants.LZX_PRETREE_TABLEBITS,
266 ChmConstants.LZX_PRETREE_MAXSYMBOLS);
267
268 if (pretreetable == null) {
269 throw new ChmParsingException("pretreetable is null");
270 }
271
272 createLengthTreeLenTable(0, ChmConstants.LZX_NUM_SECONDARY_LENGTHS,
273 pretreetable, prelentable);
274
275 getState().setLengthTreeTable(
276 createTreeTable2(getState().getLengthTreeLengtsTable(),
277 (1 << ChmConstants.LZX_MAINTREE_TABLEBITS)
278 + (ChmConstants.LZX_LENGTH_MAXSYMBOLS << 1),
279 ChmConstants.LZX_MAINTREE_TABLEBITS,
280 ChmConstants.LZX_NUM_SECONDARY_LENGTHS));
281 }
282
283 private void decompressUncompressedBlock(int len, byte[] prevcontent) {
284 if (getContentLength() + getState().getBlockRemaining() <= getBlockLength()) {
285 for (int i = getContentLength(); i < (getContentLength() + getState()
286 .getBlockRemaining()); i++)
287 content[i] = getChmSection().getByte();
288
289 setContentLength(getContentLength()
290 + getState().getBlockRemaining());
291 getState().setBlockRemaining(0);
292 } else {
293 for (int i = getContentLength(); i < getBlockLength(); i++)
294 content[i] = getChmSection().getByte();
295 getState().setBlockRemaining(
296 (int) getBlockLength() - getContentLength());// = blockLen -
297 // contentlen;
298 setContentLength((int) getBlockLength());
299 }
300 }
301
/**
 * Decodes symbols of an ALIGNED_OFFSET block into {@code content}, from the
 * current content length up to (but not including) {@code len}, then sets
 * the content length to {@code len}.
 *
 * @param len         end index in {@code content} for this pass
 * @param prevcontent bytes used as match source when a match starts before
 *                    index 0 of {@code content}
 * @throws TikaException if the section, the state or the main tree table is
 *             missing
 */
302 private void decompressAlignedBlock(int len, byte[] prevcontent) throws TikaException {
303
304 if ((getChmSection() == null) || (getState() == null)
305 || (getState().getMainTreeTable() == null))
306 throw new ChmParsingException("chm section is null");
307
308 short s;
309 int x, i, border;
310 int matchlen = 0, matchfooter = 0, extra, rundest, runsrc;
311 int matchoffset = 0;
312 for (i = getContentLength(); i < len; i++) {
313 /* new code */
// NOTE(review): getDesyncBits is called here and again for the lookup
// below; this presumably relies on it not consuming bits - confirm.
314 border = getChmSection().getDesyncBits(
315 ChmConstants.LZX_MAINTREE_TABLEBITS, 0);
316 if (border >= getState().mainTreeTable.length)
317 break;
318 /* end new code */
319 s = getState().mainTreeTable[getChmSection().getDesyncBits(
320 ChmConstants.LZX_MAINTREE_TABLEBITS, 0)];
// Entries at or above the element count are walked bit by bit through
// the table until a symbol is reached.
321 if (s >= getState().getMainTreeElements()) {
322 x = ChmConstants.LZX_MAINTREE_TABLEBITS;
323 do {
324 x++;
325 s <<= 1;
326 s += getChmSection().checkBit(x);
327 } while ((s = getState().mainTreeTable[s]) >= getState()
328 .getMainTreeElements());
329 }
// NOTE(review): the bit count comes from mainTreeTable[s], whereas
// decompressVerbatimBlock uses getMainTreeLengtsTable()[s] - confirm.
330 getChmSection().getSyncBits(getState().mainTreeTable[s]);
331 if (s < ChmConstants.LZX_NUM_CHARS) {
// Literal byte.
332 content[i] = (byte) s;
333 } else {
// Match: the low bits give the length, the rest selects the offset.
334 s -= ChmConstants.LZX_NUM_CHARS;
335 matchlen = s & ChmConstants.LZX_NUM_PRIMARY_LENGTHS;
336 if (matchlen == ChmConstants.LZX_NUM_PRIMARY_LENGTHS) {
337 matchfooter = getState().lengthTreeTable[getChmSection()
338 .getDesyncBits(ChmConstants.LZX_MAINTREE_TABLEBITS,
339 0)];
// NOTE(review): this threshold is LZX_MAINTREE_TABLEBITS while the loop
// condition below and decompressVerbatimBlock use
// LZX_NUM_SECONDARY_LENGTHS - confirm which is intended.
340 if (matchfooter >= ChmConstants.LZX_MAINTREE_TABLEBITS) {
341 x = ChmConstants.LZX_MAINTREE_TABLEBITS;
342 do {
343 x++;
344 matchfooter <<= 1;
345 matchfooter += getChmSection().checkBit(x);
346 } while ((matchfooter = getState().lengthTreeTable[matchfooter]) >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS);
347 }
348 getChmSection().getSyncBits(
349 getState().lengthTreeLengtsTable[matchfooter]);
350 matchlen += matchfooter;
351 }
352 matchlen += ChmConstants.LZX_MIN_MATCH;
353 matchoffset = s >>> 3;
354 if (matchoffset > 2) {
// Explicit offset: base position plus extra bits, and for extra >= 3
// a value decoded through the aligned tree table.
355 extra = ChmConstants.EXTRA_BITS[matchoffset];
356 matchoffset = (ChmConstants.POSITION_BASE[matchoffset] - 2);
357 if (extra > 3) {
358 extra -= 3;
359 long l = getChmSection().getSyncBits(extra);
360 matchoffset += (l << 3);
361 int g = getChmSection().getDesyncBits(
362 ChmConstants.LZX_NUM_PRIMARY_LENGTHS, 0);
363 int t = getState().getAlignedTreeTable()[g];
364 if (t >= getState().getMainTreeElements()) {
365 x = ChmConstants.LZX_MAINTREE_TABLEBITS;
366 do {
367 x++;
368 t <<= 1;
369 t += getChmSection().checkBit(x);
370 } while ((t = getState().getAlignedTreeTable()[t]) >= getState()
371 .getMainTreeElements());
372 }
373 getChmSection().getSyncBits(
374 getState().getAlignedTreeTable()[t]);
375 matchoffset += t;
376 } else if (extra == 3) {
377 int g = (int) getChmSection().getDesyncBits(
378 ChmConstants.LZX_NUM_PRIMARY_LENGTHS, 0);
379 int t = getState().getAlignedTreeTable()[g];
380 if (t >= getState().getMainTreeElements()) {
381 x = ChmConstants.LZX_MAINTREE_TABLEBITS;
382 do {
383 x++;
384 t <<= 1;
385 t += getChmSection().checkBit(x);
386 } while ((t = getState().getAlignedTreeTable()[t]) >= getState()
387 .getMainTreeElements());
388 }
389 getChmSection().getSyncBits(
390 getState().getAlignedTreeTable()[t]);
391 matchoffset += t;
392 } else if (extra > 0) {
393 long l = getChmSection().getSyncBits(extra);
394 matchoffset += l;
395 } else
396 matchoffset = 1;
// A new explicit offset pushes the recent-offset registers down.
397 getState().setR2(getState().getR1());
398 getState().setR1(getState().getR0());
399 getState().setR0(matchoffset);
400 } else if (matchoffset == 0) {
401 matchoffset = (int) getState().getR0();
402 } else if (matchoffset == 1) {
// Reuse R1 and swap it with R0.
403 matchoffset = (int) getState().getR1();
404 getState().setR1(getState().getR0());
405 getState().setR0(matchoffset);
406 } else /** match_offset == 2 */
407 {
// Reuse R2 and swap it with R0.
408 matchoffset = (int) getState().getR2();
409 getState().setR2(getState().getR0());
410 getState().setR0(matchoffset);
411 }
412 rundest = i;
413 runsrc = rundest - matchoffset;
414 i += (matchlen - 1);
// A match that would run past len ends the pass without being copied.
415 if (i > len)
416 break;
417
418 if (runsrc < 0) {
// The match starts before index 0: take those bytes from the tail of
// prevcontent, then continue from the start of content if needed.
419 if (matchlen + runsrc <= 0) {
420 runsrc = prevcontent.length + runsrc;
421 while (matchlen-- > 0)
422 content[rundest++] = prevcontent[runsrc++];
423 } else {
424 runsrc = prevcontent.length + runsrc;
425 while (runsrc < prevcontent.length)
426 content[rundest++] = prevcontent[runsrc++];
427 matchlen = matchlen + runsrc - prevcontent.length;
428 runsrc = 0;
429 while (matchlen-- > 0)
430 content[rundest++] = content[runsrc++];
431 }
432
433 } else {
434 /* copies any wrappes around source data */
// runsrc is never negative in this branch, so this loop body does not run.
435 while ((runsrc < 0) && (matchlen-- > 0)) {
436 content[rundest++] = content[(int) (runsrc + getBlockLength())];
437 runsrc++;
438 }
439 /* copies match data - no worries about destination wraps */
440 while (matchlen-- > 0)
441 content[rundest++] = content[runsrc++];
442 }
443 }
444 }
445 setContentLength(len);
446 }
447
448 private void assertShortArrayNotNull(short[] array) throws TikaException {
449 if (array == null)
450 throw new ChmParsingException("short[] is null");
451 }
452
/**
 * Decodes symbols of a VERBATIM block into {@code content}, from the current
 * content length up to (but not including) {@code len}, then sets the
 * content length to {@code len}.
 *
 * @param len         end index in {@code content} for this pass
 * @param prevcontent bytes used as match source when a match starts before
 *                    index 0 of {@code content}
 * @throws TikaException if the main tree table or length tree table is null
 */
453 private void decompressVerbatimBlock(int len, byte[] prevcontent) throws TikaException {
454 short s;
455 int x, i;
456 int matchlen = 0, matchfooter = 0, extra, rundest, runsrc;
457 int matchoffset = 0;
458 for (i = getContentLength(); i < len; i++) {
459 int f = (int) getChmSection().getDesyncBits(
460 ChmConstants.LZX_MAINTREE_TABLEBITS, 0);
461 assertShortArrayNotNull(getState().getMainTreeTable());
462 s = getState().getMainTreeTable()[f];
// Entries at or above LZX_MAIN_MAXSYMBOLS are walked bit by bit through
// the table until a symbol is reached.
463 if (s >= ChmConstants.LZX_MAIN_MAXSYMBOLS) {
464 x = ChmConstants.LZX_MAINTREE_TABLEBITS;
465 do {
466 x++;
467 s <<= 1;
468 s += getChmSection().checkBit(x);
469 } while ((s = getState().getMainTreeTable()[s]) >= ChmConstants.LZX_MAIN_MAXSYMBOLS);
470 }
// Consume the number of bits given by the symbol's code length.
471 getChmSection().getSyncBits(getState().getMainTreeLengtsTable()[s]);
472 if (s < ChmConstants.LZX_NUM_CHARS) {
// Literal byte.
473 content[i] = (byte) s;
474 } else {
// Match: the low bits give the length, the rest selects the offset.
475 s -= ChmConstants.LZX_NUM_CHARS;
476 matchlen = s & ChmConstants.LZX_NUM_PRIMARY_LENGTHS;
477 if (matchlen == ChmConstants.LZX_NUM_PRIMARY_LENGTHS) {
478 matchfooter = getState().getLengthTreeTable()[(int) getChmSection()
479 .getDesyncBits(ChmConstants.LZX_LENGTH_TABLEBITS, 0)];
480 if (matchfooter >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS) {
481 x = ChmConstants.LZX_LENGTH_TABLEBITS;
482 do {
483 x++;
484 matchfooter <<= 1;
485 matchfooter += getChmSection().checkBit(x);
486 } while ((matchfooter = getState().getLengthTreeTable()[matchfooter]) >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS);
487 }
488 getChmSection().getSyncBits(
489 getState().getLengthTreeLengtsTable()[matchfooter]);
490 matchlen += matchfooter;
491 }
492 matchlen += ChmConstants.LZX_MIN_MATCH;
493 // shorter than 2
494 matchoffset = s >>> 3;
495 if (matchoffset > 2) {
496 if (matchoffset != 3) { // should get other bits to retrieve
497 // offset
498 extra = ChmConstants.EXTRA_BITS[matchoffset];
499 long l = getChmSection().getSyncBits(extra);
500 matchoffset = (int) (ChmConstants.POSITION_BASE[matchoffset] - 2 + l);
501 } else {
502 matchoffset = 1;
503 }
// A new explicit offset pushes the recent-offset registers down.
504 getState().setR2(getState().getR1());
505 getState().setR1(getState().getR0());
506 getState().setR0(matchoffset);
507 } else if (matchoffset == 0) {
508 matchoffset = (int) getState().getR0();
509 } else if (matchoffset == 1) {
// Reuse R1 and swap it with R0.
510 matchoffset = (int) getState().getR1();
511 getState().setR1(getState().getR0());
512 getState().setR0(matchoffset);
513 } else /* match_offset == 2 */
514 {
// Reuse R2 and swap it with R0.
515 matchoffset = (int) getState().getR2();
516 getState().setR2(getState().getR0());
517 getState().setR0(matchoffset);
518 }
519 rundest = i;
520 runsrc = rundest - matchoffset;
521 i += (matchlen - 1);
// A match that would run past len ends the pass without being copied.
522 if (i > len)
523 break;
524 if (runsrc < 0) {
// The match starts before index 0: take those bytes from the tail of
// prevcontent, then continue from the start of content if needed.
525 if (matchlen + runsrc <= 0) {
// NOTE(review): prevcontent.length is dereferenced before the
// (prevcontent != null) test in the loop below, and runsrc (an index
// into prevcontent) is bounded by content.length - confirm.
526 runsrc = prevcontent.length + runsrc;
527 while ((matchlen-- > 0) && (prevcontent != null)
528 && ((runsrc + 1) > 0))
529 if ((rundest < content.length)
530 && (runsrc < content.length))
531 content[rundest++] = prevcontent[runsrc++];
532 } else {
533 runsrc = prevcontent.length + runsrc;
// NOTE(review): if the guard below is false neither index advances, so
// this loop would not terminate - confirm this cannot happen.
534 while (runsrc < prevcontent.length)
535 if ((rundest < content.length)
536 && (runsrc < content.length))
537 content[rundest++] = prevcontent[runsrc++];
538 matchlen = matchlen + runsrc - prevcontent.length;
539 runsrc = 0;
540 while (matchlen-- > 0)
541 content[rundest++] = content[runsrc++];
542 }
543
544 } else {
545 /* copies any wrapped source data */
// runsrc is never negative in this branch, so this loop body does not run.
546 while ((runsrc < 0) && (matchlen-- > 0)) {
547 content[rundest++] = content[(int) (runsrc + getBlockLength())];
548 runsrc++;
549 }
550 /* copies match data - no worries about destination wraps */
551 while (matchlen-- > 0) {
552 if ((rundest < content.length)
553 && (runsrc < content.length))
554 content[rundest++] = content[runsrc++];
555 }
556 }
557 }
558 }
559 setContentLength(len);
560 }
561
/**
 * Decodes the code lengths of the length tree into the state's
 * {@code lengthTreeLengtsTable}, for entries {@code offset} up to
 * {@code tablelen}.
 *
 * @param offset       first table index to fill
 * @param tablelen     index at which filling stops
 * @param pretreetable pre-tree decode table
 * @param prelentable  pre-tree code lengths (bits consumed per symbol)
 * @throws TikaException if a table or the section is null
 */
562 private void createLengthTreeLenTable(int offset, int tablelen,
563 short[] pretreetable, short[] prelentable) throws TikaException {
// NOTE(review): prelentable is tested twice in this condition.
564 if (prelentable == null || getChmSection() == null
565 || pretreetable == null || prelentable == null)
566 throw new ChmParsingException("is null");
567
568 int i = offset; // represents offset
569 int z, y, x;// local counters
570 while (i < tablelen) {
571 z = pretreetable[(int) getChmSection().getDesyncBits(
572 ChmConstants.LZX_PRETREE_TABLEBITS, 0)];
573 if (z >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS) {// 1 bug, should be
574 // 20
575 x = ChmConstants.LZX_PRETREE_TABLEBITS;
576 do {
577 x++;
578 z <<= 1;
579 z += getChmSection().checkBit(x);
580 } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS);
581 }
582 getChmSection().getSyncBits(prelentable[z]);
583 if (z < 17) {
// Delta coding: new length = previous length - z, wrapped by adding 17
// when negative.
584 z = getState().getLengthTreeLengtsTable()[i] - z;
585 if (z < 0)
586 z = z + 17;
587 getState().getLengthTreeLengtsTable()[i] = (short) z;
588 i++;
589 } else if (z == 17) {
// Run of zeros: 4 plus a 4-bit value, clipped to the table length.
590 y = (int) getChmSection().getSyncBits(4);
591 y += 4;
592 for (int j = 0; j < y; j++)
593 if (i < getState().getLengthTreeLengtsTable().length)
594 getState().getLengthTreeLengtsTable()[i++] = 0;
595 } else if (z == 18) {
// Run of zeros: 20 plus a 5-bit value, clipped to the table length.
596 y = (int) getChmSection().getSyncBits(5);
597 y += 20;
598 for (int j = 0; j < y; j++)
599 if (i < getState().getLengthTreeLengtsTable().length)
600 getState().getLengthTreeLengtsTable()[i++] = 0;
601 } else if (z == 19) {
// Run of equal lengths: 4 plus a 1-bit value copies of one decoded length.
602 y = getChmSection().getSyncBits(1);
603 y += 4;
604 z = pretreetable[(int) getChmSection().getDesyncBits(
605 ChmConstants.LZX_PRETREE_TABLEBITS, 0)];
606 if (z >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS) {// 20
607 x = ChmConstants.LZX_PRETREE_TABLEBITS;// 6
608 do {
609 x++;
610 z <<= 1;
611 z += getChmSection().checkBit(x);
// NOTE(review): this loop compares against LZX_MAINTREE_TABLEBITS while
// the equivalent loop above uses LZX_PRETREE_NUM_ELEMENTS - confirm.
612 } while ((z = pretreetable[z]) >= ChmConstants.LZX_MAINTREE_TABLEBITS);
613 }
614 getChmSection().getSyncBits(prelentable[z]);
615 z = getState().getLengthTreeLengtsTable()[i] - z;
616 if (z < 0)
617 z = z + 17;
// NOTE(review): unlike the zero runs above, this run is not clipped to
// the table length.
618 for (int j = 0; j < y; j++)
619 getState().getLengthTreeLengtsTable()[i++] = (short) z;
620 }
621 }
622 }
623
624 private void createMainTreeTable() throws TikaException {
625 short[] prelentable = createPreLenTable();
626 short[] pretreetable = createTreeTable2(prelentable,
627 (1 << ChmConstants.LZX_PRETREE_TABLEBITS)
628 + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1),
629 ChmConstants.LZX_PRETREE_TABLEBITS,
630 ChmConstants.LZX_PRETREE_MAXSYMBOLS);
631 createMainTreeLenTable(0, ChmConstants.LZX_NUM_CHARS, pretreetable,
632 prelentable);
633 prelentable = createPreLenTable();
634 pretreetable = createTreeTable2(prelentable,
635 (1 << ChmConstants.LZX_PRETREE_TABLEBITS)
636 + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1),
637 ChmConstants.LZX_PRETREE_TABLEBITS,
638 ChmConstants.LZX_PRETREE_MAXSYMBOLS);
639 createMainTreeLenTable(ChmConstants.LZX_NUM_CHARS,
640 getState().mainTreeLengtsTable.length, pretreetable,
641 prelentable);
642
643 getState().setMainTreeTable(
644 createTreeTable2(getState().mainTreeLengtsTable,
645 (1 << ChmConstants.LZX_MAINTREE_TABLEBITS)
646 + (ChmConstants.LZX_MAINTREE_MAXSYMBOLS << 1),
647 ChmConstants.LZX_MAINTREE_TABLEBITS, getState()
648 .getMainTreeElements()));
649
650 }
651
/**
 * Decodes code lengths of the main tree into the state's
 * {@code mainTreeLengtsTable}, for entries {@code offset} up to
 * {@code tablelen}.
 *
 * @param offset       first table index to fill
 * @param tablelen     index at which filling stops
 * @param pretreetable pre-tree decode table
 * @param prelentable  pre-tree code lengths (bits consumed per symbol)
 * @throws TikaException if {@code pretreetable} is null or a zero run would
 *             write past the end of the table
 */
652 private void createMainTreeLenTable(int offset, int tablelen,
653 short[] pretreetable, short[] prelentable) throws TikaException {
654 if (pretreetable == null)
655 throw new ChmParsingException("pretreetable is null");
656 int i = offset;
657 int z, y, x;
658 while (i < tablelen) {
659 int f = getChmSection().getDesyncBits(
660 ChmConstants.LZX_PRETREE_TABLEBITS, 0);
661 z = pretreetable[f];
// Entries at or above LZX_PRETREE_MAXSYMBOLS are walked bit by bit
// through the table until a symbol is reached.
662 if (z >= ChmConstants.LZX_PRETREE_MAXSYMBOLS) {
663 x = ChmConstants.LZX_PRETREE_TABLEBITS;
664 do {
665 x++;
666 z <<= 1;
667 z += getChmSection().checkBit(x);
668 } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_MAXSYMBOLS);
669 }
670 getChmSection().getSyncBits(prelentable[z]);
671 if (z < 17) {
// Delta coding: new length = previous length - z, wrapped by adding 17
// when negative.
672 z = getState().getMainTreeLengtsTable()[i] - z;
673 if (z < 0)
674 z = z + 17;
675 getState().mainTreeLengtsTable[i] = (short) z;
676 i++;
677 } else if (z == 17) {
// Run of zeros: 4 plus a 4-bit value; running off the table is an error.
678 y = getChmSection().getSyncBits(4);
679 y += 4;
680 for (int j = 0; j < y; j++) {
681 assertInRange(getState().getMainTreeLengtsTable(), i);
682 getState().mainTreeLengtsTable[i++] = 0;
683 }
684 } else if (z == 18) {
// Run of zeros: 20 plus a 5-bit value; running off the table is an error.
685 y = getChmSection().getSyncBits(5);
686 y += 20;
687 for (int j = 0; j < y; j++) {
688 assertInRange(getState().getMainTreeLengtsTable(), i);
689 getState().mainTreeLengtsTable[i++] = 0;
690 }
691 } else if (z == 19) {
// Run of equal lengths: 4 plus a 1-bit value copies of one decoded
// length, clipped to the table length.
692 y = getChmSection().getSyncBits(1);
693 y += 4;
694 z = pretreetable[getChmSection().getDesyncBits(
695 ChmConstants.LZX_PRETREE_TABLEBITS, 0)];
696 if (z >= ChmConstants.LZX_PRETREE_MAXSYMBOLS) {
697 x = ChmConstants.LZX_PRETREE_TABLEBITS;
698 do {
699 x++;
700 z <<= 1;
701 z += getChmSection().checkBit(x);
702 } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_MAXSYMBOLS);
703 }
704 getChmSection().getSyncBits(prelentable[z]);
705 z = getState().mainTreeLengtsTable[i] - z;
706 if (z < 0)
707 z = z + 17;
708 for (int j = 0; j < y; j++)
709 if (i < getState().getMainTreeLengtsTable().length)
710 getState().mainTreeLengtsTable[i++] = (short) z;
711 }
712 }
713 }
714
715 private void assertInRange(short[] array, int index) throws ChmParsingException {
716 if (index >= array.length)
717 throw new ChmParsingException(index + " is bigger than "
718 + array.length);
719 }
720
721 private short[] createAlignedLenTable() {
722 int tablelen = ChmConstants.LZX_BLOCKTYPE_UNCOMPRESSED;
723 int bits = ChmConstants.LZX_BLOCKTYPE_UNCOMPRESSED;
724 short[] tmp = new short[tablelen];
725 for (int i = 0; i < tablelen; i++) {
726 tmp[i] = (short) getChmSection().getSyncBits(bits);
727 }
728 return tmp;
729 }
730
731 private void createAlignedTreeTable() {
732 getState().setAlignedLenTable(createAlignedLenTable());
733 getState().setAlignedLenTable(
734 createTreeTable2(getState().getAlignedLenTable(),
735 (1 << ChmConstants.LZX_NUM_PRIMARY_LENGTHS)
736 + (ChmConstants.LZX_ALIGNED_MAXSYMBOLS << 1),
737 ChmConstants.LZX_NUM_PRIMARY_LENGTHS,
738 ChmConstants.LZX_ALIGNED_MAXSYMBOLS));
739 }
740
741 private short[] createTreeTable2(short[] lentable, int tablelen, int bits,
742 int maxsymbol) {
743 short[] tmp = new short[tablelen];
744 short sym;
745 int leaf;
746 int bit_num = 1;
747 long fill;
748 int pos = 0;
749 /* the current position in the decode table */
750 long table_mask = (1 << bits);
751 long bit_mask = (table_mask >> 1);
752 long next_symbol = bit_mask;
753
754 /* fills entries for short codes for a direct mapping */
755 while (bit_num <= bits) {
756 for (sym = 0; sym < maxsymbol; sym++) {
757 if (lentable.length > sym && lentable[sym] == bit_num) {
758 leaf = pos;// pos=0
759
760 if ((pos += bit_mask) > table_mask)
761 return null;
762
763 fill = bit_mask;
764 while (fill-- > 0)
765 tmp[leaf++] = sym;
766 }
767 }
768 bit_mask >>= 1;
769 bit_num++;
770 }
771
772 /* if there are any codes longer than nbits */
773 if (pos != table_mask) {
774 /* clears the remainder of the table */
775 for (leaf = pos; leaf < table_mask; leaf++)
776 tmp[leaf] = 0;
777
778 /* gives ourselves room for codes to grow by up to 16 more bits */
779 pos <<= 16;
780 table_mask <<= 16;
781 bit_mask = 1 << 15;
782
783 while (bit_num <= 16) {
784 for (sym = 0; sym < maxsymbol; sym++) {
785 if ((lentable.length > sym) && (lentable[sym] == bit_num)) {
786 leaf = pos >> 16;
787 for (fill = 0; fill < bit_num - bits; fill++) {
788 /*
789 * if this path hasn't been taken yet, 'allocate'
790 * two entries
791 */
792 if (tmp[leaf] == 0) {
793 if (((next_symbol << 1) + 1) < tmp.length) {
794 tmp[(int) (next_symbol << 1)] = 0;
795 tmp[(int) (next_symbol << 1) + 1] = 0;
796 tmp[leaf] = (short) next_symbol++;
797 }
798
799 }
800 /*
801 * follows the path and select either left or right
802 * for next bit
803 */
804 leaf = tmp[leaf] << 1;
805 if (((pos >> (15 - fill)) & 1) != 0)
806 leaf++;
807 }
808 tmp[leaf] = sym;
809
810 if ((pos += bit_mask) > table_mask)
811 return null;
812 /* table overflow */
813 } else {
814 // return null;
815 }
816 }
817 bit_mask >>= 1;
818 bit_num++;
819 }
820 }
821
822 /* is it full table? */
823 if (pos == table_mask)
824 return tmp;
825
826 return tmp;
827 }
828
829 public byte[] getContent() {
830 return content;
831 }
832
833 public byte[] getContent(int startOffset, int endOffset) {
834 int length = endOffset - startOffset;
835 // return (getContent() != null) ? Arrays.copyOfRange(getContent(),
836 // startOffset, (startOffset + length)) : new byte[1];
837 return (getContent() != null) ? ChmCommons.copyOfRange(getContent(),
838 startOffset, (startOffset + length)) : new byte[1];
839 }
840
841 public byte[] getContent(int start) {
842 // return (getContent() != null) ? Arrays.copyOfRange(getContent(),
843 // start, (getContent().length + start)) : new byte[1];
844 return (getContent() != null) ? ChmCommons.copyOfRange(getContent(),
845 start, (getContent().length + start)) : new byte[1];
846 }
847
848 private void setContent(int contentLength) {
849 this.content = new byte[contentLength];
850 }
851
852 private void checkLzxBlock(ChmLzxBlock chmPrevLzxBlock) throws TikaException {
853 if (chmPrevLzxBlock == null && getBlockLength() < Integer.MAX_VALUE)
854 setState(new ChmLzxState((int) getBlockLength()));
855 else
856 setState(chmPrevLzxBlock.getState());
857 }
858
859 private boolean validateConstructorParams(int blockNumber,
860 byte[] dataSegment, long blockLength) throws TikaException {
861 int goodParameter = 0;
862 if (blockNumber >= 0)
863 ++goodParameter;
864 else
865 throw new ChmParsingException("block number should be possitive");
866 if (dataSegment != null && dataSegment.length > 0)
867 ++goodParameter;
868 else
869 throw new ChmParsingException("data segment should not be null");
870 if (blockLength > 0)
871 ++goodParameter;
872 else
873 throw new ChmParsingException(
874 "block length should be more than zero");
875 return (goodParameter == 3);
876 }
877
878 public int getBlockNumber() {
879 return block_number;
880 }
881
882 private void setBlockNumber(int block_number) {
883 this.block_number = block_number;
884 }
885
886 private long getBlockLength() {
887 return block_length;
888 }
889
890 private void setBlockLength(long block_length) {
891 this.block_length = block_length;
892 }
893
894 public ChmLzxState getState() {
895 return state;
896 }
897
898 private void setState(ChmLzxState state) {
899 this.state = state;
900 }
901
902 /**
903 * Empty entry point left over from IDE generation; it performs no work.
 *
 * @param args command-line arguments (ignored)
904 */
905 public static void main(String[] args) {
906 // TODO Auto-generated method stub
907
908 }
909 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.chm.lzx;
17
18 import java.util.concurrent.CancellationException;
19
20 import org.apache.tika.exception.TikaException;
21 import org.apache.tika.parser.chm.core.ChmCommons;
22 import org.apache.tika.parser.chm.core.ChmConstants;
23 import org.apache.tika.parser.chm.core.ChmCommons.IntelState;
24 import org.apache.tika.parser.chm.core.ChmCommons.LzxState;
25 import org.apache.tika.parser.chm.exception.ChmParsingException;
26
27 public class ChmLzxState {
28 /* Class' members */
29 private int window; /* the actual decoding window */
30 private long window_size; /* window size (32Kb through 2Mb) */
31 private int window_position; /* current offset within the window */
32 private int main_tree_elements; /* number of main tree elements */
33 private LzxState hadStarted; /* have we started decoding at all yet? */
34 private int block_type; /* type of this block */
35 private int block_length; /* uncompressed length of this block */
36 private int block_remaining; /* uncompressed bytes still left to decode */
37 private int frames_read; /* the number of CFDATA blocks processed */
38 private int intel_file_size; /* magic header value used for transform */
39 private long intel_current_possition; /* current offset in transform space */
40 private IntelState intel_state; /* have we seen any translatable data yet? */
41 private long R0; /* for the LRU offset system */
42 private long R1; /* for the LRU offset system */
43 private long R2; /* for the LRU offset system */
44
45 // Trees - PRETREE, MAINTREE, LENGTH, ALIGNED
46 protected short[] mainTreeLengtsTable;
47 protected short[] mainTreeTable;
48
49 protected short[] lengthTreeTable;
50 protected short[] lengthTreeLengtsTable;
51
52 protected short[] alignedLenTable;
53 protected short[] alignedTreeTable;
54
55 protected short[] getMainTreeTable() {
56 return mainTreeTable;
57 }
58
59 protected short[] getAlignedTreeTable() {
60 return alignedTreeTable;
61 }
62
63 protected void setAlignedTreeTable(short[] alignedTreeTable) {
64 this.alignedTreeTable = alignedTreeTable;
65 }
66
67 protected short[] getLengthTreeTable() throws TikaException {
68 if (lengthTreeTable != null)
69 return this.lengthTreeTable;
70 else
71 throw new ChmParsingException("lengthTreeTable is null");
72 }
73
74 protected void setLengthTreeTable(short[] lengthTreeTable) {
75 this.lengthTreeTable = lengthTreeTable;
76 }
77
78 protected void setMainTreeTable(short[] mainTreeTable) {
79 this.mainTreeTable = mainTreeTable;
80 }
81
82 protected short[] getAlignedLenTable() {
83 return this.alignedLenTable;
84 }
85
86 protected void setAlignedLenTable(short[] alignedLenTable) {
87 this.alignedLenTable = alignedLenTable;
88 }
89
90 /**
91 * It suits for informative outlook
92 */
93 public String toString() {
94 StringBuilder sb = new StringBuilder();
95 sb.append("actual decoding window:=" + getWindow()
96 + System.getProperty("line.separator"));
97 sb.append("window size (32Kb through 2Mb):=" + getWindowSize()
98 + System.getProperty("line.separator"));
99 sb.append("current offset within the window:=" + getWindowPosition()
100 + System.getProperty("line.separator"));
101 sb.append("number of main tree elements:=" + getMainTreeElements()
102 + System.getProperty("line.separator"));
103 sb.append("have we started decoding at all yet?:=" + getHadStarted()
104 + System.getProperty("line.separator"));
105 sb.append("type of this block:=" + getBlockType()
106 + System.getProperty("line.separator"));
107 sb.append("uncompressed length of this block:=" + getBlockLength()
108 + System.getProperty("line.separator"));
109 sb.append("uncompressed bytes still left to decode:="
110 + getBlockRemaining() + System.getProperty("line.separator"));
111 sb.append("the number of CFDATA blocks processed:=" + getFramesRead()
112 + System.getProperty("line.separator"));
113 sb.append("magic header value used for transform:="
114 + getIntelFileSize() + System.getProperty("line.separator"));
115 sb.append("current offset in transform space:="
116 + getIntelCurrentPossition()
117 + System.getProperty("line.separator"));
118 sb.append("have we seen any translatable data yet?:=" + getIntelState()
119 + System.getProperty("line.separator"));
120 sb.append("R0 for the LRU offset system:=" + getR0()
121 + System.getProperty("line.separator"));
122 sb.append("R1 for the LRU offset system:=" + getR1()
123 + System.getProperty("line.separator"));
124 sb.append("R2 for the LRU offset system:=" + getR2()
125 + System.getProperty("line.separator"));
126 sb.append("main tree length:=" + getMainTreeLengtsTable().length
127 + System.getProperty("line.separator"));
128 sb.append("secondary tree length:=" + getLengthTreeLengtsTable().length
129 + System.getProperty("line.separator"));
130 return sb.toString();
131 }
132
/**
 * Creates the initial decoder state.
 * <p>
 * The window exponent is derived from {@code window} through
 * {@code ChmCommons.getWindowSize} and validated before anything is stored.
 * The recent-offset registers R0..R2 start at 1, the main tree element count
 * is fixed at 512, and both code length tables are allocated.
 *
 * @param window value the window exponent is derived from, must not be
 *            negative
 * @throws TikaException (a {@code ChmParsingException}) if the derived
 *             exponent is outside 15..21
 * @throws CancellationException if {@code window} is negative
 */
public ChmLzxState(int window) throws TikaException {
    if (window < 0) {
        throw new CancellationException(
                "window size should be more than zero");
    }

    int win = ChmCommons.getWindowSize(window);
    /* LZX supports window sizes of 2^15 (32Kb) through 2^21 (2Mb) */
    if (win < 15 || win > 21) {
        throw new ChmParsingException("window less than 15 or window greater than 21");
    }
    setWindowSize(1 << win);

    // The former position-slot computation was dropped: its result was
    // never used.
    setR0(1);
    setR1(1);
    setR2(1);
    setMainTreeElements(512);
    setHadStarted(LzxState.NOT_STARTED_DECODING);
    setFramesRead(0);
    setBlockRemaining(0);
    setBlockType(ChmConstants.LZX_BLOCKTYPE_INVALID);
    setIntelCurrentPossition(0);
    setIntelState(IntelState.NOT_STARTED);
    setWindowPosition(0);
    setMainTreeLengtsTable(new short[getMainTreeElements()]);
    setLengthTreeLengtsTable(new short[ChmConstants.LZX_NUM_SECONDARY_LENGTHS]);
}
167
/** Stores the given value in the {@code window} field. */
168 protected void setWindow(int window) {
169 this.window = window;
170 }
171
/** Returns the current value of the {@code window} field. */
172 protected int getWindow() {
173 return window;
174 }
175
/** Stores the window size; the constructor passes {@code 1 << win} here. */
176 protected void setWindowSize(long window_size) {
177 this.window_size = window_size;
178 }
179
/** Returns the stored window size. */
180 protected long getWindowSize() {
181 return window_size;
182 }
183
/** Sets the current offset within the window. */
184 protected void setWindowPosition(int window_position) {
185 this.window_position = window_position;
186 }
187
/** Returns the current offset within the window. */
188 protected int getWindowPosition() {
189 return window_position;
190 }
191
/** Sets the number of main tree elements. */
192 protected void setMainTreeElements(int main_tree_elements) {
193 this.main_tree_elements = main_tree_elements;
194 }
195
/** Returns the number of main tree elements. */
196 protected int getMainTreeElements() {
197 return main_tree_elements;
198 }
199
/** Sets the flag recording whether decoding has started at all yet. */
200 protected void setHadStarted(LzxState hadStarted) {
201 this.hadStarted = hadStarted;
202 }
203
/** Returns the flag recording whether decoding has started at all yet. */
204 protected LzxState getHadStarted() {
205 return hadStarted;
206 }
207
/** Sets the type of the current block. */
208 protected void setBlockType(int block_type) {
209 this.block_type = block_type;
210 }
211
/** Returns the type of the current block. */
212 public int getBlockType() {
213 return block_type;
214 }
215
/** Sets the uncompressed length of the current block. */
216 protected void setBlockLength(int block_length) {
217 this.block_length = block_length;
218 }
219
/** Returns the uncompressed length of the current block. */
220 protected int getBlockLength() {
221 return block_length;
222 }
223
/** Sets the number of uncompressed bytes still left to decode. */
224 protected void setBlockRemaining(int block_remaining) {
225 this.block_remaining = block_remaining;
226 }
227
/** Returns the number of uncompressed bytes still left to decode. */
228 protected int getBlockRemaining() {
229 return block_remaining;
230 }
231
/** Sets the number of CFDATA blocks processed so far. */
232 protected void setFramesRead(int frames_read) {
233 this.frames_read = frames_read;
234 }
235
/** Increments the count of CFDATA blocks processed by one. */
236 protected void increaseFramesRead() {
237 this.frames_read = getFramesRead() + 1;
238 }
239
/** Returns the number of CFDATA blocks processed so far. */
240 protected int getFramesRead() {
241 return frames_read;
242 }
243
/** Sets the magic header value used for the transform. */
244 protected void setIntelFileSize(int intel_file_size) {
245 this.intel_file_size = intel_file_size;
246 }
247
/** Returns the magic header value used for the transform. */
248 protected int getIntelFileSize() {
249 return intel_file_size;
250 }
251
/** Sets the current offset in transform space. */
252 protected void setIntelCurrentPossition(long intel_current_possition) {
253 this.intel_current_possition = intel_current_possition;
254 }
255
/** Returns the current offset in transform space. */
256 protected long getIntelCurrentPossition() {
257 return intel_current_possition;
258 }
259
/** Sets the flag recording whether any translatable data has been seen yet. */
260 protected void setIntelState(IntelState intel_state) {
261 this.intel_state = intel_state;
262 }
263
/** Returns the flag recording whether any translatable data has been seen yet. */
264 protected IntelState getIntelState() {
265 return intel_state;
266 }
267
/** Sets R0 of the LRU offset system. */
268 protected void setR0(long r0) {
269 R0 = r0;
270 }
271
/** Returns R0 of the LRU offset system. */
272 protected long getR0() {
273 return R0;
274 }
275
/** Sets R1 of the LRU offset system. */
276 protected void setR1(long r1) {
277 R1 = r1;
278 }
279
/** Returns R1 of the LRU offset system. */
280 protected long getR1() {
281 return R1;
282 }
283
/** Sets R2 of the LRU offset system. */
284 protected void setR2(long r2) {
285 R2 = r2;
286 }
287
/** Returns R2 of the LRU offset system. */
288 protected long getR2() {
289 return R2;
290 }
291
/**
 * Empty entry point; performs no work.
 *
 * @param args ignored
 */
292 public static void main(String[] args) {
293 }
294
/** Stores the given array (by reference, no copy) as the main tree lengths table. */
295 public void setMainTreeLengtsTable(short[] mainTreeLengtsTable) {
296 this.mainTreeLengtsTable = mainTreeLengtsTable;
297 }
298
/** Returns the main tree lengths table itself, not a copy. */
299 public short[] getMainTreeLengtsTable() {
300 return mainTreeLengtsTable;
301 }
302
/** Stores the given array (by reference, no copy) as the secondary (length) tree lengths table. */
303 public void setLengthTreeLengtsTable(short[] lengthTreeLengtsTable) {
304 this.lengthTreeLengtsTable = lengthTreeLengtsTable;
305 }
306
/** Returns the secondary (length) tree lengths table itself, not a copy. */
307 public short[] getLengthTreeLengtsTable() {
308 return lengthTreeLengtsTable;
309 }
310 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.chm.lzx;
17
18 import java.math.BigInteger;
19 import java.util.Arrays;
20
21 import org.apache.tika.exception.TikaException;
22 import org.apache.tika.parser.chm.core.ChmCommons;
23
24 public class ChmSection {
25 private byte[] data;
26 private int swath;// kiks
27 private int total;// remains
28 private int buffer;// val
29
30 public ChmSection(byte[] data) throws TikaException {
31 ChmCommons.assertByteArrayNotNull(data);
32 setData(data);
33 }
34
35 /* Utilities */
36 public byte[] reverseByteOrder(byte[] toBeReversed) throws TikaException {
37 ChmCommons.assertByteArrayNotNull(toBeReversed);
38 ChmCommons.reverse(toBeReversed);
39 return toBeReversed;
40 }
41
42 public int checkBit(int i) {
43 return ((getBuffer() & (1 << (getTotal() - i))) == 0) ? 0 : 1;
44 }
45
46 public int getSyncBits(int bit) {
47 return getDesyncBits(bit, bit);
48 }
49
50 public int getDesyncBits(int bit, int removeBit) {
51 while (getTotal() < 16) {
52 setBuffer((getBuffer() << 16) + unmarshalUByte()
53 + (unmarshalUByte() << 8));
54 setTotal(getTotal() + 16);
55 }
56 int tmp = (getBuffer() >>> (getTotal() - bit));
57 setTotal(getTotal() - removeBit);
58 setBuffer(getBuffer() - ((getBuffer() >>> getTotal()) << getTotal()));
59 return tmp;
60 }
61
62 public int unmarshalUByte() {
63 return (int) (getByte() & 255);
64 }
65
66 public byte getByte() {
67 if (getSwath() < getData().length) {
68 setSwath(getSwath() + 1);
69 return getData()[getSwath() - 1];
70 } else
71 return 0;
72 }
73
74 public int getLeft() {
75 return (getData().length - getSwath());
76 }
77
78 public byte[] getData() {
79 return data;
80 }
81
82 public BigInteger getBigInteger(int i) {
83 if (getData() == null)
84 return BigInteger.ZERO;
85 if (getData().length - getSwath() < i)
86 i = getData().length - getSwath();
87 byte[] tmp = new byte[i];
88 for (int j = i - 1; j >= 0; j--) {
89 tmp[i - j - 1] = getData()[getSwath() + j];
90 }
91 setSwath(getSwath() + i);
92 return new BigInteger(tmp);
93 }
94
95 public byte[] stringToAsciiBytes(String s) {
96 char[] c = s.toCharArray();
97 byte[] byteval = new byte[c.length];
98 for (int i = 0; i < c.length; i++)
99 byteval[i] = (byte) c[i];
100 return byteval;
101 }
102
103 public BigInteger unmarshalUlong() {
104 return getBigInteger(8);
105 }
106
107 public long unmarshalUInt() {
108 return getBigInteger(4).longValue();
109 }
110
111 public int unmarshalInt() {
112 return getBigInteger(4).intValue();
113 }
114
115 public byte[] unmarshalBytes(int i) {
116 if (i == 0)
117 return new byte[1];
118 byte[] t = new byte[i];
119 for (int j = 0; j < i; j++)
120 t[j] = getData()[j + getSwath()];
121 setSwath(getSwath() + i);
122 return t;
123 }
124
125 public BigInteger getEncint() {
126 byte ob;
127 BigInteger bi = BigInteger.ZERO;
128 byte[] nb = new byte[1];
129 while ((ob = this.getByte()) < 0) {
130 nb[0] = (byte) ((ob & 0x7f));
131 bi = bi.shiftLeft(7).add(new BigInteger(nb));
132 }
133 nb[0] = (byte) ((ob & 0x7f));
134 bi = bi.shiftLeft(7).add(new BigInteger(nb));
135 return bi;
136 }
137
138 public char unmarshalUtfChar() {
139 byte ob;
140 int i = 1;
141 byte[] ba;
142 ob = this.getByte();
143 if (ob < 0) {
144 i = 2;
145 while ((ob << (24 + i)) < 0)
146 i++;
147 }
148 ba = new byte[i];
149 ba[0] = ob;
150 int j = 1;
151 while (j < i) {
152 ba[j] = this.getByte();
153 j++;
154 }
155 i = ba.length;
156 if (i == 1)
157 return (char) ba[0];
158 else {
159 int n;
160 n = ba[0] & 15; // 00001111b, gets last 4 bits
161 j = 1;
162 while (j < i)
163 n = (n << 6) + (ba[j++] & 63);// 00111111b,gets last 6 bits
164 return (char) n;
165 }
166 }
167
168 private void setData(byte[] data) {
169 this.data = data;
170 }
171
172 public int getSwath() {
173 return swath;
174 }
175
176 public void setSwath(int swath) {
177 this.swath = swath;
178 }
179
180 public int getTotal() {
181 return total;
182 }
183
184 public void setTotal(int total) {
185 this.total = total;
186 }
187
188 private int getBuffer() {
189 return buffer;
190 }
191
192 private void setBuffer(int buffer) {
193 this.buffer = buffer;
194 }
195
196 /**
197 * @param args
198 * @throws TikaException
199 */
200 public static void main(String[] args) throws TikaException {
201 byte[] array = { 4, 78, -67, 90, 1, -33 };
202 ChmSection chmSection = new ChmSection(array);
203 System.out.println("before " + Arrays.toString(array));
204 System.out.println("after " + Arrays.toString(chmSection.reverseByteOrder(array)));
205 }
206 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.code;
17
18 import static com.uwyn.jhighlight.renderer.XhtmlRendererFactory.CPP;
19 import static com.uwyn.jhighlight.renderer.XhtmlRendererFactory.GROOVY;
20 import static com.uwyn.jhighlight.renderer.XhtmlRendererFactory.JAVA;
21
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.nio.charset.Charset;
25 import java.util.HashMap;
26 import java.util.Map;
27 import java.util.Set;
28 import java.util.regex.Matcher;
29 import java.util.regex.Pattern;
30
31 import org.apache.tika.config.ServiceLoader;
32 import org.apache.tika.detect.AutoDetectReader;
33 import org.apache.tika.exception.TikaException;
34 import org.apache.tika.io.CloseShieldInputStream;
35 import org.apache.tika.metadata.Metadata;
36 import org.apache.tika.metadata.TikaCoreProperties;
37 import org.apache.tika.mime.MediaType;
38 import org.apache.tika.parser.ParseContext;
39 import org.apache.tika.parser.Parser;
40 import org.xml.sax.ContentHandler;
41 import org.xml.sax.SAXException;
42
43 import com.uwyn.jhighlight.renderer.Renderer;
44 import com.uwyn.jhighlight.renderer.XhtmlRendererFactory;
45 /**
46 * Generic Source code parser for Java, Groovy, C++
47 *
48 * @author Hong-Thai.Nguyen
49 * @since 1.6
50 */
51 public class SourceCodeParser implements Parser {
52
53 private static final long serialVersionUID = -4543476498190054160L;
54
55 private static final Pattern authorPattern = Pattern.compile("(?im)@author (.*) *$");
56
57 private static final Map<MediaType, String> TYPES_TO_RENDERER = new HashMap<MediaType, String>() {
58 private static final long serialVersionUID = -741976157563751152L;
59 {
60 put(MediaType.text("x-c++src"), CPP);
61 put(MediaType.text("x-java-source"), JAVA);
62 put(MediaType.text("x-groovy"), GROOVY);
63 }
64 };
65
66 private static final ServiceLoader LOADER = new ServiceLoader(SourceCodeParser.class.getClassLoader());
67
68 @Override
69 public Set<MediaType> getSupportedTypes(ParseContext context) {
70 return TYPES_TO_RENDERER.keySet();
71 }
72
73 @Override
74 public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
75 throws IOException, SAXException, TikaException {
76
77 AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, context.get(ServiceLoader.class, LOADER));
78
79 try {
80 Charset charset = reader.getCharset();
81 String mediaType = metadata.get(Metadata.CONTENT_TYPE);
82 String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
83 if (mediaType != null && name != null) {
84 MediaType type = MediaType.parse(mediaType);
85 metadata.set(Metadata.CONTENT_TYPE, type.toString());
86 metadata.set(Metadata.CONTENT_ENCODING, charset.name());
87
88 StringBuilder out = new StringBuilder();
89 String line;
90 int nbLines = 0;
91 while ((line = reader.readLine()) != null) {
92 out.append(line);
93 String author = parserAuthor(line);
94 if (author != null) {
95 metadata.add(TikaCoreProperties.CREATOR, author);
96 }
97 nbLines ++;
98 }
99 metadata.set("LoC", String.valueOf(nbLines));
100
101 Renderer renderer = getRenderer(type.toString());
102 String codeAsHtml = renderer.highlight(name, out.toString(), charset.name(), false);
103 char[] charArray = codeAsHtml.toCharArray();
104 handler.startDocument();
105 handler.characters(charArray, 0, charArray.length);
106 handler.endDocument();
107 }
108 } finally {
109 reader.close();
110 }
111
112 }
113
114 private Renderer getRenderer(String mimeType) {
115 MediaType mt = MediaType.parse(mimeType);
116 String type = TYPES_TO_RENDERER.get(mt);
117 if (type == null) {
118 throw new RuntimeException("unparseable content type " + mimeType);
119 }
120 return XhtmlRendererFactory.getRenderer(type);
121 }
122
123
124 private String parserAuthor(String line) {
125 Matcher m = authorPattern.matcher(line);
126 if (m.find()) {
127 return m.group(1).trim();
128 }
129
130 return null;
131 }
132 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.crypto;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.util.Set;
21
22 import org.apache.tika.exception.TikaException;
23 import org.apache.tika.io.CloseShieldInputStream;
24 import org.apache.tika.metadata.Metadata;
25 import org.apache.tika.mime.MediaType;
26 import org.apache.tika.parser.AbstractParser;
27 import org.apache.tika.parser.EmptyParser;
28 import org.apache.tika.parser.ParseContext;
29 import org.apache.tika.parser.Parser;
30 import org.bouncycastle.cms.CMSException;
31 import org.bouncycastle.cms.CMSSignedDataParser;
32 import org.bouncycastle.cms.CMSTypedStream;
33 import org.xml.sax.ContentHandler;
34 import org.xml.sax.SAXException;
35
36 /**
37 * Basic parser for PKCS7 data.
38 */
39 public class Pkcs7Parser extends AbstractParser {
40
41 /** Serial version UID */
42 private static final long serialVersionUID = -7310531559075115044L;
43
44 private static final MediaType PKCS7_MIME =
45 MediaType.application("pkcs7-mime");
46
47 private static final MediaType PKCS7_SIGNATURE =
48 MediaType.application("pkcs7-signature");
49
50 public Set<MediaType> getSupportedTypes(ParseContext context) {
51 return MediaType.set(PKCS7_MIME, PKCS7_SIGNATURE);
52 }
53
54 public void parse(
55 InputStream stream, ContentHandler handler,
56 Metadata metadata, ParseContext context)
57 throws IOException, SAXException, TikaException {
58 try {
59 CMSSignedDataParser parser =
60 new CMSSignedDataParser(new CloseShieldInputStream(stream));
61 try {
62 CMSTypedStream content = parser.getSignedContent();
63 if (content == null) {
64 throw new TikaException("cannot parse detached pkcs7 signature (no signed data to parse)");
65 }
66 InputStream input = content.getContentStream();
67 try {
68 Parser delegate =
69 context.get(Parser.class, EmptyParser.INSTANCE);
70 delegate.parse(input, handler, metadata, context);
71 } finally {
72 input.close();
73 }
74 } finally {
75 parser.close();
76 }
77 } catch (CMSException e) {
78 throw new TikaException("Unable to parse pkcs7 signed data", e);
79 }
80 }
81
82 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.dwg;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.util.Collections;
21 import java.util.Set;
22
23 import org.apache.poi.util.IOUtils;
24 import org.apache.poi.util.StringUtil;
25 import org.apache.tika.exception.TikaException;
26 import org.apache.tika.io.EndianUtils;
27 import org.apache.tika.metadata.Metadata;
28 import org.apache.tika.metadata.Property;
29 import org.apache.tika.metadata.TikaCoreProperties;
30 import org.apache.tika.mime.MediaType;
31 import org.apache.tika.parser.AbstractParser;
32 import org.apache.tika.parser.ParseContext;
33 import org.apache.tika.sax.XHTMLContentHandler;
34 import org.xml.sax.ContentHandler;
35 import org.xml.sax.SAXException;
36
37 /**
38 * DWG (CAD Drawing) parser. This is a very basic parser, which just
39 * looks for bits of the headers.
40 * Note that we use Apache POI for various parts of the processing, as
41 * lots of the low level string/int/short concepts are the same.
42 */
43 public class DWGParser extends AbstractParser {
44
45 /** Serial version UID */
46 private static final long serialVersionUID = -7744232583079169119L;
47
48 private static MediaType TYPE = MediaType.image("vnd.dwg");
49
50 public Set<MediaType> getSupportedTypes(ParseContext context) {
51 return Collections.singleton(TYPE);
52 }
53
54 /** The order of the fields in the header */
55 private static final Property[] HEADER_PROPERTIES_ENTRIES = {
56 TikaCoreProperties.TITLE,
57 TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION,
58 TikaCoreProperties.CREATOR,
59 TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT,
60 TikaCoreProperties.COMMENTS,
61 TikaCoreProperties.MODIFIER,
62 null, // Unknown?
63 TikaCoreProperties.RELATION, // Hyperlink
64 };
65
66 /** For the 2000 file, they're indexed */
67 private static final Property[] HEADER_2000_PROPERTIES_ENTRIES = {
68 null,
69 TikaCoreProperties.RELATION, // 0x01
70 TikaCoreProperties.TITLE, // 0x02
71 TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, // 0x03
72 TikaCoreProperties.CREATOR, // 0x04
73 null,
74 TikaCoreProperties.COMMENTS,// 0x06
75 TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT, // 0x07
76 TikaCoreProperties.MODIFIER, // 0x08
77 };
78
79 private static final String HEADER_2000_PROPERTIES_MARKER_STR =
80 "DWGPROPS COOKIE";
81
82 private static final byte[] HEADER_2000_PROPERTIES_MARKER =
83 new byte[HEADER_2000_PROPERTIES_MARKER_STR.length()];
84
85 static {
86 StringUtil.putCompressedUnicode(
87 HEADER_2000_PROPERTIES_MARKER_STR,
88 HEADER_2000_PROPERTIES_MARKER, 0);
89 }
90
91 /**
92 * How far to skip after the last standard property, before
93 * we find any custom properties that might be there.
94 */
95 private static final int CUSTOM_PROPERTIES_SKIP = 20;
96
97 /**
98 * The value of padding bytes other than 0 in some DWG files.
99 */
100 private static final int[] CUSTOM_PROPERTIES_ALT_PADDING_VALUES = new int[] {0x2, 0, 0, 0};
101
102 public void parse(
103 InputStream stream, ContentHandler handler,
104 Metadata metadata, ParseContext context)
105 throws IOException, TikaException, SAXException {
106 // First up, which version of the format are we handling?
107 byte[] header = new byte[128];
108 IOUtils.readFully(stream, header);
109 String version = new String(header, 0, 6, "US-ASCII");
110
111 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
112 xhtml.startDocument();
113
114 if (version.equals("AC1015")) {
115 metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
116 if (skipTo2000PropertyInfoSection(stream, header)) {
117 get2000Props(stream,metadata,xhtml);
118 }
119 } else if (version.equals("AC1018")) {
120 metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
121 if (skipToPropertyInfoSection(stream, header)) {
122 get2004Props(stream,metadata,xhtml);
123 }
124 } else if (version.equals("AC1021") || version.equals("AC1024")) {
125 metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
126 if (skipToPropertyInfoSection(stream, header)) {
127 get2007and2010Props(stream,metadata,xhtml);
128 }
129 } else {
130 throw new TikaException(
131 "Unsupported AutoCAD drawing version: " + version);
132 }
133
134 xhtml.endDocument();
135 }
136
137 /**
138 * Stored as US-ASCII
139 */
140 private void get2004Props(
141 InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
142 throws IOException, TikaException, SAXException {
143 // Standard properties
144 for (int i = 0; i < HEADER_PROPERTIES_ENTRIES.length; i++) {
145 String headerValue = read2004String(stream);
146 handleHeader(i, headerValue, metadata, xhtml);
147 }
148
149 // Custom properties
150 int customCount = skipToCustomProperties(stream);
151 for (int i = 0; i < customCount; i++) {
152 String propName = read2004String(stream);
153 String propValue = read2004String(stream);
154 if(propName.length() > 0 && propValue.length() > 0) {
155 metadata.add(propName, propValue);
156 }
157 }
158 }
159
160 private String read2004String(InputStream stream) throws IOException, TikaException {
161 int stringLen = EndianUtils.readUShortLE(stream);
162
163 byte[] stringData = new byte[stringLen];
164 IOUtils.readFully(stream, stringData);
165
166 // Often but not always null terminated
167 if (stringData[stringLen-1] == 0) {
168 stringLen--;
169 }
170 String value = StringUtil.getFromCompressedUnicode(stringData, 0, stringLen);
171 return value;
172 }
173
174 /**
175 * Stored as UCS2, so 16 bit "unicode"
176 */
177 private void get2007and2010Props(
178 InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
179 throws IOException, TikaException, SAXException {
180 // Standard properties
181 for (int i = 0; i < HEADER_PROPERTIES_ENTRIES.length; i++) {
182 String headerValue = read2007and2010String(stream);
183 handleHeader(i, headerValue, metadata, xhtml);
184 }
185
186 // Custom properties
187 int customCount = skipToCustomProperties(stream);
188 for (int i = 0; i < customCount; i++) {
189 String propName = read2007and2010String(stream);
190 String propValue = read2007and2010String(stream);
191 if(propName.length() > 0 && propValue.length() > 0) {
192 metadata.add(propName, propValue);
193 }
194 }
195 }
196
197 private String read2007and2010String(InputStream stream) throws IOException, TikaException {
198 int stringLen = EndianUtils.readUShortLE(stream);
199
200 byte[] stringData = new byte[stringLen * 2];
201 IOUtils.readFully(stream, stringData);
202 String value = StringUtil.getFromUnicodeLE(stringData);
203
204 // Some strings are null terminated
205 if(value.charAt(value.length()-1) == 0) {
206 value = value.substring(0, value.length()-1);
207 }
208
209 return value;
210 }
211
212 private void get2000Props(
213 InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
214 throws IOException, TikaException, SAXException {
215 int propCount = 0;
216 while(propCount < 30) {
217 int propIdx = EndianUtils.readUShortLE(stream);
218 int length = EndianUtils.readUShortLE(stream);
219 int valueType = stream.read();
220
221 if(propIdx == 0x28) {
222 // This one seems not to follow the pattern
223 length = 0x19;
224 } else if(propIdx == 90) {
225 // We think this means the end of properties
226 break;
227 }
228
229 byte[] value = new byte[length];
230 IOUtils.readFully(stream, value);
231 if(valueType == 0x1e) {
232 // Normal string, good
233 String val = StringUtil.getFromCompressedUnicode(value, 0, length);
234
235 // Is it one we can look up by index?
236 if(propIdx < HEADER_2000_PROPERTIES_ENTRIES.length) {
237 metadata.add(HEADER_2000_PROPERTIES_ENTRIES[propIdx], val);
238 xhtml.element("p", val);
239 } else if(propIdx == 0x012c) {
240 int splitAt = val.indexOf('=');
241 if(splitAt > -1) {
242 String propName = val.substring(0, splitAt);
243 String propVal = val.substring(splitAt+1);
244 metadata.add(propName, propVal);
245 }
246 }
247 } else {
248 // No idea...
249 }
250
251 propCount++;
252 }
253 }
254
255 private void handleHeader(
256 int headerNumber, String value, Metadata metadata,
257 XHTMLContentHandler xhtml) throws SAXException {
258 if(value == null || value.length() == 0) {
259 return;
260 }
261
262 Property headerProp = HEADER_PROPERTIES_ENTRIES[headerNumber];
263 if(headerProp != null) {
264 metadata.set(headerProp, value);
265 }
266
267 xhtml.element("p", value);
268 }
269
270 /**
271 * Grab the offset, then skip there
272 */
273 private boolean skipToPropertyInfoSection(InputStream stream, byte[] header)
274 throws IOException, TikaException {
275 // The offset is stored in the header from 0x20 onwards
276 long offsetToSection = EndianUtils.getLongLE(header, 0x20);
277
278 // Sanity check the offset. Some files seem to use a different format,
279 // and the offset isn't available at 0x20. Until we can work out how
280 // to find the offset in those files, skip them if detected
281 if (offsetToSection > 0xa00000l) {
282 // Header should never be more than 10mb into the file, something is wrong
283 offsetToSection = 0;
284 }
285
286 // Work out how far to skip, and sanity check
287 long toSkip = offsetToSection - header.length;
288 if(offsetToSection == 0){
289 return false;
290 }
291 while (toSkip > 0) {
292 byte[] skip = new byte[Math.min((int) toSkip, 0x4000)];
293 IOUtils.readFully(stream, skip);
294 toSkip -= skip.length;
295 }
296 return true;
297 }
298
299 /**
300 * We think it can be anywhere...
301 */
302 private boolean skipTo2000PropertyInfoSection(InputStream stream, byte[] header)
303 throws IOException {
304 int val = 0;
305 while(val != -1) {
306 val = stream.read();
307 if(val == HEADER_2000_PROPERTIES_MARKER[0]) {
308 boolean going = true;
309 for(int i=1; i<HEADER_2000_PROPERTIES_MARKER.length && going; i++) {
310 val = stream.read();
311 if(val != HEADER_2000_PROPERTIES_MARKER[i]) going = false;
312 }
313 if(going) {
314 // Bingo, found it
315 return true;
316 }
317 }
318 }
319 return false;
320 }
321
322 private int skipToCustomProperties(InputStream stream)
323 throws IOException, TikaException {
324 // There should be 4 zero bytes or CUSTOM_PROPERTIES_ALT_PADDING_VALUES next
325 byte[] padding = new byte[4];
326 IOUtils.readFully(stream, padding);
327 if((padding[0] == 0 && padding[1] == 0 &&
328 padding[2] == 0 && padding[3] == 0) ||
329 (padding[0] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[0] &&
330 padding[1] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[1] &&
331 padding[2] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[2] &&
332 padding[3] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[3])) {
333
334 // Looks hopeful, skip on
335 padding = new byte[CUSTOM_PROPERTIES_SKIP];
336 IOUtils.readFully(stream, padding);
337
338 // We should now have the count
339 int count = EndianUtils.readUShortLE(stream);
340
341 // Sanity check it
342 if(count > 0 && count < 0x7f) {
343 // Looks plausible
344 return count;
345 } else {
346 // No properties / count is too high to trust
347 return 0;
348 }
349 } else {
350 // No padding. That probably means no custom props
351 return 0;
352 }
353 }
354
355 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.epub;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.util.Collections;
21 import java.util.Set;
22
23 import javax.xml.XMLConstants;
24 import javax.xml.parsers.ParserConfigurationException;
25 import javax.xml.parsers.SAXParser;
26 import javax.xml.parsers.SAXParserFactory;
27
28 import org.apache.tika.exception.TikaException;
29 import org.apache.tika.io.CloseShieldInputStream;
30 import org.apache.tika.metadata.Metadata;
31 import org.apache.tika.mime.MediaType;
32 import org.apache.tika.parser.AbstractParser;
33 import org.apache.tika.parser.ParseContext;
34 import org.apache.tika.sax.OfflineContentHandler;
35 import org.apache.tika.sax.XHTMLContentHandler;
36 import org.xml.sax.ContentHandler;
37 import org.xml.sax.SAXException;
38 import org.xml.sax.SAXNotRecognizedException;
39
40 /**
41 * Parser for EPUB OPS <code>*.html</code> files.
42 *
43 * For the time being, assume XHTML (TODO: DTBook)
44 */
45 public class EpubContentParser extends AbstractParser {
46
47 public Set<MediaType> getSupportedTypes(ParseContext context) {
48 return Collections.emptySet(); // not a top-level parser
49 }
50
51 public void parse(
52 InputStream stream, ContentHandler handler,
53 Metadata metadata, ParseContext context)
54 throws IOException, SAXException, TikaException {
55 final XHTMLContentHandler xhtml =
56 new XHTMLContentHandler(handler,metadata);
57
58 try {
59 SAXParserFactory factory = SAXParserFactory.newInstance();
60 factory.setValidating(false);
61 factory.setNamespaceAware(true);
62 try {
63 factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
64 } catch (SAXNotRecognizedException e) {
65 // TIKA-329: Some XML parsers do not support the secure-processing
66 // feature, even though it's required by JAXP in Java 5. Ignoring
67 // the exception is fine here, deployments without this feature
68 // are inherently vulnerable to XML denial-of-service attacks.
69 }
70 SAXParser parser = factory.newSAXParser();
71 parser.parse(
72 new CloseShieldInputStream(stream),
73 new OfflineContentHandler(xhtml));
74 } catch (ParserConfigurationException e) {
75 throw new TikaException("XML parser configuration error", e);
76 }
77 }
78
79 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.epub;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.util.Arrays;
21 import java.util.Collections;
22 import java.util.HashSet;
23 import java.util.Set;
24 import java.util.zip.ZipEntry;
25 import java.util.zip.ZipInputStream;
26
27 import org.apache.tika.exception.TikaException;
28 import org.apache.tika.io.IOUtils;
29 import org.apache.tika.metadata.Metadata;
30 import org.apache.tika.mime.MediaType;
31 import org.apache.tika.parser.AbstractParser;
32 import org.apache.tika.parser.ParseContext;
33 import org.apache.tika.parser.Parser;
34 import org.apache.tika.parser.xml.DcXMLParser;
35 import org.apache.tika.sax.BodyContentHandler;
36 import org.apache.tika.sax.EmbeddedContentHandler;
37 import org.apache.tika.sax.XHTMLContentHandler;
38 import org.xml.sax.ContentHandler;
39 import org.xml.sax.SAXException;
40 import org.xml.sax.helpers.DefaultHandler;
41
42 /**
43 * Epub parser
44 */
45 public class EpubParser extends AbstractParser {
46
47 /** Serial version UID */
48 private static final long serialVersionUID = 215176772484050550L;
49
50 private static final Set<MediaType> SUPPORTED_TYPES =
51 Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
52 MediaType.application("epub+zip"),
53 MediaType.application("x-ibooks+zip")
54 )));
55
56 private Parser meta = new DcXMLParser();
57
58 private Parser content = new EpubContentParser();
59
60 public Parser getMetaParser() {
61 return meta;
62 }
63
64 public void setMetaParser(Parser meta) {
65 this.meta = meta;
66 }
67
68 public Parser getContentParser() {
69 return content;
70 }
71
72 public void setContentParser(Parser content) {
73 this.content = content;
74 }
75
76 public Set<MediaType> getSupportedTypes(ParseContext context) {
77 return SUPPORTED_TYPES;
78 }
79
80 public void parse(
81 InputStream stream, ContentHandler handler,
82 Metadata metadata, ParseContext context)
83 throws IOException, SAXException, TikaException {
84 // Because an EPub file is often made up of multiple XHTML files,
85 // we need explicit control over the start and end of the document
86 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
87 xhtml.startDocument();
88 ContentHandler childHandler = new EmbeddedContentHandler(
89 new BodyContentHandler(xhtml));
90
91 ZipInputStream zip = new ZipInputStream(stream);
92 ZipEntry entry = zip.getNextEntry();
93 while (entry != null) {
94 if (entry.getName().equals("mimetype")) {
95 String type = IOUtils.toString(zip, "UTF-8");
96 metadata.set(Metadata.CONTENT_TYPE, type);
97 } else if (entry.getName().equals("metadata.xml")) {
98 meta.parse(zip, new DefaultHandler(), metadata, context);
99 } else if (entry.getName().endsWith(".opf")) {
100 meta.parse(zip, new DefaultHandler(), metadata, context);
101 } else if (entry.getName().endsWith(".html") ||
102 entry.getName().endsWith(".xhtml")) {
103 content.parse(zip, childHandler, metadata, context);
104 }
105 entry = zip.getNextEntry();
106 }
107
108 // Finish everything
109 xhtml.endDocument();
110 }
111
112 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.executable;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.sql.Date;
21 import java.util.Arrays;
22 import java.util.Collections;
23 import java.util.HashSet;
24 import java.util.Set;
25
26 import org.apache.poi.util.IOUtils;
27 import org.apache.poi.util.LittleEndian;
28 import org.apache.tika.exception.TikaException;
29 import org.apache.tika.io.EndianUtils;
30 import org.apache.tika.metadata.Metadata;
31 import org.apache.tika.mime.MediaType;
32 import org.apache.tika.parser.AbstractParser;
33 import org.apache.tika.parser.ParseContext;
34 import org.apache.tika.sax.XHTMLContentHandler;
35 import org.xml.sax.ContentHandler;
36 import org.xml.sax.SAXException;
37
38 /**
39 * Parser for executable files. Currently supports ELF and PE
40 */
41 public class ExecutableParser extends AbstractParser implements MachineMetadata {
42 /** Serial version UID */
43 private static final long serialVersionUID = 32128791892482l;
44
45 private static final MediaType PE_EXE = MediaType.application("x-msdownload");
46 private static final MediaType ELF_GENERAL = MediaType.application("x-elf");
47 private static final MediaType ELF_OBJECT = MediaType.application("x-object");
48 private static final MediaType ELF_EXECUTABLE = MediaType.application("x-executable");
49 private static final MediaType ELF_SHAREDLIB = MediaType.application("x-sharedlib");
50 private static final MediaType ELF_COREDUMP = MediaType.application("x-coredump");
51 private static final Set<MediaType> SUPPORTED_TYPES =
52 Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
53 PE_EXE,
54 ELF_GENERAL,
55 ELF_OBJECT, ELF_EXECUTABLE, ELF_SHAREDLIB, ELF_COREDUMP
56 )));
57 public Set<MediaType> getSupportedTypes(ParseContext context) {
58 return SUPPORTED_TYPES;
59 }
60
61 public void parse(
62 InputStream stream, ContentHandler handler,
63 Metadata metadata, ParseContext context)
64 throws IOException, SAXException, TikaException {
65 // We only do metadata, for now
66 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
67
68 // What kind is it?
69 byte[] first4 = new byte[4];
70 IOUtils.readFully(stream, first4);
71
72 if (first4[0] == (byte)'M' && first4[1] == (byte)'Z') {
73 parsePE(xhtml, metadata, stream, first4);
74 } else if (first4[0] == (byte)0x7f && first4[1] == (byte)'E' &&
75 first4[2] == (byte)'L' && first4[3] == (byte)'F') {
76 parseELF(xhtml, metadata, stream, first4);
77 }
78
79
80 // Finish everything
81 xhtml.endDocument();
82 }
83
84 /**
85 * Parses a DOS or Windows PE file
86 */
87 public void parsePE(XHTMLContentHandler xhtml, Metadata metadata,
88 InputStream stream, byte[] first4) throws TikaException, IOException {
89 metadata.add(Metadata.CONTENT_TYPE, PE_EXE.toString());
90 metadata.set(PLATFORM, PLATFORM_WINDOWS);
91
92 // Skip over the MS-DOS bit
93 byte[] msdosSection = new byte[0x3c-4];
94 IOUtils.readFully(stream, msdosSection);
95
96 // Grab the PE header offset
97 int peOffset = LittleEndian.readInt(stream);
98
99 // Sanity check - while it may go anywhere, it's normally in the first few kb
100 if (peOffset > 4096 || peOffset < 0x3f) return;
101
102 // Skip the rest of the MS-DOS stub (if PE), until we reach what should
103 // be the PE header (if this is a PE executable)
104 stream.skip(peOffset - 0x40);
105
106 // Read the PE header
107 byte[] pe = new byte[24];
108 IOUtils.readFully(stream, pe);
109
110 // Check it really is a PE header
111 if (pe[0] == (byte)'P' && pe[1] == (byte)'E' && pe[2]==0 && pe[3]==0) {
112 // Good, has a valid PE signature
113 } else {
114 // Old style MS-DOS
115 return;
116 }
117
118 // Read the header values
119 int machine = LittleEndian.getUShort(pe, 4);
120 int numSectors = LittleEndian.getUShort(pe, 6);
121 long createdAt = LittleEndian.getInt(pe, 8);
122 long symbolTableOffset = LittleEndian.getInt(pe, 12);
123 long numSymbols = LittleEndian.getInt(pe, 16);
124 int sizeOptHdrs = LittleEndian.getUShort(pe, 20);
125 int characteristcs = LittleEndian.getUShort(pe, 22);
126
127 // Turn this into helpful metadata
128 Date createdAtD = new Date(createdAt*1000l);
129 metadata.set(Metadata.CREATION_DATE, createdAtD);
130
131 switch(machine) {
132 case 0x14c:
133 metadata.set(MACHINE_TYPE, MACHINE_x86_32);
134 metadata.set(ENDIAN, Endian.LITTLE.getName());
135 metadata.set(ARCHITECTURE_BITS, "32");
136 break;
137 case 0x8664:
138 metadata.set(MACHINE_TYPE, MACHINE_x86_32);
139 metadata.set(ENDIAN, Endian.LITTLE.getName());
140 metadata.set(ARCHITECTURE_BITS, "64");
141 break;
142 case 0x200:
143 metadata.set(MACHINE_TYPE, MACHINE_IA_64);
144 metadata.set(ENDIAN, Endian.LITTLE.getName());
145 metadata.set(ARCHITECTURE_BITS, "64");
146 break;
147
148 case 0x184:
149 metadata.set(MACHINE_TYPE, MACHINE_ALPHA);
150 metadata.set(ENDIAN, Endian.LITTLE.getName());
151 metadata.set(ARCHITECTURE_BITS, "32");
152 break;
153 case 0x284:
154 metadata.set(MACHINE_TYPE, MACHINE_ALPHA);
155 metadata.set(ENDIAN, Endian.LITTLE.getName());
156 metadata.set(ARCHITECTURE_BITS, "64");
157 break;
158
159 case 0x1c0:
160 case 0x1c4:
161 metadata.set(MACHINE_TYPE, MACHINE_ARM);
162 metadata.set(ENDIAN, Endian.LITTLE.getName());
163 metadata.set(ARCHITECTURE_BITS, "32");
164 break;
165
166 case 0x268:
167 metadata.set(MACHINE_TYPE, MACHINE_M68K);
168 metadata.set(ENDIAN, Endian.BIG.getName());
169 metadata.set(ARCHITECTURE_BITS, "32");
170 break;
171
172 case 0x266:
173 case 0x366:
174 case 0x466:
175 metadata.set(MACHINE_TYPE, MACHINE_MIPS);
176 metadata.set(ENDIAN, Endian.BIG.getName());
177 metadata.set(ARCHITECTURE_BITS, "16");
178 break;
179 case 0x162:
180 case 0x166:
181 case 0x168:
182 case 0x169:
183 metadata.set(MACHINE_TYPE, MACHINE_MIPS);
184 metadata.set(ENDIAN, Endian.LITTLE.getName());
185 metadata.set(ARCHITECTURE_BITS, "16");
186 break;
187
188 case 0x1f0:
189 case 0x1f1:
190 metadata.set(MACHINE_TYPE, MACHINE_PPC);
191 metadata.set(ENDIAN, Endian.LITTLE.getName());
192 metadata.set(ARCHITECTURE_BITS, "32");
193 break;
194
195 case 0x1a2:
196 case 0x1a3:
197 metadata.set(MACHINE_TYPE, MACHINE_SH3);
198 metadata.set(ENDIAN, Endian.BIG.getName());
199 metadata.set(ARCHITECTURE_BITS, "32");
200 break;
201 case 0x1a6:
202 metadata.set(MACHINE_TYPE, MACHINE_SH4);
203 metadata.set(ENDIAN, Endian.BIG.getName());
204 metadata.set(ARCHITECTURE_BITS, "32");
205 break;
206 case 0x1a8:
207 metadata.set(MACHINE_TYPE, MACHINE_SH3);
208 metadata.set(ENDIAN, Endian.BIG.getName());
209 metadata.set(ARCHITECTURE_BITS, "32");
210 break;
211
212 case 0x9041:
213 metadata.set(MACHINE_TYPE, MACHINE_M32R);
214 metadata.set(ENDIAN, Endian.BIG.getName());
215 metadata.set(ARCHITECTURE_BITS, "32");
216 break;
217
218 case 0xebc:
219 metadata.set(MACHINE_TYPE, MACHINE_EFI);
220 break;
221
222 default:
223 metadata.set(MACHINE_TYPE, MACHINE_UNKNOWN);
224 break;
225 }
226 }
227
228 /**
229 * Parses a Unix ELF file
230 */
231 public void parseELF(XHTMLContentHandler xhtml, Metadata metadata,
232 InputStream stream, byte[] first4) throws TikaException, IOException {
233 // Byte 5 is the architecture
234 int architecture = stream.read();
235 if (architecture == 1) {
236 metadata.set(ARCHITECTURE_BITS, "32");
237 } else if (architecture == 2) {
238 metadata.set(ARCHITECTURE_BITS, "64");
239 }
240
241 // Byte 6 is the endian-ness
242 int endian = stream.read();
243 if (endian == 1) {
244 metadata.set(ENDIAN, Endian.LITTLE.getName());
245 } else if (endian == 2) {
246 metadata.set(ENDIAN, Endian.BIG.getName());
247 }
248
249 // Byte 7 is the elf version
250 int elfVer = stream.read();
251
252 // Byte 8 is the OS, if set (lots of compilers don't)
253 // Byte 9 is the OS (specific) ABI version
254 int os = stream.read();
255 int osVer = stream.read();
256 if (os > 0 || osVer > 0)
257 {
258 switch (os) {
259 case 0:
260 metadata.set(PLATFORM, PLATFORM_SYSV);
261 break;
262
263 case 1:
264 metadata.set(PLATFORM, PLATFORM_HPUX);
265 break;
266
267 case 2:
268 metadata.set(PLATFORM, PLATFORM_NETBSD);
269 break;
270
271 case 3:
272 metadata.set(PLATFORM, PLATFORM_LINUX);
273 break;
274
275 case 6:
276 metadata.set(PLATFORM, PLATFORM_SOLARIS);
277 break;
278
279 case 7:
280 metadata.set(PLATFORM, PLATFORM_AIX);
281 break;
282
283 case 8:
284 metadata.set(PLATFORM, PLATFORM_IRIX);
285 break;
286
287 case 9:
288 metadata.set(PLATFORM, PLATFORM_FREEBSD);
289 break;
290
291 case 10:
292 metadata.set(PLATFORM, PLATFORM_TRU64);
293 break;
294
295 case 12:
296 metadata.set(PLATFORM, PLATFORM_FREEBSD);
297 break;
298
299 case 64:
300 case 97:
301 metadata.set(PLATFORM, PLATFORM_ARM);
302 break;
303
304 case 255:
305 metadata.set(PLATFORM, PLATFORM_EMBEDDED);
306 break;
307 }
308 }
309
310 // Bytes 10-16 are padding and lengths
311 byte[] padLength = new byte[7];
312 IOUtils.readFully(stream, padLength);
313
314 // Bytes 16-17 are the object type (LE/BE)
315 int type;
316 if (endian == 1) {
317 type = EndianUtils.readUShortLE(stream);
318 } else {
319 type = EndianUtils.readUShortBE(stream);
320 }
321 switch(type) {
322 case 1:
323 metadata.add(Metadata.CONTENT_TYPE, ELF_OBJECT.toString());
324 break;
325
326 case 2:
327 metadata.add(Metadata.CONTENT_TYPE, ELF_EXECUTABLE.toString());
328 break;
329
330 case 3:
331 metadata.add(Metadata.CONTENT_TYPE, ELF_SHAREDLIB.toString());
332 break;
333
334 case 4:
335 metadata.add(Metadata.CONTENT_TYPE, ELF_COREDUMP.toString());
336 break;
337
338 default:
339 metadata.add(Metadata.CONTENT_TYPE, ELF_GENERAL.toString());
340 break;
341 }
342
343 // Bytes 18-19 are the machine (EM_*)
344 int machine;
345 if (endian == 1) {
346 machine = EndianUtils.readUShortLE(stream);
347 } else {
348 machine = EndianUtils.readUShortBE(stream);
349 }
350 switch(machine) {
351 case 2:
352 case 18:
353 case 43:
354 metadata.set(MACHINE_TYPE, MACHINE_SPARC);
355 break;
356 case 3:
357 metadata.set(MACHINE_TYPE, MACHINE_x86_32);
358 break;
359 case 4:
360 metadata.set(MACHINE_TYPE, MACHINE_M68K);
361 break;
362 case 5:
363 metadata.set(MACHINE_TYPE, MACHINE_M88K);
364 break;
365 case 8:
366 case 10:
367 metadata.set(MACHINE_TYPE, MACHINE_MIPS);
368 break;
369 case 7:
370 metadata.set(MACHINE_TYPE, MACHINE_S370);
371 break;
372 case 20:
373 case 21:
374 metadata.set(MACHINE_TYPE, MACHINE_PPC);
375 break;
376 case 22:
377 metadata.set(MACHINE_TYPE, MACHINE_S390);
378 break;
379 case 40:
380 metadata.set(MACHINE_TYPE, MACHINE_ARM);
381 break;
382 case 41:
383 case 0x9026:
384 metadata.set(MACHINE_TYPE, MACHINE_ALPHA);
385 break;
386 case 50:
387 metadata.set(MACHINE_TYPE, MACHINE_IA_64);
388 break;
389 case 62:
390 metadata.set(MACHINE_TYPE, MACHINE_x86_64);
391 break;
392 case 75:
393 metadata.set(MACHINE_TYPE, MACHINE_VAX);
394 break;
395 case 88:
396 metadata.set(MACHINE_TYPE, MACHINE_M32R);
397 break;
398 }
399
400
401
402 // Bytes 20-23 are the version
403 // TODO
404 }
405 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.executable;
17
18 import org.apache.tika.metadata.Property;
19
20 /**
21 * Metadata for describing machines, such as their
22 * architecture, type and endian-ness
23 */
24 public interface MachineMetadata {
25 public static final String PREFIX = "machine:";
26
27 public static Property ARCHITECTURE_BITS = Property.internalClosedChoise(PREFIX+"architectureBits",
28 new String[] { "8", "16", "32", "64" });
29
30 public static final String PLATFORM_SYSV = "System V";
31 public static final String PLATFORM_HPUX = "HP-UX";
32 public static final String PLATFORM_NETBSD = "NetBSD";
33 public static final String PLATFORM_LINUX = "Linux";
34 public static final String PLATFORM_SOLARIS = "Solaris";
35 public static final String PLATFORM_AIX = "AIX";
36 public static final String PLATFORM_IRIX = "IRIX";
37 public static final String PLATFORM_FREEBSD = "FreeBSD";
38 public static final String PLATFORM_TRU64 = "Tru64";
39 public static final String PLATFORM_ARM = "ARM"; // ARM architecture ABI
40 public static final String PLATFORM_EMBEDDED = "Embedded"; // Stand-alone (embedded) ABI
41 public static final String PLATFORM_WINDOWS = "Windows";
42
43 public static Property PLATFORM = Property.internalClosedChoise(PREFIX+"platform",
44 new String[] { PLATFORM_SYSV, PLATFORM_HPUX, PLATFORM_NETBSD, PLATFORM_LINUX,
45 PLATFORM_SOLARIS, PLATFORM_AIX, PLATFORM_IRIX, PLATFORM_FREEBSD, PLATFORM_TRU64,
46 PLATFORM_ARM, PLATFORM_EMBEDDED, PLATFORM_WINDOWS });
47
48 public static final String MACHINE_x86_32 = "x86-32";
49 public static final String MACHINE_x86_64 = "x86-64";
50 public static final String MACHINE_IA_64 = "IA-64";
51 public static final String MACHINE_SPARC = "SPARC";
52 public static final String MACHINE_M68K = "Motorola-68000";
53 public static final String MACHINE_M88K = "Motorola-88000";
54 public static final String MACHINE_MIPS = "MIPS";
55 public static final String MACHINE_PPC = "PPC";
56 public static final String MACHINE_S370 = "S370";
57 public static final String MACHINE_S390 = "S390";
58 public static final String MACHINE_ARM = "ARM";
59 public static final String MACHINE_VAX = "Vax";
60 public static final String MACHINE_ALPHA = "Alpha";
61 public static final String MACHINE_EFI = "EFI"; // EFI ByteCode
62 public static final String MACHINE_M32R = "M32R";
63 public static final String MACHINE_SH3 = "SH3";
64 public static final String MACHINE_SH4 = "SH4";
65 public static final String MACHINE_SH5 = "SH5";
66 public static final String MACHINE_UNKNOWN = "Unknown";
67
68 public static Property MACHINE_TYPE = Property.internalClosedChoise(PREFIX+"machineType",
69 new String[] { MACHINE_x86_32, MACHINE_x86_64, MACHINE_IA_64, MACHINE_SPARC,
70 MACHINE_M68K, MACHINE_M88K, MACHINE_MIPS, MACHINE_PPC,
71 MACHINE_S370, MACHINE_S390,
72 MACHINE_ARM, MACHINE_VAX, MACHINE_ALPHA, MACHINE_EFI, MACHINE_M32R,
73 MACHINE_SH3, MACHINE_SH4, MACHINE_SH5, MACHINE_UNKNOWN });
74
75 public static final class Endian {
76 private String name;
77 private boolean msb;
78 public String getName() { return name; }
79 public boolean isMSB() { return msb; }
80 public String getMSB() { if(msb) { return "MSB"; } else { return "LSB"; } }
81 private Endian(String name, boolean msb) { this.name = name; this.msb = msb; }
82
83 public static final Endian LITTLE = new Endian("Little", false);
84 public static final Endian BIG = new Endian("Big", true);
85 }
86 public static Property ENDIAN = Property.internalClosedChoise(PREFIX+"endian",
87 new String[] { Endian.LITTLE.name, Endian.BIG.name });
88 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.feed;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.util.Arrays;
21 import java.util.Collections;
22 import java.util.HashSet;
23 import java.util.Set;
24
25 import org.apache.tika.exception.TikaException;
26 import org.apache.tika.io.CloseShieldInputStream;
27 import org.apache.tika.metadata.Metadata;
28 import org.apache.tika.metadata.TikaCoreProperties;
29 import org.apache.tika.mime.MediaType;
30 import org.apache.tika.parser.AbstractParser;
31 import org.apache.tika.parser.ParseContext;
32 import org.apache.tika.sax.XHTMLContentHandler;
33 import org.xml.sax.ContentHandler;
34 import org.xml.sax.InputSource;
35 import org.xml.sax.SAXException;
36
37 import com.sun.syndication.feed.synd.SyndContent;
38 import com.sun.syndication.feed.synd.SyndEntry;
39 import com.sun.syndication.feed.synd.SyndFeed;
40 import com.sun.syndication.io.FeedException;
41 import com.sun.syndication.io.SyndFeedInput;
42
43 /**
44 * Feed parser.
45 * <p>
46 * Uses Rome for parsing the feeds. A feed description is put in a paragraph
47 * with its link and title in an anchor.
48 */
49 public class FeedParser extends AbstractParser {
50
51 /** Serial version UID */
52 private static final long serialVersionUID = -3785361933034525186L;
53
54 private static final Set<MediaType> SUPPORTED_TYPES =
55 Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
56 MediaType.application("rss+xml"),
57 MediaType.application("atom+xml"))));
58
59 public Set<MediaType> getSupportedTypes(ParseContext context) {
60 return SUPPORTED_TYPES;
61 }
62
63 public void parse(
64 InputStream stream, ContentHandler handler,
65 Metadata metadata, ParseContext context)
66 throws IOException, SAXException, TikaException {
67 // set the encoding?
68 try {
69 SyndFeed feed = new SyndFeedInput().build(
70 new InputSource(new CloseShieldInputStream(stream)));
71
72 String title = stripTags(feed.getTitleEx());
73 String description = stripTags(feed.getDescriptionEx());
74
75 metadata.set(TikaCoreProperties.TITLE, title);
76 metadata.set(TikaCoreProperties.DESCRIPTION, description);
77 // store the other fields in the metadata
78
79 XHTMLContentHandler xhtml =
80 new XHTMLContentHandler(handler, metadata);
81 xhtml.startDocument();
82
83 xhtml.element("h1", title);
84 xhtml.element("p", description);
85
86 xhtml.startElement("ul");
87 for (Object e : feed.getEntries()) {
88 SyndEntry entry = (SyndEntry) e;
89 String link = entry.getLink();
90 if (link != null) {
91 xhtml.startElement("li");
92 xhtml.startElement("a", "href", link);
93 xhtml.characters(stripTags(entry.getTitleEx()));
94 xhtml.endElement("a");
95 SyndContent content = entry.getDescription();
96 if (content != null) {
97 xhtml.newline();
98 xhtml.characters(content.getValue());
99 }
100 xhtml.endElement("li");
101 }
102 }
103 xhtml.endElement("ul");
104
105 xhtml.endDocument();
106 } catch (FeedException e) {
107 throw new TikaException("RSS parse error", e);
108 }
109
110 }
111
112 private static String stripTags(SyndContent c) {
113 if (c == null)
114 return "";
115
116 String value = c.getValue();
117
118 String[] parts = value.split("<[^>]*>");
119 StringBuffer buf = new StringBuffer();
120
121 for (String part : parts)
122 buf.append(part);
123
124 return buf.toString().trim();
125 }
126 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.font;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.util.Collections;
21 import java.util.List;
22 import java.util.Set;
23
24 import org.apache.fontbox.afm.AFMParser;
25 import org.apache.fontbox.afm.FontMetric;
26 import org.apache.tika.exception.TikaException;
27 import org.apache.tika.metadata.Metadata;
28 import org.apache.tika.metadata.Property;
29 import org.apache.tika.metadata.TikaCoreProperties;
30 import org.apache.tika.mime.MediaType;
31 import org.apache.tika.parser.AbstractParser;
32 import org.apache.tika.parser.ParseContext;
33 import org.apache.tika.sax.XHTMLContentHandler;
34 import org.xml.sax.ContentHandler;
35 import org.xml.sax.SAXException;
36
37 /**
38 * Parser for AFM Font Files
39 */
40 public class AdobeFontMetricParser extends AbstractParser {
41 /** Serial version UID */
42 private static final long serialVersionUID = -4820306522217196835L;
43
44 private static final MediaType AFM_TYPE =
45 MediaType.application( "x-font-adobe-metric" );
46
47 private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(AFM_TYPE);
48
49 public Set<MediaType> getSupportedTypes( ParseContext context ) {
50 return SUPPORTED_TYPES;
51 }
52
53 public void parse(InputStream stream, ContentHandler handler,
54 Metadata metadata, ParseContext context)
55 throws IOException, SAXException, TikaException {
56 FontMetric fontMetrics;
57 AFMParser parser = new AFMParser( stream );
58
59 // Have FontBox process the file
60 parser.parse();
61 fontMetrics = parser.getResult();
62
63 // Get the comments in the file to display in xhtml
64 List<String> comments = fontMetrics.getComments();
65
66 // Get the creation date
67 extractCreationDate( metadata, comments );
68
69 metadata.set( Metadata.CONTENT_TYPE, AFM_TYPE.toString() );
70 metadata.set( TikaCoreProperties.TITLE, fontMetrics.getFullName() );
71
72 // Add metadata associated with the font type
73 addMetadataByString( metadata, "AvgCharacterWidth", Float.toString( fontMetrics.getAverageCharacterWidth() ) );
74 addMetadataByString( metadata, "DocVersion", Float.toString( fontMetrics.getAFMVersion() ) );
75 addMetadataByString( metadata, "FontName", fontMetrics.getFontName() );
76 addMetadataByString( metadata, "FontFullName", fontMetrics.getFullName() );
77 addMetadataByString( metadata, "FontFamilyName", fontMetrics.getFamilyName() );
78 addMetadataByString( metadata, "FontVersion", fontMetrics.getFontVersion() );
79 addMetadataByString( metadata, "FontWeight", fontMetrics.getWeight() );
80 addMetadataByString( metadata, "FontNotice", fontMetrics.getNotice() );
81 addMetadataByString( metadata, "FontUnderlineThickness", Float.toString( fontMetrics.getUnderlineThickness() ) );
82
83 // Output the remaining comments as text
84 XHTMLContentHandler xhtml = new XHTMLContentHandler( handler, metadata );
85 xhtml.startDocument();
86
87 // Display the comments
88 if (comments.size() > 0) {
89 xhtml.element( "h1", "Comments" );
90 xhtml.startElement("div", "class", "comments");
91 for (String comment : comments) {
92 xhtml.element( "p", comment );
93 }
94 xhtml.endElement("div");
95 }
96
97 xhtml.endDocument();
98 }
99
100 private void addMetadataByString( Metadata metadata, String name, String value ) {
101 // Add metadata if an appropriate value is passed
102 if (value != null) {
103 metadata.add( name, value );
104 }
105 }
106
107 private void addMetadataByProperty( Metadata metadata, Property property, String value ) {
108 // Add metadata if an appropriate value is passed
109 if (value != null)
110 {
111 metadata.set( property, value );
112 }
113 }
114
115
116 private void extractCreationDate( Metadata metadata, List<String> comments ) {
117 String date = null;
118
119 for (String value : comments) {
120 // Look for the creation date
121 if( value.matches( ".*Creation\\sDate.*" ) ) {
122 date = value.substring( value.indexOf( ":" ) + 2 );
123 comments.remove( value );
124
125 break;
126 }
127 }
128
129 // If appropriate date then store as metadata
130 if( date != null ) {
131 addMetadataByProperty( metadata, Metadata.CREATION_DATE, date );
132 }
133 }
134 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.font;
17
18 import java.awt.Font;
19 import java.awt.FontFormatException;
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.util.Collections;
23 import java.util.Set;
24
25 import org.apache.fontbox.ttf.TTFParser;
26 import org.apache.fontbox.ttf.TrueTypeFont;
27 import org.apache.tika.exception.TikaException;
28 import org.apache.tika.io.TikaInputStream;
29 import org.apache.tika.metadata.Metadata;
30 import org.apache.tika.metadata.TikaCoreProperties;
31 import org.apache.tika.mime.MediaType;
32 import org.apache.tika.parser.AbstractParser;
33 import org.apache.tika.parser.ParseContext;
34 import org.apache.tika.sax.XHTMLContentHandler;
35 import org.xml.sax.ContentHandler;
36 import org.xml.sax.SAXException;
37
38 /**
39 * Parser for TrueType font files (TTF).
40 */
41 public class TrueTypeParser extends AbstractParser {
42
43 /** Serial version UID */
44 private static final long serialVersionUID = 44788554612243032L;
45
46 private static final MediaType TYPE =
47 MediaType.application("x-font-ttf");
48
49 private static final Set<MediaType> SUPPORTED_TYPES =
50 Collections.singleton(TYPE);
51
52 public Set<MediaType> getSupportedTypes(ParseContext context) {
53 return SUPPORTED_TYPES;
54 }
55
56 public void parse(
57 InputStream stream, ContentHandler handler,
58 Metadata metadata, ParseContext context)
59 throws IOException, SAXException, TikaException {
60 TikaInputStream tis = TikaInputStream.cast(stream);
61
62 // Until PDFBOX-1749 is fixed, if we can, use AWT to verify
63 // that the file is valid (otherwise FontBox could hang)
64 // See TIKA-1182 for details
65 if (tis != null) {
66 try {
67 if (tis.hasFile()) {
68 Font.createFont(Font.TRUETYPE_FONT, tis.getFile());
69 } else {
70 tis.mark(0);
71 Font.createFont(Font.TRUETYPE_FONT, stream);
72 tis.reset();
73 }
74 } catch (FontFormatException ex) {
75 throw new TikaException("Bad TrueType font.");
76 }
77 }
78
79 // Ask FontBox to parse the file for us
80 TrueTypeFont font;
81 TTFParser parser = new TTFParser();
82 if (tis != null && tis.hasFile()) {
83 font = parser.parseTTF(tis.getFile());
84 } else {
85 font = parser.parseTTF(stream);
86 }
87
88 // Report the details of the font
89 metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
90 metadata.set(TikaCoreProperties.CREATED, font.getHeader().getCreated().getTime());
91 metadata.set(
92 TikaCoreProperties.MODIFIED,
93 font.getHeader().getModified().getTime());
94
95 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
96 xhtml.startDocument();
97 xhtml.endDocument();
98 }
99
100 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.parser.hdf;
18
19 //JDK imports
20 import java.io.ByteArrayOutputStream;
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.util.Collections;
24 import java.util.Set;
25
26 import org.apache.tika.exception.TikaException;
27 import org.apache.tika.io.IOUtils;
28 import org.apache.tika.metadata.Metadata;
29 import org.apache.tika.mime.MediaType;
30 import org.apache.tika.parser.AbstractParser;
31 import org.apache.tika.parser.ParseContext;
32 import org.apache.tika.parser.netcdf.NetCDFParser;
33 import org.apache.tika.sax.XHTMLContentHandler;
34 import org.xml.sax.ContentHandler;
35 import org.xml.sax.SAXException;
36
37 import ucar.nc2.Attribute;
38 import ucar.nc2.Group;
39 import ucar.nc2.NetcdfFile;
40
41 /**
42 *
43 * Since the {@link NetCDFParser} depends on the <a
44 * href="http://www.unidata.ucar.edu/software/netcdf-java" >NetCDF-Java</a> API,
45 * we are able to use it to parse HDF files as well. See <a href=
46 * "http://www.unidata.ucar.edu/software/netcdf-java/formats/FileTypes.html"
47 * >this link</a> for more information.
48 */
49 public class HDFParser extends AbstractParser {
50
51 /** Serial version UID */
52 private static final long serialVersionUID = 1091208208003437549L;
53
54 private static final Set<MediaType> SUPPORTED_TYPES =
55 Collections.singleton(MediaType.application("x-hdf"));
56
57 /*
58 * (non-Javadoc)
59 *
60 * @see
61 * org.apache.tika.parser.netcdf.NetCDFParser#getSupportedTypes(org.apache
62 * .tika.parser.ParseContext)
63 */
64 public Set<MediaType> getSupportedTypes(ParseContext context) {
65 return SUPPORTED_TYPES;
66 }
67
68 /*
69 * (non-Javadoc)
70 *
71 * @see
72 * org.apache.tika.parser.netcdf.NetCDFParser#parse(java.io.InputStream,
73 * org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata,
74 * org.apache.tika.parser.ParseContext)
75 */
76 public void parse(InputStream stream, ContentHandler handler,
77 Metadata metadata, ParseContext context) throws IOException,
78 SAXException, TikaException {
79 ByteArrayOutputStream os = new ByteArrayOutputStream();
80 IOUtils.copy(stream, os);
81
82 String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
83 if (name == null) {
84 name = "";
85 }
86 try {
87 NetcdfFile ncFile = NetcdfFile.openInMemory(name, os.toByteArray());
88 unravelStringMet(ncFile, null, metadata);
89 } catch (IOException e) {
90 throw new TikaException("HDF parse error", e);
91 }
92
93 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
94 xhtml.startDocument();
95 xhtml.endDocument();
96 }
97
98 protected void unravelStringMet(NetcdfFile ncFile, Group group, Metadata met) {
99 if (group == null) {
100 group = ncFile.getRootGroup();
101 }
102
103 // unravel its string attrs
104 for (Attribute attribute : group.getAttributes()) {
105 if (attribute.isString()) {
106 met.add(attribute.getName(), attribute.getStringValue());
107 } else {
108 // try and cast its value to a string
109 met.add(attribute.getName(), String.valueOf(attribute
110 .getNumericValue()));
111 }
112 }
113
114 for (Group g : group.getGroups()) {
115 unravelStringMet(ncFile, g, met);
116 }
117 }
118
119 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.html;
17
18 import java.io.Writer;
19 import java.util.ArrayList;
20 import java.util.BitSet;
21 import java.util.List;
22
23 import org.apache.tika.metadata.Metadata;
24 import org.apache.tika.sax.WriteOutContentHandler;
25 import org.apache.tika.sax.XHTMLContentHandler;
26 import org.xml.sax.Attributes;
27 import org.xml.sax.ContentHandler;
28 import org.xml.sax.SAXException;
29 import org.xml.sax.helpers.AttributesImpl;
30
31 import de.l3s.boilerpipe.BoilerpipeExtractor;
32 import de.l3s.boilerpipe.BoilerpipeProcessingException;
33 import de.l3s.boilerpipe.document.TextBlock;
34 import de.l3s.boilerpipe.document.TextDocument;
35 import de.l3s.boilerpipe.extractors.ArticleExtractor;
36 import de.l3s.boilerpipe.extractors.DefaultExtractor;
37 import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler;
38
39 /**
40 * Uses the <a href="http://code.google.com/p/boilerpipe/">boilerpipe</a>
41 * library to automatically extract the main content from a web page.
42 *
43 * Use this as a {@link ContentHandler} object passed to
44 * {@link HtmlParser#parse(java.io.InputStream, ContentHandler, Metadata, org.apache.tika.parser.ParseContext)}
45 */
46 public class BoilerpipeContentHandler extends BoilerpipeHTMLContentHandler {
47
48 private static class RecordedElement {
49 public enum ElementType {
50 START,
51 END,
52 CONTINUE
53 }
54
55 private String uri;
56 private String localName;
57 private String qName;
58 private Attributes attrs;
59 private List<char[]> characters;
60 private ElementType elementType;
61
62 public RecordedElement(String uri, String localName, String qName, Attributes attrs) {
63 this(uri, localName, qName, attrs, ElementType.START);
64 }
65
66 public RecordedElement(String uri, String localName, String qName) {
67 this(uri, localName, qName, null, ElementType.END);
68 }
69
70 public RecordedElement() {
71 this(null, null, null, null, ElementType.CONTINUE);
72 }
73
74 protected RecordedElement(String uri, String localName, String qName, Attributes attrs, RecordedElement.ElementType elementType) {
75 this.uri = uri;
76 this.localName = localName;
77 this.qName = qName;
78 this.attrs = attrs;
79 this.elementType = elementType;
80 this.characters = new ArrayList<char[]>();
81 }
82
83 @Override
84 public String toString() {
85 return String.format("<%s> of type %s", localName, elementType);
86 };
87
88 public String getUri() {
89 return uri;
90 }
91
92 public String getLocalName() {
93 return localName;
94 }
95
96 public String getQName() {
97 return qName;
98 }
99
100 public Attributes getAttrs() {
101 return attrs;
102 }
103
104 public List<char[]> getCharacters() {
105 return characters;
106 }
107
108 public RecordedElement.ElementType getElementType() {
109 return elementType;
110 }
111 }
112
113 /**
114 * The newline character that gets inserted after block elements.
115 */
116 private static final char[] NL = new char[] { '\n' };
117
118 private ContentHandler delegate;
119 private BoilerpipeExtractor extractor;
120
121 private boolean includeMarkup;
122 private boolean inHeader;
123 private boolean inFooter;
124 private int headerCharOffset;
125 private List<RecordedElement> elements;
126 private TextDocument td;
127
128 /**
129 * Creates a new boilerpipe-based content extractor, using the
130 * {@link DefaultExtractor} extraction rules and "delegate" as the content handler.
131 *
132 * @param delegate
133 * The {@link ContentHandler} object
134 */
135 public BoilerpipeContentHandler(ContentHandler delegate) {
136 this(delegate, DefaultExtractor.INSTANCE);
137 }
138
139 /**
140 * Creates a content handler that writes XHTML body character events to
141 * the given writer.
142 *
143 * @param writer writer
144 */
145 public BoilerpipeContentHandler(Writer writer) {
146 this(new WriteOutContentHandler(writer));
147 }
148
149 /**
150 * Creates a new boilerpipe-based content extractor, using the given
151 * extraction rules. The extracted main content will be passed to the
152 * <delegate> content handler.
153 *
154 * @param delegate
155 * The {@link ContentHandler} object
156 * @param extractor
157 * Extraction rules to use, e.g. {@link ArticleExtractor}
158 */
159 public BoilerpipeContentHandler(ContentHandler delegate, BoilerpipeExtractor extractor) {
160 this.td = null;
161 this.delegate = delegate;
162 this.extractor = extractor;
163 }
164
165 public void setIncludeMarkup(boolean includeMarkup) {
166 this.includeMarkup = includeMarkup;
167 }
168
169 public boolean isIncludeMarkup() {
170 return includeMarkup;
171 }
172
173 /**
174 * Retrieves the built TextDocument
175 *
176 * @return TextDocument
177 */
178 public TextDocument getTextDocument() {
179 return td;
180 }
181
182 @Override
183 public void startDocument() throws SAXException {
184 super.startDocument();
185
186 delegate.startDocument();
187
188 inHeader = true;
189 inFooter = false;
190 headerCharOffset = 0;
191
192 if (includeMarkup) {
193 elements = new ArrayList<RecordedElement>();
194 }
195 };
196
197 @Override
198 public void startPrefixMapping(String prefix, String uri) throws SAXException {
199 super.startPrefixMapping(prefix, uri);
200 delegate.startPrefixMapping(prefix, uri);
201 };
202
203 @Override
204 public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
205 super.startElement(uri, localName, qName, atts);
206
207 if (inHeader) {
208 delegate.startElement(uri, localName, qName, atts);
209 } else if (inFooter) {
210 // Do nothing
211 } else if (includeMarkup) {
212 elements.add(new RecordedElement(uri, localName, qName, atts));
213 } else {
214 // This happens for the <body> element, if we're not doing markup.
215 delegate.startElement(uri, localName, qName, atts);
216 }
217 };
218
219 @Override
220 public void characters(char[] chars, int offset, int length) throws SAXException {
221 super.characters(chars, offset, length);
222
223 if (inHeader) {
224 delegate.characters(chars, offset, length);
225 headerCharOffset++;
226 } else if (inFooter) {
227 // Do nothing
228 } else if (includeMarkup) {
229 RecordedElement element = elements.get(elements.size() - 1);
230
231 char[] characters = new char[length];
232 System.arraycopy(chars, offset, characters, 0, length);
233 element.getCharacters().add(characters);
234 }
235 };
236
237 @Override
238 public void endElement(String uri, String localName, String qName) throws SAXException {
239 super.endElement(uri, localName, qName);
240
241 if (inHeader) {
242 delegate.endElement(uri, localName, qName);
243 inHeader = !localName.equals("head");
244 } else if (inFooter) {
245 // Do nothing
246 } else if (localName.equals("body")) {
247 inFooter = true;
248 } else if (includeMarkup) {
249 // Add the end element, and the continuation from the previous element
250 elements.add(new RecordedElement(uri, localName, qName));
251 elements.add(new RecordedElement());
252 }
253 };
254
255 @Override
256 public void endDocument() throws SAXException {
257 super.endDocument();
258
259 td = toTextDocument();
260 try {
261 extractor.process(td);
262 } catch (BoilerpipeProcessingException e) {
263 throw new SAXException(e);
264 }
265
266 Attributes emptyAttrs = new AttributesImpl();
267
268 // At this point we have all the information we need to either emit N paragraphs
269 // of plain text (if not including markup), or we have to replay our recorded elements
270 // and only emit character runs that passed the boilerpipe filters.
271 if (includeMarkup) {
272 BitSet validCharacterRuns = new BitSet();
273 for (TextBlock block : td.getTextBlocks()) {
274 if (block.isContent()) {
275 BitSet bs = block.getContainedTextElements();
276 if (bs != null) {
277 validCharacterRuns.or(bs);
278 }
279 }
280 }
281
282 // Now have bits set for all valid character runs. Replay our recorded elements,
283 // but only emit character runs flagged as valid.
284 int curCharsIndex = headerCharOffset;
285
286 for (RecordedElement element : elements) {
287 switch (element.getElementType()) {
288 case START:
289 delegate.startElement(element.getUri(), element.getLocalName(), element.getQName(), element.getAttrs());
290 // Fall through
291
292 case CONTINUE:
293 // Now emit characters that are valid. Note that boilerpipe pre-increments the character index, so
294 // we have to follow suit.
295 for (char[] chars : element.getCharacters()) {
296 curCharsIndex++;
297
298 if (validCharacterRuns.get(curCharsIndex)) {
299 delegate.characters(chars, 0, chars.length);
300
301 // https://issues.apache.org/jira/browse/TIKA-961
302 if (!Character.isWhitespace(chars[chars.length - 1])) {
303 // Only add whitespace for certain elements
304 if (XHTMLContentHandler.ENDLINE.contains(element.getLocalName())) {
305 delegate.ignorableWhitespace(NL, 0, NL.length);
306 }
307 }
308 }
309 }
310 break;
311
312 case END:
313 delegate.endElement(element.getUri(), element.getLocalName(), element.getQName());
314 break;
315
316 default:
317 throw new RuntimeException("Unhandled element type: " + element.getElementType());
318 }
319
320
321 }
322 } else {
323 for (TextBlock block : td.getTextBlocks()) {
324 if (block.isContent()) {
325 delegate.startElement(XHTMLContentHandler.XHTML, "p", "p", emptyAttrs);
326 char[] chars = block.getText().toCharArray();
327 delegate.characters(chars, 0, chars.length);
328 delegate.endElement(XHTMLContentHandler.XHTML, "p", "p");
329 delegate.ignorableWhitespace(NL, 0, NL.length);
330 }
331 }
332 }
333
334 delegate.endElement(XHTMLContentHandler.XHTML, "body", "body");
335 delegate.endElement(XHTMLContentHandler.XHTML, "html", "html");
336
337 // We defer ending any prefix mapping until here, which is why we don't pass this
338 // through to the delegate in an overridden method.
339 delegate.endPrefixMapping("");
340
341 delegate.endDocument();
342 }
343 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.html;
17
18 import java.util.HashMap;
19 import java.util.HashSet;
20 import java.util.Map;
21 import java.util.Set;
22
23 /**
24 * The default HTML mapping rules in Tika.
25 *
26 * @since Apache Tika 0.6
27 */
28 @SuppressWarnings("serial")
29 public class DefaultHtmlMapper implements HtmlMapper {
30
31 // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
32 private static final Map<String, String> SAFE_ELEMENTS = new HashMap<String, String>() {{
33 put("H1", "h1");
34 put("H2", "h2");
35 put("H3", "h3");
36 put("H4", "h4");
37 put("H5", "h5");
38 put("H6", "h6");
39
40 put("P", "p");
41 put("PRE", "pre");
42 put("BLOCKQUOTE", "blockquote");
43 put("Q", "q");
44
45 put("UL", "ul");
46 put("OL", "ol");
47 put("MENU", "ul");
48 put("LI", "li");
49 put("DL", "dl");
50 put("DT", "dt");
51 put("DD", "dd");
52
53 put("TABLE", "table");
54 put("THEAD", "thead");
55 put("TBODY", "tbody");
56 put("TR", "tr");
57 put("TH", "th");
58 put("TD", "td");
59
60 put("ADDRESS", "address");
61
62 // TIKA-460 - add anchors
63 put("A", "a");
64
65 // TIKA-463 - add additional elements that contain URLs (and their sub-elements)
66 put("MAP", "map");
67 put("AREA", "area");
68 put("IMG", "img");
69 put("FRAMESET", "frameset");
70 put("FRAME", "frame");
71 put("IFRAME", "iframe");
72 put("OBJECT", "object");
73 put("PARAM", "param");
74 put("INS", "ins");
75 put("DEL", "del");
76 }};
77
78 private static final Set<String> DISCARDABLE_ELEMENTS = new HashSet<String>() {{
79 add("STYLE");
80 add("SCRIPT");
81 }};
82
83 // For information on tags & attributes, see:
84 // http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#a_dtd_XHTML-1.0-Strict
85 // http://www.w3schools.com/TAGS/
86 private static final Map<String, Set<String>> SAFE_ATTRIBUTES = new HashMap<String, Set<String>>() {{
87 put("a", attrSet("charset", "type", "name", "href", "hreflang", "rel", "rev", "shape", "coords"));
88 put("img", attrSet("src", "alt", "longdesc", "height", "width", "usemap", "ismap"));
89 put("frame", attrSet("longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling"));
90 put("iframe", attrSet("longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width"));
91 put("link", attrSet("charset", "href", "hreflang", "type", "rel", "rev", "media"));
92 put("map", attrSet("id", "class", "style", "title", "name"));
93 put("area", attrSet("shape", "coords", "href", "nohref", "alt"));
94 put("object", attrSet("declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height",
95 "width", "usemap", "name", "tabindex", "align", "border", "hspace", "vspace"));
96 put("param", attrSet("id", "name", "value", "valuetype", "type"));
97 put("blockquote", attrSet("cite"));
98 put("ins", attrSet("cite", "datetime"));
99 put("del", attrSet("cite", "datetime"));
100 put("q", attrSet("cite"));
101
102 // TODO - fill out this set. Include core, i18n, etc sets where appropriate.
103 }};
104
105 private static Set<String> attrSet(String... attrs) {
106 Set<String> result = new HashSet<String>();
107 for (String attr : attrs) {
108 result.add(attr);
109 }
110 return result;
111 }
112
113 /**
114 * @since Apache Tika 0.8
115 */
116 public static final HtmlMapper INSTANCE = new DefaultHtmlMapper();
117
118 public String mapSafeElement(String name) {
119 return SAFE_ELEMENTS.get(name);
120 }
121
122 /** Normalizes an attribute name. Assumes that the element name
123 * is valid and normalized
124 */
125 public String mapSafeAttribute(String elementName, String attributeName) {
126 Set<String> safeAttrs = SAFE_ATTRIBUTES.get(elementName);
127 if ((safeAttrs != null) && safeAttrs.contains(attributeName)) {
128 return attributeName;
129 } else {
130 return null;
131 }
132 }
133
134 public boolean isDiscardElement(String name) {
135 return DISCARDABLE_ELEMENTS.contains(name);
136 }
137
138 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.html;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.nio.ByteBuffer;
21 import java.nio.charset.Charset;
22 import java.util.regex.Matcher;
23 import java.util.regex.Pattern;
24
25 import org.apache.tika.detect.EncodingDetector;
26 import org.apache.tika.metadata.Metadata;
27 import org.apache.tika.mime.MediaType;
28 import org.apache.tika.utils.CharsetUtils;
29
30 /**
31 * Character encoding detector for determining the character encoding of a
32 * HTML document based on the potential charset parameter found in a
33 * Content-Type http-equiv meta tag somewhere near the beginning. Especially
34 * useful for determining the type among multiple closely related encodings
35 * (ISO-8859-*) for which other types of encoding detection are unreliable.
36 *
37 * @since Apache Tika 1.2
38 */
39 public class HtmlEncodingDetector implements EncodingDetector {
40
41 // TIKA-357 - use bigger buffer for meta tag sniffing (was 4K)
42 private static final int META_TAG_BUFFER_SIZE = 8192;
43
44
45 private static final Pattern HTTP_META_PATTERN = Pattern.compile(
46 "(?is)<\\s*meta\\s+([^<>]+)"
47 );
48
49 //this should match both the older:
50 //<meta http-equiv="content-type" content="text/html; charset=xyz"/>
51 //and
52 //html5 <meta charset="xyz">
53 //See http://webdesign.about.com/od/metatags/qt/meta-charset.htm
54 //for the noisiness that one might encounter in charset attrs.
55 //Chose to go with strict ([-_:\\.a-z0-9]+) to match encodings
56 //following http://docs.oracle.com/javase/7/docs/api/java/nio/charset/Charset.html
57 //For a more general "not" matcher, try:
58 //("(?is)charset\\s*=\\s*['\\\"]?\\s*([^<>\\s'\\\";]+)")
59 private static final Pattern FLEXIBLE_CHARSET_ATTR_PATTERN = Pattern.compile(
60 ("(?is)charset\\s*=\\s*(?:['\\\"]\\s*)?([-_:\\.a-z0-9]+)")
61 );
62
63 private static final Charset ASCII = Charset.forName("US-ASCII");
64
65 public Charset detect(InputStream input, Metadata metadata)
66 throws IOException {
67 if (input == null) {
68 return null;
69 }
70
71 // Read enough of the text stream to capture possible meta tags
72 input.mark(META_TAG_BUFFER_SIZE);
73 byte[] buffer = new byte[META_TAG_BUFFER_SIZE];
74 int n = 0;
75 int m = input.read(buffer);
76 while (m != -1 && n < buffer.length) {
77 n += m;
78 m = input.read(buffer, n, buffer.length - n);
79 }
80 input.reset();
81
82 // Interpret the head as ASCII and try to spot a meta tag with
83 // a possible character encoding hint
84
85 String head = ASCII.decode(ByteBuffer.wrap(buffer, 0, n)).toString();
86
87 Matcher equiv = HTTP_META_PATTERN.matcher(head);
88 Matcher charsetMatcher = FLEXIBLE_CHARSET_ATTR_PATTERN.matcher("");
89 //iterate through meta tags
90 while (equiv.find()) {
91 String attrs = equiv.group(1);
92 charsetMatcher.reset(attrs);
93 //iterate through charset= and return the first match
94 //that is valid
95 while (charsetMatcher.find()){
96 String candCharset = charsetMatcher.group(1);
97 if (CharsetUtils.isSupported(candCharset)){
98 try{
99 return CharsetUtils.forName(candCharset);
100 } catch (Exception e){
101 //ignore
102 }
103 }
104 }
105 }
106 return null;
107 }
108
109 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.html;
17
18 import java.net.MalformedURLException;
19 import java.net.URL;
20 import java.util.Arrays;
21 import java.util.HashSet;
22 import java.util.Locale;
23 import java.util.Set;
24 import java.util.regex.Matcher;
25 import java.util.regex.Pattern;
26
27 import org.apache.tika.metadata.Metadata;
28 import org.apache.tika.metadata.TikaCoreProperties;
29 import org.apache.tika.mime.MediaType;
30 import org.apache.tika.sax.TextContentHandler;
31 import org.apache.tika.sax.XHTMLContentHandler;
32 import org.xml.sax.Attributes;
33 import org.xml.sax.ContentHandler;
34 import org.xml.sax.SAXException;
35 import org.xml.sax.helpers.AttributesImpl;
36
37 class HtmlHandler extends TextContentHandler {
38
39 // List of attributes that need to be resolved.
40 private static final Set<String> URI_ATTRIBUTES =
41 new HashSet<String>(Arrays.asList("src", "href", "longdesc", "cite"));
42
43 private final HtmlMapper mapper;
44
45 private final XHTMLContentHandler xhtml;
46
47 private final Metadata metadata;
48
49 private int bodyLevel = 0;
50
51 private int discardLevel = 0;
52
53 private int titleLevel = 0;
54
55 private final StringBuilder title = new StringBuilder();
56
57 private HtmlHandler(
58 HtmlMapper mapper, XHTMLContentHandler xhtml, Metadata metadata) {
59 super(xhtml);
60 this.mapper = mapper;
61 this.xhtml = xhtml;
62 this.metadata = metadata;
63
64 // Try to determine the default base URL, if one has not been given
65 if (metadata.get(Metadata.CONTENT_LOCATION) == null) {
66 String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
67 if (name != null) {
68 name = name.trim();
69 try {
70 new URL(name); // test URL format
71 metadata.set(Metadata.CONTENT_LOCATION, name);
72 } catch (MalformedURLException e) {
73 // The resource name is not a valid URL, ignore it
74 }
75 }
76 }
77 }
78
79 public HtmlHandler(
80 HtmlMapper mapper, ContentHandler handler, Metadata metadata) {
81 this(mapper, new XHTMLContentHandler(handler, metadata), metadata);
82 }
83
84 @Override
85 public void startElement(
86 String uri, String local, String name, Attributes atts)
87 throws SAXException {
88 if ("TITLE".equals(name) || titleLevel > 0) {
89 titleLevel++;
90 }
91 if ("BODY".equals(name) || ("FRAMESET".equals(name)) || bodyLevel > 0) {
92 bodyLevel++;
93 }
94 if (mapper.isDiscardElement(name) || discardLevel > 0) {
95 discardLevel++;
96 }
97
98 if (bodyLevel == 0 && discardLevel == 0) {
99 if ("META".equals(name) && atts.getValue("content") != null) {
100 // TIKA-478: For cases where we have either a name or
101 // "http-equiv", assume that XHTMLContentHandler will emit
102 // these in the <head>, thus passing them through safely.
103 if (atts.getValue("http-equiv") != null) {
104 addHtmlMetadata(
105 atts.getValue("http-equiv"),
106 atts.getValue("content"));
107 } else if (atts.getValue("name") != null) {
108 // Record the meta tag in the metadata
109 addHtmlMetadata(
110 atts.getValue("name"),
111 atts.getValue("content"));
112 } else if (atts.getValue("property") != null) {
113 // TIKA-983: Handle <meta property="og:xxx" content="yyy" /> tags
114 metadata.add(
115 atts.getValue("property"),
116 atts.getValue("content"));
117 }
118 } else if ("BASE".equals(name) && atts.getValue("href") != null) {
119 startElementWithSafeAttributes("base", atts);
120 xhtml.endElement("base");
121 metadata.set(
122 Metadata.CONTENT_LOCATION,
123 resolve(atts.getValue("href")));
124 } else if ("LINK".equals(name)) {
125 startElementWithSafeAttributes("link", atts);
126 xhtml.endElement("link");
127 }
128 }
129
130 if (bodyLevel > 0 && discardLevel == 0) {
131 String safe = mapper.mapSafeElement(name);
132 if (safe != null) {
133 startElementWithSafeAttributes(safe, atts);
134 }
135 }
136
137 title.setLength(0);
138 }
139
140 private static final Pattern ICBM =
141 Pattern.compile("\\s*(-?\\d+\\.\\d+)[,\\s]+(-?\\d+\\.\\d+)\\s*");
142
143 /**
144 * Adds a metadata setting from the HTML <head/> to the Tika metadata
145 * object. The name and value are normalized where possible.
146 */
147 private void addHtmlMetadata(String name, String value) {
148 if (name == null || value == null) {
149 // ignore
150 } else if (name.equalsIgnoreCase("ICBM")) {
151 Matcher m = ICBM.matcher(value);
152 if (m.matches()) {
153 metadata.set("ICBM", m.group(1) + ", " + m.group(2));
154 metadata.set(Metadata.LATITUDE, m.group(1));
155 metadata.set(Metadata.LONGITUDE, m.group(2));
156 } else {
157 metadata.set("ICBM", value);
158 }
159 } else if (name.equalsIgnoreCase(Metadata.CONTENT_TYPE)){
160 MediaType type = MediaType.parse(value);
161 if (type != null) {
162 metadata.set(Metadata.CONTENT_TYPE, type.toString());
163 } else {
164 metadata.set(Metadata.CONTENT_TYPE, value);
165 }
166 } else {
167 metadata.set(name, value);
168 }
169 }
170
171 private void startElementWithSafeAttributes(String name, Attributes atts) throws SAXException {
172 if (atts.getLength() == 0) {
173 xhtml.startElement(name);
174 return;
175 }
176
177 boolean isObject = name.equals("object");
178 String codebase = null;
179 if (isObject) {
180 codebase = atts.getValue("", "codebase");
181 if (codebase != null) {
182 codebase = resolve(codebase);
183 } else {
184 codebase = metadata.get(Metadata.CONTENT_LOCATION);
185 }
186 }
187
188 AttributesImpl newAttributes = new AttributesImpl(atts);
189 for (int att = 0; att < newAttributes.getLength(); att++) {
190 String attrName = newAttributes.getLocalName(att);
191 String normAttrName = mapper.mapSafeAttribute(name, attrName);
192 if (normAttrName == null) {
193 newAttributes.removeAttribute(att);
194 att--;
195 } else {
196 // We have a remapped attribute name, so set it as it might have changed.
197 newAttributes.setLocalName(att, normAttrName);
198
199 // And resolve relative links. Eventually this should be pushed
200 // into the HtmlMapper code.
201 if (URI_ATTRIBUTES.contains(normAttrName)) {
202 newAttributes.setValue(att, resolve(newAttributes.getValue(att)));
203 } else if (isObject && "codebase".equals(normAttrName)) {
204 newAttributes.setValue(att, codebase);
205 } else if (isObject
206 && ("data".equals(normAttrName)
207 || "classid".equals(normAttrName))) {
208 newAttributes.setValue(
209 att,
210 resolve(codebase, newAttributes.getValue(att)));
211 }
212 }
213 }
214
215 if ("img".equals(name) && newAttributes.getValue("", "alt") == null) {
216 newAttributes.addAttribute("", "alt", "alt", "CDATA", "");
217 }
218
219 xhtml.startElement(name, newAttributes);
220 }
221
222 @Override
223 public void endElement(
224 String uri, String local, String name) throws SAXException {
225 if (bodyLevel > 0 && discardLevel == 0) {
226 String safe = mapper.mapSafeElement(name);
227 if (safe != null) {
228 xhtml.endElement(safe);
229 } else if (XHTMLContentHandler.ENDLINE.contains(
230 name.toLowerCase(Locale.ENGLISH))) {
231 // TIKA-343: Replace closing block tags (and <br/>) with a
232 // newline unless the HtmlMapper above has already mapped
233 // them to something else
234 xhtml.newline();
235 }
236 }
237
238 if (titleLevel > 0) {
239 titleLevel--;
240 if (titleLevel == 0) {
241 metadata.set(TikaCoreProperties.TITLE, title.toString().trim());
242 }
243 }
244 if (bodyLevel > 0) {
245 bodyLevel--;
246 }
247 if (discardLevel > 0) {
248 discardLevel--;
249 }
250 }
251
252 @Override
253 public void characters(char[] ch, int start, int length)
254 throws SAXException {
255 if (titleLevel > 0 && bodyLevel == 0) {
256 title.append(ch, start, length);
257 }
258 if (bodyLevel > 0 && discardLevel == 0) {
259 super.characters(ch, start, length);
260 }
261 }
262
263 @Override
264 public void ignorableWhitespace(char[] ch, int start, int length)
265 throws SAXException {
266 if (bodyLevel > 0 && discardLevel == 0) {
267 super.ignorableWhitespace(ch, start, length);
268 }
269 }
270
271 private String resolve(String url) {
272 return resolve(metadata.get(Metadata.CONTENT_LOCATION), url);
273 }
274
275 private String resolve(String base, String url) {
276 url = url.trim();
277
278 // Return the URL as-is if no base URL is available or if the URL
279 // matches a common non-hierarchical or pseudo URI prefix
280 String lower = url.toLowerCase(Locale.ENGLISH);
281 if (base == null
282 || lower.startsWith("urn:")
283 || lower.startsWith("mailto:")
284 || lower.startsWith("tel:")
285 || lower.startsWith("data:")
286 || lower.startsWith("javascript:")
287 || lower.startsWith("about:")) {
288 return url;
289 }
290
291 try {
292 URL baseURL = new URL(base.trim());
293
294 // We need to handle one special case, where the relativeUrl is
295 // just a query string (like "?pid=1"), and the baseUrl doesn't
296 // end with a '/'. In that case, the URL class removes the last
297 // portion of the path, which we don't want.
298 String path = baseURL.getPath();
299 if (url.startsWith("?") && path.length() > 0 && !path.endsWith("/")) {
300 return new URL(
301 baseURL.getProtocol(),
302 baseURL.getHost(), baseURL.getPort(),
303 baseURL.getPath() + url).toExternalForm();
304 } else {
305 return new URL(baseURL, url).toExternalForm();
306 }
307 } catch (MalformedURLException e) {
308 // Unknown or broken format; just return the URL as received.
309 return url;
310 }
311 }
312
313 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.html;
17
18 /**
19 * HTML mapper used to make incoming HTML documents easier to handle by
20 * Tika clients. The {@link HtmlParser} looks up an optional HTML mapper from
21 * the parse context and uses it to map parsed HTML to "safe" XHTML. A client
22 * that wants to customize this mapping can place a custom HtmlMapper instance
23 * into the parse context.
24 *
25 * @since Apache Tika 0.6
26 */
27 public interface HtmlMapper {
28
29 /**
30 * Maps "safe" HTML element names to semantic XHTML equivalents. If the
31 * given element is unknown or deemed unsafe for inclusion in the parse
32 * output, then this method returns <code>null</code> and the element
33 * will be ignored but the content inside it is still processed. See
34 * the {@link #isDiscardElement(String)} method for a way to discard
35 * the entire contents of an element.
36 *
37 * @param name HTML element name (upper case)
38 * @return XHTML element name (lower case), or
39 * <code>null</code> if the element is unsafe
40 */
41 String mapSafeElement(String name);
42
43 /**
44 * Checks whether all content within the given HTML element should be
45 * discarded instead of including it in the parse output.
46 *
47 * @param name HTML element name (upper case)
48 * @return <code>true</code> if content inside the named element
49 * should be ignored, <code>false</code> otherwise
50 */
51 boolean isDiscardElement(String name);
52
53
54 /**
55 * Maps "safe" HTML attribute names to semantic XHTML equivalents. If the
56 * given attribute is unknown or deemed unsafe for inclusion in the parse
57 * output, then this method returns <code>null</code> and the attribute
58 * will be ignored. This method assumes that the element name
59 * is valid and normalised.
60 *
61 * @param elementName HTML element name (lower case)
62 * @param attributeName HTML attribute name (lower case)
63 * @return XHTML attribute name (lower case), or
64 * <code>null</code> if the attribute is unsafe
65 */
66 String mapSafeAttribute(String elementName, String attributeName);
67
68 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.html;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.nio.charset.Charset;
21 import java.util.Arrays;
22 import java.util.Collections;
23 import java.util.HashSet;
24 import java.util.Set;
25
26 import org.apache.tika.config.ServiceLoader;
27 import org.apache.tika.detect.AutoDetectReader;
28 import org.apache.tika.exception.TikaException;
29 import org.apache.tika.io.CloseShieldInputStream;
30 import org.apache.tika.metadata.Metadata;
31 import org.apache.tika.mime.MediaType;
32 import org.apache.tika.parser.AbstractParser;
33 import org.apache.tika.parser.ParseContext;
34 import org.ccil.cowan.tagsoup.HTMLSchema;
35 import org.ccil.cowan.tagsoup.Schema;
36 import org.xml.sax.ContentHandler;
37 import org.xml.sax.SAXException;
38
39 /**
40 * HTML parser. Uses TagSoup to turn the input document to HTML SAX events,
41 * and post-processes the events to produce XHTML and metadata expected by
42 * Tika clients.
43 */
44 public class HtmlParser extends AbstractParser {
45
46 /** Serial version UID */
47 private static final long serialVersionUID = 7895315240498733128L;
48
49 private static final Set<MediaType> SUPPORTED_TYPES =
50 Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
51 MediaType.text("html"),
52 MediaType.application("xhtml+xml"),
53 MediaType.application("vnd.wap.xhtml+xml"),
54 MediaType.application("x-asp"))));
55
56 private static final ServiceLoader LOADER =
57 new ServiceLoader(HtmlParser.class.getClassLoader());
58
59 /**
60 * HTML schema singleton used to amortise the heavy instantiation time.
61 */
62 private static final Schema HTML_SCHEMA = new HTMLSchema();
63
64 public Set<MediaType> getSupportedTypes(ParseContext context) {
65 return SUPPORTED_TYPES;
66 }
67
68 public void parse(
69 InputStream stream, ContentHandler handler,
70 Metadata metadata, ParseContext context)
71 throws IOException, SAXException, TikaException {
72 // Automatically detect the character encoding
73 AutoDetectReader reader = new AutoDetectReader(
74 new CloseShieldInputStream(stream), metadata,
75 context.get(ServiceLoader.class, LOADER));
76 try {
77 Charset charset = reader.getCharset();
78 String previous = metadata.get(Metadata.CONTENT_TYPE);
79 if (previous == null || previous.startsWith("text/html")) {
80 MediaType type = new MediaType(MediaType.TEXT_HTML, charset);
81 metadata.set(Metadata.CONTENT_TYPE, type.toString());
82 }
83 // deprecated, see TIKA-431
84 metadata.set(Metadata.CONTENT_ENCODING, charset.name());
85
86 // Get the HTML mapper from the parse context
87 HtmlMapper mapper =
88 context.get(HtmlMapper.class, new HtmlParserMapper());
89
90 // Parse the HTML document
91 org.ccil.cowan.tagsoup.Parser parser =
92 new org.ccil.cowan.tagsoup.Parser();
93
94 // Use schema from context or default
95 Schema schema = context.get(Schema.class, HTML_SCHEMA);
96
97 // TIKA-528: Reuse share schema to avoid heavy instantiation
98 parser.setProperty(
99 org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
100 // TIKA-599: Shared schema is thread-safe only if bogons are ignored
101 parser.setFeature(
102 org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);
103
104 parser.setContentHandler(new XHTMLDowngradeHandler(
105 new HtmlHandler(mapper, handler, metadata)));
106
107 parser.parse(reader.asInputSource());
108 } finally {
109 reader.close();
110 }
111 }
112
113 /**
114 * Maps "safe" HTML element names to semantic XHTML equivalents. If the
115 * given element is unknown or deemed unsafe for inclusion in the parse
116 * output, then this method returns <code>null</code> and the element
117 * will be ignored but the content inside it is still processed. See
118 * the {@link #isDiscardElement(String)} method for a way to discard
119 * the entire contents of an element.
120 * <p>
121 * Subclasses can override this method to customize the default mapping.
122 *
123 * @deprecated Use the {@link HtmlMapper} mechanism to customize
124 * the HTML mapping. This method will be removed in Tika 1.0.
125 * @since Apache Tika 0.5
126 * @param name HTML element name (upper case)
127 * @return XHTML element name (lower case), or
128 * <code>null</code> if the element is unsafe
129 */
130 protected String mapSafeElement(String name) {
131 return DefaultHtmlMapper.INSTANCE.mapSafeElement(name);
132 }
133
134 /**
135 * Checks whether all content within the given HTML element should be
136 * discarded instead of including it in the parse output. Subclasses
137 * can override this method to customize the set of discarded elements.
138 *
139 * @deprecated Use the {@link HtmlMapper} mechanism to customize
140 * the HTML mapping. This method will be removed in Tika 1.0.
141 * @since Apache Tika 0.5
142 * @param name HTML element name (upper case)
143 * @return <code>true</code> if content inside the named element
144 * should be ignored, <code>false</code> otherwise
145 */
146 protected boolean isDiscardElement(String name) {
147 return DefaultHtmlMapper.INSTANCE.isDiscardElement(name);
148 }
149
150 /**
151 * @deprecated Use the {@link HtmlMapper} mechanism to customize
152 * the HTML mapping. This method will be removed in Tika 1.0.
153 **/
154 public String mapSafeAttribute(String elementName, String attributeName) {
155 return DefaultHtmlMapper.INSTANCE.mapSafeAttribute(elementName,attributeName) ;
156 }
157
158 /**
159 * Adapter class that maintains backwards compatibility with the
160 * protected HtmlParser methods. Making HtmlParser implement HtmlMapper
161 * directly would require those methods to be public, which would break
162 * backwards compatibility with subclasses.
163 *
164 * @deprecated Use the {@link HtmlMapper} mechanism to customize
165 * the HTML mapping. This class will be removed in Tika 1.0.
166 */
167 private class HtmlParserMapper implements HtmlMapper {
168 public String mapSafeElement(String name) {
169 return HtmlParser.this.mapSafeElement(name);
170 }
171 public boolean isDiscardElement(String name) {
172 return HtmlParser.this.isDiscardElement(name);
173 }
174 public String mapSafeAttribute(String elementName, String attributeName){
175 return HtmlParser.this.mapSafeAttribute(elementName,attributeName);
176 }
177 }
178
179 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.html;
17
18 import java.util.Locale;
19
20 /**
21 * Alternative HTML mapping rules that pass the input HTML as-is without any
22 * modifications.
23 *
24 * @since Apache Tika 0.8
25 */
26 public class IdentityHtmlMapper implements HtmlMapper {
27
28 public static final HtmlMapper INSTANCE = new IdentityHtmlMapper();
29
30 public boolean isDiscardElement(String name) {
31 return false;
32 }
33
34 public String mapSafeAttribute(String elementName, String attributeName) {
35 return attributeName.toLowerCase(Locale.ENGLISH);
36 }
37
38 public String mapSafeElement(String name) {
39 return name.toLowerCase(Locale.ENGLISH);
40 }
41
42 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.html;
17
18 import java.util.Locale;
19
20 import javax.xml.XMLConstants;
21
22 import org.apache.tika.sax.ContentHandlerDecorator;
23 import org.xml.sax.Attributes;
24 import org.xml.sax.ContentHandler;
25 import org.xml.sax.SAXException;
26 import org.xml.sax.helpers.AttributesImpl;
27
28 /**
29 * Content handler decorator that downgrades XHTML elements to
30 * old-style HTML elements before passing them on to the decorated
31 * content handler. This downgrading consists of dropping all namespaces
32 * (and namespaced attributes) and uppercasing all element names.
33 * Used by the {@link HtmlParser} to make all incoming HTML look the same.
34 */
35 class XHTMLDowngradeHandler extends ContentHandlerDecorator {
36
37 public XHTMLDowngradeHandler(ContentHandler handler) {
38 super(handler);
39 }
40
41 @Override
42 public void startElement(
43 String uri, String localName, String name, Attributes atts)
44 throws SAXException {
45 String upper = localName.toUpperCase(Locale.ENGLISH);
46
47 AttributesImpl attributes = new AttributesImpl();
48 for (int i = 0; i < atts.getLength(); i++) {
49 String auri = atts.getURI(i);
50 String local = atts.getLocalName(i);
51 String qname = atts.getQName(i);
52 if (XMLConstants.NULL_NS_URI.equals(auri)
53 && !local.equals(XMLConstants.XMLNS_ATTRIBUTE)
54 && !qname.startsWith(XMLConstants.XMLNS_ATTRIBUTE + ":")) {
55 attributes.addAttribute(
56 auri, local, qname, atts.getType(i), atts.getValue(i));
57 }
58 }
59
60 super.startElement(XMLConstants.NULL_NS_URI, upper, upper, attributes);
61 }
62
63 @Override
64 public void endElement(String uri, String localName, String name)
65 throws SAXException {
66 String upper = localName.toUpperCase(Locale.ENGLISH);
67 super.endElement(XMLConstants.NULL_NS_URI, upper, upper);
68 }
69
70 @Override
71 public void startPrefixMapping(String prefix, String uri) {
72 }
73
74 @Override
75 public void endPrefixMapping(String prefix) {
76 }
77
78 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.image;
17
18 import java.io.File;
19 import java.io.IOException;
20 import java.text.DecimalFormat;
21 import java.text.DecimalFormatSymbols;
22 import java.text.SimpleDateFormat;
23 import java.util.Date;
24 import java.util.Iterator;
25 import java.util.Locale;
26 import java.util.regex.Matcher;
27 import java.util.regex.Pattern;
28
29 import org.apache.tika.exception.TikaException;
30 import org.apache.tika.metadata.IPTC;
31 import org.apache.tika.metadata.Metadata;
32 import org.apache.tika.metadata.Property;
33 import org.apache.tika.metadata.TikaCoreProperties;
34 import org.xml.sax.SAXException;
35
36 import com.drew.imaging.jpeg.JpegMetadataReader;
37 import com.drew.imaging.jpeg.JpegProcessingException;
38 import com.drew.imaging.tiff.TiffMetadataReader;
39 import com.drew.lang.GeoLocation;
40 import com.drew.lang.Rational;
41 import com.drew.metadata.Directory;
42 import com.drew.metadata.MetadataException;
43 import com.drew.metadata.Tag;
44 import com.drew.metadata.exif.ExifIFD0Directory;
45 import com.drew.metadata.exif.ExifSubIFDDirectory;
46 import com.drew.metadata.exif.ExifThumbnailDirectory;
47 import com.drew.metadata.exif.GpsDirectory;
48 import com.drew.metadata.iptc.IptcDirectory;
49 import com.drew.metadata.jpeg.JpegCommentDirectory;
50 import com.drew.metadata.jpeg.JpegDirectory;
51
52 /**
53 * Uses the <a href="http://www.drewnoakes.com/code/exif/">Metadata Extractor</a> library
54 * to read EXIF and IPTC image metadata and map to Tika fields.
55 *
56 * As of 2.4.0 the library supports jpeg and tiff.
57 */
58 public class ImageMetadataExtractor {
59
60 private final Metadata metadata;
61 private DirectoryHandler[] handlers;
62 private static final String GEO_DECIMAL_FORMAT_STRING = "#.######"; // 6 dp seems to be reasonable
63
/**
 * Creates an extractor that uses the default directory handlers, in the
 * order listed below.
 *
 * @param metadata to extract to
 */
public ImageMetadataExtractor(Metadata metadata) {
    this(metadata, new DirectoryHandler[] {
        new CopyUnknownFieldsHandler(),
        new JpegCommentHandler(),
        new ExifHandler(),
        new DimensionsHandler(),
        new GeotagHandler(),
        new IptcHandler()
    });
}
77
/**
 * Creates an extractor with an explicit list of directory handlers.
 *
 * @param metadata to extract to
 * @param handlers handlers in the order they are applied; a later handler
 *                 may override values set by an earlier one
 */
public ImageMetadataExtractor(Metadata metadata, DirectoryHandler... handlers) {
    this.handlers = handlers;
    this.metadata = metadata;
}
86
87 public void parseJpeg(File file)
88 throws IOException, SAXException, TikaException {
89 try {
90 com.drew.metadata.Metadata jpegMetadata = JpegMetadataReader.readMetadata(file);
91 handle(jpegMetadata);
92 } catch (JpegProcessingException e) {
93 throw new TikaException("Can't read JPEG metadata", e);
94 } catch (MetadataException e) {
95 throw new TikaException("Can't read JPEG metadata", e);
96 }
97 }
98
99 public void parseTiff(File file)
100 throws IOException, SAXException, TikaException {
101 try {
102 com.drew.metadata.Metadata tiffMetadata = TiffMetadataReader.readMetadata(file);
103 handle(tiffMetadata);
104 } catch (MetadataException e) {
105 throw new TikaException("Can't read TIFF metadata", e);
106 }
107 }
108
109 /**
110 * Copies extracted tags to tika metadata using registered handlers.
111 * @param metadataExtractor Tag directories from a Metadata Extractor "reader"
112 * @throws MetadataException This method does not handle exceptions from Metadata Extractor
113 */
114 protected void handle(com.drew.metadata.Metadata metadataExtractor)
115 throws MetadataException {
116 handle(metadataExtractor.getDirectories().iterator());
117 }
118
119 /**
120 * Copies extracted tags to tika metadata using registered handlers.
121 * @param directories Metadata Extractor {@link com.drew.metadata.Directory} instances.
122 * @throws MetadataException This method does not handle exceptions from Metadata Extractor
123 */
124 protected void handle(Iterator<Directory> directories) throws MetadataException {
125 while (directories.hasNext()) {
126 Directory directory = directories.next();
127 for (int i = 0; i < handlers.length; i++) {
128 if (handlers[i].supports(directory.getClass())) {
129 handlers[i].handle(directory, metadata);
130 }
131 }
132 }
133 }
134
135 /**
136 * Reads one or more types of Metadata Extractor fields and copies them
* to tika metadata. Handlers are consulted per directory: one is only
* asked to handle a directory whose class it reports as supported.
137 */
138 static interface DirectoryHandler {
139 /**
140 * @param directoryType A Metadata Extractor directory class
141 * @return true if the directory type is supported by this handler
142 */
143 boolean supports(Class<? extends Directory> directoryType);
144 /**
145 * @param directory extracted tags
146 * @param metadata current tika metadata, updated by this handler
147 * @throws MetadataException typically field extraction error, aborts all further extraction
148 */
149 void handle(Directory directory, Metadata metadata)
150 throws MetadataException;
151 }
152
153 /**
154 * Mimics the behavior from TIKA-314 of copying all extracted tags
155 * to tika metadata using field names from Metadata Extractor.
156 */
157 static class CopyAllFieldsHandler implements DirectoryHandler {
158 public boolean supports(Class<? extends Directory> directoryType) {
159 return true;
160 }
161 public void handle(Directory directory, Metadata metadata)
162 throws MetadataException {
163 if (directory.getTags() != null) {
164 Iterator<?> tags = directory.getTags().iterator();
165 while (tags.hasNext()) {
166 Tag tag = (Tag) tags.next();
167 metadata.set(tag.getTagName(), tag.getDescription());
168 }
169 }
170 }
171 }
172
173 /**
174 * Copies all fields regardless of directory, if the tag name
175 * is not identical to a known Metadata field name.
176 * This leads to more predictable behavior than {@link CopyAllFieldsHandler}.
177 */
178 static class CopyUnknownFieldsHandler implements DirectoryHandler {
179 public boolean supports(Class<? extends Directory> directoryType) {
180 return true;
181 }
182 public void handle(Directory directory, Metadata metadata)
183 throws MetadataException {
184 if (directory.getTags() != null) {
185 Iterator<?> tags = directory.getTags().iterator();
186 while (tags.hasNext()) {
187 Tag tag = (Tag) tags.next();
188 String name = tag.getTagName();
189 if (!MetadataFields.isMetadataField(name) && tag.getDescription() != null) {
190 String value = tag.getDescription().trim();
191 if (Boolean.TRUE.toString().equalsIgnoreCase(value)) {
192 value = Boolean.TRUE.toString();
193 } else if (Boolean.FALSE.toString().equalsIgnoreCase(value)) {
194 value = Boolean.FALSE.toString();
195 }
196 metadata.set(name, value);
197 }
198 }
199 }
200 }
201 }
202
203 /**
204 * Basic image properties for TIFF and JPEG, at least.
205 */
206 static class DimensionsHandler implements DirectoryHandler {
207 private final Pattern LEADING_NUMBERS = Pattern.compile("(\\d+)\\s*.*");
208 public boolean supports(Class<? extends Directory> directoryType) {
209 return directoryType == JpegDirectory.class ||
210 directoryType == ExifSubIFDDirectory.class ||
211 directoryType == ExifThumbnailDirectory.class ||
212 directoryType == ExifIFD0Directory.class;
213 }
214 public void handle(Directory directory, Metadata metadata) throws MetadataException {
215 // The test TIFF has width and height stored as follows according to exiv2
216 //Exif.Image.ImageWidth Short 1 100
217 //Exif.Image.ImageLength Short 1 75
218 // and the values are found in "Thumbnail Image Width" (and Height) from Metadata Extractor
219 set(directory, metadata, ExifThumbnailDirectory.TAG_THUMBNAIL_IMAGE_WIDTH, Metadata.IMAGE_WIDTH);
220 set(directory, metadata, JpegDirectory.TAG_JPEG_IMAGE_WIDTH, Metadata.IMAGE_WIDTH);
221 set(directory, metadata, ExifThumbnailDirectory.TAG_THUMBNAIL_IMAGE_HEIGHT, Metadata.IMAGE_LENGTH);
222 set(directory, metadata, JpegDirectory.TAG_JPEG_IMAGE_HEIGHT, Metadata.IMAGE_LENGTH);
223 // Bits per sample, two methods of extracting, exif overrides jpeg
224 set(directory, metadata, JpegDirectory.TAG_JPEG_DATA_PRECISION, Metadata.BITS_PER_SAMPLE);
225 set(directory, metadata, ExifSubIFDDirectory.TAG_BITS_PER_SAMPLE, Metadata.BITS_PER_SAMPLE);
226 // Straightforward
227 set(directory, metadata, ExifSubIFDDirectory.TAG_SAMPLES_PER_PIXEL, Metadata.SAMPLES_PER_PIXEL);
228 }
229 private void set(Directory directory, Metadata metadata, int extractTag, Property metadataField) {
230 if (directory.containsTag(extractTag)) {
231 Matcher m = LEADING_NUMBERS.matcher(directory.getString(extractTag));
232 if(m.matches()) {
233 metadata.set(metadataField, m.group(1));
234 }
235 }
236 }
237 }
238
239 static class JpegCommentHandler implements DirectoryHandler {
240 public boolean supports(Class<? extends Directory> directoryType) {
241 return directoryType == JpegCommentDirectory.class;
242 }
243 public void handle(Directory directory, Metadata metadata) throws MetadataException {
244 if (directory.containsTag(JpegCommentDirectory.TAG_JPEG_COMMENT)) {
245 metadata.add(TikaCoreProperties.COMMENTS, directory.getString(JpegCommentDirectory.TAG_JPEG_COMMENT));
246 }
247 }
248 }
249
250 static class ExifHandler implements DirectoryHandler {
251 private static final SimpleDateFormat DATE_UNSPECIFIED_TZ = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss");
252 public boolean supports(Class<? extends Directory> directoryType) {
253 return directoryType == ExifIFD0Directory.class ||
254 directoryType == ExifSubIFDDirectory.class;
255 }
256 public void handle(Directory directory, Metadata metadata) {
257 try {
258 handleDateTags(directory, metadata);
259 handlePhotoTags(directory, metadata);
260 handleCommentTags(directory, metadata);
261 } catch (MetadataException e) {
262 // ignore date parse errors and proceed with other tags
263 }
264 }
265 /**
266 * EXIF may contain image description, although with undefined encoding.
267 * Use IPTC for other annotation fields, and XMP for unicode support.
268 */
269 public void handleCommentTags(Directory directory, Metadata metadata) {
270 if (metadata.get(TikaCoreProperties.DESCRIPTION) == null &&
271 directory.containsTag(ExifIFD0Directory.TAG_IMAGE_DESCRIPTION)) {
272 metadata.set(TikaCoreProperties.DESCRIPTION,
273 directory.getString(ExifIFD0Directory.TAG_IMAGE_DESCRIPTION));
274 }
275 }
276 /**
277 * Maps common TIFF and EXIF tags onto the Tika
278 * TIFF image metadata namespace.
279 */
280 public void handlePhotoTags(Directory directory, Metadata metadata) {
281 if(directory.containsTag(ExifSubIFDDirectory.TAG_EXPOSURE_TIME)) {
282 Object exposure = directory.getObject(ExifSubIFDDirectory.TAG_EXPOSURE_TIME);
283 if(exposure instanceof Rational) {
284 metadata.set(Metadata.EXPOSURE_TIME, ((Rational)exposure).doubleValue());
285 } else {
286 metadata.set(Metadata.EXPOSURE_TIME, directory.getString(ExifSubIFDDirectory.TAG_EXPOSURE_TIME));
287 }
288 }
289
290 if(directory.containsTag(ExifSubIFDDirectory.TAG_FLASH)) {
291 String flash = directory.getDescription(ExifSubIFDDirectory.TAG_FLASH);
292 if(flash.indexOf("Flash fired") > -1) {
293 metadata.set(Metadata.FLASH_FIRED, Boolean.TRUE.toString());
294 }
295 else if(flash.indexOf("Flash did not fire") > -1) {
296 metadata.set(Metadata.FLASH_FIRED, Boolean.FALSE.toString());
297 }
298 else {
299 metadata.set(Metadata.FLASH_FIRED, flash);
300 }
301 }
302
303 if(directory.containsTag(ExifSubIFDDirectory.TAG_FNUMBER)) {
304 Object fnumber = directory.getObject(ExifSubIFDDirectory.TAG_FNUMBER);
305 if(fnumber instanceof Rational) {
306 metadata.set(Metadata.F_NUMBER, ((Rational)fnumber).doubleValue());
307 } else {
308 metadata.set(Metadata.F_NUMBER, directory.getString(ExifSubIFDDirectory.TAG_FNUMBER));
309 }
310 }
311
312 if(directory.containsTag(ExifSubIFDDirectory.TAG_FOCAL_LENGTH)) {
313 Object length = directory.getObject(ExifSubIFDDirectory.TAG_FOCAL_LENGTH);
314 if(length instanceof Rational) {
315 metadata.set(Metadata.FOCAL_LENGTH, ((Rational)length).doubleValue());
316 } else {
317 metadata.set(Metadata.FOCAL_LENGTH, directory.getString(ExifSubIFDDirectory.TAG_FOCAL_LENGTH));
318 }
319 }
320
321 if(directory.containsTag(ExifSubIFDDirectory.TAG_ISO_EQUIVALENT)) {
322 metadata.set(Metadata.ISO_SPEED_RATINGS, directory.getString(ExifSubIFDDirectory.TAG_ISO_EQUIVALENT));
323 }
324
325 if(directory.containsTag(ExifIFD0Directory.TAG_MAKE)) {
326 metadata.set(Metadata.EQUIPMENT_MAKE, directory.getString(ExifIFD0Directory.TAG_MAKE));
327 }
328 if(directory.containsTag(ExifIFD0Directory.TAG_MODEL)) {
329 metadata.set(Metadata.EQUIPMENT_MODEL, directory.getString(ExifIFD0Directory.TAG_MODEL));
330 }
331
332 if(directory.containsTag(ExifIFD0Directory.TAG_ORIENTATION)) {
333 Object length = directory.getObject(ExifIFD0Directory.TAG_ORIENTATION);
334 if(length instanceof Integer) {
335 metadata.set(Metadata.ORIENTATION, Integer.toString( ((Integer)length).intValue() ));
336 } else {
337 metadata.set(Metadata.ORIENTATION, directory.getString(ExifIFD0Directory.TAG_ORIENTATION));
338 }
339 }
340
341 if(directory.containsTag(ExifIFD0Directory.TAG_SOFTWARE)) {
342 metadata.set(Metadata.SOFTWARE, directory.getString(ExifIFD0Directory.TAG_SOFTWARE));
343 }
344
345 if(directory.containsTag(ExifIFD0Directory.TAG_X_RESOLUTION)) {
346 Object resolution = directory.getObject(ExifIFD0Directory.TAG_X_RESOLUTION);
347 if(resolution instanceof Rational) {
348 metadata.set(Metadata.RESOLUTION_HORIZONTAL, ((Rational)resolution).doubleValue());
349 } else {
350 metadata.set(Metadata.RESOLUTION_HORIZONTAL, directory.getString(ExifIFD0Directory.TAG_X_RESOLUTION));
351 }
352 }
353 if(directory.containsTag(ExifIFD0Directory.TAG_Y_RESOLUTION)) {
354 Object resolution = directory.getObject(ExifIFD0Directory.TAG_Y_RESOLUTION);
355 if(resolution instanceof Rational) {
356 metadata.set(Metadata.RESOLUTION_VERTICAL, ((Rational)resolution).doubleValue());
357 } else {
358 metadata.set(Metadata.RESOLUTION_VERTICAL, directory.getString(ExifIFD0Directory.TAG_Y_RESOLUTION));
359 }
360 }
361 if(directory.containsTag(ExifIFD0Directory.TAG_RESOLUTION_UNIT)) {
362 metadata.set(Metadata.RESOLUTION_UNIT, directory.getDescription(ExifIFD0Directory.TAG_RESOLUTION_UNIT));
363 }
364 if(directory.containsTag(ExifThumbnailDirectory.TAG_THUMBNAIL_IMAGE_WIDTH)) {
365 metadata.set(Metadata.IMAGE_WIDTH, directory.getDescription(ExifThumbnailDirectory.TAG_THUMBNAIL_IMAGE_WIDTH));
366 }
367 if(directory.containsTag(ExifThumbnailDirectory.TAG_THUMBNAIL_IMAGE_HEIGHT)) {
368 metadata.set(Metadata.IMAGE_LENGTH, directory.getDescription(ExifThumbnailDirectory.TAG_THUMBNAIL_IMAGE_HEIGHT));
369 }
370 }
371 /**
372 * Maps exif dates to metadata fields.
373 */
374 public void handleDateTags(Directory directory, Metadata metadata)
375 throws MetadataException {
376 // Date/Time Original overrides value from ExifDirectory.TAG_DATETIME
377 Date original = null;
378 if (directory.containsTag(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)) {
379 original = directory.getDate(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL);
380 // Unless we have GPS time we don't know the time zone so date must be set
381 // as ISO 8601 datetime without timezone suffix (no Z or +/-)
382 if (original != null) {
383 String datetimeNoTimeZone = DATE_UNSPECIFIED_TZ.format(original); // Same time zone as Metadata Extractor uses
384 metadata.set(TikaCoreProperties.CREATED, datetimeNoTimeZone);
385 metadata.set(Metadata.ORIGINAL_DATE, datetimeNoTimeZone);
386 }
387 }
388 if (directory.containsTag(ExifIFD0Directory.TAG_DATETIME)) {
389 Date datetime = directory.getDate(ExifIFD0Directory.TAG_DATETIME);
390 if (datetime != null) {
391 String datetimeNoTimeZone = DATE_UNSPECIFIED_TZ.format(datetime);
392 metadata.set(TikaCoreProperties.MODIFIED, datetimeNoTimeZone);
393 // If Date/Time Original does not exist this might be creation date
394 if (metadata.get(TikaCoreProperties.CREATED) == null) {
395 metadata.set(TikaCoreProperties.CREATED, datetimeNoTimeZone);
396 }
397 }
398 }
399 }
400 }
401
402 /**
403 * Reads image comments, originally TIKA-472.
404 * Metadata Extractor does not read XMP so we need to use the values from Iptc or EXIF
405 */
406 static class IptcHandler implements DirectoryHandler {
407 public boolean supports(Class<? extends Directory> directoryType) {
408 return directoryType == IptcDirectory.class;
409 }
410 public void handle(Directory directory, Metadata metadata)
411 throws MetadataException {
412 if (directory.containsTag(IptcDirectory.TAG_KEYWORDS)) {
413 String[] keywords = directory.getStringArray(IptcDirectory.TAG_KEYWORDS);
414 for (String k : keywords) {
415 metadata.add(TikaCoreProperties.KEYWORDS, k);
416 }
417 }
418 if (directory.containsTag(IptcDirectory.TAG_HEADLINE)) {
419 metadata.set(TikaCoreProperties.TITLE, directory.getString(IptcDirectory.TAG_HEADLINE));
420 } else if (directory.containsTag(IptcDirectory.TAG_OBJECT_NAME)) {
421 metadata.set(TikaCoreProperties.TITLE, directory.getString(IptcDirectory.TAG_OBJECT_NAME));
422 }
423 if (directory.containsTag(IptcDirectory.TAG_BY_LINE)) {
424 metadata.set(TikaCoreProperties.CREATOR, directory.getString(IptcDirectory.TAG_BY_LINE));
425 metadata.set(IPTC.CREATOR, directory.getString(IptcDirectory.TAG_BY_LINE));
426 }
427 if (directory.containsTag(IptcDirectory.TAG_CAPTION)) {
428 metadata.set(TikaCoreProperties.DESCRIPTION,
429 // Looks like metadata extractor returns IPTC newlines as a single carriage return,
430 // but the exiv2 command does not so we change to line feed here because that is less surprising to users
431 directory.getString(IptcDirectory.TAG_CAPTION).replaceAll("\r\n?", "\n"));
432 }
433 }
434 }
435
436 /**
437 * Maps EXIF Geo Tags onto the Tika Geo metadata namespace.
438 */
439 static class GeotagHandler implements DirectoryHandler {
440 public boolean supports(Class<? extends Directory> directoryType) {
441 return directoryType == GpsDirectory.class;
442 }
443 public void handle(Directory directory, Metadata metadata) throws MetadataException {
444 GeoLocation geoLocation = ((GpsDirectory) directory).getGeoLocation();
445 if (geoLocation != null) {
446 DecimalFormat geoDecimalFormat = new DecimalFormat(GEO_DECIMAL_FORMAT_STRING,
447 new DecimalFormatSymbols(Locale.ENGLISH));
448 metadata.set(TikaCoreProperties.LATITUDE, geoDecimalFormat.format(new Double(geoLocation.getLatitude())));
449 metadata.set(TikaCoreProperties.LONGITUDE, geoDecimalFormat.format(new Double(geoLocation.getLongitude())));
450 }
451 }
452 }
453
454 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.image;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.util.Arrays;
21 import java.util.Collections;
22 import java.util.HashSet;
23 import java.util.Iterator;
24 import java.util.Set;
25
26 import javax.imageio.IIOException;
27 import javax.imageio.ImageIO;
28 import javax.imageio.ImageReader;
29 import javax.imageio.metadata.IIOMetadata;
30 import javax.imageio.stream.ImageInputStream;
31
32 import org.apache.tika.exception.TikaException;
33 import org.apache.tika.io.CloseShieldInputStream;
34 import org.apache.tika.metadata.Metadata;
35 import org.apache.tika.metadata.Property;
36 import org.apache.tika.metadata.TikaCoreProperties;
37 import org.apache.tika.mime.MediaType;
38 import org.apache.tika.parser.AbstractParser;
39 import org.apache.tika.parser.ParseContext;
40 import org.apache.tika.sax.XHTMLContentHandler;
41 import org.w3c.dom.NamedNodeMap;
42 import org.w3c.dom.Node;
43 import org.xml.sax.ContentHandler;
44 import org.xml.sax.SAXException;
45
46 public class ImageParser extends AbstractParser {
47
48 /** Serial version UID */
49 private static final long serialVersionUID = 7852529269245520335L;
50
51 private static final MediaType CANONICAL_BMP_TYPE = MediaType.image("x-ms-bmp");
52 private static final MediaType JAVA_BMP_TYPE = MediaType.image("bmp");
53
54 private static final Set<MediaType> SUPPORTED_TYPES =
55 Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
56 CANONICAL_BMP_TYPE,
57 JAVA_BMP_TYPE,
58 MediaType.image("gif"),
59 MediaType.image("png"),
60 MediaType.image("vnd.wap.wbmp"),
61 MediaType.image("x-icon"),
62 MediaType.image("x-xcf"))));
63
64 public Set<MediaType> getSupportedTypes(ParseContext context) {
65 return SUPPORTED_TYPES;
66 }
67
68 public void parse(
69 InputStream stream, ContentHandler handler,
70 Metadata metadata, ParseContext context)
71 throws IOException, SAXException, TikaException {
72 String type = metadata.get(Metadata.CONTENT_TYPE);
73 if (type != null) {
74 // Java has a different idea of the BMP mime type to
75 // what the canonical one is, fix this up.
76 if (CANONICAL_BMP_TYPE.toString().equals(type)) {
77 type = JAVA_BMP_TYPE.toString();
78 }
79
80 try {
81 Iterator<ImageReader> iterator =
82 ImageIO.getImageReadersByMIMEType(type);
83 if (iterator.hasNext()) {
84 ImageReader reader = iterator.next();
85 try {
86 ImageInputStream imageStream = ImageIO.createImageInputStream(
87 new CloseShieldInputStream(stream));
88 try {
89 reader.setInput(imageStream);
90
91 metadata.set(Metadata.IMAGE_WIDTH, Integer.toString(reader.getWidth(0)));
92 metadata.set(Metadata.IMAGE_LENGTH, Integer.toString(reader.getHeight(0)));
93 metadata.set("height", Integer.toString(reader.getHeight(0)));
94 metadata.set("width", Integer.toString(reader.getWidth(0)));
95
96 loadMetadata(reader.getImageMetadata(0), metadata);
97 } finally {
98 imageStream.close();
99 }
100 } finally {
101 reader.dispose();
102 }
103 }
104
105 // Translate certain Metadata tags from the ImageIO
106 // specific namespace into the general Tika one
107 setIfPresent(metadata, "CommentExtensions CommentExtension", TikaCoreProperties.COMMENTS);
108 setIfPresent(metadata, "markerSequence com", TikaCoreProperties.COMMENTS);
109 setIfPresent(metadata, "Data BitsPerSample", Metadata.BITS_PER_SAMPLE);
110 } catch (IIOException e) {
111 // TIKA-619: There is a known bug in the Sun API when dealing with GIF images
112 // which Tika will just ignore.
113 if (!(e.getMessage().equals("Unexpected block type 0!") && type.equals("image/gif"))) {
114 throw new TikaException(type + " parse error", e);
115 }
116 }
117 }
118
119 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
120 xhtml.startDocument();
121 xhtml.endDocument();
122 }
123
124
125 private static void setIfPresent(Metadata metadata, String imageIOkey, String tikaKey) {
126 if(metadata.get(imageIOkey) != null) {
127 metadata.set(tikaKey, metadata.get(imageIOkey));
128 }
129 }
130 private static void setIfPresent(Metadata metadata, String imageIOkey, Property tikaProp) {
131 if(metadata.get(imageIOkey) != null) {
132 String v = metadata.get(imageIOkey);
133 if(v.endsWith(" ")) {
134 v = v.substring(0, v.lastIndexOf(' '));
135 }
136 metadata.set(tikaProp, v);
137 }
138 }
139
140 private static void loadMetadata(IIOMetadata imageMetadata, Metadata metadata) {
141 String[] names = imageMetadata.getMetadataFormatNames();
142 if (names == null) {
143 return;
144 }
145 int length = names.length;
146 for (int i = 0; i < length; i++) {
147 loadNode(metadata, imageMetadata.getAsTree(names[i]), "", false);
148 }
149 }
150
151 private static void loadNode(
152 Metadata metadata, Node node, String parents,
153 boolean addThisNodeName) {
154 if (addThisNodeName) {
155 if (parents.length() > 0) {
156 parents += " ";
157 }
158 parents += node.getNodeName();
159 }
160 NamedNodeMap map = node.getAttributes();
161 if (map != null) {
162
163 int length = map.getLength();
164 if (length == 1) {
165 metadata.add(parents, normalize(map.item(0).getNodeValue()));
166 } else if (length > 1) {
167 StringBuilder value = new StringBuilder();
168 for (int i = 0; i < length; i++) {
169 if (i > 0) {
170 value.append(", ");
171 }
172 Node attr = map.item(i);
173 value.append(attr.getNodeName());
174 value.append("=");
175 value.append(normalize(attr.getNodeValue()));
176 }
177 metadata.add(parents, value.toString());
178 }
179 }
180
181 Node child = node.getFirstChild();
182 while (child != null) {
183 // print children recursively
184 loadNode(metadata, child, parents, true);
185 child = child.getNextSibling();
186 }
187 }
188
189 private static String normalize(String value) {
190 if (value != null) {
191 value = value.trim();
192 } else {
193 value = "";
194 }
195 if (Boolean.TRUE.toString().equalsIgnoreCase(value)) {
196 return Boolean.TRUE.toString();
197 } else if (Boolean.FALSE.toString().equalsIgnoreCase(value)) {
198 return Boolean.FALSE.toString();
199 }
200 return value;
201 }
202
203 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.image;
17
18 import java.lang.reflect.Field;
19 import java.lang.reflect.Modifier;
20 import java.util.HashSet;
21
22 import org.apache.tika.metadata.Metadata;
23 import org.apache.tika.metadata.Property;
24 import org.apache.tika.metadata.TikaCoreProperties;
25
26 /**
27 * Knowns about all declared {@link Metadata} fields.
28 * Didn't find this functionality anywhere so it was added for
29 * ImageMetadataExtractor, but it can be generalized.
30 */
31 public abstract class MetadataFields {
32
33 private static HashSet<String> known;
34
35 private static void setKnownForClass(Class<?> clazz) {
36 Field[] fields = clazz.getFields();
37 for (Field f : fields) {
38 int mod = f.getModifiers();
39 if (Modifier.isPublic(mod) && Modifier.isStatic(mod) && Modifier.isFinal(mod)) {
40 Class<?> c = f.getType();
41 if (String.class.equals(c)) {
42 try {
43 String p = (String) f.get(null);
44 if (p != null) {
45 known.add(p);
46 }
47 } catch (IllegalArgumentException e) {
48 e.printStackTrace();
49 } catch (IllegalAccessException e) {
50 e.printStackTrace();
51 }
52 }
53 if (Property.class.isAssignableFrom(c)) {
54 try {
55 Property p = (Property) f.get(null);
56 if (p != null) {
57 known.add(p.getName());
58 }
59 } catch (IllegalArgumentException e) {
60 e.printStackTrace();
61 } catch (IllegalAccessException e) {
62 e.printStackTrace();
63 }
64 }
65 }
66 }
67 }
68
69 static {
70 known = new HashSet<String>();
71 setKnownForClass(TikaCoreProperties.class);
72 setKnownForClass(Metadata.class);
73 }
74
75 public static boolean isMetadataField(String name) {
76 return known.contains(name);
77 }
78
79 public static boolean isMetadataField(Property property) {
80 return known.contains(property.getName());
81 }
82
83 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.image;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.io.UnsupportedEncodingException;
21 import java.util.Arrays;
22 import java.util.Collections;
23 import java.util.HashSet;
24 import java.util.Set;
25
26 import org.apache.poi.util.IOUtils;
27 import org.apache.tika.exception.TikaException;
28 import org.apache.tika.io.EndianUtils;
29 import org.apache.tika.metadata.Metadata;
30 import org.apache.tika.metadata.TIFF;
31 import org.apache.tika.metadata.TikaCoreProperties;
32 import org.apache.tika.mime.MediaType;
33 import org.apache.tika.parser.AbstractParser;
34 import org.apache.tika.parser.ParseContext;
35 import org.apache.tika.sax.XHTMLContentHandler;
36 import org.xml.sax.ContentHandler;
37 import org.xml.sax.SAXException;
38
39 /**
40 * Parser for the Adobe Photoshop PSD File Format.
41 *
42 * Documentation on the file format is available from
43 * http://www.adobe.com/devnet-apps/photoshop/fileformatashtml/PhotoshopFileFormats.htm
44 */
45 public class PSDParser extends AbstractParser {
46
47 /** Serial version UID */
48 private static final long serialVersionUID = 883387734607994914L;
49
50 private static final Set<MediaType> SUPPORTED_TYPES =
51 Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
52 MediaType.image("vnd.adobe.photoshop"))));
53
54 public Set<MediaType> getSupportedTypes(ParseContext context) {
55 return SUPPORTED_TYPES;
56 }
57
58 public void parse(
59 InputStream stream, ContentHandler handler,
60 Metadata metadata, ParseContext context)
61 throws IOException, SAXException, TikaException {
62 // Check for the magic header signature
63 byte[] signature = new byte[4];
64 IOUtils.readFully(stream, signature);
65 if(signature[0] == (byte)'8' && signature[1] == (byte)'B' &&
66 signature[2] == (byte)'P' && signature[3] == (byte)'S') {
67 // Good, signature found
68 } else {
69 throw new TikaException("PSD/PSB magic signature invalid");
70 }
71
72 // Check the version
73 int version = EndianUtils.readUShortBE(stream);
74 if(version == 1 || version == 2) {
75 // Good, we support these two
76 } else {
77 throw new TikaException("Invalid PSD/PSB version " + version);
78 }
79
80 // Skip the reserved block
81 IOUtils.readFully(stream, new byte[6]);
82
83 // Number of channels in the image
84 int numChannels = EndianUtils.readUShortBE(stream);
85 // TODO Identify a suitable metadata key for this
86
87 // Width and Height
88 int height = EndianUtils.readIntBE(stream);
89 int width = EndianUtils.readIntBE(stream);
90 metadata.set(TIFF.IMAGE_LENGTH, height);
91 metadata.set(TIFF.IMAGE_WIDTH, width);
92
93 // Depth (bits per channel)
94 int depth = EndianUtils.readUShortBE(stream);
95 metadata.set(TIFF.BITS_PER_SAMPLE, Integer.toString(depth));
96
97 // Colour mode
98 // Bitmap = 0; Grayscale = 1; Indexed = 2; RGB = 3; CMYK = 4; Multichannel = 7; Duotone = 8; Lab = 9.
99 int colorMode = EndianUtils.readUShortBE(stream);
100 // TODO Identify a suitable metadata key for this
101
102 // Next is the Color Mode section
103 // We don't care about this bit
104 long colorModeSectionSize = EndianUtils.readIntBE(stream);
105 stream.skip(colorModeSectionSize);
106
107 // Next is the Image Resources section
108 // Check for certain interesting keys here
109 long imageResourcesSectionSize = EndianUtils.readIntBE(stream);
110 long read = 0;
111 while(read < imageResourcesSectionSize) {
112 ResourceBlock rb = new ResourceBlock(stream);
113 read += rb.totalLength;
114
115 // Is it one we can do something useful with?
116 if(rb.id == ResourceBlock.ID_CAPTION) {
117 metadata.add(TikaCoreProperties.DESCRIPTION, rb.getDataAsString());
118 } else if(rb.id == ResourceBlock.ID_EXIF_1) {
119 // TODO Parse the EXIF info
120 } else if(rb.id == ResourceBlock.ID_EXIF_3) {
121 // TODO Parse the EXIF info
122 } else if(rb.id == ResourceBlock.ID_XMP) {
123 // TODO Parse the XMP info
124 }
125 }
126
127 // Next is the Layer and Mask Info
128 // Finally we have Image Data
129 // We can't do anything with these parts
130
131 // We don't have any helpful text, sorry...
132 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
133 xhtml.startDocument();
134 xhtml.endDocument();
135 }
136
137 private static class ResourceBlock {
138 private static final long SIGNATURE = 0x3842494d; // 8BIM
139 private static final int ID_CAPTION = 0x03F0;
140 private static final int ID_URL = 0x040B;
141 private static final int ID_EXIF_1 = 0x0422;
142 private static final int ID_EXIF_3 = 0x0423;
143 private static final int ID_XMP = 0x0424;
144
145 private int id;
146 private String name;
147 private byte[] data;
148 private int totalLength;
149 private ResourceBlock(InputStream stream) throws IOException, TikaException {
150 // Verify the signature
151 long sig = EndianUtils.readIntBE(stream);
152 if(sig != SIGNATURE) {
153 throw new TikaException("Invalid Image Resource Block Signature Found, got " +
154 sig + " 0x" + Long.toHexString(sig) + " but the spec defines " + SIGNATURE);
155 }
156
157 // Read the block
158 id = EndianUtils.readUShortBE(stream);
159
160 StringBuffer nameB = new StringBuffer();
161 int nameLen = 0;
162 while(true) {
163 int v = stream.read();
164 nameLen++;
165
166 if(v == 0) {
167 // The name length is padded to be even
168 if(nameLen % 2 == 1) {
169 stream.read();
170 nameLen++;
171 }
172 break;
173 } else {
174 nameB.append((char)v);
175 }
176 name = nameB.toString();
177 }
178
179 int dataLen = EndianUtils.readIntBE(stream);
180 if(dataLen %2 == 1) {
181 // Data Length is even padded
182 dataLen = dataLen + 1;
183 }
184 totalLength = 4 + 2 + nameLen + 4 + dataLen;
185
186 data = new byte[dataLen];
187 IOUtils.readFully(stream, data);
188 }
189
190 private String getDataAsString() {
191 // Will be null padded
192 try {
193 return new String(data, 0, data.length-1, "ASCII");
194 } catch(UnsupportedEncodingException e) {
195 throw new RuntimeException("Something is very broken in your JVM!");
196 }
197 }
198 }
199 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.image;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.util.Collections;
21 import java.util.Set;
22
23 import org.apache.tika.exception.TikaException;
24 import org.apache.tika.io.TemporaryResources;
25 import org.apache.tika.io.TikaInputStream;
26 import org.apache.tika.metadata.Metadata;
27 import org.apache.tika.mime.MediaType;
28 import org.apache.tika.parser.AbstractParser;
29 import org.apache.tika.parser.ParseContext;
30 import org.apache.tika.parser.image.xmp.JempboxExtractor;
31 import org.apache.tika.sax.XHTMLContentHandler;
32 import org.xml.sax.ContentHandler;
33 import org.xml.sax.SAXException;
34
35 public class TiffParser extends AbstractParser {
36
37 /** Serial version UID */
38 private static final long serialVersionUID = -3941143576535464926L;
39
40 private static final Set<MediaType> SUPPORTED_TYPES =
41 Collections.singleton(MediaType.image("tiff"));
42
43 public Set<MediaType> getSupportedTypes(ParseContext context) {
44 return SUPPORTED_TYPES;
45 }
46
47 public void parse(
48 InputStream stream, ContentHandler handler,
49 Metadata metadata, ParseContext context)
50 throws IOException, SAXException, TikaException {
51 TemporaryResources tmp = new TemporaryResources();
52 try {
53 TikaInputStream tis = TikaInputStream.get(stream, tmp);
54 new ImageMetadataExtractor(metadata).parseTiff(tis.getFile());
55 new JempboxExtractor(metadata).parse(tis);
56 } finally {
57 tmp.dispose();
58 }
59
60 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
61 xhtml.startDocument();
62 xhtml.endDocument();
63 }
64
65 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.image.xmp;
17
18 import java.io.ByteArrayInputStream;
19 import java.io.ByteArrayOutputStream;
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.io.InputStreamReader;
23 import java.io.Reader;
24 import java.util.Iterator;
25 import java.util.List;
26
27 import org.apache.jempbox.xmp.XMPMetadata;
28 import org.apache.jempbox.xmp.XMPSchemaDublinCore;
29 import org.apache.tika.exception.TikaException;
30 import org.apache.tika.metadata.Metadata;
31 import org.apache.tika.metadata.TikaCoreProperties;
32 import org.xml.sax.InputSource;
33
34 public class JempboxExtractor {
35
36 private XMPPacketScanner scanner = new XMPPacketScanner();
37
38 private Metadata metadata;
39
40 // The XMP spec says it must be unicode, but for most file formats it specifies "must be encoded in UTF-8"
41 private static final String DEFAULT_XMP_CHARSET = "UTF-8";
42
43 public JempboxExtractor(Metadata metadata) {
44 this.metadata = metadata;
45 }
46
47 public void parse(InputStream file) throws IOException, TikaException {
48 ByteArrayOutputStream xmpraw = new ByteArrayOutputStream();
49 if (!scanner.parse(file, xmpraw)) {
50 return;
51 }
52
53 Reader decoded = new InputStreamReader(
54 new ByteArrayInputStream(xmpraw.toByteArray()),
55 DEFAULT_XMP_CHARSET);
56 try {
57 XMPMetadata xmp = XMPMetadata.load(new InputSource(decoded));
58 XMPSchemaDublinCore dc = xmp.getDublinCoreSchema();
59 if (dc != null) {
60 if (dc.getTitle() != null) {
61 metadata.set(TikaCoreProperties.TITLE, dc.getTitle());
62 }
63 if (dc.getDescription() != null) {
64 metadata.set(TikaCoreProperties.DESCRIPTION, dc.getDescription());
65 }
66 if (dc.getCreators() != null && dc.getCreators().size() > 0) {
67 metadata.set(TikaCoreProperties.CREATOR, joinCreators(dc.getCreators()));
68 }
69 if (dc.getSubjects() != null && dc.getSubjects().size() > 0) {
70 Iterator<String> keywords = dc.getSubjects().iterator();
71 while (keywords.hasNext()) {
72 metadata.add(TikaCoreProperties.KEYWORDS, keywords.next());
73 }
74 // TODO should we set KEYWORDS too?
75 // All tested photo managers set the same in Iptc.Application2.Keywords and Xmp.dc.subject
76 }
77 }
78 } catch (IOException e) {
79 // Could not parse embedded XMP metadata. That's not a serious
80 // problem, so we'll just ignore the issue for now.
81 // TODO: Make error handling like this configurable.
82 }
83 }
84
85 protected String joinCreators(List<String> creators) {
86 if (creators == null || creators.size() == 0) {
87 return "";
88 }
89 if (creators.size() == 1) {
90 return creators.get(0);
91 }
92 StringBuffer c = new StringBuffer();
93 for (String s : creators) {
94 c.append(", ").append(s);
95 }
96 return c.substring(2);
97 }
98 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 /* $Id: XMPPacketParser.java 750418 2009-03-05 11:03:54Z vhennebert $ */
18
19 package org.apache.tika.parser.image.xmp;
20
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.io.OutputStream;
24 import java.io.UnsupportedEncodingException;
25
/**
 * This class is a scanner for XMP packets. It locates the first XMP packet
 * in a stream and copies the packet's content to an output stream.
 * <p>
 * Important: Before you use this class to look for an XMP packet in some random file, please read
 * the chapter on "Scanning Files for XMP Packets" in the XMP specification!
 * <p>
 * This class was branched from http://xmlgraphics.apache.org/ XMPPacketParser.
 * See also org.semanticdesktop.aperture.extractor.xmp.XMPExtractor, a variant.
 */
public class XMPPacketScanner {

    private static final byte[] PACKET_HEADER;
    private static final byte[] PACKET_HEADER_END;
    private static final byte[] PACKET_TRAILER;

    static {
        try {
            PACKET_HEADER = "<?xpacket begin=".getBytes("US-ASCII");
            PACKET_HEADER_END = "?>".getBytes("US-ASCII");
            PACKET_TRAILER = "<?xpacket".getBytes("US-ASCII");
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException("Incompatible JVM! US-ASCII encoding not supported.", e);
        }
    }

    /**
     * Locates an XMP packet in a stream and writes everything between the end
     * of the packet header and the start of the packet trailer to the given
     * output. Note: This method only finds the first XMP packet in a stream.
     * And it cannot determine whether it has found the right XMP packet if
     * there are multiple packets.
     * <p>
     * Does <em>not</em> close the stream.
     * If XMP block was found reading can continue below the block.
     * NOTE(review): a stream without mark support is wrapped in a
     * BufferedInputStream, which may read ahead of the trailer on the
     * caller's stream - confirm callers do not rely on the exact position.
     *
     * @param in the InputStream to search
     * @param xmlOut to write the XMP packet to
     * @return true if XMP packet is found, false otherwise
     * @throws IOException if an I/O error occurs, or if a packet header was
     *         found but the header or the packet is not terminated
     */
    public boolean parse(InputStream in, OutputStream xmlOut) throws IOException {
        if (!in.markSupported()) {
            in = new java.io.BufferedInputStream(in);
        }
        boolean foundXMP = skipAfter(in, PACKET_HEADER);
        if (!foundXMP) {
            return false;
        }
        //TODO Inspect "begin" attribute!
        if (!skipAfter(in, PACKET_HEADER_END)) {
            throw new IOException("Invalid XMP packet header!");
        }
        //TODO Do with TeeInputStream when Commons IO 1.4 is available
        if (!skipAfter(in, PACKET_TRAILER, xmlOut)) {
            throw new IOException("XMP packet not properly terminated!");
        }
        return true;
    }

    /**
     * Consumes the stream up to and including the first occurrence of the
     * pattern, discarding the bytes read.
     */
    private static boolean skipAfter(InputStream in, byte[] match) throws IOException {
        return skipAfter(in, match, null);
    }

    /**
     * Consumes the stream up to and including the first occurrence of the
     * pattern. Bytes that precede the occurrence are copied to the output,
     * if one is given; the pattern itself is not copied.
     * <p>
     * When a partial match breaks, the bytes already consumed are re-examined
     * through a fallback table, so an occurrence that starts inside the
     * failed partial match (for example the pattern directly after a repeat
     * of its own first byte) is still found.
     *
     * @param in the stream to read
     * @param match the byte pattern to look for
     * @param out receives the bytes before the pattern, may be null
     * @return true if the pattern was found, false if the stream ended first
     * @throws IOException if reading or writing fails
     */
    private static boolean skipAfter(InputStream in, byte[] match, OutputStream out)
            throws IOException {
        int[] fallback = buildFallbackTable(match);
        int len = match.length;
        int found = 0;
        int b;
        while ((b = in.read()) >= 0) {
            // Shrink the pending partial match until this byte extends it;
            // the bytes dropped from its front can no longer start a match
            while (found > 0 && b != (match[found] & 0xFF)) {
                int shorter = fallback[found - 1];
                if (out != null) {
                    out.write(match, 0, found - shorter);
                }
                found = shorter;
            }
            if (b == (match[found] & 0xFF)) {
                found++;
                if (found == len) {
                    return true;
                }
            } else if (out != null) {
                out.write(b);
            }
        }
        return false;
    }

    /**
     * Builds the fallback table for the pattern: entry i is the length of the
     * longest proper prefix of match[0..i] that is also a suffix of it.
     */
    private static int[] buildFallbackTable(byte[] match) {
        int[] table = new int[match.length];
        int k = 0;
        for (int i = 1; i < match.length; i++) {
            while (k > 0 && match[i] != match[k]) {
                k = table[k - 1];
            }
            if (match[i] == match[k]) {
                k++;
            }
            table[i] = k;
        }
        return table;
    }

}
115
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.internal;
17
18 import java.util.Properties;
19
20 import org.apache.tika.detect.DefaultDetector;
21 import org.apache.tika.detect.Detector;
22 import org.apache.tika.parser.DefaultParser;
23 import org.apache.tika.parser.Parser;
24 import org.osgi.framework.BundleActivator;
25 import org.osgi.framework.BundleContext;
26 import org.osgi.framework.ServiceRegistration;
27
28 public class Activator implements BundleActivator {
29
30 private ServiceRegistration detectorService;
31
32 private ServiceRegistration parserService;
33
34 public void start(BundleContext context) throws Exception {
35 detectorService = context.registerService(
36 Detector.class.getName(),
37 new DefaultDetector(Activator.class.getClassLoader()),
38 new Properties());
39 parserService = context.registerService(
40 Parser.class.getName(),
41 new DefaultParser(Activator.class.getClassLoader()),
42 new Properties());
43 }
44
45 public void stop(BundleContext context) throws Exception {
46 parserService.unregister();
47 detectorService.unregister();
48 }
49
50 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.iptc;
17
import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.Locale;
import java.util.Set;
import java.util.TimeZone;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
37
38 /**
 * Parser for IPTC ANPA News Wire Feeds
40 */
41 public class IptcAnpaParser implements Parser {
42 /** Serial version UID */
43 private static final long serialVersionUID = -6062820170212879115L;
44
45 private static final MediaType TYPE =
46 MediaType.text("vnd.iptc.anpa");
47
48 private static final Set<MediaType> SUPPORTED_TYPES =
49 Collections.singleton(TYPE);
50
51 public Set<MediaType> getSupportedTypes(ParseContext context) {
52 return SUPPORTED_TYPES;
53 }
54
55 public void parse(
56 InputStream stream, ContentHandler handler,
57 Metadata metadata, ParseContext context)
58 throws IOException, SAXException, TikaException {
59
60 HashMap<String,String> properties = this.loadProperties(stream);
61 this.setMetadata(metadata, properties);
62
63 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
64 xhtml.startDocument();
65 // TODO: put body content here
66 xhtml.startElement("p");
67 String body = clean(properties.get("body"));
68 if (body != null)
69 xhtml.characters(body);
70 xhtml.endElement("p");
71 xhtml.endDocument();
72 }
73
74 /**
75 * @deprecated This method will be removed in Apache Tika 1.0.
76 */
77 public void parse(
78 InputStream stream, ContentHandler handler, Metadata metadata)
79 throws IOException, SAXException, TikaException {
80 parse(stream, handler, metadata, new ParseContext());
81 }
82
83
84 private int FMT_ANPA_1312 = 0x00; // "NAA 89-3 (ANPA 1312)"
85 private int FMT_ANPA_UPI = 0x01; // "United Press International ANPA 1312 variant"
86 private int FMT_ANPA_UPI_DL = 0x02; // "United Press International Down-Load Message"
87 private int FMT_IPTC_7901 = 0x03; // "IPTC7901 Recommended Message Format"
88 private int FMT_IPTC_PHOTO = 0x04; // "IPTC-NAA Digital Newsphoto Parameter Record"
89 private int FMT_IPTC_CHAR = 0x05; // "IPTC Unstructured Character Oriented File Format (UCOFF)"
90 private int FMT_NITF = 0x06; // "News Industry Text Format (NITF)"
91 private int FMT_NITF_TT = 0x07; // "Tidningarnas Telegrambyra NITF version (TTNITF DTD)"
92 private int FMT_NITF_RB = 0x08; // "Ritzaus Bureau NITF version (RBNITF DTD)"
93 private int FMT_IPTC_AP = 0x09; // "Associated Press news wire format"
94 private int FMT_IPTC_BLM = 0x0A; // "Bloomberg News news wire format"
95 private int FMT_IPTC_NYT = 0x0B; // "New York Times news wire format"
96 private int FMT_IPTC_RTR = 0x0C; // "Reuters news wire format"
97
98 private int FORMAT = FMT_ANPA_1312; // assume the default format to be ANPA-1312
99
100 private final static char SOH = 0x01; // start of header (ctrl-a)
101 private final static char STX = 0x02; // start of text (ctrl-b)
102 private final static char ETX = 0x03; // end of text (ctrl-c)
103 private final static char EOT = 0x04; // the tab character (ctrl-d)
104 private final static char SYN = 0x16; // synchronous idle (ctrl-v)
105
106 private final static char BS = 0x08; // the backspace character (used for diacriticals)
107 private final static char TB = 0x09; // the tab character
108 private final static char LF = 0x0A; // line feed
109 private final static char FF = 0x0C; // form feed
110 private final static char CR = 0x0D; // carriage return
111 private final static char XQ = 0x11; // device control (ctrl-q)
112 private final static char XS = 0x13; // device control (ctrl-s)
113 private final static char FS = 0x1F; // a field delimiter
114
115 private final static char HY = 0x2D; // hyphen
116 private final static char SP = 0x20; // the blank space
117 private final static char LT = 0x3C; // less than
118 private final static char EQ = 0x3D; // less than
119 private final static char CT = 0x5E; // carat
120
121 private final static char SL = 0x91; // single-quote left
122 private final static char SR = 0x92; // single-quote right
123 private final static char DL = 0x93; // double-quote left
124 private final static char DR = 0x94; // double-quote right
125
126
127 /**
128 * scan the news messsage and store the metadata and data into a map
129 */
130 private HashMap<String,String> loadProperties(InputStream is) {
131
132 HashMap<String,String> properties = new HashMap<String,String>();
133
134 FORMAT = this.scanFormat(is);
135
136 byte[] residual = this.getSection(is,"residual");
137
138 byte[] header = this.getSection(is,"header");
139 parseHeader(header, properties);
140
141 byte[] body = this.getSection(is,"body");
142 parseBody(body, properties);
143
144 byte[] footer = this.getSection(is,"footer");
145 parseFooter(footer, properties);
146
147 return (properties);
148 }
149
150
151 private int scanFormat(InputStream is) {
152 int format = this.FORMAT;
153 int maxsize = 524288; // 512K
154
155 byte[] buf = new byte[maxsize];
156 try {
157 if (is.markSupported()) {
158 is.mark(maxsize);
159 }
160 int msgsize = is.read(buf); // read in at least the full data
161
162 String message = (new String(buf)).toLowerCase();
163 // these are not if-then-else, because we want to go from most common
164 // and fall through to least. this is imperfect, as these tags could
165 // show up in other agency stories, but i can't find a spec or any
166 // explicit codes to identify the wire source in the message itself
167
168 if (message.contains("ap-wf")) {
169 format = this.FMT_IPTC_AP;
170 }
171 if (message.contains("reuters")) {
172 format = this.FMT_IPTC_RTR;
173 }
174 if (message.contains("new york times")) {
175 format = this.FMT_IPTC_NYT;
176 }
177 if (message.contains("bloomberg news")) {
178 format = this.FMT_IPTC_BLM;
179 }
180 }
181 catch (IOException eio) {
182 // we are in an unstable state
183 }
184
185 try {
186 if (is.markSupported()) {
187 is.reset();
188 }
189 }
190 catch (IOException eio) {
191 // we are in an unstable state
192 }
193 return (format);
194 }
195
196
197 private void setFormat(int format) {
198 this.FORMAT = format;
199 }
200
201
202 private String getFormatName() {
203
204 String name = "";
205
206 if (FORMAT == this.FMT_IPTC_AP) {
207 name = "Associated Press";
208 }
209
210 else if(FORMAT == this.FMT_IPTC_BLM) {
211 name = "Bloomberg";
212 }
213
214 else if(FORMAT == this.FMT_IPTC_NYT) {
215 name = "New York Times";
216 }
217
218 else if(FORMAT == this.FMT_IPTC_RTR) {
219 name = "Reuters";
220 }
221
222 return (name);
223 }
224
225
226 private byte[] getSection(InputStream is, String name) {
227
228 byte[] value = new byte[0];
229
230 if (name.equals("residual")) {
231 // the header shouldn't be more than 1k, but just being generous here
232 int maxsize = 8192; // 8K
233 byte bstart = SYN; // check for SYN [0x16 : ctrl-v] (may have leftover residue from preceding message)
234 byte bfinish = SOH; // check for SOH [0x01 : ctrl-a] (typically follows a pair of SYN [0x16 : ctrl-v])
235 value = getSection(is, maxsize, bstart, bfinish, true);
236 }
237
238 else if(name.equals("header")) {
239 // the header shouldn't be more than 1k, but just being generous here
240 int maxsize = 8192; // 8K
241 byte bstart = SOH; // check for SOH [0x01 : ctrl-a] (typically follows a pair of SYN [0x16 : ctrl-v])
242 byte bfinish = STX; // check for STX [0x02 : ctrl-b] (marks end of header, beginning of message)
243 value = getSection(is, maxsize, bstart, bfinish, true);
244 }
245
246 else if (name.equals("body")) {
247 // the message shouldn't be more than 16k (?), leaving plenty of space
248 int maxsize = 524288; // 512K
249 byte bstart = STX; // check for STX [0x02 : ctrl-b] (marks end of header, beginning of message)
250 byte bfinish = ETX; // check for ETX [0x03 : ctrl-c] (marks end of message, beginning of footer)
251 value = getSection(is, maxsize, bstart, bfinish, true);
252 }
253
254 else if (name.equals("footer")) {
255 // the footer shouldn't be more than 1k , leaving plenty of space
256 int maxsize = 8192; // 8K
257 byte bstart = ETX; // check for ETX [0x03 : ctrl-c] (marks end of message, beginning of footer)
258 byte bfinish = EOT; // check for EOT [0x04 : ctrl-d] (marks end of transmission)
259 value = getSection(is, maxsize, bstart, bfinish, true);
260 }
261
262 return (value);
263 }
264
265
266 private byte[] getSection(InputStream is, int maxsize, byte bstart, byte bfinish, boolean ifincomplete) {
267 byte[] value = new byte[0];
268
269 try {
270 boolean started = false; // check if we have found the start flag
271 boolean finished = false; // check if we have found the finish flag
272 int read = 0; // the number of bytes we read
273 int start = 0; // the position after the start flag
274
275 // TODO: this only pulls back 8K of data on a read, regardless of buffer size
276 // more nefariously, it caps at a total 8K, through all sections
277 int streammax = is.available();
278 maxsize = Math.min(maxsize, streammax);
279
280 is.mark(maxsize);
281 byte[] buf = new byte[maxsize];
282 int totsize = 0;
283 int remainder = maxsize - totsize;
284 while (remainder > 0) {
285 int msgsize = is.read(buf, maxsize-remainder, maxsize); // read in at least the full data
286 if (msgsize == -1) {
287 remainder = msgsize = 0;
288 }
289 remainder -= msgsize;
290 totsize += msgsize;
291 }
292
293 // scan through the provided input stream
294 for (read=0; read < totsize; read++) {
295 byte b = buf[read];
296
297 if (!started) {
298 started = (b == bstart);
299 start = read + 1;
300 continue;
301 }
302
303 if (finished = (b == bfinish)) {
304 /*
305 is.reset();
306 long skipped = is.skip((long)read);
307 if (skipped != read) {
308 // we are in an unstable state
309 }
310 is.mark(1);
311 */
312 break;
313 }
314
315 // load from the stream until we run out of characters, or hit the termination byte
316 continue;
317 }
318
319 // move the input stream back to where it was initially
320 is.reset();
321
322 if (finished) {
323 // now, we want to reset the stream to be sitting right on top of the finish marker
324 is.skip(read);
325 value = new byte[read-start];
326 System.arraycopy(buf, start, value, 0, read-start);
327 }
328 else {
329 if (ifincomplete && started) {
330 // the caller wants anything that was read, and we finished the stream or buffer
331 value = new byte[read-start];
332 System.arraycopy(buf, start, value, 0, read-start);
333 }
334 }
335 }
336 catch (IOException eio) {
337 // something invalid occurred, return an empty string
338 }
339
340 return (value);
341 }
342
343
344 private boolean parseHeader(byte[] value, HashMap<String,String> properties) {
345 boolean added = false;
346
347 String env_serviceid = "";
348 String env_category = "";
349 String env_urgency = "";
350 String hdr_edcode = "";
351 String hdr_subject = "";
352 String hdr_date = "";
353 String hdr_time = "";
354
355 int read = 0;
356
357 while (read < value.length) {
358
359 // pull apart the envelope, getting the service id (....\x1f)
360 while (read < value.length) {
361 byte val_next = value[read++];
362 if (val_next != FS) {
363 env_serviceid += (char)(val_next & 0xff); // convert the byte to an unsigned int
364 }
365 else {
366 break;
367 }
368 }
369
370 // pull apart the envelope, getting the category (....\x13\x11)
371 while (read < value.length) {
372 byte val_next = value[read++];
373 if (val_next != XS) { // the end of the envelope is marked (\x13)
374 env_category += (char)(val_next & 0xff); // convert the byte to an unsigned int
375 }
376 else {
377 val_next = value[read]; // get the remaining byte (\x11)
378 if (val_next == XQ) {
379 read++;
380 }
381 break;
382 }
383 }
384
385 // pull apart the envelope, getting the subject heading
386 while (read < value.length) {
387 boolean subject = true;
388 byte val_next = value[read++];
389 while ((subject) && (val_next != SP) && (val_next != 0x00)) { // ignore the envelope subject
390 hdr_subject += (char)(val_next & 0xff); // convert the byte to an unsigned int
391 val_next = (read < value.length) ? value[read++] : 0x00;
392 while (val_next == SP) { // consume all the spaces
393 subject = false;
394 val_next = (read < value.length) ? value[read++] : 0x00;
395 if (val_next != SP) {
396 --read; // otherwise we eat into the next section
397 }
398 }
399 }
400 if (!subject) {
401 break;
402 }
403 }
404
405 // pull apart the envelope, getting the date and time
406 while (read < value.length) {
407 byte val_next = value[read++];
408 if (hdr_date.length() == 0) {
409 while (((val_next >= (byte)0x30) && (val_next <= (byte)0x39)) // consume all numerics and hyphens
410 || (val_next == HY)) {
411 hdr_date += (char)(val_next & 0xff); // convert the byte to an unsigned int
412 val_next = (read < value.length) ? value[read++] : 0x00;
413 }
414 }
415 else if (val_next == SP) {
416 while (val_next == SP) { // consume all the spaces
417 val_next = (read < value.length) ? value[read++] : 0x00;
418 }
419 continue;
420 }
421 else {
422 while (((val_next >= (byte)0x30) && (val_next <= (byte)0x39)) // consume all numerics and hyphens
423 || (val_next == HY)) {
424 hdr_time += (char)(val_next & 0xff); // convert the byte to an unsigned int
425 val_next = (read < value.length) ? value[read++] : 0x00;
426 }
427 }
428 }
429 break; // don't let this run back through and start thrashing metadata
430 }
431
432 // if we were saving any of these values, we would set the properties map here
433
434 added = (env_serviceid.length() + env_category.length() + hdr_subject.length() +
435 hdr_date.length() + hdr_time.length()) > 0;
436 return added;
437 }
438
439 private boolean parseBody(byte[] value, HashMap<String,String> properties) {
440 boolean added = false;
441
442 String bdy_heading = "";
443 String bdy_title = "";
444 String bdy_source = "";
445 String bdy_author = "";
446 String bdy_body = "";
447
448 int read = 0;
449 boolean done = false;
450
451 while (!done && (read < value.length)) {
452
453 // pull apart the body, getting the heading (^....\x0d\x0a)
454 while (read < value.length) {
455 byte val_next = value[read++];
456 if (val_next == CT) { // start of a new section , first is the heading
457 val_next = (read < value.length) ? value[read++] : 0x00;
458 // AP, NYT, and Bloomberg end with < , Reuters with EOL
459 while ((val_next != LT) && (val_next != CR) && (val_next != LF)) { // less than delimiter (\x3c) and not EOL
460 bdy_heading += (char)(val_next & 0xff); // convert the byte to an unsigned int
461 val_next = (read < value.length) ? value[read++] : 0x00;
462 if (read > value.length) { break; } // shouldn't ever hit this, but save a NPE
463 }
464 if (val_next == LT) {
465 // hit the delimiter, carry on
466 val_next = (read < value.length) ? value[read++] : 0x00;
467 }
468 while (bdy_heading.length() > 0 && ((val_next == CR) || (val_next == LF))) {
469 val_next = (read < value.length) ? value[read++] : 0x00; // skip the new lines
470 if ((val_next != CR) && (val_next != LF)) {
471 --read;
472 }
473 }
474 }
475 else {
476 // this will only be hit on poorly-formed files
477
478 // for reuters, the heading does not start with the ^, so we push one back into the stream
479 if (FORMAT == this.FMT_IPTC_RTR) {
480 if (val_next != CT) {
481 // for any non-whitespace, we need to go back an additional step to non destroy the data
482 if ((val_next != SP) && (val_next != TB) && (val_next != CR) && (val_next != LF)) {
483 // if the very first byte is data, we have to shift the whole array, and stuff in a carat
484 if (read == 1) {
485 byte[] resize = new byte[value.length + 1];
486 System.arraycopy(value, 0, resize, 1, value.length);
487 value = resize;
488 }
489 }
490 value[--read] = CT;
491 continue;
492 }
493 }
494 }
495 break;
496 }
497
498 // pull apart the body, getting the title (^....\x0d\x0a)
499 while (read < value.length) {
500 byte val_next = value[read++];
501 if (val_next == CT) { // start of a new section , first is the heading
502 val_next = (read < value.length) ? value[read++] : 0x00;
503 // AP, NYT, and Bloomberg end with < , Reuters with EOL
504 while ((val_next != LT) && (val_next != CT) && (val_next != CR) && (val_next != LF)) { // less than delimiter (\x3c), or carat (\x5e) and not EOL
505 bdy_title += (char)(val_next & 0xff); // convert the byte to an unsigned int
506 val_next = (read < value.length) ? value[read++] : 0x00;
507 if (read > value.length) { break; } // shouldn't ever hit this, but save a NPE
508 }
509
510 if (val_next == CT) { // start of a new section , when first didn't finish cleanly
511 --read;
512 }
513
514 if (val_next == LT) {
515 // hit the delimiter, carry on
516 val_next = (read < value.length) ? value[read++] : 0x00;
517 }
518
519 while (bdy_title.length() > 0 && ((val_next == CR) || (val_next == LF))) {
520 val_next = (read < value.length) ? value[read++] : 0x00; // skip the new lines
521 if ((val_next != CR) && (val_next != LF)) {
522 --read;
523 }
524 }
525 }
526 else {
527 // this will only be hit on poorly-formed files
528
529 // for bloomberg, the title does not start with the ^, so we push one back into the stream
530 if (FORMAT == this.FMT_IPTC_BLM) {
531 if (val_next == TB) {
532 value[--read] = CT;
533 continue;
534 }
535 }
536
537 // for reuters, the title does not start with the ^, so we push one back into the stream
538 if (FORMAT == this.FMT_IPTC_RTR) {
539 if (val_next != CT) {
540 // for any non-whitespace, we need to go back an additional step to non destroy the data
541 if ((val_next != SP) && (val_next != TB) && (val_next != CR) && (val_next != LF)) {
542 --read;
543 }
544 value[--read] = CT;
545 continue;
546 }
547 }
548 }
549 break;
550 }
551
552
553 // at this point, we have a variable number of metadata lines, with various orders
554 // we scan the start of each line for the special character, and run to the end character
555 // pull apart the body, getting the title (^....\x0d\x0a)
556 boolean metastarted = false;
557 String longline = "";
558 String longkey = "";
559 while (read < value.length) {
560 byte val_next = value[read++];
561
562 // eat up whitespace before committing to the next section
563 if ((val_next == SP) || (val_next == TB) || (val_next == CR) || (val_next == LF)) {
564 continue;
565 }
566
567 if (val_next == CT) { // start of a new section , could be authors, sources, etc
568 val_next = (read < value.length) ? value[read++] : 0x00;
569 String tmp_line = "";
570 while ((val_next != LT) && (val_next != CT) && (val_next != CR) && (val_next != LF) && (val_next != 0)) {
571 // less than delimiter (\x3c), maybe also badly formed with just new line
572 tmp_line += (char)(val_next & 0xff); // convert the byte to an unsigned int
573 val_next = (read < value.length) ? value[read++] : 0x00;
574 if (read > value.length) { break; } // shouldn't ever hit this, but save a NPE
575 }
576
577 if (val_next == CT) { // start of a new section , when first didn't finish cleanly
578 --read;
579 }
580
581 if (val_next == LT) {
582 // hit the delimiter, carry on
583 val_next = (read < value.length) ? value[read++] : 0x00;
584 }
585
586 while ((val_next == CR) || (val_next == LF)) {
587 val_next = (read < value.length) ? value[read++] : 0x00; // skip the new lines
588 if ((val_next != CR) && (val_next != LF)) {
589 --read;
590 }
591 }
592 if (tmp_line.toLowerCase().startsWith("by") || longline.equals("bdy_author")) {
593 longkey = "bdy_author";
594
595 // prepend a space to subsequent line, so it gets parsed consistent with the lead line
596 tmp_line = (longline.equals(longkey) ? " " : "") + tmp_line;
597
598 // we have an author candidate
599 int term = tmp_line.length();
600 term = Math.min(term, (tmp_line.indexOf("<") > -1 ? tmp_line.indexOf("<") : term));
601 term = Math.min(term, (tmp_line.indexOf("=") > -1 ? tmp_line.indexOf("=") : term));
602 term = Math.min(term, (tmp_line.indexOf("\n") > -1 ? tmp_line.indexOf("\n") : term));
603 term = (term > 0 ) ? term : tmp_line.length();
604 bdy_author += tmp_line.substring(tmp_line.indexOf(" "), term);
605 metastarted = true;
606 longline = ((tmp_line.indexOf("=") > -1) && (!longline.equals(longkey)) ? longkey : "");
607 }
608 else if (FORMAT == this.FMT_IPTC_BLM) {
609 String byline = " by ";
610 if (tmp_line.toLowerCase().contains(byline)) {
611 longkey = "bdy_author";
612
613 int term = tmp_line.length();
614 term = Math.min(term, (tmp_line.indexOf("<") > -1 ? tmp_line.indexOf("<") : term));
615 term = Math.min(term, (tmp_line.indexOf("=") > -1 ? tmp_line.indexOf("=") : term));
616 term = Math.min(term, (tmp_line.indexOf("\n") > -1 ? tmp_line.indexOf("\n") : term));
617 term = (term > 0 ) ? term : tmp_line.length();
618 // for bloomberg, the author line sits below their copyright statement
619 bdy_author += tmp_line.substring(tmp_line.toLowerCase().indexOf(byline) + byline.length(), term) + " ";
620 metastarted = true;
621 longline = ((tmp_line.indexOf("=") > -1) && (!longline.equals(longkey)) ? longkey : "");
622 }
623 else if(tmp_line.toLowerCase().startsWith("c.")) {
624 // the author line for bloomberg is a multiline starting with c.2011 Bloomberg News
625 // then containing the author info on the next line
626 if (val_next == TB) {
627 value[--read] = CT;
628 continue;
629 }
630 }
631 else if(tmp_line.toLowerCase().trim().startsWith("(") && tmp_line.toLowerCase().trim().endsWith(")")) {
632 // the author line may have one or more comment lines between the copyright
633 // statement, and the By AUTHORNAME line
634 if (val_next == TB) {
635 value[--read] = CT;
636 continue;
637 }
638 }
639 }
640
641 else if (tmp_line.toLowerCase().startsWith("eds") || longline.equals("bdy_source")) {
642 longkey = "bdy_source";
643 // prepend a space to subsequent line, so it gets parsed consistent with the lead line
644 tmp_line = (longline.equals(longkey) ? " " : "") + tmp_line;
645
646 // we have a source candidate
647 int term = tmp_line.length();
648 term = Math.min(term, (tmp_line.indexOf("<") > -1 ? tmp_line.indexOf("<") : term));
649 term = Math.min(term, (tmp_line.indexOf("=") > -1 ? tmp_line.indexOf("=") : term));
650 // term = Math.min(term, (tmp_line.indexOf("\n") > -1 ? tmp_line.indexOf("\n") : term));
651 term = (term > 0 ) ? term : tmp_line.length();
652 bdy_source += tmp_line.substring(tmp_line.indexOf(" ") + 1, term) + " ";
653 metastarted = true;
654 longline = (!longline.equals(longkey) ? longkey : "");
655 }
656 else {
657 // this has fallen all the way through. trap it as part of the subject,
658 // rather than just losing it
659 if (!metastarted) {
660 bdy_title += " , " + tmp_line; // not sure where else to put this but in the title
661 }
662 else {
663 // what to do with stuff that is metadata, which falls after metadata lines started?
664 bdy_body += " " + tmp_line + " , "; // not sure where else to put this but in the title
665 }
666 }
667 }
668 else { // we're on to the main body
669 while ((read < value.length) && (val_next != 0)) {
670 // read until the train runs out of tracks
671 bdy_body += (char)(val_next & 0xff); // convert the byte to an unsigned int
672 val_next = (read < value.length) ? value[read++] : 0x00;
673 if (read > value.length) { break; } // shouldn't ever hit this, but save a NPE
674 }
675
676 }
677 // we would normally break here, but just let this read out to the end
678 }
679 done = true; // don't let this run back through and start thrashing metadata
680 }
681 properties.put("body", bdy_body);
682 properties.put("title", bdy_title);
683 properties.put("subject", bdy_heading);
684 properties.put("author", bdy_author);
685 properties.put("source", bdy_source);
686
687 added = (bdy_body.length() + bdy_title.length() + bdy_heading.length() + bdy_author.length() +
688 bdy_source.length()) > 0;
689 return added;
690 }
691
692
693 private boolean parseFooter(byte[] value, HashMap<String,String> properties) {
694 boolean added = false;
695
696 String ftr_source = "";
697 String ftr_datetime = "";
698
699 int read = 0;
700 boolean done = false;
701
702 while (!done && (read < value.length)) {
703
704 // pull apart the footer, getting the news feed source (^....\x0d\x0a)
705 byte val_next = value[read++];
706 byte val_peek = (read < value.length) ? value[read+1] : 0x00; // skip the new lines
707
708 while (((val_next < (byte)0x30) || (val_next > (byte)0x39)) && (val_next != 0)) { // consume all non-numerics first
709 ftr_source += (char)(val_next & 0xff); // convert the byte to an unsigned int
710 val_next = (read < value.length) ? value[read] : 0x00; // attempt to read until end of stream
711 read++;
712 if (read > value.length) { break; } // shouldn't ever hit this, but save a NPE
713 }
714
715 while ((val_next != LT) && (val_next != CR) && (val_next != LF) && (val_next != 0)) { // get as much timedate as possible
716 // this is an american format, so arrives as mm-dd-yy HHiizzz
717 ftr_datetime += (char)(val_next & 0xff); // convert the byte to an unsigned int
718 val_next = (read < value.length) ? value[read++] : 0x00; // skip the new lines
719 if (read > value.length) { break; } // shouldn't ever hit this, but save a NPE
720 }
721 if (val_next == LT) {
722 // hit the delimiter, carry on
723 val_next = (read < value.length) ? value[read++] : 0x00;
724 }
725
726 if (ftr_datetime.length() > 0) {
727 // we want to pass this back in a more friendly format
728 String format_out = "yyyy-MM-dd'T'HH:mm:ss'Z'";
729 Date dateunix = new Date();
730 try {
731 // standard ap format
732 String format_in = "MM-dd-yy HHmmzzz";
733
734 if (FORMAT == this.FMT_IPTC_RTR) {
735 // standard reuters format
736 format_in = "HH:mm MM-dd-yy";
737 }
738 SimpleDateFormat dfi = new SimpleDateFormat(format_in);
739 dfi.setTimeZone(TimeZone.getTimeZone("UTC"));
740 dateunix = dfi.parse(ftr_datetime);
741 }
742 catch (ParseException ep) {
743 // failed, but this will just fall through to setting the date to now
744 }
745 SimpleDateFormat dfo = new SimpleDateFormat(format_out);
746 dfo.setTimeZone(TimeZone.getTimeZone("UTC"));
747 ftr_datetime = dfo.format(dateunix);
748 }
749 while ((val_next == CR) || (val_next == LF)) {
750 val_next = (read < value.length) ? value[read++] : 0x00; // skip the new lines
751 if ((val_next != CR) && (val_next != LF)) {
752 --read;
753 }
754 }
755 done = true; // don't let this run back through and start thrashing metadata
756 }
757
758 properties.put("publisher", ftr_source);
759 properties.put("created", ftr_datetime);
760 properties.put("modified", ftr_datetime);
761
762 added = (ftr_source.length() + ftr_datetime.length()) > 0;
763 return added;
764 }
765
766
767 private void setMetadata(Metadata metadata, HashMap<String,String> properties) {
768
769 // every property that gets set must be non-null, or it will cause NPE
770 // in other consuming applications, like Lucene
771 metadata.set(Metadata.CONTENT_TYPE, clean("text/anpa-1312"));
772 metadata.set(TikaCoreProperties.TITLE, clean(properties.get("title")));
773 metadata.set(TikaCoreProperties.KEYWORDS, clean(properties.get("subject")));
774 metadata.set(TikaCoreProperties.CREATOR, clean(properties.get("author")));
775 metadata.set(TikaCoreProperties.CREATED, clean(properties.get("created")));
776 metadata.set(TikaCoreProperties.MODIFIED, clean(properties.get("modified")));
777 metadata.set(TikaCoreProperties.SOURCE, clean(properties.get("source")));
778 // metadata.set(TikaCoreProperties.PUBLISHER, clean(properties.get("publisher")));
779 metadata.set(TikaCoreProperties.PUBLISHER, clean(this.getFormatName()));
780
781 /*
782 metadata.set(TikaCoreProperties.DATE, font.getHeader().getCreated().getTime());
783 metadata.set(
784 Property.internalDate(TikaCoreProperties.MODIFIED),
785 font.getHeader().getModified().getTime());
786 */
787 }
788
789 private String clean(String value) {
790 if (value == null) {
791 value = "";
792 }
793
794 value = value.replaceAll("``", "`");
795 value = value.replaceAll("''", "'");
796 value = value.replaceAll(new String(new char[] {SL}), "'");
797 value = value.replaceAll(new String(new char[] {SR}), "'");
798 value = value.replaceAll(new String(new char[] {DL}), "\"");
799 value = value.replaceAll(new String(new char[] {DR}), "\"");
800 value = value.trim();
801
802 return (value);
803 }
804 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.iwork;
17
/**
 * Helpers that render a page number either as a Roman numeral or as a
 * repeated-letter label (A..Z, AA, BB, ...), in line with the Pages auto
 * numbering formats.
 */
class AutoPageNumberUtils {

    /** Letters used for alphabetic numbering; element 0 stands for page 1. */
    private static final String[] LETTERS = { "A", "B", "C", "D", "E", "F", "G",
        "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T",
        "U", "V", "W", "X", "Y", "Z" };

    /** Number of letters available before the label grows by one character. */
    private static final int CYCLE = 26;

    /** Roman symbols from largest to smallest, including the subtractive pairs. */
    private static final String[] ROMAN_SYMBOLS = {
        "M", "CM", "D", "CD", "C", "XC", "L", "XL", "X", "IX", "V", "IV", "I" };

    /** Value of the symbol at the same index in ROMAN_SYMBOLS. */
    private static final int[] ROMAN_VALUES = {
        1000, 900, 500, 400, 100, 90, 50, 40, 10, 9, 5, 4, 1 };

    /**
     * Renders a number as an upper-case letter label: 1..26 give "A".."Z",
     * 27..52 give "AA".."ZZ", 53 gives "AAA", and so on. Zero gives an
     * empty string.
     */
    public static String asAlphaNumeric(int i) {
        int letter = i % CYCLE;      // 1-based position within the alphabet
        int extra = i / CYCLE;       // how many times the letter is repeated beyond the first

        if (letter == 0) {
            // an exact multiple of 26 is a "Z" of the previous cycle
            letter = CYCLE;
            extra--;
        }

        StringBuilder label = new StringBuilder();
        for (int remaining = extra + 1; remaining > 0; remaining--) {
            label.append(LETTERS[letter - 1]);
        }
        return label.toString();
    }

    /** Lower-case form of {@link #asAlphaNumeric(int)}. */
    public static String asAlphaNumericLower(int i) {
        return asAlphaNumeric(i).toLowerCase();
    }

    /**
     * Renders a number as an upper-case Roman numeral.
     *
     * @throws NumberFormatException when the number is outside 1-3999
     */
    public static String asRomanNumerals(int i) {
        if ( i <= 0 || i > 3999 ) {
            throw new NumberFormatException("Roman numerals are 1-3999 ("+i+")");
        }

        StringBuilder numeral = new StringBuilder();
        int rest = i;
        // greedy conversion: take each symbol as often as it still fits
        for (int k = 0; k < ROMAN_VALUES.length; k++) {
            while (rest >= ROMAN_VALUES[k]) {
                numeral.append(ROMAN_SYMBOLS[k]);
                rest -= ROMAN_VALUES[k];
            }
        }
        return numeral.toString();
    }

    /** Lower-case form of {@link #asRomanNumerals(int)}. */
    public static String asRomanNumeralsLower(int i) {
        return asRomanNumerals(i).toLowerCase();
    }

}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.iwork;
17
18 import java.io.BufferedInputStream;
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.util.Arrays;
22 import java.util.Collections;
23 import java.util.HashSet;
24 import java.util.Set;
25
26 import javax.xml.namespace.QName;
27
28 import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
29 import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
30 import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
31 import org.apache.commons.compress.archivers.zip.ZipFile;
32 import org.apache.tika.detect.XmlRootExtractor;
33 import org.apache.tika.exception.TikaException;
34 import org.apache.tika.io.CloseShieldInputStream;
35 import org.apache.tika.metadata.Metadata;
36 import org.apache.tika.mime.MediaType;
37 import org.apache.tika.parser.AbstractParser;
38 import org.apache.tika.parser.ParseContext;
39 import org.apache.tika.sax.OfflineContentHandler;
40 import org.apache.tika.sax.XHTMLContentHandler;
41 import org.xml.sax.ContentHandler;
42 import org.xml.sax.SAXException;
43
44 /**
45 * A parser for the IWork container files. This includes *.key, *.pages and *.numbers files.
46 * This parser delegates the relevant entries to a {@link ContentHandler} that parsers the content.
47 *
48 * Currently supported formats:
49 * <ol>
50 * <li>Keynote format version 2.x. Currently only tested with Keynote version 5.x
51 * <li>Pages format version 1.x. Currently only tested with Pages version 4.0.x
52 * <li>Numbers format version 1.x. Currently only tested with Numbers version 2.0.x
53 * </ol>
54 */
55 public class IWorkPackageParser extends AbstractParser {
56
57 /** Serial version UID */
58 private static final long serialVersionUID = -2160322853809682372L;
59
60 /**
61 * Which files within an iWork file contain the actual content?
62 */
63 public final static Set<String> IWORK_CONTENT_ENTRIES = Collections.unmodifiableSet(
64 new HashSet<String>(Arrays.asList("index.apxl", "index.xml", "presentation.apxl"))
65 );
66 /**
67 * All iWork files contain one of these, so we can detect based on it
68 */
69 public final static String IWORK_COMMON_ENTRY = "buildVersionHistory.plist";
70
71 public enum IWORKDocumentType {
72 KEYNOTE("http://developer.apple.com/namespaces/keynote2", "presentation", MediaType.application("vnd.apple.keynote")),
73 NUMBERS("http://developer.apple.com/namespaces/ls", "document", MediaType.application("vnd.apple.numbers")),
74 PAGES("http://developer.apple.com/namespaces/sl", "document", MediaType.application("vnd.apple.pages")),
75 ENCRYPTED(null, null, MediaType.application("x-tika-iworks-protected"));
76
77 private final String namespace;
78 private final String part;
79 private final MediaType type;
80
81 IWORKDocumentType(String namespace, String part, MediaType type) {
82 this.namespace = namespace;
83 this.part = part;
84 this.type = type;
85 }
86
87 public String getNamespace() {
88 return namespace;
89 }
90
91 public String getPart() {
92 return part;
93 }
94
95 public MediaType getType() {
96 return type;
97 }
98
99 public static IWORKDocumentType detectType(ZipArchiveEntry entry, ZipFile zip) {
100 try {
101 if (entry == null) {
102 return null;
103 }
104
105 InputStream stream = zip.getInputStream(entry);
106 try {
107 return detectType(stream);
108 } finally {
109 stream.close();
110 }
111 } catch (IOException e) {
112 return null;
113 }
114 }
115
116 public static IWORKDocumentType detectType(ZipArchiveEntry entry, ZipArchiveInputStream zip) {
117 if (entry == null) {
118 return null;
119 }
120
121 return detectType(zip);
122 }
123
124 private static IWORKDocumentType detectType(InputStream stream) {
125 QName qname = new XmlRootExtractor().extractRootElement(stream);
126 if (qname != null) {
127 String uri = qname.getNamespaceURI();
128 String local = qname.getLocalPart();
129
130 for (IWORKDocumentType type : values()) {
131 if(type.getNamespace().equals(uri) &&
132 type.getPart().equals(local)) {
133 return type;
134 }
135 }
136 } else {
137 // There was a problem with extracting the root type
138 // Password Protected iWorks files are funny, but we can usually
139 // spot them because they encrypt part of the zip stream
140 try {
141 stream.read();
142 } catch(UnsupportedZipFeatureException e) {
143 // Compression field was likely encrypted
144 return ENCRYPTED;
145 } catch(Exception ignored) {
146 }
147 }
148 return null;
149 }
150 }
151
152 /**
153 * This parser handles all iWorks formats.
154 */
155 private final static Set<MediaType> supportedTypes =
156 Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
157 MediaType.application("vnd.apple.iwork"),
158 IWORKDocumentType.KEYNOTE.getType(),
159 IWORKDocumentType.NUMBERS.getType(),
160 IWORKDocumentType.PAGES.getType()
161 )));
162
163 public Set<MediaType> getSupportedTypes(ParseContext context) {
164 return supportedTypes;
165 }
166
167 public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
168 throws IOException, SAXException, TikaException {
169 ZipArchiveInputStream zip = new ZipArchiveInputStream(stream);
170 ZipArchiveEntry entry = zip.getNextZipEntry();
171
172 while (entry != null) {
173 if (!IWORK_CONTENT_ENTRIES.contains(entry.getName())) {
174 entry = zip.getNextZipEntry();
175 continue;
176 }
177
178 InputStream entryStream = new BufferedInputStream(zip, 4096);
179 entryStream.mark(4096);
180 IWORKDocumentType type = IWORKDocumentType.detectType(entryStream);
181 entryStream.reset();
182
183 if(type != null) {
184 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
185 ContentHandler contentHandler;
186
187 switch(type) {
188 case KEYNOTE:
189 contentHandler = new KeynoteContentHandler(xhtml, metadata);
190 break;
191 case NUMBERS:
192 contentHandler = new NumbersContentHandler(xhtml, metadata);
193 break;
194 case PAGES:
195 contentHandler = new PagesContentHandler(xhtml, metadata);
196 break;
197 case ENCRYPTED:
198 // We can't do anything for the file right now
199 contentHandler = null;
200 break;
201 default:
202 throw new TikaException("Unhandled iWorks file " + type);
203 }
204
205 metadata.add(Metadata.CONTENT_TYPE, type.getType().toString());
206 xhtml.startDocument();
207 if (contentHandler != null) {
208 context.getSAXParser().parse(
209 new CloseShieldInputStream(entryStream),
210 new OfflineContentHandler(contentHandler)
211 );
212 }
213 xhtml.endDocument();
214 }
215
216 entry = zip.getNextZipEntry();
217 }
218 zip.close();
219 }
220
221 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.iwork;
17
18 import org.apache.tika.metadata.Metadata;
19 import org.apache.tika.metadata.TikaCoreProperties;
20 import org.apache.tika.sax.XHTMLContentHandler;
21 import org.xml.sax.Attributes;
22 import org.xml.sax.SAXException;
23 import org.xml.sax.helpers.DefaultHandler;
24
25 class KeynoteContentHandler extends DefaultHandler {
26
27 public final static String PRESENTATION_WIDTH = "slides-width";
28 public final static String PRESENTATION_HEIGHT = "slides-height";
29
30 private final XHTMLContentHandler xhtml;
31 private final Metadata metadata;
32
33 private boolean inSlide = false;
34 private boolean inTheme = false;
35 private boolean inTitle = false;
36 private boolean inBody = false;
37 private String tableId;
38 private Integer numberOfColumns = null;
39 private Integer currentColumn = null;
40
41 private boolean inMetadata = false;
42 private boolean inMetaDataTitle = false;
43 private boolean inMetaDataAuthors = false;
44
45 private boolean inParsableText = false;
46
47 private int numberOfSlides = 0;
48
49 KeynoteContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
50 this.xhtml = xhtml;
51 this.metadata = metadata;
52 }
53
54 @Override
55 public void endDocument() throws SAXException {
56 metadata.set(Metadata.SLIDE_COUNT, String.valueOf(numberOfSlides));
57 }
58
59 @Override
60 public void startElement(
61 String uri, String localName, String qName, Attributes attributes)
62 throws SAXException {
63 if ("key:theme".equals(qName)) {
64 inTheme = true;
65 } else if ("key:slide".equals(qName)) {
66 inSlide = true;
67 numberOfSlides++;
68 xhtml.startElement("div");
69 } else if ("key:master-slide".equals(qName)) {
70 inSlide = true;
71 xhtml.startElement("div");
72 } else if ("key:title-placeholder".equals(qName) && inSlide) {
73 inTitle = true;
74 xhtml.startElement("h1");
75 } else if ("sf:sticky-note".equals(qName) && inSlide) {
76 xhtml.startElement("p");
77 } else if ("key:notes".equals(qName) && inSlide) {
78 xhtml.startElement("p");
79 } else if ("key:body-placeholder".equals(qName) && inSlide) {
80 xhtml.startElement("p");
81 inBody = true;
82 } else if ("key:size".equals(qName) && !inTheme) {
83 String width = attributes.getValue("sfa:w");
84 String height = attributes.getValue("sfa:h");
85 metadata.set(PRESENTATION_WIDTH, width);
86 metadata.set(PRESENTATION_HEIGHT, height);
87 } else if ("sf:text-body".equals(qName)) {
88 inParsableText = true;
89 } else if ("key:metadata".equals(qName)) {
90 inMetadata = true;
91 } else if (inMetadata && "key:title".equals(qName)) {
92 inMetaDataTitle = true;
93 } else if (inMetadata && "key:authors".equals(qName)) {
94 inMetaDataAuthors = true;
95 } else if (inMetaDataTitle && "key:string".equals(qName)) {
96 metadata.set(TikaCoreProperties.TITLE, attributes.getValue("sfa:string"));
97 } else if (inMetaDataAuthors && "key:string".equals(qName)) {
98 metadata.add(TikaCoreProperties.CREATOR, attributes.getValue("sfa:string"));
99 } else if (inSlide && "sf:tabular-model".equals(qName)) {
100 tableId = attributes.getValue("sfa:ID");
101 xhtml.startElement("table");
102 } else if (tableId != null && "sf:columns".equals(qName)) {
103 numberOfColumns = Integer.parseInt(attributes.getValue("sf:count"));
104 currentColumn = 0;
105 } else if (tableId != null && "sf:ct".equals(qName)) {
106 parseTableData(attributes.getValue("sfa:s"));
107 } else if (tableId != null && "sf:n".equals(qName)) {
108 parseTableData(attributes.getValue("sf:v"));
109 } else if ("sf:p".equals(qName)) {
110 xhtml.startElement("p");
111 }
112 }
113
114 @Override
115 public void endElement(String uri, String localName, String qName)
116 throws SAXException {
117 if ("key:theme".equals(qName)) {
118 inTheme = false;
119 } else if ("key:slide".equals(qName)) {
120 inSlide = false;
121 xhtml.endElement("div");
122 } else if ("key:master-slide".equals(qName)) {
123 inSlide = false;
124 xhtml.endElement("div");
125 } else if ("key:title-placeholder".equals(qName) && inSlide) {
126 inTitle = false;
127 xhtml.endElement("h1");
128 } else if ("sf:sticky-note".equals(qName) && inSlide) {
129 xhtml.endElement("p");
130 } else if ("key:notes".equals(qName) && inSlide) {
131 xhtml.endElement("p");
132 } else if ("key:body-placeholder".equals(qName) && inSlide) {
133 xhtml.endElement("p");
134 inBody = false;
135 } else if ("sf:text-body".equals(qName)) {
136 inParsableText = false;
137 } else if ("key:metadata".equals(qName)) {
138 inMetadata = false;
139 } else if (inMetadata && "key:title".equals(qName)) {
140 inMetaDataTitle = false;
141 } else if (inMetadata && "key:authors".equals(qName)) {
142 inMetaDataAuthors = false;
143 } else if (inSlide && "sf:tabular-model".equals(qName)) {
144 xhtml.endElement("table");
145 tableId = null;
146 numberOfColumns = null;
147 currentColumn = null;
148 } else if ("sf:p".equals(qName)) {
149 xhtml.endElement("p");
150 }
151 }
152
153 @Override
154 public void characters(char[] ch, int start, int length)
155 throws SAXException {
156 if (inParsableText && inSlide && length != 0) {
157 xhtml.characters(ch, start, length);
158 }
159 }
160
161 private void parseTableData(String value) throws SAXException {
162 if (currentColumn == 0) {
163 xhtml.startElement("tr");
164 }
165
166 xhtml.element("td", value);
167
168 if (currentColumn.equals(numberOfColumns)) {
169 xhtml.endElement("tr");
170 }
171 }
172
173 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.iwork;
17
18 import org.apache.tika.metadata.Metadata;
19 import org.apache.tika.metadata.Property;
20 import org.apache.tika.metadata.TikaCoreProperties;
21 import org.apache.tika.sax.XHTMLContentHandler;
22 import org.xml.sax.Attributes;
23 import org.xml.sax.SAXException;
24 import org.xml.sax.helpers.DefaultHandler;
25
26 import java.util.HashMap;
27 import java.util.Map;
28
29 class NumbersContentHandler extends DefaultHandler {
30
31 private final XHTMLContentHandler xhtml;
32 private final Metadata metadata;
33
34 private boolean inSheet = false;
35
36 private boolean inText = false;
37 private boolean parseText = false;
38
39 private boolean inMetadata = false;
40 private Property metadataKey;
41 private String metadataPropertyQName;
42
43 private boolean inTable = false;
44 private int numberOfSheets = 0;
45 private int numberOfColumns = -1;
46 private int currentColumn = 0;
47
48 private Map<String, String> menuItems = new HashMap<String, String>();
49 private String currentMenuItemId;
50
51 NumbersContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
52 this.xhtml = xhtml;
53 this.metadata = metadata;
54 }
55
56 @Override
57 public void endDocument() throws SAXException {
58 metadata.set(Metadata.PAGE_COUNT, String.valueOf(numberOfSheets));
59 }
60
61 @Override
62 public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
63 if ("ls:workspace".equals(qName)) {
64 inSheet = true;
65 numberOfSheets++;
66 xhtml.startElement("div");
67 String sheetName = attributes.getValue("ls:workspace-name");
68 metadata.add("sheetNames", sheetName);
69 }
70
71 if ("sf:text".equals(qName)) {
72 inText = true;
73 xhtml.startElement("p");
74 }
75
76 if ("sf:p".equals(qName)) {
77 parseText = true;
78 }
79
80 if ("sf:metadata".equals(qName)) {
81 inMetadata = true;
82 return;
83 }
84
85 if (inMetadata && metadataKey == null) {
86 metadataKey = resolveMetadataKey(localName);
87 metadataPropertyQName = qName;
88 }
89
90 if (inMetadata && metadataKey != null && "sf:string".equals(qName)) {
91 metadata.add(metadataKey, attributes.getValue("sfa:string"));
92 }
93
94 if (!inSheet) {
95 return;
96 }
97
98 if ("sf:tabular-model".equals(qName)) {
99 String tableName = attributes.getValue("sf:name");
100 xhtml.startElement("div");
101 xhtml.characters(tableName);
102 xhtml.endElement("div");
103 inTable = true;
104 xhtml.startElement("table");
105 xhtml.startElement("tr");
106 currentColumn = 0;
107 }
108
109 if ("sf:menu-choices".equals(qName)) {
110 menuItems = new HashMap<String, String>();
111 }
112
113 if (inTable && "sf:grid".equals(qName)) {
114 numberOfColumns = Integer.parseInt(attributes.getValue("sf:numcols"));
115 }
116
117 if (menuItems != null && "sf:t".equals(qName)) {
118 currentMenuItemId = attributes.getValue("sfa:ID");
119 }
120
121 if (currentMenuItemId != null && "sf:ct".equals(qName)) {
122 menuItems.put(currentMenuItemId, attributes.getValue("sfa:s"));
123 }
124
125 if (inTable && "sf:ct".equals(qName)) {
126 if (currentColumn >= numberOfColumns) {
127 currentColumn = 0;
128 xhtml.endElement("tr");
129 xhtml.startElement("tr");
130 }
131
132 xhtml.element("td", attributes.getValue("sfa:s"));
133 currentColumn++;
134 }
135
136 if (inTable && ("sf:n".equals(qName) || "sf:rn".equals(qName))) {
137 if (currentColumn >= numberOfColumns) {
138 currentColumn = 0;
139 xhtml.endElement("tr");
140 xhtml.startElement("tr");
141 }
142
143 xhtml.element("td", attributes.getValue("sf:v"));
144 currentColumn++;
145 }
146
147 if (inTable && "sf:proxied-cell-ref".equals(qName)) {
148 if (currentColumn >= numberOfColumns) {
149 currentColumn = 0;
150 xhtml.endElement("tr");
151 xhtml.startElement("tr");
152 }
153
154 xhtml.element("td", menuItems.get(attributes.getValue("sfa:IDREF")));
155 currentColumn++;
156 }
157
158 if ("sf:chart-name".equals(qName)) {
159 // Extract chart name:
160 xhtml.startElement("div", "class", "chart");
161 xhtml.startElement("h1");
162 xhtml.characters(attributes.getValue("sfa:string"));
163 xhtml.endElement("h1");
164 xhtml.endElement("div");
165 }
166 }
167
168 @Override
169 public void characters(char[] ch, int start, int length) throws SAXException {
170 if (parseText && length > 0) {
171 xhtml.characters(ch, start, length);
172 }
173 }
174
175 @Override
176 public void endElement(String uri, String localName, String qName) throws SAXException {
177 if ("ls:workspace".equals(qName)) {
178 inSheet = false;
179 xhtml.endElement("div");
180 }
181
182 if ("sf:text".equals(qName)) {
183 inText = false;
184 xhtml.endElement("p");
185 }
186
187 if ("sf:p".equals(qName)) {
188 parseText = false;
189 }
190
191 if ("sf:metadata".equals(qName)) {
192 inMetadata = false;
193 }
194
195 if (inMetadata && qName.equals(metadataPropertyQName)) {
196 metadataPropertyQName = null;
197 metadataKey = null;
198 }
199
200 if (!inSheet) {
201 return;
202 }
203
204 if ("sf:menu-choices".equals(qName)) {
205 }
206
207 if ("sf:tabular-model".equals(qName)) {
208 inTable = false;
209 xhtml.endElement("tr");
210 xhtml.endElement("table");
211 }
212
213 if (currentMenuItemId != null && "sf:t".equals(qName)) {
214 currentMenuItemId = null;
215 }
216 }
217
218 private Property resolveMetadataKey(String localName) {
219 if ("authors".equals(localName)) {
220 return TikaCoreProperties.CREATOR;
221 }
222 if ("title".equals(localName)) {
223 return TikaCoreProperties.TITLE;
224 }
225 if ("comment".equals(localName)) {
226 return TikaCoreProperties.COMMENTS;
227 }
228 return Property.internalText(localName);
229 }
230 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.iwork;
17
18 import org.apache.tika.metadata.Metadata;
19 import org.apache.tika.metadata.Property;
20 import org.apache.tika.metadata.TikaCoreProperties;
21 import org.apache.tika.sax.XHTMLContentHandler;
22 import org.xml.sax.Attributes;
23 import org.xml.sax.SAXException;
24 import org.xml.sax.helpers.DefaultHandler;
25
26 import java.util.ArrayList;
27 import java.util.HashMap;
28 import java.util.List;
29 import java.util.Map;
30 import java.util.regex.Pattern;
31
32 class PagesContentHandler extends DefaultHandler {
33
34 private final XHTMLContentHandler xhtml;
35 private final Metadata metadata;
36
37 /** The (interesting) part of the document we're in. Should be more structured... */
38 private enum DocumentPart {
39 METADATA, PARSABLE_TEXT,
40 HEADERS, HEADER_ODD, HEADER_EVEN, HEADER_FIRST,
41 FOOTERS, FOOTER_ODD, FOOTER_EVEN, FOOTER_FIRST,
42 FOOTNOTES, ANNOTATIONS;
43 }
44 private DocumentPart inPart = null;
45 private boolean ghostText;
46
47 private static String alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
48
49 private boolean parseProperty = false;
50 private int pageCount = 0;
51 private int slPageCount = 0;
52
53 private HeaderFooter headers = null;
54 private HeaderFooter footers = null;
55 private Footnotes footnotes = null;
56 private Annotations annotations = null;
57
58 private Map<String, List<List<String>>> tableData =
59 new HashMap<String, List<List<String>>>();
60 private String activeTableId;
61 private int numberOfColumns = 0;
62 private List<String> activeRow = new ArrayList<String>();
63
64 private String metaDataLocalName;
65 private String metaDataQName;
66
/**
 * Creates a handler that writes to the given XHTML handler and metadata.
 *
 * @param xhtml    target of the generated XHTML events
 * @param metadata target of the extracted document properties
 */
PagesContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
    this.metadata = metadata;
    this.xhtml = xhtml;
}
71
72 @Override
73 public void endDocument() throws SAXException {
74 metadata.set(Metadata.PAGE_COUNT, String.valueOf(pageCount));
75 if (pageCount > 0) {
76 doFooter();
77 xhtml.endElement("div");
78 }
79 }
80
81 @Override
82 public void startElement(
83 String uri, String localName, String qName, Attributes attributes)
84 throws SAXException {
85 if (parseProperty) {
86 String value = parsePrimitiveElementValue(qName, attributes);
87 if (value != null) {
88 Object metaDataKey = resolveMetaDataKey(metaDataLocalName);
89 if(metaDataKey instanceof Property) {
90 metadata.set((Property)metaDataKey, value);
91 } else {
92 metadata.add((String)metaDataKey, value);
93 }
94 }
95 }
96
97 if ("sl:publication-info".equals(qName)) {
98 inPart = DocumentPart.METADATA;
99 } else if ("sf:metadata".equals(qName)) {
100 inPart = DocumentPart.METADATA;
101 } else if ("sf:page-start".equals(qName) || "sl:page-group".equals(qName)) {
102 if (pageCount > 0) {
103 doFooter();
104 xhtml.endElement("div");
105 }
106 xhtml.startElement("div");
107 if ("sl:page-group".equals(qName)) {
108 slPageCount++;
109 } else {
110 pageCount++;
111 }
112 doHeader();
113 } else if ("sf:p".equals(qName)) {
114 if (pageCount+slPageCount > 0) {
115 inPart = DocumentPart.PARSABLE_TEXT;
116 xhtml.startElement("p");
117 }
118 } else if ("sf:attachment".equals(qName)) {
119 String kind = attributes.getValue("sf:kind");
120 if ("tabular-attachment".equals(kind)) {
121 activeTableId = attributes.getValue("sfa:ID");
122 tableData.put(activeTableId, new ArrayList<List<String>>());
123 }
124 } else if ("sf:attachment-ref".equals(qName)) {
125 String idRef = attributes.getValue("sfa:IDREF");
126 outputTable(idRef);
127 } else if ("sf:headers".equals(qName)) {
128 headers = new HeaderFooter(qName);
129 inPart = DocumentPart.HEADERS;
130 } else if ("sf:footers".equals(qName)) {
131 footers = new HeaderFooter(qName);
132 inPart = DocumentPart.FOOTERS;
133 } else if ("sf:header".equals(qName)) {
134 inPart = headers.identifyPart(attributes.getValue("sf:name"));
135 } else if ("sf:footer".equals(qName)) {
136 inPart = footers.identifyPart(attributes.getValue("sf:name"));
137 } else if ("sf:page-number".equals(qName)) {
138 if (inPart == DocumentPart.FOOTER_ODD
139 || inPart == DocumentPart.FOOTER_FIRST
140 || inPart == DocumentPart.FOOTER_EVEN) {
141 // We are in a footer
142 footers.hasAutoPageNumber = true;
143 footers.autoPageNumberFormat = attributes.getValue("sf:format");
144 } else {
145 headers.hasAutoPageNumber = true;
146 headers.autoPageNumberFormat = attributes.getValue("sf:format");
147 }
148
149 xhtml.characters(Integer.toString(this.pageCount));
150 } else if ("sf:footnotes".equals(qName)) {
151 footnotes = new Footnotes();
152 inPart = DocumentPart.FOOTNOTES;
153 } else if ("sf:footnote-mark".equals(qName)) {
154 footnotes.recordMark(attributes.getValue("sf:mark"));
155 } else if ("sf:footnote".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
156 // What about non auto-numbered?
157 String footnoteMark = attributes.getValue("sf:autonumber");
158 if (footnotes != null) {
159 String footnoteText = footnotes.footnotes.get(footnoteMark);
160 if (footnoteText != null) {
161 xhtml.startElement("div", "style", "footnote");
162 xhtml.characters("Footnote:" ); // As shown in Pages
163 xhtml.characters(footnoteText);
164 xhtml.endElement("div");
165 }
166 }
167 } else if ("sf:annotations".equals(qName)) {
168 annotations = new Annotations();
169 inPart = DocumentPart.ANNOTATIONS;
170 } else if ("sf:annotation".equals(qName) && inPart == DocumentPart.ANNOTATIONS) {
171 annotations.start(attributes.getValue("sf:target"));
172 } else if ("sf:annotation-field".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
173 xhtml.startElement("div", "style", "annotated");
174
175 String annotationText = annotations.annotations.get(attributes.getValue("sfa:ID"));
176 if (annotationText != null) {
177 xhtml.startElement("div", "style", "annotation");
178 xhtml.characters(annotationText);
179 xhtml.endElement("div");
180 }
181 } else if ("sf:ghost-text".equals(qName)) {
182 ghostText = true;
183 }
184
185 if (activeTableId != null) {
186 parseTableData(qName, attributes);
187 }
188
189 if (inPart == DocumentPart.METADATA) {
190 metaDataLocalName = localName;
191 metaDataQName = qName;
192 parseProperty = true;
193 }
194 }
195
196 @Override
197 public void endElement(String uri, String localName, String qName)
198 throws SAXException {
199 if (metaDataLocalName != null && metaDataLocalName.equals(localName)) {
200 metaDataLocalName = null;
201 parseProperty = false;
202 }
203
204 if ("sl:publication-info".equals(qName)) {
205 inPart = null;
206 } else if ("sf:metadata".equals(qName)) {
207 inPart = null;
208 } else if ("sf:p".equals(qName) && (pageCount+slPageCount) > 0) {
209 inPart = null;
210 xhtml.endElement("p");
211 } else if ("sf:attachment".equals(qName)) {
212 activeTableId = null;
213 } else if ("sf:annotation".equals(qName) && inPart == DocumentPart.ANNOTATIONS) {
214 annotations.end();
215 } else if ("sf:annotation-field".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
216 xhtml.endElement("div");
217 } else if ("sf:ghost-text".equals(qName)) {
218 ghostText = false;
219 }
220 }
221
222 @Override
223 public void characters(char[] ch, int start, int length) throws SAXException {
224 if (length > 0) {
225 if (inPart == DocumentPart.PARSABLE_TEXT) {
226 if (!ghostText) {
227 xhtml.characters(ch, start, length);
228 }
229 } else if(inPart != null) {
230 String str = new String(ch, start, length);
231 if (inPart == DocumentPart.HEADER_FIRST) headers.defaultFirst = str;
232 if (inPart == DocumentPart.HEADER_EVEN) headers.defaultEven = str;
233 if (inPart == DocumentPart.HEADER_ODD) headers.defaultOdd = str;
234 if (inPart == DocumentPart.FOOTER_FIRST) footers.defaultFirst = str;
235 if (inPart == DocumentPart.FOOTER_EVEN) footers.defaultEven = str;
236 if (inPart == DocumentPart.FOOTER_ODD) footers.defaultOdd = str;
237 if (inPart == DocumentPart.FOOTNOTES) footnotes.text(str);
238 if (inPart == DocumentPart.ANNOTATIONS) annotations.text(str);
239 }
240 }
241 }
242
243 private void parseTableData(String qName, Attributes attributes) {
244 if ("sf:grid".equals(qName)) {
245 String numberOfColumns = attributes.getValue("sf:numcols");
246 this.numberOfColumns = Integer.parseInt(numberOfColumns);
247 } else if ("sf:ct".equals(qName)) {
248 activeRow.add(attributes.getValue("sfa:s"));
249
250 if (activeRow.size() >= 3) {
251 tableData.get(activeTableId).add(activeRow);
252 activeRow = new ArrayList<String>();
253 }
254 }
255 }
256
257 private void outputTable(String idRef) throws SAXException {
258 List<List<String>> tableData = this.tableData.get(idRef);
259 if (tableData != null) {
260 xhtml.startElement("table");
261 for (List<String> row : tableData) {
262 xhtml.startElement("tr");
263 for (String cell : row) {
264 xhtml.element("td", cell);
265 }
266 xhtml.endElement("tr");
267 }
268 xhtml.endElement("table");
269 }
270 }
271
272 /**
273 * Returns a resolved key that is common in other document types or
274 * returns the specified metaDataLocalName if no common key could be found.
275 * The key could be a simple String key, or could be a {@link Property}
276 *
277 * @param metaDataLocalName The localname of the element containing metadata
278 * @return a resolved key that is common in other document types
279 */
280 private Object resolveMetaDataKey(String metaDataLocalName) {
281 Object metaDataKey = metaDataLocalName;
282 if ("sf:authors".equals(metaDataQName)) {
283 metaDataKey = TikaCoreProperties.CREATOR;
284 } else if ("sf:title".equals(metaDataQName)) {
285 metaDataKey = TikaCoreProperties.TITLE;
286 } else if ("sl:SLCreationDateProperty".equals(metaDataQName)) {
287 metaDataKey = TikaCoreProperties.CREATED;
288 } else if ("sl:SLLastModifiedDateProperty".equals(metaDataQName)) {
289 metaDataKey = Metadata.LAST_MODIFIED;
290 } else if ("sl:language".equals(metaDataQName)) {
291 metaDataKey = TikaCoreProperties.LANGUAGE;
292 }
293 return metaDataKey;
294 }
295
296 /**
297 * Returns the value of a primitive element e.g.:
298 * &lt;sl:number sfa:number="0" sfa:type="f"/&gt; - the number attribute
299 * &lt;sl:string sfa:string="en"/&gt; = the string attribute
300 * <p>
301 * Returns <code>null</code> if the value could not be extracted from
302 * the list of attributes.
303 *
304 * @param qName The fully qualified name of the element containing
305 * the value to extract
306 * @param attributes The list of attributes of which one contains the
307 * value to be extracted
308 * @return the value of a primitive element
309 */
310 private String parsePrimitiveElementValue(
311 String qName, Attributes attributes) {
312 if ("sl:string".equals(qName) || "sf:string".equals(qName)) {
313 return attributes.getValue("sfa:string");
314 } else if ("sl:number".equals(qName)) {
315 return attributes.getValue("sfa:number");
316 } else if ("sl:date".equals(qName)) {
317 return attributes.getValue("sf:val");
318 }
319
320 return null;
321 }
322
323 private void doHeader() throws SAXException {
324 if (headers != null) {
325 headers.output("header");
326 }
327 }
328 private void doFooter() throws SAXException {
329 if (footers != null) {
330 footers.output("footer");
331 }
332 }
333
334 /**
335 * Represents the Headers or Footers in a document
336 */
337 private class HeaderFooter {
338 private String type; // sf:headers or sf:footers
339 private String defaultOdd;
340 private String defaultEven;
341 private String defaultFirst;
342 private boolean hasAutoPageNumber;
343 private String autoPageNumberFormat;
344 // TODO Can there be custom ones?
345
346 private HeaderFooter(String type) {
347 this.type = type;
348 }
349 private DocumentPart identifyPart(String name) {
350 if("SFWPDefaultOddHeaderIdentifier".equals(name))
351 return DocumentPart.HEADER_ODD;
352 if("SFWPDefaultEvenHeaderIdentifier".equals(name))
353 return DocumentPart.HEADER_EVEN;
354 if("SFWPDefaultFirstHeaderIdentifier".equals(name))
355 return DocumentPart.HEADER_FIRST;
356
357 if("SFWPDefaultOddFooterIdentifier".equals(name))
358 return DocumentPart.FOOTER_ODD;
359 if("SFWPDefaultEvenFooterIdentifier".equals(name))
360 return DocumentPart.FOOTER_EVEN;
361 if("SFWPDefaultFirstFooterIdentifier".equals(name))
362 return DocumentPart.FOOTER_FIRST;
363
364 return null;
365 }
366 private void output(String what) throws SAXException {
367 String text = null;
368 if (pageCount == 1 && defaultFirst != null) {
369 text = defaultFirst;
370 } else if (pageCount % 2 == 0 && defaultEven != null) {
371 text = defaultEven;
372 } else {
373 text = defaultOdd;
374 }
375
376 if (text != null) {
377 xhtml.startElement("div", "class", "header");
378 xhtml.characters(text);
379 if (hasAutoPageNumber) {
380 if (autoPageNumberFormat == null) { // raw number
381 xhtml.characters("\t" + pageCount);
382 } else if (autoPageNumberFormat.equals("upper-roman")){
383 xhtml.characters("\t" + AutoPageNumberUtils.asRomanNumerals(pageCount));
384 } else if (autoPageNumberFormat.equals("lower-roman")){
385 xhtml.characters("\t" + AutoPageNumberUtils.asRomanNumeralsLower(pageCount));
386 } else if (autoPageNumberFormat.equals("upper-alpha")){
387 xhtml.characters("\t" + AutoPageNumberUtils.asAlphaNumeric(pageCount));
388 } else if (autoPageNumberFormat.equals("lower-alpha")){
389 xhtml.characters("\t" + AutoPageNumberUtils.asAlphaNumericLower(pageCount));
390 }
391 }
392 xhtml.endElement("div");
393 }
394 }
395 }
396 /**
397 * Represents Footnotes in a document. The way these work
398 * in the file format isn't very clean...
399 */
400 private static class Footnotes {
401 /** Mark -> Text */
402 Map<String,String> footnotes = new HashMap<String, String>();
403 String lastSeenMark = null;
404
405 /**
406 * Normally happens before the text of the mark
407 */
408 private void recordMark(String mark) {
409 lastSeenMark = mark;
410 }
411 private void text(String text) {
412 if (lastSeenMark != null) {
413 if (footnotes.containsKey(lastSeenMark)) {
414 text = footnotes.get(lastSeenMark) + text;
415 }
416 footnotes.put(lastSeenMark, text);
417 }
418 }
419 }
420 /**
421 * Represents Annotations in a document. We currently
422 * just grab all the sf:p text in each one
423 */
424 private class Annotations {
425 /** ID -> Text */
426 Map<String,String> annotations = new HashMap<String, String>();
427 String currentID = null;
428 StringBuffer currentText = null;
429
430 private void start(String id) {
431 currentID = id;
432 currentText = new StringBuffer();
433 }
434 private void text(String text) {
435 if (text != null && text.length() > 0 && currentText != null) {
436 currentText.append(text);
437 }
438 }
439 private void end() {
440 if (currentText.length() > 0) {
441 annotations.put(currentID, currentText.toString());
442 currentID = null;
443 currentText = null;
444 }
445 }
446 }
447
448 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.jpeg;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.util.Collections;
21 import java.util.Set;
22
23 import org.apache.tika.exception.TikaException;
24 import org.apache.tika.io.TemporaryResources;
25 import org.apache.tika.io.TikaInputStream;
26 import org.apache.tika.metadata.Metadata;
27 import org.apache.tika.mime.MediaType;
28 import org.apache.tika.parser.AbstractParser;
29 import org.apache.tika.parser.ParseContext;
30 import org.apache.tika.parser.image.ImageMetadataExtractor;
31 import org.apache.tika.parser.image.xmp.JempboxExtractor;
32 import org.apache.tika.sax.XHTMLContentHandler;
33 import org.xml.sax.ContentHandler;
34 import org.xml.sax.SAXException;
35
36 public class JpegParser extends AbstractParser {
37
38 /** Serial version UID */
39 private static final long serialVersionUID = -1355028253756234603L;
40
41 private static final Set<MediaType> SUPPORTED_TYPES =
42 Collections.singleton(MediaType.image("jpeg"));
43
44 public Set<MediaType> getSupportedTypes(ParseContext context) {
45 return SUPPORTED_TYPES;
46 }
47
48 public void parse(
49 InputStream stream, ContentHandler handler,
50 Metadata metadata, ParseContext context)
51 throws IOException, SAXException, TikaException {
52 TemporaryResources tmp = new TemporaryResources();
53 try {
54 TikaInputStream tis = TikaInputStream.get(stream, tmp);
55 new ImageMetadataExtractor(metadata).parseJpeg(tis.getFile());
56 new JempboxExtractor(metadata).parse(tis);
57 } finally {
58 tmp.dispose();
59 }
60
61 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
62 xhtml.startDocument();
63 xhtml.endDocument();
64 }
65
66 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.mail;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20
21 import org.apache.james.mime4j.MimeException;
22 import org.apache.james.mime4j.codec.DecodeMonitor;
23 import org.apache.james.mime4j.codec.DecoderUtil;
24 import org.apache.james.mime4j.dom.address.Address;
25 import org.apache.james.mime4j.dom.address.AddressList;
26 import org.apache.james.mime4j.dom.address.Mailbox;
27 import org.apache.james.mime4j.dom.address.MailboxList;
28 import org.apache.james.mime4j.dom.field.AddressListField;
29 import org.apache.james.mime4j.dom.field.DateTimeField;
30 import org.apache.james.mime4j.dom.field.MailboxListField;
31 import org.apache.james.mime4j.dom.field.ParsedField;
32 import org.apache.james.mime4j.dom.field.UnstructuredField;
33 import org.apache.james.mime4j.field.LenientFieldParser;
34 import org.apache.james.mime4j.parser.ContentHandler;
35 import org.apache.james.mime4j.stream.BodyDescriptor;
36 import org.apache.james.mime4j.stream.Field;
37 import org.apache.tika.config.TikaConfig;
38 import org.apache.tika.exception.TikaException;
39 import org.apache.tika.metadata.Metadata;
40 import org.apache.tika.metadata.TikaCoreProperties;
41 import org.apache.tika.parser.AutoDetectParser;
42 import org.apache.tika.parser.ParseContext;
43 import org.apache.tika.parser.Parser;
44 import org.apache.tika.sax.BodyContentHandler;
45 import org.apache.tika.sax.EmbeddedContentHandler;
46 import org.apache.tika.sax.XHTMLContentHandler;
47 import org.xml.sax.SAXException;
48
49 /**
50 * Bridge between mime4j's content handler and the generic Sax content handler
51 * used by Tika. See
52 * http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/ContentHandler.html
53 */
54 class MailContentHandler implements ContentHandler {
55
56 private boolean strictParsing = false;
57
58 private XHTMLContentHandler handler;
59 private ParseContext context;
60 private Metadata metadata;
61 private TikaConfig tikaConfig = null;
62
63 private boolean inPart = false;
64
65 MailContentHandler(XHTMLContentHandler xhtml, Metadata metadata, ParseContext context, boolean strictParsing) {
66 this.handler = xhtml;
67 this.context = context;
68 this.metadata = metadata;
69 this.strictParsing = strictParsing;
70 }
71
72 public void body(BodyDescriptor body, InputStream is) throws MimeException,
73 IOException {
74 // Work out the best underlying parser for the part
75 // Check first for a specified AutoDetectParser (which may have a
76 // specific Config), then a recursing parser, and finally the default
77 Parser parser = context.get(AutoDetectParser.class);
78 if (parser == null) {
79 parser = context.get(Parser.class);
80 }
81 if (parser == null) {
82 if (tikaConfig == null) {
83 tikaConfig = context.get(TikaConfig.class);
84 if (tikaConfig == null) {
85 tikaConfig = TikaConfig.getDefaultConfig();
86 }
87 }
88 parser = tikaConfig.getParser();
89 }
90
91 // use a different metadata object
92 // in order to specify the mime type of the
93 // sub part without damaging the main metadata
94
95 Metadata submd = new Metadata();
96 submd.set(Metadata.CONTENT_TYPE, body.getMimeType());
97 submd.set(Metadata.CONTENT_ENCODING, body.getCharset());
98
99 try {
100 BodyContentHandler bch = new BodyContentHandler(handler);
101 parser.parse(is, new EmbeddedContentHandler(bch), submd, context);
102 } catch (SAXException e) {
103 throw new MimeException(e);
104 } catch (TikaException e) {
105 throw new MimeException(e);
106 }
107 }
108
109 public void endBodyPart() throws MimeException {
110 try {
111 handler.endElement("p");
112 handler.endElement("div");
113 } catch (SAXException e) {
114 throw new MimeException(e);
115 }
116 }
117
118 public void endHeader() throws MimeException {
119 }
120
121 public void startMessage() throws MimeException {
122 try {
123 handler.startDocument();
124 } catch (SAXException e) {
125 throw new MimeException(e);
126 }
127 }
128
129 public void endMessage() throws MimeException {
130 try {
131 handler.endDocument();
132 } catch (SAXException e) {
133 throw new MimeException(e);
134 }
135 }
136
137 public void endMultipart() throws MimeException {
138 inPart = false;
139 }
140
141 public void epilogue(InputStream is) throws MimeException, IOException {
142 }
143
144 /**
145 * Header for the whole message or its parts
146 *
147 * @see http
148 * ://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/
149 * Field.html
150 **/
151 public void field(Field field) throws MimeException {
152 // inPart indicates whether these metadata correspond to the
153 // whole message or its parts
154 if (inPart) {
155 return;
156 }
157
158 try {
159 String fieldname = field.getName();
160 ParsedField parsedField = LenientFieldParser.getParser().parse(
161 field, DecodeMonitor.SILENT);
162 if (fieldname.equalsIgnoreCase("From")) {
163 MailboxListField fromField = (MailboxListField) parsedField;
164 MailboxList mailboxList = fromField.getMailboxList();
165 if (fromField.isValidField() && mailboxList != null) {
166 for (int i = 0; i < mailboxList.size(); i++) {
167 String from = getDisplayString(mailboxList.get(i));
168 metadata.add(Metadata.MESSAGE_FROM, from);
169 metadata.add(TikaCoreProperties.CREATOR, from);
170 }
171 } else {
172 String from = stripOutFieldPrefix(field, "From:");
173 if (from.startsWith("<")) {
174 from = from.substring(1);
175 }
176 if (from.endsWith(">")) {
177 from = from.substring(0, from.length() - 1);
178 }
179 metadata.add(Metadata.MESSAGE_FROM, from);
180 metadata.add(TikaCoreProperties.CREATOR, from);
181 }
182 } else if (fieldname.equalsIgnoreCase("Subject")) {
183 metadata.add(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_TITLE,
184 ((UnstructuredField) parsedField).getValue());
185 } else if (fieldname.equalsIgnoreCase("To")) {
186 processAddressList(parsedField, "To:", Metadata.MESSAGE_TO);
187 } else if (fieldname.equalsIgnoreCase("CC")) {
188 processAddressList(parsedField, "Cc:", Metadata.MESSAGE_CC);
189 } else if (fieldname.equalsIgnoreCase("BCC")) {
190 processAddressList(parsedField, "Bcc:", Metadata.MESSAGE_BCC);
191 } else if (fieldname.equalsIgnoreCase("Date")) {
192 DateTimeField dateField = (DateTimeField) parsedField;
193 metadata.set(TikaCoreProperties.CREATED, dateField.getDate());
194 }
195 } catch (RuntimeException me) {
196 if (strictParsing) {
197 throw me;
198 }
199 }
200 }
201
202 private void processAddressList(ParsedField field, String addressListType,
203 String metadataField) throws MimeException {
204 AddressListField toField = (AddressListField) field;
205 if (toField.isValidField()) {
206 AddressList addressList = toField.getAddressList();
207 for (int i = 0; i < addressList.size(); ++i) {
208 metadata.add(metadataField, getDisplayString(addressList.get(i)));
209 }
210 } else {
211 String to = stripOutFieldPrefix(field,
212 addressListType);
213 for (String eachTo : to.split(",")) {
214 metadata.add(metadataField, eachTo.trim());
215 }
216 }
217 }
218
219 private String getDisplayString(Address address) {
220 if (address instanceof Mailbox) {
221 Mailbox mailbox = (Mailbox) address;
222 String name = mailbox.getName();
223 if (name != null && name.length() > 0) {
224 name = DecoderUtil.decodeEncodedWords(name, DecodeMonitor.SILENT);
225 return name + " <" + mailbox.getAddress() + ">";
226 } else {
227 return mailbox.getAddress();
228 }
229 } else {
230 return address.toString();
231 }
232 }
233
234 public void preamble(InputStream is) throws MimeException, IOException {
235 }
236
237 public void raw(InputStream is) throws MimeException, IOException {
238 }
239
240 public void startBodyPart() throws MimeException {
241 try {
242 handler.startElement("div", "class", "email-entry");
243 handler.startElement("p");
244 } catch (SAXException e) {
245 throw new MimeException(e);
246 }
247 }
248
249 public void startHeader() throws MimeException {
250 // TODO Auto-generated method stub
251
252 }
253
254 public void startMultipart(BodyDescriptor descr) throws MimeException {
255 inPart = true;
256 }
257
258 private String stripOutFieldPrefix(Field field, String fieldname) {
259 String temp = field.getRaw().toString();
260 int loc = fieldname.length();
261 while (temp.charAt(loc) ==' ') {
262 loc++;
263 }
264 return temp.substring(loc);
265 }
266
267 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.mail;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.util.Collections;
21 import java.util.Set;
22
23 import org.apache.james.mime4j.MimeException;
24 import org.apache.james.mime4j.parser.MimeStreamParser;
25 import org.apache.james.mime4j.stream.MimeConfig;
26 import org.apache.tika.exception.TikaException;
27 import org.apache.tika.io.TaggedInputStream;
28 import org.apache.tika.metadata.Metadata;
29 import org.apache.tika.mime.MediaType;
30 import org.apache.tika.parser.AbstractParser;
31 import org.apache.tika.parser.ParseContext;
32 import org.apache.tika.sax.XHTMLContentHandler;
33 import org.xml.sax.ContentHandler;
34 import org.xml.sax.SAXException;
35
36 /**
37 * Uses apache-mime4j to parse emails. Each part is treated with the
38 * corresponding parser and displayed within elements.
39 * <p>
40 * A {@link MimeEntityConfig} object can be passed in the parsing context
41 * to better control the parsing process.
42 *
43 * @author jnioche@digitalpebble.com
44 */
45 public class RFC822Parser extends AbstractParser {
46
47 /** Serial version UID */
48 private static final long serialVersionUID = -5504243905998074168L;
49
50 private static final Set<MediaType> SUPPORTED_TYPES = Collections
51 .singleton(MediaType.parse("message/rfc822"));
52
53 public Set<MediaType> getSupportedTypes(ParseContext context) {
54 return SUPPORTED_TYPES;
55 }
56
57 public void parse(InputStream stream, ContentHandler handler,
58 Metadata metadata, ParseContext context) throws IOException,
59 SAXException, TikaException {
60 // Get the mime4j configuration, or use a default one
61 MimeConfig config = new MimeConfig();
62 config.setMaxLineLen(100000);
63 config.setMaxHeaderLen(100000); // max length of any individual header
64 config = context.get(MimeConfig.class, config);
65
66 MimeStreamParser parser = new MimeStreamParser(config);
67 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
68
69 MailContentHandler mch = new MailContentHandler(
70 xhtml, metadata, context, config.isStrictParsing());
71 parser.setContentHandler(mch);
72 parser.setContentDecoding(true);
73 TaggedInputStream tagged = TaggedInputStream.get(stream);
74 try {
75 parser.parse(tagged);
76 } catch (IOException e) {
77 tagged.throwIfCauseOf(e);
78 throw new TikaException("Failed to parse an email message", e);
79 } catch (MimeException e) {
80 // Unwrap the exception in case it was not thrown by mime4j
81 Throwable cause = e.getCause();
82 if (cause instanceof TikaException) {
83 throw (TikaException) cause;
84 } else if (cause instanceof SAXException) {
85 throw (SAXException) cause;
86 } else {
87 throw new TikaException("Failed to parse an email message", e);
88 }
89 }
90 }
91
92 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.mbox;
17
18 import java.io.BufferedReader;
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.io.InputStreamReader;
22 import java.io.UnsupportedEncodingException;
23 import java.text.ParseException;
24 import java.text.SimpleDateFormat;
25 import java.util.Collections;
26 import java.util.Date;
27 import java.util.Locale;
28 import java.util.Set;
29 import java.util.regex.Matcher;
30 import java.util.regex.Pattern;
31
32 import org.apache.tika.exception.TikaException;
33 import org.apache.tika.metadata.Metadata;
34 import org.apache.tika.metadata.TikaCoreProperties;
35 import org.apache.tika.mime.MediaType;
36 import org.apache.tika.parser.AbstractParser;
37 import org.apache.tika.parser.ParseContext;
38 import org.apache.tika.sax.XHTMLContentHandler;
39 import org.xml.sax.ContentHandler;
40 import org.xml.sax.SAXException;
41
42 /**
43 * Mbox (mailbox) parser. This version returns the headers for the first email
44 * via metadata, which means headers from subsequent emails will be lost.
45 */
46 public class MboxParser extends AbstractParser {
47
48 /** Serial version UID */
49 private static final long serialVersionUID = -1762689436731160661L;
50
51 private static final Set<MediaType> SUPPORTED_TYPES =
52 Collections.singleton(MediaType.application("mbox"));
53
54 public static final String MBOX_MIME_TYPE = "application/mbox";
55 public static final String MBOX_RECORD_DIVIDER = "From ";
56 private static final Pattern EMAIL_HEADER_PATTERN = Pattern.compile("([^ ]+):[ \t]*(.*)");
57 private static final Pattern EMAIL_ADDRESS_PATTERN = Pattern.compile("<(.*@.*)>");
58
59 private static final String EMAIL_HEADER_METADATA_PREFIX = "MboxParser-";
60 private static final String EMAIL_FROMLINE_METADATA = EMAIL_HEADER_METADATA_PREFIX + "from";
61
62 private enum ParseStates {
63 START, IN_HEADER, IN_CONTENT
64 }
65
66 public Set<MediaType> getSupportedTypes(ParseContext context) {
67 return SUPPORTED_TYPES;
68 }
69
70 public void parse(
71 InputStream stream, ContentHandler handler,
72 Metadata metadata, ParseContext context)
73 throws IOException, TikaException, SAXException {
74
75 InputStreamReader isr;
76 try {
77 // Headers are going to be 7-bit ascii
78 isr = new InputStreamReader(stream, "US-ASCII");
79 } catch (UnsupportedEncodingException e) {
80 throw new TikaException("US-ASCII is not supported!", e);
81 }
82
83 BufferedReader reader = new BufferedReader(isr);
84
85 metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE);
86 metadata.set(Metadata.CONTENT_ENCODING, "us-ascii");
87
88 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
89 xhtml.startDocument();
90
91 ParseStates parseState = ParseStates.START;
92 String multiLine = null;
93 boolean inQuote = false;
94 int numEmails = 0;
95
96 // We're going to scan, line-by-line, for a line that starts with
97 // "From "
98 for (String curLine = reader.readLine(); curLine != null; curLine = reader.readLine()) {
99 boolean newMessage = curLine.startsWith(MBOX_RECORD_DIVIDER);
100 if (newMessage) {
101 numEmails += 1;
102 }
103
104 switch (parseState) {
105 case START:
106 if (newMessage) {
107 parseState = ParseStates.IN_HEADER;
108 newMessage = false;
109 // Fall through to IN_HEADER
110 } else {
111 break;
112 }
113
114 case IN_HEADER:
115 if (newMessage) {
116 saveHeaderInMetadata(numEmails, metadata, multiLine);
117 multiLine = curLine;
118 } else if (curLine.length() == 0) {
119 // Blank line is signal that we're transitioning to the content.
120 saveHeaderInMetadata(numEmails, metadata, multiLine);
121 parseState = ParseStates.IN_CONTENT;
122
123 // Mimic what PackageParser does between entries.
124 xhtml.startElement("div", "class", "email-entry");
125 xhtml.startElement("p");
126 inQuote = false;
127 } else if (curLine.startsWith(" ") || curLine.startsWith("\t")) {
128 multiLine += " " + curLine.trim();
129 } else {
130 saveHeaderInMetadata(numEmails, metadata, multiLine);
131 multiLine = curLine;
132 }
133
134 break;
135
136 // TODO - use real email parsing support so we can correctly handle
137 // things like multipart messages and quoted-printable encoding.
138 // We'd also want this for charset handling, where content isn't 7-bit
139 // ascii.
140 case IN_CONTENT:
141 if (newMessage) {
142 endMessage(xhtml, inQuote);
143 parseState = ParseStates.IN_HEADER;
144 multiLine = curLine;
145 } else {
146 boolean quoted = curLine.startsWith(">");
147 if (inQuote) {
148 if (!quoted) {
149 xhtml.endElement("q");
150 inQuote = false;
151 }
152 } else if (quoted) {
153 xhtml.startElement("q");
154 inQuote = true;
155 }
156
157 xhtml.characters(curLine);
158
159 // For plain text email, each line is a real break position.
160 xhtml.element("br", "");
161 }
162 }
163 }
164
165 if (parseState == ParseStates.IN_HEADER) {
166 saveHeaderInMetadata(numEmails, metadata, multiLine);
167 } else if (parseState == ParseStates.IN_CONTENT) {
168 endMessage(xhtml, inQuote);
169 }
170
171 xhtml.endDocument();
172 }
173
174 private void endMessage(XHTMLContentHandler xhtml, boolean inQuote) throws SAXException {
175 if (inQuote) {
176 xhtml.endElement("q");
177 }
178
179 xhtml.endElement("p");
180 xhtml.endElement("div");
181 }
182
183 private void saveHeaderInMetadata(int numEmails, Metadata metadata, String curLine) {
184 if ((curLine == null) || (numEmails > 1)) {
185 return;
186 } else if (curLine.startsWith(MBOX_RECORD_DIVIDER)) {
187 metadata.add(EMAIL_FROMLINE_METADATA, curLine.substring(MBOX_RECORD_DIVIDER.length()));
188 return;
189 }
190
191 Matcher headerMatcher = EMAIL_HEADER_PATTERN.matcher(curLine);
192 if (!headerMatcher.matches()) {
193 return; // ignore malformed header lines
194 }
195
196 String headerTag = headerMatcher.group(1).toLowerCase();
197 String headerContent = headerMatcher.group(2);
198
199 if (headerTag.equalsIgnoreCase("From")) {
200 metadata.set(TikaCoreProperties.CREATOR, headerContent);
201 } else if (headerTag.equalsIgnoreCase("To") ||
202 headerTag.equalsIgnoreCase("Cc") ||
203 headerTag.equalsIgnoreCase("Bcc")) {
204 Matcher address = EMAIL_ADDRESS_PATTERN.matcher(headerContent);
205 if(address.find()) {
206 metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, address.group(1));
207 } else if(headerContent.indexOf('@') > -1) {
208 metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, headerContent);
209 }
210
211 String property = Metadata.MESSAGE_TO;
212 if (headerTag.equalsIgnoreCase("Cc")) {
213 property = Metadata.MESSAGE_CC;
214 } else if (headerTag.equalsIgnoreCase("Bcc")) {
215 property = Metadata.MESSAGE_BCC;
216 }
217 metadata.add(property, headerContent);
218 } else if (headerTag.equalsIgnoreCase("Subject")) {
219 // TODO Move to title in Tika 2.0
220 metadata.add(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_TITLE,
221 headerContent);
222 } else if (headerTag.equalsIgnoreCase("Date")) {
223 try {
224 Date date = parseDate(headerContent);
225 metadata.set(TikaCoreProperties.CREATED, date);
226 } catch (ParseException e) {
227 // ignoring date because format was not understood
228 }
229 } else if (headerTag.equalsIgnoreCase("Message-Id")) {
230 metadata.set(TikaCoreProperties.IDENTIFIER, headerContent);
231 } else if (headerTag.equalsIgnoreCase("In-Reply-To")) {
232 metadata.set(TikaCoreProperties.RELATION, headerContent);
233 } else if (headerTag.equalsIgnoreCase("Content-Type")) {
234 // TODO - key off content-type in headers to
235 // set mapping to use for content and convert if necessary.
236
237 metadata.add(Metadata.CONTENT_TYPE, headerContent);
238 metadata.set(TikaCoreProperties.FORMAT, headerContent);
239 } else {
240 metadata.add(EMAIL_HEADER_METADATA_PREFIX + headerTag, headerContent);
241 }
242 }
243
244 public static Date parseDate(String headerContent) throws ParseException {
245 SimpleDateFormat dateFormat = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.US);
246 return dateFormat.parse(headerContent);
247 }
248
249 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft;
17
18 import java.io.FileNotFoundException;
19 import java.io.IOException;
20
21 import org.apache.commons.logging.Log;
22 import org.apache.commons.logging.LogFactory;
23 import org.apache.poi.poifs.filesystem.DirectoryEntry;
24 import org.apache.poi.poifs.filesystem.DirectoryNode;
25 import org.apache.poi.poifs.filesystem.DocumentEntry;
26 import org.apache.poi.poifs.filesystem.DocumentInputStream;
27 import org.apache.poi.poifs.filesystem.Entry;
28 import org.apache.poi.poifs.filesystem.Ole10Native;
29 import org.apache.poi.poifs.filesystem.Ole10NativeException;
30 import org.apache.tika.config.TikaConfig;
31 import org.apache.tika.detect.Detector;
32 import org.apache.tika.exception.TikaException;
33 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
34 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
35 import org.apache.tika.io.TikaInputStream;
36 import org.apache.tika.metadata.Metadata;
37 import org.apache.tika.mime.MediaType;
38 import org.apache.tika.mime.MimeType;
39 import org.apache.tika.mime.MimeTypeException;
40 import org.apache.tika.mime.MimeTypes;
41 import org.apache.tika.parser.ParseContext;
42 import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
43 import org.apache.tika.parser.pkg.ZipContainerDetector;
44 import org.apache.tika.sax.XHTMLContentHandler;
45 import org.xml.sax.SAXException;
46
47 abstract class AbstractPOIFSExtractor {
48 private final EmbeddedDocumentExtractor extractor;
49 private TikaConfig tikaConfig;
50 private MimeTypes mimeTypes;
51 private Detector detector;
52 private static final Log logger = LogFactory.getLog(AbstractPOIFSExtractor.class);
53
/**
 * Picks up the embedded document extractor from the context, using a
 * parsing one if none was given. The config, mime types and detector
 * are taken from the context as they are, and may be null.
 */
protected AbstractPOIFSExtractor(ParseContext context) {
    EmbeddedDocumentExtractor supplied = context.get(EmbeddedDocumentExtractor.class);
    this.extractor = (supplied != null)
            ? supplied
            : new ParsingEmbeddedDocumentExtractor(context);

    tikaConfig = context.get(TikaConfig.class);
    mimeTypes = context.get(MimeTypes.class);
    detector = context.get(Detector.class);
}
67
68 // Note - these cache, but avoid creating the default TikaConfig if not needed
69 protected TikaConfig getTikaConfig() {
70 if (tikaConfig == null) {
71 tikaConfig = TikaConfig.getDefaultConfig();
72 }
73 return tikaConfig;
74 }
75 protected Detector getDetector() {
76 if (detector != null) return detector;
77
78 detector = getTikaConfig().getDetector();
79 return detector;
80 }
81 protected MimeTypes getMimeTypes() {
82 if (mimeTypes != null) return mimeTypes;
83
84 mimeTypes = getTikaConfig().getMimeRepository();
85 return mimeTypes;
86 }
87
88 protected void handleEmbeddedResource(TikaInputStream resource, String filename,
89 String relationshipID, String mediaType, XHTMLContentHandler xhtml,
90 boolean outputHtml)
91 throws IOException, SAXException, TikaException {
92 try {
93 Metadata metadata = new Metadata();
94 if(filename != null) {
95 metadata.set(Metadata.TIKA_MIME_FILE, filename);
96 metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
97 }
98 if (relationshipID != null) {
99 metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, relationshipID);
100 }
101 if(mediaType != null) {
102 metadata.set(Metadata.CONTENT_TYPE, mediaType);
103 }
104
105 if (extractor.shouldParseEmbedded(metadata)) {
106 extractor.parseEmbedded(resource, xhtml, metadata, outputHtml);
107 }
108 } finally {
109 resource.close();
110 }
111 }
112
113 /**
114 * Handle an office document that's embedded at the POIFS level
115 */
116 protected void handleEmbeddedOfficeDoc(
117 DirectoryEntry dir, XHTMLContentHandler xhtml)
118 throws IOException, SAXException, TikaException {
119
120 // Is it an embedded OLE2 document, or an embedded OOXML document?
121
122 if (dir.hasEntry("Package")) {
123 // It's OOXML (has a ZipFile):
124 Entry ooxml = dir.getEntry("Package");
125
126 TikaInputStream stream = TikaInputStream.get(
127 new DocumentInputStream((DocumentEntry) ooxml));
128 try {
129 ZipContainerDetector detector = new ZipContainerDetector();
130 MediaType type = detector.detect(stream, new Metadata());
131 handleEmbeddedResource(stream, null, dir.getName(), type.toString(), xhtml, true);
132 return;
133 } finally {
134 stream.close();
135 }
136 }
137
138 // It's regular OLE2:
139
140 // What kind of document is it?
141 Metadata metadata = new Metadata();
142 metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, dir.getName());
143 POIFSDocumentType type = POIFSDocumentType.detectType(dir);
144 TikaInputStream embedded = null;
145
146 try {
147 if (type == POIFSDocumentType.OLE10_NATIVE) {
148 try {
149 // Try to un-wrap the OLE10Native record:
150 Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode)dir);
151 metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '/' + ole.getLabel());
152
153 byte[] data = ole.getDataBuffer();
154 embedded = TikaInputStream.get(data);
155 } catch (Ole10NativeException ex) {
156 // Not a valid OLE10Native record, skip it
157 } catch (Exception e) {
158 logger.warn("Ignoring unexpected exception while parsing possible OLE10_NATIVE embedded document " + dir.getName(), e);
159 }
160 } else if (type == POIFSDocumentType.COMP_OBJ) {
161 try {
162 // Grab the contents and process
163 DocumentEntry contentsEntry;
164 try {
165 contentsEntry = (DocumentEntry)dir.getEntry("CONTENTS");
166 } catch (FileNotFoundException ioe) {
167 contentsEntry = (DocumentEntry)dir.getEntry("Contents");
168 }
169 DocumentInputStream inp = new DocumentInputStream(contentsEntry);
170 byte[] contents = new byte[contentsEntry.getSize()];
171 inp.readFully(contents);
172 embedded = TikaInputStream.get(contents);
173
174 // Try to work out what it is
175 MediaType mediaType = getDetector().detect(embedded, new Metadata());
176 String extension = type.getExtension();
177 try {
178 MimeType mimeType = getMimeTypes().forName(mediaType.toString());
179 extension = mimeType.getExtension();
180 } catch(MimeTypeException mte) {
181 // No details on this type are known
182 }
183
184 // Record what we can do about it
185 metadata.set(Metadata.CONTENT_TYPE, mediaType.getType().toString());
186 metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + extension);
187 } catch(Exception e) {
188 throw new TikaException("Invalid embedded resource", e);
189 }
190 } else {
191 metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
192 metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());
193 }
194
195 // Should we parse it?
196 if (extractor.shouldParseEmbedded(metadata)) {
197 if (embedded == null) {
198 // Make a TikaInputStream that just
199 // passes the root directory of the
200 // embedded document, and is otherwise
201 // empty (byte[0]):
202 embedded = TikaInputStream.get(new byte[0]);
203 embedded.setOpenContainer(dir);
204 }
205 extractor.parseEmbedded(embedded, xhtml, metadata, true);
206 }
207 } finally {
208 if (embedded != null) {
209 embedded.close();
210 }
211 }
212 }
213 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft;
17
18 import org.apache.tika.sax.XHTMLContentHandler;
19 import org.xml.sax.SAXException;
20
21 /**
22 * Cell of content. Classes that implement this interface are used by
23 * Tika parsers (currently just the MS Excel parser) to keep track of
24 * individual pieces of content before they are rendered to the XHTML
25 * SAX event stream.
26 */
27 public interface Cell {
28
29 /**
30 * Renders the content to the given XHTML SAX event stream.
31 *
32 * @param handler
33 * @throws SAXException
34 */
35 void render(XHTMLContentHandler handler) throws SAXException;
36
37 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft;
17
18 import org.apache.tika.sax.XHTMLContentHandler;
19 import org.xml.sax.SAXException;
20
21 /**
22 * Cell decorator.
23 */
24 public class CellDecorator implements Cell {
25
26 private final Cell cell;
27
28 public CellDecorator(Cell cell) {
29 this.cell = cell;
30 }
31
32 public void render(XHTMLContentHandler handler) throws SAXException {
33 cell.render(handler);
34 }
35
36 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft;
17
18 import java.awt.Point;
19 import java.io.IOException;
20 import java.text.NumberFormat;
21 import java.util.ArrayList;
22 import java.util.Comparator;
23 import java.util.List;
24 import java.util.Locale;
25 import java.util.Map;
26 import java.util.SortedMap;
27 import java.util.TreeMap;
28
29 import org.apache.poi.ddf.EscherBSERecord;
30 import org.apache.poi.ddf.EscherBlipRecord;
31 import org.apache.poi.ddf.EscherRecord;
32 import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener;
33 import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
34 import org.apache.poi.hssf.eventusermodel.HSSFListener;
35 import org.apache.poi.hssf.eventusermodel.HSSFRequest;
36 import org.apache.poi.hssf.record.BOFRecord;
37 import org.apache.poi.hssf.record.BoundSheetRecord;
38 import org.apache.poi.hssf.record.CellValueRecordInterface;
39 import org.apache.poi.hssf.record.CountryRecord;
40 import org.apache.poi.hssf.record.DateWindow1904Record;
41 import org.apache.poi.hssf.record.DrawingGroupRecord;
42 import org.apache.poi.hssf.record.EOFRecord;
43 import org.apache.poi.hssf.record.ExtendedFormatRecord;
44 import org.apache.poi.hssf.record.FormatRecord;
45 import org.apache.poi.hssf.record.FormulaRecord;
46 import org.apache.poi.hssf.record.HyperlinkRecord;
47 import org.apache.poi.hssf.record.LabelRecord;
48 import org.apache.poi.hssf.record.LabelSSTRecord;
49 import org.apache.poi.hssf.record.NumberRecord;
50 import org.apache.poi.hssf.record.RKRecord;
51 import org.apache.poi.hssf.record.Record;
52 import org.apache.poi.hssf.record.SSTRecord;
53 import org.apache.poi.hssf.record.StringRecord;
54 import org.apache.poi.hssf.record.TextObjectRecord;
55 import org.apache.poi.hssf.record.chart.SeriesTextRecord;
56 import org.apache.poi.hssf.record.common.UnicodeString;
57 import org.apache.poi.hssf.usermodel.HSSFPictureData;
58 import org.apache.poi.poifs.filesystem.DirectoryEntry;
59 import org.apache.poi.poifs.filesystem.DirectoryNode;
60 import org.apache.poi.poifs.filesystem.DocumentInputStream;
61 import org.apache.poi.poifs.filesystem.Entry;
62 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
63 import org.apache.tika.exception.EncryptedDocumentException;
64 import org.apache.tika.exception.TikaException;
65 import org.apache.tika.io.TikaInputStream;
66 import org.apache.tika.parser.ParseContext;
67 import org.apache.tika.sax.XHTMLContentHandler;
68 import org.xml.sax.SAXException;
69
70 /**
71 * Excel parser implementation which uses POI's Event API
72 * to handle the contents of a Workbook.
73 * <p>
74 * The Event API uses a much smaller memory footprint than
75 * <code>HSSFWorkbook</code> when processing excel files
76 * but at the cost of more complexity.
77 * <p>
78 * With the Event API a <i>listener</i> is registered for
79 * specific record types and those records are created,
80 * fired off to the listener and then discarded as the stream
81 * is being processed.
82 *
83 * @see org.apache.poi.hssf.eventusermodel.HSSFListener
84 * @see <a href="http://poi.apache.org/hssf/how-to.html#event_api">
85 * POI Event API How To</a>
86 */
87 public class ExcelExtractor extends AbstractPOIFSExtractor {
88
89 /**
90 * <code>true</code> if the HSSFListener should be registered
91 * to listen for all records or <code>false</code> (the default)
92 * if the listener should be configured to only receive specified
93 * records.
94 */
95 private boolean listenForAllRecords = false;
96
97 private static final String WORKBOOK_ENTRY = "Workbook";
98
/**
 * Creates an extractor bound to the given parse context, which is
 * handed to the superclass constructor.
 *
 * @param context parse context
 */
public ExcelExtractor(ParseContext context) {
    super(context);
}
102
103 /**
104 * Returns <code>true</code> if this parser is configured to listen
105 * for all records instead of just the specified few.
106 */
107 public boolean isListenForAllRecords() {
108 return listenForAllRecords;
109 }
110
111 /**
112 * Specifies whether this parser should to listen for all
113 * records or just for the specified few.
114 * <p>
115 * <strong>Note:</strong> Under normal operation this setting should
116 * be <code>false</code> (the default), but you can experiment with
117 * this setting for testing and debugging purposes.
118 *
119 * @param listenForAllRecords <code>true</code> if the HSSFListener
120 * should be registered to listen for all records or <code>false</code>
121 * if the listener should be configured to only receive specified records.
122 */
123 public void setListenForAllRecords(boolean listenForAllRecords) {
124 this.listenForAllRecords = listenForAllRecords;
125 }
126
127 /**
128 * Extracts text from an Excel Workbook writing the extracted content
129 * to the specified {@link Appendable}.
130 *
131 * @param filesystem POI file system
132 * @throws IOException if an error occurs processing the workbook
133 * or writing the extracted content
134 */
135 protected void parse(
136 NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml,
137 Locale locale) throws IOException, SAXException, TikaException {
138 parse(filesystem.getRoot(), xhtml, locale);
139 }
140
141 protected void parse(
142 DirectoryNode root, XHTMLContentHandler xhtml,
143 Locale locale) throws IOException, SAXException, TikaException {
144 if (! root.hasEntry(WORKBOOK_ENTRY)) {
145 // Corrupt file / very old file, just skip
146 return;
147 }
148
149 TikaHSSFListener listener = new TikaHSSFListener(xhtml, locale, this);
150 listener.processFile(root, isListenForAllRecords());
151 listener.throwStoredException();
152
153 for (Entry entry : root) {
154 if (entry.getName().startsWith("MBD")
155 && entry instanceof DirectoryEntry) {
156 try {
157 handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml);
158 } catch (TikaException e) {
159 // ignore parse errors from embedded documents
160 }
161 }
162 }
163 }
164
165 // ======================================================================
166
167 /**
168 * HSSF Listener implementation which processes the HSSF records.
169 */
170 private static class TikaHSSFListener implements HSSFListener {
171
172 /**
173 * XHTML content handler to which the document content is rendered.
174 */
175 private final XHTMLContentHandler handler;
176
177 /**
178 * The POIFS Extractor, used for embeded resources.
179 */
180 private final AbstractPOIFSExtractor extractor;
181
182 /**
183 * Potential exception thrown by the content handler. When set to
184 * non-<code>null</code>, causes all subsequent HSSF records to be
185 * ignored and the stored exception to be thrown when
186 * {@link #throwStoredException()} is invoked.
187 */
188 private Exception exception = null;
189
190 private SSTRecord sstRecord;
191 private FormulaRecord stringFormulaRecord;
192
193 private short previousSid;
194
195 /**
196 * Internal <code>FormatTrackingHSSFListener</code> to handle cell
197 * formatting within the extraction.
198 */
199 private FormatTrackingHSSFListener formatListener;
200
201 /**
202 * List of worksheet names.
203 */
204 private List<String> sheetNames = new ArrayList<String>();
205
206 /**
207 * Index of the current worksheet within the workbook.
208 * Used to find the worksheet name in the {@link #sheetNames} list.
209 */
210 private short currentSheetIndex;
211
212 /**
213 * Content of the current worksheet, or <code>null</code> if no
214 * worksheet is currently active.
215 */
216 private SortedMap<Point, Cell> currentSheet = null;
217
218 /**
219 * Extra text or cells that crops up, typically as part of a
220 * worksheet but not always.
221 */
222 private List<Cell> extraTextCells = new ArrayList<Cell>();
223
224 /**
225 * Format for rendering numbers in the worksheet. Currently we just
226 * use the platform default formatting.
227 *
228 * @see <a href="https://issues.apache.org/jira/browse/TIKA-103">TIKA-103</a>
229 */
230 private final NumberFormat format;
231
232 /**
233 * These aren't complete when we first see them, as the
234 * depend on continue records that aren't always
235 * contiguous. Collect them for later processing.
236 */
237 private List<DrawingGroupRecord> drawingGroups = new ArrayList<DrawingGroupRecord>();
238
/**
 * Construct a new listener instance outputting parsed data to
 * the specified XHTML content handler.
 *
 * @param handler Destination to write the parsed output to
 * @param locale locale used for the number format and handed to the
 * format tracking listener
 * @param extractor extractor used to handle embedded resources
 */
private TikaHSSFListener(XHTMLContentHandler handler, Locale locale, AbstractPOIFSExtractor extractor) {
    this.handler = handler;
    this.extractor = extractor;

    // Both formatting helpers follow the requested locale
    this.format = NumberFormat.getInstance(locale);
    this.formatListener = new FormatTrackingHSSFListener(this, locale);
}
251
252 /**
253 * Entry point to listener to start the processing of a file.
254 *
255 * @param filesystem POI file system.
256 * @param listenForAllRecords sets whether the listener is configured to listen
257 * for all records types or not.
258 * @throws IOException on any IO errors.
259 * @throws SAXException on any SAX parsing errors.
260 */
261 public void processFile(NPOIFSFileSystem filesystem, boolean listenForAllRecords)
262 throws IOException, SAXException, TikaException {
263 processFile(filesystem.getRoot(), listenForAllRecords);
264 }
265
266 public void processFile(DirectoryNode root, boolean listenForAllRecords)
267 throws IOException, SAXException, TikaException {
268
269 // Set up listener and register the records we want to process
270 HSSFRequest hssfRequest = new HSSFRequest();
271 if (listenForAllRecords) {
272 hssfRequest.addListenerForAllRecords(formatListener);
273 } else {
274 hssfRequest.addListener(formatListener, BOFRecord.sid);
275 hssfRequest.addListener(formatListener, EOFRecord.sid);
276 hssfRequest.addListener(formatListener, DateWindow1904Record.sid);
277 hssfRequest.addListener(formatListener, CountryRecord.sid);
278 hssfRequest.addListener(formatListener, BoundSheetRecord.sid);
279 hssfRequest.addListener(formatListener, SSTRecord.sid);
280 hssfRequest.addListener(formatListener, FormulaRecord.sid);
281 hssfRequest.addListener(formatListener, LabelRecord.sid);
282 hssfRequest.addListener(formatListener, LabelSSTRecord.sid);
283 hssfRequest.addListener(formatListener, NumberRecord.sid);
284 hssfRequest.addListener(formatListener, RKRecord.sid);
285 hssfRequest.addListener(formatListener, StringRecord.sid);
286 hssfRequest.addListener(formatListener, HyperlinkRecord.sid);
287 hssfRequest.addListener(formatListener, TextObjectRecord.sid);
288 hssfRequest.addListener(formatListener, SeriesTextRecord.sid);
289 hssfRequest.addListener(formatListener, FormatRecord.sid);
290 hssfRequest.addListener(formatListener, ExtendedFormatRecord.sid);
291 hssfRequest.addListener(formatListener, DrawingGroupRecord.sid);
292 }
293
294 // Create event factory and process Workbook (fire events)
295 DocumentInputStream documentInputStream = root.createDocumentInputStream(WORKBOOK_ENTRY);
296 HSSFEventFactory eventFactory = new HSSFEventFactory();
297 try {
298 eventFactory.processEvents(hssfRequest, documentInputStream);
299 } catch (org.apache.poi.EncryptedDocumentException e) {
300 throw new EncryptedDocumentException(e);
301 }
302
303 // Output any extra text that came after all the sheets
304 processExtraText();
305
306 // Look for embeded images, now that the drawing records
307 // have been fully matched with their continue data
308 for(DrawingGroupRecord dgr : drawingGroups) {
309 dgr.decode();
310 findPictures(dgr.getEscherRecords());
311 }
312 }
313
314 /**
315 * Process a HSSF record.
316 *
317 * @param record HSSF Record
318 */
319 public void processRecord(Record record) {
320 if (exception == null) {
321 try {
322 internalProcessRecord(record);
323 } catch (TikaException te) {
324 exception = te;
325 } catch (IOException ie) {
326 exception = ie;
327 } catch (SAXException se) {
328 exception = se;
329 }
330 }
331 }
332
333 public void throwStoredException() throws TikaException, SAXException, IOException {
334 if (exception != null) {
335 if(exception instanceof IOException)
336 throw (IOException)exception;
337 if(exception instanceof SAXException)
338 throw (SAXException)exception;
339 if(exception instanceof TikaException)
340 throw (TikaException)exception;
341 throw new TikaException(exception.getMessage());
342 }
343 }
344
345 private void internalProcessRecord(Record record) throws SAXException, TikaException, IOException {
346 switch (record.getSid()) {
347 case BOFRecord.sid: // start of workbook, worksheet etc. records
348 BOFRecord bof = (BOFRecord) record;
349 if (bof.getType() == BOFRecord.TYPE_WORKBOOK) {
350 currentSheetIndex = -1;
351 } else if (bof.getType() == BOFRecord.TYPE_CHART) {
352 if(previousSid == EOFRecord.sid) {
353 // This is a sheet which contains only a chart
354 newSheet();
355 } else {
356 // This is a chart within a normal sheet
357 // Handling of this is a bit hacky...
358 if (currentSheet != null) {
359 processSheet();
360 currentSheetIndex--;
361 newSheet();
362 }
363 }
364 } else if (bof.getType() == BOFRecord.TYPE_WORKSHEET) {
365 newSheet();
366 }
367 break;
368
369 case EOFRecord.sid: // end of workbook, worksheet etc. records
370 if (currentSheet != null) {
371 processSheet();
372 }
373 currentSheet = null;
374 break;
375
376 case BoundSheetRecord.sid: // Worksheet index record
377 BoundSheetRecord boundSheetRecord = (BoundSheetRecord) record;
378 sheetNames.add(boundSheetRecord.getSheetname());
379 break;
380
381 case SSTRecord.sid: // holds all the strings for LabelSSTRecords
382 sstRecord = (SSTRecord) record;
383 break;
384
385 case FormulaRecord.sid: // Cell value from a formula
386 FormulaRecord formula = (FormulaRecord) record;
387 if (formula.hasCachedResultString()) {
388 // The String itself should be the next record
389 stringFormulaRecord = formula;
390 } else {
391 addTextCell(record, formatListener.formatNumberDateCell(formula));
392 }
393 break;
394
395 case StringRecord.sid:
396 if (previousSid == FormulaRecord.sid) {
397 // Cached string value of a string formula
398 StringRecord sr = (StringRecord) record;
399 addTextCell(stringFormulaRecord, sr.getString());
400 } else {
401 // Some other string not associated with a cell, skip
402 }
403 break;
404
405 case LabelRecord.sid: // strings stored directly in the cell
406 LabelRecord label = (LabelRecord) record;
407 addTextCell(record, label.getValue());
408 break;
409
410 case LabelSSTRecord.sid: // Ref. a string in the shared string table
411 LabelSSTRecord sst = (LabelSSTRecord) record;
412 UnicodeString unicode = sstRecord.getString(sst.getSSTIndex());
413 addTextCell(record, unicode.getString());
414 break;
415
416 case NumberRecord.sid: // Contains a numeric cell value
417 NumberRecord number = (NumberRecord) record;
418 addTextCell(record, formatListener.formatNumberDateCell(number));
419 break;
420
421 case RKRecord.sid: // Excel internal number record
422 RKRecord rk = (RKRecord) record;
423 addCell(record, new NumberCell(rk.getRKNumber(), format));
424 break;
425
426 case HyperlinkRecord.sid: // holds a URL associated with a cell
427 if (currentSheet != null) {
428 HyperlinkRecord link = (HyperlinkRecord) record;
429 Point point =
430 new Point(link.getFirstColumn(), link.getFirstRow());
431 Cell cell = currentSheet.get(point);
432 if (cell != null) {
433 String address = link.getAddress();
434 if (address != null) {
435 addCell(record, new LinkedCell(cell, address));
436 } else {
437 addCell(record, cell);
438 }
439 }
440 }
441 break;
442
443 case TextObjectRecord.sid:
444 TextObjectRecord tor = (TextObjectRecord) record;
445 addTextCell(record, tor.getStr().getString());
446 break;
447
448 case SeriesTextRecord.sid: // Chart label or title
449 SeriesTextRecord str = (SeriesTextRecord) record;
450 addTextCell(record, str.getText());
451 break;
452
453 case DrawingGroupRecord.sid:
454 // Collect this now, we'll process later when all
455 // the continue records are in
456 drawingGroups.add( (DrawingGroupRecord)record );
457 break;
458
459 }
460
461 previousSid = record.getSid();
462
463 if (stringFormulaRecord != record) {
464 stringFormulaRecord = null;
465 }
466 }
467
468 private void processExtraText() throws SAXException {
469 if(extraTextCells.size() > 0) {
470 for(Cell cell : extraTextCells) {
471 handler.startElement("div", "class", "outside");
472 cell.render(handler);
473 handler.endElement("div");
474 }
475
476 // Reset
477 extraTextCells.clear();
478 }
479 }
480
481 /**
482 * Adds the given cell (unless <code>null</code>) to the current
483 * worksheet (if any) at the position (if any) of the given record.
484 *
485 * @param record record that holds the cell value
486 * @param cell cell value (or <code>null</code>)
487 */
488 private void addCell(Record record, Cell cell) throws SAXException {
489 if (cell == null) {
490 // Ignore empty cells
491 } else if (currentSheet != null
492 && record instanceof CellValueRecordInterface) {
493 // Normal cell inside a worksheet
494 CellValueRecordInterface value =
495 (CellValueRecordInterface) record;
496 Point point = new Point(value.getColumn(), value.getRow());
497 currentSheet.put(point, cell);
498 } else {
499 // Cell outside the worksheets
500 extraTextCells.add(cell);
501 }
502 }
503
504 /**
505 * Adds a text cell with the given text comment. The given text
506 * is trimmed, and ignored if <code>null</code> or empty.
507 *
508 * @param record record that holds the text value
509 * @param text text content, may be <code>null</code>
510 * @throws SAXException
511 */
512 private void addTextCell(Record record, String text) throws SAXException {
513 if (text != null) {
514 text = text.trim();
515 if (text.length() > 0) {
516 addCell(record, new TextCell(text));
517 }
518 }
519 }
520
521 private void newSheet() {
522 currentSheetIndex++;
523 currentSheet = new TreeMap<Point, Cell>(new PointComparator());
524 }
525
526 /**
527 * Process an excel sheet.
528 *
529 * @throws SAXException if an error occurs
530 */
531 private void processSheet() throws SAXException {
532 // Sheet Start
533 handler.startElement("div", "class", "page");
534 if (currentSheetIndex < sheetNames.size()) {
535 handler.element("h1", sheetNames.get(currentSheetIndex));
536 }
537 handler.startElement("table");
538 handler.startElement("tbody");
539
540 // Process Rows
541 int currentRow = 0;
542 int currentColumn = 0;
543 handler.startElement("tr");
544 handler.startElement("td");
545 for (Map.Entry<Point, Cell> entry : currentSheet.entrySet()) {
546 while (currentRow < entry.getKey().y) {
547 handler.endElement("td");
548 handler.endElement("tr");
549 handler.startElement("tr");
550 handler.startElement("td");
551 currentRow++;
552 currentColumn = 0;
553 }
554
555 while (currentColumn < entry.getKey().x) {
556 handler.endElement("td");
557 handler.startElement("td");
558 currentColumn++;
559 }
560
561 entry.getValue().render(handler);
562 }
563 handler.endElement("td");
564 handler.endElement("tr");
565
566 // Sheet End
567 handler.endElement("tbody");
568 handler.endElement("table");
569
570 // Finish up
571 processExtraText();
572 handler.endElement("div");
573 }
574
575 private void findPictures(List<EscherRecord> records) throws IOException, SAXException, TikaException {
576 for(EscherRecord escherRecord : records) {
577 if (escherRecord instanceof EscherBSERecord) {
578 EscherBlipRecord blip = ((EscherBSERecord) escherRecord).getBlipRecord();
579 if (blip != null) {
580 HSSFPictureData picture = new HSSFPictureData(blip);
581 String mimeType = picture.getMimeType();
582 TikaInputStream stream = TikaInputStream.get(picture.getData());
583
584 // Handle the embeded resource
585 extractor.handleEmbeddedResource(
586 stream, null, null, mimeType,
587 handler, true
588 );
589 }
590 }
591
592 // Recursive call.
593 findPictures(escherRecord.getChildRecords());
594 }
595 }
596 }
597
598 /**
599 * Utility comparator for points.
600 */
601 private static class PointComparator implements Comparator<Point> {
602
603 public int compare(Point a, Point b) {
604 int diff = a.y - b.y;
605 if (diff == 0) {
606 diff = a.x - b.x;
607 }
608 return diff;
609 }
610
611 }
612
613 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft;
17
18 import java.io.IOException;
19 import java.util.HashSet;
20
21 import org.apache.poi.hslf.HSLFSlideShow;
22 import org.apache.poi.hslf.model.*;
23 import org.apache.poi.hslf.usermodel.ObjectData;
24 import org.apache.poi.hslf.usermodel.PictureData;
25 import org.apache.poi.hslf.usermodel.SlideShow;
26 import org.apache.poi.poifs.filesystem.DirectoryNode;
27 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
28 import org.apache.tika.exception.TikaException;
29 import org.apache.tika.io.TikaInputStream;
30 import org.apache.tika.parser.ParseContext;
31 import org.apache.tika.sax.XHTMLContentHandler;
32 import org.xml.sax.SAXException;
33 import org.xml.sax.helpers.AttributesImpl;
34
35 public class HSLFExtractor extends AbstractPOIFSExtractor {
/**
 * Creates an extractor bound to the given parse context, which is
 * handed to the superclass constructor.
 *
 * @param context parse context
 */
public HSLFExtractor(ParseContext context) {
    super(context);
}
39
40 protected void parse(
41 NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
42 throws IOException, SAXException, TikaException {
43 parse(filesystem.getRoot(), xhtml);
44 }
45
46 protected void parse(
47 DirectoryNode root, XHTMLContentHandler xhtml)
48 throws IOException, SAXException, TikaException {
49 HSLFSlideShow ss = new HSLFSlideShow(root);
50 SlideShow _show = new SlideShow(ss);
51 Slide[] _slides = _show.getSlides();
52
53 xhtml.startElement("div", "class", "slideShow");
54
55 /* Iterate over slides and extract text */
56 for( Slide slide : _slides ) {
57 xhtml.startElement("div", "class", "slide");
58
59 // Slide header, if present
60 HeadersFooters hf = slide.getHeadersFooters();
61 if (hf != null && hf.isHeaderVisible() && hf.getHeaderText() != null) {
62 xhtml.startElement("p", "class", "slide-header");
63
64 xhtml.characters( hf.getHeaderText() );
65
66 xhtml.endElement("p");
67 }
68
69 // Slide master, if present
70 extractMaster(xhtml, slide.getMasterSheet());
71
72 // Slide text
73 {
74 xhtml.startElement("p", "class", "slide-content");
75
76 textRunsToText(xhtml, slide.getTextRuns());
77
78 xhtml.endElement("p");
79 }
80
81 // Table text
82 for (Shape shape: slide.getShapes()){
83 if (shape instanceof Table){
84 extractTableText(xhtml, (Table)shape);
85 }
86 }
87
88 // Slide footer, if present
89 if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) {
90 xhtml.startElement("p", "class", "slide-footer");
91
92 xhtml.characters( hf.getFooterText() );
93
94 xhtml.endElement("p");
95 }
96
97 // Comments, if present
98 for( Comment comment : slide.getComments() ) {
99 xhtml.startElement("p", "class", "slide-comment");
100 if (comment.getAuthor() != null) {
101 xhtml.startElement("b");
102 xhtml.characters( comment.getAuthor() );
103 xhtml.endElement("b");
104
105 if (comment.getText() != null) {
106 xhtml.characters( " - ");
107 }
108 }
109 if (comment.getText() != null) {
110 xhtml.characters( comment.getText() );
111 }
112 xhtml.endElement("p");
113 }
114
115 // Now any embedded resources
116 handleSlideEmbeddedResources(slide, xhtml);
117
118 // TODO Find the Notes for this slide and extract inline
119
120 // Slide complete
121 xhtml.endElement("div");
122 }
123
124 // All slides done
125 xhtml.endElement("div");
126
127 /* notes */
128 xhtml.startElement("div", "class", "slideNotes");
129 HashSet<Integer> seenNotes = new HashSet<Integer>();
130 HeadersFooters hf = _show.getNotesHeadersFooters();
131
132 for (Slide slide : _slides) {
133 Notes notes = slide.getNotesSheet();
134 if (notes == null) {
135 continue;
136 }
137 Integer id = Integer.valueOf(notes._getSheetNumber());
138 if (seenNotes.contains(id)) {
139 continue;
140 }
141 seenNotes.add(id);
142
143 // Repeat the Notes header, if set
144 if (hf != null && hf.isHeaderVisible() && hf.getHeaderText() != null) {
145 xhtml.startElement("p", "class", "slide-note-header");
146 xhtml.characters( hf.getHeaderText() );
147 xhtml.endElement("p");
148 }
149
150 // Notes text
151 textRunsToText(xhtml, notes.getTextRuns());
152
153 // Repeat the notes footer, if set
154 if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) {
155 xhtml.startElement("p", "class", "slide-note-footer");
156 xhtml.characters( hf.getFooterText() );
157 xhtml.endElement("p");
158 }
159 }
160
161 handleSlideEmbeddedPictures(_show, xhtml);
162
163 xhtml.endElement("div");
164 }
165
166 private void extractMaster(XHTMLContentHandler xhtml, MasterSheet master) throws SAXException {
167 if (master == null){
168 return;
169 }
170 Shape[] shapes = master.getShapes();
171 if (shapes == null || shapes.length == 0){
172 return;
173 }
174
175 xhtml.startElement("div", "class", "slide-master-content");
176 for (int i = 0; i < shapes.length; i++){
177 Shape sh = shapes[i];
178 if (sh != null && ! MasterSheet.isPlaceholder(sh)){
179 if (sh instanceof TextShape){
180 TextShape tsh = (TextShape)sh;
181 String text = tsh.getText();
182 if (text != null){
183 xhtml.element("p", text);
184 }
185 }
186 }
187 }
188 xhtml.endElement("div");
189 }
190
191 private void extractTableText(XHTMLContentHandler xhtml, Table shape) throws SAXException {
192 xhtml.startElement("table");
193 for (int row = 0; row < shape.getNumberOfRows(); row++){
194 xhtml.startElement("tr");
195 for (int col = 0; col < shape.getNumberOfColumns(); col++){
196 TableCell cell = shape.getCell(row, col);
197 //insert empty string for empty cell if cell is null
198 String txt = "";
199 if (cell != null){
200 txt = cell.getText();
201 }
202 xhtml.element("td", txt);
203 }
204 xhtml.endElement("tr");
205 }
206 xhtml.endElement("table");
207 }
208
209 private void textRunsToText(XHTMLContentHandler xhtml, TextRun[] runs) throws SAXException {
210 if (runs==null) {
211 return;
212 }
213
214 for (TextRun run : runs) {
215 if (run != null) {
216 // Leaving in wisdom from TIKA-712 for easy revert.
217 // Avoid boiler-plate text on the master slide (0
218 // = TextHeaderAtom.TITLE_TYPE, 1 = TextHeaderAtom.BODY_TYPE):
219 //if (!isMaster || (run.getRunType() != 0 && run.getRunType() != 1)) {
220 String txt = run.getText();
221 if (txt != null){
222 xhtml.characters(txt);
223 xhtml.startElement("br");
224 xhtml.endElement("br");
225 }
226 }
227 }
228 }
229
230 private void handleSlideEmbeddedPictures(SlideShow slideshow, XHTMLContentHandler xhtml)
231 throws TikaException, SAXException, IOException {
232 for (PictureData pic : slideshow.getPictureData()) {
233 String mediaType = null;
234
235 switch (pic.getType()) {
236 case Picture.EMF:
237 mediaType = "application/x-emf";
238 break;
239 case Picture.JPEG:
240 mediaType = "image/jpeg";
241 break;
242 case Picture.PNG:
243 mediaType = "image/png";
244 break;
245 case Picture.WMF:
246 mediaType = "application/x-msmetafile";
247 break;
248 case Picture.DIB:
249 mediaType = "image/bmp";
250 break;
251 }
252
253 handleEmbeddedResource(
254 TikaInputStream.get(pic.getData()), null, null,
255 mediaType, xhtml, false);
256 }
257 }
258
259 private void handleSlideEmbeddedResources(Slide slide, XHTMLContentHandler xhtml)
260 throws TikaException, SAXException, IOException {
261 Shape[] shapes;
262 try {
263 shapes = slide.getShapes();
264 } catch(NullPointerException e) {
265 // Sometimes HSLF hits problems
266 // Please open POI bugs for any you come across!
267 return;
268 }
269
270 for( Shape shape : shapes ) {
271 if( shape instanceof OLEShape ) {
272 OLEShape oleShape = (OLEShape)shape;
273 ObjectData data = null;
274 try {
275 data = oleShape.getObjectData();
276 } catch( NullPointerException e ) {
277 /* getObjectData throws NPE some times. */
278 }
279
280 if (data != null) {
281 String objID = Integer.toString(oleShape.getObjectID());
282
283 // Embedded Object: add a <div
284 // class="embedded" id="X"/> so consumer can see where
285 // in the main text each embedded document
286 // occurred:
287 AttributesImpl attributes = new AttributesImpl();
288 attributes.addAttribute("", "class", "class", "CDATA", "embedded");
289 attributes.addAttribute("", "id", "id", "CDATA", objID);
290 xhtml.startElement("div", attributes);
291 xhtml.endElement("div");
292
293 TikaInputStream stream =
294 TikaInputStream.get(data.getData());
295 try {
296 String mediaType = null;
297 if ("Excel.Chart.8".equals(oleShape.getProgID())) {
298 mediaType = "application/vnd.ms-excel";
299 }
300 handleEmbeddedResource(
301 stream, objID, objID,
302 mediaType, xhtml, false);
303 } finally {
304 stream.close();
305 }
306 }
307 }
308 }
309 }
310 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft;
17
18 import org.apache.tika.sax.XHTMLContentHandler;
19 import org.xml.sax.SAXException;
20
21 /**
22 * Linked cell. This class decorates another content cell with a hyperlink.
23 */
24 public class LinkedCell extends CellDecorator {
25
26 private final String link;
27
28 public LinkedCell(Cell cell, String link) {
29 super(cell);
30 assert link != null;
31 this.link = link;
32 }
33
34 public void render(XHTMLContentHandler handler) throws SAXException {
35 handler.startElement("a", "href", link);
36 super.render(handler);
37 handler.endElement("a");
38 }
39
40 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft;
17
18 import java.text.NumberFormat;
19
20 import org.apache.tika.sax.XHTMLContentHandler;
21 import org.xml.sax.SAXException;
22
23 /**
24 * Number cell.
25 */
26 public class NumberCell implements Cell {
27
28 private final double number;
29
30 private final NumberFormat format;
31
32 public NumberCell(double number, NumberFormat format) {
33 this.number = number;
34 this.format = format;
35 }
36
37 public void render(XHTMLContentHandler handler) throws SAXException {
38 handler.characters(format.format(number));
39 }
40
41 public String toString() {
42 return "Numeric Cell: " + format.format(number);
43 }
44 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.security.GeneralSecurityException;
21 import java.util.Arrays;
22 import java.util.Collections;
23 import java.util.HashSet;
24 import java.util.Locale;
25 import java.util.Set;
26
27 import org.apache.poi.hdgf.extractor.VisioTextExtractor;
28 import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
29 import org.apache.poi.poifs.crypt.Decryptor;
30 import org.apache.poi.poifs.crypt.EncryptionInfo;
31 import org.apache.poi.poifs.filesystem.DirectoryEntry;
32 import org.apache.poi.poifs.filesystem.DirectoryNode;
33 import org.apache.poi.poifs.filesystem.Entry;
34 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
35 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
36 import org.apache.tika.exception.EncryptedDocumentException;
37 import org.apache.tika.exception.TikaException;
38 import org.apache.tika.io.CloseShieldInputStream;
39 import org.apache.tika.io.TikaInputStream;
40 import org.apache.tika.metadata.Metadata;
41 import org.apache.tika.mime.MediaType;
42 import org.apache.tika.parser.AbstractParser;
43 import org.apache.tika.parser.ParseContext;
44 import org.apache.tika.parser.PasswordProvider;
45 import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
46 import org.apache.tika.sax.BodyContentHandler;
47 import org.apache.tika.sax.EmbeddedContentHandler;
48 import org.apache.tika.sax.XHTMLContentHandler;
49 import org.xml.sax.ContentHandler;
50 import org.xml.sax.SAXException;
51
52 /**
53 * Defines a Microsoft document content extractor.
54 */
55 public class OfficeParser extends AbstractParser {
56
57 /** Serial version UID */
58 private static final long serialVersionUID = 7393462244028653479L;
59
60 private static final Set<MediaType> SUPPORTED_TYPES =
61 Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
62 POIFSDocumentType.WORKBOOK.type,
63 POIFSDocumentType.OLE10_NATIVE.type,
64 POIFSDocumentType.WORDDOCUMENT.type,
65 POIFSDocumentType.UNKNOWN.type,
66 POIFSDocumentType.ENCRYPTED.type,
67 POIFSDocumentType.POWERPOINT.type,
68 POIFSDocumentType.PUBLISHER.type,
69 POIFSDocumentType.PROJECT.type,
70 POIFSDocumentType.VISIO.type,
71 // Works isn't supported
72 POIFSDocumentType.XLR.type, // but Works 7.0 Spreadsheet is
73 POIFSDocumentType.OUTLOOK.type,
74 POIFSDocumentType.SOLIDWORKS_PART.type,
75 POIFSDocumentType.SOLIDWORKS_ASSEMBLY.type,
76 POIFSDocumentType.SOLIDWORKS_DRAWING.type
77 )));
78
79 public enum POIFSDocumentType {
80 WORKBOOK("xls", MediaType.application("vnd.ms-excel")),
81 OLE10_NATIVE("ole", POIFSContainerDetector.OLE10_NATIVE),
82 COMP_OBJ("ole", POIFSContainerDetector.COMP_OBJ),
83 WORDDOCUMENT("doc", MediaType.application("msword")),
84 UNKNOWN("unknown", MediaType.application("x-tika-msoffice")),
85 ENCRYPTED("ole", MediaType.application("x-tika-ooxml-protected")),
86 POWERPOINT("ppt", MediaType.application("vnd.ms-powerpoint")),
87 PUBLISHER("pub", MediaType.application("x-mspublisher")),
88 PROJECT("mpp", MediaType.application("vnd.ms-project")),
89 VISIO("vsd", MediaType.application("vnd.visio")),
90 WORKS("wps", MediaType.application("vnd.ms-works")),
91 XLR("xlr", MediaType.application("x-tika-msworks-spreadsheet")),
92 OUTLOOK("msg", MediaType.application("vnd.ms-outlook")),
93 SOLIDWORKS_PART("sldprt", MediaType.application("sldworks")),
94 SOLIDWORKS_ASSEMBLY("sldasm", MediaType.application("sldworks")),
95 SOLIDWORKS_DRAWING("slddrw", MediaType.application("sldworks"));
96
97 private final String extension;
98 private final MediaType type;
99
100 POIFSDocumentType(String extension, MediaType type) {
101 this.extension = extension;
102 this.type = type;
103 }
104
105 public String getExtension() {
106 return extension;
107 }
108
109 public MediaType getType() {
110 return type;
111 }
112
113 public static POIFSDocumentType detectType(POIFSFileSystem fs) {
114 return detectType(fs.getRoot());
115 }
116
117 public static POIFSDocumentType detectType(NPOIFSFileSystem fs) {
118 return detectType(fs.getRoot());
119 }
120
121 public static POIFSDocumentType detectType(DirectoryEntry node) {
122 Set<String> names = new HashSet<String>();
123 for (Entry entry : node) {
124 names.add(entry.getName());
125 }
126 MediaType type = POIFSContainerDetector.detect(names, node);
127 for (POIFSDocumentType poifsType : values()) {
128 if (type.equals(poifsType.type)) {
129 return poifsType;
130 }
131 }
132 return UNKNOWN;
133 }
134 }
135
136 public Set<MediaType> getSupportedTypes(ParseContext context) {
137 return SUPPORTED_TYPES;
138 }
139
140 /**
141 * Extracts properties and text from an MS Document input stream
142 */
143 public void parse(
144 InputStream stream, ContentHandler handler,
145 Metadata metadata, ParseContext context)
146 throws IOException, SAXException, TikaException {
147 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
148 xhtml.startDocument();
149
150 final DirectoryNode root;
151 TikaInputStream tstream = TikaInputStream.cast(stream);
152 if (tstream == null) {
153 root = new NPOIFSFileSystem(new CloseShieldInputStream(stream)).getRoot();
154 } else {
155 final Object container = tstream.getOpenContainer();
156 if (container instanceof NPOIFSFileSystem) {
157 root = ((NPOIFSFileSystem) container).getRoot();
158 } else if (container instanceof DirectoryNode) {
159 root = (DirectoryNode) container;
160 } else if (tstream.hasFile()) {
161 root = new NPOIFSFileSystem(tstream.getFileChannel()).getRoot();
162 } else {
163 root = new NPOIFSFileSystem(new CloseShieldInputStream(tstream)).getRoot();
164 }
165 }
166 parse(root, context, metadata, xhtml);
167 xhtml.endDocument();
168 }
169
170 protected void parse(
171 DirectoryNode root, ParseContext context, Metadata metadata, XHTMLContentHandler xhtml)
172 throws IOException, SAXException, TikaException {
173
174 // Parse summary entries first, to make metadata available early
175 new SummaryExtractor(metadata).parseSummaries(root);
176
177 // Parse remaining document entries
178 POIFSDocumentType type = POIFSDocumentType.detectType(root);
179
180 if (type!=POIFSDocumentType.UNKNOWN) {
181 setType(metadata, type.getType());
182 }
183
184 switch (type) {
185 case SOLIDWORKS_PART:
186 // new SolidworksExtractor(context).parse(root, xhtml);
187 break;
188 case SOLIDWORKS_ASSEMBLY:
189 break;
190 case SOLIDWORKS_DRAWING:
191 break;
192 case PUBLISHER:
193 PublisherTextExtractor publisherTextExtractor =
194 new PublisherTextExtractor(root);
195 xhtml.element("p", publisherTextExtractor.getText());
196 break;
197 case WORDDOCUMENT:
198 new WordExtractor(context).parse(root, xhtml);
199 break;
200 case POWERPOINT:
201 new HSLFExtractor(context).parse(root, xhtml);
202 break;
203 case WORKBOOK:
204 case XLR:
205 Locale locale = context.get(Locale.class, Locale.getDefault());
206 new ExcelExtractor(context).parse(root, xhtml, locale);
207 break;
208 case PROJECT:
209 // We currently can't do anything beyond the metadata
210 break;
211 case VISIO:
212 VisioTextExtractor visioTextExtractor =
213 new VisioTextExtractor(root);
214 for (String text : visioTextExtractor.getAllText()) {
215 xhtml.element("p", text);
216 }
217 break;
218 case OUTLOOK:
219 OutlookExtractor extractor =
220 new OutlookExtractor(root, context);
221
222 extractor.parse(xhtml, metadata);
223 break;
224 case ENCRYPTED:
225 EncryptionInfo info = new EncryptionInfo(root);
226 Decryptor d = Decryptor.getInstance(info);
227
228 try {
229 // By default, use the default Office Password
230 String password = Decryptor.DEFAULT_PASSWORD;
231
232 // If they supplied a Password Provider, ask that for the password,
233 // and use the provider given one if available (stick with default if not)
234 PasswordProvider passwordProvider = context.get(PasswordProvider.class);
235 if (passwordProvider != null) {
236 String suppliedPassword = passwordProvider.getPassword(metadata);
237 if (suppliedPassword != null) {
238 password = suppliedPassword;
239 }
240 }
241
242 // Check if we've the right password or not
243 if (!d.verifyPassword(password)) {
244 throw new EncryptedDocumentException();
245 }
246
247 // Decrypt the OLE2 stream, and delegate the resulting OOXML
248 // file to the regular OOXML parser for normal handling
249 OOXMLParser parser = new OOXMLParser();
250
251 parser.parse(d.getDataStream(root), new EmbeddedContentHandler(
252 new BodyContentHandler(xhtml)),
253 metadata, context);
254 } catch (GeneralSecurityException ex) {
255 throw new EncryptedDocumentException(ex);
256 }
257 }
258 }
259
260 private void setType(Metadata metadata, MediaType type) {
261 metadata.set(Metadata.CONTENT_TYPE, type.toString());
262 }
263
264 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft;
17
18 import java.io.ByteArrayInputStream;
19 import java.io.IOException;
20 import java.text.ParseException;
21 import java.util.Date;
22
23 import org.apache.poi.hmef.attribute.MAPIRtfAttribute;
24 import org.apache.poi.hsmf.MAPIMessage;
25 import org.apache.poi.hsmf.datatypes.AttachmentChunks;
26 import org.apache.poi.hsmf.datatypes.ByteChunk;
27 import org.apache.poi.hsmf.datatypes.Chunk;
28 import org.apache.poi.hsmf.datatypes.MAPIProperty;
29 import org.apache.poi.hsmf.datatypes.StringChunk;
30 import org.apache.poi.hsmf.datatypes.Types;
31 import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
32 import org.apache.poi.poifs.filesystem.DirectoryNode;
33 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
34 import org.apache.tika.exception.TikaException;
35 import org.apache.tika.io.TikaInputStream;
36 import org.apache.tika.metadata.Metadata;
37 import org.apache.tika.metadata.TikaCoreProperties;
38 import org.apache.tika.parser.ParseContext;
39 import org.apache.tika.parser.html.HtmlParser;
40 import org.apache.tika.parser.mbox.MboxParser;
41 import org.apache.tika.parser.rtf.RTFParser;
42 import org.apache.tika.parser.txt.CharsetDetector;
43 import org.apache.tika.parser.txt.CharsetMatch;
44 import org.apache.tika.sax.BodyContentHandler;
45 import org.apache.tika.sax.EmbeddedContentHandler;
46 import org.apache.tika.sax.XHTMLContentHandler;
47 import org.xml.sax.SAXException;
48
49 /**
50 * Outlook Message Parser.
51 */
52 public class OutlookExtractor extends AbstractPOIFSExtractor {
53 private final MAPIMessage msg;
54
55 public OutlookExtractor(NPOIFSFileSystem filesystem, ParseContext context) throws TikaException {
56 this(filesystem.getRoot(), context);
57 }
58
59 public OutlookExtractor(DirectoryNode root, ParseContext context) throws TikaException {
60 super(context);
61
62 try {
63 this.msg = new MAPIMessage(root);
64 } catch (IOException e) {
65 throw new TikaException("Failed to parse Outlook message", e);
66 }
67 }
68
69 public void parse(XHTMLContentHandler xhtml, Metadata metadata)
70 throws TikaException, SAXException, IOException {
71 try {
72 msg.setReturnNullOnMissingChunk(true);
73
74 // If the message contains strings that aren't stored
75 // as Unicode, try to sort out an encoding for them
76 if(msg.has7BitEncodingStrings()) {
77 if(msg.getHeaders() != null) {
78 // There's normally something in the headers
79 msg.guess7BitEncoding();
80 } else {
81 // Nothing in the header, try encoding detection
82 // on the message body
83 StringChunk text = msg.getMainChunks().textBodyChunk;
84 if(text != null) {
85 CharsetDetector detector = new CharsetDetector();
86 detector.setText( text.getRawValue() );
87 CharsetMatch match = detector.detect();
88 if(match.getConfidence() > 35) {
89 msg.set7BitEncoding( match.getName() );
90 }
91 }
92 }
93 }
94
95 // Start with the metadata
96 String subject = msg.getSubject();
97 String from = msg.getDisplayFrom();
98
99 metadata.set(TikaCoreProperties.CREATOR, from);
100 metadata.set(Metadata.MESSAGE_FROM, from);
101 metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo());
102 metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC());
103 metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC());
104
105 metadata.set(TikaCoreProperties.TITLE, subject);
106 // TODO: Move to description in Tika 2.0
107 metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION,
108 msg.getConversationTopic());
109
110 try {
111 for(String recipientAddress : msg.getRecipientEmailAddressList()) {
112 if(recipientAddress != null)
113 metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress);
114 }
115 } catch(ChunkNotFoundException he) {} // Will be fixed in POI 3.7 Final
116
117 // Date - try two ways to find it
118 // First try via the proper chunk
119 if(msg.getMessageDate() != null) {
120 metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
121 metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime());
122 } else {
123 try {
124 // Failing that try via the raw headers
125 String[] headers = msg.getHeaders();
126 if(headers != null && headers.length > 0) {
127 for(String header: headers) {
128 if(header.toLowerCase().startsWith("date:")) {
129 String date = header.substring(header.indexOf(':')+1).trim();
130
131 // See if we can parse it as a normal mail date
132 try {
133 Date d = MboxParser.parseDate(date);
134 metadata.set(TikaCoreProperties.CREATED, d);
135 metadata.set(TikaCoreProperties.MODIFIED, d);
136 } catch(ParseException e) {
137 // Store it as-is, and hope for the best...
138 metadata.set(TikaCoreProperties.CREATED, date);
139 metadata.set(TikaCoreProperties.MODIFIED, date);
140 }
141 break;
142 }
143 }
144 }
145 } catch(ChunkNotFoundException he) {
146 // We can't find the date, sorry...
147 }
148 }
149
150
151 xhtml.element("h1", subject);
152
153 // Output the from and to details in text, as you
154 // often want them in text form for searching
155 xhtml.startElement("dl");
156 if (from!=null) {
157 header(xhtml, "From", from);
158 }
159 header(xhtml, "To", msg.getDisplayTo());
160 header(xhtml, "Cc", msg.getDisplayCC());
161 header(xhtml, "Bcc", msg.getDisplayBCC());
162 try {
163 header(xhtml, "Recipients", msg.getRecipientEmailAddress());
164 } catch(ChunkNotFoundException e) {}
165 xhtml.endElement("dl");
166
167 // Get the message body. Preference order is: html, rtf, text
168 Chunk htmlChunk = null;
169 Chunk rtfChunk = null;
170 Chunk textChunk = null;
171 for(Chunk chunk : msg.getMainChunks().getChunks()) {
172 if(chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
173 htmlChunk = chunk;
174 }
175 if(chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
176 rtfChunk = chunk;
177 }
178 if(chunk.getChunkId() == MAPIProperty.BODY.id) {
179 textChunk = chunk;
180 }
181 }
182
183 boolean doneBody = false;
184 xhtml.startElement("div", "class", "message-body");
185 if(htmlChunk != null) {
186 byte[] data = null;
187 if(htmlChunk instanceof ByteChunk) {
188 data = ((ByteChunk)htmlChunk).getValue();
189 } else if(htmlChunk instanceof StringChunk) {
190 data = ((StringChunk)htmlChunk).getRawValue();
191 }
192 if(data != null) {
193 HtmlParser htmlParser = new HtmlParser();
194 htmlParser.parse(
195 new ByteArrayInputStream(data),
196 new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
197 new Metadata(), new ParseContext()
198 );
199 doneBody = true;
200 }
201 }
202 if(rtfChunk != null && !doneBody) {
203 ByteChunk chunk = (ByteChunk)rtfChunk;
204 MAPIRtfAttribute rtf = new MAPIRtfAttribute(
205 MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue()
206 );
207 RTFParser rtfParser = new RTFParser();
208 rtfParser.parse(
209 new ByteArrayInputStream(rtf.getData()),
210 new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
211 new Metadata(), new ParseContext());
212 doneBody = true;
213 }
214 if(textChunk != null && !doneBody) {
215 xhtml.element("p", ((StringChunk)textChunk).getValue());
216 }
217 xhtml.endElement("div");
218
219 // Process the attachments
220 for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
221 xhtml.startElement("div", "class", "attachment-entry");
222
223 String filename = null;
224 if (attachment.attachLongFileName != null) {
225 filename = attachment.attachLongFileName.getValue();
226 } else if (attachment.attachFileName != null) {
227 filename = attachment.attachFileName.getValue();
228 }
229 if (filename != null && filename.length() > 0) {
230 xhtml.element("h1", filename);
231 }
232
233 if(attachment.attachData != null) {
234 handleEmbeddedResource(
235 TikaInputStream.get(attachment.attachData.getValue()),
236 filename, null,
237 null, xhtml, true
238 );
239 }
240 if(attachment.attachmentDirectory != null) {
241 handleEmbeddedOfficeDoc(
242 attachment.attachmentDirectory.getDirectory(),
243 xhtml
244 );
245 }
246
247 xhtml.endElement("div");
248 }
249 } catch(ChunkNotFoundException e) {
250 throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
251 }
252 }
253
254 private void header(XHTMLContentHandler xhtml, String key, String value)
255 throws SAXException {
256 if (value != null && value.length() > 0) {
257 xhtml.element("dt", key);
258 xhtml.element("dd", value);
259 }
260 }
261 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft;
17
18 import static org.apache.tika.mime.MediaType.application;
19
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.nio.channels.FileChannel;
23 import java.util.Collections;
24 import java.util.HashSet;
25 import java.util.Set;
26 import java.util.regex.Pattern;
27
28 import org.apache.poi.poifs.filesystem.DirectoryEntry;
29 import org.apache.poi.poifs.filesystem.DirectoryNode;
30 import org.apache.poi.poifs.filesystem.DocumentInputStream;
31 import org.apache.poi.poifs.filesystem.DocumentNode;
32 import org.apache.poi.poifs.filesystem.Entry;
33 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
34 import org.apache.tika.detect.Detector;
35 import org.apache.tika.io.IOUtils;
36 import org.apache.tika.io.TikaInputStream;
37 import org.apache.tika.metadata.Metadata;
38 import org.apache.tika.mime.MediaType;
39
40 /**
41 * A detector that works on a POIFS OLE2 document
42 * to figure out exactly what the file is.
43 * This should work for all OLE2 documents, whether
44 * they are ones supported by POI or not.
45 */
46 public class POIFSContainerDetector implements Detector {
47
48 /** Serial version UID */
49 private static final long serialVersionUID = -3028021741663605293L;
50
51 /** An ASCII String "StarImpress" */
52 private static final byte [] STAR_IMPRESS = new byte [] {
53 0x53, 0x74, 0x61, 0x72, 0x49, 0x6d, 0x70, 0x72, 0x65, 0x73, 0x73
54 };
55
56 /** An ASCII String "StarDraw" */
57 private static final byte [] STAR_DRAW = new byte [] {
58 0x53, 0x74, 0x61, 0x72, 0x44, 0x72, 0x61, 0x77
59 };
60
61 /** An ASCII String "Quill96" for Works Files */
62 private static final byte [] WORKS_QUILL96 = new byte[] {
63 0x51, 0x75, 0x69, 0x6c, 0x6c, 0x39, 0x36
64 };
65
66 /** The OLE base file format */
67 public static final MediaType OLE = application("x-tika-msoffice");
68
69 /** The protected OOXML base file format */
70 public static final MediaType OOXML_PROTECTED = application("x-tika-ooxml-protected");
71
72 /** General embedded document type within an OLE2 container */
73 public static final MediaType GENERAL_EMBEDDED = application("x-tika-msoffice-embedded");
74
75 /** An OLE10 Native embedded document within another OLE2 document */
76 public static final MediaType OLE10_NATIVE =
77 new MediaType(GENERAL_EMBEDDED, "format", "ole10_native");
78
79 /** Some other kind of embedded document, in a CompObj container within another OLE2 document */
80 public static final MediaType COMP_OBJ =
81 new MediaType(GENERAL_EMBEDDED, "format", "comp_obj");
82
83 /** Microsoft Excel */
84 public static final MediaType XLS = application("vnd.ms-excel");
85
86 /** Microsoft Word */
87 public static final MediaType DOC = application("msword");
88
89 /** Microsoft PowerPoint */
90 public static final MediaType PPT = application("vnd.ms-powerpoint");
91
92 /** Microsoft Publisher */
93 public static final MediaType PUB = application("x-mspublisher");
94
95 /** Microsoft Visio */
96 public static final MediaType VSD = application("vnd.visio");
97
98 /** Microsoft Works */
99 public static final MediaType WPS = application("vnd.ms-works");
100
101 /** Microsoft Works Spreadsheet 7.0 */
102 public static final MediaType XLR = application("x-tika-msworks-spreadsheet");
103
104 /** Microsoft Outlook */
105 public static final MediaType MSG = application("vnd.ms-outlook");
106
107 /** Microsoft Project */
108 public static final MediaType MPP = application("vnd.ms-project");
109
110 /** StarOffice Calc */
111 public static final MediaType SDC = application("vnd.stardivision.calc");
112
113 /** StarOffice Draw */
114 public static final MediaType SDA = application("vnd.stardivision.draw");
115
116 /** StarOffice Impress */
117 public static final MediaType SDD = application("vnd.stardivision.impress");
118
119 /** StarOffice Writer */
120 public static final MediaType SDW = application("vnd.stardivision.writer");
121
122 /** SolidWorks CAD file */
123 public static final MediaType SLDWORKS = application("sldworks");
124
125 /** Regexp for matching the MPP Project Data stream */
126 private static final Pattern mppDataMatch = Pattern.compile("\\s\\s\\s\\d+");
127
128 public MediaType detect(InputStream input, Metadata metadata)
129 throws IOException {
130 // Check if we have access to the document
131 if (input == null) {
132 return MediaType.OCTET_STREAM;
133 }
134
135 // If this is a TikaInputStream wrapping an already
136 // parsed NPOIFileSystem/DirectoryNode, just get the
137 // names from the root:
138 TikaInputStream tis = TikaInputStream.cast(input);
139 Set<String> names = null;
140 if (tis != null) {
141 Object container = tis.getOpenContainer();
142 if (container instanceof NPOIFSFileSystem) {
143 names = getTopLevelNames(((NPOIFSFileSystem) container).getRoot());
144 } else if (container instanceof DirectoryNode) {
145 names = getTopLevelNames((DirectoryNode) container);
146 }
147 }
148
149 if (names == null) {
150 // Check if the document starts with the OLE header
151 input.mark(8);
152 try {
153 if (input.read() != 0xd0 || input.read() != 0xcf
154 || input.read() != 0x11 || input.read() != 0xe0
155 || input.read() != 0xa1 || input.read() != 0xb1
156 || input.read() != 0x1a || input.read() != 0xe1) {
157 return MediaType.OCTET_STREAM;
158 }
159 } finally {
160 input.reset();
161 }
162 }
163
164 // We can only detect the exact type when given a TikaInputStream
165 if (names == null && tis != null) {
166 // Look for known top level entry names to detect the document type
167 names = getTopLevelNames(tis);
168 }
169
170 // Detect based on the names (as available)
171 if (tis != null &&
172 tis.getOpenContainer() != null &&
173 tis.getOpenContainer() instanceof NPOIFSFileSystem) {
174 return detect(names, ((NPOIFSFileSystem)tis.getOpenContainer()).getRoot());
175 } else {
176 return detect(names, null);
177 }
178 }
179
180 /**
181 * Internal detection of the specific kind of OLE2 document, based on the
182 * names of the top level streams within the file.
183 *
184 * @deprecated Use {@link #detect(Set, DirectoryEntry)} and pass the root
185 * entry of the filesystem whose type is to be detected, as a
186 * second argument.
187 */
188 protected static MediaType detect(Set<String> names) {
189 return detect(names, null);
190 }
191
192 /**
193 * Internal detection of the specific kind of OLE2 document, based on the
194 * names of the top-level streams within the file. In some cases the
195 * detection may need access to the root {@link DirectoryEntry} of that file
196 * for best results. The entry can be given as a second, optional argument.
197 *
198 * @param names
199 * @param root
200 * @return
201 */
202 protected static MediaType detect(Set<String> names, DirectoryEntry root) {
203 if (names != null) {
204 if (names.contains("SwDocContentMgr") && names.contains("SwDocMgrTempStorage")) {
205 return SLDWORKS;
206 } else if (names.contains("StarCalcDocument")) {
207 // Star Office Calc
208 return SDC;
209 } else if (names.contains("StarWriterDocument")) {
210 return SDW;
211 } else if (names.contains("StarDrawDocument3")) {
212 if (root == null) {
213 /*
214 * This is either StarOfficeDraw or StarOfficeImpress, we have
215 * to consult the CompObj to distinguish them, if this method is
216 * called in "legacy mode", without the root, just return
217 * x-tika-msoffice. The one-argument method is only for backward
218 * compatibility, if someone calls old API he/she can get the
219 * old result.
220 */
221 return OLE;
222 } else {
223 return processCompObjFormatType(root);
224 }
225 } else if (names.contains("WksSSWorkBook")) {
226 // This check has to be before names.contains("Workbook")
227 // Works 7.0 spreadsheet files contain both
228 // we want to avoid classifying this as Excel
229 return XLR;
230 } else if (names.contains("Workbook") || names.contains("WORKBOOK")) {
231 return XLS;
232 } else if (names.contains("Book")) {
233 // Excel 95 or older, we won't be able to parse this....
234 return XLS;
235 } else if (names.contains("EncryptedPackage") &&
236 names.contains("EncryptionInfo") &&
237 names.contains("\u0006DataSpaces")) {
238 // This is a protected OOXML document, which is an OLE2 file
239 // with an Encrypted Stream which holds the OOXML data
240 // Without decrypting the stream, we can't tell what kind of
241 // OOXML file we have. Return a general OOXML Protected type,
242 // and hope the name based detection can guess the rest!
243 return OOXML_PROTECTED;
244 } else if (names.contains("EncryptedPackage")) {
245 return OLE;
246 } else if (names.contains("WordDocument")) {
247 return DOC;
248 } else if (names.contains("Quill")) {
249 return PUB;
250 } else if (names.contains("PowerPoint Document")) {
251 return PPT;
252 } else if (names.contains("VisioDocument")) {
253 return VSD;
254 } else if (names.contains("\u0001Ole10Native")) {
255 return OLE10_NATIVE;
256 } else if (names.contains("MatOST")) {
257 // this occurs on older Works Word Processor files (versions 3.0 and 4.0)
258 return WPS;
259 } else if (names.contains("CONTENTS") && names.contains("SPELLING")) {
260 // Newer Works files
261 return WPS;
262 } else if (names.contains("Contents") && names.contains("\u0003ObjInfo")) {
263 return COMP_OBJ;
264 } else if (names.contains("CONTENTS") && names.contains("\u0001CompObj")) {
265 // CompObj is a general kind of OLE2 embedding, but this may be an old Works file
266 // If we have the Directory, check
267 if (root != null) {
268 MediaType type = processCompObjFormatType(root);
269 if (type == WPS) {
270 return WPS;
271 } else {
272 // Assume it's a general CompObj embedded resource
273 return COMP_OBJ;
274 }
275 } else {
276 // Assume it's a general CompObj embedded resource
277 return COMP_OBJ;
278 }
279 } else if (names.contains("CONTENTS")) {
280 // CONTENTS without SPELLING nor CompObj normally means some sort
281 // of embedded non-office file inside an OLE2 document
282 // This is most commonly triggered on nested directories
283 return OLE;
284 } else if (names.contains("\u0001CompObj") &&
285 (names.contains("Props") || names.contains("Props9") || names.contains("Props12"))) {
286 // Could be Project, look for common name patterns
287 for (String name : names) {
288 if (mppDataMatch.matcher(name).matches()) {
289 return MPP;
290 }
291 }
292 } else if (names.contains("PerfectOffice_MAIN")) {
293 if (names.contains("SlideShow")) {
294 return MediaType.application("x-corelpresentations"); // .shw
295 } else if (names.contains("PerfectOffice_OBJECTS")) {
296 return MediaType.application("x-quattro-pro"); // .wb?
297 }
298 } else if (names.contains("NativeContent_MAIN")) {
299 return MediaType.application("x-quattro-pro"); // .qpw
300 } else {
301 for (String name : names) {
302 if (name.startsWith("__substg1.0_")) {
303 return MSG;
304 }
305 }
306 }
307 }
308
309 // Couldn't detect a more specific type
310 return OLE;
311 }
312
313 /**
314 * Is this one of the kinds of formats which uses CompObj to
315 * store all of their data, eg Star Draw, Star Impress or
316 * (older) Works?
317 * If not, it's likely an embedded resource
318 */
319 private static MediaType processCompObjFormatType(DirectoryEntry root) {
320 try {
321 Entry e = root.getEntry("\u0001CompObj");
322 if (e != null && e.isDocumentEntry()) {
323 DocumentNode dn = (DocumentNode)e;
324 DocumentInputStream stream = new DocumentInputStream(dn);
325 byte [] bytes = IOUtils.toByteArray(stream);
326 /*
327 * This array contains a string with a normal ASCII name of the
328 * application used to create this file. We want to search for that
329 * name.
330 */
331 if ( arrayContains(bytes, STAR_DRAW) ) {
332 return SDA;
333 } else if (arrayContains(bytes, STAR_IMPRESS)) {
334 return SDD;
335 } else if (arrayContains(bytes, WORKS_QUILL96)) {
336 return WPS;
337 }
338 }
339 } catch (Exception e) {
340 /*
341 * "root.getEntry" can throw FileNotFoundException. The code inside
342 * "if" can throw IOExceptions. Theoretically. Practically no
343 * exceptions will likely ever appear.
344 *
345 * Swallow all of them. If any occur, we just assume that we can't
346 * distinguish between Draw and Impress and return something safe:
347 * x-tika-msoffice
348 */
349 }
350 return OLE;
351 }
352
353 // poor man's search for byte arrays, replace with some library call if
354 // you know one without adding new dependencies
355 private static boolean arrayContains(byte [] larger, byte [] smaller) {
356 int largerCounter = 0;
357 int smallerCounter = 0;
358 while (largerCounter < larger.length) {
359 if (larger[largerCounter] == smaller[smallerCounter]) {
360 largerCounter++;
361 smallerCounter++;
362 if (smallerCounter == smaller.length) {
363 return true;
364 }
365 } else {
366 largerCounter = largerCounter - smallerCounter + 1;
367 smallerCounter=0;
368 }
369 }
370 return false;
371 }
372
373 private static Set<String> getTopLevelNames(TikaInputStream stream)
374 throws IOException {
375 // Force the document stream to a (possibly temporary) file
376 // so we don't modify the current position of the stream
377 FileChannel channel = stream.getFileChannel();
378
379 try {
380 NPOIFSFileSystem fs = new NPOIFSFileSystem(channel);
381
382 // Optimize a possible later parsing process by keeping
383 // a reference to the already opened POI file system
384 stream.setOpenContainer(fs);
385
386 return getTopLevelNames(fs.getRoot());
387 } catch (IOException e) {
388 // Parse error in POI, so we don't know the file type
389 return Collections.emptySet();
390 } catch (RuntimeException e) {
391 // Another problem in POI
392 return Collections.emptySet();
393 }
394 }
395
396 private static Set<String> getTopLevelNames(DirectoryNode root) {
397 Set<String> names = new HashSet<String>();
398 for (Entry entry : root) {
399 names.add(entry.getName());
400 }
401 return names;
402 }
403 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft;
17
18 import java.io.FileNotFoundException;
19 import java.io.IOException;
20 import java.util.Date;
21
22 import org.apache.commons.logging.Log;
23 import org.apache.commons.logging.LogFactory;
24 import org.apache.poi.hpsf.CustomProperties;
25 import org.apache.poi.hpsf.DocumentSummaryInformation;
26 import org.apache.poi.hpsf.MarkUnsupportedException;
27 import org.apache.poi.hpsf.NoPropertySetStreamException;
28 import org.apache.poi.hpsf.PropertySet;
29 import org.apache.poi.hpsf.SummaryInformation;
30 import org.apache.poi.hpsf.UnexpectedPropertySetTypeException;
31 import org.apache.poi.poifs.filesystem.DirectoryNode;
32 import org.apache.poi.poifs.filesystem.DocumentEntry;
33 import org.apache.poi.poifs.filesystem.DocumentInputStream;
34 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
35 import org.apache.tika.exception.TikaException;
36 import org.apache.tika.metadata.MSOffice;
37 import org.apache.tika.metadata.Metadata;
38 import org.apache.tika.metadata.Office;
39 import org.apache.tika.metadata.OfficeOpenXMLCore;
40 import org.apache.tika.metadata.OfficeOpenXMLExtended;
41 import org.apache.tika.metadata.PagedText;
42 import org.apache.tika.metadata.Property;
43 import org.apache.tika.metadata.TikaCoreProperties;
44
45 /**
46 * Extractor for Common OLE2 (HPSF) metadata
47 */
48 public class SummaryExtractor {
49 private static final Log logger = LogFactory.getLog(AbstractPOIFSExtractor.class);
50
51 private static final String SUMMARY_INFORMATION =
52 SummaryInformation.DEFAULT_STREAM_NAME;
53
54 private static final String DOCUMENT_SUMMARY_INFORMATION =
55 DocumentSummaryInformation.DEFAULT_STREAM_NAME;
56
57 private final Metadata metadata;
58
59 public SummaryExtractor(Metadata metadata) {
60 this.metadata = metadata;
61 }
62
63 public void parseSummaries(NPOIFSFileSystem filesystem)
64 throws IOException, TikaException {
65 parseSummaries(filesystem.getRoot());
66 }
67
68 public void parseSummaries(DirectoryNode root)
69 throws IOException, TikaException {
70 parseSummaryEntryIfExists(root, SUMMARY_INFORMATION);
71 parseSummaryEntryIfExists(root, DOCUMENT_SUMMARY_INFORMATION);
72 }
73
74 private void parseSummaryEntryIfExists(
75 DirectoryNode root, String entryName)
76 throws IOException, TikaException {
77 try {
78 DocumentEntry entry =
79 (DocumentEntry) root.getEntry(entryName);
80 PropertySet properties =
81 new PropertySet(new DocumentInputStream(entry));
82 if (properties.isSummaryInformation()) {
83 parse(new SummaryInformation(properties));
84 }
85 if (properties.isDocumentSummaryInformation()) {
86 parse(new DocumentSummaryInformation(properties));
87 }
88 } catch (FileNotFoundException e) {
89 // entry does not exist, just skip it
90 } catch (NoPropertySetStreamException e) {
91 // no property stream, just skip it
92 } catch (UnexpectedPropertySetTypeException e) {
93 throw new TikaException("Unexpected HPSF document", e);
94 } catch (MarkUnsupportedException e) {
95 throw new TikaException("Invalid DocumentInputStream", e);
96 } catch (Exception e) {
97 logger.warn("Ignoring unexpected exception while parsing summary entry " + entryName, e);
98 }
99 }
100
101 private void parse(SummaryInformation summary) {
102 set(TikaCoreProperties.TITLE, summary.getTitle());
103 set(TikaCoreProperties.CREATOR, summary.getAuthor());
104 set(TikaCoreProperties.KEYWORDS, summary.getKeywords());
105 // TODO Move to OO subject in Tika 2.0
106 set(TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, summary.getSubject());
107 set(TikaCoreProperties.MODIFIER, summary.getLastAuthor());
108 set(TikaCoreProperties.COMMENTS, summary.getComments());
109 set(OfficeOpenXMLExtended.TEMPLATE, summary.getTemplate());
110 set(OfficeOpenXMLExtended.APPLICATION, summary.getApplicationName());
111 set(OfficeOpenXMLCore.REVISION, summary.getRevNumber());
112 set(TikaCoreProperties.CREATED, summary.getCreateDateTime());
113 set(TikaCoreProperties.MODIFIED, summary.getLastSaveDateTime());
114 set(TikaCoreProperties.PRINT_DATE, summary.getLastPrinted());
115 set(Metadata.EDIT_TIME, summary.getEditTime());
116 set(OfficeOpenXMLExtended.DOC_SECURITY, summary.getSecurity());
117
118 // New style counts
119 set(Office.WORD_COUNT, summary.getWordCount());
120 set(Office.CHARACTER_COUNT, summary.getCharCount());
121 set(Office.PAGE_COUNT, summary.getPageCount());
122 if (summary.getPageCount() > 0) {
123 metadata.set(PagedText.N_PAGES, summary.getPageCount());
124 }
125
126 // Old style, Tika 1.0 properties
127 // TODO Remove these in Tika 2.0
128 set(Metadata.TEMPLATE, summary.getTemplate());
129 set(Metadata.APPLICATION_NAME, summary.getApplicationName());
130 set(Metadata.REVISION_NUMBER, summary.getRevNumber());
131 set(Metadata.SECURITY, summary.getSecurity());
132 set(MSOffice.WORD_COUNT, summary.getWordCount());
133 set(MSOffice.CHARACTER_COUNT, summary.getCharCount());
134 set(MSOffice.PAGE_COUNT, summary.getPageCount());
135 }
136
137 private void parse(DocumentSummaryInformation summary) {
138 set(OfficeOpenXMLExtended.COMPANY, summary.getCompany());
139 set(OfficeOpenXMLExtended.MANAGER, summary.getManager());
140 set(TikaCoreProperties.LANGUAGE, getLanguage(summary));
141 set(OfficeOpenXMLCore.CATEGORY, summary.getCategory());
142
143 // New style counts
144 set(Office.SLIDE_COUNT, summary.getSlideCount());
145 if (summary.getSlideCount() > 0) {
146 metadata.set(PagedText.N_PAGES, summary.getSlideCount());
147 }
148 // Old style, Tika 1.0 counts
149 // TODO Remove these in Tika 2.0
150 set(Metadata.COMPANY, summary.getCompany());
151 set(Metadata.MANAGER, summary.getManager());
152 set(MSOffice.SLIDE_COUNT, summary.getSlideCount());
153 set(Metadata.CATEGORY, summary.getCategory());
154
155 parse(summary.getCustomProperties());
156 }
157
158 private String getLanguage(DocumentSummaryInformation summary) {
159 CustomProperties customProperties = summary.getCustomProperties();
160 if (customProperties != null) {
161 Object value = customProperties.get("Language");
162 if (value instanceof String) {
163 return (String) value;
164 }
165 }
166 return null;
167 }
168
169 /**
170 * Attempt to parse custom document properties and add to the collection of metadata
171 * @param customProperties
172 */
173 private void parse(CustomProperties customProperties) {
174 if (customProperties != null) {
175 for (String name : customProperties.nameSet()) {
176 // Apply the custom prefix
177 String key = Metadata.USER_DEFINED_METADATA_NAME_PREFIX + name;
178
179 // Get, convert and save property value
180 Object value = customProperties.get(name);
181 if (value instanceof String){
182 set(key, (String)value);
183 } else if (value instanceof Date) {
184 Property prop = Property.externalDate(key);
185 metadata.set(prop, (Date)value);
186 } else if (value instanceof Boolean) {
187 Property prop = Property.externalBoolean(key);
188 metadata.set(prop, ((Boolean)value).toString());
189 } else if (value instanceof Long) {
190 Property prop = Property.externalInteger(key);
191 metadata.set(prop, ((Long)value).intValue());
192 } else if (value instanceof Double) {
193 Property prop = Property.externalReal(key);
194 metadata.set(prop, ((Double)value).doubleValue());
195 } else if (value instanceof Integer) {
196 Property prop = Property.externalInteger(key);
197 metadata.set(prop, ((Integer)value).intValue());
198 }
199 }
200 }
201 }
202
203 private void set(String name, String value) {
204 if (value != null) {
205 metadata.set(name, value);
206 }
207 }
208
209 private void set(Property property, String value) {
210 if (value != null) {
211 metadata.set(property, value);
212 }
213 }
214
215 private void set(Property property, Date value) {
216 if (value != null) {
217 metadata.set(property, value);
218 }
219 }
220
221 private void set(Property property, int value) {
222 if (value > 0) {
223 metadata.set(property, value);
224 }
225 }
226
227 private void set(String name, long value) {
228 if (value > 0) {
229 metadata.set(name, Long.toString(value));
230 }
231 }
232 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.util.Arrays;
21 import java.util.Collections;
22 import java.util.HashSet;
23 import java.util.Set;
24
25 import org.apache.poi.hmef.Attachment;
26 import org.apache.poi.hmef.HMEFMessage;
27 import org.apache.poi.hmef.attribute.MAPIAttribute;
28 import org.apache.poi.hmef.attribute.MAPIRtfAttribute;
29 import org.apache.poi.hsmf.datatypes.MAPIProperty;
30 import org.apache.tika.exception.TikaException;
31 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
32 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
33 import org.apache.tika.io.TikaInputStream;
34 import org.apache.tika.metadata.Metadata;
35 import org.apache.tika.metadata.TikaCoreProperties;
36 import org.apache.tika.mime.MediaType;
37 import org.apache.tika.parser.AbstractParser;
38 import org.apache.tika.parser.ParseContext;
39 import org.apache.tika.sax.EmbeddedContentHandler;
40 import org.xml.sax.ContentHandler;
41 import org.xml.sax.SAXException;
42
43 /**
44 * A POI-powered Tika Parser for TNEF (Transport Neutral
45 * Encoding Format) messages, aka winmail.dat
46 */
47 public class TNEFParser extends AbstractParser {
48 private static final long serialVersionUID = 4611820730372823452L;
49
50 private static final Set<MediaType> SUPPORTED_TYPES =
51 Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
52 MediaType.application("vnd.ms-tnef"),
53 MediaType.application("ms-tnef"),
54 MediaType.application("x-tnef")
55 )));
56
57 public Set<MediaType> getSupportedTypes(ParseContext context) {
58 return SUPPORTED_TYPES;
59 }
60
61 /**
62 * Extracts properties and text from an MS Document input stream
63 */
64 public void parse(
65 InputStream stream, ContentHandler handler,
66 Metadata metadata, ParseContext context)
67 throws IOException, SAXException, TikaException {
68
69 // We work by recursing, so get the appropriate bits
70 EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
71 EmbeddedDocumentExtractor embeddedExtractor;
72 if (ex==null) {
73 embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context);
74 } else {
75 embeddedExtractor = ex;
76 }
77
78 // Ask POI to process the file for us
79 HMEFMessage msg = new HMEFMessage(stream);
80
81 // Set the message subject if known
82 String subject = msg.getSubject();
83 if(subject != null && subject.length() > 0) {
84 // TODO: Move to title in Tika 2.0
85 metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_TITLE, subject);
86 }
87
88 // Recurse into the message body RTF
89 MAPIAttribute attr = msg.getMessageMAPIAttribute(MAPIProperty.RTF_COMPRESSED);
90 if(attr != null && attr instanceof MAPIRtfAttribute) {
91 MAPIRtfAttribute rtf = (MAPIRtfAttribute)attr;
92 handleEmbedded(
93 "message.rtf", "application/rtf",
94 rtf.getData(),
95 embeddedExtractor, handler
96 );
97 }
98
99 // Recurse into each attachment in turn
100 for(Attachment attachment : msg.getAttachments()) {
101 String name = attachment.getLongFilename();
102 if(name == null || name.length() == 0) {
103 name = attachment.getFilename();
104 }
105 if(name == null || name.length() == 0) {
106 String ext = attachment.getExtension();
107 if(ext != null) {
108 name = "unknown" + ext;
109 }
110 }
111 handleEmbedded(
112 name, null, attachment.getContents(),
113 embeddedExtractor, handler
114 );
115 }
116 }
117
118 private void handleEmbedded(String name, String type, byte[] contents,
119 EmbeddedDocumentExtractor embeddedExtractor, ContentHandler handler)
120 throws IOException, SAXException, TikaException {
121 Metadata metadata = new Metadata();
122 if(name != null)
123 metadata.set(Metadata.RESOURCE_NAME_KEY, name);
124 if(type != null)
125 metadata.set(Metadata.CONTENT_TYPE, type);
126
127 if (embeddedExtractor.shouldParseEmbedded(metadata)) {
128 embeddedExtractor.parseEmbedded(
129 TikaInputStream.get(contents),
130 new EmbeddedContentHandler(handler),
131 metadata, false);
132 }
133 }
134 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft;
17
18 import org.apache.tika.sax.XHTMLContentHandler;
19 import org.xml.sax.SAXException;
20
21 /**
22 * Text cell.
23 */
24 public class TextCell implements Cell {
25
26 private final String text;
27
28 public TextCell(String text) {
29 this.text = text;
30 }
31
32 public void render(XHTMLContentHandler handler) throws SAXException {
33 handler.characters(text);
34 }
35
36 public String toString() {
37 return "Text Cell: \"" + text + "\"";
38 }
39 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft;
17
18 import java.io.FileNotFoundException;
19 import java.io.IOException;
20 import java.util.ArrayList;
21 import java.util.HashMap;
22 import java.util.HashSet;
23 import java.util.List;
24 import java.util.Map;
25 import java.util.Set;
26
27 import org.apache.poi.hwpf.HWPFDocument;
28 import org.apache.poi.hwpf.HWPFOldDocument;
29 import org.apache.poi.hwpf.OldWordFileFormatException;
30 import org.apache.poi.hwpf.extractor.Word6Extractor;
31 import org.apache.poi.hwpf.model.FieldsDocumentPart;
32 import org.apache.poi.hwpf.model.PicturesTable;
33 import org.apache.poi.hwpf.model.StyleDescription;
34 import org.apache.poi.hwpf.usermodel.CharacterRun;
35 import org.apache.poi.hwpf.usermodel.Field;
36 import org.apache.poi.hwpf.usermodel.HeaderStories;
37 import org.apache.poi.hwpf.usermodel.Paragraph;
38 import org.apache.poi.hwpf.usermodel.Picture;
39 import org.apache.poi.hwpf.usermodel.Range;
40 import org.apache.poi.hwpf.usermodel.Table;
41 import org.apache.poi.hwpf.usermodel.TableCell;
42 import org.apache.poi.hwpf.usermodel.TableRow;
43 import org.apache.poi.poifs.filesystem.DirectoryEntry;
44 import org.apache.poi.poifs.filesystem.DirectoryNode;
45 import org.apache.poi.poifs.filesystem.Entry;
46 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
47 import org.apache.tika.exception.TikaException;
48 import org.apache.tika.io.TikaInputStream;
49 import org.apache.tika.parser.ParseContext;
50 import org.apache.tika.sax.XHTMLContentHandler;
51 import org.xml.sax.SAXException;
52 import org.xml.sax.helpers.AttributesImpl;
53
54 public class WordExtractor extends AbstractPOIFSExtractor {
55
56 private static final char UNICODECHAR_NONBREAKING_HYPHEN = '\u2011';
57 private static final char UNICODECHAR_ZERO_WIDTH_SPACE = '\u200b';
58
/**
 * Creates an extractor for the given parse context. The context is
 * passed on unchanged to the {@link AbstractPOIFSExtractor} constructor.
 *
 * @param context the parse context handed to the superclass
 */
public WordExtractor(ParseContext context) {
    super(context);
}
62
63 // True if we are currently in the named style tag:
64 private boolean curStrikeThrough;
65 private boolean curBold;
66 private boolean curItalic;
67
68 protected void parse(
69 NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
70 throws IOException, SAXException, TikaException {
71 parse(filesystem.getRoot(), xhtml);
72 }
73
74 protected void parse(
75 DirectoryNode root, XHTMLContentHandler xhtml)
76 throws IOException, SAXException, TikaException {
77 HWPFDocument document;
78 try {
79 document = new HWPFDocument(root);
80 } catch(OldWordFileFormatException e) {
81 parseWord6(root, xhtml);
82 return;
83 }
84 org.apache.poi.hwpf.extractor.WordExtractor wordExtractor =
85 new org.apache.poi.hwpf.extractor.WordExtractor(document);
86 HeaderStories headerFooter = new HeaderStories(document);
87
88 // Grab the list of pictures. As far as we can tell,
89 // the pictures should be in order, and may be directly
90 // placed or referenced from an anchor
91 PicturesTable pictureTable = document.getPicturesTable();
92 PicturesSource pictures = new PicturesSource(document);
93
94 // Do any headers, if present
95 Range[] headers = new Range[] { headerFooter.getFirstHeaderSubrange(),
96 headerFooter.getEvenHeaderSubrange(), headerFooter.getOddHeaderSubrange() };
97 handleHeaderFooter(headers, "header", document, pictures, pictureTable, xhtml);
98
99 // Do the main paragraph text
100 Range r = document.getRange();
101 for(int i=0; i<r.numParagraphs(); i++) {
102 Paragraph p = r.getParagraph(i);
103 i += handleParagraph(p, 0, r, document, FieldsDocumentPart.MAIN, pictures, pictureTable, xhtml);
104 }
105
106 // Do everything else
107 for (String paragraph: wordExtractor.getMainTextboxText()) {
108 xhtml.element("p", paragraph);
109 }
110
111 for (String paragraph : wordExtractor.getFootnoteText()) {
112 xhtml.element("p", paragraph);
113 }
114
115 for (String paragraph : wordExtractor.getCommentsText()) {
116 xhtml.element("p", paragraph);
117 }
118
119 for (String paragraph : wordExtractor.getEndnoteText()) {
120 xhtml.element("p", paragraph);
121 }
122
123 // Do any footers, if present
124 Range[] footers = new Range[] { headerFooter.getFirstFooterSubrange(),
125 headerFooter.getEvenFooterSubrange(), headerFooter.getOddFooterSubrange() };
126 handleHeaderFooter(footers, "footer", document, pictures, pictureTable, xhtml);
127
128 // Handle any pictures that we haven't output yet
129 for(Picture p = pictures.nextUnclaimed(); p != null; ) {
130 handlePictureCharacterRun(
131 null, p, pictures, xhtml
132 );
133 p = pictures.nextUnclaimed();
134 }
135
136 // Handle any embeded office documents
137 try {
138 DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool");
139 for (Entry entry : op) {
140 if (entry.getName().startsWith("_")
141 && entry instanceof DirectoryEntry) {
142 handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml);
143 }
144 }
145 } catch(FileNotFoundException e) {
146 }
147 }
148
149 private static int countParagraphs(Range... ranges) {
150 int count = 0;
151 for (Range r : ranges) {
152 if (r != null) { count += r.numParagraphs(); }
153 }
154 return count;
155 }
156
157 private void handleHeaderFooter(Range[] ranges, String type, HWPFDocument document,
158 PicturesSource pictures, PicturesTable pictureTable, XHTMLContentHandler xhtml)
159 throws SAXException, IOException, TikaException {
160 if (countParagraphs(ranges) > 0) {
161 xhtml.startElement("div", "class", type);
162 for (Range r : ranges) {
163 if (r != null) {
164 for(int i=0; i<r.numParagraphs(); i++) {
165 Paragraph p = r.getParagraph(i);
166
167 String text = p.text();
168 if (text.replaceAll("[\\r\\n\\s]+", "").isEmpty()) {
169 // Skip empty header or footer paragraphs
170 } else {
171 i += handleParagraph(p, 0, r, document,
172 FieldsDocumentPart.HEADER, pictures, pictureTable, xhtml);
173 }
174 }
175 }
176 }
177 xhtml.endElement("div");
178 }
179 }
180
181 private int handleParagraph(Paragraph p, int parentTableLevel, Range r, HWPFDocument document,
182 FieldsDocumentPart docPart, PicturesSource pictures, PicturesTable pictureTable,
183 XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException {
184 // Note - a poi bug means we can't currently properly recurse
185 // into nested tables, so currently we don't
186 if(p.isInTable() && p.getTableLevel() > parentTableLevel && parentTableLevel==0) {
187 Table t = r.getTable(p);
188 xhtml.startElement("table");
189 xhtml.startElement("tbody");
190 for(int rn=0; rn<t.numRows(); rn++) {
191 TableRow row = t.getRow(rn);
192 xhtml.startElement("tr");
193 for(int cn=0; cn<row.numCells(); cn++) {
194 TableCell cell = row.getCell(cn);
195 xhtml.startElement("td");
196
197 for(int pn=0; pn<cell.numParagraphs(); pn++) {
198 Paragraph cellP = cell.getParagraph(pn);
199 handleParagraph(cellP, p.getTableLevel(), cell, document, docPart, pictures, pictureTable, xhtml);
200 }
201 xhtml.endElement("td");
202 }
203 xhtml.endElement("tr");
204 }
205 xhtml.endElement("tbody");
206 xhtml.endElement("table");
207 return (t.numParagraphs()-1);
208 }
209
210 TagAndStyle tas;
211
212 if (document.getStyleSheet().numStyles()>p.getStyleIndex()) {
213 StyleDescription style =
214 document.getStyleSheet().getStyleDescription(p.getStyleIndex());
215 if (style != null && style.getName() != null && style.getName().length() > 0) {
216 tas = buildParagraphTagAndStyle(style.getName(), (parentTableLevel>0));
217 } else {
218 tas = new TagAndStyle("p", null);
219 }
220 } else {
221 tas = new TagAndStyle("p", null);
222 }
223
224 if(tas.getStyleClass() != null) {
225 xhtml.startElement(tas.getTag(), "class", tas.getStyleClass());
226 } else {
227 xhtml.startElement(tas.getTag());
228 }
229
230 for(int j=0; j<p.numCharacterRuns(); j++) {
231 CharacterRun cr = p.getCharacterRun(j);
232
233 // FIELD_BEGIN_MARK:
234 if (cr.text().getBytes()[0] == 0x13) {
235 Field field = document.getFields().getFieldByStartOffset(docPart, cr.getStartOffset());
236 // 58 is an embedded document
237 // 56 is a document link
238 if (field != null && (field.getType() == 58 || field.getType() == 56)) {
239 // Embedded Object: add a <div
240 // class="embedded" id="_X"/> so consumer can see where
241 // in the main text each embedded document
242 // occurred:
243 String id = "_" + field.getMarkSeparatorCharacterRun(r).getPicOffset();
244 AttributesImpl attributes = new AttributesImpl();
245 attributes.addAttribute("", "class", "class", "CDATA", "embedded");
246 attributes.addAttribute("", "id", "id", "CDATA", id);
247 xhtml.startElement("div", attributes);
248 xhtml.endElement("div");
249 }
250 }
251
252 if(cr.text().equals("\u0013")) {
253 j += handleSpecialCharacterRuns(p, j, tas.isHeading(), pictures, xhtml);
254 } else if(cr.text().startsWith("\u0008")) {
255 // Floating Picture(s)
256 for(int pn=0; pn<cr.text().length(); pn++) {
257 // Assume they're in the order from the unclaimed list...
258 Picture picture = pictures.nextUnclaimed();
259
260 // Output
261 handlePictureCharacterRun(cr, picture, pictures, xhtml);
262 }
263 } else if(pictureTable.hasPicture(cr)) {
264 // Inline Picture
265 Picture picture = pictures.getFor(cr);
266 handlePictureCharacterRun(cr, picture, pictures, xhtml);
267 } else {
268 handleCharacterRun(cr, tas.isHeading(), xhtml);
269 }
270 }
271
272 // Close any still open style tags
273 if (curStrikeThrough) {
274 xhtml.endElement("s");
275 curStrikeThrough = false;
276 }
277 if (curItalic) {
278 xhtml.endElement("i");
279 curItalic = false;
280 }
281 if (curBold) {
282 xhtml.endElement("b");
283 curBold = false;
284 }
285
286 xhtml.endElement(tas.getTag());
287
288 return 0;
289 }
290
291 private void handleCharacterRun(CharacterRun cr, boolean skipStyling, XHTMLContentHandler xhtml)
292 throws SAXException {
293 // Skip trailing newlines
294 if(!isRendered(cr) || cr.text().equals("\r"))
295 return;
296
297 if(!skipStyling) {
298 if (cr.isBold() != curBold) {
299 // Enforce nesting -- must close s and i tags
300 if (curStrikeThrough) {
301 xhtml.endElement("s");
302 curStrikeThrough = false;
303 }
304 if (curItalic) {
305 xhtml.endElement("i");
306 curItalic = false;
307 }
308 if (cr.isBold()) {
309 xhtml.startElement("b");
310 } else {
311 xhtml.endElement("b");
312 }
313 curBold = cr.isBold();
314 }
315
316 if (cr.isItalic() != curItalic) {
317 // Enforce nesting -- must close s tag
318 if (curStrikeThrough) {
319 xhtml.endElement("s");
320 curStrikeThrough = false;
321 }
322 if (cr.isItalic()) {
323 xhtml.startElement("i");
324 } else {
325 xhtml.endElement("i");
326 }
327 curItalic = cr.isItalic();
328 }
329
330 if (cr.isStrikeThrough() != curStrikeThrough) {
331 if (cr.isStrikeThrough()) {
332 xhtml.startElement("s");
333 } else {
334 xhtml.endElement("s");
335 }
336 curStrikeThrough = cr.isStrikeThrough();
337 }
338 }
339
340 // Clean up the text
341 String text = cr.text();
342 text = text.replace('\r', '\n');
343 if(text.endsWith("\u0007")) {
344 // Strip the table cell end marker
345 text = text.substring(0, text.length()-1);
346 }
347
348 // Copied from POI's org/apache/poi/hwpf/converter/AbstractWordConverter.processCharacters:
349
350 // line tabulator as break line
351 text = text.replace((char)0x000b,'\n');
352
353 // Non-breaking hyphens are returned as char 30
354 text = text.replace((char) 30, UNICODECHAR_NONBREAKING_HYPHEN);
355
356 // Non-required hyphens to zero-width space
357 text = text.replace((char) 31, UNICODECHAR_ZERO_WIDTH_SPACE);
358
359 xhtml.characters(text);
360 }
361 /**
362 * Can be \13..text..\15 or \13..control..\14..text..\15 .
363 * Nesting is allowed
364 */
365 private int handleSpecialCharacterRuns(Paragraph p, int index, boolean skipStyling,
366 PicturesSource pictures, XHTMLContentHandler xhtml) throws SAXException, TikaException, IOException {
367 List<CharacterRun> controls = new ArrayList<CharacterRun>();
368 List<CharacterRun> texts = new ArrayList<CharacterRun>();
369 boolean has14 = false;
370
371 // Split it into before and after the 14
372 int i;
373 for(i=index+1; i<p.numCharacterRuns(); i++) {
374 CharacterRun cr = p.getCharacterRun(i);
375 if(cr.text().equals("\u0013")) {
376 // Nested, oh joy...
377 int increment = handleSpecialCharacterRuns(p, i+1, skipStyling, pictures, xhtml);
378 i += increment;
379 } else if(cr.text().equals("\u0014")) {
380 has14 = true;
381 } else if(cr.text().equals("\u0015")) {
382 if(!has14) {
383 texts = controls;
384 controls = new ArrayList<CharacterRun>();
385 }
386 break;
387 } else {
388 if(has14) {
389 texts.add(cr);
390 } else {
391 controls.add(cr);
392 }
393 }
394 }
395
396 // Do we need to do something special with this?
397 if(controls.size() > 0) {
398 String text = controls.get(0).text();
399 for(int j=1; j<controls.size(); j++) {
400 text += controls.get(j).text();
401 }
402
403 if((text.startsWith("HYPERLINK") || text.startsWith(" HYPERLINK"))
404 && text.indexOf('"') > -1) {
405 String url = text.substring(
406 text.indexOf('"') + 1,
407 text.lastIndexOf('"')
408 );
409 xhtml.startElement("a", "href", url);
410 for(CharacterRun cr : texts) {
411 handleCharacterRun(cr, skipStyling, xhtml);
412 }
413 xhtml.endElement("a");
414 } else {
415 // Just output the text ones
416 for(CharacterRun cr : texts) {
417 if(pictures.hasPicture(cr)) {
418 Picture picture = pictures.getFor(cr);
419 handlePictureCharacterRun(cr, picture, pictures, xhtml);
420 } else {
421 handleCharacterRun(cr, skipStyling, xhtml);
422 }
423 }
424 }
425 } else {
426 // We only had text
427 // Output as-is
428 for(CharacterRun cr : texts) {
429 handleCharacterRun(cr, skipStyling, xhtml);
430 }
431 }
432
433 // Tell them how many to skip over
434 return i-index;
435 }
436
437 private void handlePictureCharacterRun(CharacterRun cr, Picture picture, PicturesSource pictures, XHTMLContentHandler xhtml)
438 throws SAXException, IOException, TikaException {
439 if(!isRendered(cr) || picture == null) {
440 // Oh dear, we've run out...
441 // Probably caused by multiple \u0008 images referencing
442 // the same real image
443 return;
444 }
445
446 // Which one is it?
447 String extension = picture.suggestFileExtension();
448 int pictureNumber = pictures.pictureNumber(picture);
449
450 // Make up a name for the picture
451 // There isn't one in the file, but we need to be able to reference
452 // the picture from the img tag and the embedded resource
453 String filename = "image"+pictureNumber+(extension.length()>0 ? "."+extension : "");
454
455 // Grab the mime type for the picture
456 String mimeType = picture.getMimeType();
457
458 // Output the img tag
459 AttributesImpl attr = new AttributesImpl();
460 attr.addAttribute("", "src", "src", "CDATA", "embedded:" + filename);
461 attr.addAttribute("", "alt", "alt", "CDATA", filename);
462 xhtml.startElement("img", attr);
463 xhtml.endElement("img");
464
465 // Have we already output this one?
466 // (Only expose each individual image once)
467 if(! pictures.hasOutput(picture)) {
468 TikaInputStream stream = TikaInputStream.get(picture.getContent());
469 handleEmbeddedResource(stream, filename, null, mimeType, xhtml, false);
470 pictures.recordOutput(picture);
471 }
472 }
473
474 /**
475 * Outputs a section of text if the given text is non-empty.
476 *
477 * @param xhtml XHTML content handler
478 * @param section the class of the &lt;div/&gt; section emitted
479 * @param text text to be emitted, if any
480 * @throws SAXException if an error occurs
481 */
482 private void addTextIfAny(
483 XHTMLContentHandler xhtml, String section, String text)
484 throws SAXException {
485 if (text != null && text.length() > 0) {
486 xhtml.startElement("div", "class", section);
487 xhtml.element("p", text);
488 xhtml.endElement("div");
489 }
490 }
491
492 protected void parseWord6(
493 NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
494 throws IOException, SAXException, TikaException {
495 parseWord6(filesystem.getRoot(), xhtml);
496 }
497
498 protected void parseWord6(
499 DirectoryNode root, XHTMLContentHandler xhtml)
500 throws IOException, SAXException, TikaException {
501 HWPFOldDocument doc = new HWPFOldDocument(root);
502 Word6Extractor extractor = new Word6Extractor(doc);
503
504 for(String p : extractor.getParagraphText()) {
505 xhtml.element("p", p);
506 }
507 }
508
509 private static final Map<String,TagAndStyle> fixedParagraphStyles = new HashMap<String,TagAndStyle>();
510 private static final TagAndStyle defaultParagraphStyle = new TagAndStyle("p", null);
511 static {
512 fixedParagraphStyles.put("Default", defaultParagraphStyle);
513 fixedParagraphStyles.put("Normal", defaultParagraphStyle);
514 fixedParagraphStyles.put("heading", new TagAndStyle("h1", null));
515 fixedParagraphStyles.put("Heading", new TagAndStyle("h1", null));
516 fixedParagraphStyles.put("Title", new TagAndStyle("h1", "title"));
517 fixedParagraphStyles.put("Subtitle", new TagAndStyle("h2", "subtitle"));
518 fixedParagraphStyles.put("HTML Preformatted", new TagAndStyle("pre", null));
519 }
520
521 /**
522 * Given a style name, return what tag should be used, and
523 * what style should be applied to it.
524 */
525 public static TagAndStyle buildParagraphTagAndStyle(String styleName, boolean isTable) {
526 TagAndStyle tagAndStyle = fixedParagraphStyles.get(styleName);
527 if (tagAndStyle != null) {
528 return tagAndStyle;
529 }
530
531 if (styleName.equals("Table Contents") && isTable) {
532 return defaultParagraphStyle;
533 }
534
535 String tag = "p";
536 String styleClass = null;
537
538 if(styleName.startsWith("heading") || styleName.startsWith("Heading")) {
539 // "Heading 3" or "Heading2" or "heading 4"
540 int num = 1;
541 try {
542 num = Integer.parseInt(
543 styleName.substring(styleName.length()-1)
544 );
545 } catch(NumberFormatException e) {}
546 // Turn it into a H1 - H6 (H7+ isn't valid!)
547 tag = "h" + Math.min(num, 6);
548 } else {
549 styleClass = styleName.replace(' ', '_');
550 styleClass = styleClass.substring(0,1).toLowerCase() +
551 styleClass.substring(1);
552 }
553
554 return new TagAndStyle(tag,styleClass);
555 }
556
557 public static class TagAndStyle {
558 private String tag;
559 private String styleClass;
560 public TagAndStyle(String tag, String styleClass) {
561 this.tag = tag;
562 this.styleClass = styleClass;
563 }
564 public String getTag() {
565 return tag;
566 }
567 public String getStyleClass() {
568 return styleClass;
569 }
570 public boolean isHeading() {
571 return tag.length()==2 && tag.startsWith("h");
572 }
573 }
574
575 /**
576 * Determines if character run should be included in the extraction.
577 *
578 * @param cr character run.
579 * @return true if character run should be included in extraction.
580 */
581 private boolean isRendered(final CharacterRun cr) {
582 return cr == null || !cr.isMarkedDeleted();
583 }
584
585
586 /**
587 * Provides access to the pictures both by offset, iteration
588 * over the un-claimed, and peeking forward
589 */
590 private static class PicturesSource {
591 private PicturesTable picturesTable;
592 private Set<Picture> output = new HashSet<Picture>();
593 private Map<Integer,Picture> lookup;
594 private List<Picture> nonU1based;
595 private List<Picture> all;
596 private int pn = 0;
597
598 private PicturesSource(HWPFDocument doc) {
599 picturesTable = doc.getPicturesTable();
600 all = picturesTable.getAllPictures();
601
602 // Build the Offset-Picture lookup map
603 lookup = new HashMap<Integer, Picture>();
604 for(Picture p : all) {
605 lookup.put(p.getStartOffset(), p);
606 }
607
608 // Work out which Pictures aren't referenced by
609 // a \u0001 in the main text
610 // These are \u0008 escher floating ones, ones
611 // found outside the normal text, and who
612 // knows what else...
613 nonU1based = new ArrayList<Picture>();
614 nonU1based.addAll(all);
615 Range r = doc.getRange();
616 for(int i=0; i<r.numCharacterRuns(); i++) {
617 CharacterRun cr = r.getCharacterRun(i);
618 if(picturesTable.hasPicture(cr)) {
619 Picture p = getFor(cr);
620 int at = nonU1based.indexOf(p);
621 nonU1based.set(at, null);
622 }
623 }
624 }
625
626 private boolean hasPicture(CharacterRun cr) {
627 return picturesTable.hasPicture(cr);
628 }
629
630 private void recordOutput(Picture picture) {
631 output.add(picture);
632 }
633 private boolean hasOutput(Picture picture) {
634 return output.contains(picture);
635 }
636
637 private int pictureNumber(Picture picture) {
638 return all.indexOf(picture) + 1;
639 }
640
641 private Picture getFor(CharacterRun cr) {
642 return lookup.get(cr.getPicOffset());
643 }
644
645 /**
646 * Return the next unclaimed one, used towards
647 * the end
648 */
649 private Picture nextUnclaimed() {
650 Picture p = null;
651 while(pn < nonU1based.size()) {
652 p = nonU1based.get(pn);
653 pn++;
654 if(p != null) return p;
655 }
656 return null;
657 }
658 }
659 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft.ooxml;
17
18 import java.io.FileNotFoundException;
19 import java.io.IOException;
20 import java.net.URI;
21 import java.util.List;
22
23 import org.apache.poi.POIXMLDocument;
24 import org.apache.poi.POIXMLTextExtractor;
25 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
26 import org.apache.poi.openxml4j.opc.PackagePart;
27 import org.apache.poi.openxml4j.opc.PackageRelationship;
28 import org.apache.poi.openxml4j.opc.TargetMode;
29 import org.apache.poi.poifs.filesystem.DirectoryNode;
30 import org.apache.poi.poifs.filesystem.Ole10Native;
31 import org.apache.poi.poifs.filesystem.Ole10NativeException;
32 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
33 import org.apache.tika.exception.TikaException;
34 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
35 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
36 import org.apache.tika.io.TikaInputStream;
37 import org.apache.tika.metadata.Metadata;
38 import org.apache.tika.parser.ParseContext;
39 import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
40 import org.apache.tika.sax.EmbeddedContentHandler;
41 import org.apache.tika.sax.XHTMLContentHandler;
42 import org.apache.xmlbeans.XmlException;
43 import org.xml.sax.ContentHandler;
44 import org.xml.sax.SAXException;
45
46 /**
47 * Base class for all Tika OOXML extractors.
48 *
49 * Tika extractors decorate POI extractors so that the parsed content of
50 * documents is returned as a sequence of XHTML SAX events. Subclasses must
51 * implement the buildXHTML method {@link #buildXHTML(XHTMLContentHandler)} that
52 * populates the {@link XHTMLContentHandler} object received as parameter.
53 */
54 public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
55 static final String RELATION_AUDIO = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/audio";
56 static final String RELATION_IMAGE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image";
57 static final String RELATION_OLE_OBJECT = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/oleObject";
58 static final String RELATION_PACKAGE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/package";
59
60 private static final String TYPE_OLE_OBJECT =
61 "application/vnd.openxmlformats-officedocument.oleObject";
62
63 protected POIXMLTextExtractor extractor;
64
65 private final EmbeddedDocumentExtractor embeddedExtractor;
66
67 public AbstractOOXMLExtractor(ParseContext context, POIXMLTextExtractor extractor) {
68 this.extractor = extractor;
69
70 EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
71
72 if (ex==null) {
73 embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context);
74 } else {
75 embeddedExtractor = ex;
76 }
77
78 }
79
80 /**
81 * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getDocument()
82 */
83 public POIXMLDocument getDocument() {
84 return extractor.getDocument();
85 }
86
87 /**
88 * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getMetadataExtractor()
89 */
90 public MetadataExtractor getMetadataExtractor() {
91 return new MetadataExtractor(extractor);
92 }
93
94 /**
95 * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getXHTML(org.xml.sax.ContentHandler,
96 * org.apache.tika.metadata.Metadata)
97 */
98 public void getXHTML(
99 ContentHandler handler, Metadata metadata, ParseContext context)
100 throws SAXException, XmlException, IOException, TikaException {
101 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
102 xhtml.startDocument();
103
104 buildXHTML(xhtml);
105
106 // Now do any embedded parts
107 handleEmbeddedParts(handler);
108
109 xhtml.endDocument();
110 }
111
112 protected String getJustFileName(String desc) {
113 int idx = desc.lastIndexOf('/');
114 if (idx != -1) {
115 desc = desc.substring(idx+1);
116 }
117 idx = desc.lastIndexOf('.');
118 if (idx != -1) {
119 desc = desc.substring(0, idx);
120 }
121
122 return desc;
123 }
124
125 private void handleEmbeddedParts(ContentHandler handler)
126 throws TikaException, IOException, SAXException {
127 try {
128 for (PackagePart source : getMainDocumentParts()) {
129 for (PackageRelationship rel : source.getRelationships()) {
130
131 URI sourceURI = rel.getSourceURI();
132 String sourceDesc;
133 if (sourceURI != null) {
134 sourceDesc = getJustFileName(sourceURI.getPath());
135 if (sourceDesc.startsWith("slide")) {
136 sourceDesc += "_";
137 } else {
138 sourceDesc = "";
139 }
140 } else {
141 sourceDesc = "";
142 }
143 if (rel.getTargetMode() == TargetMode.INTERNAL) {
144 PackagePart target;
145
146 try {
147 target = source.getRelatedPart(rel);
148 } catch (IllegalArgumentException ex) {
149 continue;
150 }
151
152 String type = rel.getRelationshipType();
153 if (RELATION_OLE_OBJECT.equals(type)
154 && TYPE_OLE_OBJECT.equals(target.getContentType())) {
155 handleEmbeddedOLE(target, handler, sourceDesc + rel.getId());
156 } else if (RELATION_AUDIO.equals(type)
157 || RELATION_IMAGE.equals(type)
158 || RELATION_PACKAGE.equals(type)
159 || RELATION_OLE_OBJECT.equals(type)) {
160 handleEmbeddedFile(target, handler, sourceDesc + rel.getId());
161 }
162 }
163 }
164 }
165 } catch (InvalidFormatException e) {
166 throw new TikaException("Broken OOXML file", e);
167 }
168 }
169
170 /**
171 * Handles an embedded OLE object in the document
172 */
173 private void handleEmbeddedOLE(PackagePart part, ContentHandler handler, String rel)
174 throws IOException, SAXException {
175 // A POIFSFileSystem needs to be at least 3 blocks big to be valid
176 // TODO: TIKA-1118 Upgrade to POI 4.0 then enable this block of code
177 // if (part.getSize() >= 0 && part.getSize() < 512*3) {
178 // // Too small, skip
179 // return;
180 // }
181
182 // Open the POIFS (OLE2) structure and process
183 POIFSFileSystem fs = new POIFSFileSystem(part.getInputStream());
184 try {
185 Metadata metadata = new Metadata();
186 TikaInputStream stream = null;
187 metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel);
188
189 DirectoryNode root = fs.getRoot();
190 POIFSDocumentType type = POIFSDocumentType.detectType(root);
191
192 if (root.hasEntry("CONTENTS")
193 && root.hasEntry("\u0001Ole")
194 && root.hasEntry("\u0001CompObj")
195 && root.hasEntry("\u0003ObjInfo")) {
196 // TIKA-704: OLE 2.0 embedded non-Office document?
197 stream = TikaInputStream.get(
198 fs.createDocumentInputStream("CONTENTS"));
199 if (embeddedExtractor.shouldParseEmbedded(metadata)) {
200 embeddedExtractor.parseEmbedded(
201 stream, new EmbeddedContentHandler(handler),
202 metadata, false);
203 }
204 } else if (POIFSDocumentType.OLE10_NATIVE == type) {
205 // TIKA-704: OLE 1.0 embedded document
206 Ole10Native ole =
207 Ole10Native.createFromEmbeddedOleObject(fs);
208 metadata.set(Metadata.RESOURCE_NAME_KEY, ole.getLabel());
209 byte[] data = ole.getDataBuffer();
210 if (data != null) {
211 stream = TikaInputStream.get(data);
212 }
213
214 if (stream != null
215 && embeddedExtractor.shouldParseEmbedded(metadata)) {
216 embeddedExtractor.parseEmbedded(
217 stream, new EmbeddedContentHandler(handler),
218 metadata, false);
219 }
220 } else {
221 handleEmbeddedFile(part, handler, rel);
222 }
223 } catch (FileNotFoundException e) {
224 // There was no CONTENTS entry, so skip this part
225 } catch (Ole10NativeException e) {
226 // Could not process an OLE 1.0 entry, so skip this part
227 }
228 }
229
230 /**
231 * Handles an embedded file in the document
232 */
233 protected void handleEmbeddedFile(PackagePart part, ContentHandler handler, String rel)
234 throws SAXException, IOException {
235 Metadata metadata = new Metadata();
236 metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel);
237
238 // Get the name
239 String name = part.getPartName().getName();
240 metadata.set(
241 Metadata.RESOURCE_NAME_KEY,
242 name.substring(name.lastIndexOf('/') + 1));
243
244 // Get the content type
245 metadata.set(
246 Metadata.CONTENT_TYPE, part.getContentType());
247
248 // Call the recursing handler
249 if (embeddedExtractor.shouldParseEmbedded(metadata)) {
250 embeddedExtractor.parseEmbedded(
251 TikaInputStream.get(part.getInputStream()),
252 new EmbeddedContentHandler(handler),
253 metadata, false);
254 }
255 }
256
257 /**
258 * Populates the {@link XHTMLContentHandler} object received as parameter.
259 */
260 protected abstract void buildXHTML(XHTMLContentHandler xhtml)
261 throws SAXException, XmlException, IOException;
262
263 /**
264 * Return a list of the main parts of the document, used
265 * when searching for embedded resources.
266 * This should be all the parts of the document that end
267 * up with things embedded into them.
268 */
269 protected abstract List<PackagePart> getMainDocumentParts()
270 throws TikaException;
271 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft.ooxml;
17
18 import java.math.BigDecimal;
19 import java.util.Date;
20
21 import org.apache.poi.POIXMLTextExtractor;
22 import org.apache.poi.POIXMLProperties.CoreProperties;
23 import org.apache.poi.POIXMLProperties.CustomProperties;
24 import org.apache.poi.POIXMLProperties.ExtendedProperties;
25 import org.apache.poi.openxml4j.opc.internal.PackagePropertiesPart;
26 import org.apache.poi.openxml4j.util.Nullable;
27 import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
28 import org.apache.tika.exception.TikaException;
29 import org.apache.tika.metadata.MSOffice;
30 import org.apache.tika.metadata.Metadata;
31 import org.apache.tika.metadata.Office;
32 import org.apache.tika.metadata.OfficeOpenXMLCore;
33 import org.apache.tika.metadata.OfficeOpenXMLExtended;
34 import org.apache.tika.metadata.PagedText;
35 import org.apache.tika.metadata.Property;
36 import org.apache.tika.metadata.TikaCoreProperties;
37 import org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperty;
38 import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
39
40 /**
41 * OOXML metadata extractor.
42 *
43 * Currently POI doesn't support metadata extraction for OOXML.
44 *
45 * @see OOXMLExtractor#getMetadataExtractor()
46 */
47 public class MetadataExtractor {
48
49 private final POIXMLTextExtractor extractor;
50
51 public MetadataExtractor(POIXMLTextExtractor extractor) {
52 this.extractor = extractor;
53 }
54
55 public void extract(Metadata metadata) throws TikaException {
56 if (extractor.getDocument() != null ||
57 (extractor instanceof XSSFEventBasedExcelExtractor &&
58 extractor.getPackage() != null)) {
59 extractMetadata(extractor.getCoreProperties(), metadata);
60 extractMetadata(extractor.getExtendedProperties(), metadata);
61 extractMetadata(extractor.getCustomProperties(), metadata);
62 }
63 }
64
65 private void extractMetadata(CoreProperties properties, Metadata metadata) {
66 PackagePropertiesPart propsHolder = properties
67 .getUnderlyingProperties();
68
69 addProperty(metadata, OfficeOpenXMLCore.CATEGORY, propsHolder.getCategoryProperty());
70 addProperty(metadata, OfficeOpenXMLCore.CONTENT_STATUS, propsHolder
71 .getContentStatusProperty());
72 addProperty(metadata, TikaCoreProperties.CREATED, propsHolder
73 .getCreatedProperty());
74 addProperty(metadata, TikaCoreProperties.CREATOR, propsHolder
75 .getCreatorProperty());
76 addProperty(metadata, TikaCoreProperties.DESCRIPTION, propsHolder
77 .getDescriptionProperty());
78 addProperty(metadata, TikaCoreProperties.IDENTIFIER, propsHolder
79 .getIdentifierProperty());
80 addProperty(metadata, TikaCoreProperties.KEYWORDS, propsHolder
81 .getKeywordsProperty());
82 addProperty(metadata, TikaCoreProperties.LANGUAGE, propsHolder
83 .getLanguageProperty());
84 addProperty(metadata, TikaCoreProperties.MODIFIER, propsHolder
85 .getLastModifiedByProperty());
86 addProperty(metadata, TikaCoreProperties.PRINT_DATE, propsHolder
87 .getLastPrintedProperty());
88 addProperty(metadata, Metadata.LAST_MODIFIED, propsHolder
89 .getModifiedProperty());
90 addProperty(metadata, TikaCoreProperties.MODIFIED, propsHolder
91 .getModifiedProperty());
92 addProperty(metadata, OfficeOpenXMLCore.REVISION, propsHolder
93 .getRevisionProperty());
94 // TODO: Move to OO subject in Tika 2.0
95 addProperty(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT,
96 propsHolder.getSubjectProperty());
97 addProperty(metadata, TikaCoreProperties.TITLE, propsHolder.getTitleProperty());
98 addProperty(metadata, OfficeOpenXMLCore.VERSION, propsHolder.getVersionProperty());
99
100 // Legacy Tika-1.0 style stats
101 // TODO Remove these in Tika 2.0
102 addProperty(metadata, Metadata.CATEGORY, propsHolder.getCategoryProperty());
103 addProperty(metadata, Metadata.CONTENT_STATUS, propsHolder
104 .getContentStatusProperty());
105 addProperty(metadata, Metadata.REVISION_NUMBER, propsHolder
106 .getRevisionProperty());
107 addProperty(metadata, Metadata.VERSION, propsHolder.getVersionProperty());
108 }
109
110 private void extractMetadata(ExtendedProperties properties,
111 Metadata metadata) {
112 CTProperties propsHolder = properties.getUnderlyingProperties();
113
114 addProperty(metadata, OfficeOpenXMLExtended.APPLICATION, propsHolder.getApplication());
115 addProperty(metadata, OfficeOpenXMLExtended.APP_VERSION, propsHolder.getAppVersion());
116 addProperty(metadata, TikaCoreProperties.PUBLISHER, propsHolder.getCompany());
117 addProperty(metadata, OfficeOpenXMLExtended.COMPANY, propsHolder.getCompany());
118 addProperty(metadata, OfficeOpenXMLExtended.MANAGER, propsHolder.getManager());
119 addProperty(metadata, OfficeOpenXMLExtended.NOTES, propsHolder.getNotes());
120 addProperty(metadata, OfficeOpenXMLExtended.PRESENTATION_FORMAT, propsHolder.getPresentationFormat());
121 addProperty(metadata, OfficeOpenXMLExtended.TEMPLATE, propsHolder.getTemplate());
122 addProperty(metadata, OfficeOpenXMLExtended.TOTAL_TIME, propsHolder.getTotalTime());
123
124 if (propsHolder.getPages() > 0) {
125 metadata.set(PagedText.N_PAGES, propsHolder.getPages());
126 } else if (propsHolder.getSlides() > 0) {
127 metadata.set(PagedText.N_PAGES, propsHolder.getSlides());
128 }
129
130 // Process the document statistics
131 addProperty(metadata, Office.PAGE_COUNT, propsHolder.getPages());
132 addProperty(metadata, Office.SLIDE_COUNT, propsHolder.getSlides());
133 addProperty(metadata, Office.PARAGRAPH_COUNT, propsHolder.getParagraphs());
134 addProperty(metadata, Office.LINE_COUNT, propsHolder.getLines());
135 addProperty(metadata, Office.WORD_COUNT, propsHolder.getWords());
136 addProperty(metadata, Office.CHARACTER_COUNT, propsHolder.getCharacters());
137 addProperty(metadata, Office.CHARACTER_COUNT_WITH_SPACES, propsHolder.getCharactersWithSpaces());
138
139 // Legacy Tika-1.0 style stats
140 // TODO Remove these in Tika 2.0
141 addProperty(metadata, Metadata.APPLICATION_NAME, propsHolder.getApplication());
142 addProperty(metadata, Metadata.APPLICATION_VERSION, propsHolder.getAppVersion());
143 addProperty(metadata, Metadata.MANAGER, propsHolder.getManager());
144 addProperty(metadata, Metadata.NOTES, propsHolder.getNotes());
145 addProperty(metadata, Metadata.PRESENTATION_FORMAT, propsHolder.getPresentationFormat());
146 addProperty(metadata, Metadata.TEMPLATE, propsHolder.getTemplate());
147 addProperty(metadata, Metadata.TOTAL_TIME, propsHolder.getTotalTime());
148 addProperty(metadata, MSOffice.PAGE_COUNT, propsHolder.getPages());
149 addProperty(metadata, MSOffice.SLIDE_COUNT, propsHolder.getSlides());
150 addProperty(metadata, MSOffice.PARAGRAPH_COUNT, propsHolder.getParagraphs());
151 addProperty(metadata, MSOffice.LINE_COUNT, propsHolder.getLines());
152 addProperty(metadata, MSOffice.WORD_COUNT, propsHolder.getWords());
153 addProperty(metadata, MSOffice.CHARACTER_COUNT, propsHolder.getCharacters());
154 addProperty(metadata, MSOffice.CHARACTER_COUNT_WITH_SPACES, propsHolder.getCharactersWithSpaces());
155 }
156
157 private void extractMetadata(CustomProperties properties,
158 Metadata metadata) {
159 org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperties
160 props = properties.getUnderlyingProperties();
161
162 for(CTProperty property : props.getPropertyList()) {
163 String val = null;
164 Date date = null;
165
166 if (property.isSetLpwstr()) {
167 val = property.getLpwstr();
168 }
169 else if (property.isSetLpstr()) {
170 val = property.getLpstr();
171 }
172 else if (property.isSetDate()) {
173 date = property.getDate().getTime();
174 }
175 else if (property.isSetFiletime()) {
176 date = property.getFiletime().getTime();
177 }
178
179 else if (property.isSetBool()) {
180 val = Boolean.toString( property.getBool() );
181 }
182
183 // Integers
184 else if (property.isSetI1()) {
185 val = Integer.toString(property.getI1());
186 }
187 else if (property.isSetI2()) {
188 val = Integer.toString(property.getI2());
189 }
190 else if (property.isSetI4()) {
191 val = Integer.toString(property.getI4());
192 }
193 else if (property.isSetI8()) {
194 val = Long.toString(property.getI8());
195 }
196 else if (property.isSetInt()) {
197 val = Integer.toString( property.getInt() );
198 }
199
200 // Unsigned Integers
201 else if (property.isSetUi1()) {
202 val = Integer.toString(property.getUi1());
203 }
204 else if (property.isSetUi2()) {
205 val = Integer.toString(property.getUi2());
206 }
207 else if (property.isSetUi4()) {
208 val = Long.toString(property.getUi4());
209 }
210 else if (property.isSetUi8()) {
211 val = property.getUi8().toString();
212 }
213 else if (property.isSetUint()) {
214 val = Long.toString(property.getUint());
215 }
216
217 // Reals
218 else if (property.isSetR4()) {
219 val = Float.toString( property.getR4() );
220 }
221 else if (property.isSetR8()) {
222 val = Double.toString( property.getR8() );
223 }
224 else if (property.isSetDecimal()) {
225 BigDecimal d = property.getDecimal();
226 if (d == null) {
227 val = null;
228 } else {
229 val = d.toPlainString();
230 }
231 }
232
233 else if (property.isSetArray()) {
234 // TODO Fetch the array values and output
235 }
236 else if (property.isSetVector()) {
237 // TODO Fetch the vector values and output
238 }
239
240 else if (property.isSetBlob() || property.isSetOblob()) {
241 // TODO Decode, if possible
242 }
243 else if (property.isSetStream() || property.isSetOstream() ||
244 property.isSetVstream()) {
245 // TODO Decode, if possible
246 }
247 else if (property.isSetStorage() || property.isSetOstorage()) {
248 // TODO Decode, if possible
249 }
250
251 else {
252 // This type isn't currently supported yet, skip the property
253 }
254
255 String propName = "custom:" + property.getName();
256 if (date != null) {
257 Property tikaProp = Property.externalDate(propName);
258 metadata.set(tikaProp, date);
259 } else if (val != null) {
260 metadata.set(propName, val);
261 }
262 }
263 }
264
265 private <T> void addProperty(Metadata metadata, Property property, Nullable<T> nullableValue) {
266 T value = nullableValue.getValue();
267 if (value != null) {
268 if (value instanceof Date) {
269 metadata.set(property, (Date) value);
270 } else if (value instanceof String) {
271 metadata.set(property, (String) value);
272 } else if (value instanceof Integer) {
273 metadata.set(property, (Integer) value);
274 } else if (value instanceof Double) {
275 metadata.set(property, (Double) value);
276 }
277 }
278 }
279
280 private void addProperty(Metadata metadata, String name, Nullable<?> value) {
281 if (value.getValue() != null) {
282 addProperty(metadata, name, value.getValue().toString());
283 }
284 }
285
286 private void addProperty(Metadata metadata, Property property, String value) {
287 if (value != null) {
288 metadata.set(property, value);
289 }
290 }
291
292 private void addProperty(Metadata metadata, String name, String value) {
293 if (value != null) {
294 metadata.set(name, value);
295 }
296 }
297
298 private void addProperty(Metadata metadata, Property property, int value) {
299 if (value > 0) {
300 metadata.set(property, value);
301 }
302 }
303
304 private void addProperty(Metadata metadata, String name, int value) {
305 if (value > 0) {
306 metadata.set(name, Integer.toString(value));
307 }
308 }
309 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft.ooxml;
17
18 import java.io.IOException;
19
20 import org.apache.poi.POIXMLDocument;
21 import org.apache.poi.POIXMLTextExtractor;
22 import org.apache.tika.exception.TikaException;
23 import org.apache.tika.metadata.Metadata;
24 import org.apache.tika.parser.ParseContext;
25 import org.apache.xmlbeans.XmlException;
26 import org.xml.sax.ContentHandler;
27 import org.xml.sax.SAXException;
28
29 /**
30 * Interface implemented by all Tika OOXML extractors.
31 *
32 * @see org.apache.poi.POIXMLTextExtractor
33 */
34 public interface OOXMLExtractor {
35
36 /**
37 * Returns the opened document.
38 *
39 * @see POIXMLTextExtractor#getDocument()
40 */
41 POIXMLDocument getDocument();
42
43 /**
44 * {@link POIXMLTextExtractor#getMetadataTextExtractor()} not yet supported
45 * for OOXML by POI.
46 */
47 MetadataExtractor getMetadataExtractor();
48
49 /**
50 * Parses the document into a sequence of XHTML SAX events sent to the
51 * given content handler.
52 */
53 void getXHTML(ContentHandler handler, Metadata metadata, ParseContext context)
54 throws SAXException, XmlException, IOException, TikaException;
55 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft.ooxml;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.util.Locale;
21
22 import org.apache.poi.POIXMLDocument;
23 import org.apache.poi.POIXMLTextExtractor;
24 import org.apache.poi.extractor.ExtractorFactory;
25 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
26 import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
27 import org.apache.poi.openxml4j.opc.OPCPackage;
28 import org.apache.poi.openxml4j.opc.PackageAccess;
29 import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
30 import org.apache.poi.xslf.usermodel.XMLSlideShow;
31 import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
32 import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
33 import org.apache.poi.xwpf.usermodel.XWPFDocument;
34 import org.apache.tika.exception.TikaException;
35 import org.apache.tika.io.CloseShieldInputStream;
36 import org.apache.tika.io.TikaInputStream;
37 import org.apache.tika.metadata.Metadata;
38 import org.apache.tika.mime.MediaType;
39 import org.apache.tika.parser.EmptyParser;
40 import org.apache.tika.parser.ParseContext;
41 import org.apache.tika.parser.pkg.ZipContainerDetector;
42 import org.apache.xmlbeans.XmlException;
43 import org.xml.sax.ContentHandler;
44 import org.xml.sax.SAXException;
45
46 /**
47 * Figures out the correct {@link OOXMLExtractor} for the supplied document and
48 * returns it.
49 */
50 public class OOXMLExtractorFactory {
51
52 public static void parse(
53 InputStream stream, ContentHandler baseHandler,
54 Metadata metadata, ParseContext context)
55 throws IOException, SAXException, TikaException {
56 Locale locale = context.get(Locale.class, Locale.getDefault());
57 ExtractorFactory.setThreadPrefersEventExtractors(true);
58
59 try {
60 OOXMLExtractor extractor;
61 OPCPackage pkg;
62
63 // Locate or Open the OPCPackage for the file
64 TikaInputStream tis = TikaInputStream.cast(stream);
65 if (tis != null && tis.getOpenContainer() instanceof OPCPackage) {
66 pkg = (OPCPackage) tis.getOpenContainer();
67 } else if (tis != null && tis.hasFile()) {
68 pkg = OPCPackage.open( tis.getFile().getPath(), PackageAccess.READ );
69 tis.setOpenContainer(pkg);
70 } else {
71 InputStream shield = new CloseShieldInputStream(stream);
72 pkg = OPCPackage.open(shield);
73 }
74
75 // Get the type, and ensure it's one we handle
76 MediaType type = ZipContainerDetector.detectOfficeOpenXML(pkg);
77 if (type == null || OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(type)) {
78 // Not a supported type, delegate to Empty Parser
79 EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context);
80 return;
81 }
82 metadata.set(Metadata.CONTENT_TYPE, type.toString());
83
84 // Have the appropriate OOXML text extractor picked
85 POIXMLTextExtractor poiExtractor = ExtractorFactory.createExtractor(pkg);
86
87 POIXMLDocument document = poiExtractor.getDocument();
88 if (poiExtractor instanceof XSSFEventBasedExcelExtractor) {
89 extractor = new XSSFExcelExtractorDecorator(
90 context, (XSSFEventBasedExcelExtractor)poiExtractor, locale);
91 } else if (document == null) {
92 throw new TikaException(
93 "Expecting UserModel based POI OOXML extractor with a document, but none found. " +
94 "The extractor returned was a " + poiExtractor
95 );
96 } else if (document instanceof XMLSlideShow) {
97 extractor = new XSLFPowerPointExtractorDecorator(
98 context, (XSLFPowerPointExtractor) poiExtractor);
99 } else if (document instanceof XWPFDocument) {
100 extractor = new XWPFWordExtractorDecorator(
101 context, (XWPFWordExtractor) poiExtractor);
102 } else {
103 extractor = new POIXMLTextExtractorDecorator(context, poiExtractor);
104 }
105
106 // Get the bulk of the metadata first, so that it's accessible during
107 // parsing if desired by the client (see TIKA-1109)
108 extractor.getMetadataExtractor().extract(metadata);
109
110 // Extract the text, along with any in-document metadata
111 extractor.getXHTML(baseHandler, metadata, context);
112 } catch (IllegalArgumentException e) {
113 if (e.getMessage().startsWith("No supported documents found")) {
114 throw new TikaException(
115 "TIKA-418: RuntimeException while getting content"
116 + " for thmx and xps file types", e);
117 } else {
118 throw new TikaException("Error creating OOXML extractor", e);
119 }
120 } catch (InvalidFormatException e) {
121 throw new TikaException("Error creating OOXML extractor", e);
122 } catch (OpenXML4JException e) {
123 throw new TikaException("Error creating OOXML extractor", e);
124 } catch (XmlException e) {
125 throw new TikaException("Error creating OOXML extractor", e);
126
127 }
128 }
129
130 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft.ooxml;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.util.Arrays;
21 import java.util.Collections;
22 import java.util.HashSet;
23 import java.util.Set;
24
25 import org.apache.tika.exception.TikaException;
26 import org.apache.tika.metadata.Metadata;
27 import org.apache.tika.mime.MediaType;
28 import org.apache.tika.parser.AbstractParser;
29 import org.apache.tika.parser.ParseContext;
30 import org.xml.sax.ContentHandler;
31 import org.xml.sax.SAXException;
32
33 /**
34 * Office Open XML (OOXML) parser.
35 */
36 public class OOXMLParser extends AbstractParser {
37
38 /** Serial version UID */
39 private static final long serialVersionUID = 6535995710857776481L;
40
41 protected static final Set<MediaType> SUPPORTED_TYPES =
42 Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
43 MediaType.application("x-tika-ooxml"),
44 MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation"),
45 MediaType.application("vnd.ms-powerpoint.presentation.macroenabled.12"),
46 MediaType.application("vnd.openxmlformats-officedocument.presentationml.template"),
47 MediaType.application("vnd.openxmlformats-officedocument.presentationml.slideshow"),
48 MediaType.application("vnd.ms-powerpoint.slideshow.macroenabled.12"),
49 MediaType.application("vnd.ms-powerpoint.addin.macroenabled.12"),
50 MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
51 MediaType.application("vnd.ms-excel.sheet.macroenabled.12"),
52 MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.template"),
53 MediaType.application("vnd.ms-excel.template.macroenabled.12"),
54 MediaType.application("vnd.ms-excel.addin.macroenabled.12"),
55 MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document"),
56 MediaType.application("vnd.ms-word.document.macroenabled.12"),
57 MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.template"),
58 MediaType.application("vnd.ms-word.template.macroenabled.12"))));
59
60 /**
61 * We claim to support all OOXML files, but we actually don't support a small
62 * number of them.
63 * This list is used to decline certain formats that are not yet supported
64 * by Tika and/or POI.
65 */
66 protected static final Set<MediaType> UNSUPPORTED_OOXML_TYPES =
67 Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
68 MediaType.application("vnd.ms-excel.sheet.binary.macroenabled.12"),
69 MediaType.application("vnd.ms-xpsdocument")
70 )));
71
72 public Set<MediaType> getSupportedTypes(ParseContext context) {
73 return SUPPORTED_TYPES;
74 }
75
76 public void parse(
77 InputStream stream, ContentHandler handler,
78 Metadata metadata, ParseContext context)
79 throws IOException, SAXException, TikaException {
80 // Have the OOXML file processed
81 OOXMLExtractorFactory.parse(stream, handler, metadata, context);
82 }
83 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft.ooxml;
17
18 import java.util.ArrayList;
19 import java.util.List;
20
21 import org.apache.poi.POIXMLTextExtractor;
22 import org.apache.poi.openxml4j.opc.PackagePart;
23 import org.apache.tika.parser.ParseContext;
24 import org.apache.tika.sax.XHTMLContentHandler;
25 import org.xml.sax.SAXException;
26
27 public class POIXMLTextExtractorDecorator extends AbstractOOXMLExtractor {
28
29 public POIXMLTextExtractorDecorator(ParseContext context, POIXMLTextExtractor extractor) {
30 super(context, extractor);
31 }
32
33 @Override
34 protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException {
35 // extract document content as a single string (not structured)
36 xhtml.element("p", extractor.getText());
37 }
38
39 @Override
40 protected List<PackagePart> getMainDocumentParts() {
41 return new ArrayList<PackagePart>();
42 }
43 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft.ooxml;
17
18 import java.io.IOException;
19 import java.util.ArrayList;
20 import java.util.List;
21 import javax.xml.namespace.QName;
22
23 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
24 import org.apache.poi.openxml4j.opc.PackagePart;
25 import org.apache.poi.openxml4j.opc.PackagePartName;
26 import org.apache.poi.openxml4j.opc.PackageRelationship;
27 import org.apache.poi.openxml4j.opc.PackagingURIHelper;
28 import org.apache.poi.openxml4j.opc.TargetMode;
29 import org.apache.poi.xslf.XSLFSlideShow;
30 import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
31 import org.apache.poi.xslf.usermodel.Placeholder;
32 import org.apache.poi.xslf.usermodel.XMLSlideShow;
33 import org.apache.poi.xslf.usermodel.XSLFComments;
34 import org.apache.poi.xslf.usermodel.XSLFGraphicFrame;
35 import org.apache.poi.xslf.usermodel.XSLFGroupShape;
36 import org.apache.poi.xslf.usermodel.XSLFPictureShape;
37 import org.apache.poi.xslf.usermodel.XSLFRelation;
38 import org.apache.poi.xslf.usermodel.XSLFShape;
39 import org.apache.poi.xslf.usermodel.XSLFSheet;
40 import org.apache.poi.xslf.usermodel.XSLFSlide;
41 import org.apache.poi.xslf.usermodel.XSLFTable;
42 import org.apache.poi.xslf.usermodel.XSLFTableCell;
43 import org.apache.poi.xslf.usermodel.XSLFTableRow;
44 import org.apache.poi.xslf.usermodel.XSLFTextShape;
45 import org.apache.tika.exception.TikaException;
46 import org.apache.tika.parser.ParseContext;
47 import org.apache.tika.sax.XHTMLContentHandler;
48 import org.apache.xmlbeans.XmlException;
49 import org.apache.xmlbeans.XmlObject;
50 import org.openxmlformats.schemas.presentationml.x2006.main.CTComment;
51 import org.openxmlformats.schemas.presentationml.x2006.main.CTPicture;
52 import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
53 import org.xml.sax.SAXException;
54 import org.xml.sax.helpers.AttributesImpl;
55
56 public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
57 public XSLFPowerPointExtractorDecorator(ParseContext context, XSLFPowerPointExtractor extractor) {
58 super(context, extractor);
59 }
60
61 /**
62 * @see org.apache.poi.xslf.extractor.XSLFPowerPointExtractor#getText()
63 */
64 protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException {
65 XMLSlideShow slideShow = (XMLSlideShow) extractor.getDocument();
66
67 XSLFSlide[] slides = slideShow.getSlides();
68 for (XSLFSlide slide : slides) {
69 String slideDesc;
70 if (slide.getPackagePart() != null && slide.getPackagePart().getPartName() != null) {
71 slideDesc = getJustFileName(slide.getPackagePart().getPartName().toString());
72 slideDesc += "_";
73 } else {
74 slideDesc = null;
75 }
76
77 // slide
78 extractContent(slide.getShapes(), false, xhtml, slideDesc);
79
80 // slide layout which is the master sheet for this slide
81 XSLFSheet slideLayout = slide.getMasterSheet();
82 extractContent(slideLayout.getShapes(), true, xhtml, null);
83
84 // slide master which is the master sheet for all text layouts
85 XSLFSheet slideMaster = slideLayout.getMasterSheet();
86 extractContent(slideMaster.getShapes(), true, xhtml, null);
87
88 // notes (if present)
89 XSLFSheet slideNotes = slide.getNotes();
90 if (slideNotes != null) {
91 extractContent(slideNotes.getShapes(), false, xhtml, slideDesc);
92
93 // master sheet for this notes
94 XSLFSheet notesMaster = slideNotes.getMasterSheet();
95 extractContent(notesMaster.getShapes(), true, xhtml, null);
96 }
97
98 // comments (if present)
99 XSLFComments comments = slide.getComments();
100 if (comments != null) {
101 for (CTComment comment : comments.getCTCommentsList().getCmList()) {
102 xhtml.element("p", comment.getText());
103 }
104 }
105 }
106 }
107
108 private void extractContent(XSLFShape[] shapes, boolean skipPlaceholders, XHTMLContentHandler xhtml, String slideDesc)
109 throws SAXException {
110 for (XSLFShape sh : shapes) {
111 if (sh instanceof XSLFTextShape) {
112 XSLFTextShape txt = (XSLFTextShape) sh;
113 Placeholder ph = txt.getTextType();
114 if (skipPlaceholders && ph != null) {
115 continue;
116 }
117 xhtml.element("p", txt.getText());
118 } else if (sh instanceof XSLFGroupShape){
119 // recurse into groups of shapes
120 XSLFGroupShape group = (XSLFGroupShape)sh;
121 extractContent(group.getShapes(), skipPlaceholders, xhtml, slideDesc);
122 } else if (sh instanceof XSLFTable) {
123 XSLFTable tbl = (XSLFTable)sh;
124 for(XSLFTableRow row : tbl){
125 List<XSLFTableCell> cells = row.getCells();
126 extractContent(cells.toArray(new XSLFTableCell[cells.size()]), skipPlaceholders, xhtml, slideDesc);
127 }
128 } else if (sh instanceof XSLFGraphicFrame) {
129 XSLFGraphicFrame frame = (XSLFGraphicFrame) sh;
130 XmlObject[] sp = frame.getXmlObject().selectPath(
131 "declare namespace p='http://schemas.openxmlformats.org/presentationml/2006/main' .//*/p:oleObj");
132 if (sp != null) {
133 for(XmlObject emb : sp) {
134 XmlObject relIDAtt = emb.selectAttribute(new QName("http://schemas.openxmlformats.org/officeDocument/2006/relationships", "id"));
135 if (relIDAtt != null) {
136 String relID = relIDAtt.getDomNode().getNodeValue();
137 if (slideDesc != null) {
138 relID = slideDesc + relID;
139 }
140 AttributesImpl attributes = new AttributesImpl();
141 attributes.addAttribute("", "class", "class", "CDATA", "embedded");
142 attributes.addAttribute("", "id", "id", "CDATA", relID);
143 xhtml.startElement("div", attributes);
144 xhtml.endElement("div");
145 }
146 }
147 }
148 } else if (sh instanceof XSLFPictureShape) {
149 if (!skipPlaceholders && (sh.getXmlObject() instanceof CTPicture)) {
150 CTPicture ctPic = ((CTPicture) sh.getXmlObject());
151 if (ctPic.getBlipFill() != null && ctPic.getBlipFill().getBlip() != null) {
152 String relID = ctPic.getBlipFill().getBlip().getEmbed();
153 if (relID != null) {
154 if (slideDesc != null) {
155 relID = slideDesc + relID;
156 }
157 AttributesImpl attributes = new AttributesImpl();
158 attributes.addAttribute("", "class", "class", "CDATA", "embedded");
159 attributes.addAttribute("", "id", "id", "CDATA", relID);
160 xhtml.startElement("div", attributes);
161 xhtml.endElement("div");
162 }
163 }
164 }
165 }
166 }
167 }
168
169 /**
170 * In PowerPoint files, slides have things embedded in them,
171 * and slide drawings which have the images
172 */
173 @Override
174 protected List<PackagePart> getMainDocumentParts() throws TikaException {
175 List<PackagePart> parts = new ArrayList<PackagePart>();
176 XMLSlideShow slideShow = (XMLSlideShow) extractor.getDocument();
177 XSLFSlideShow document = null;
178 try {
179 document = slideShow._getXSLFSlideShow(); // TODO Avoid this in future
180 } catch(Exception e) {
181 throw new TikaException(e.getMessage()); // Shouldn't happen
182 }
183
184 for (CTSlideIdListEntry ctSlide : document.getSlideReferences().getSldIdList()) {
185 // Add the slide
186 PackagePart slidePart;
187 try {
188 slidePart = document.getSlidePart(ctSlide);
189 } catch(IOException e) {
190 throw new TikaException("Broken OOXML file", e);
191 } catch(XmlException xe) {
192 throw new TikaException("Broken OOXML file", xe);
193 }
194 parts.add(slidePart);
195
196 // If it has drawings, return those too
197 try {
198 for(PackageRelationship rel : slidePart.getRelationshipsByType(XSLFRelation.VML_DRAWING.getRelation())) {
199 if(rel.getTargetMode() == TargetMode.INTERNAL) {
200 PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
201 parts.add( rel.getPackage().getPart(relName) );
202 }
203 }
204 } catch(InvalidFormatException e) {
205 throw new TikaException("Broken OOXML file", e);
206 }
207 }
208
209 return parts;
210 }
211 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft.ooxml;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.util.ArrayList;
21 import java.util.List;
22 import java.util.Locale;
23
24 import javax.xml.parsers.ParserConfigurationException;
25 import javax.xml.parsers.SAXParser;
26 import javax.xml.parsers.SAXParserFactory;
27
28 import org.apache.poi.hssf.extractor.ExcelExtractor;
29 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
30 import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
31 import org.apache.poi.openxml4j.opc.OPCPackage;
32 import org.apache.poi.openxml4j.opc.PackagePart;
33 import org.apache.poi.openxml4j.opc.PackagePartName;
34 import org.apache.poi.openxml4j.opc.PackageRelationship;
35 import org.apache.poi.openxml4j.opc.PackagingURIHelper;
36 import org.apache.poi.openxml4j.opc.TargetMode;
37 import org.apache.poi.ss.usermodel.DataFormatter;
38 import org.apache.poi.ss.usermodel.HeaderFooter;
39 import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
40 import org.apache.poi.xssf.eventusermodel.XSSFReader;
41 import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
42 import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
43 import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
44 import org.apache.poi.xssf.model.CommentsTable;
45 import org.apache.poi.xssf.model.StylesTable;
46 import org.apache.poi.xssf.usermodel.XSSFComment;
47 import org.apache.poi.xssf.usermodel.XSSFRelation;
48 import org.apache.poi.xssf.usermodel.XSSFShape;
49 import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
50 import org.apache.poi.xssf.usermodel.helpers.HeaderFooterHelper;
51 import org.apache.tika.exception.TikaException;
52 import org.apache.tika.metadata.Metadata;
53 import org.apache.tika.metadata.TikaMetadataKeys;
54 import org.apache.tika.parser.ParseContext;
55 import org.apache.tika.sax.XHTMLContentHandler;
56 import org.apache.xmlbeans.XmlException;
57 import org.xml.sax.Attributes;
58 import org.xml.sax.ContentHandler;
59 import org.xml.sax.InputSource;
60 import org.xml.sax.Locator;
61 import org.xml.sax.SAXException;
62 import org.xml.sax.XMLReader;
63
64 public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
65 private final XSSFEventBasedExcelExtractor extractor;
66 private final DataFormatter formatter;
67 private final List<PackagePart> sheetParts = new ArrayList<PackagePart>();
68 private Metadata metadata;
69
/**
 * Wraps the event based Excel extractor, configuring it to report formula
 * results rather than formulas and to use the given locale.
 *
 * @param context   the parse context
 * @param extractor the POI extractor to decorate
 * @param locale    locale for cell formatting; when null the formatter is
 *                  created with its no-argument constructor
 */
public XSSFExcelExtractorDecorator(
        ParseContext context, XSSFEventBasedExcelExtractor extractor, Locale locale) {
    super(context, extractor);

    this.extractor = extractor;
    this.extractor.setFormulasNotResults(false);
    this.extractor.setLocale(locale);

    this.formatter = (locale != null) ? new DataFormatter(locale) : new DataFormatter();
}
84
85 @Override
86 public void getXHTML(
87 ContentHandler handler, Metadata metadata, ParseContext context)
88 throws SAXException, XmlException, IOException, TikaException {
89
90 this.metadata = metadata;
91 metadata.set(TikaMetadataKeys.PROTECTED, "false");
92
93 super.getXHTML(handler, metadata, context);
94 }
95
96 /**
97 * @see org.apache.poi.xssf.extractor.XSSFExcelExtractor#getText()
98 */
99 @Override
100 protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
101 XmlException, IOException {
102 OPCPackage container = extractor.getPackage();
103
104 ReadOnlySharedStringsTable strings;
105 XSSFReader.SheetIterator iter;
106 XSSFReader xssfReader;
107 StylesTable styles;
108 try {
109 xssfReader = new XSSFReader(container);
110 styles = xssfReader.getStylesTable();
111 iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
112 strings = new ReadOnlySharedStringsTable(container);
113 } catch(InvalidFormatException e) {
114 throw new XmlException(e);
115 } catch (OpenXML4JException oe) {
116 throw new XmlException(oe);
117 }
118
119 while (iter.hasNext()) {
120 InputStream stream = iter.next();
121 sheetParts.add(iter.getSheetPart());
122
123 SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(xhtml, iter.getSheetComments());
124
125 // Start, and output the sheet name
126 xhtml.startElement("div");
127 xhtml.element("h1", iter.getSheetName());
128
129 // Extract the main sheet contents
130 xhtml.startElement("table");
131 xhtml.startElement("tbody");
132
133 processSheet(sheetExtractor, styles, strings, stream);
134
135 xhtml.endElement("tbody");
136 xhtml.endElement("table");
137
138 // Output any headers and footers
139 // (Need to process the sheet to get them, so we can't
140 // do the headers before the contents)
141 for(String header : sheetExtractor.headers) {
142 extractHeaderFooter(header, xhtml);
143 }
144 for(String footer : sheetExtractor.footers) {
145 extractHeaderFooter(footer, xhtml);
146 }
147 processShapes(iter.getShapes(), xhtml);
148 // All done with this sheet
149 xhtml.endElement("div");
150 }
151 }
152
153 private void extractHeaderFooter(String hf, XHTMLContentHandler xhtml)
154 throws SAXException {
155 String content = ExcelExtractor._extractHeaderFooter(
156 new HeaderFooterFromString(hf));
157 if (content.length() > 0) {
158 xhtml.element("p", content);
159 }
160 }
161
162 private void processShapes(List<XSSFShape> shapes, XHTMLContentHandler xhtml) throws SAXException {
163 if (shapes == null){
164 return;
165 }
166 for (XSSFShape shape : shapes){
167 if (shape instanceof XSSFSimpleShape){
168 String sText = ((XSSFSimpleShape)shape).getText();
169 if (sText != null && sText.length() > 0){
170 xhtml.element("p", sText);
171 }
172 }
173 }
174 }
175
176 public void processSheet(
177 SheetContentsHandler sheetContentsExtractor,
178 StylesTable styles,
179 ReadOnlySharedStringsTable strings,
180 InputStream sheetInputStream)
181 throws IOException, SAXException {
182 InputSource sheetSource = new InputSource(sheetInputStream);
183 SAXParserFactory saxFactory = SAXParserFactory.newInstance();
184 try {
185 SAXParser saxParser = saxFactory.newSAXParser();
186 XMLReader sheetParser = saxParser.getXMLReader();
187 XSSFSheetInterestingPartsCapturer handler =
188 new XSSFSheetInterestingPartsCapturer(new XSSFSheetXMLHandler(
189 styles, strings, sheetContentsExtractor, formatter, false));
190 sheetParser.setContentHandler(handler);
191 sheetParser.parse(sheetSource);
192 sheetInputStream.close();
193
194 if (handler.hasProtection) {
195 metadata.set(TikaMetadataKeys.PROTECTED, "true");
196 }
197 } catch(ParserConfigurationException e) {
198 throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage());
199 }
200 }
201
202 /**
203 * Turns formatted sheet events into HTML
204 */
205 protected static class SheetTextAsHTML implements SheetContentsHandler {
206 private XHTMLContentHandler xhtml;
207 private CommentsTable comments;
208 private List<String> headers;
209 private List<String> footers;
210
211 protected SheetTextAsHTML(XHTMLContentHandler xhtml, CommentsTable comments) {
212 this.xhtml = xhtml;
213 this.comments = comments;
214 headers = new ArrayList<String>();
215 footers = new ArrayList<String>();
216 }
217
218 public void startRow(int rowNum) {
219 try {
220 xhtml.startElement("tr");
221 } catch(SAXException e) {}
222 }
223
224 public void endRow() {
225 try {
226 xhtml.endElement("tr");
227 } catch(SAXException e) {}
228 }
229
230 public void cell(String cellRef, String formattedValue) {
231 try {
232 xhtml.startElement("td");
233
234 // Main cell contents
235 xhtml.characters(formattedValue);
236
237 // Comments
238 if(comments != null) {
239 XSSFComment comment = comments.findCellComment(cellRef);
240 if(comment != null) {
241 xhtml.startElement("br");
242 xhtml.endElement("br");
243 xhtml.characters(comment.getAuthor());
244 xhtml.characters(": ");
245 xhtml.characters(comment.getString().getString());
246 }
247 }
248
249 xhtml.endElement("td");
250 } catch(SAXException e) {}
251 }
252
253 public void headerFooter(String text, boolean isHeader, String tagName) {
254 if(isHeader) {
255 headers.add(text);
256 } else {
257 footers.add(text);
258 }
259 }
260 }
261
262 /**
263 * Allows access to headers/footers from raw xml strings
264 */
265 private static HeaderFooterHelper hfHelper = new HeaderFooterHelper();
266 protected static class HeaderFooterFromString implements HeaderFooter {
267 private String text;
268 protected HeaderFooterFromString(String text) {
269 this.text = text;
270 }
271
272 public String getCenter() {
273 return hfHelper.getCenterSection(text);
274 }
275 public String getLeft() {
276 return hfHelper.getLeftSection(text);
277 }
278 public String getRight() {
279 return hfHelper.getRightSection(text);
280 }
281
282 public void setCenter(String paramString) {}
283 public void setLeft(String paramString) {}
284 public void setRight(String paramString) {}
285 }
286
287 /**
288 * Captures information on interesting tags, whilst
289 * delegating the main work to the formatting handler
290 */
291 protected static class XSSFSheetInterestingPartsCapturer implements ContentHandler {
292 private ContentHandler delegate;
293 private boolean hasProtection = false;
294
295 protected XSSFSheetInterestingPartsCapturer(ContentHandler delegate) {
296 this.delegate = delegate;
297 }
298
299 public void startElement(String uri, String localName, String qName,
300 Attributes atts) throws SAXException {
301 if("sheetProtection".equals(qName)) {
302 hasProtection = true;
303 }
304 delegate.startElement(uri, localName, qName, atts);
305 }
306
307 public void characters(char[] ch, int start, int length)
308 throws SAXException {
309 delegate.characters(ch, start, length);
310 }
311 public void endDocument() throws SAXException {
312 delegate.endDocument();
313 }
314 public void endElement(String uri, String localName, String qName)
315 throws SAXException {
316 delegate.endElement(uri, localName, qName);
317 }
318 public void endPrefixMapping(String prefix) throws SAXException {
319 delegate.endPrefixMapping(prefix);
320 }
321 public void ignorableWhitespace(char[] ch, int start, int length)
322 throws SAXException {
323 delegate.ignorableWhitespace(ch, start, length);
324 }
325 public void processingInstruction(String target, String data)
326 throws SAXException {
327 delegate.processingInstruction(target, data);
328 }
329 public void setDocumentLocator(Locator locator) {
330 delegate.setDocumentLocator(locator);
331 }
332 public void skippedEntity(String name) throws SAXException {
333 delegate.skippedEntity(name);
334 }
335 public void startDocument() throws SAXException {
336 delegate.startDocument();
337 }
338 public void startPrefixMapping(String prefix, String uri)
339 throws SAXException {
340 delegate.startPrefixMapping(prefix, uri);
341 }
342 }
343
344 /**
345 * In Excel files, sheets have things embedded in them,
346 * and sheet drawings which have the images
347 */
348 @Override
349 protected List<PackagePart> getMainDocumentParts() throws TikaException {
350 List<PackagePart> parts = new ArrayList<PackagePart>();
351 for(PackagePart part : sheetParts) {
352 // Add the sheet
353 parts.add(part);
354
355 // If it has drawings, return those too
356 try {
357 for(PackageRelationship rel : part.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation())) {
358 if(rel.getTargetMode() == TargetMode.INTERNAL) {
359 PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
360 parts.add( rel.getPackage().getPart(relName) );
361 }
362 }
363 for(PackageRelationship rel : part.getRelationshipsByType(XSSFRelation.VML_DRAWINGS.getRelation())) {
364 if(rel.getTargetMode() == TargetMode.INTERNAL) {
365 PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
366 parts.add( rel.getPackage().getPart(relName) );
367 }
368 }
369 } catch(InvalidFormatException e) {
370 throw new TikaException("Broken OOXML file", e);
371 }
372 }
373
374 return parts;
375 }
376 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft.ooxml;
17
18 import java.io.IOException;
19 import java.util.ArrayList;
20 import java.util.List;
21 import javax.xml.namespace.QName;
22
23 import org.apache.poi.openxml4j.opc.PackagePart;
24 import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
25 import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
26 import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
27 import org.apache.poi.xwpf.usermodel.BodyType;
28 import org.apache.poi.xwpf.usermodel.IBody;
29 import org.apache.poi.xwpf.usermodel.IBodyElement;
30 import org.apache.poi.xwpf.usermodel.IRunElement;
31 import org.apache.poi.xwpf.usermodel.XWPFDocument;
32 import org.apache.poi.xwpf.usermodel.XWPFHeaderFooter;
33 import org.apache.poi.xwpf.usermodel.XWPFHyperlink;
34 import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun;
35 import org.apache.poi.xwpf.usermodel.XWPFParagraph;
36 import org.apache.poi.xwpf.usermodel.XWPFPicture;
37 import org.apache.poi.xwpf.usermodel.XWPFPictureData;
38 import org.apache.poi.xwpf.usermodel.XWPFRun;
39 import org.apache.poi.xwpf.usermodel.XWPFSDT;
40 import org.apache.poi.xwpf.usermodel.XWPFSDTContent;
41 import org.apache.poi.xwpf.usermodel.XWPFStyle;
42 import org.apache.poi.xwpf.usermodel.XWPFStyles;
43 import org.apache.poi.xwpf.usermodel.XWPFTable;
44 import org.apache.poi.xwpf.usermodel.XWPFTableCell;
45 import org.apache.poi.xwpf.usermodel.XWPFTableRow;
46 import org.apache.tika.parser.ParseContext;
47 import org.apache.tika.parser.microsoft.WordExtractor.TagAndStyle;
48 import org.apache.tika.parser.microsoft.WordExtractor;
49 import org.apache.tika.sax.XHTMLContentHandler;
50 import org.apache.xmlbeans.XmlCursor;
51 import org.apache.xmlbeans.XmlException;
52 import org.apache.xmlbeans.XmlObject;
53 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBookmark;
54 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTObject;
55 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
56 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
57 import org.xml.sax.SAXException;
58 import org.xml.sax.helpers.AttributesImpl;
59
60 public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
61 private XWPFDocument document;
62 private XWPFStyles styles;
63
64 public XWPFWordExtractorDecorator(ParseContext context, XWPFWordExtractor extractor) {
65 super(context, extractor);
66
67 document = (XWPFDocument) extractor.getDocument();
68 styles = document.getStyles();
69 }
70
71 /**
72 * @see org.apache.poi.xwpf.extractor.XWPFWordExtractor#getText()
73 */
74 @Override
75 protected void buildXHTML(XHTMLContentHandler xhtml)
76 throws SAXException, XmlException, IOException {
77 XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
78
79 // headers
80 if (hfPolicy!=null) {
81 extractHeaders(xhtml, hfPolicy);
82 }
83
84 // process text in the order that it occurs in
85 extractIBodyText(document, xhtml);
86
87 // then all document tables
88 if (hfPolicy!=null) {
89 extractFooters(xhtml, hfPolicy);
90 }
91 }
92
93 private void extractIBodyText(IBody bodyElement, XHTMLContentHandler xhtml)
94 throws SAXException, XmlException, IOException {
95 for(IBodyElement element : bodyElement.getBodyElements()) {
96 if(element instanceof XWPFParagraph) {
97 XWPFParagraph paragraph = (XWPFParagraph)element;
98 extractParagraph(paragraph, xhtml);
99 }
100 if(element instanceof XWPFTable) {
101 XWPFTable table = (XWPFTable)element;
102 extractTable(table, xhtml);
103 }
104 if (element instanceof XWPFSDT){
105 extractSDT((XWPFSDT) element, xhtml);
106 }
107
108 }
109 }
110
111 private void extractSDT(XWPFSDT element, XHTMLContentHandler xhtml) throws SAXException,
112 XmlException, IOException {
113 XWPFSDTContent content = element.getContent();
114 String tag = "p";
115 xhtml.startElement(tag);
116 xhtml.characters(content.getText());
117 xhtml.endElement(tag);
118 }
119
120 private void extractParagraph(XWPFParagraph paragraph, XHTMLContentHandler xhtml)
121 throws SAXException, XmlException, IOException {
122 // If this paragraph is actually a whole new section, then
123 // it could have its own headers and footers
124 // Check and handle if so
125 XWPFHeaderFooterPolicy headerFooterPolicy = null;
126 if (paragraph.getCTP().getPPr() != null) {
127 CTSectPr ctSectPr = paragraph.getCTP().getPPr().getSectPr();
128 if(ctSectPr != null) {
129 headerFooterPolicy =
130 new XWPFHeaderFooterPolicy(document, ctSectPr);
131 extractHeaders(xhtml, headerFooterPolicy);
132 }
133 }
134
135 // Is this a paragraph, or a heading?
136 String tag = "p";
137 String styleClass = null;
138 if(paragraph.getStyleID() != null) {
139 XWPFStyle style = styles.getStyle(
140 paragraph.getStyleID()
141 );
142
143 if (style != null && style.getName() != null) {
144 TagAndStyle tas = WordExtractor.buildParagraphTagAndStyle(
145 style.getName(), paragraph.getPartType() == BodyType.TABLECELL
146 );
147 tag = tas.getTag();
148 styleClass = tas.getStyleClass();
149 }
150 }
151
152 if(styleClass == null) {
153 xhtml.startElement(tag);
154 } else {
155 xhtml.startElement(tag, "class", styleClass);
156 }
157
158 // Output placeholder for any embedded docs:
159
160 // TODO: replace w/ XPath/XQuery:
161 for(XWPFRun run : paragraph.getRuns()) {
162 XmlCursor c = run.getCTR().newCursor();
163 c.selectPath("./*");
164 while (c.toNextSelection()) {
165 XmlObject o = c.getObject();
166 if (o instanceof CTObject) {
167 XmlCursor c2 = o.newCursor();
168 c2.selectPath("./*");
169 while (c2.toNextSelection()) {
170 XmlObject o2 = c2.getObject();
171
172 XmlObject embedAtt = o2.selectAttribute(new QName("Type"));
173 if (embedAtt != null && embedAtt.getDomNode().getNodeValue().equals("Embed")) {
174 // Type is "Embed"
175 XmlObject relIDAtt = o2.selectAttribute(new QName("http://schemas.openxmlformats.org/officeDocument/2006/relationships", "id"));
176 if (relIDAtt != null) {
177 String relID = relIDAtt.getDomNode().getNodeValue();
178 AttributesImpl attributes = new AttributesImpl();
179 attributes.addAttribute("", "class", "class", "CDATA", "embedded");
180 attributes.addAttribute("", "id", "id", "CDATA", relID);
181 xhtml.startElement("div", attributes);
182 xhtml.endElement("div");
183 }
184 }
185 }
186 c2.dispose();
187 }
188 }
189
190 c.dispose();
191 }
192
193 // Attach bookmarks for the paragraph
194 // (In future, we might put them in the right place, for now
195 // we just put them in the correct paragraph)
196 for (CTBookmark bookmark : paragraph.getCTP().getBookmarkStartList()) {
197 xhtml.startElement("a", "name", bookmark.getName());
198 xhtml.endElement("a");
199 }
200
201 TmpFormatting fmtg = new TmpFormatting(false, false);
202
203 // Do the iruns
204 for(IRunElement run : paragraph.getIRuns()) {
205 if (run instanceof XWPFSDT){
206 fmtg = closeStyleTags(xhtml, fmtg);
207 processSDTRun((XWPFSDT)run, xhtml);
208 //for now, we're ignoring formatting in sdt
209 //if you hit an sdt reset to false
210 fmtg.setBold(false);
211 fmtg.setItalic(false);
212 } else {
213 fmtg = processRun((XWPFRun)run, paragraph, xhtml, fmtg);
214 }
215 }
216 closeStyleTags(xhtml, fmtg);
217
218
219 // Now do any comments for the paragraph
220 XWPFCommentsDecorator comments = new XWPFCommentsDecorator(paragraph, null);
221 String commentText = comments.getCommentText();
222 if(commentText != null && commentText.length() > 0) {
223 xhtml.characters(commentText);
224 }
225
226 String footnameText = paragraph.getFootnoteText();
227 if(footnameText != null && footnameText.length() > 0) {
228 xhtml.characters(footnameText + "\n");
229 }
230
231 // Also extract any paragraphs embedded in text boxes:
232 for (XmlObject embeddedParagraph : paragraph.getCTP().selectPath("declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare namespace wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' .//*/wps:txbx/w:txbxContent/w:p")) {
233 extractParagraph(new XWPFParagraph(CTP.Factory.parse(embeddedParagraph.xmlText()), paragraph.getBody()), xhtml);
234 }
235
236 // Finish this paragraph
237 xhtml.endElement(tag);
238
239 if (headerFooterPolicy != null) {
240 extractFooters(xhtml, headerFooterPolicy);
241 }
242 }
243
244 private TmpFormatting closeStyleTags(XHTMLContentHandler xhtml,
245 TmpFormatting fmtg) throws SAXException {
246 // Close any still open style tags
247 if (fmtg.isItalic()) {
248 xhtml.endElement("i");
249 fmtg.setItalic(false);
250 }
251 if (fmtg.isBold()) {
252 xhtml.endElement("b");
253 fmtg.setBold(false);
254 }
255 return fmtg;
256 }
257
258 private TmpFormatting processRun(XWPFRun run, XWPFParagraph paragraph,
259 XHTMLContentHandler xhtml, TmpFormatting tfmtg)
260 throws SAXException, XmlException, IOException{
261 // True if we are currently in the named style tag:
262 if (run.isBold() != tfmtg.isBold()) {
263 if (tfmtg.isItalic()) {
264 xhtml.endElement("i");
265 tfmtg.setItalic(false);
266 }
267 if (run.isBold()) {
268 xhtml.startElement("b");
269 } else {
270 xhtml.endElement("b");
271 }
272 tfmtg.setBold(run.isBold());
273 }
274
275 if (run.isItalic() != tfmtg.isItalic()) {
276 if (run.isItalic()) {
277 xhtml.startElement("i");
278 } else {
279 xhtml.endElement("i");
280 }
281 tfmtg.setItalic(run.isItalic());
282 }
283
284 boolean addedHREF = false;
285 if(run instanceof XWPFHyperlinkRun) {
286 XWPFHyperlinkRun linkRun = (XWPFHyperlinkRun)run;
287 XWPFHyperlink link = linkRun.getHyperlink(document);
288 if(link != null && link.getURL() != null) {
289 xhtml.startElement("a", "href", link.getURL());
290 addedHREF = true;
291 } else if(linkRun.getAnchor() != null && linkRun.getAnchor().length() > 0) {
292 xhtml.startElement("a", "href", "#" + linkRun.getAnchor());
293 addedHREF = true;
294 }
295 }
296
297 xhtml.characters(run.toString());
298
299 // If we have any pictures, output them
300 for(XWPFPicture picture : run.getEmbeddedPictures()) {
301 if(paragraph.getDocument() != null) {
302 XWPFPictureData data = picture.getPictureData();
303 if(data != null) {
304 AttributesImpl attr = new AttributesImpl();
305
306 attr.addAttribute("", "src", "src", "CDATA", "embedded:" + data.getFileName());
307 attr.addAttribute("", "alt", "alt", "CDATA", picture.getDescription());
308
309 xhtml.startElement("img", attr);
310 xhtml.endElement("img");
311 }
312 }
313 }
314
315 if (addedHREF) {
316 xhtml.endElement("a");
317 }
318
319 return tfmtg;
320 }
321
322 private void processSDTRun(XWPFSDT run, XHTMLContentHandler xhtml)
323 throws SAXException, XmlException, IOException{
324 xhtml.characters(run.getContent().getText());
325 }
326
327 private void extractTable(XWPFTable table, XHTMLContentHandler xhtml)
328 throws SAXException, XmlException, IOException {
329 xhtml.startElement("table");
330 xhtml.startElement("tbody");
331 for(XWPFTableRow row : table.getRows()) {
332 xhtml.startElement("tr");
333 for(XWPFTableCell cell : row.getTableCells()) {
334 xhtml.startElement("td");
335 extractIBodyText(cell, xhtml);
336 xhtml.endElement("td");
337 }
338 xhtml.endElement("tr");
339 }
340 xhtml.endElement("tbody");
341 xhtml.endElement("table");
342 }
343
344 private void extractFooters(
345 XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy)
346 throws SAXException, XmlException, IOException {
347 // footers
348 if (hfPolicy.getFirstPageFooter() != null) {
349 extractHeaderText(xhtml, hfPolicy.getFirstPageFooter());
350 }
351 if (hfPolicy.getEvenPageFooter() != null) {
352 extractHeaderText(xhtml, hfPolicy.getEvenPageFooter());
353 }
354 if (hfPolicy.getDefaultFooter() != null) {
355 extractHeaderText(xhtml, hfPolicy.getDefaultFooter());
356 }
357 }
358
359 private void extractHeaders(
360 XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy)
361 throws SAXException, XmlException, IOException {
362 if (hfPolicy == null) return;
363
364 if (hfPolicy.getFirstPageHeader() != null) {
365 extractHeaderText(xhtml, hfPolicy.getFirstPageHeader());
366 }
367
368 if (hfPolicy.getEvenPageHeader() != null) {
369 extractHeaderText(xhtml, hfPolicy.getEvenPageHeader());
370 }
371
372 if (hfPolicy.getDefaultHeader() != null) {
373 extractHeaderText(xhtml, hfPolicy.getDefaultHeader());
374 }
375 }
376
377 private void extractHeaderText(XHTMLContentHandler xhtml, XWPFHeaderFooter header) throws SAXException, XmlException, IOException {
378
379 for (IBodyElement e : header.getBodyElements()){
380 if (e instanceof XWPFParagraph){
381 extractParagraph((XWPFParagraph)e, xhtml);
382 } else if (e instanceof XWPFTable){
383 extractTable((XWPFTable)e, xhtml);
384 } else if (e instanceof XWPFSDT){
385 extractSDT((XWPFSDT)e, xhtml);
386 }
387 }
388 }
389
390 /**
391 * Word documents are simple, they only have the one
392 * main part
393 */
394 @Override
395 protected List<PackagePart> getMainDocumentParts() {
396 List<PackagePart> parts = new ArrayList<PackagePart>();
397 parts.add( document.getPackagePart() );
398 return parts;
399 }
400
401 private class TmpFormatting{
402 private boolean bold = false;
403 private boolean italic = false;
404 private TmpFormatting(boolean bold, boolean italic){
405 this.bold = bold;
406 this.italic = italic;
407 }
408 public boolean isBold() {
409 return bold;
410 }
411 public void setBold(boolean bold) {
412 this.bold = bold;
413 }
414 public boolean isItalic() {
415 return italic;
416 }
417 public void setItalic(boolean italic) {
418 this.italic = italic;
419 }
420
421 }
422
423 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.mp3;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20
21 import org.apache.tika.exception.TikaException;
22 import org.xml.sax.ContentHandler;
23 import org.xml.sax.SAXException;
24
25 /**
26 * An Audio Frame in an MP3 file. These come after the ID3v2 tags in the file.
27 * Currently, only the header is processed, not the raw audio data.
28 */
29 public class AudioFrame implements MP3Frame {
30 /** Constant for the MPEG version 1. */
31 public static final int MPEG_V1 = 3;
32
33 /** Constant for the MPEG version 2. */
34 public static final int MPEG_V2 = 2;
35
36 /** Constant for the MPEG version 2.5. */
37 public static final int MPEG_V2_5 = 0;
38
39 /** Constant for audio layer 1. */
40 public static final int LAYER_1 = 3;
41
42 /** Constant for audio layer 2. */
43 public static final int LAYER_2 = 2;
44
45 /** Constant for audio layer 3. */
46 public static final int LAYER_3 = 1;
47
48 private final String version;
49 private final int versionCode;
50 private final int layer;
51 private final int sampleRate;
52 private final int channels;
53 private final int bitRate;
54 private final int length;
55 private final float duration;
56
57 public String getVersion() {
58 return version;
59 }
60
61 /**
62 * Get the sampling rate, in Hz
63 */
64 public int getSampleRate() {
65 return sampleRate;
66 }
67
68 /**
69 * Get the number of channels (1=mono, 2=stereo)
70 */
71 public int getChannels() {
72 return channels;
73 }
74
75 /**
76 * Get the version code.
77 * @return the version code (one of the {@code MPEG} constants)
78 */
79 public int getVersionCode()
80 {
81 return versionCode;
82 }
83
84 /**
85 * Get the audio layer code.
86 * @return the audio layer (one of the {@code LAYER} constants)
87 */
88 public int getLayer()
89 {
90 return layer;
91 }
92
93 /**
94 * Get the bit rate in bit per second.
95 * @return the bit rate
96 */
97 public int getBitRate()
98 {
99 return bitRate;
100 }
101
102 /**
103 * Returns the frame length in bytes.
104 * @return the frame length
105 */
106 public int getLength()
107 {
108 return length;
109 }
110
111 /**
112 * Returns the duration in milliseconds.
113 * @return the duration
114 */
115 public float getDuration()
116 {
117 return duration;
118 }
119
120 /**
121 * Does this appear to be a 4 byte audio frame header?
122 */
123 public static boolean isAudioHeader(int h1, int h2, int h3, int h4) {
124 if (h1 == -1 || h2 == -1 || h3 == -1 || h4 == -1) {
125 return false;
126 }
127 // Check for the magic 11 bits set at the start
128 // Note - doesn't do a CRC check
129 if (h1 == 0xff && (h2 & 0x60) == 0x60) {
130 return true;
131 }
132 return false;
133 }
134
135 /**
136 * @deprecated Use the constructor which is passed all values directly.
137 */
138 @Deprecated
139 public AudioFrame(InputStream stream, ContentHandler handler)
140 throws IOException, SAXException, TikaException {
141 this(-2, -2, -2, -2, stream);
142 }
143
144 /**
145 * @deprecated Use the constructor which is passed all values directly.
146 */
147 @Deprecated
148 public AudioFrame(int h1, int h2, int h3, int h4, InputStream in)
149 throws IOException {
150 if (h1 == -2 && h2 == -2 && h3 == -2 && h4 == -2) {
151 h1 = in.read();
152 h2 = in.read();
153 h3 = in.read();
154 h4 = in.read();
155 }
156
157 if (isAudioHeader(h1, h2, h3, h4)) {
158 layer = (h2 >> 1) & 0x03;
159 versionCode = (h2 >> 3) & 0x03;
160 version = generateVersionStr(versionCode, layer);
161
162 int rateCode = (h3 >> 2) & 0x03;
163 int rate;
164 switch (rateCode) {
165 case 0:
166 rate = 11025;
167 break;
168 case 1:
169 rate = 12000;
170 break;
171 default:
172 rate = 8000;
173 }
174 if (versionCode == MPEG_V2) {
175 rate *= 2;
176 } else if(versionCode == MPEG_V1) {
177 rate *= 4;
178 }
179 sampleRate = rate;
180
181 int chans = h4 & 0x192;
182 if (chans < 3) {
183 // Stereo, joint stereo, dual channel
184 channels = 2;
185 } else {
186 channels = 1;
187 }
188 bitRate = 0;
189 duration = 0;
190 length = 0;
191 } else {
192 throw new IllegalArgumentException("Magic Audio Frame Header not found");
193 }
194 }
195
196 /**
197 *
198 * Creates a new instance of {@code AudioFrame} and initializes all properties.
199 * @param mpegVersion the code for the MPEG version
200 * @param layer the code for the layer
201 * @param bitRate the bit rate (in bps)
202 * @param sampleRate the sample rate (in samples per second)
203 * @param channels the number of channels
204 * @param length the frame length (in bytes)
205 * @param duration the duration of this frame (in milliseconds)
206 */
207 public AudioFrame(int mpegVersion, int layer, int bitRate, int sampleRate,
208 int channels, int length, float duration) {
209 versionCode = mpegVersion;
210 this.layer = layer;
211 this.bitRate = bitRate;
212 this.sampleRate = sampleRate;
213 this.channels = channels;
214 this.length = length;
215 this.duration = duration;
216 version = generateVersionStr(mpegVersion, layer);
217 }
218
219 /**
220 * Generates a string for the version of this audio frame.
221 * @param version the code for the MPEG version
222 * @param layer the code for the layer
223 * @return a string for the version
224 */
225 private static String generateVersionStr(int version, int layer) {
226 StringBuilder buf = new StringBuilder(64);
227 buf.append("MPEG 3 Layer ");
228 if (layer == LAYER_3) {
229 buf.append("III");
230 } else if (layer == LAYER_2) {
231 buf.append("II");
232 } else if (layer == LAYER_1) {
233 buf.append("I");
234 } else {
235 buf.append("(reserved)");
236 }
237
238 buf.append(" Version ");
239 if (version == MPEG_V2_5) {
240 buf.append("2.5");
241 } else if(version == MPEG_V2) {
242 buf.append("2");
243 } else if(version == MPEG_V1) {
244 buf.append("1");
245 } else {
246 buf.append("(reseved)");
247 }
248
249 return buf.toString();
250 }
251 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.mp3;
17
18 import java.util.Collections;
19 import java.util.List;
20
21 /**
22 * Takes an array of {@link ID3Tags} in preference order, and when asked for
23 * a given tag, will return it from the first {@link ID3Tags} that has it.
24 */
25 public class CompositeTagHandler implements ID3Tags {
26
27 private ID3Tags[] tags;
28
29 public CompositeTagHandler(ID3Tags[] tags) {
30 this.tags = tags;
31 }
32
33 public boolean getTagsPresent() {
34 for (ID3Tags tag : tags) {
35 if (tag.getTagsPresent()) {
36 return true;
37 }
38 }
39 return false;
40 }
41
42 public String getTitle() {
43 for (ID3Tags tag : tags) {
44 if (tag.getTitle() != null) {
45 return tag.getTitle();
46 }
47 }
48 return null;
49 }
50
51 public String getArtist() {
52 for (ID3Tags tag : tags) {
53 if (tag.getArtist() != null) {
54 return tag.getArtist();
55 }
56 }
57 return null;
58 }
59
60 public String getAlbum() {
61 for (ID3Tags tag : tags) {
62 if (tag.getAlbum() != null) {
63 return tag.getAlbum();
64 }
65 }
66 return null;
67 }
68
69 public String getComposer() {
70 for (ID3Tags tag : tags) {
71 if (tag.getComposer() != null) {
72 return tag.getComposer();
73 }
74 }
75 return null;
76 }
77
78 public String getYear() {
79 for (ID3Tags tag : tags) {
80 if (tag.getYear() != null) {
81 return tag.getYear();
82 }
83 }
84 return null;
85 }
86
87 public List<ID3Comment> getComments() {
88 for (ID3Tags tag : tags) {
89 List<ID3Comment> comments = tag.getComments();
90 if (comments != null && comments.size() > 0) {
91 return comments;
92 }
93 }
94 return Collections.emptyList();
95 }
96
97 public String getGenre() {
98 for (ID3Tags tag : tags) {
99 if (tag.getGenre() != null) {
100 return tag.getGenre();
101 }
102 }
103 return null;
104 }
105
106 public String getTrackNumber() {
107 for (ID3Tags tag : tags) {
108 if (tag.getTrackNumber() != null) {
109 return tag.getTrackNumber();
110 }
111 }
112 return null;
113 }
114
115 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.mp3;
17
18 import java.util.List;
19
20
/**
 * Interface that defines the common interface for ID3 tag parsers,
 *  such as ID3v1 and ID3v2.3.
 * Implementations should return NULL if the file lacks a given
 *  tag, or if the tag isn't defined for the version.
 *
 * Note that so far, only the ID3v1 core tags are listed here. In
 *  future, we may wish to add more to cover the extra tags that
 *  our ID3v2 handlers can produce.
 */
public interface ID3Tags {
    /**
     * List of predefined genres, indexed by genre number. The final entry
     * is an empty-string sentinel. The spellings are kept exactly as they
     * appear in this table.
     *
     * @see <a href="http://www.id3.org/id3v2-00">http://www.id3.org/id3v2-00</a>
     */
    String[] GENRES = new String[] {
        /*  0 */ "Blues",
        /*  1 */ "Classic Rock",
        /*  2 */ "Country",
        /*  3 */ "Dance",
        /*  4 */ "Disco",
        /*  5 */ "Funk",
        /*  6 */ "Grunge",
        /*  7 */ "Hip-Hop",
        /*  8 */ "Jazz",
        /*  9 */ "Metal",
        /* 10 */ "New Age",
        /* 11 */ "Oldies",
        /* 12 */ "Other",
        /* 13 */ "Pop",
        /* 14 */ "R&B",
        /* 15 */ "Rap",
        /* 16 */ "Reggae",
        /* 17 */ "Rock",
        /* 18 */ "Techno",
        /* 19 */ "Industrial",
        /* 20 */ "Alternative",
        /* 21 */ "Ska",
        /* 22 */ "Death Metal",
        /* 23 */ "Pranks",
        /* 24 */ "Soundtrack",
        /* 25 */ "Euro-Techno",
        /* 26 */ "Ambient",
        /* 27 */ "Trip-Hop",
        /* 28 */ "Vocal",
        /* 29 */ "Jazz+Funk",
        /* 30 */ "Fusion",
        /* 31 */ "Trance",
        /* 32 */ "Classical",
        /* 33 */ "Instrumental",
        /* 34 */ "Acid",
        /* 35 */ "House",
        /* 36 */ "Game",
        /* 37 */ "Sound Clip",
        /* 38 */ "Gospel",
        /* 39 */ "Noise",
        /* 40 */ "AlternRock",
        /* 41 */ "Bass",
        /* 42 */ "Soul",
        /* 43 */ "Punk",
        /* 44 */ "Space",
        /* 45 */ "Meditative",
        /* 46 */ "Instrumental Pop",
        /* 47 */ "Instrumental Rock",
        /* 48 */ "Ethnic",
        /* 49 */ "Gothic",
        /* 50 */ "Darkwave",
        /* 51 */ "Techno-Industrial",
        /* 52 */ "Electronic",
        /* 53 */ "Pop-Folk",
        /* 54 */ "Eurodance",
        /* 55 */ "Dream",
        /* 56 */ "Southern Rock",
        /* 57 */ "Comedy",
        /* 58 */ "Cult",
        /* 59 */ "Gangsta",
        /* 60 */ "Top 40",
        /* 61 */ "Christian Rap",
        /* 62 */ "Pop/Funk",
        /* 63 */ "Jungle",
        /* 64 */ "Native American",
        /* 65 */ "Cabaret",
        /* 66 */ "New Wave",
        /* 67 */ "Psychadelic",
        /* 68 */ "Rave",
        /* 69 */ "Showtunes",
        /* 70 */ "Trailer",
        /* 71 */ "Lo-Fi",
        /* 72 */ "Tribal",
        /* 73 */ "Acid Punk",
        /* 74 */ "Acid Jazz",
        /* 75 */ "Polka",
        /* 76 */ "Retro",
        /* 77 */ "Musical",
        /* 78 */ "Rock & Roll",
        /* 79 */ "Hard Rock",
        /* 80 */ "Folk",
        /* 81 */ "Folk-Rock",
        /* 82 */ "National Folk",
        /* 83 */ "Swing",
        /* 84 */ "Fast Fusion",
        /* 85 */ "Bebob",
        /* 86 */ "Latin",
        /* 87 */ "Revival",
        /* 88 */ "Celtic",
        /* 89 */ "Bluegrass",
        /* 90 */ "Avantgarde",
        /* 91 */ "Gothic Rock",
        /* 92 */ "Progressive Rock",
        /* 93 */ "Psychedelic Rock",
        /* 94 */ "Symphonic Rock",
        /* 95 */ "Slow Rock",
        /* 96 */ "Big Band",
        /* 97 */ "Chorus",
        /* 98 */ "Easy Listening",
        /* 99 */ "Acoustic",
        /* 100 */ "Humour",
        /* 101 */ "Speech",
        /* 102 */ "Chanson",
        /* 103 */ "Opera",
        /* 104 */ "Chamber Music",
        /* 105 */ "Sonata",
        /* 106 */ "Symphony",
        /* 107 */ "Booty Bass",
        /* 108 */ "Primus",
        /* 109 */ "Porn Groove",
        /* 110 */ "Satire",
        /* 111 */ "Slow Jam",
        /* 112 */ "Club",
        /* 113 */ "Tango",
        /* 114 */ "Samba",
        /* 115 */ "Folklore",
        /* 116 */ "Ballad",
        /* 117 */ "Power Ballad",
        /* 118 */ "Rhythmic Soul",
        /* 119 */ "Freestyle",
        /* 120 */ "Duet",
        /* 121 */ "Punk Rock",
        /* 122 */ "Drum Solo",
        /* 123 */ "A capella",
        /* 124 */ "Euro-House",
        /* 125 */ "Dance Hall",
        /* sentinel */ ""
    };

    /**
     * Does the file contain this kind of tags?
     */
    boolean getTagsPresent();

    String getTitle();

    String getArtist();

    String getAlbum();

    String getComposer();

    /**
     * Retrieves the comments, if any.
     * Files may have more than one comment, but normally only
     *  one with any language/description pair.
     */
    List<ID3Comment> getComments();

    String getGenre();

    String getYear();

    String getTrackNumber();

    /**
     * Represents a comment in ID3 (especially ID3 v2), which is
     *  made up of several parts. Instances are immutable.
     */
    public static class ID3Comment {
        private final String language;
        private final String description;
        private final String text;

        /**
         * Creates an ID3 v1 style comment tag, which has text only;
         * language and description are left {@code null}.
         */
        public ID3Comment(String id3v1Text) {
            this(null, null, id3v1Text);
        }

        /**
         * Creates an ID3 v2 style comment tag
         */
        public ID3Comment(String language, String description, String text) {
            this.language = language;
            this.description = description;
            this.text = text;
        }

        /**
         * Gets the language, if present
         */
        public String getLanguage() {
            return language;
        }

        /**
         * Gets the description, if present
         */
        public String getDescription() {
            return description;
        }

        /**
         * Gets the text, if present
         */
        public String getText() {
            return text;
        }
    }
}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.mp3;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.io.UnsupportedEncodingException;
21 import java.util.Arrays;
22 import java.util.List;
23
24 import org.apache.tika.exception.TikaException;
25 import org.xml.sax.ContentHandler;
26 import org.xml.sax.SAXException;
27
28 /**
29 * This is used to parse ID3 Version 1 Tag information from an MP3 file,
30 * if available.
31 *
32 * @see <a href="http://www.id3.org/ID3v1">MP3 ID3 Version 1 specification</a>
33 */
34 public class ID3v1Handler implements ID3Tags {
35 private String title;
36 private String artist;
37 private String album;
38 private String year;
39 private ID3Comment comment;
40 private String genre;
41 private String trackNumber;
42
43 boolean found = false;
44
45 public ID3v1Handler(InputStream stream, ContentHandler handler)
46 throws IOException, SAXException, TikaException {
47 this(LyricsHandler.getSuffix(stream, 128));
48 }
49
50 /**
51 * Creates from the last 128 bytes of a stream.
52 * @param tagData Must be the last 128 bytes
53 */
54 protected ID3v1Handler(byte[] tagData)
55 throws IOException, SAXException, TikaException {
56 if (tagData.length == 128
57 && tagData[0] == 'T' && tagData[1] == 'A' && tagData[2] == 'G') {
58 found = true;
59
60 title = getString(tagData, 3, 33);
61 artist = getString(tagData, 33, 63);
62 album = getString(tagData, 63, 93);
63 year = getString(tagData, 93, 97);
64
65 String commentStr = getString(tagData, 97, 127);
66 comment = new ID3Comment(commentStr);
67
68 int genreID = (int) tagData[127] & 0xff; // unsigned byte
69 genre = GENRES[Math.min(genreID, GENRES.length - 1)];
70
71 // ID3v1.1 Track addition
72 // If the last two bytes of the comment field are zero and
73 // non-zero, then the last byte is the track number
74 if (tagData[125] == 0 && tagData[126] != 0) {
75 int trackNum = (int) tagData[126] & 0xff;
76 trackNumber = Integer.toString(trackNum);
77 }
78 }
79 }
80
81
82 public boolean getTagsPresent() {
83 return found;
84 }
85
86 public String getTitle() {
87 return title;
88 }
89
90 public String getArtist() {
91 return artist;
92 }
93
94 public String getAlbum() {
95 return album;
96 }
97
98 public String getYear() {
99 return year;
100 }
101
102 public List<ID3Comment> getComments() {
103 return Arrays.asList(new ID3Comment[] {comment});
104 }
105
106 public String getGenre() {
107 return genre;
108 }
109
110 public String getTrackNumber() {
111 return trackNumber;
112 }
113
114 /**
115 * ID3v1 doesn't have composers,
116 * so returns null;
117 */
118 public String getComposer() {
119 return null;
120 }
121
122 /**
123 * Returns the identified ISO-8859-1 substring from the given byte buffer.
124 * The return value is the zero-terminated substring retrieved from
125 * between the given start and end positions in the given byte buffer.
126 * Extra whitespace (and control characters) from the beginning and the
127 * end of the substring is removed.
128 *
129 * @param buffer byte buffer
130 * @param start start index of the substring
131 * @param end end index of the substring
132 * @return the identified substring
133 * @throws TikaException if the ISO-8859-1 encoding is not available
134 */
135 private static String getString(byte[] buffer, int start, int end)
136 throws TikaException {
137 // Find the zero byte that marks the end of the string
138 int zero = start;
139 while (zero < end && buffer[zero] != 0) {
140 zero++;
141 }
142
143 // Skip trailing whitespace
144 end = zero;
145 while (start < end && buffer[end - 1] <= ' ') {
146 end--;
147 }
148
149 // Skip leading whitespace
150 while (start < end && buffer[start] <= ' ') {
151 start++;
152 }
153
154 // Return the remaining substring
155 try {
156 return new String(buffer, start, end - start, "ISO-8859-1");
157 } catch (UnsupportedEncodingException e) {
158 throw new TikaException("ISO-8859-1 encoding is not available", e);
159 }
160 }
161 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.mp3;
17
18 import java.io.IOException;
19 import java.util.ArrayList;
20 import java.util.List;
21
22 import org.apache.tika.exception.TikaException;
23 import org.apache.tika.parser.mp3.ID3v2Frame.RawTag;
24 import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator;
25 import org.xml.sax.SAXException;
26
27 /**
28 * This is used to parse ID3 Version 2.2 Tag information from an MP3 file,
29 * if available.
30 *
31 * @see <a href="http://id3lib.sourceforge.net/id3/id3v2-00.txt">MP3 ID3 Version 2.2 specification</a>
32 */
33 public class ID3v22Handler implements ID3Tags {
34 private String title;
35 private String artist;
36 private String album;
37 private String year;
38 private String composer;
39 private String genre;
40 private String trackNumber;
41 private List<ID3Comment> comments = new ArrayList<ID3Comment>();
42
43 public ID3v22Handler(ID3v2Frame frame)
44 throws IOException, SAXException, TikaException {
45 RawTagIterator tags = new RawV22TagIterator(frame);
46 while (tags.hasNext()) {
47 RawTag tag = tags.next();
48 if (tag.name.equals("TT2")) {
49 title = getTagString(tag.data, 0, tag.data.length);
50 } else if (tag.name.equals("TP1")) {
51 artist = getTagString(tag.data, 0, tag.data.length);
52 } else if (tag.name.equals("TAL")) {
53 album = getTagString(tag.data, 0, tag.data.length);
54 } else if (tag.name.equals("TYE")) {
55 year = getTagString(tag.data, 0, tag.data.length);
56 } else if (tag.name.equals("TCM")) {
57 composer = getTagString(tag.data, 0, tag.data.length);
58 } else if (tag.name.equals("COM")) {
59 comments.add( getComment(tag.data, 0, tag.data.length) );
60 } else if (tag.name.equals("TRK")) {
61 trackNumber = getTagString(tag.data, 0, tag.data.length);
62 } else if (tag.name.equals("TCO")) {
63 genre = extractGenre( getTagString(tag.data, 0, tag.data.length) );
64 }
65 }
66 }
67
68 private String getTagString(byte[] data, int offset, int length) {
69 return ID3v2Frame.getTagString(data, offset, length);
70 }
71 private ID3Comment getComment(byte[] data, int offset, int length) {
72 return ID3v2Frame.getComment(data, offset, length);
73 }
74
75 protected static String extractGenre(String rawGenre) {
76 int open = rawGenre.indexOf("(");
77 int close = rawGenre.indexOf(")");
78 if (open == -1 && close == -1) {
79 return rawGenre;
80 } else if (open < close) {
81 String genreStr = rawGenre.substring(0, open).trim();
82 try {
83 int genreID = Integer.parseInt(rawGenre.substring(open+1, close));
84 return ID3Tags.GENRES[genreID];
85 } catch(ArrayIndexOutOfBoundsException invalidNum) {
86 return genreStr;
87 } catch(NumberFormatException notANum) {
88 return genreStr;
89 }
90 } else {
91 return null;
92 }
93 }
94
95 public boolean getTagsPresent() {
96 return true;
97 }
98
99 public String getTitle() {
100 return title;
101 }
102
103 public String getArtist() {
104 return artist;
105 }
106
107 public String getAlbum() {
108 return album;
109 }
110
111 public String getYear() {
112 return year;
113 }
114
115 public String getComposer() {
116 return composer;
117 }
118
119 public List<ID3Comment> getComments() {
120 return comments;
121 }
122
123 public String getGenre() {
124 return genre;
125 }
126
127 public String getTrackNumber() {
128 return trackNumber;
129 }
130
131 private class RawV22TagIterator extends RawTagIterator {
132 private RawV22TagIterator(ID3v2Frame frame) {
133 frame.super(3, 3, 1, 0);
134 }
135 }
136
137 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.mp3;
17
18 import java.io.IOException;
19 import java.util.ArrayList;
20 import java.util.List;
21
22 import org.apache.tika.exception.TikaException;
23 import org.apache.tika.parser.mp3.ID3v2Frame.RawTag;
24 import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator;
25 import org.xml.sax.SAXException;
26
27 /**
28 * This is used to parse ID3 Version 2.3 Tag information from an MP3 file,
29 * if available.
30 *
31 * @see <a href="http://id3lib.sourceforge.net/id3/id3v2.3.0.html">MP3 ID3 Version 2.3 specification</a>
32 */
33 public class ID3v23Handler implements ID3Tags {
34 private String title;
35 private String artist;
36 private String album;
37 private String year;
38 private String composer;
39 private String genre;
40 private String trackNumber;
41 private List<ID3Comment> comments = new ArrayList<ID3Comment>();
42
43 public ID3v23Handler(ID3v2Frame frame)
44 throws IOException, SAXException, TikaException {
45 RawTagIterator tags = new RawV23TagIterator(frame);
46 while (tags.hasNext()) {
47 RawTag tag = tags.next();
48 if (tag.name.equals("TIT2")) {
49 title = getTagString(tag.data, 0, tag.data.length);
50 } else if (tag.name.equals("TPE1")) {
51 artist = getTagString(tag.data, 0, tag.data.length);
52 } else if (tag.name.equals("TALB")) {
53 album = getTagString(tag.data, 0, tag.data.length);
54 } else if (tag.name.equals("TYER")) {
55 year = getTagString(tag.data, 0, tag.data.length);
56 } else if (tag.name.equals("TCOM")) {
57 composer = getTagString(tag.data, 0, tag.data.length);
58 } else if (tag.name.equals("COMM")) {
59 comments.add( getComment(tag.data, 0, tag.data.length) );
60 } else if (tag.name.equals("TRCK")) {
61 trackNumber = getTagString(tag.data, 0, tag.data.length);
62 } else if (tag.name.equals("TCON")) {
63 genre = ID3v22Handler.extractGenre( getTagString(tag.data, 0, tag.data.length) );
64 }
65 }
66 }
67
68 private String getTagString(byte[] data, int offset, int length) {
69 return ID3v2Frame.getTagString(data, offset, length);
70 }
71 private ID3Comment getComment(byte[] data, int offset, int length) {
72 return ID3v2Frame.getComment(data, offset, length);
73 }
74
75 public boolean getTagsPresent() {
76 return true;
77 }
78
79 public String getTitle() {
80 return title;
81 }
82
83 public String getArtist() {
84 return artist;
85 }
86
87 public String getAlbum() {
88 return album;
89 }
90
91 public String getYear() {
92 return year;
93 }
94
95 public String getComposer() {
96 return composer;
97 }
98
99 public List<ID3Comment> getComments() {
100 return comments;
101 }
102
103 public String getGenre() {
104 return genre;
105 }
106
107 public String getTrackNumber() {
108 return trackNumber;
109 }
110
111 private class RawV23TagIterator extends RawTagIterator {
112 private RawV23TagIterator(ID3v2Frame frame) {
113 frame.super(4, 4, 1, 2);
114 }
115 }
116
117 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.mp3;
17
18 import java.io.IOException;
19 import java.util.ArrayList;
20 import java.util.List;
21
22 import org.apache.tika.exception.TikaException;
23 import org.apache.tika.parser.mp3.ID3v2Frame.RawTag;
24 import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator;
25 import org.xml.sax.SAXException;
26
27 /**
28 * This is used to parse ID3 Version 2.4 Tag information from an MP3 file,
29 * if available.
30 *
31 * @see <a href="http://www.id3.org/id3v2.4.0-structure">MP3 ID3 Version 2.4 specification</a>
32 * @see <a href="http://www.id3.org/id3v2.4.0-frames">MP3 ID3 Version 2.4 frames/tags</a>
33 */
34 public class ID3v24Handler implements ID3Tags {
35 private String title;
36 private String artist;
37 private String album;
38 private String year;
39 private String composer;
40 private String genre;
41 private String trackNumber;
42 private List<ID3Comment> comments = new ArrayList<ID3Comment>();
43
44 public ID3v24Handler(ID3v2Frame frame)
45 throws IOException, SAXException, TikaException {
46 RawTagIterator tags = new RawV24TagIterator(frame);
47 while (tags.hasNext()) {
48 RawTag tag = tags.next();
49 if (tag.name.equals("TIT2")) {
50 title = getTagString(tag.data, 0, tag.data.length);
51 } else if (tag.name.equals("TPE1")) {
52 artist = getTagString(tag.data, 0, tag.data.length);
53 } else if (tag.name.equals("TALB")) {
54 album = getTagString(tag.data, 0, tag.data.length);
55 } else if (tag.name.equals("TYER")) {
56 year = getTagString(tag.data, 0, tag.data.length);
57 } else if (tag.name.equals("TDRC")) {
58 if(year == null) {
59 year = getTagString(tag.data, 0, tag.data.length);
60 }
61 } else if (tag.name.equals("TCOM")) {
62 composer = getTagString(tag.data, 0, tag.data.length);
63 } else if (tag.name.equals("COMM")) {
64 comments.add( getComment(tag.data, 0, tag.data.length) );
65 } else if (tag.name.equals("TRCK")) {
66 trackNumber = getTagString(tag.data, 0, tag.data.length);
67 } else if (tag.name.equals("TCON")) {
68 genre = ID3v22Handler.extractGenre( getTagString(tag.data, 0, tag.data.length) );
69 }
70 }
71 }
72
73 private String getTagString(byte[] data, int offset, int length) {
74 return ID3v2Frame.getTagString(data, offset, length);
75 }
76 private ID3Comment getComment(byte[] data, int offset, int length) {
77 return ID3v2Frame.getComment(data, offset, length);
78 }
79
80 public boolean getTagsPresent() {
81 return true;
82 }
83
84 public String getTitle() {
85 return title;
86 }
87
88 public String getArtist() {
89 return artist;
90 }
91
92 public String getAlbum() {
93 return album;
94 }
95
96 public String getYear() {
97 return year;
98 }
99
100 public String getComposer() {
101 return composer;
102 }
103
104 public List<ID3Comment> getComments() {
105 return comments;
106 }
107
108 public String getGenre() {
109 return genre;
110 }
111
112 public String getTrackNumber() {
113 return trackNumber;
114 }
115
116 private class RawV24TagIterator extends RawTagIterator {
117 private RawV24TagIterator(ID3v2Frame frame) {
118 frame.super(4, 4, 1, 2);
119 }
120 }
121
122 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.mp3;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.io.PushbackInputStream;
21 import java.io.UnsupportedEncodingException;
22 import java.util.Iterator;
23
24 import org.apache.tika.parser.mp3.ID3Tags.ID3Comment;
25
26 /**
27 * A frame of ID3v2 data, which is then passed to a handler to
28 * be turned into useful data.
29 */
30 public class ID3v2Frame implements MP3Frame {
31 private int majorVersion;
32 private int minorVersion;
33 private int flags;
34 private int length;
35 /** Excludes the header size part */
36 private byte[] extendedHeader;
37 private byte[] data;
38
39 public int getMajorVersion() {
40 return majorVersion;
41 }
42
43 public int getMinorVersion() {
44 return minorVersion;
45 }
46
47 public int getFlags() {
48 return flags;
49 }
50
51 public int getLength() {
52 return length;
53 }
54
55 public byte[] getExtendedHeader() {
56 return extendedHeader;
57 }
58
59 public byte[] getData() {
60 return data;
61 }
62
63 /**
64 * Returns the next ID3v2 Frame in
65 * the file, or null if the next batch of data
66 * doesn't correspond to either an ID3v2 header.
67 * If no ID3v2 frame could be detected and the passed in input stream is a
68 * {@code PushbackInputStream}, the bytes read so far are pushed back so
69 * that they can be read again.
70 * ID3v2 Frames should come before all Audio ones.
71 */
72 public static MP3Frame createFrameIfPresent(InputStream inp)
73 throws IOException {
74 int h1 = inp.read();
75 int h2 = inp.read();
76 int h3 = inp.read();
77
78 // Is it an ID3v2 Frame?
79 if (h1 == (int)'I' && h2 == (int)'D' && h3 == (int)'3') {
80 int majorVersion = inp.read();
81 int minorVersion = inp.read();
82 if (majorVersion == -1 || minorVersion == -1) {
83 pushBack(inp, h1, h2, h3, majorVersion, minorVersion);
84 return null;
85 }
86 return new ID3v2Frame(majorVersion, minorVersion, inp);
87 }
88
89 // Not a frame header
90 pushBack(inp, h1, h2, h3);
91 return null;
92 }
93
94 /**
95 * Pushes bytes back into the stream if possible. This method is called if
96 * no ID3v2 header could be found at the current stream position.
97 *
98 * @param inp the input stream
99 * @param bytes the bytes to be pushed back
100 * @throws IOException if an error occurs
101 */
102 private static void pushBack(InputStream inp, int... bytes)
103 throws IOException
104 {
105 if (inp instanceof PushbackInputStream)
106 {
107 byte[] buf = new byte[bytes.length];
108 for (int i = 0; i < bytes.length; i++)
109 {
110 buf[i] = (byte) bytes[i];
111 }
112 ((PushbackInputStream) inp).unread(buf);
113 }
114 }
115
116 private ID3v2Frame(int majorVersion, int minorVersion, InputStream inp)
117 throws IOException {
118 this.majorVersion = majorVersion;
119 this.minorVersion = minorVersion;
120
121 // Get the flags and the length
122 flags = inp.read();
123 length = get7BitsInt(readFully(inp, 4), 0);
124
125 // Do we have an extended header?
126 if ((flags & 0x02) == 0x02) {
127 int size = getInt(readFully(inp, 4));
128 extendedHeader = readFully(inp, size);
129 }
130
131 // Get the frame's data, or at least as much
132 // of it as we could do
133 data = readFully(inp, length, false);
134 }
135
136 protected static int getInt(byte[] data) {
137 return getInt(data, 0);
138 }
139
140 protected static int getInt(byte[] data, int offset) {
141 int b0 = data[offset+0] & 0xFF;
142 int b1 = data[offset+1] & 0xFF;
143 int b2 = data[offset+2] & 0xFF;
144 int b3 = data[offset+3] & 0xFF;
145 return (b0 << 24) + (b1 << 16) + (b2 << 8) + (b3 << 0);
146 }
147
148 protected static int getInt3(byte[] data, int offset) {
149 int b0 = data[offset+0] & 0xFF;
150 int b1 = data[offset+1] & 0xFF;
151 int b2 = data[offset+2] & 0xFF;
152 return (b0 << 16) + (b1 << 8) + (b2 << 0);
153 }
154
155 protected static int getInt2(byte[] data, int offset) {
156 int b0 = data[offset+0] & 0xFF;
157 int b1 = data[offset+1] & 0xFF;
158 return (b0 << 8) + (b1 << 0);
159 }
160
161 /**
162 * AKA a Synchsafe integer.
163 * 4 bytes hold a 28 bit number. The highest
164 * bit in each byte is always 0 and always ignored.
165 */
166 protected static int get7BitsInt(byte[] data, int offset) {
167 int b0 = data[offset+0] & 0x7F;
168 int b1 = data[offset+1] & 0x7F;
169 int b2 = data[offset+2] & 0x7F;
170 int b3 = data[offset+3] & 0x7F;
171 return (b0 << 21) + (b1 << 14) + (b2 << 7) + (b3 << 0);
172 }
173
174 protected static byte[] readFully(InputStream inp, int length)
175 throws IOException {
176 return readFully(inp, length, true);
177 }
178 protected static byte[] readFully(InputStream inp, int length, boolean shortDataIsFatal)
179 throws IOException {
180 byte[] b = new byte[length];
181
182 int pos = 0;
183 int read;
184 while (pos < length) {
185 read = inp.read(b, pos, length-pos);
186 if (read == -1) {
187 if(shortDataIsFatal) {
188 throw new IOException("Tried to read " + length + " bytes, but only " + pos + " bytes present");
189 } else {
190 // Give them what we found
191 // TODO Log the short read
192 return b;
193 }
194 }
195 pos += read;
196 }
197
198 return b;
199 }
200
201 protected static class TextEncoding {
202 public final boolean doubleByte;
203 public final String encoding;
204 private TextEncoding(String encoding, boolean doubleByte) {
205 this.doubleByte = doubleByte;
206 this.encoding = encoding;
207 }
208 }
209 protected static final TextEncoding[] encodings = new TextEncoding[] {
210 new TextEncoding("ISO-8859-1", false),
211 new TextEncoding("UTF-16", true), // With BOM
212 new TextEncoding("UTF-16BE", true), // Without BOM
213 new TextEncoding("UTF-8", false)
214 };
215
216 /**
217 * Returns the (possibly null padded) String at the given offset and
218 * length. String encoding is held in the first byte;
219 */
220 protected static String getTagString(byte[] data, int offset, int length) {
221 int actualLength = length;
222 if (actualLength == 0) {
223 return "";
224 }
225 if (actualLength == 1 && data[offset] == 0) {
226 return "";
227 }
228
229 // Does it have an encoding flag?
230 // Detect by the first byte being sub 0x20
231 TextEncoding encoding = encodings[0];
232 byte maybeEncodingFlag = data[offset];
233 if (maybeEncodingFlag >= 0 && maybeEncodingFlag < encodings.length) {
234 offset++;
235 actualLength--;
236 encoding = encodings[maybeEncodingFlag];
237 }
238
239 // Trim off null termination / padding (as present)
240 while (encoding.doubleByte && actualLength >= 2 && data[offset+actualLength-1] == 0 && data[offset+actualLength-2] == 0) {
241 actualLength -= 2;
242 }
243 while (!encoding.doubleByte && actualLength >= 1 && data[offset+actualLength-1] == 0) {
244 actualLength--;
245 }
246 if (actualLength == 0) {
247 return "";
248 }
249
250 // TIKA-1024: If it's UTF-16 (with BOM) and all we
251 // have is a naked BOM then short-circuit here
252 // (return empty string), because new String(..)
253 // gives different results on different JVMs
254 if (encoding.encoding.equals("UTF-16") && actualLength == 2 &&
255 ((data[offset] == (byte) 0xff && data[offset+1] == (byte) 0xfe) ||
256 (data[offset] == (byte) 0xfe && data[offset+1] == (byte) 0xff))) {
257 return "";
258 }
259
260 try {
261 // Build the base string
262 return new String(data, offset, actualLength, encoding.encoding);
263 } catch (UnsupportedEncodingException e) {
264 throw new RuntimeException(
265 "Core encoding " + encoding.encoding + " is not available", e);
266 }
267 }
268 /**
269 * Builds up the ID3 comment, by parsing and extracting
270 * the comment string parts from the given data.
271 */
272 protected static ID3Comment getComment(byte[] data, int offset, int length) {
273 // Comments must have an encoding
274 int encodingFlag = data[offset];
275 if (encodingFlag >= 0 && encodingFlag < encodings.length) {
276 // Good, valid flag
277 } else {
278 // Invalid string
279 return null;
280 }
281
282 TextEncoding encoding = encodings[encodingFlag];
283
284 // First is a 3 byte language
285 String lang = getString(data, offset+1, 3);
286
287 // After that we have [Desc]\0(\0)[Text]
288 int descStart = offset+4;
289 int textStart = -1;
290 String description = null;
291 String text = null;
292
293 // Find where the description ends
294 try {
295 for (int i=descStart; i<offset+length; i++) {
296 if (encoding.doubleByte && data[i]==0 && data[i+1] == 0) {
297 // Handle LE vs BE on low byte text
298 if (i+2 < offset+length && data[i+1] == 0 && data[i+2] == 0) {
299 i++;
300 }
301 textStart = i+2;
302 description = new String(data, descStart, i-descStart, encoding.encoding);
303 break;
304 }
305 if (!encoding.doubleByte && data[i]==0) {
306 textStart = i+1;
307 description = new String(data, descStart, i-descStart, encoding.encoding);
308 break;
309 }
310 }
311
312 // Did we find the end?
313 if (textStart > -1) {
314 text = new String(data, textStart, offset+length-textStart, encoding.encoding);
315 } else {
316 // Assume everything is the text
317 text = new String(data, descStart, offset+length-descStart, encoding.encoding);
318 }
319
320 // Return
321 return new ID3Comment(lang, description, text);
322 } catch (UnsupportedEncodingException e) {
323 throw new RuntimeException(
324 "Core encoding " + encoding.encoding + " is not available", e);
325 }
326 }
327
328 /**
329 * Returns the String at the given
330 * offset and length. Strings are ISO-8859-1
331 */
332 protected static String getString(byte[] data, int offset, int length) {
333 try {
334 return new String(data, offset, length, "ISO-8859-1");
335 } catch (UnsupportedEncodingException e) {
336 throw new RuntimeException(
337 "Core encoding ISO-8859-1 encoding is not available", e);
338 }
339 }
340
341
342 /**
343 * Iterates over id3v2 raw tags.
344 * Create an instance of this that configures the
345 * various length and multipliers.
346 */
347 protected class RawTagIterator implements Iterator<RawTag> {
348 private int nameLength;
349 private int sizeLength;
350 private int sizeMultiplier;
351 private int flagLength;
352
353 private int offset = 0;
354
355 protected RawTagIterator(
356 int nameLength, int sizeLength, int sizeMultiplier,
357 int flagLength) {
358 this.nameLength = nameLength;
359 this.sizeLength = sizeLength;
360 this.sizeMultiplier = sizeMultiplier;
361 this.flagLength = flagLength;
362 }
363
364 public boolean hasNext() {
365 // Check for padding at the end
366 return offset < data.length && data[offset] != 0;
367 }
368
369 public RawTag next() {
370 RawTag tag = new RawTag(nameLength, sizeLength, sizeMultiplier,
371 flagLength, data, offset);
372 offset += tag.getSize();
373 return tag;
374 }
375
376 public void remove() {
377 }
378
379 }
380
381 protected static class RawTag {
382 private int headerSize;
383 protected String name;
384 protected int flag;
385 protected byte[] data;
386
387 private RawTag(
388 int nameLength, int sizeLength, int sizeMultiplier,
389 int flagLength, byte[] frameData, int offset) {
390 headerSize = nameLength + sizeLength + flagLength;
391
392 // Name, normally 3 or 4 bytes
393 name = getString(frameData, offset, nameLength);
394
395 // Size
396 int rawSize;
397 if (sizeLength == 3) {
398 rawSize = getInt3(frameData, offset+nameLength);
399 } else {
400 rawSize = getInt(frameData, offset+nameLength);
401 }
402 int size = rawSize * sizeMultiplier;
403
404 // Flag
405 if (flagLength > 0) {
406 if (flagLength == 1) {
407 flag = (int)frameData[offset+nameLength+sizeLength];
408 } else {
409 flag = getInt2(frameData, offset+nameLength+sizeLength);
410 }
411 }
412
413 // Now data
414 int copyFrom = offset+nameLength+sizeLength+flagLength;
415 size = Math.min(size, frameData.length-copyFrom);
416 data = new byte[size];
417 System.arraycopy(frameData, copyFrom, data, 0, size);
418 }
419
420 protected int getSize() {
421 return headerSize + data.length;
422 }
423
424 }
425
426 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.mp3;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20
21 import org.apache.tika.exception.TikaException;
22 import org.xml.sax.ContentHandler;
23 import org.xml.sax.SAXException;
24
25 /**
26 * This is used to parse Lyrics3 tag information
27 * from an MP3 file, if available.
28 * Handles lyrics tags of up to 10kb in size.
29 * Will process any ID3v1 tag data if present.
30 * Ignores extended ID3v1 data in the lyrics block
31 *
32 * @see <a href="http://www.id3.org/Lyrics3v2">Lyrics3 v2.0 specification</a>
33 */
34 public class LyricsHandler {
35 boolean foundLyrics = false;
36 String lyricsText = null;
37 ID3v1Handler id3v1 = null;
38
39 public LyricsHandler(InputStream stream, ContentHandler handler)
40 throws IOException, SAXException, TikaException {
41 this(getSuffix(stream, 10240+128));
42 }
43
44 /**
45 * Looks for the Lyrics data, which will be
46 * just before the ID3v1 data (if present),
47 * and process it.
48 * Also sets things up for the ID3v1
49 * processing if required.
50 * Creates from the last 128 bytes of a stream.
51 */
52 protected LyricsHandler(byte[] tagData)
53 throws IOException, SAXException, TikaException {
54 if(tagData.length < 128) {
55 return;
56 }
57
58 // Is there ID3v1 data?
59 byte[] last128 = new byte[128];
60 System.arraycopy(tagData, tagData.length-128, last128, 0, 128);
61 id3v1 = new ID3v1Handler(last128);
62
63 if(tagData.length < 137) {
64 return;
65 }
66
67 // Are there lyrics? Look for the closing Lyrics tag
68 // at the end to decide if there is any
69 int lookat = tagData.length - 9;
70 if(id3v1.found) {
71 lookat -= 128;
72 }
73 if(tagData[lookat+0] == 'L' && tagData[lookat+1] == 'Y' &&
74 tagData[lookat+2] == 'R' && tagData[lookat+3] == 'I' &&
75 tagData[lookat+4] == 'C' && tagData[lookat+5] == 'S' &&
76 tagData[lookat+6] == '2' && tagData[lookat+7] == '0' &&
77 tagData[lookat+8] == '0') {
78 foundLyrics = true;
79
80 // The length (6 bytes) comes just before LYRICS200, and is the
81 // size including the LYRICSBEGIN but excluding the
82 // length+LYRICS200 at the end.
83 int length = Integer.parseInt(
84 new String(tagData, lookat-6, 6)
85 );
86
87 String lyrics = new String(
88 tagData, lookat-length+5, length-11,
89 "ASCII"
90 );
91
92 // Tags are a 3 letter code, 5 digit length, then data
93 int pos = 0;
94 while(pos < lyrics.length()-8) {
95 String tagName = lyrics.substring(pos, pos+3);
96 int tagLen = Integer.parseInt(
97 lyrics.substring(pos+3, pos+8)
98 );
99 int startPos = pos + 8;
100 int endPos = startPos + tagLen;
101
102 if(tagName.equals("LYR")) {
103 lyricsText = lyrics.substring(startPos, endPos);
104 }
105
106 pos = endPos;
107 }
108 }
109 }
110
111 public boolean hasID3v1() {
112 if(id3v1 == null || id3v1.found == false) {
113 return false;
114 }
115 return true;
116 }
117 public boolean hasLyrics() {
118 return lyricsText != null && lyricsText.length() > 0;
119 }
120
121 /**
122 * Reads and returns the last <code>length</code> bytes from the
123 * given stream.
124 * @param stream input stream
125 * @param length number of bytes from the end to read and return
126 * @return stream the <code>InputStream</code> to read from.
127 * @throws IOException if the stream could not be read from.
128 */
129 protected static byte[] getSuffix(InputStream stream, int length)
130 throws IOException {
131 byte[] buffer = new byte[2 * length];
132 int bytesInBuffer = 0;
133
134 int n = stream.read(buffer);
135 while (n != -1) {
136 bytesInBuffer += n;
137 if (bytesInBuffer == buffer.length) {
138 System.arraycopy(buffer, bytesInBuffer - length, buffer, 0, length);
139 bytesInBuffer = length;
140 }
141 n = stream.read(buffer, bytesInBuffer, buffer.length - bytesInBuffer);
142 }
143
144 if (bytesInBuffer < length) {
145 length = bytesInBuffer;
146 }
147
148 byte[] result = new byte[length];
149 System.arraycopy(buffer, bytesInBuffer - length, result, 0, length);
150 return result;
151 }
152 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.mp3;
17
18
/**
 * Marker type for one frame of an MP3 file. A frame may for example
 *  hold ID3v2 Tags, or some audio. The interface declares no methods.
 */
public interface MP3Frame {}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.mp3;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.util.ArrayList;
21 import java.util.Collections;
22 import java.util.List;
23 import java.util.Set;
24
25 import org.apache.tika.exception.TikaException;
26 import org.apache.tika.io.TailStream;
27 import org.apache.tika.metadata.Metadata;
28 import org.apache.tika.metadata.TikaCoreProperties;
29 import org.apache.tika.metadata.XMPDM;
30 import org.apache.tika.mime.MediaType;
31 import org.apache.tika.parser.AbstractParser;
32 import org.apache.tika.parser.ParseContext;
33 import org.apache.tika.parser.mp3.ID3Tags.ID3Comment;
34 import org.apache.tika.sax.XHTMLContentHandler;
35 import org.xml.sax.ContentHandler;
36 import org.xml.sax.SAXException;
37
38 /**
39 * The <code>Mp3Parser</code> is used to parse ID3 Version 1 Tag information
40 * from an MP3 file, if available.
41 *
42 * @see <a href="http://www.id3.org/ID3v1">MP3 ID3 Version 1 specification</a>
43 * @see <a href="http://www.id3.org/id3v2.4.0-structure">MP3 ID3 Version 2.4 Structure Specification</a>
44 * @see <a href="http://www.id3.org/id3v2.4.0-frames">MP3 ID3 Version 2.4 Frames Specification</a>
45 */
46 public class Mp3Parser extends AbstractParser {
47
48 /** Serial version UID */
49 private static final long serialVersionUID = 8537074922934844370L;
50
51 private static final Set<MediaType> SUPPORTED_TYPES =
52 Collections.singleton(MediaType.audio("mpeg"));
53
54 public Set<MediaType> getSupportedTypes(ParseContext context) {
55 return SUPPORTED_TYPES;
56 }
57
58
59 public void parse(
60 InputStream stream, ContentHandler handler,
61 Metadata metadata, ParseContext context)
62 throws IOException, SAXException, TikaException {
63 metadata.set(Metadata.CONTENT_TYPE, "audio/mpeg");
64 metadata.set(XMPDM.AUDIO_COMPRESSOR, "MP3");
65
66 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
67 xhtml.startDocument();
68
69 // Create handlers for the various kinds of ID3 tags
70 ID3TagsAndAudio audioAndTags = getAllTagHandlers(stream, handler);
71
72 if (audioAndTags.tags.length > 0) {
73 CompositeTagHandler tag = new CompositeTagHandler(audioAndTags.tags);
74
75 metadata.set(TikaCoreProperties.TITLE, tag.getTitle());
76 metadata.set(TikaCoreProperties.CREATOR, tag.getArtist());
77 metadata.set(XMPDM.ARTIST, tag.getArtist());
78 metadata.set(XMPDM.COMPOSER, tag.getComposer());
79 metadata.set(XMPDM.ALBUM, tag.getAlbum());
80 metadata.set(XMPDM.RELEASE_DATE, tag.getYear());
81 metadata.set(XMPDM.GENRE, tag.getGenre());
82 metadata.set(XMPDM.DURATION, audioAndTags.duration);
83
84 List<String> comments = new ArrayList<String>();
85 for (ID3Comment comment : tag.getComments()) {
86 StringBuffer cmt = new StringBuffer();
87 if (comment.getLanguage() != null) {
88 cmt.append(comment.getLanguage());
89 cmt.append(" - ");
90 }
91 if (comment.getDescription() != null) {
92 cmt.append(comment.getDescription());
93 if (comment.getText() != null) {
94 cmt.append("\n");
95 }
96 }
97 if (comment.getText() != null) {
98 cmt.append(comment.getText());
99 }
100
101 comments.add(cmt.toString());
102 metadata.add(XMPDM.LOG_COMMENT.getName(), cmt.toString());
103 }
104
105 xhtml.element("h1", tag.getTitle());
106 xhtml.element("p", tag.getArtist());
107
108 // ID3v1.1 Track addition
109 if (tag.getTrackNumber() != null) {
110 xhtml.element("p", tag.getAlbum() + ", track " + tag.getTrackNumber());
111 metadata.set(XMPDM.TRACK_NUMBER, tag.getTrackNumber());
112 } else {
113 xhtml.element("p", tag.getAlbum());
114 }
115 xhtml.element("p", tag.getYear());
116 xhtml.element("p", tag.getGenre());
117 xhtml.element("p", String.valueOf(audioAndTags.duration));
118 for (String comment : comments) {
119 xhtml.element("p", comment);
120 }
121 }
122 if (audioAndTags.audio != null) {
123 metadata.set("samplerate", String.valueOf(audioAndTags.audio.getSampleRate()));
124 metadata.set("channels", String.valueOf(audioAndTags.audio.getChannels()));
125 metadata.set("version", audioAndTags.audio.getVersion());
126
127 metadata.set(
128 XMPDM.AUDIO_SAMPLE_RATE,
129 Integer.toString(audioAndTags.audio.getSampleRate()));
130 if(audioAndTags.audio.getChannels() == 1) {
131 metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "Mono");
132 } else if(audioAndTags.audio.getChannels() == 2) {
133 metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "Stereo");
134 } else if(audioAndTags.audio.getChannels() == 5) {
135 metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "5.1");
136 } else if(audioAndTags.audio.getChannels() == 7) {
137 metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "7.1");
138 }
139 }
140 if (audioAndTags.lyrics != null && audioAndTags.lyrics.hasLyrics()) {
141 xhtml.startElement("p", "class", "lyrics");
142 xhtml.characters(audioAndTags.lyrics.lyricsText);
143 xhtml.endElement("p");
144 }
145
146 xhtml.endDocument();
147 }
148
149 /**
150 * Scans the MP3 frames for ID3 tags, and creates ID3Tag Handlers
151 * for each supported set of tags.
152 */
153 protected static ID3TagsAndAudio getAllTagHandlers(InputStream stream, ContentHandler handler)
154 throws IOException, SAXException, TikaException {
155 ID3v24Handler v24 = null;
156 ID3v23Handler v23 = null;
157 ID3v22Handler v22 = null;
158 ID3v1Handler v1 = null;
159 LyricsHandler lyrics = null;
160 AudioFrame firstAudio = null;
161
162 TailStream tailStream = new TailStream(stream, 10240+128);
163 MpegStream mpegStream = new MpegStream(tailStream);
164
165 // ID3v2 tags live at the start of the file
166 // You can apparently have several different ID3 tag blocks
167 // So, keep going until we don't find any more
168 MP3Frame f;
169 while ((f = ID3v2Frame.createFrameIfPresent(mpegStream)) != null) {
170 if(f instanceof ID3v2Frame) {
171 ID3v2Frame id3F = (ID3v2Frame)f;
172 if (id3F.getMajorVersion() == 4) {
173 v24 = new ID3v24Handler(id3F);
174 } else if(id3F.getMajorVersion() == 3) {
175 v23 = new ID3v23Handler(id3F);
176 } else if(id3F.getMajorVersion() == 2) {
177 v22 = new ID3v22Handler(id3F);
178 }
179 }
180 }
181
182 // Now iterate over all audio frames in the file
183 AudioFrame frame = mpegStream.nextFrame();
184 float duration = 0;
185 while (frame != null)
186 {
187 duration += frame.getDuration();
188 if (firstAudio == null)
189 {
190 firstAudio = frame;
191 }
192 mpegStream.skipFrame();
193 frame = mpegStream.nextFrame();
194 }
195
196 // ID3v1 tags live at the end of the file
197 // Lyrics live just before ID3v1, at the end of the file
198 // Search for both (handlers seek to the end for us)
199 lyrics = new LyricsHandler(tailStream.getTail());
200 v1 = lyrics.id3v1;
201
202 // Go in order of preference
203 // Currently, that's newest to oldest
204 List<ID3Tags> tags = new ArrayList<ID3Tags>();
205
206 if(v24 != null && v24.getTagsPresent()) {
207 tags.add(v24);
208 }
209 if(v23 != null && v23.getTagsPresent()) {
210 tags.add(v23);
211 }
212 if(v22 != null && v22.getTagsPresent()) {
213 tags.add(v22);
214 }
215 if(v1 != null && v1.getTagsPresent()) {
216 tags.add(v1);
217 }
218
219 ID3TagsAndAudio ret = new ID3TagsAndAudio();
220 ret.audio = firstAudio;
221 ret.lyrics = lyrics;
222 ret.tags = tags.toArray(new ID3Tags[tags.size()]);
223 ret.duration = duration;
224 return ret;
225 }
226
227 protected static class ID3TagsAndAudio {
228 private ID3Tags[] tags;
229 private AudioFrame audio;
230 private LyricsHandler lyrics;
231 private float duration;
232 }
233
234 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.mp3;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.io.PushbackInputStream;
21
22 /**
23 * <p>
24 * A specialized stream class which can be used to extract single frames of MPEG
25 * audio files.
26 * </p>
27 * <p>
28 * Instances of this class are constructed with an underlying stream which
29 * should point to an audio file. Read operations are possible in the usual way.
30 * However, there are special methods for searching and extracting headers of
31 * MPEG frames. Some meta information of frames can be queried.
32 * </p>
33 */
34 class MpegStream extends PushbackInputStream
35 {
36 /** Bit rate table for MPEG V1, layer 1. */
37 private static final int[] BIT_RATE_MPEG1_L1 = {
38 0, 32000, 64000, 96000, 128000, 160000, 192000, 224000, 256000,
39 288000, 320000, 352000, 384000, 416000, 448000
40 };
41
42 /** Bit rate table for MPEG V1, layer 2. */
43 private static final int[] BIT_RATE_MPEG1_L2 = {
44 0, 32000, 48000, 56000, 64000, 80000, 96000, 112000, 128000,
45 160000, 192000, 224000, 256000, 320000, 384000
46 };
47
48 /** Bit rate table for MPEG V1, layer 3. */
49 private static final int[] BIT_RATE_MPEG1_L3 = {
50 0, 32000, 40000, 48000, 56000, 64000, 80000, 96000, 112000, 128000,
51 160000, 192000, 224000, 256000, 320000
52 };
53
54 /** Bit rate table for MPEG V2/V2.5, layer 1. */
55 private static final int[] BIT_RATE_MPEG2_L1 = {
56 0, 32000, 48000, 56000, 64000, 80000, 96000, 112000, 128000,
57 144000, 160000, 176000, 192000, 224000, 256000
58 };
59
60 /** Bit rate table for MPEG V2/V2.5, layer 2 and 3. */
61 private static final int[] BIT_RATE_MPEG2_L2 = {
62 0, 8000, 16000, 24000, 32000, 40000, 48000, 56000, 64000, 80000,
63 96000, 112000, 128000, 144000, 160000
64 };
65
66 /** Sample rate table for MPEG V1. */
67 private static final int[] SAMPLE_RATE_MPEG1 = {
68 44100, 48000, 32000
69 };
70
71 /** Sample rate table for MPEG V2. */
72 private static final int[] SAMPLE_RATE_MPEG2 = {
73 22050, 24000, 16000
74 };
75
76 /** Sample rate table for MPEG V2.5. */
77 private static final int[] SAMPLE_RATE_MPEG2_5 = {
78 11025, 12000, 8000
79 };
80
81 /** Sample rate table for all MPEG versions. */
82 private static final int[][] SAMPLE_RATE = createSampleRateTable();
83
84 /** Constant for the number of samples for a layer 1 frame. */
85 private static final int SAMPLE_COUNT_L1 = 384;
86
87 /** Constant for the number of samples for a layer 2 or 3 frame. */
88 private static final int SAMPLE_COUNT_L2 = 1152;
89
90 /** Constant for the size of an MPEG frame header in bytes. */
91 private static final int HEADER_SIZE = 4;
92
93 /** The current MPEG header. */
94 private AudioFrame currentHeader;
95
96 /** A flag whether the end of the stream is reached. */
97 private boolean endOfStream;
98
99 /**
100 * Creates a new instance of {@code MpegStream} and initializes it with the
101 * underlying stream.
102 *
103 * @param in the underlying audio stream
104 */
105 public MpegStream(InputStream in)
106 {
107 super(in, 2 * HEADER_SIZE);
108 }
109
110 /**
111 * Searches for the next MPEG frame header from the current stream position
112 * on. This method advances the underlying input stream until it finds a
113 * valid frame header or the end of the stream is reached. In the former
114 * case a corresponding {@code AudioFrame} object is created. In the latter
115 * case there are no more headers, so the end of the stream is probably
116 * reached.
117 *
118 * @return the next {@code AudioFrame} or <b>null</b>
119 * @throws IOException if an IO error occurs
120 */
121 public AudioFrame nextFrame() throws IOException
122 {
123 AudioFrame frame = null;
124 while (!endOfStream && frame == null)
125 {
126 findFrameSyncByte();
127 if (!endOfStream)
128 {
129 HeaderBitField headerField = createHeaderField();
130 if (!endOfStream)
131 {
132 frame = createHeader(headerField);
133 if (frame == null)
134 {
135 pushBack(headerField);
136 }
137 }
138 }
139 }
140
141 currentHeader = frame;
142 return frame;
143 }
144
145 /**
146 * Skips the current MPEG frame. This method can be called after a valid
147 * MPEG header has been retrieved using {@code nextFrame()}. In this case
148 * the underlying stream is advanced to the end of the associated MPEG
149 * frame. Otherwise, this method has no effect. The return value indicates
150 * whether a frame could be skipped.
151 *
152 * @return <b>true</b> if a frame could be skipped, <b>false</b> otherwise
153 * @throws IOException if an IO error occurs
154 */
155 public boolean skipFrame() throws IOException
156 {
157 if (currentHeader != null)
158 {
159 skipStream(in, currentHeader.getLength() - HEADER_SIZE);
160 currentHeader = null;
161 return true;
162 }
163 return false;
164 }
165
166 /**
167 * Advances the underlying stream until the first byte of frame sync is
168 * found.
169 *
170 * @throws IOException if an error occurs
171 */
172 private void findFrameSyncByte() throws IOException
173 {
174 boolean found = false;
175 while (!found && !endOfStream)
176 {
177 if (nextByte() == 0xFF)
178 {
179 found = true;
180 }
181 }
182 }
183
184 /**
185 * Creates a bit field for the MPEG frame header.
186 *
187 * @return the bit field
188 * @throws IOException if an error occurs
189 */
190 private HeaderBitField createHeaderField() throws IOException
191 {
192 HeaderBitField field = new HeaderBitField();
193 field.add(nextByte());
194 field.add(nextByte());
195 field.add(nextByte());
196 return field;
197 }
198
199 /**
200 * Creates an {@code AudioFrame} object based on the given header field. If
201 * the header field contains invalid values, result is <b>null</b>.
202 *
203 * @param bits the header bit field
204 * @return the {@code AudioFrame}
205 */
206 private AudioFrame createHeader(HeaderBitField bits)
207 {
208 if (bits.get(21, 23) != 7)
209 {
210 return null;
211 }
212
213 int mpegVer = bits.get(19, 20);
214 int layer = bits.get(17, 18);
215 int bitRateCode = bits.get(12, 15);
216 int sampleRateCode = bits.get(10, 11);
217 int padding = bits.get(9);
218
219 if (mpegVer == 1 || layer == 0 || bitRateCode == 0 || bitRateCode == 15
220 || sampleRateCode == 3)
221 {
222 // invalid header values
223 return null;
224 }
225
226 int bitRate = calculateBitRate(mpegVer, layer, bitRateCode);
227 int sampleRate = calculateSampleRate(mpegVer, sampleRateCode);
228 int length = calculateFrameLength(layer, bitRate, sampleRate, padding);
229 float duration = calculateDuration(layer, sampleRate);
230 int channels = calculateChannels(bits.get(6, 7));
231 return new AudioFrame(mpegVer, layer, bitRate, sampleRate, channels,
232 length, duration);
233 }
234
235 /**
236 * Reads the next byte.
237 *
238 * @return the next byte
239 * @throws IOException if an error occurs
240 */
241 private int nextByte() throws IOException
242 {
243 int result = 0;
244 if (!endOfStream)
245 {
246 result = read();
247 if (result == -1)
248 {
249 endOfStream = true;
250 }
251 }
252 return endOfStream ? 0 : result;
253 }
254
255 /**
256 * Pushes the given header field back in the stream so that the bytes are
257 * read again. This method is called if an invalid header was detected. Then
258 * search has to continue at the next byte after the frame sync byte.
259 *
260 * @param field the header bit field with the invalid frame header
261 * @throws IOException if an error occurs
262 */
263 private void pushBack(HeaderBitField field) throws IOException
264 {
265 unread(field.toArray());
266 }
267
268 /**
269 * Skips the given number of bytes from the specified input stream.
270 *
271 * @param in the input stream
272 * @param count the number of bytes to skip
273 * @throws IOException if an IO error occurs
274 */
275 private static void skipStream(InputStream in, long count)
276 throws IOException
277 {
278 long size = count;
279 long skipped = 0;
280 while (size > 0 && skipped >= 0)
281 {
282 skipped = in.skip(size);
283 if (skipped != -1)
284 {
285 size -= skipped;
286 }
287 }
288 }
289
290 /**
291 * Calculates the bit rate based on the given parameters.
292 *
293 * @param mpegVer the MPEG version
294 * @param layer the layer
295 * @param code the code for the bit rate
296 * @return the bit rate in bits per second
297 */
298 private static int calculateBitRate(int mpegVer, int layer, int code)
299 {
300 int[] arr = null;
301
302 if (mpegVer == AudioFrame.MPEG_V1)
303 {
304 switch (layer)
305 {
306 case AudioFrame.LAYER_1:
307 arr = BIT_RATE_MPEG1_L1;
308 break;
309 case AudioFrame.LAYER_2:
310 arr = BIT_RATE_MPEG1_L2;
311 break;
312 case AudioFrame.LAYER_3:
313 arr = BIT_RATE_MPEG1_L3;
314 break;
315 }
316 }
317 else
318 {
319 if (layer == AudioFrame.LAYER_1)
320 {
321 arr = BIT_RATE_MPEG2_L1;
322 }
323 else
324 {
325 arr = BIT_RATE_MPEG2_L2;
326 }
327 }
328 return arr[code];
329 }
330
331 /**
332 * Calculates the sample rate based on the given parameters.
333 *
334 * @param mpegVer the MPEG version
335 * @param code the code for the sample rate
336 * @return the sample rate in samples per second
337 */
338 private static int calculateSampleRate(int mpegVer, int code)
339 {
340 return SAMPLE_RATE[mpegVer][code];
341 }
342
343 /**
344 * Calculates the length of an MPEG frame based on the given parameters.
345 *
346 * @param layer the layer
347 * @param bitRate the bit rate
348 * @param sampleRate the sample rate
349 * @param padding the padding flag
350 * @return the length of the frame in bytes
351 */
352 private static int calculateFrameLength(int layer, int bitRate,
353 int sampleRate, int padding)
354 {
355 if (layer == AudioFrame.LAYER_1)
356 {
357 return (12 * bitRate / sampleRate + padding) * 4;
358 }
359 else
360 {
361 return 144 * bitRate / sampleRate + padding;
362 }
363 }
364
365 /**
366 * Calculates the duration of a MPEG frame based on the given parameters.
367 *
368 * @param layer the layer
369 * @param sampleRate the sample rate
370 * @return the duration of this frame in milliseconds
371 */
372 private static float calculateDuration(int layer, int sampleRate)
373 {
374 int sampleCount =
375 (layer == AudioFrame.LAYER_1) ? SAMPLE_COUNT_L1
376 : SAMPLE_COUNT_L2;
377 return (1000.0f / sampleRate) * sampleCount;
378 }
379
380 /**
381 * Calculates the number of channels based on the given parameters.
382 *
383 * @param chan the code for the channels
384 * @return the number of channels
385 */
386 private static int calculateChannels(int chan)
387 {
388 return chan < 3 ? 2 : 1;
389 }
390
391 /**
392 * Creates the complete array for the sample rate mapping.
393 *
394 * @return the table for the sample rates
395 */
396 private static int[][] createSampleRateTable()
397 {
398 int[][] arr = new int[4][];
399 arr[AudioFrame.MPEG_V1] = SAMPLE_RATE_MPEG1;
400 arr[AudioFrame.MPEG_V2] = SAMPLE_RATE_MPEG2;
401 arr[AudioFrame.MPEG_V2_5] = SAMPLE_RATE_MPEG2_5;
402 return arr;
403 }
404
405 /**
406 * A class representing the bit field of an MPEG header. It allows
407 * convenient access to specific bit groups.
408 */
409 private static class HeaderBitField
410 {
411 /** The internal value. */
412 private int value;
413
414 /**
415 * Adds a byte to this field.
416 *
417 * @param b the byte to be added
418 */
419 public void add(int b)
420 {
421 value <<= 8;
422 value |= b;
423 }
424
425 /**
426 * Returns the value of the bit group from the given start and end
427 * index. E.g. ''from'' = 0, ''to'' = 3 will return the value of the
428 * first 4 bits.
429 *
430 * @param the from index
431 * @param to the to index
432 * @return the value of this group of bits
433 */
434 public int get(int from, int to)
435 {
436 int shiftVal = value >> from;
437 int mask = (1 << (to - from + 1)) - 1;
438 return shiftVal & mask;
439 }
440
441 /**
442 * Returns the value of the bit with the given index. The bit index is
443 * 0-based. Result is either 0 or 1, depending on the value of this bit.
444 *
445 * @param bit the bit index
446 * @return the value of this bit
447 */
448 public int get(int bit)
449 {
450 return get(bit, bit);
451 }
452
453 /**
454 * Returns the internal value of this field as an array. The array
455 * contains 3 bytes.
456 *
457 * @return the internal value of this field as int array
458 */
459 public byte[] toArray()
460 {
461 byte[] result = new byte[3];
462 result[0] = (byte) get(16, 23);
463 result[1] = (byte) get(8, 15);
464 result[2] = (byte) get(0, 7);
465 return result;
466 }
467 }
468 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.mp4;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.util.Arrays;
21 import java.util.Collections;
22 import java.util.Date;
23 import java.util.HashMap;
24 import java.util.List;
25 import java.util.Map;
26 import java.util.Set;
27
28 import org.apache.tika.exception.TikaException;
29 import org.apache.tika.io.TikaInputStream;
30 import org.apache.tika.metadata.Metadata;
31 import org.apache.tika.metadata.Property;
32 import org.apache.tika.metadata.TikaCoreProperties;
33 import org.apache.tika.metadata.XMPDM;
34 import org.apache.tika.mime.MediaType;
35 import org.apache.tika.parser.AbstractParser;
36 import org.apache.tika.parser.ParseContext;
37 import org.apache.tika.sax.XHTMLContentHandler;
38 import org.xml.sax.ContentHandler;
39 import org.xml.sax.SAXException;
40
41 import com.coremedia.iso.IsoFile;
42 import com.coremedia.iso.boxes.Box;
43 import com.coremedia.iso.boxes.ContainerBox;
44 import com.coremedia.iso.boxes.FileTypeBox;
45 import com.coremedia.iso.boxes.MetaBox;
46 import com.coremedia.iso.boxes.MovieBox;
47 import com.coremedia.iso.boxes.MovieHeaderBox;
48 import com.coremedia.iso.boxes.SampleDescriptionBox;
49 import com.coremedia.iso.boxes.SampleTableBox;
50 import com.coremedia.iso.boxes.TrackBox;
51 import com.coremedia.iso.boxes.TrackHeaderBox;
52 import com.coremedia.iso.boxes.UserDataBox;
53 import com.coremedia.iso.boxes.apple.AbstractAppleMetaDataBox;
54 import com.coremedia.iso.boxes.apple.AppleAlbumBox;
55 import com.coremedia.iso.boxes.apple.AppleArtistBox;
56 import com.coremedia.iso.boxes.apple.AppleCommentBox;
57 import com.coremedia.iso.boxes.apple.AppleCustomGenreBox;
58 import com.coremedia.iso.boxes.apple.AppleEncoderBox;
59 import com.coremedia.iso.boxes.apple.AppleItemListBox;
60 import com.coremedia.iso.boxes.apple.AppleRecordingYearBox;
61 import com.coremedia.iso.boxes.apple.AppleStandardGenreBox;
62 import com.coremedia.iso.boxes.apple.AppleTrackAuthorBox;
63 import com.coremedia.iso.boxes.apple.AppleTrackNumberBox;
64 import com.coremedia.iso.boxes.apple.AppleTrackTitleBox;
65 import com.coremedia.iso.boxes.sampleentry.AudioSampleEntry;
66
67 /**
68 * Parser for the MP4 media container format, as well as the older
69 * QuickTime format that MP4 is based on.
70 *
71 * This uses the MP4Parser project from http://code.google.com/p/mp4parser/
72 * to do the underlying parsing
73 */
74 public class MP4Parser extends AbstractParser {
75 /** Serial version UID */
76 private static final long serialVersionUID = 84011216792285L;
77
78 // Ensure this stays in Sync with the entries in tika-mimetypes.xml
79 private static final Map<MediaType,List<String>> typesMap = new HashMap<MediaType, List<String>>();
80 static {
81 // All types should be 4 bytes long, space padded as needed
82 typesMap.put(MediaType.audio("mp4"), Arrays.asList(
83 "M4A ", "M4B ", "F4A ", "F4B "));
84 typesMap.put(MediaType.video("3gpp"), Arrays.asList(
85 "3ge6", "3ge7", "3gg6", "3gp1", "3gp2", "3gp3", "3gp4", "3gp5", "3gp6", "3gs7"));
86 typesMap.put(MediaType.video("3gpp2"), Arrays.asList(
87 "3g2a", "3g2b", "3g2c"));
88 typesMap.put(MediaType.video("mp4"), Arrays.asList(
89 "mp41", "mp42"));
90 typesMap.put(MediaType.video("x-m4v"), Arrays.asList(
91 "M4V ", "M4VH", "M4VP"));
92
93 typesMap.put(MediaType.video("quicktime"), Collections.<String>emptyList());
94 typesMap.put(MediaType.application("mp4"), Collections.<String>emptyList());
95 }
96
97 private static final Set<MediaType> SUPPORTED_TYPES =
98 Collections.unmodifiableSet(typesMap.keySet());
99
100 public Set<MediaType> getSupportedTypes(ParseContext context) {
101 return SUPPORTED_TYPES;
102 }
103
104
105 public void parse(
106 InputStream stream, ContentHandler handler,
107 Metadata metadata, ParseContext context)
108 throws IOException, SAXException, TikaException {
109 IsoFile isoFile;
110
111 // The MP4Parser library accepts either a File, or a byte array
112 // As MP4 video files are typically large, always use a file to
113 // avoid OOMs that may occur with in-memory buffering
114 TikaInputStream tstream = TikaInputStream.get(stream);
115 try {
116 isoFile = new IsoFile(tstream.getFileChannel());
117 } finally {
118 tstream.close();
119 }
120
121
122 // Grab the file type box
123 FileTypeBox fileType = getOrNull(isoFile, FileTypeBox.class);
124 if (fileType != null) {
125 // Identify the type
126 MediaType type = MediaType.application("mp4");
127 for (MediaType t : typesMap.keySet()) {
128 if (typesMap.get(t).contains(fileType.getMajorBrand())) {
129 type = t;
130 break;
131 }
132 }
133 metadata.set(Metadata.CONTENT_TYPE, type.toString());
134
135 if (type.getType().equals("audio")) {
136 metadata.set(XMPDM.AUDIO_COMPRESSOR, fileType.getMajorBrand().trim());
137 }
138 } else {
139 // Some older QuickTime files lack the FileType
140 metadata.set(Metadata.CONTENT_TYPE, "video/quicktime");
141 }
142
143
144 // Get the main MOOV box
145 MovieBox moov = getOrNull(isoFile, MovieBox.class);
146 if (moov == null) {
147 // Bail out
148 return;
149 }
150
151
152 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
153 xhtml.startDocument();
154
155
156 // Pull out some information from the header box
157 MovieHeaderBox mHeader = getOrNull(moov, MovieHeaderBox.class);
158 if (mHeader != null) {
159 // Get the creation and modification dates
160 metadata.set(
161 Metadata.CREATION_DATE,
162 MP4TimeToDate(mHeader.getCreationTime())
163 );
164 metadata.set(
165 TikaCoreProperties.MODIFIED,
166 MP4TimeToDate(mHeader.getModificationTime())
167 );
168
169 // Get the duration
170 double durationSeconds = ((double)mHeader.getDuration()) / mHeader.getTimescale();
171 // TODO Use this
172
173 // The timescale is normally the sampling rate
174 metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int)mHeader.getTimescale());
175 }
176
177
178 // Get some more information from the track header
179 // TODO Decide how to handle multiple tracks
180 List<TrackBox> tb = moov.getBoxes(TrackBox.class);
181 if (tb.size() > 0) {
182 TrackBox track = tb.get(0);
183
184 TrackHeaderBox header = track.getTrackHeaderBox();
185 // Get the creation and modification dates
186 metadata.set(
187 TikaCoreProperties.CREATED,
188 MP4TimeToDate(header.getCreationTime())
189 );
190 metadata.set(
191 TikaCoreProperties.MODIFIED,
192 MP4TimeToDate(header.getModificationTime())
193 );
194
195 // Get the video with and height
196 metadata.set(Metadata.IMAGE_WIDTH, (int)header.getWidth());
197 metadata.set(Metadata.IMAGE_LENGTH, (int)header.getHeight());
198
199 // Get the sample information
200 SampleTableBox samples = track.getSampleTableBox();
201 SampleDescriptionBox sampleDesc = samples.getSampleDescriptionBox();
202 if (sampleDesc != null) {
203 // Look for the first Audio Sample, if present
204 AudioSampleEntry sample = getOrNull(sampleDesc, AudioSampleEntry.class);
205 if (sample != null) {
206 XMPDM.ChannelTypePropertyConverter.convertAndSet(metadata, sample.getChannelCount());
207 //metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, sample.getSampleSize()); // TODO Num -> Type mapping
208 metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int)sample.getSampleRate());
209 //metadata.set(XMPDM.AUDIO_, sample.getSamplesPerPacket());
210 //metadata.set(XMPDM.AUDIO_, sample.getBytesPerSample());
211 }
212 }
213 }
214
215 // Get metadata from the User Data Box
216 UserDataBox userData = getOrNull(moov, UserDataBox.class);
217 if (userData != null) {
218 MetaBox meta = getOrNull(userData, MetaBox.class);
219
220 // Check for iTunes Metadata
221 // See http://atomicparsley.sourceforge.net/mpeg-4files.html and
222 // http://code.google.com/p/mp4v2/wiki/iTunesMetadata for more on these
223 AppleItemListBox apple = getOrNull(meta, AppleItemListBox.class);
224 if (apple != null) {
225 // Title
226 AppleTrackTitleBox title = getOrNull(apple, AppleTrackTitleBox.class);
227 addMetadata(TikaCoreProperties.TITLE, metadata, title);
228
229 // Artist
230 AppleArtistBox artist = getOrNull(apple, AppleArtistBox.class);
231 addMetadata(TikaCoreProperties.CREATOR, metadata, artist);
232 addMetadata(XMPDM.ARTIST, metadata, artist);
233
234 // Album
235 AppleAlbumBox album = getOrNull(apple, AppleAlbumBox.class);
236 addMetadata(XMPDM.ALBUM, metadata, album);
237
238 // Composer
239 AppleTrackAuthorBox composer = getOrNull(apple, AppleTrackAuthorBox.class);
240 addMetadata(XMPDM.COMPOSER, metadata, composer);
241
242 // Genre
243 AppleStandardGenreBox sGenre = getOrNull(apple, AppleStandardGenreBox.class);
244 AppleCustomGenreBox cGenre = getOrNull(apple, AppleCustomGenreBox.class);
245 addMetadata(XMPDM.GENRE, metadata, sGenre);
246 addMetadata(XMPDM.GENRE, metadata, cGenre);
247
248 // Year
249 AppleRecordingYearBox year = getOrNull(apple, AppleRecordingYearBox.class);
250 addMetadata(XMPDM.RELEASE_DATE, metadata, year);
251
252 // Track number
253 AppleTrackNumberBox trackNum = getOrNull(apple, AppleTrackNumberBox.class);
254 if (trackNum != null) {
255 metadata.set(XMPDM.TRACK_NUMBER, trackNum.getTrackNumber());
256 //metadata.set(XMPDM.NUMBER_OF_TRACKS, trackNum.getNumberOfTracks()); // TODO
257 }
258
259 // Comment
260 AppleCommentBox comment = getOrNull(apple, AppleCommentBox.class);
261 addMetadata(XMPDM.LOG_COMMENT, metadata, comment);
262
263 // Encoder
264 AppleEncoderBox encoder = getOrNull(apple, AppleEncoderBox.class);
265 // addMetadata(XMPDM.???, metadata, encoder); // TODO
266
267
268 // As text
269 for (Box box : apple.getBoxes()) {
270 if (box instanceof AbstractAppleMetaDataBox) {
271 xhtml.element("p", ((AbstractAppleMetaDataBox)box).getValue());
272 }
273 }
274 }
275
276 // TODO Check for other kinds too
277 }
278
279 // All done
280 xhtml.endDocument();
281 }
282
283 private static void addMetadata(String key, Metadata m, AbstractAppleMetaDataBox metadata) {
284 if (metadata != null) {
285 m.add(key, metadata.getValue());
286 }
287 }
288 private static void addMetadata(Property prop, Metadata m, AbstractAppleMetaDataBox metadata) {
289 if (metadata != null) {
290 m.set(prop, metadata.getValue());
291 }
292 }
293
294 /**
295 * MP4 Dates are stored as 32-bit integer, which represent the seconds
296 * since midnight, January 1, 1904, and are generally in UTC
297 */
298 private static Date MP4TimeToDate(long mp4Time) {
299 long unix = mp4Time - EPOC_AS_MP4_TIME;
300 return new Date(unix*1000);
301 }
302 private static final long EPOC_AS_MP4_TIME = 2082844800l;
303
304 private static <T extends Box> T getOrNull(ContainerBox box, Class<T> clazz) {
305 if (box == null) return null;
306
307 List<T> boxes = box.getBoxes(clazz);
308 if (boxes.size() == 0) {
309 return null;
310 }
311 return boxes.get(0);
312 }
313 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.netcdf;
17
18 //JDK imports
19 import java.io.ByteArrayOutputStream;
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.util.Collections;
23 import java.util.Set;
24
25 import org.apache.tika.exception.TikaException;
26 import org.apache.tika.io.IOUtils;
27 import org.apache.tika.metadata.Metadata;
28 import org.apache.tika.metadata.Property;
29 import org.apache.tika.metadata.TikaCoreProperties;
30 import org.apache.tika.mime.MediaType;
31 import org.apache.tika.parser.AbstractParser;
32 import org.apache.tika.parser.ParseContext;
33 import org.apache.tika.parser.Parser;
34 import org.apache.tika.sax.XHTMLContentHandler;
35 import org.xml.sax.ContentHandler;
36 import org.xml.sax.SAXException;
37
38 import ucar.nc2.Attribute;
39 import ucar.nc2.NetcdfFile;
40
41 /**
42 * A {@link Parser} for <a
43 * href="http://www.unidata.ucar.edu/software/netcdf/index.html">NetCDF</a>
44 * files using the UCAR, MIT-licensed <a
45 * href="http://www.unidata.ucar.edu/software/netcdf-java/">NetCDF for Java</a>
46 * API.
47 */
48 public class NetCDFParser extends AbstractParser {
49
50 /** Serial version UID */
51 private static final long serialVersionUID = -5940938274907708665L;
52
53 private final Set<MediaType> SUPPORTED_TYPES =
54 Collections.singleton(MediaType.application("x-netcdf"));
55
56 /*
57 * (non-Javadoc)
58 *
59 * @see
60 * org.apache.tika.parser.Parser#getSupportedTypes(org.apache.tika.parser
61 * .ParseContext)
62 */
63 public Set<MediaType> getSupportedTypes(ParseContext context) {
64 return SUPPORTED_TYPES;
65 }
66
67 /*
68 * (non-Javadoc)
69 *
70 * @see org.apache.tika.parser.Parser#parse(java.io.InputStream,
71 * org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata,
72 * org.apache.tika.parser.ParseContext)
73 */
74 public void parse(InputStream stream, ContentHandler handler,
75 Metadata metadata, ParseContext context) throws IOException,
76 SAXException, TikaException {
77 ByteArrayOutputStream os = new ByteArrayOutputStream();
78 IOUtils.copy(stream, os);
79
80 String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
81 if (name == null) {
82 name = "";
83 }
84
85 try {
86 NetcdfFile ncFile = NetcdfFile.openInMemory(name, os.toByteArray());
87
88 // first parse out the set of global attributes
89 for (Attribute attr : ncFile.getGlobalAttributes()) {
90 Property property = resolveMetadataKey(attr.getName());
91 if (attr.getDataType().isString()) {
92 metadata.add(property, attr.getStringValue());
93 } else if (attr.getDataType().isNumeric()) {
94 int value = attr.getNumericValue().intValue();
95 metadata.add(property, String.valueOf(value));
96 }
97 }
98 } catch (IOException e) {
99 throw new TikaException("NetCDF parse error", e);
100 }
101
102 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
103 xhtml.startDocument();
104 xhtml.endDocument();
105 }
106
107 private Property resolveMetadataKey(String localName) {
108 if ("title".equals(localName)) {
109 return TikaCoreProperties.TITLE;
110 }
111 return Property.internalText(localName);
112 }
113
114 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.odf;
17
18 import java.io.IOException;
19 import java.io.StringReader;
20
21 import org.apache.tika.sax.ContentHandlerDecorator;
22 import org.xml.sax.Attributes;
23 import org.xml.sax.ContentHandler;
24 import org.xml.sax.InputSource;
25 import org.xml.sax.SAXException;
26 import org.xml.sax.helpers.AttributesImpl;
27
28 /**
29 * Content handler decorator that:<ul>
30 * <li>Maps old OpenOffice 1.0 Namespaces to the OpenDocument ones</li>
31 * <li>Returns a fake DTD when parser requests OpenOffice DTD</li>
32 * </ul>
33 */
34 public class NSNormalizerContentHandler extends ContentHandlerDecorator {
35
36 private static final String OLD_NS =
37 "http://openoffice.org/2000/";
38
39 private static final String NEW_NS =
40 "urn:oasis:names:tc:opendocument:xmlns:";
41
42 private static final String DTD_PUBLIC_ID =
43 "-//OpenOffice.org//DTD OfficeDocument 1.0//EN";
44
45 public NSNormalizerContentHandler(ContentHandler handler) {
46 super(handler);
47 }
48
49 private String mapOldNS(String ns) {
50 if (ns != null && ns.startsWith(OLD_NS)) {
51 return NEW_NS + ns.substring(OLD_NS.length()) + ":1.0";
52 } else {
53 return ns;
54 }
55 }
56
57 @Override
58 public void startElement(
59 String namespaceURI, String localName, String qName,
60 Attributes atts) throws SAXException {
61 AttributesImpl natts = new AttributesImpl();
62 for (int i = 0; i < atts.getLength(); i++) {
63 natts.addAttribute(
64 mapOldNS(atts.getURI(i)), atts.getLocalName(i),
65 atts.getQName(i), atts.getType(i), atts.getValue(i));
66 }
67 super.startElement(mapOldNS(namespaceURI), localName, qName, atts);
68 }
69
70 @Override
71 public void endElement(String namespaceURI, String localName, String qName)
72 throws SAXException {
73 super.endElement(mapOldNS(namespaceURI), localName, qName);
74 }
75
76 @Override
77 public void startPrefixMapping(String prefix, String uri)
78 throws SAXException {
79 super.startPrefixMapping(prefix, mapOldNS(uri));
80 }
81
82 /**
83 * do not load any DTDs (may be requested by parser). Fake the DTD by
84 * returning a empty string as InputSource
85 */
86 @Override
87 public InputSource resolveEntity(String publicId, String systemId)
88 throws IOException, SAXException {
89 if ((systemId != null && systemId.toLowerCase().endsWith(".dtd"))
90 || DTD_PUBLIC_ID.equals(publicId)) {
91 return new InputSource(new StringReader(""));
92 } else {
93 return super.resolveEntity(publicId, systemId);
94 }
95 }
96
97 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.odf;
17
18 import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
19
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.util.BitSet;
23 import java.util.Collections;
24 import java.util.HashMap;
25 import java.util.Map;
26 import java.util.Set;
27 import java.util.Stack;
28
29 import javax.xml.XMLConstants;
30 import javax.xml.namespace.QName;
31 import javax.xml.parsers.ParserConfigurationException;
32 import javax.xml.parsers.SAXParser;
33 import javax.xml.parsers.SAXParserFactory;
34
35 import org.apache.tika.exception.TikaException;
36 import org.apache.tika.io.CloseShieldInputStream;
37 import org.apache.tika.metadata.Metadata;
38 import org.apache.tika.mime.MediaType;
39 import org.apache.tika.parser.AbstractParser;
40 import org.apache.tika.parser.ParseContext;
41 import org.apache.tika.sax.ElementMappingContentHandler;
42 import org.apache.tika.sax.OfflineContentHandler;
43 import org.apache.tika.sax.XHTMLContentHandler;
44 import org.apache.tika.sax.ElementMappingContentHandler.TargetElement;
45 import org.xml.sax.Attributes;
46 import org.xml.sax.ContentHandler;
47 import org.xml.sax.SAXException;
48 import org.xml.sax.SAXNotRecognizedException;
49 import org.xml.sax.helpers.AttributesImpl;
50 import org.xml.sax.helpers.DefaultHandler;
51
52 /**
53 * Parser for ODF <code>content.xml</code> files.
54 */
55 public class OpenDocumentContentParser extends AbstractParser {
56
57 private static final class OpenDocumentElementMappingContentHandler extends
58 ElementMappingContentHandler {
59 private final ContentHandler handler;
60 private final BitSet textNodeStack = new BitSet();
61 private int nodeDepth = 0;
62 private int completelyFiltered = 0;
63 private Stack<String> headingStack = new Stack<String>();
64
65 private OpenDocumentElementMappingContentHandler(ContentHandler handler,
66 Map<QName, TargetElement> mappings) {
67 super(handler, mappings);
68 this.handler = handler;
69 }
70
71 @Override
72 public void characters(char[] ch, int start, int length)
73 throws SAXException {
74 // only forward content of tags from text:-namespace
75 if (completelyFiltered == 0 && nodeDepth > 0
76 && textNodeStack.get(nodeDepth - 1)) {
77 super.characters(ch,start,length);
78 }
79 }
80
81 // helper for checking tags which need complete filtering
82 // (with sub-tags)
83 private boolean needsCompleteFiltering(
84 String namespaceURI, String localName) {
85 if (TEXT_NS.equals(namespaceURI)) {
86 return localName.endsWith("-template")
87 || localName.endsWith("-style");
88 } else if (TABLE_NS.equals(namespaceURI)) {
89 return "covered-table-cell".equals(localName);
90 } else {
91 return false;
92 }
93 }
94
95 // map the heading level to <hX> HTML tags
96 private String getXHTMLHeaderTagName(Attributes atts) {
97 String depthStr = atts.getValue(TEXT_NS, "outline-level");
98 if (depthStr == null) {
99 return "h1";
100 }
101
102 int depth = Integer.parseInt(depthStr);
103 if (depth >= 6) {
104 return "h6";
105 } else if (depth <= 1) {
106 return "h1";
107 } else {
108 return "h" + depth;
109 }
110 }
111
112 /**
113 * Check if a node is a text node
114 */
115 private boolean isTextNode(String namespaceURI, String localName) {
116 if (TEXT_NS.equals(namespaceURI) && !localName.equals("page-number") && !localName.equals("page-count")) {
117 return true;
118 }
119 if (SVG_NS.equals(namespaceURI)) {
120 return "title".equals(localName) ||
121 "desc".equals(localName);
122 }
123 return false;
124 }
125
126 @Override
127 public void startElement(
128 String namespaceURI, String localName, String qName,
129 Attributes atts) throws SAXException {
130 // keep track of current node type. If it is a text node,
131 // a bit at the current depth ist set in textNodeStack.
132 // characters() checks the top bit to determine, if the
133 // actual node is a text node to print out nodeDepth contains
134 // the depth of the current node and also marks top of stack.
135 assert nodeDepth >= 0;
136
137 textNodeStack.set(nodeDepth++,
138 isTextNode(namespaceURI, localName));
139 // filter *all* content of some tags
140 assert completelyFiltered >= 0;
141
142 if (needsCompleteFiltering(namespaceURI, localName)) {
143 completelyFiltered++;
144 }
145 // call next handler if no filtering
146 if (completelyFiltered == 0) {
147 // special handling of text:h, that are directly passed
148 // to incoming handler
149 if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
150 final String el = headingStack.push(getXHTMLHeaderTagName(atts));
151 handler.startElement(XHTMLContentHandler.XHTML, el, el, EMPTY_ATTRIBUTES);
152 } else {
153 super.startElement(
154 namespaceURI, localName, qName, atts);
155 }
156 }
157 }
158
159 @Override
160 public void endElement(
161 String namespaceURI, String localName, String qName)
162 throws SAXException {
163 // call next handler if no filtering
164 if (completelyFiltered == 0) {
165 // special handling of text:h, that are directly passed
166 // to incoming handler
167 if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
168 final String el = headingStack.pop();
169 handler.endElement(XHTMLContentHandler.XHTML, el, el);
170 } else {
171 super.endElement(namespaceURI,localName,qName);
172 }
173
174 // special handling of tabulators
175 if (TEXT_NS.equals(namespaceURI)
176 && ("tab-stop".equals(localName)
177 || "tab".equals(localName))) {
178 this.characters(TAB, 0, TAB.length);
179 }
180 }
181
182 // revert filter for *all* content of some tags
183 if (needsCompleteFiltering(namespaceURI,localName)) {
184 completelyFiltered--;
185 }
186 assert completelyFiltered >= 0;
187
188 // reduce current node depth
189 nodeDepth--;
190 assert nodeDepth >= 0;
191 }
192
193 @Override
194 public void startPrefixMapping(String prefix, String uri) {
195 // remove prefix mappings as they should not occur in XHTML
196 }
197
198 @Override
199 public void endPrefixMapping(String prefix) {
200 // remove prefix mappings as they should not occur in XHTML
201 }
202 }
203
204 public static final String TEXT_NS =
205 "urn:oasis:names:tc:opendocument:xmlns:text:1.0";
206
207 public static final String TABLE_NS =
208 "urn:oasis:names:tc:opendocument:xmlns:table:1.0";
209
210 public static final String OFFICE_NS =
211 "urn:oasis:names:tc:opendocument:xmlns:office:1.0";
212
213 public static final String SVG_NS =
214 "urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0";
215
216 public static final String PRESENTATION_NS =
217 "urn:oasis:names:tc:opendocument:xmlns:presentation:1.0";
218
219 public static final String DRAW_NS =
220 "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0";
221
222 public static final String XLINK_NS = "http://www.w3.org/1999/xlink";
223
224 protected static final char[] TAB = new char[] { '\t' };
225
226 private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
227
/**
 * Mappings between ODF tag names and XHTML tag names
 * (including attributes). All other tag names/attributes are ignored
 * and left out from event stream.
 */
private static final HashMap<QName, TargetElement> MAPPINGS =
        new HashMap<QName, TargetElement>();

static {
    // general mappings of text:-tags
    mapElement(TEXT_NS, "p", "p");
    // text:h-tags are mapped specifically in startElement/endElement
    mapElement(TEXT_NS, "line-break", "br");
    mapElement(TEXT_NS, "list", "ul");
    mapElement(TEXT_NS, "list-item", "li");
    mapElement(TEXT_NS, "note", "div");
    mapElement(OFFICE_NS, "annotation", "div");
    mapElement(PRESENTATION_NS, "notes", "div");
    mapElement(DRAW_NS, "object", "object");
    mapElement(DRAW_NS, "text-box", "div");
    mapElement(SVG_NS, "title", "span");
    mapElement(SVG_NS, "desc", "span");
    mapElement(TEXT_NS, "span", "span");

    // links keep their target and title as plain XHTML attributes
    final HashMap<QName,QName> linkAttributes = new HashMap<QName,QName>();
    linkAttributes.put(new QName(XLINK_NS, "href"), new QName("href"));
    linkAttributes.put(new QName(XLINK_NS, "title"), new QName("title"));
    MAPPINGS.put(
            new QName(TEXT_NS, "a"),
            new TargetElement(XHTML, "a", linkAttributes));

    // create HTML tables from table:-tags
    mapElement(TABLE_NS, "table", "table");
    // repeating of rows is ignored; for columns, see below!
    mapElement(TABLE_NS, "table-row", "tr");

    // special mapping for rowspan/colspan attributes
    final HashMap<QName,QName> cellAttributes = new HashMap<QName,QName>();
    cellAttributes.put(
            new QName(TABLE_NS, "number-columns-spanned"), new QName("colspan"));
    cellAttributes.put(
            new QName(TABLE_NS, "number-rows-spanned"), new QName("rowspan"));
    /* TODO: The following is not correct, the cell should be repeated not spanned!
     * Code generates a HTML cell, spanning all repeated columns, to make the cell look correct.
     * Problems may occur when both spanning and repeating is given, which is not allowed by spec.
     * Cell spanning instead of repeating is not a problem, because OpenOffice uses it
     * only for empty cells.
     */
    cellAttributes.put(
            new QName(TABLE_NS, "number-columns-repeated"), new QName("colspan"));
    MAPPINGS.put(
            new QName(TABLE_NS, "table-cell"),
            new TargetElement(XHTML, "td", cellAttributes));
}

/**
 * Registers a mapping from an ODF element to an XHTML element that
 * carries no attributes over.
 */
private static void mapElement(String odfNamespace, String odfName, String xhtmlName) {
    MAPPINGS.put(
            new QName(odfNamespace, odfName),
            new TargetElement(XHTML, xhtmlName));
}
318
319 public Set<MediaType> getSupportedTypes(ParseContext context) {
320 return Collections.emptySet(); // not a top-level parser
321 }
322
323 public void parse(
324 InputStream stream, ContentHandler handler,
325 Metadata metadata, ParseContext context)
326 throws IOException, SAXException, TikaException {
327 parseInternal(stream,
328 new XHTMLContentHandler(handler,metadata),
329 metadata, context);
330 }
331
332 void parseInternal(
333 InputStream stream, final ContentHandler handler,
334 Metadata metadata, ParseContext context)
335 throws IOException, SAXException, TikaException {
336
337 DefaultHandler dh = new OpenDocumentElementMappingContentHandler(handler, MAPPINGS);
338
339 try {
340 SAXParserFactory factory = SAXParserFactory.newInstance();
341 factory.setValidating(false);
342 factory.setNamespaceAware(true);
343 try {
344 factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
345 } catch (SAXNotRecognizedException e){
346 // TIKA-329: Some XML parsers do not support the secure-processing
347 // feature, even though it's required by JAXP in Java 5. Ignoring
348 // the exception is fine here, deployments without this feature
349 // are inherently vulnerable to XML denial-of-service attacks.
350 }
351 SAXParser parser = factory.newSAXParser();
352 parser.parse(
353 new CloseShieldInputStream(stream),
354 new OfflineContentHandler(
355 new NSNormalizerContentHandler(dh)));
356 } catch (ParserConfigurationException e) {
357 throw new TikaException("XML parser configuration error", e);
358 }
359 }
360
361 }
362
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.odf;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20
21 import org.apache.tika.exception.TikaException;
22 import org.apache.tika.metadata.DublinCore;
23 import org.apache.tika.metadata.MSOffice;
24 import org.apache.tika.metadata.Metadata;
25 import org.apache.tika.metadata.Office;
26 import org.apache.tika.metadata.OfficeOpenXMLCore;
27 import org.apache.tika.metadata.PagedText;
28 import org.apache.tika.metadata.Property;
29 import org.apache.tika.metadata.TikaCoreProperties;
30 import org.apache.tika.parser.ParseContext;
31 import org.apache.tika.parser.xml.AttributeDependantMetadataHandler;
32 import org.apache.tika.parser.xml.AttributeMetadataHandler;
33 import org.apache.tika.parser.xml.ElementMetadataHandler;
34 import org.apache.tika.parser.xml.MetadataHandler;
35 import org.apache.tika.parser.xml.XMLParser;
36 import org.apache.tika.sax.TeeContentHandler;
37 import org.apache.tika.sax.xpath.CompositeMatcher;
38 import org.apache.tika.sax.xpath.Matcher;
39 import org.apache.tika.sax.xpath.MatchingContentHandler;
40 import org.apache.tika.sax.xpath.XPathParser;
41 import org.xml.sax.ContentHandler;
42 import org.xml.sax.SAXException;
43
44 /**
45 * Parser for OpenDocument <code>meta.xml</code> files.
46 */
47 public class OpenDocumentMetaParser extends XMLParser {
48 /**
49 * Serial version UID
50 */
51 private static final long serialVersionUID = -8739250869531737584L;
52
53 private static final String META_NS = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0";
54 private static final XPathParser META_XPATH = new XPathParser("meta", META_NS);
55
56 /**
57 * @see OfficeOpenXMLCore#SUBJECT
58 * @deprecated use OfficeOpenXMLCore#SUBJECT
59 */
60 @Deprecated
61 private static final Property TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR =
62 Property.composite(Office.INITIAL_AUTHOR,
63 new Property[] { Property.externalText("initial-creator") });
64
65 private static ContentHandler getDublinCoreHandler(
66 Metadata metadata, Property property, String element) {
67 return new ElementMetadataHandler(
68 DublinCore.NAMESPACE_URI_DC, element,
69 metadata, property);
70 }
71
72 private static ContentHandler getMeta(
73 ContentHandler ch, Metadata md, Property property, String element) {
74 Matcher matcher = new CompositeMatcher(
75 META_XPATH.parse("//meta:" + element),
76 META_XPATH.parse("//meta:" + element + "//text()"));
77 ContentHandler branch =
78 new MatchingContentHandler(new MetadataHandler(md, property), matcher);
79 return new TeeContentHandler(ch, branch);
80 }
81
82 private static ContentHandler getUserDefined(
83 ContentHandler ch, Metadata md) {
84 Matcher matcher = new CompositeMatcher(
85 META_XPATH.parse("//meta:user-defined/@meta:name"),
86 META_XPATH.parse("//meta:user-defined//text()"));
87 // eg <meta:user-defined meta:name="Info1">Text1</meta:user-defined> becomes custom:Info1=Text1
88 ContentHandler branch = new MatchingContentHandler(
89 new AttributeDependantMetadataHandler(md, "meta:name", Metadata.USER_DEFINED_METADATA_NAME_PREFIX),
90 matcher);
91 return new TeeContentHandler(ch, branch);
92 }
93
94 @Deprecated private static ContentHandler getStatistic(
95 ContentHandler ch, Metadata md, String name, String attribute) {
96 Matcher matcher =
97 META_XPATH.parse("//meta:document-statistic/@meta:"+attribute);
98 ContentHandler branch = new MatchingContentHandler(
99 new AttributeMetadataHandler(META_NS, attribute, md, name), matcher);
100 return new TeeContentHandler(ch, branch);
101 }
102 private static ContentHandler getStatistic(
103 ContentHandler ch, Metadata md, Property property, String attribute) {
104 Matcher matcher =
105 META_XPATH.parse("//meta:document-statistic/@meta:"+attribute);
106 ContentHandler branch = new MatchingContentHandler(
107 new AttributeMetadataHandler(META_NS, attribute, md, property), matcher);
108 return new TeeContentHandler(ch, branch);
109 }
110
111 protected ContentHandler getContentHandler(ContentHandler ch, Metadata md, ParseContext context) {
112 // We can no longer extend DcXMLParser due to the handling of dc:subject and dc:date
113 // Process the Dublin Core Attributes
114 ch = new TeeContentHandler(super.getContentHandler(ch, md, context),
115 getDublinCoreHandler(md, TikaCoreProperties.TITLE, "title"),
116 getDublinCoreHandler(md, TikaCoreProperties.CREATOR, "creator"),
117 getDublinCoreHandler(md, TikaCoreProperties.DESCRIPTION, "description"),
118 getDublinCoreHandler(md, TikaCoreProperties.PUBLISHER, "publisher"),
119 getDublinCoreHandler(md, TikaCoreProperties.CONTRIBUTOR, "contributor"),
120 getDublinCoreHandler(md, TikaCoreProperties.TYPE, "type"),
121 getDublinCoreHandler(md, TikaCoreProperties.FORMAT, "format"),
122 getDublinCoreHandler(md, TikaCoreProperties.IDENTIFIER, "identifier"),
123 getDublinCoreHandler(md, TikaCoreProperties.LANGUAGE, "language"),
124 getDublinCoreHandler(md, TikaCoreProperties.RIGHTS, "rights"));
125
126 // Process the OO Meta Attributes
127 ch = getMeta(ch, md, TikaCoreProperties.CREATED, "creation-date");
128 // ODF uses dc:date for modified
129 ch = new TeeContentHandler(ch, new ElementMetadataHandler(
130 DublinCore.NAMESPACE_URI_DC, "date",
131 md, TikaCoreProperties.MODIFIED));
132
133 // ODF uses dc:subject for description
134 ch = new TeeContentHandler(ch, new ElementMetadataHandler(
135 DublinCore.NAMESPACE_URI_DC, "subject",
136 md, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT));
137 ch = getMeta(ch, md, TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT, "keyword");
138
139 ch = getMeta(ch, md, Property.externalText(MSOffice.EDIT_TIME), "editing-duration");
140 ch = getMeta(ch, md, Property.externalText("editing-cycles"), "editing-cycles");
141 ch = getMeta(ch, md, TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR, "initial-creator");
142 ch = getMeta(ch, md, Property.externalText("generator"), "generator");
143
144 // Process the user defined Meta Attributes
145 ch = getUserDefined(ch, md);
146
147 // Process the OO Statistics Attributes
148 ch = getStatistic(ch, md, Office.OBJECT_COUNT, "object-count");
149 ch = getStatistic(ch, md, Office.IMAGE_COUNT, "image-count");
150 ch = getStatistic(ch, md, Office.PAGE_COUNT, "page-count");
151 ch = getStatistic(ch, md, PagedText.N_PAGES, "page-count");
152 ch = getStatistic(ch, md, Office.TABLE_COUNT, "table-count");
153 ch = getStatistic(ch, md, Office.PARAGRAPH_COUNT, "paragraph-count");
154 ch = getStatistic(ch, md, Office.WORD_COUNT, "word-count");
155 ch = getStatistic(ch, md, Office.CHARACTER_COUNT, "character-count");
156
157 // Legacy, Tika-1.0 style attributes
158 // TODO Remove these in Tika 2.0
159 ch = getStatistic(ch, md, MSOffice.OBJECT_COUNT, "object-count");
160 ch = getStatistic(ch, md, MSOffice.IMAGE_COUNT, "image-count");
161 ch = getStatistic(ch, md, MSOffice.PAGE_COUNT, "page-count");
162 ch = getStatistic(ch, md, MSOffice.TABLE_COUNT, "table-count");
163 ch = getStatistic(ch, md, MSOffice.PARAGRAPH_COUNT, "paragraph-count");
164 ch = getStatistic(ch, md, MSOffice.WORD_COUNT, "word-count");
165 ch = getStatistic(ch, md, MSOffice.CHARACTER_COUNT, "character-count");
166
167 // Legacy Statistics Attributes, replaced with real keys above
168 // TODO Remove these shortly, eg after Tika 1.1 (TIKA-770)
169 ch = getStatistic(ch, md, "nbPage", "page-count");
170 ch = getStatistic(ch, md, "nbPara", "paragraph-count");
171 ch = getStatistic(ch, md, "nbWord", "word-count");
172 ch = getStatistic(ch, md, "nbCharacter", "character-count");
173 ch = getStatistic(ch, md, "nbTab", "table-count");
174 ch = getStatistic(ch, md, "nbObject", "object-count");
175 ch = getStatistic(ch, md, "nbImg", "image-count");
176
177 // Normalise the rest
178 ch = new NSNormalizerContentHandler(ch);
179 return ch;
180 }
181
182 @Override
183 public void parse(
184 InputStream stream, ContentHandler handler,
185 Metadata metadata, ParseContext context)
186 throws IOException, SAXException, TikaException {
187 super.parse(stream, handler, metadata, context);
188 // Copy subject to description for OO2
189 String odfSubject = metadata.get(OfficeOpenXMLCore.SUBJECT);
190 if (odfSubject != null && !odfSubject.equals("") &&
191 (metadata.get(TikaCoreProperties.DESCRIPTION) == null || metadata.get(TikaCoreProperties.DESCRIPTION).equals(""))) {
192 metadata.set(TikaCoreProperties.DESCRIPTION, odfSubject);
193 }
194 }
195
196 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.odf;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.util.Arrays;
21 import java.util.Collections;
22 import java.util.HashSet;
23 import java.util.Set;
24 import java.util.zip.ZipEntry;
25 import java.util.zip.ZipInputStream;
26
27 //import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
28 //import org.apache.commons.compress.archivers.zip.ZipFile;
29 import org.apache.tika.exception.TikaException;
30 import org.apache.tika.io.IOUtils;
31 import org.apache.tika.metadata.Metadata;
32 import org.apache.tika.mime.MediaType;
33 import org.apache.tika.parser.AbstractParser;
34 import org.apache.tika.parser.ParseContext;
35 import org.apache.tika.parser.Parser;
36 import org.apache.tika.sax.EndDocumentShieldingContentHandler;
37 import org.apache.tika.sax.XHTMLContentHandler;
38 import org.xml.sax.ContentHandler;
39 import org.xml.sax.SAXException;
40 import org.xml.sax.helpers.DefaultHandler;
41
42 /**
43 * OpenOffice parser
44 */
45 public class OpenDocumentParser extends AbstractParser {
46
47 /** Serial version UID */
48 private static final long serialVersionUID = -6410276875438618287L;
49
50 private static final Set<MediaType> SUPPORTED_TYPES =
51 Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
52 MediaType.application("vnd.sun.xml.writer"),
53 MediaType.application("vnd.oasis.opendocument.text"),
54 MediaType.application("vnd.oasis.opendocument.graphics"),
55 MediaType.application("vnd.oasis.opendocument.presentation"),
56 MediaType.application("vnd.oasis.opendocument.spreadsheet"),
57 MediaType.application("vnd.oasis.opendocument.chart"),
58 MediaType.application("vnd.oasis.opendocument.image"),
59 MediaType.application("vnd.oasis.opendocument.formula"),
60 MediaType.application("vnd.oasis.opendocument.text-master"),
61 MediaType.application("vnd.oasis.opendocument.text-web"),
62 MediaType.application("vnd.oasis.opendocument.text-template"),
63 MediaType.application("vnd.oasis.opendocument.graphics-template"),
64 MediaType.application("vnd.oasis.opendocument.presentation-template"),
65 MediaType.application("vnd.oasis.opendocument.spreadsheet-template"),
66 MediaType.application("vnd.oasis.opendocument.chart-template"),
67 MediaType.application("vnd.oasis.opendocument.image-template"),
68 MediaType.application("vnd.oasis.opendocument.formula-template"),
69 MediaType.application("x-vnd.oasis.opendocument.text"),
70 MediaType.application("x-vnd.oasis.opendocument.graphics"),
71 MediaType.application("x-vnd.oasis.opendocument.presentation"),
72 MediaType.application("x-vnd.oasis.opendocument.spreadsheet"),
73 MediaType.application("x-vnd.oasis.opendocument.chart"),
74 MediaType.application("x-vnd.oasis.opendocument.image"),
75 MediaType.application("x-vnd.oasis.opendocument.formula"),
76 MediaType.application("x-vnd.oasis.opendocument.text-master"),
77 MediaType.application("x-vnd.oasis.opendocument.text-web"),
78 MediaType.application("x-vnd.oasis.opendocument.text-template"),
79 MediaType.application("x-vnd.oasis.opendocument.graphics-template"),
80 MediaType.application("x-vnd.oasis.opendocument.presentation-template"),
81 MediaType.application("x-vnd.oasis.opendocument.spreadsheet-template"),
82 MediaType.application("x-vnd.oasis.opendocument.chart-template"),
83 MediaType.application("x-vnd.oasis.opendocument.image-template"),
84 MediaType.application("x-vnd.oasis.opendocument.formula-template"))));
85
86 private Parser meta = new OpenDocumentMetaParser();
87
88 private Parser content = new OpenDocumentContentParser();
89
90 public Parser getMetaParser() {
91 return meta;
92 }
93
94 public void setMetaParser(Parser meta) {
95 this.meta = meta;
96 }
97
98 public Parser getContentParser() {
99 return content;
100 }
101
102 public void setContentParser(Parser content) {
103 this.content = content;
104 }
105
106 public Set<MediaType> getSupportedTypes(ParseContext context) {
107 return SUPPORTED_TYPES;
108 }
109
110 public void parse(
111 InputStream stream, ContentHandler baseHandler,
112 Metadata metadata, ParseContext context)
113 throws IOException, SAXException, TikaException {
114
115 // TODO: reuse the already opened ZIPFile, if
116 // present
117
118 /*
119 ZipFile zipFile;
120 if (stream instanceof TikaInputStream) {
121 TikaInputStream tis = (TikaInputStream) stream;
122 Object container = ((TikaInputStream) stream).getOpenContainer();
123 if (container instanceof ZipFile) {
124 zipFile = (ZipFile) container;
125 } else if (tis.hasFile()) {
126 zipFile = new ZipFile(tis.getFile());
127 }
128 }
129 */
130
131 // TODO: if incoming IS is a TIS with a file
132 // associated, we should open ZipFile so we can
133 // visit metadata, mimetype first; today we lose
134 // all the metadata if meta.xml is hit after
135 // content.xml in the stream. Then we can still
136 // read-once for the content.xml.
137
138 XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata);
139
140 // As we don't know which of the metadata or the content
141 // we'll hit first, catch the endDocument call initially
142 EndDocumentShieldingContentHandler handler =
143 new EndDocumentShieldingContentHandler(xhtml);
144
145 // Process the file in turn
146 ZipInputStream zip = new ZipInputStream(stream);
147 ZipEntry entry = zip.getNextEntry();
148 while (entry != null) {
149 if (entry.getName().equals("mimetype")) {
150 String type = IOUtils.toString(zip, "UTF-8");
151 metadata.set(Metadata.CONTENT_TYPE, type);
152 } else if (entry.getName().equals("meta.xml")) {
153 meta.parse(zip, new DefaultHandler(), metadata, context);
154 } else if (entry.getName().endsWith("content.xml")) {
155 if (content instanceof OpenDocumentContentParser) {
156 ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
157 } else {
158 // Foreign content parser was set:
159 content.parse(zip, handler, metadata, context);
160 }
161 } else if (entry.getName().endsWith("styles.xml")) {
162 if (content instanceof OpenDocumentContentParser) {
163 ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
164 } else {
165 // Foreign content parser was set:
166 content.parse(zip, handler, metadata, context);
167 }
168 }
169 entry = zip.getNextEntry();
170 }
171
172 // Only now call the end document
173 if(handler.getEndDocumentWasCalled()) {
174 handler.reallyEndDocument();
175 }
176 }
177
178 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.opendocument;
17
18 import org.apache.tika.parser.odf.OpenDocumentParser;
19
20 /**
21 * OpenOffice parser
22 *
23 * @deprecated Use the {@link OpenDocumentParser} class instead.
24 * This class will be removed in Apache Tika 1.0.
25 */
26 public class OpenOfficeParser extends OpenDocumentParser {
27 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.pdf;
17
18 import java.io.IOException;
19 import java.io.Writer;
20 import java.text.SimpleDateFormat;
21 import java.util.Calendar;
22 import java.util.Iterator;
23 import java.util.List;
24 import java.util.ListIterator;
25 import java.util.Map;
26 import java.util.TreeMap;
27
28 import org.apache.pdfbox.pdmodel.PDDocument;
29 import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
30 import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
31 import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
32 import org.apache.pdfbox.pdmodel.PDPage;
33 import org.apache.pdfbox.pdmodel.common.COSObjectable;
34 import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
35 import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
36 import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
37 import org.apache.pdfbox.pdmodel.interactive.action.type.PDAction;
38 import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionURI;
39 import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
40 import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
41 import org.apache.pdfbox.pdmodel.interactive.digitalsignature.PDSignature;
42 import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
43 import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
44 import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode;
45 import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
46 import org.apache.pdfbox.pdmodel.interactive.form.PDField;
47 import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField;
48 import org.apache.pdfbox.util.PDFTextStripper;
49 import org.apache.pdfbox.util.TextPosition;
50 import org.apache.tika.exception.TikaException;
51 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
52 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
53 import org.apache.tika.io.IOExceptionWithCause;
54 import org.apache.tika.io.TikaInputStream;
55 import org.apache.tika.metadata.Metadata;
56 import org.apache.tika.parser.ParseContext;
57 import org.apache.tika.sax.EmbeddedContentHandler;
58 import org.apache.tika.sax.XHTMLContentHandler;
59 import org.xml.sax.ContentHandler;
60 import org.xml.sax.SAXException;
61 import org.xml.sax.helpers.AttributesImpl;
62
63 /**
64 * Utility class that overrides the {@link PDFTextStripper} functionality
65 * to produce a semi-structured XHTML SAX events instead of a plain text
66 * stream.
67 */
68 class PDF2XHTML extends PDFTextStripper {
69
70 /**
71 * format used for signature dates
72 */
73 private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ");
74
75 /**
76 * Maximum recursive depth during AcroForm processing.
77 * Prevents theoretical AcroForm recursion bomb.
78 */
79 private final static int MAX_ACROFORM_RECURSIONS = 10;
80
81
82 // TODO: remove once PDFBOX-1130 is fixed:
83 private boolean inParagraph = false;
84
85 /**
86 * Converts the given PDF document (and related metadata) to a stream
87 * of XHTML SAX events sent to the given content handler.
88 *
89 * @param document PDF document
90 * @param handler SAX content handler
91 * @param metadata PDF metadata
92 * @throws SAXException if the content handler fails to process SAX events
93 * @throws TikaException if the PDF document can not be processed
94 */
95 public static void process(
96 PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata,
97 PDFParserConfig config)
98 throws SAXException, TikaException {
99 try {
100 // Extract text using a dummy Writer as we override the
101 // key methods to output to the given content
102 // handler.
103 PDF2XHTML pdf2XHTML = new PDF2XHTML(handler, context, metadata, config);
104
105 pdf2XHTML.writeText(document, new Writer() {
106 @Override
107 public void write(char[] cbuf, int off, int len) {
108 }
109 @Override
110 public void flush() {
111 }
112 @Override
113 public void close() {
114 }
115 });
116
117 } catch (IOException e) {
118 if (e.getCause() instanceof SAXException) {
119 throw (SAXException) e.getCause();
120 } else {
121 throw new TikaException("Unable to extract PDF content", e);
122 }
123 }
124 }
125
126 private final ContentHandler originalHandler;
127 private final ParseContext context;
128 private final XHTMLContentHandler handler;
129 private final PDFParserConfig config;
130
131 private PDF2XHTML(ContentHandler handler, ParseContext context, Metadata metadata,
132 PDFParserConfig config)
133 throws IOException {
134 //source of config (derives from context or PDFParser?) is
135 //already determined in PDFParser. No need to check context here.
136 this.config = config;
137 this.originalHandler = handler;
138 this.context = context;
139 this.handler = new XHTMLContentHandler(handler, metadata);
140 setForceParsing(true);
141 setSortByPosition(config.getSortByPosition());
142 if (config.getEnableAutoSpace()) {
143 setWordSeparator(" ");
144 } else {
145 setWordSeparator("");
146 }
147 // TODO: maybe expose setting these too:
148 //setAverageCharTolerance(1.0f);
149 //setSpacingTolerance(1.0f);
150 setSuppressDuplicateOverlappingText(config.getSuppressDuplicateOverlappingText());
151 }
152
153 void extractBookmarkText() throws SAXException {
154 PDDocumentOutline outline = document.getDocumentCatalog().getDocumentOutline();
155 if (outline != null) {
156 extractBookmarkText(outline);
157 }
158 }
159
160 void extractBookmarkText(PDOutlineNode bookmark) throws SAXException {
161 PDOutlineItem current = bookmark.getFirstChild();
162 if (current != null) {
163 handler.startElement("ul");
164 while (current != null) {
165 handler.startElement("li");
166 handler.characters(current.getTitle());
167 handler.endElement("li");
168 // Recurse:
169 extractBookmarkText(current);
170 current = current.getNextSibling();
171 }
172 handler.endElement("ul");
173 }
174 }
175
176 @Override
177 protected void startDocument(PDDocument pdf) throws IOException {
178 try {
179 handler.startDocument();
180 } catch (SAXException e) {
181 throw new IOExceptionWithCause("Unable to start a document", e);
182 }
183 }
184
185 @Override
186 protected void endDocument(PDDocument pdf) throws IOException {
187 try {
188 // Extract text for any bookmarks:
189 extractBookmarkText();
190 extractEmbeddedDocuments(pdf, originalHandler);
191
192 //extract acroform data at end of doc
193 if (config.getExtractAcroFormContent() == true){
194 extractAcroForm(pdf, handler);
195 }
196 handler.endDocument();
197 } catch (TikaException e){
198 throw new IOExceptionWithCause("Unable to end a document", e);
199 } catch (SAXException e) {
200 throw new IOExceptionWithCause("Unable to end a document", e);
201 }
202 }
203
204 @Override
205 protected void startPage(PDPage page) throws IOException {
206 try {
207 handler.startElement("div", "class", "page");
208 } catch (SAXException e) {
209 throw new IOExceptionWithCause("Unable to start a page", e);
210 }
211 writeParagraphStart();
212 }
213
214 @Override
215 protected void endPage(PDPage page) throws IOException {
216
217 try {
218 writeParagraphEnd();
219 // TODO: remove once PDFBOX-1143 is fixed:
220 if (config.getExtractAnnotationText()) {
221 for(Object o : page.getAnnotations()) {
222 if( o instanceof PDAnnotationLink ) {
223 PDAnnotationLink annotationlink = (PDAnnotationLink) o;
224 if (annotationlink.getAction() != null) {
225 PDAction action = annotationlink.getAction();
226 if( action instanceof PDActionURI ) {
227 PDActionURI uri = (PDActionURI) action;
228 String link = uri.getURI();
229 if (link != null) {
230 handler.startElement("div", "class", "annotation");
231 handler.startElement("a", "href", link);
232 handler.endElement("a");
233 handler.endElement("div");
234 }
235 }
236 }
237 }
238
239 if (o instanceof PDAnnotationMarkup) {
240 PDAnnotationMarkup annot = (PDAnnotationMarkup) o;
241 String title = annot.getTitlePopup();
242 String subject = annot.getSubject();
243 String contents = annot.getContents();
244 // TODO: maybe also annot.getRichContents()?
245 if (title != null || subject != null || contents != null) {
246 handler.startElement("div", "class", "annotation");
247
248 if (title != null) {
249 handler.startElement("div", "class", "annotationTitle");
250 handler.characters(title);
251 handler.endElement("div");
252 }
253
254 if (subject != null) {
255 handler.startElement("div", "class", "annotationSubject");
256 handler.characters(subject);
257 handler.endElement("div");
258 }
259
260 if (contents != null) {
261 handler.startElement("div", "class", "annotationContents");
262 handler.characters(contents);
263 handler.endElement("div");
264 }
265
266 handler.endElement("div");
267 }
268 }
269 }
270 }
271 handler.endElement("div");
272 } catch (SAXException e) {
273 throw new IOExceptionWithCause("Unable to end a page", e);
274 }
275 }
276
277 @Override
278 protected void writeParagraphStart() throws IOException {
279 // TODO: remove once PDFBOX-1130 is fixed
280 if (inParagraph) {
281 // Close last paragraph
282 writeParagraphEnd();
283 }
284 assert !inParagraph;
285 inParagraph = true;
286 try {
287 handler.startElement("p");
288 } catch (SAXException e) {
289 throw new IOExceptionWithCause("Unable to start a paragraph", e);
290 }
291 }
292
293 @Override
294 protected void writeParagraphEnd() throws IOException {
295 // TODO: remove once PDFBOX-1130 is fixed
296 if (!inParagraph) {
297 writeParagraphStart();
298 }
299 assert inParagraph;
300 inParagraph = false;
301 try {
302 handler.endElement("p");
303 } catch (SAXException e) {
304 throw new IOExceptionWithCause("Unable to end a paragraph", e);
305 }
306 }
307
308 @Override
309 protected void writeString(String text) throws IOException {
310 try {
311 handler.characters(text);
312 } catch (SAXException e) {
313 throw new IOExceptionWithCause(
314 "Unable to write a string: " + text, e);
315 }
316 }
317
318 @Override
319 protected void writeCharacters(TextPosition text) throws IOException {
320 try {
321 handler.characters(text.getCharacter());
322 } catch (SAXException e) {
323 throw new IOExceptionWithCause(
324 "Unable to write a character: " + text.getCharacter(), e);
325 }
326 }
327
328 @Override
329 protected void writeWordSeparator() throws IOException {
330 try {
331 handler.characters(getWordSeparator());
332 } catch (SAXException e) {
333 throw new IOExceptionWithCause(
334 "Unable to write a space character", e);
335 }
336 }
337
338 @Override
339 protected void writeLineSeparator() throws IOException {
340 try {
341 handler.newline();
342 } catch (SAXException e) {
343 throw new IOExceptionWithCause(
344 "Unable to write a newline character", e);
345 }
346 }
347
348 private void extractEmbeddedDocuments(PDDocument document, ContentHandler handler)
349 throws IOException, SAXException, TikaException {
350 PDDocumentCatalog catalog = document.getDocumentCatalog();
351 PDDocumentNameDictionary names = catalog.getNames();
352 if (names == null){
353 return;
354 }
355 PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
356
357 if (embeddedFiles == null) {
358 return;
359 }
360
361 EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class);
362 if (embeddedExtractor == null) {
363 embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context);
364 }
365
366 Map<String, COSObjectable> embeddedFileNames = embeddedFiles.getNames();
367 //For now, try to get the embeddedFileNames out of embeddedFiles or its kids.
368 //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java
369 //If there is a need we could add a fully recursive search to find a non-null
370 //Map<String, COSObjectable> that contains the doc info.
371 if (embeddedFileNames != null){
372 processEmbeddedDocNames(embeddedFileNames, embeddedExtractor);
373 } else {
374 List<PDNameTreeNode> kids = embeddedFiles.getKids();
375 if (kids == null){
376 return;
377 }
378 for (PDNameTreeNode n : kids){
379 Map<String, COSObjectable> childNames = n.getNames();
380 if (childNames != null){
381 processEmbeddedDocNames(childNames, embeddedExtractor);
382 }
383 }
384 }
385 }
386
387
388 private void processEmbeddedDocNames(Map<String, COSObjectable> embeddedFileNames,
389 EmbeddedDocumentExtractor embeddedExtractor) throws IOException, SAXException, TikaException {
390 if (embeddedFileNames == null){
391 return;
392 }
393 for (Map.Entry<String,COSObjectable> ent : embeddedFileNames.entrySet()) {
394 PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue();
395 PDEmbeddedFile file = spec.getEmbeddedFile();
396
397 Metadata metadata = new Metadata();
398 // TODO: other metadata?
399 metadata.set(Metadata.RESOURCE_NAME_KEY, ent.getKey());
400 metadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
401 metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
402
403 if (embeddedExtractor.shouldParseEmbedded(metadata)) {
404 TikaInputStream stream = TikaInputStream.get(file.createInputStream());
405 try {
406 embeddedExtractor.parseEmbedded(
407 stream,
408 new EmbeddedContentHandler(handler),
409 metadata, false);
410 } finally {
411 stream.close();
412 }
413 }
414 }
415 }
416 private void extractAcroForm(PDDocument pdf, XHTMLContentHandler handler) throws IOException,
417 SAXException {
418 //Thank you, Ben Litchfield, for org.apache.pdfbox.examples.fdf.PrintFields
419 //this code derives from Ben's code
420 PDDocumentCatalog catalog = pdf.getDocumentCatalog();
421
422 if (catalog == null)
423 return;
424
425 PDAcroForm form = catalog.getAcroForm();
426 if (form == null)
427 return;
428
429 @SuppressWarnings("rawtypes")
430 List fields = form.getFields();
431
432 if (fields == null)
433 return;
434
435 @SuppressWarnings("rawtypes")
436 ListIterator itr = fields.listIterator();
437
438 if (itr == null)
439 return;
440
441 handler.startElement("div", "class", "acroform");
442 handler.startElement("ol");
443 while (itr.hasNext()){
444 Object obj = itr.next();
445 if (obj != null && obj instanceof PDField){
446 processAcroField((PDField)obj, handler, 0);
447 }
448 }
449 handler.endElement("ol");
450 handler.endElement("div");
451 }
452
453 private void processAcroField(PDField field, XHTMLContentHandler handler, final int recurseDepth)
454 throws SAXException, IOException {
455
456 if (recurseDepth >= MAX_ACROFORM_RECURSIONS){
457 return;
458 }
459
460 addFieldString(field, handler);
461
462 @SuppressWarnings("rawtypes")
463 List kids = field.getKids();
464 if(kids != null){
465
466 @SuppressWarnings("rawtypes")
467 Iterator kidsIter = kids.iterator();
468 if (kidsIter == null){
469 return;
470 }
471 int r = recurseDepth+1;
472 handler.startElement("ol");
473 while(kidsIter.hasNext()){
474 Object pdfObj = kidsIter.next();
475 if(pdfObj != null && pdfObj instanceof PDField){
476 PDField kid = (PDField)pdfObj;
477 //recurse
478 processAcroField(kid, handler, r);
479 }
480 }
481 handler.endElement("ol");
482 }
483 }
484 private void addFieldString(PDField field, XHTMLContentHandler handler) throws SAXException{
485 //Pick partial name to present in content and altName for attribute
486 //Ignoring FullyQualifiedName for now
487 String partName = field.getPartialName();
488 String altName = field.getAlternateFieldName();
489
490 StringBuilder sb = new StringBuilder();
491 AttributesImpl attrs = new AttributesImpl();
492
493 if (partName != null){
494 sb.append(partName).append(": ");
495 }
496 if (altName != null){
497 attrs.addAttribute("", "altName", "altName", "CDATA", altName);
498 }
499 //return early if PDSignature field
500 if (field instanceof PDSignatureField){
501 handleSignature(attrs, (PDSignatureField)field, handler);
502 return;
503 }
504 try {
505 //getValue can throw an IOException if there is no value
506 String value = field.getValue();
507 if (value != null && ! value.equals("null")){
508 sb.append(value);
509 }
510 } catch (IOException e) {
511 //swallow
512 }
513
514 if (attrs.getLength() > 0 || sb.length() > 0){
515 handler.startElement("li", attrs);
516 handler.characters(sb.toString());
517 handler.endElement("li");
518 }
519 }
520
521 private void handleSignature(AttributesImpl parentAttributes, PDSignatureField sigField,
522 XHTMLContentHandler handler) throws SAXException{
523
524
525 PDSignature sig = sigField.getSignature();
526 if (sig == null){
527 return;
528 }
529 Map<String, String> vals= new TreeMap<String, String>();
530 vals.put("name", sig.getName());
531 vals.put("contactInfo", sig.getContactInfo());
532 vals.put("location", sig.getLocation());
533 vals.put("reason", sig.getReason());
534
535 Calendar cal = sig.getSignDate();
536 if (cal != null){
537 dateFormat.setTimeZone(cal.getTimeZone());
538 vals.put("date", dateFormat.format(cal.getTime()));
539 }
540 //see if there is any data
541 int nonNull = 0;
542 for (String val : vals.keySet()){
543 if (val != null && ! val.equals("")){
544 nonNull++;
545 }
546 }
547 //if there is, process it
548 if (nonNull > 0){
549 handler.startElement("li", parentAttributes);
550
551 AttributesImpl attrs = new AttributesImpl();
552 attrs.addAttribute("", "type", "type", "CDATA", "signaturedata");
553
554 handler.startElement("ol", attrs);
555 for (Map.Entry<String, String> e : vals.entrySet()){
556 if (e.getValue() == null || e.getValue().equals("")){
557 continue;
558 }
559 attrs = new AttributesImpl();
560 attrs.addAttribute("", "signdata", "signdata", "CDATA", e.getKey());
561 handler.startElement("li", attrs);
562 handler.characters(e.getValue());
563 handler.endElement("li");
564 }
565 handler.endElement("ol");
566 handler.endElement("li");
567 }
568 }
569 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.pdf;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.util.Arrays;
21 import java.util.Calendar;
22 import java.util.Collections;
23 import java.util.List;
24 import java.util.Set;
25
26 import org.apache.pdfbox.cos.COSArray;
27 import org.apache.pdfbox.cos.COSBase;
28 import org.apache.pdfbox.cos.COSName;
29 import org.apache.pdfbox.cos.COSString;
30 import org.apache.pdfbox.io.RandomAccess;
31 import org.apache.pdfbox.io.RandomAccessBuffer;
32 import org.apache.pdfbox.io.RandomAccessFile;
33 import org.apache.pdfbox.pdmodel.PDDocument;
34 import org.apache.pdfbox.pdmodel.PDDocumentInformation;
35 import org.apache.tika.exception.TikaException;
36 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
37 import org.apache.tika.io.CloseShieldInputStream;
38 import org.apache.tika.io.TemporaryResources;
39 import org.apache.tika.io.TikaInputStream;
40 import org.apache.tika.metadata.Metadata;
41 import org.apache.tika.metadata.PagedText;
42 import org.apache.tika.metadata.Property;
43 import org.apache.tika.metadata.TikaCoreProperties;
44 import org.apache.tika.mime.MediaType;
45 import org.apache.tika.parser.AbstractParser;
46 import org.apache.tika.parser.ParseContext;
47 import org.apache.tika.parser.PasswordProvider;
48 import org.xml.sax.ContentHandler;
49 import org.xml.sax.SAXException;
50
51 /**
52 * PDF parser.
53 * <p>
54 * This parser can also process encrypted PDF documents if the required
55 * password is given as a part of the input metadata associated with a
56 * document. If no password is given, then this parser will try decrypting
57 * the document using the empty password that's often used with PDFs. If
58 * the PDF contains any embedded documents (for example as part of a PDF
59 * package) then this parser will use the {@link EmbeddedDocumentExtractor}
60 * to handle them.
61 */
62 public class PDFParser extends AbstractParser {
63
64 /** Serial version UID */
65 private static final long serialVersionUID = -752276948656079347L;
66
67 private PDFParserConfig defaultConfig = new PDFParserConfig();
68 /**
69 * Metadata key for giving the document password to the parser.
70 *
71 * @since Apache Tika 0.5
72 * @deprecated Supply a {@link PasswordProvider} on the {@link ParseContext} instead
73 */
74 public static final String PASSWORD = "org.apache.tika.parser.pdf.password";
75
76 private static final Set<MediaType> SUPPORTED_TYPES =
77 Collections.singleton(MediaType.application("pdf"));
78
79 public Set<MediaType> getSupportedTypes(ParseContext context) {
80 return SUPPORTED_TYPES;
81 }
82
83 public void parse(
84 InputStream stream, ContentHandler handler,
85 Metadata metadata, ParseContext context)
86 throws IOException, SAXException, TikaException {
87
88 PDDocument pdfDocument = null;
89 TemporaryResources tmp = new TemporaryResources();
90 //config from context, or default if not set via context
91 PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig);
92 try {
93 // PDFBox can process entirely in memory, or can use a temp file
94 // for unpacked / processed resources
95 // Decide which to do based on if we're reading from a file or not already
96 TikaInputStream tstream = TikaInputStream.cast(stream);
97 if (tstream != null && tstream.hasFile()) {
98 // File based, take that as a cue to use a temporary file
99 RandomAccess scratchFile = new RandomAccessFile(tmp.createTemporaryFile(), "rw");
100 if (localConfig.getUseNonSequentialParser() == true){
101 pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream), scratchFile);
102 } else {
103 pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), scratchFile, true);
104 }
105 } else {
106 // Go for the normal, stream based in-memory parsing
107 if (localConfig.getUseNonSequentialParser() == true){
108 pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream), new RandomAccessBuffer());
109 } else {
110 pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), true);
111 }
112 }
113
114
115 if (pdfDocument.isEncrypted()) {
116 String password = null;
117
118 // Did they supply a new style Password Provider?
119 PasswordProvider passwordProvider = context.get(PasswordProvider.class);
120 if (passwordProvider != null) {
121 password = passwordProvider.getPassword(metadata);
122 }
123
124 // Fall back on the old style metadata if set
125 if (password == null && metadata.get(PASSWORD) != null) {
126 password = metadata.get(PASSWORD);
127 }
128
129 // If no password is given, use an empty string as the default
130 if (password == null) {
131 password = "";
132 }
133
134 try {
135 pdfDocument.decrypt(password);
136 } catch (Exception e) {
137 // Ignore
138 }
139 }
140 metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
141 extractMetadata(pdfDocument, metadata);
142 PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
143
144 } finally {
145 if (pdfDocument != null) {
146 pdfDocument.close();
147 }
148 tmp.dispose();
149 }
150 handler.endDocument();
151 }
152
153
154
155 private void extractMetadata(PDDocument document, Metadata metadata)
156 throws TikaException {
157 PDDocumentInformation info = document.getDocumentInformation();
158 metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
159 addMetadata(metadata, TikaCoreProperties.TITLE, info.getTitle());
160 addMetadata(metadata, TikaCoreProperties.CREATOR, info.getAuthor());
161 addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
162 addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords());
163 addMetadata(metadata, "producer", info.getProducer());
164 // TODO: Move to description in Tika 2.0
165 addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
166 addMetadata(metadata, "trapped", info.getTrapped());
167 try {
168 // TODO Remove these in Tika 2.0
169 addMetadata(metadata, "created", info.getCreationDate());
170 addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate());
171 } catch (IOException e) {
172 // Invalid date format, just ignore
173 }
174 try {
175 Calendar modified = info.getModificationDate();
176 addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
177 addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
178 } catch (IOException e) {
179 // Invalid date format, just ignore
180 }
181
182 // All remaining metadata is custom
183 // Copy this over as-is
184 List<String> handledMetadata = Arrays.asList(new String[] {
185 "Author", "Creator", "CreationDate", "ModDate",
186 "Keywords", "Producer", "Subject", "Title", "Trapped"
187 });
188 for(COSName key : info.getDictionary().keySet()) {
189 String name = key.getName();
190 if(! handledMetadata.contains(name)) {
191 addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key));
192 }
193 }
194 }
195
196 private void addMetadata(Metadata metadata, Property property, String value) {
197 if (value != null) {
198 metadata.add(property, value);
199 }
200 }
201
202 private void addMetadata(Metadata metadata, String name, String value) {
203 if (value != null) {
204 metadata.add(name, value);
205 }
206 }
207
208 private void addMetadata(Metadata metadata, String name, Calendar value) {
209 if (value != null) {
210 metadata.set(name, value.getTime().toString());
211 }
212 }
213
214 private void addMetadata(Metadata metadata, Property property, Calendar value) {
215 if (value != null) {
216 metadata.set(property, value.getTime());
217 }
218 }
219
220 /**
221 * Used when processing custom metadata entries, as PDFBox won't do
222 * the conversion for us in the way it does for the standard ones
223 */
224 private void addMetadata(Metadata metadata, String name, COSBase value) {
225 if(value instanceof COSArray) {
226 for(Object v : ((COSArray)value).toList()) {
227 addMetadata(metadata, name, ((COSBase) v));
228 }
229 } else if(value instanceof COSString) {
230 addMetadata(metadata, name, ((COSString)value).getString());
231 } else if (value != null){
232 addMetadata(metadata, name, value.toString());
233 }
234 }
235
236 public void setPDFParserConfig(PDFParserConfig config){
237 this.defaultConfig = config;
238 }
239
240 public PDFParserConfig getPDFParserConfig(){
241 return defaultConfig;
242 }
243
244 /**
245 * If true, the parser will use the NonSequentialParser. This may
246 * be faster than the full doc parser.
247 * If false (default), this will use the full doc parser.
248 *
249 * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
250 */
251 public void setUseNonSequentialParser(boolean v){
252 defaultConfig.setUseNonSequentialParser(v);
253 }
254
255 /**
256 * @see #setUseNonSequentialParser(boolean)
257 * @deprecated use {@link #getPDFParserConfig()}
258 */
259 public boolean getUseNonSequentialParser(){
260 return defaultConfig.getUseNonSequentialParser();
261 }
262
263 /**
264 * If true (the default), the parser should estimate
265 * where spaces should be inserted between words. For
266 * many PDFs this is necessary as they do not include
267 * explicit whitespace characters.
268 *
269 * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
270 */
271 public void setEnableAutoSpace(boolean v) {
272 defaultConfig.setEnableAutoSpace(v);
273 }
274
275 /**
276 * @see #setEnableAutoSpace.
277 * @deprecated use {@link #getPDFParserConfig()}
278 */
279 public boolean getEnableAutoSpace() {
280 return defaultConfig.getEnableAutoSpace();
281 }
282
283 /**
284 * If true (the default), text in annotations will be
285 * extracted.
286 * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
287 */
288 public void setExtractAnnotationText(boolean v) {
289 defaultConfig.setExtractAnnotationText(v);
290 }
291
292 /**
293 * If true, text in annotations will be extracted.
294 *
295 * @deprecated use {@link #getPDFParserConfig()}
296 */
297 public boolean getExtractAnnotationText() {
298 return defaultConfig.getExtractAnnotationText();
299 }
300
301 /**
302 * If true, the parser should try to remove duplicated
303 * text over the same region. This is needed for some
304 * PDFs that achieve bolding by re-writing the same
305 * text in the same area. Note that this can
306 * slow down extraction substantially (PDFBOX-956) and
307 * sometimes remove characters that were not in fact
308 * duplicated (PDFBOX-1155). By default this is disabled.
309 *
310 * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
311 */
312 public void setSuppressDuplicateOverlappingText(boolean v) {
313 defaultConfig.setSuppressDuplicateOverlappingText(v);
314 }
315
316 /**
317 * @see #setSuppressDuplicateOverlappingText.
318 *
319 * @deprecated use {@link #getPDFParserConfig()}
320 */
321 public boolean getSuppressDuplicateOverlappingText() {
322 return defaultConfig.getSuppressDuplicateOverlappingText();
323 }
324
325 /**
326 * If true, sort text tokens by their x/y position
327 * before extracting text. This may be necessary for
328 * some PDFs (if the text tokens are not rendered "in
329 * order"), while for other PDFs it can produce the
330 * wrong result (for example if there are 2 columns,
331 * the text will be interleaved). Default is false.
332 *
333 * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
334 */
335 public void setSortByPosition(boolean v) {
336 defaultConfig.setSortByPosition(v);
337 }
338
339 /**
340 * @see #setSortByPosition.
341 *
342 * @deprecated use {@link #getPDFParserConfig()}
343 */
344 public boolean getSortByPosition() {
345 return defaultConfig.getSortByPosition();
346 }
347
348 }
0 package org.apache.tika.parser.pdf;
1
2 import java.io.IOException;
3 import java.io.InputStream;
4 import java.io.Serializable;
5 import java.util.Properties;
6 /*
7 * Licensed to the Apache Software Foundation (ASF) under one or more
8 * contributor license agreements. See the NOTICE file distributed with
9 * this work for additional information regarding copyright ownership.
10 * The ASF licenses this file to You under the Apache License, Version 2.0
11 * (the "License"); you may not use this file except in compliance with
12 * the License. You may obtain a copy of the License at
13 *
14 * http://www.apache.org/licenses/LICENSE-2.0
15 *
16 * Unless required by applicable law or agreed to in writing, software
17 * distributed under the License is distributed on an "AS IS" BASIS,
18 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19 * See the License for the specific language governing permissions and
20 * limitations under the License.
21 */
22
/**
 * Config for PDFParser.
 *
 * This allows parameters to be set programmatically:
 * <ol>
 * <li>Calls to PDFParser, i.e. parser.getPDFParserConfig().setEnableAutoSpace() (as before)</li>
 * <li>Handing a config to {@link PDFParser#setPDFParserConfig(PDFParserConfig)}</li>
 * <li>Passing to PDFParser through a ParseContext: context.set(PDFParserConfig.class, config);</li>
 * </ol>
 *
 * Parameters can also be set by modifying the PDFParser.properties file,
 * which lives in the expected places, in trunk:
 * tika-parsers/src/main/resources/org/apache/tika/parser/pdf
 *
 * Or, in tika-app-x.x.jar or tika-parsers-x.x.jar:
 * org/apache/tika/parser/pdf
 *
 */
public class PDFParserConfig implements Serializable {

    private static final long serialVersionUID = 6492570218190936986L;

    // True if we let PDFBox "guess" where spaces should go:
    private boolean enableAutoSpace = true;

    // True if we let PDFBox remove duplicate overlapping text:
    private boolean suppressDuplicateOverlappingText;

    // True if we extract annotation text ourselves
    // (workaround for PDFBOX-1143):
    private boolean extractAnnotationText = true;

    // True if we should sort text tokens by position
    // (necessary for some PDFs, but messes up other PDFs):
    private boolean sortByPosition = false;

    // True if we should use PDFBox's NonSequentialParser
    private boolean useNonSequentialParser = false;

    // True if acroform content should be extracted
    private boolean extractAcroFormContent = true;

    /**
     * Creates a config from the <code>PDFParser.properties</code> resource
     * that sits next to this class. If the resource cannot be found the
     * built-in defaults are kept.
     */
    public PDFParserConfig() {
        // Resolve against this class rather than getClass(), so that the
        // lookup does not depend on the package of a subclass.
        init(PDFParserConfig.class.getResourceAsStream("PDFParser.properties"));
    }

    /**
     * Loads properties from InputStream and then tries to close InputStream.
     * If there is an IOException, this silently swallows the exception
     * and goes back to the default.
     *
     * @param is stream in {@link Properties} format; <code>null</code>
     *           leaves the defaults untouched
     */
    public PDFParserConfig(InputStream is) {
        init(is);
    }

    // Initializes this object from the stream and then tries to close it
    private void init(InputStream is) {
        if (is == null) {
            return;
        }
        Properties props = new Properties();
        try {
            props.load(is);
        } catch (IOException e) {
            // Documented contract: go back to the defaults. Drop whatever
            // was read before the failure so that no half-read
            // configuration is applied.
            props.clear();
        } finally {
            try {
                is.close();
            } catch (IOException ignored) {
                // nothing sensible can be done about a failed close
            }
        }
        setEnableAutoSpace(
                getProp(props.getProperty("enableAutoSpace"),
                        getEnableAutoSpace()));
        setSuppressDuplicateOverlappingText(
                getProp(props.getProperty("suppressDuplicateOverlappingText"),
                        getSuppressDuplicateOverlappingText()));
        setExtractAnnotationText(
                getProp(props.getProperty("extractAnnotationText"),
                        getExtractAnnotationText()));
        setSortByPosition(
                getProp(props.getProperty("sortByPosition"),
                        getSortByPosition()));
        setUseNonSequentialParser(
                getProp(props.getProperty("useNonSequentialParser"),
                        getUseNonSequentialParser()));
        setExtractAcroFormContent(
                getProp(props.getProperty("extractAcroFormContent"),
                        getExtractAcroFormContent()));
    }

    /**
     * If true (the default), extract content from AcroForms
     * at the end of the document.
     *
     * @param extractAcroFormContent whether AcroForm content is extracted
     */
    public void setExtractAcroFormContent(boolean extractAcroFormContent) {
        this.extractAcroFormContent = extractAcroFormContent;
    }

    /** @see #setExtractAcroFormContent(boolean) */
    public boolean getExtractAcroFormContent() {
        return extractAcroFormContent;
    }

    /** @see #setEnableAutoSpace(boolean) */
    public boolean getEnableAutoSpace() {
        return enableAutoSpace;
    }

    /**
     * If true (the default), the parser should estimate
     * where spaces should be inserted between words. For
     * many PDFs this is necessary as they do not include
     * explicit whitespace characters.
     */
    public void setEnableAutoSpace(boolean enableAutoSpace) {
        this.enableAutoSpace = enableAutoSpace;
    }

    /** @see #setSuppressDuplicateOverlappingText(boolean) */
    public boolean getSuppressDuplicateOverlappingText() {
        return suppressDuplicateOverlappingText;
    }

    /**
     * If true, the parser should try to remove duplicated
     * text over the same region. This is needed for some
     * PDFs that achieve bolding by re-writing the same
     * text in the same area. Note that this can
     * slow down extraction substantially (PDFBOX-956) and
     * sometimes remove characters that were not in fact
     * duplicated (PDFBOX-1155). By default this is disabled.
     */
    public void setSuppressDuplicateOverlappingText(
            boolean suppressDuplicateOverlappingText) {
        this.suppressDuplicateOverlappingText = suppressDuplicateOverlappingText;
    }

    /** @see #setExtractAnnotationText(boolean) */
    public boolean getExtractAnnotationText() {
        return extractAnnotationText;
    }

    /**
     * If true (the default), text in annotations will be
     * extracted.
     */
    public void setExtractAnnotationText(boolean extractAnnotationText) {
        this.extractAnnotationText = extractAnnotationText;
    }

    /** @see #setSortByPosition(boolean) */
    public boolean getSortByPosition() {
        return sortByPosition;
    }

    /**
     * If true, sort text tokens by their x/y position
     * before extracting text. This may be necessary for
     * some PDFs (if the text tokens are not rendered "in
     * order"), while for other PDFs it can produce the
     * wrong result (for example if there are 2 columns,
     * the text will be interleaved). Default is false.
     */
    public void setSortByPosition(boolean sortByPosition) {
        this.sortByPosition = sortByPosition;
    }

    /** @see #setUseNonSequentialParser(boolean) */
    public boolean getUseNonSequentialParser() {
        return useNonSequentialParser;
    }

    /**
     * If true, uses PDFBox's non-sequential parser.
     * The non-sequential parser should be much faster than the traditional
     * full doc parser. However, until PDFBOX-XXX is fixed,
     * the non-sequential parser fails
     * to extract some document metadata.
     * <p>
     * Default is false (use the traditional parser)
     *
     * @param useNonSequentialParser whether to use the non-sequential parser
     */
    public void setUseNonSequentialParser(boolean useNonSequentialParser) {
        this.useNonSequentialParser = useNonSequentialParser;
    }

    /**
     * Interprets a property value as a boolean. The comparison ignores
     * case and surrounding whitespace (trailing whitespace is kept by
     * {@link Properties#load(InputStream)}); a missing or unrecognised
     * value yields <code>defaultMissing</code>.
     */
    private boolean getProp(String p, boolean defaultMissing) {
        if (p == null) {
            return defaultMissing;
        }
        String value = p.trim();
        if ("true".equalsIgnoreCase(value)) {
            return true;
        }
        if ("false".equalsIgnoreCase(value)) {
            return false;
        }
        return defaultMissing;
    }

    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + (enableAutoSpace ? 1231 : 1237);
        result = prime * result + (extractAcroFormContent ? 1231 : 1237);
        result = prime * result + (extractAnnotationText ? 1231 : 1237);
        result = prime * result + (sortByPosition ? 1231 : 1237);
        result = prime * result
                + (suppressDuplicateOverlappingText ? 1231 : 1237);
        result = prime * result + (useNonSequentialParser ? 1231 : 1237);
        return result;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        PDFParserConfig other = (PDFParserConfig) obj;
        return enableAutoSpace == other.enableAutoSpace
                && extractAcroFormContent == other.extractAcroFormContent
                && extractAnnotationText == other.extractAnnotationText
                && sortByPosition == other.sortByPosition
                && suppressDuplicateOverlappingText == other.suppressDuplicateOverlappingText
                && useNonSequentialParser == other.useNonSequentialParser;
    }

    @Override
    public String toString() {
        return "PDFParserConfig [enableAutoSpace=" + enableAutoSpace
                + ", suppressDuplicateOverlappingText="
                + suppressDuplicateOverlappingText + ", extractAnnotationText="
                + extractAnnotationText + ", sortByPosition=" + sortByPosition
                + ", useNonSequentialParser=" + useNonSequentialParser
                + ", extractAcroFormContent=" + extractAcroFormContent + "]";
    }

}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.pkg;
17
18 import static org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE;
19
20 import java.io.BufferedInputStream;
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.util.Set;
24
25 import org.apache.commons.compress.compressors.CompressorException;
26 import org.apache.commons.compress.compressors.CompressorInputStream;
27 import org.apache.commons.compress.compressors.CompressorStreamFactory;
28 import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
29 import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
30 import org.apache.commons.compress.compressors.gzip.GzipUtils;
31 import org.apache.commons.compress.compressors.pack200.Pack200CompressorInputStream;
32 import org.apache.commons.compress.compressors.xz.XZCompressorInputStream;
33 import org.apache.tika.exception.TikaException;
34 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
35 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
36 import org.apache.tika.io.CloseShieldInputStream;
37 import org.apache.tika.metadata.Metadata;
38 import org.apache.tika.mime.MediaType;
39 import org.apache.tika.parser.AbstractParser;
40 import org.apache.tika.parser.ParseContext;
41 import org.apache.tika.sax.XHTMLContentHandler;
42 import org.xml.sax.ContentHandler;
43 import org.xml.sax.SAXException;
44
45 /**
46 * Parser for various compression formats.
47 */
48 public class CompressorParser extends AbstractParser {
49
50 /** Serial version UID */
51 private static final long serialVersionUID = 2793565792967222459L;
52
53 private static final MediaType BZIP = MediaType.application("x-bzip");
54 private static final MediaType BZIP2 = MediaType.application("x-bzip2");
55 private static final MediaType GZIP = MediaType.application("x-gzip");
56 private static final MediaType XZ = MediaType.application("x-xz");
57 private static final MediaType PACK = MediaType.application("application/x-java-pack200");
58
59 private static final Set<MediaType> SUPPORTED_TYPES =
60 MediaType.set(BZIP, BZIP2, GZIP, XZ, PACK);
61
62 static MediaType getMediaType(CompressorInputStream stream) {
63 if (stream instanceof BZip2CompressorInputStream) {
64 return BZIP2;
65 } else if (stream instanceof GzipCompressorInputStream) {
66 return GZIP;
67 } else if (stream instanceof XZCompressorInputStream) {
68 return XZ;
69 } else if (stream instanceof Pack200CompressorInputStream) {
70 return PACK;
71 } else {
72 return MediaType.OCTET_STREAM;
73 }
74 }
75
76 public Set<MediaType> getSupportedTypes(ParseContext context) {
77 return SUPPORTED_TYPES;
78 }
79
80 public void parse(
81 InputStream stream, ContentHandler handler,
82 Metadata metadata, ParseContext context)
83 throws IOException, SAXException, TikaException {
84 // At the end we want to close the compression stream to release
85 // any associated resources, but the underlying document stream
86 // should not be closed
87 stream = new CloseShieldInputStream(stream);
88
89 // Ensure that the stream supports the mark feature
90 stream = new BufferedInputStream(stream);
91
92 CompressorInputStream cis;
93 try {
94 CompressorStreamFactory factory = new CompressorStreamFactory();
95 CompressorParserOptions options =
96 context.get(CompressorParserOptions.class, new CompressorParserOptions() {
97 public boolean decompressConcatenated(Metadata metadata) {
98 return false;
99 }
100 });
101 factory.setDecompressConcatenated(options.decompressConcatenated(metadata));
102 cis = factory.createCompressorInputStream(stream);
103 } catch (CompressorException e) {
104 throw new TikaException("Unable to uncompress document stream", e);
105 }
106
107 MediaType type = getMediaType(cis);
108 if (!type.equals(MediaType.OCTET_STREAM)) {
109 metadata.set(CONTENT_TYPE, type.toString());
110 }
111
112 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
113 xhtml.startDocument();
114
115 try {
116 Metadata entrydata = new Metadata();
117 String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
118 if (name != null) {
119 if (name.endsWith(".tbz")) {
120 name = name.substring(0, name.length() - 4) + ".tar";
121 } else if (name.endsWith(".tbz2")) {
122 name = name.substring(0, name.length() - 5) + ".tar";
123 } else if (name.endsWith(".bz")) {
124 name = name.substring(0, name.length() - 3);
125 } else if (name.endsWith(".bz2")) {
126 name = name.substring(0, name.length() - 4);
127 } else if (name.endsWith(".xz")) {
128 name = name.substring(0, name.length() - 3);
129 } else if (name.endsWith(".pack")) {
130 name = name.substring(0, name.length() - 5);
131 } else if (name.length() > 0) {
132 name = GzipUtils.getUncompressedFilename(name);
133 }
134 entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
135 }
136
137 // Use the delegate parser to parse the compressed document
138 EmbeddedDocumentExtractor extractor = context.get(
139 EmbeddedDocumentExtractor.class,
140 new ParsingEmbeddedDocumentExtractor(context));
141 if (extractor.shouldParseEmbedded(entrydata)) {
142 extractor.parseEmbedded(cis, xhtml, entrydata, true);
143 }
144 } finally {
145 cis.close();
146 }
147
148 xhtml.endDocument();
149 }
150
151 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.pkg;
17
18 import org.apache.tika.metadata.Metadata;
19
20 /**
21 * Interface for setting options for the {@link CompressorParser} by passing
22 * via the {@link ParseContext}.
23 */
24 public interface CompressorParserOptions {
25
26 /**
27 * @param metadata document metadata
28 * @return whether to decompress concatenated streams or not
29 */
30 boolean decompressConcatenated(Metadata metadata);
31 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.pkg;
17
18 import java.io.BufferedInputStream;
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.util.Set;
22
23 import org.apache.commons.compress.archivers.ArchiveEntry;
24 import org.apache.commons.compress.archivers.ArchiveException;
25 import org.apache.commons.compress.archivers.ArchiveInputStream;
26 import org.apache.commons.compress.archivers.ArchiveStreamFactory;
27 import org.apache.commons.compress.archivers.ar.ArArchiveInputStream;
28 import org.apache.commons.compress.archivers.cpio.CpioArchiveInputStream;
29 import org.apache.commons.compress.archivers.dump.DumpArchiveInputStream;
30 import org.apache.commons.compress.archivers.jar.JarArchiveInputStream;
31 import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
32 import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
33 import org.apache.tika.exception.TikaException;
34 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
35 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
36 import org.apache.tika.io.CloseShieldInputStream;
37 import org.apache.tika.io.TemporaryResources;
38 import org.apache.tika.io.TikaInputStream;
39 import org.apache.tika.metadata.Metadata;
40 import org.apache.tika.mime.MediaType;
41 import org.apache.tika.parser.AbstractParser;
42 import org.apache.tika.parser.ParseContext;
43 import org.apache.tika.sax.XHTMLContentHandler;
44 import org.xml.sax.ContentHandler;
45 import org.xml.sax.SAXException;
46 import org.xml.sax.helpers.AttributesImpl;
47
48 import static org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE;
49
50 /**
51 * Parser for various packaging formats. Package entries will be written to
52 * the XHTML event stream as &lt;div class="package-entry"&gt; elements that
53 * contain the (optional) entry name as a &lt;h1&gt; element and the full
54 * structured body content of the parsed entry.
55 */
56 public class PackageParser extends AbstractParser {
57
58 /** Serial version UID */
59 private static final long serialVersionUID = -5331043266963888708L;
60
61 private static final MediaType ZIP = MediaType.APPLICATION_ZIP;
62 private static final MediaType JAR = MediaType.application("java-archive");
63 private static final MediaType AR = MediaType.application("x-archive");
64 private static final MediaType CPIO = MediaType.application("x-cpio");
65 private static final MediaType DUMP = MediaType.application("x-tika-unix-dump");
66 private static final MediaType TAR = MediaType.application("x-tar");
67
68 private static final Set<MediaType> SUPPORTED_TYPES =
69 MediaType.set(ZIP, JAR, AR, CPIO, DUMP, TAR);
70
71 static MediaType getMediaType(ArchiveInputStream stream) {
72 if (stream instanceof JarArchiveInputStream) {
73 return JAR;
74 } else if (stream instanceof ZipArchiveInputStream) {
75 return ZIP;
76 } else if (stream instanceof ArArchiveInputStream) {
77 return AR;
78 } else if (stream instanceof CpioArchiveInputStream) {
79 return CPIO;
80 } else if (stream instanceof DumpArchiveInputStream) {
81 return DUMP;
82 } else if (stream instanceof TarArchiveInputStream) {
83 return TAR;
84 } else {
85 return MediaType.OCTET_STREAM;
86 }
87 }
88
89 static boolean isZipArchive(MediaType type) {
90 return type.equals(ZIP) || type.equals(JAR);
91 }
92
93 public Set<MediaType> getSupportedTypes(ParseContext context) {
94 return SUPPORTED_TYPES;
95 }
96
97 public void parse(
98 InputStream stream, ContentHandler handler,
99 Metadata metadata, ParseContext context)
100 throws IOException, SAXException, TikaException {
101 // At the end we want to close the archive stream to release
102 // any associated resources, but the underlying document stream
103 // should not be closed
104 stream = new CloseShieldInputStream(stream);
105
106 // Ensure that the stream supports the mark feature
107 stream = new BufferedInputStream(stream);
108
109 ArchiveInputStream ais;
110 try {
111 ArchiveStreamFactory factory = new ArchiveStreamFactory();
112 ais = factory.createArchiveInputStream(stream);
113 } catch (ArchiveException e) {
114 throw new TikaException("Unable to unpack document stream", e);
115 }
116
117 MediaType type = getMediaType(ais);
118 if (!type.equals(MediaType.OCTET_STREAM)) {
119 metadata.set(CONTENT_TYPE, type.toString());
120 }
121
122 // Use the delegate parser to parse the contained document
123 EmbeddedDocumentExtractor extractor = context.get(
124 EmbeddedDocumentExtractor.class,
125 new ParsingEmbeddedDocumentExtractor(context));
126
127 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
128 xhtml.startDocument();
129
130 try {
131 ArchiveEntry entry = ais.getNextEntry();
132 while (entry != null) {
133 if (!entry.isDirectory()) {
134 parseEntry(ais, entry, extractor, xhtml);
135 }
136 entry = ais.getNextEntry();
137 }
138 } finally {
139 ais.close();
140 }
141
142 xhtml.endDocument();
143 }
144
145 private void parseEntry(
146 ArchiveInputStream archive, ArchiveEntry entry,
147 EmbeddedDocumentExtractor extractor, XHTMLContentHandler xhtml)
148 throws SAXException, IOException, TikaException {
149 String name = entry.getName();
150 if (archive.canReadEntryData(entry)) {
151 Metadata entrydata = new Metadata();
152 if (name != null && name.length() > 0) {
153 entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
154 AttributesImpl attributes = new AttributesImpl();
155 attributes.addAttribute("", "class", "class", "CDATA", "embedded");
156 attributes.addAttribute("", "id", "id", "CDATA", name);
157 xhtml.startElement("div", attributes);
158 xhtml.endElement("div");
159
160 entrydata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, name);
161 }
162 if (extractor.shouldParseEmbedded(entrydata)) {
163 // For detectors to work, we need a mark/reset supporting
164 // InputStream, which ArchiveInputStream isn't, so wrap
165 TemporaryResources tmp = new TemporaryResources();
166 try {
167 TikaInputStream tis = TikaInputStream.get(archive, tmp);
168 extractor.parseEmbedded(tis, xhtml, entrydata, true);
169 } finally {
170 tmp.dispose();
171 }
172 }
173 } else if (name != null && name.length() > 0) {
174 xhtml.element("p", name);
175 }
176 }
177
178 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.pkg;
17
18 import java.io.ByteArrayInputStream;
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.util.Enumeration;
22 import java.util.HashSet;
23 import java.util.Iterator;
24 import java.util.Set;
25 import java.util.regex.Pattern;
26
27 import org.apache.commons.compress.archivers.ArchiveException;
28 import org.apache.commons.compress.archivers.ArchiveInputStream;
29 import org.apache.commons.compress.archivers.ArchiveStreamFactory;
30 import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
31 import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
32 import org.apache.commons.compress.archivers.zip.ZipFile;
33 import org.apache.commons.compress.compressors.CompressorException;
34 import org.apache.commons.compress.compressors.CompressorInputStream;
35 import org.apache.commons.compress.compressors.CompressorStreamFactory;
36 import org.apache.poi.extractor.ExtractorFactory;
37 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
38 import org.apache.poi.openxml4j.opc.OPCPackage;
39 import org.apache.poi.openxml4j.opc.PackageAccess;
40 import org.apache.poi.openxml4j.opc.PackagePart;
41 import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
42 import org.apache.tika.detect.Detector;
43 import org.apache.tika.exception.TikaException;
44 import org.apache.tika.io.IOUtils;
45 import org.apache.tika.io.TemporaryResources;
46 import org.apache.tika.io.TikaInputStream;
47 import org.apache.tika.metadata.Metadata;
48 import org.apache.tika.mime.MediaType;
49 import org.apache.tika.parser.iwork.IWorkPackageParser;
50 import org.apache.tika.parser.iwork.IWorkPackageParser.IWORKDocumentType;
51
52 /**
53 * A detector that works on Zip documents and other archive and compression
54 * formats to figure out exactly what the file is.
55 */
56 public class ZipContainerDetector implements Detector {
57 private static final Pattern MACRO_TEMPLATE_PATTERN = Pattern.compile("macroenabledtemplate$", Pattern.CASE_INSENSITIVE);
58
59 /** Serial version UID */
60 private static final long serialVersionUID = 2891763938430295453L;
61
62 public MediaType detect(InputStream input, Metadata metadata)
63 throws IOException {
64 // Check if we have access to the document
65 if (input == null) {
66 return MediaType.OCTET_STREAM;
67 }
68
69 TemporaryResources tmp = new TemporaryResources();
70 try {
71 TikaInputStream tis = TikaInputStream.get(input, tmp);
72
73 byte[] prefix = new byte[1024]; // enough for all known formats
74 int length = tis.peek(prefix);
75
76 MediaType type = detectArchiveFormat(prefix, length);
77 if (PackageParser.isZipArchive(type)
78 && TikaInputStream.isTikaInputStream(input)) {
79 return detectZipFormat(tis);
80 } else if (!type.equals(MediaType.OCTET_STREAM)) {
81 return type;
82 } else {
83 return detectCompressorFormat(prefix, length);
84 }
85 } finally {
86 try {
87 tmp.dispose();
88 } catch (TikaException e) {
89 // ignore
90 }
91 }
92 }
93
94 private static MediaType detectCompressorFormat(byte[] prefix, int length) {
95 try {
96 CompressorStreamFactory factory = new CompressorStreamFactory();
97 CompressorInputStream cis = factory.createCompressorInputStream(
98 new ByteArrayInputStream(prefix, 0, length));
99 try {
100 return CompressorParser.getMediaType(cis);
101 } finally {
102 IOUtils.closeQuietly(cis);
103 }
104 } catch (CompressorException e) {
105 return MediaType.OCTET_STREAM;
106 }
107 }
108
109 private static MediaType detectArchiveFormat(byte[] prefix, int length) {
110 try {
111 ArchiveStreamFactory factory = new ArchiveStreamFactory();
112 ArchiveInputStream ais = factory.createArchiveInputStream(
113 new ByteArrayInputStream(prefix, 0, length));
114 try {
115 if ((ais instanceof TarArchiveInputStream)
116 && !TarArchiveInputStream.matches(prefix, length)) {
117 // ArchiveStreamFactory is too relaxed, see COMPRESS-117
118 return MediaType.OCTET_STREAM;
119 } else {
120 return PackageParser.getMediaType(ais);
121 }
122 } finally {
123 IOUtils.closeQuietly(ais);
124 }
125 } catch (ArchiveException e) {
126 return MediaType.OCTET_STREAM;
127 }
128 }
129
130 private static MediaType detectZipFormat(TikaInputStream tis) {
131 try {
132 ZipFile zip = new ZipFile(tis.getFile()); // TODO: hasFile()?
133 try {
134 MediaType type = detectOpenDocument(zip);
135 if (type == null) {
136 type = detectOfficeOpenXML(zip, tis);
137 }
138 if (type == null) {
139 type = detectIWork(zip);
140 }
141 if (type == null) {
142 type = detectJar(zip);
143 }
144 if (type == null) {
145 type = detectKmz(zip);
146 }
147 if (type == null) {
148 type = detectIpa(zip);
149 }
150 if (type != null) {
151 return type;
152 }
153 } finally {
154 // TODO: shouldn't we record the open
155 // container so it can be later
156 // reused...?
157 // tis.setOpenContainer(zip);
158 try {
159 zip.close();
160 } catch (IOException e) {
161 // ignore
162 }
163 }
164 } catch (IOException e) {
165 // ignore
166 }
167 // Fallback: it's still a zip file, we just don't know what kind of one
168 return MediaType.APPLICATION_ZIP;
169 }
170
171 /**
172 * OpenDocument files, along with EPub files, have a mimetype
173 * entry in the root of their Zip file. This entry contains the
174 * mimetype of the overall file, stored as a single string.
175 */
176 private static MediaType detectOpenDocument(ZipFile zip) {
177 try {
178 ZipArchiveEntry mimetype = zip.getEntry("mimetype");
179 if (mimetype != null) {
180 InputStream stream = zip.getInputStream(mimetype);
181 try {
182 return MediaType.parse(IOUtils.toString(stream, "UTF-8"));
183 } finally {
184 stream.close();
185 }
186 } else {
187 return null;
188 }
189 } catch (IOException e) {
190 return null;
191 }
192 }
193
194 private static MediaType detectOfficeOpenXML(ZipFile zip, TikaInputStream stream) {
195 try {
196 if (zip.getEntry("_rels/.rels") != null
197 || zip.getEntry("[Content_Types].xml") != null) {
198 // Use POI to open and investigate it for us
199 OPCPackage pkg = OPCPackage.open(stream.getFile().getPath(), PackageAccess.READ);
200 stream.setOpenContainer(pkg);
201
202 // Detect based on the open OPC Package
203 return detectOfficeOpenXML(pkg);
204 } else {
205 return null;
206 }
207 } catch (IOException e) {
208 return null;
209 } catch (RuntimeException e) {
210 return null;
211 } catch (InvalidFormatException e) {
212 return null;
213 }
214 }
215 /**
216 * Detects the type of an OfficeOpenXML (OOXML) file from
217 * opened Package
218 */
219 public static MediaType detectOfficeOpenXML(OPCPackage pkg) {
220 PackageRelationshipCollection core =
221 pkg.getRelationshipsByType(ExtractorFactory.CORE_DOCUMENT_REL);
222 if (core.size() != 1) {
223 // Invalid OOXML Package received
224 return null;
225 }
226
227 // Get the type of the core document part
228 PackagePart corePart = pkg.getPart(core.getRelationship(0));
229 String coreType = corePart.getContentType();
230
231 // Turn that into the type of the overall document
232 String docType = coreType.substring(0, coreType.lastIndexOf('.'));
233
234 // The Macro Enabled formats are a little special
235 if(docType.toLowerCase().endsWith("macroenabled")) {
236 docType = docType.toLowerCase() + ".12";
237 }
238
239 if(docType.toLowerCase().endsWith("macroenabledtemplate")) {
240 docType = MACRO_TEMPLATE_PATTERN.matcher(docType).replaceAll("macroenabled.12");
241 }
242
243 // Build the MediaType object and return
244 return MediaType.parse(docType);
245 }
246
247 private static MediaType detectIWork(ZipFile zip) {
248 if (zip.getEntry(IWorkPackageParser.IWORK_COMMON_ENTRY) != null) {
249 // Locate the appropriate index file entry, and reads from that
250 // the root element of the document. That is used to the identify
251 // the correct type of the keynote container.
252 for (String entryName : IWorkPackageParser.IWORK_CONTENT_ENTRIES) {
253 IWORKDocumentType type = IWORKDocumentType.detectType(zip.getEntry(entryName), zip);
254 if (type != null) {
255 return type.getType();
256 }
257 }
258
259 // Not sure, fallback to the container type
260 return MediaType.application("vnd.apple.iwork");
261 } else {
262 return null;
263 }
264 }
265
266 private static MediaType detectJar(ZipFile zip) {
267 if (zip.getEntry("META-INF/MANIFEST.MF") != null) {
268 // It's a Jar file, or something based on Jar
269
270 // Is it an Android APK?
271 if (zip.getEntry("AndroidManifest.xml") != null) {
272 return MediaType.application("vnd.android.package-archive");
273 }
274
275 // Check for WAR and EAR
276 if (zip.getEntry("WEB-INF/") != null) {
277 return MediaType.application("x-tika-java-web-archive");
278 }
279 if (zip.getEntry("META-INF/application.xml") != null) {
280 return MediaType.application("x-tika-java-enterprise-archive");
281 }
282
283 // Looks like a regular Jar Archive
284 return MediaType.application("java-archive");
285 } else {
286 // Some Android APKs miss the default Manifest
287 if (zip.getEntry("AndroidManifest.xml") != null) {
288 return MediaType.application("vnd.android.package-archive");
289 }
290
291 return null;
292 }
293 }
294
295 private static MediaType detectKmz(ZipFile zip) {
296 boolean kmlFound = false;
297
298 Enumeration<ZipArchiveEntry> entries = zip.getEntries();
299 while (entries.hasMoreElements()) {
300 ZipArchiveEntry entry = entries.nextElement();
301 String name = entry.getName();
302 if (!entry.isDirectory()
303 && name.indexOf('/') == -1 && name.indexOf('\\') == -1) {
304 if (name.endsWith(".kml") && !kmlFound) {
305 kmlFound = true;
306 } else {
307 return null;
308 }
309 }
310 }
311
312 if (kmlFound) {
313 return MediaType.application("vnd.google-earth.kmz");
314 } else {
315 return null;
316 }
317 }
318
319 /**
320 * To be considered as an IPA file, it needs to match all of these
321 */
322 private static HashSet<Pattern> ipaEntryPatterns = new HashSet<Pattern>() {
323 private static final long serialVersionUID = 6545295886322115362L;
324 {
325 add(Pattern.compile("^Payload/$"));
326 add(Pattern.compile("^Payload/.*\\.app/$"));
327 add(Pattern.compile("^Payload/.*\\.app/_CodeSignature/$"));
328 add(Pattern.compile("^Payload/.*\\.app/_CodeSignature/CodeResources$"));
329 add(Pattern.compile("^Payload/.*\\.app/CodeResources$"));
330 add(Pattern.compile("^Payload/.*\\.app/Info\\.plist$"));
331 add(Pattern.compile("^Payload/.*\\.app/PkgInfo$"));
332 add(Pattern.compile("^Payload/.*\\.app/ResourceRules\\.plist$"));
333 }};
334 @SuppressWarnings("unchecked")
335 private static MediaType detectIpa(ZipFile zip) {
336 // Note - consider generalising this logic, if another format needs many regexp matching
337 Set<Pattern> tmpPatterns = (Set<Pattern>)ipaEntryPatterns.clone();
338
339 Enumeration<ZipArchiveEntry> entries = zip.getEntries();
340 while (entries.hasMoreElements()) {
341 ZipArchiveEntry entry = entries.nextElement();
342 String name = entry.getName();
343
344 Iterator<Pattern> ip = tmpPatterns.iterator();
345 while (ip.hasNext()) {
346 if (ip.next().matcher(name).matches()) {
347 ip.remove();
348 }
349 }
350 if (tmpPatterns.isEmpty()) {
351 // We've found everything we need to find
352 return MediaType.application("x-itunes-ipa");
353 }
354 }
355
356 // If we get here, not all required entries were found
357 return null;
358 }
359 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.prt;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.io.UnsupportedEncodingException;
21 import java.util.Collections;
22 import java.util.Set;
23
24 import org.apache.poi.util.IOUtils;
25 import org.apache.tika.exception.TikaException;
26 import org.apache.tika.io.EndianUtils;
27 import org.apache.tika.metadata.Metadata;
28 import org.apache.tika.metadata.TikaCoreProperties;
29 import org.apache.tika.mime.MediaType;
30 import org.apache.tika.parser.AbstractParser;
31 import org.apache.tika.parser.ParseContext;
32 import org.apache.tika.sax.XHTMLContentHandler;
33 import org.xml.sax.ContentHandler;
34 import org.xml.sax.SAXException;
35
36 /**
37 * A basic text extracting parser for the CADKey PRT (CAD Drawing)
38 * format. It outputs text from note entries.
39 */
40
41 public class PRTParser extends AbstractParser {
42
43 /** Serial version UID */
44 private static final long serialVersionUID = 4659638314375035178L;
45
46 private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("x-prt"));
47 public static final String PRT_MIME_TYPE = "application/x-prt";
48
49 public Set<MediaType> getSupportedTypes(ParseContext context) {
50 return SUPPORTED_TYPES;
51 }
52
53 /**
54 * How long do we allow a text run to claim to be, before we
55 * decide we're confused and it's not really text after all?
56 */
57 private static final int MAX_SANE_TEXT_LENGTH = 0x0800;
58
59 /*
60 * Text types:
61 * 00 00 00 00 f0 [3b]f sz sz TEXT *view name*
62 * 00 00 00 00 f0 3f 00 00 00 00 00 00 00 00 sz sz TEXT *view name*
63 * (anything) e0 3f sz sz TEXT *view name*
64 * 3x 33 33 33 33 33 e3 3f 0x 00 00 0x 00 00 0x 0x 1f sz sz TEXT *note entries*
65 *
66 * Note - all text is null terminated
67 */
68
69 public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
70 ParseContext context) throws IOException, SAXException, TikaException {
71
72 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
73 Last5 l5 = new Last5();
74 int read;
75
76 // Try to get the creation date, which is YYYYMMDDhhmm
77 byte[] header = new byte[30];
78 IOUtils.readFully(stream, header);
79 byte[] date = new byte[12];
80 IOUtils.readFully(stream, date);
81
82 String dateStr = new String(date, "ASCII");
83 if(dateStr.startsWith("19") || dateStr.startsWith("20")) {
84 String formattedDate = dateStr.substring(0, 4) + "-" + dateStr.substring(4,6) +
85 "-" + dateStr.substring(6,8) + "T" + dateStr.substring(8,10) + ":" +
86 dateStr.substring(10, 12) + ":00";
87 metadata.set(TikaCoreProperties.CREATED, formattedDate);
88 // TODO Metadata.DATE is used as modified, should it be here?
89 metadata.set(Metadata.DATE, formattedDate);
90 }
91 metadata.set(Metadata.CONTENT_TYPE, PRT_MIME_TYPE);
92
93 // The description, if set, is the next up-to-500 bytes
94 byte[] desc = new byte[500];
95 IOUtils.readFully(stream, desc);
96 String description = extractText(desc, true);
97 if(description.length() > 0) {
98 metadata.set(TikaCoreProperties.DESCRIPTION, description);
99 }
100
101 // Now look for text
102 while( (read = stream.read()) > -1) {
103 if(read == 0xe0 || read == 0xe3 || read == 0xf0) {
104 int nread = stream.read();
105 if(nread == 0x3f || nread == 0xbf) {
106 // Looks promising, check back for a suitable value
107 if(read == 0xe3 && nread == 0x3f) {
108 if(l5.is33()) {
109 // Bingo, note text
110 handleNoteText(stream, xhtml);
111 }
112 } else if(l5.is00()) {
113 // Likely view name
114 handleViewName(read, nread, stream, xhtml, l5);
115 }
116 }
117 } else {
118 l5.record(read);
119 }
120 }
121 }
122
123 private void handleNoteText(InputStream stream, XHTMLContentHandler xhtml)
124 throws IOException, SAXException, TikaException {
125 // Ensure we have the right padding text
126 int read;
127 for(int i=0; i<10; i++) {
128 read = stream.read();
129 if(read >= 0 && read <= 0x0f) {
130 // Promising
131 } else {
132 // Wrong, false detection
133 return;
134 }
135 }
136 read = stream.read();
137 if(read != 0x1f) {
138 // Wrong, false detection
139 return;
140 }
141
142 int length = EndianUtils.readUShortLE(stream);
143 if(length <= MAX_SANE_TEXT_LENGTH) {
144 // Length sanity check passed
145 handleText(length, stream, xhtml);
146 }
147 }
148
149 private void handleViewName(int typeA, int typeB, InputStream stream,
150 XHTMLContentHandler xhtml, Last5 l5)
151 throws IOException, SAXException, TikaException {
152 // Is it 8 byte zero padded?
153 int maybeLength = EndianUtils.readUShortLE(stream);
154 if(maybeLength == 0) {
155 // Check the next 6 bytes too
156 for(int i=0; i<6; i++) {
157 int read = stream.read();
158 if(read >= 0 && read <= 0x0f) {
159 // Promising
160 } else {
161 // Wrong, false detection
162 return;
163 }
164 }
165
166 byte[] b2 = new byte[2];
167 IOUtils.readFully(stream, b2);
168 int length = EndianUtils.getUShortLE(b2);
169 if(length > 1 && length <= MAX_SANE_TEXT_LENGTH) {
170 // Length sanity check passed
171 handleText(length, stream, xhtml);
172 } else {
173 // Was probably something else
174 l5.record(b2[0]);
175 l5.record(b2[1]);
176 }
177 } else if(maybeLength > 0 && maybeLength < MAX_SANE_TEXT_LENGTH) {
178 // Looks like it's straight into the text
179 handleText(maybeLength, stream, xhtml);
180 }
181 }
182
183 private void handleText(int length, InputStream stream, XHTMLContentHandler xhtml)
184 throws IOException, SAXException, TikaException {
185 byte[] str = new byte[length];
186 IOUtils.readFully(stream, str);
187 if(str[length-1] != 0) {
188 // Not properly null terminated, must be wrong
189 return;
190 }
191
192 String text = extractText(str, false);
193
194 xhtml.startElement("p");
195 xhtml.characters(text);
196 xhtml.endElement("p");
197 }
198
199 /**
200 * Does our best to turn the bytes into text
201 */
202 private String extractText(byte[] data, boolean trim) throws TikaException {
203 // The text is always stored null terminated, but sometimes
204 // may have extra null padding too
205 int length = data.length - 1;
206 if(trim) {
207 for(int i=0; i<data.length; i++) {
208 if(data[i] == 0) {
209 length = i;
210 break;
211 }
212 }
213 }
214
215 // We believe that the text is basically stored as CP437
216 // That said, there are a few characters slightly wrong for that...
217 String text;
218 try {
219 text = new String(data, 0, length, "cp437");
220 } catch(UnsupportedEncodingException e) {
221 throw new TikaException("JVM Broken, core codepage CP437 missing!");
222 }
223
224 // Fix up the known character issues
225 text = text.replace("\u03C6","\u00D8");
226
227 // All done, as best as we can!
228 return text;
229 }
230
231 /**
232 * Provides a view on the previous 5 bytes
233 */
234 private static class Last5 {
235 byte[] data = new byte[5];
236 int pos = 0;
237
238 private void record(int b) {
239 data[pos] = (byte)b;
240 pos++;
241 if(pos >= data.length) {
242 pos = 0;
243 }
244 }
245
246 private byte[] get() {
247 byte[] ret = new byte[5];
248 for(int i=0; i<ret.length; i++) {
249 int p = pos - i;
250 if(p < 0) { p += ret.length; }
251 ret[i] = data[p];
252 }
253 return ret;
254 }
255
256 private boolean is33() {
257 byte[] last5 = get();
258 for(byte b : last5) {
259 if(b != 0x33) return false;
260 }
261 return true;
262 }
263
264 private boolean is00() {
265 byte[] last5 = get();
266 for(byte b : last5) {
267 if(b != 0x00) return false;
268 }
269 return true;
270 }
271 }
272 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.rtf;
17
18 import java.nio.charset.Charset;
19
20 /* Holds all state associated with current RTF group, ie {
21 * ... }. */
22
class GroupState {
    /** Nesting depth of this group; a copied state is one deeper than its source. */
    public int depth;
    /** Bold flag for this group. */
    public boolean bold;
    /** Italic flag for this group. */
    public boolean italic;
    /**
     * True if we are skipping all text in current group,
     * eg if group leads with a \*:
     */
    public boolean ignore;
    /** Default is 1 if no uc control has been seen yet. */
    public int ucSkip = 1;
    /** List value for this group. */
    public int list;
    /** List level value for this group. */
    public int listLevel;
    /** Font charset for this group; unset (null) in a default state. */
    public Charset fontCharset;

    /**
     * Creates the default (root) state, leaving every field at its
     * declared initial value.
     */
    public GroupState() {
    }

    /**
     * Creates a state nested inside the given one: every property is
     * inherited, and the depth is one greater.
     *
     * @param other the enclosing group's state
     */
    public GroupState(GroupState other) {
        this.depth = other.depth + 1;
        this.fontCharset = other.fontCharset;
        this.listLevel = other.listLevel;
        this.list = other.list;
        this.ucSkip = other.ucSkip;
        this.ignore = other.ignore;
        this.italic = other.italic;
        this.bold = other.bold;
    }
}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.rtf;
17
18 /**
19 * Contains the information for a single list in the list or list override tables.
20 */
public class ListDescriptor {
    /** Number type value that marks a level as bulleted. */
    public final static int NUMBER_TYPE_BULLET = 23;

    public int id;
    // We record this but don't make use of it today:
    public int templateID;
    // We record this but don't make use of it today:
    public boolean isStyle;
    /** Number type per list level; one slot for each of the nine levels. */
    public int[] numberType = new int[9];

    /**
     * Tells whether the given level of this list is bulleted.
     *
     * @param level zero-based list level
     * @return <code>true</code> if the level's number type is
     *         {@link #NUMBER_TYPE_BULLET}; <code>false</code> otherwise,
     *         including when the level lies outside the recorded levels
     */
    public boolean isUnordered(int level)
    {
        // A malformed document may refer to a level we have no slot for;
        // treat that as "not a bullet" rather than failing the parse
        if (level < 0 || level >= numberType.length) {
            return false;
        }
        return numberType[level] == NUMBER_TYPE_BULLET;
    }
}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.rtf;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.util.Collections;
21 import java.util.Set;
22
23 import org.apache.tika.exception.TikaException;
24 import org.apache.tika.io.TaggedInputStream;
25 import org.apache.tika.metadata.Metadata;
26 import org.apache.tika.mime.MediaType;
27 import org.apache.tika.parser.AbstractParser;
28 import org.apache.tika.parser.ParseContext;
29 import org.apache.tika.sax.XHTMLContentHandler;
30 import org.xml.sax.ContentHandler;
31 import org.xml.sax.SAXException;
32
33 /**
34 * RTF parser
35 */
36 public class RTFParser extends AbstractParser {
37
38 /** Serial version UID */
39 private static final long serialVersionUID = -4165069489372320313L;
40
41 private static final Set<MediaType> SUPPORTED_TYPES =
42 Collections.singleton(MediaType.application("rtf"));
43
44 public Set<MediaType> getSupportedTypes(ParseContext context) {
45 return SUPPORTED_TYPES;
46 }
47
48 public void parse(
49 InputStream stream, ContentHandler handler,
50 Metadata metadata, ParseContext context)
51 throws IOException, SAXException, TikaException {
52 TaggedInputStream tagged = new TaggedInputStream(stream);
53 try {
54 final TextExtractor ert = new TextExtractor(new XHTMLContentHandler(handler, metadata), metadata);
55 ert.extract(stream);
56 metadata.add(Metadata.CONTENT_TYPE, "application/rtf");
57 } catch (IOException e) {
58 tagged.throwIfCauseOf(e);
59 throw new TikaException("Error parsing an RTF document", e);
60 }
61 }
62 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.parser.rtf;
18
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.io.PushbackInputStream;
22 import java.nio.ByteBuffer;
23 import java.nio.CharBuffer;
24 import java.nio.charset.Charset;
25 import java.nio.charset.CharsetDecoder;
26 import java.nio.charset.CoderResult;
27 import java.nio.charset.CodingErrorAction;
28 import java.util.Calendar;
29 import java.util.HashMap;
30 import java.util.LinkedList;
31 import java.util.Map;
32
33 import org.apache.tika.exception.TikaException;
34 import org.apache.tika.metadata.Metadata;
35 import org.apache.tika.metadata.Office;
36 import org.apache.tika.metadata.OfficeOpenXMLCore;
37 import org.apache.tika.metadata.OfficeOpenXMLExtended;
38 import org.apache.tika.metadata.Property;
39 import org.apache.tika.metadata.TikaCoreProperties;
40 import org.apache.tika.sax.XHTMLContentHandler;
41 import org.apache.tika.utils.CharsetUtils;
42 import org.xml.sax.SAXException;
43
44 /* Tokenizes and performs a "shallow" parse of the RTF
45 * document, just enough to properly decode the text.
46 *
47 * TODO: we should cutover to a "real" tokenizer (eg JFlex);
48 * it should give better perf, by replacing the excessive
49 * "else if" string compares with FSA traversal. */
50
51 final class TextExtractor {
52
53 private static final Charset ASCII = Charset.forName("US-ASCII");
54
55 private static Charset getCharset(String name) {
56 try {
57 return CharsetUtils.forName(name);
58 } catch (Exception e) {
59 return ASCII;
60 }
61 }
62
63 private static final Charset WINDOWS_1252 = getCharset("WINDOWS-1252");
64 private static final Charset MAC_ROMAN = getCharset("MacRoman");
65 private static final Charset SHIFT_JIS = getCharset("Shift_JIS");
66 private static final Charset WINDOWS_57011 = getCharset("windows-57011");
67 private static final Charset WINDOWS_57010 = getCharset("windows-57010");
68 private static final Charset WINDOWS_57009 = getCharset("windows-57009");
69 private static final Charset WINDOWS_57008 = getCharset("windows-57008");
70 private static final Charset WINDOWS_57007 = getCharset("windows-57007");
71 private static final Charset WINDOWS_57006 = getCharset("windows-57006");
72 private static final Charset WINDOWS_57005 = getCharset("windows-57005");
73 private static final Charset WINDOWS_57004 = getCharset("windows-57004");
74 private static final Charset WINDOWS_57003 = getCharset("windows-57003");
75 private static final Charset X_ISCII91 = getCharset("x-ISCII91");
76 private static final Charset X_MAC_CENTRAL_EUROPE = getCharset("x-MacCentralEurope");
77 private static final Charset MAC_CYRILLIC = getCharset("MacCyrillic");
78 private static final Charset X_JOHAB = getCharset("x-Johab");
79 private static final Charset CP12582 = getCharset("CP1258");
80 private static final Charset CP12572 = getCharset("CP1257");
81 private static final Charset CP12562 = getCharset("CP1256");
82 private static final Charset CP12552 = getCharset("CP1255");
83 private static final Charset CP12542 = getCharset("CP1254");
84 private static final Charset CP12532 = getCharset("CP1253");
85 private static final Charset CP1252 = getCharset("CP1252");
86 private static final Charset CP12512 = getCharset("CP1251");
87 private static final Charset CP12502 = getCharset("CP1250");
88 private static final Charset CP950 = getCharset("CP950");
89 private static final Charset CP949 = getCharset("CP949");
90 private static final Charset MS9362 = getCharset("MS936");
91 private static final Charset MS8742 = getCharset("MS874");
92 private static final Charset CP866 = getCharset("CP866");
93 private static final Charset CP865 = getCharset("CP865");
94 private static final Charset CP864 = getCharset("CP864");
95 private static final Charset CP863 = getCharset("CP863");
96 private static final Charset CP862 = getCharset("CP862");
97 private static final Charset CP860 = getCharset("CP860");
98 private static final Charset CP852 = getCharset("CP852");
99 private static final Charset CP8502 = getCharset("CP850");
100 private static final Charset CP819 = getCharset("CP819");
101 private static final Charset WINDOWS_720 = getCharset("windows-720");
102 private static final Charset WINDOWS_711 = getCharset("windows-711");
103 private static final Charset WINDOWS_710 = getCharset("windows-710");
104 private static final Charset WINDOWS_709 = getCharset("windows-709");
105 private static final Charset ISO_8859_6 = getCharset("ISO-8859-6");
106 private static final Charset CP4372 = getCharset("CP437");
107 private static final Charset CP850 = getCharset("cp850");
108 private static final Charset CP437 = getCharset("cp437");
109 private static final Charset MS874 = getCharset("ms874");
110 private static final Charset CP1257 = getCharset("cp1257");
111 private static final Charset CP1256 = getCharset("cp1256");
112 private static final Charset CP1255 = getCharset("cp1255");
113 private static final Charset CP1258 = getCharset("cp1258");
114 private static final Charset CP1254 = getCharset("cp1254");
115 private static final Charset CP1253 = getCharset("cp1253");
116 private static final Charset MS950 = getCharset("ms950");
117 private static final Charset MS936 = getCharset("ms936");
118 private static final Charset MS1361 = getCharset("ms1361");
119 private static final Charset MS932 = getCharset("MS932");
120 private static final Charset CP1251 = getCharset("cp1251");
121 private static final Charset CP1250 = getCharset("cp1250");
122 private static final Charset MAC_THAI = getCharset("MacThai");
123 private static final Charset MAC_TURKISH = getCharset("MacTurkish");
124 private static final Charset MAC_GREEK = getCharset("MacGreek");
125 private static final Charset MAC_ARABIC = getCharset("MacArabic");
126 private static final Charset MAC_HEBREW = getCharset("MacHebrew");
127 private static final Charset JOHAB = getCharset("johab");
128 private static final Charset BIG5 = getCharset("Big5");
129 private static final Charset GB2312 = getCharset("GB2312");
130 private static final Charset MS949 = getCharset("ms949");
131
132 // Hold pending bytes (encoded in the current charset)
133 // for text output:
134 private byte[] pendingBytes = new byte[16];
135 private int pendingByteCount;
136 private ByteBuffer pendingByteBuffer = ByteBuffer.wrap(pendingBytes);
137
138 // Holds pending chars for text output
139 private char[] pendingChars = new char[10];
140 private int pendingCharCount;
141
142 // Holds chars for a still-being-tokenized control word
143 private byte[] pendingControl = new byte[10];
144 private int pendingControlCount;
145
146 // Used when we decode bytes -> chars using CharsetDecoder:
147 private final char[] outputArray = new char[128];
148 private final CharBuffer outputBuffer = CharBuffer.wrap(outputArray);
149
150 // Reused when possible:
151 private CharsetDecoder decoder;
152 private Charset lastCharset;
153
154 private Charset globalCharset = WINDOWS_1252;
155 private int globalDefaultFont = -1;
156 private int curFontID = -1;
157
158 // Holds the font table from this RTF doc, mapping
159 // the font number (from \fN control word) to the
160 // corresponding charset:
161 private final Map<Integer, Charset> fontToCharset =
162 new HashMap<Integer, Charset>();
163
164 // Group stack: when we open a new group, we push
165 // the previous group state onto the stack; when we
166 // close the group, we restore it
167 private final LinkedList<GroupState> groupStates = new LinkedList<GroupState>();
168
169 // Current group state; in theory this initial
170 // GroupState is unused because the RTF doc should
171 // immediately open the top group (start with {):
172 private GroupState groupState = new GroupState();
173
174 private boolean inHeader = true;
175 private int fontTableState;
176 private int fontTableDepth;
177
178 // Non null if we are processing metadata (title,
179 // keywords, etc.) inside the info group:
180 private Property nextMetaData;
181 private boolean inParagraph;
182
183 // Non-zero if we are processing inside a field destination:
184 private int fieldState;
185
186 // Non-zero while a list is waiting to be closed; holds that list's index
187 private int pendingListEnd;
188 private Map<Integer, ListDescriptor> listTable = new HashMap<Integer, ListDescriptor>();
189 private Map<Integer, ListDescriptor> listOverrideTable = new HashMap<Integer, ListDescriptor>();
190 private Map<Integer, ListDescriptor> currentListTable;
191 private ListDescriptor currentList;
192 private int listTableLevel = -1;
193 private boolean ignoreLists;
194
195 // Non-null if we've seen the url for a HYPERLINK but not yet
196 // its text:
197 private String pendingURL;
198
199 private final StringBuilder pendingBuffer = new StringBuilder();
200
201 // Used to process the sub-groups inside the upr
202 // group:
203 private int uprState = -1;
204
205 private final XHTMLContentHandler out;
206 private final Metadata metadata;
207
208 // Used when extracting CREATION date:
209 private int year, month, day, hour, minute;
210
211 // How many next ansi chars we should skip; this
212 // is 0 except when we are still in the "ansi
213 // shadow" after seeing a unicode escape, at which
214 // point it's set to the last ucN skip we had seen:
215 int ansiSkip = 0;
216
// Each font-table entry of an RTF doc (f0, f1, f2, ...) may carry a
// \fcharsetN control word.  This table translates that N into the
// Java charset used for text written in that font:
private static final Map<Integer, Charset> FCHARSET_MAP =
    new HashMap<Integer, Charset>();

static {
    // ANSI.  Charset 1 (Default) and charset 2 (Symbol) are
    // deliberately left unmapped.
    FCHARSET_MAP.put(0, WINDOWS_1252);

    // Macintosh charsets
    FCHARSET_MAP.put(77, MAC_ROMAN);     // Mac Roman
    FCHARSET_MAP.put(78, SHIFT_JIS);     // Mac Shift Jis
    FCHARSET_MAP.put(79, MS949);         // Mac Hangul
    FCHARSET_MAP.put(80, GB2312);        // Mac GB2312
    FCHARSET_MAP.put(81, BIG5);          // Mac Big5
    FCHARSET_MAP.put(82, JOHAB);         // Mac Johab (old)
    FCHARSET_MAP.put(83, MAC_HEBREW);    // Mac Hebrew
    FCHARSET_MAP.put(84, MAC_ARABIC);    // Mac Arabic
    FCHARSET_MAP.put(85, MAC_GREEK);     // Mac Greek
    FCHARSET_MAP.put(86, MAC_TURKISH);   // Mac Turkish
    FCHARSET_MAP.put(87, MAC_THAI);      // Mac Thai
    FCHARSET_MAP.put(88, CP1250);        // Mac East Europe
    FCHARSET_MAP.put(89, CP1251);        // Mac Russian

    // East Asian charsets
    FCHARSET_MAP.put(128, MS932);        // Shift JIS
    FCHARSET_MAP.put(129, MS949);        // Hangul
    FCHARSET_MAP.put(130, MS1361);       // Johab
    FCHARSET_MAP.put(134, MS936);        // GB2312
    FCHARSET_MAP.put(136, MS950);        // Big5

    // Single-byte Windows code pages
    FCHARSET_MAP.put(161, CP1253);       // Greek
    FCHARSET_MAP.put(162, CP1254);       // Turkish
    FCHARSET_MAP.put(163, CP1258);       // Vietnamese
    FCHARSET_MAP.put(177, CP1255);       // Hebrew
    FCHARSET_MAP.put(178, CP1256);       // Arabic
    // No mapping for 179 (Arabic Traditional), 180 (Arabic user)
    // or 181 (Hebrew user)
    FCHARSET_MAP.put(186, CP1257);       // Baltic

    FCHARSET_MAP.put(204, CP1251);       // Russian
    FCHARSET_MAP.put(222, MS874);        // Thai
    FCHARSET_MAP.put(238, CP1250);       // Eastern European
    FCHARSET_MAP.put(254, CP437);        // PC 437
    FCHARSET_MAP.put(255, CP850);        // OEM
}
264
// The RTF header may declare the document code page with the
// \ansicpgN control word; this table translates that N into the
// corresponding Java charset.  Every code page number is registered
// exactly once, under its own number.
private static final Map<Integer, Charset> ANSICPG_MAP =
    new HashMap<Integer, Charset>();
static {
    ANSICPG_MAP.put(437, CP4372);        // US IBM
    ANSICPG_MAP.put(708, ISO_8859_6);    // Arabic (ASMO 708)

    ANSICPG_MAP.put(709, WINDOWS_709);   // Arabic (ASMO 449+, BCON V4)
    ANSICPG_MAP.put(710, WINDOWS_710);   // Arabic (transparent Arabic)
    ANSICPG_MAP.put(711, WINDOWS_711);   // Arabic (Nafitha Enhanced)
    ANSICPG_MAP.put(720, WINDOWS_720);   // Arabic (transparent ASMO)
    ANSICPG_MAP.put(819, CP819);         // Windows 3.1 (US & Western Europe)

    ANSICPG_MAP.put(850, CP8502);        // IBM Multilingual
    ANSICPG_MAP.put(852, CP852);         // Eastern European
    ANSICPG_MAP.put(860, CP860);         // Portuguese
    ANSICPG_MAP.put(862, CP862);         // Hebrew
    ANSICPG_MAP.put(863, CP863);         // French Canadian
    ANSICPG_MAP.put(864, CP864);         // Arabic
    ANSICPG_MAP.put(865, CP865);         // Norwegian
    ANSICPG_MAP.put(866, CP866);         // Soviet Union
    ANSICPG_MAP.put(874, MS8742);        // Thai
    ANSICPG_MAP.put(932, MS932);         // Japanese
    ANSICPG_MAP.put(936, MS9362);        // Simplified Chinese
    ANSICPG_MAP.put(949, CP949);         // Korean
    ANSICPG_MAP.put(950, CP950);         // Traditional Chinese
    ANSICPG_MAP.put(1250, CP12502);      // Eastern European
    ANSICPG_MAP.put(1251, CP12512);      // Cyrillic
    ANSICPG_MAP.put(1252, CP1252);       // Western European
    ANSICPG_MAP.put(1253, CP12532);      // Greek
    ANSICPG_MAP.put(1254, CP12542);      // Turkish
    ANSICPG_MAP.put(1255, CP12552);      // Hebrew
    ANSICPG_MAP.put(1256, CP12562);      // Arabic
    ANSICPG_MAP.put(1257, CP12572);      // Baltic
    ANSICPG_MAP.put(1258, CP12582);      // Vietnamese
    ANSICPG_MAP.put(1361, X_JOHAB);      // Johab
    ANSICPG_MAP.put(10000, MAC_ROMAN);   // Mac Roman
    ANSICPG_MAP.put(10001, SHIFT_JIS);   // Mac Japan
    ANSICPG_MAP.put(10004, MAC_ARABIC);  // Mac Arabic
    ANSICPG_MAP.put(10005, MAC_HEBREW);  // Mac Hebrew
    ANSICPG_MAP.put(10006, MAC_GREEK);   // Mac Greek
    ANSICPG_MAP.put(10007, MAC_CYRILLIC); // Mac Cyrillic
    ANSICPG_MAP.put(10029, X_MAC_CENTRAL_EUROPE); // MAC Latin2
    ANSICPG_MAP.put(10081, MAC_TURKISH); // Mac Turkish
    ANSICPG_MAP.put(57002, X_ISCII91);   // Devanagari

    // TODO: in theory these other charsets are simple
    // shifts off of Devanagari, so we could impl that
    // here:
    ANSICPG_MAP.put(57003, WINDOWS_57003); // Bengali
    ANSICPG_MAP.put(57004, WINDOWS_57004); // Tamil
    ANSICPG_MAP.put(57005, WINDOWS_57005); // Telugu
    ANSICPG_MAP.put(57006, WINDOWS_57006); // Assamese
    ANSICPG_MAP.put(57007, WINDOWS_57007); // Oriya
    ANSICPG_MAP.put(57008, WINDOWS_57008); // Kannada
    ANSICPG_MAP.put(57009, WINDOWS_57009); // Malayalam
    ANSICPG_MAP.put(57010, WINDOWS_57010); // Gujarati
    ANSICPG_MAP.put(57011, WINDOWS_57011); // Punjabi
}
328
/**
 * Creates an extractor bound to the given output handler and metadata.
 *
 * @param out      handler that receives the XHTML events produced
 *                 while extracting
 * @param metadata object that receives the document properties found
 *                 while extracting
 */
public TextExtractor(XHTMLContentHandler out, Metadata metadata) {
    this.out = out;
    this.metadata = metadata;
}
333
334 public boolean isIgnoringLists() {
335 return ignoreLists;
336 }
337
338 public void setIgnoreLists(boolean ignore) {
339 this.ignoreLists = ignore;
340 }
341
342 private static boolean isHexChar(int ch) {
343 return (ch >= '0' && ch <= '9') ||
344 (ch >= 'a' && ch <= 'f') ||
345 (ch >= 'A' && ch <= 'F');
346 }
347
348 private static boolean isAlpha(int ch) {
349 return (ch >= 'a' && ch <= 'z') ||
350 (ch >= 'A' && ch <= 'Z');
351 }
352
353 private static boolean isDigit(int ch) {
354 return ch >= '0' && ch <= '9';
355 }
356
357 private static int hexValue(int ch) {
358 if (ch >= '0' && ch <= '9') {
359 return ch - '0';
360 } else if (ch >= 'a' && ch <= 'z') {
361 return 10 + (ch - 'a');
362 } else {
363 assert ch >= 'A' && ch <= 'Z';
364 return 10 + (ch - 'A');
365 }
366 }
367
368 // Push pending bytes or pending chars:
369 private void pushText() throws IOException, SAXException, TikaException {
370 if (pendingByteCount != 0) {
371 assert pendingCharCount == 0;
372 pushBytes();
373 } else {
374 pushChars();
375 }
376 }
377
378 // Buffers the byte (unit in the current charset) for
379 // output:
380 private void addOutputByte(int b) throws IOException, SAXException, TikaException {
381 assert b >= 0 && b < 256 : "byte value out of range: " + b;
382
383 if (pendingCharCount != 0) {
384 pushChars();
385 }
386
387 // Save the byte in pending buffer:
388 if (pendingByteCount == pendingBytes.length) {
389 // Gradual but exponential growth:
390 final byte[] newArray = new byte[(int) (pendingBytes.length*1.25)];
391 System.arraycopy(pendingBytes, 0, newArray, 0, pendingBytes.length);
392 pendingBytes = newArray;
393 pendingByteBuffer = ByteBuffer.wrap(pendingBytes);
394 }
395 pendingBytes[pendingByteCount++] = (byte) b;
396 }
397
398 // Buffers a byte as part of a control word:
399 private void addControl(int b) {
400 assert isAlpha(b);
401 // Save the byte in pending buffer:
402 if (pendingControlCount == pendingControl.length) {
403 // Gradual but exponential growth:
404 final byte[] newArray = new byte[(int) (pendingControl.length*1.25)];
405 System.arraycopy(pendingControl, 0, newArray, 0, pendingControl.length);
406 pendingControl = newArray;
407 }
408 pendingControl[pendingControlCount++] = (byte) b;
409 }
410
411 // Buffers a UTF16 code unit for output
412 private void addOutputChar(char ch) throws IOException, SAXException, TikaException {
413 if (pendingByteCount != 0) {
414 pushBytes();
415 }
416
417 if (inHeader || fieldState == 1) {
418 pendingBuffer.append(ch);
419 } else {
420 if (pendingCharCount == pendingChars.length) {
421 // Gradual but exponential growth:
422 final char[] newArray = new char[(int) (pendingChars.length*1.25)];
423 System.arraycopy(pendingChars, 0, newArray, 0, pendingChars.length);
424 pendingChars = newArray;
425 }
426 pendingChars[pendingCharCount++] = ch;
427 }
428 }
429
430 // Shallow parses the entire doc, writing output to
431 // this.out and this.metadata
432 public void extract(InputStream in) throws IOException, SAXException, TikaException {
433 // in = new FilterInputStream(in) {
434 // public int read() throws IOException {
435 // int r = super.read();
436 // System.out.write(r);
437 // System.out.flush();
438 // return r;
439 // }
440 // public int read(byte b[], int off, int len) throws IOException {
441 // int r = super.read(b, off, len);
442 // System.out.write(b, off, r);
443 // System.out.flush();
444 // return r;
445 // }
446 // };
447 extract(new PushbackInputStream(in, 2));
448 }
449
450 private void extract(PushbackInputStream in) throws IOException, SAXException, TikaException {
451 out.startDocument();
452
453 while (true) {
454 final int b = in.read();
455 if (b == -1) {
456 break;
457 } else if (b == '\\') {
458 parseControlToken(in);
459 } else if (b == '{') {
460 pushText();
461 processGroupStart(in);
462 } else if (b == '}') {
463 pushText();
464 processGroupEnd();
465 if (groupStates.isEmpty()) {
466 // parsed document closing brace
467 break;
468 }
469 } else if (b != '\r' && b != '\n' && (!groupState.ignore || nextMetaData != null)) {
470 // Linefeed and carriage return are not
471 // significant
472 if (ansiSkip != 0) {
473 ansiSkip--;
474 } else {
475 addOutputByte(b);
476 }
477 }
478 }
479
480 endParagraph(false);
481 out.endDocument();
482 }
483
484 private void parseControlToken(PushbackInputStream in) throws IOException, SAXException, TikaException {
485 int b = in.read();
486 if (b == '\'') {
487 // escaped hex char
488 parseHexChar(in);
489 } else if (isAlpha(b)) {
490 // control word
491 parseControlWord((char)b, in);
492 } else if (b == '{' || b == '}' || b == '\\' || b == '\r' || b == '\n') {
493 // escaped char
494 addOutputByte(b);
495 } else if (b != -1) {
496 // control symbol, eg \* or \~
497 processControlSymbol((char)b);
498 }
499 }
500
501 private void parseHexChar(PushbackInputStream in) throws IOException, SAXException, TikaException {
502 int hex1 = in.read();
503 if (!isHexChar(hex1)) {
504 // DOC ERROR (malformed hex escape): ignore
505 in.unread(hex1);
506 return;
507 }
508
509 int hex2 = in.read();
510 if (!isHexChar(hex2)) {
511 // TODO: log a warning here, somehow?
512 // DOC ERROR (malformed hex escape):
513 // ignore
514 in.unread(hex2);
515 return;
516 }
517
518 if (ansiSkip != 0) {
519 // Skip this ansi char since we are
520 // still in the shadow of a unicode
521 // escape:
522 ansiSkip--;
523 } else {
524 // Unescape:
525 addOutputByte(16*hexValue(hex1) + hexValue(hex2));
526 }
527 }
528
529 private void parseControlWord(int firstChar, PushbackInputStream in) throws IOException, SAXException, TikaException {
530 addControl(firstChar);
531
532 int b = in.read();
533 while (isAlpha(b)) {
534 addControl(b);
535 b = in.read();
536 }
537
538 boolean hasParam = false;
539 boolean negParam = false;
540 if (b == '-') {
541 negParam = true;
542 hasParam = true;
543 b = in.read();
544 }
545
546 int param = 0;
547 while (isDigit(b)) {
548 param *= 10;
549 param += (b - '0');
550 hasParam = true;
551 b = in.read();
552 }
553
554 // space is consumed as part of the
555 // control word, but is not added to the
556 // control word
557 if (b != ' ') {
558 in.unread(b);
559 }
560
561 if (hasParam) {
562 if (negParam) {
563 param = -param;
564 }
565 processControlWord(param, in);
566 } else {
567 processControlWord();
568 }
569
570 pendingControlCount = 0;
571 }
572
573 private void lazyStartParagraph() throws IOException, SAXException, TikaException {
574 if (!inParagraph) {
575 // Ensure </i></b> order
576 if (groupState.italic) {
577 end("i");
578 }
579 if (groupState.bold) {
580 end("b");
581 }
582 if (pendingListEnd != 0 && groupState.list != pendingListEnd) {
583 endList(pendingListEnd);
584 pendingListEnd = 0;
585 }
586 if (inList() && pendingListEnd != groupState.list) {
587 startList(groupState.list);
588 }
589 if (inList()) {
590 out.startElement("li");
591 } else {
592 out.startElement("p");
593 }
594
595 // Ensure <b><i> order
596 if (groupState.bold) {
597 start("b");
598 }
599 if (groupState.italic) {
600 start("i");
601 }
602 inParagraph = true;
603 }
604 }
605
606 private void endParagraph(boolean preserveStyles) throws IOException, SAXException, TikaException {
607 pushText();
608 if (inParagraph) {
609 if (groupState.italic) {
610 end("i");
611 groupState.italic = preserveStyles;
612 }
613 if (groupState.bold) {
614 end("b");
615 groupState.bold = preserveStyles;
616 }
617 if (inList()) {
618 out.endElement("li");
619 } else {
620 out.endElement("p");
621 }
622
623 if (preserveStyles && (groupState.bold || groupState.italic)) {
624 start("p");
625 if (groupState.bold) {
626 start("b");
627 }
628 if (groupState.italic) {
629 start("i");
630 }
631 inParagraph = true;
632 } else {
633 inParagraph = false;
634 }
635 }
636
637 // Ensure closing the list at document end
638 if (!preserveStyles && pendingListEnd != 0) {
639 endList(pendingListEnd);
640 pendingListEnd = 0;
641 }
642 }
643
644 // Push pending UTF16 units to out ContentHandler
645 private void pushChars() throws IOException, SAXException, TikaException {
646 if (pendingCharCount != 0) {
647 lazyStartParagraph();
648 out.characters(pendingChars, 0, pendingCharCount);
649 pendingCharCount = 0;
650 }
651 }
652
653 // Decodes the buffered bytes in pendingBytes
654 // into UTF16 code units, and sends the characters
655 // to the out ContentHandler, if we are in the body,
656 // else appends the characters to the pendingBuffer
657 private void pushBytes() throws IOException, SAXException, TikaException {
658 if (pendingByteCount > 0 && (!groupState.ignore || nextMetaData != null)) {
659
660 final CharsetDecoder decoder = getDecoder();
661 pendingByteBuffer.limit(pendingByteCount);
662 assert pendingByteBuffer.position() == 0;
663 assert outputBuffer.position() == 0;
664
665 while (true) {
666 // We pass true for endOfInput because, when
667 // we are called, we should have seen a
668 // complete sequence of characters for this
669 // charset:
670 final CoderResult result = decoder.decode(pendingByteBuffer, outputBuffer, true);
671
672 final int pos = outputBuffer.position();
673 if (pos > 0) {
674 if (inHeader || fieldState == 1) {
675 pendingBuffer.append(outputArray, 0, pos);
676 } else {
677 lazyStartParagraph();
678 out.characters(outputArray, 0, pos);
679 }
680 outputBuffer.position(0);
681 }
682
683 if (result == CoderResult.UNDERFLOW) {
684 break;
685 }
686 }
687
688 while (true) {
689 final CoderResult result = decoder.flush(outputBuffer);
690
691 final int pos = outputBuffer.position();
692 if (pos > 0) {
693 if (inHeader || fieldState == 1) {
694 pendingBuffer.append(outputArray, 0, pos);
695 } else {
696 lazyStartParagraph();
697 out.characters(outputArray, 0, pos);
698 }
699 outputBuffer.position(0);
700 }
701
702 if (result == CoderResult.UNDERFLOW) {
703 break;
704 }
705 }
706
707 // Reset for next decode
708 decoder.reset();
709 pendingByteBuffer.position(0);
710 }
711
712 pendingByteCount = 0;
713 }
714
715 // NOTE: s must be ascii alpha only
716 private boolean equals(String s) {
717 if (pendingControlCount != s.length()) {
718 return false;
719 }
720 for(int idx=0;idx<pendingControlCount;idx++) {
721 assert isAlpha(s.charAt(idx));
722 if (((byte) s.charAt(idx)) != pendingControl[idx]) {
723 return false;
724 }
725 }
726 return true;
727 }
728
729 private void processControlSymbol(char ch) throws IOException, SAXException, TikaException {
730 switch(ch) {
731 case '~':
732 // Non-breaking space -> unicode NON-BREAKING SPACE
733 addOutputChar('\u00a0');
734 break;
735 case '*':
736 // Ignorable destination (control words defined after
737 // the 1987 RTF spec). These are already handled by
738 // processGroupStart()
739 break;
740 case '-':
741 // Optional hyphen -> unicode SOFT HYPHEN
742 addOutputChar('\u00ad');
743 break;
744 case '_':
745 // Non-breaking hyphen -> unicode NON-BREAKING HYPHEN
746 addOutputChar('\u2011');
747 break;
748 default:
749 break;
750 }
751 }
752
753 private CharsetDecoder getDecoder() throws TikaException {
754 Charset charset = getCharset();
755
756 // Common case: charset is same as last time, so
757 // just reuse it:
758 if (lastCharset == null || !charset.equals(lastCharset)) {
759 decoder = charset.newDecoder();
760 decoder.onMalformedInput(CodingErrorAction.REPLACE);
761 decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
762 lastCharset = charset;
763 }
764
765 return decoder;
766 }
767
768 // Return current charset in-use
769 private Charset getCharset() throws TikaException {
770 // If a specific font (fN) was set, use its charset
771 if (groupState.fontCharset != null) {
772 return groupState.fontCharset;
773 }
774
775 // Else, if global default font (defN) was set, use that one
776 if (globalDefaultFont != -1 && !inHeader) {
777 Charset cs = fontToCharset.get(globalDefaultFont);
778 if (cs != null) {
779 return cs;
780 }
781 }
782
783 // Else, use the global charset
784 if (globalCharset == null) {
785 throw new TikaException("unable to determine charset");
786 }
787
788 return globalCharset;
789 }
790
791 // Handle control word that takes a parameter:
792 private void processControlWord(int param, PushbackInputStream in) throws IOException, SAXException, TikaException {
793
794 // TODO: afN? (associated font number)
795
796 // TODO: do these alter text output...?
797 /*
798 } else if (equals("stshfdbch")) {
799 // font to be used by default in
800 // style sheet for East Asian chars
801 // arg N is font table entry
802 } else if (equals("stshfloch")) {
803 // font to be used by default in
804 // style sheet for ASCII chars
805 // arg N is font table entry
806 } else if (equals("stshfhich")) {
807 // font to be used by default in
808 // style sheet for High Ansi chars
809 // arg N is font table entry
810 } else if (equals("stshfbi")) {
811 // style sheet for Complex Scripts (BIDI) chars
812 // arg N is font table entry
813 */
814
815 // TODO: inefficient that we check equals N times;
816 // we'd get better perf w/ real lexer (eg
817 // JFlex), which uses single-pass FSM to do cmp:
818 if (inHeader) {
819 if (equals("ansicpg")) {
820 // ANSI codepage
821 Charset cs = ANSICPG_MAP.get(param);
822 if (cs != null) {
823 globalCharset = cs;
824 }
825 } else if (equals("deff")) {
826 // Default font
827 globalDefaultFont = param;
828 } else if (equals("nofpages")) {
829 metadata.add(Office.PAGE_COUNT, Integer.toString(param));
830 } else if (equals("nofwords")) {
831 metadata.add(Office.WORD_COUNT, Integer.toString(param));
832 } else if (equals("nofchars")) {
833 metadata.add(Office.CHARACTER_COUNT, Integer.toString(param));
834 } else if (equals("yr")) {
835 year = param;
836 } else if (equals("mo")) {
837 month = param;
838 } else if (equals("dy")) {
839 day = param;
840 } else if (equals("hr")) {
841 hour = param;
842 } else if (equals("min")) {
843 minute = param;
844 }
845
846 if (fontTableState == 1) {
847 // Still inside font table -- record the
848 // mappings of fN to the fcharset:
849 if (groupState.depth < fontTableDepth) {
850 fontTableState = 2;
851 } else {
852 if (equals("f")) {
853 // Start new font definition
854 curFontID = param;
855 } else if (equals("fcharset")) {
856 Charset cs = FCHARSET_MAP.get(param);
857 if (cs != null) {
858 fontToCharset.put(curFontID, cs);
859 }
860 }
861 }
862 }
863
864 if (currentList != null) {
865 if (equals("listid")) {
866 currentList.id = param;
867 currentListTable.put(currentList.id, currentList);
868 } else if (equals("listtemplateid")) {
869 currentList.templateID = param;
870 } else if (equals("levelnfc") || equals("levelnfcn")) {
871 currentList.numberType[listTableLevel] = param;
872 }
873 }
874 } else {
875 // In document
876 if (equals("b")) {
877 // b0
878 assert param == 0;
879 if (groupState.bold) {
880 pushText();
881 if (groupState.italic) {
882 end("i");
883 }
884 end("b");
885 if (groupState.italic) {
886 start("i");
887 }
888 groupState.bold = false;
889 }
890 } else if (equals("i")) {
891 // i0
892 assert param == 0;
893 if (groupState.italic) {
894 pushText();
895 end("i");
896 groupState.italic = false;
897 }
898 } else if (equals("f")) {
899 // Change current font
900 Charset fontCharset = fontToCharset.get(param);
901
902 // Push any buffered text before changing
903 // font:
904 pushText();
905
906 if (fontCharset != null) {
907 groupState.fontCharset = fontCharset;
908 } else {
909 // DOC ERROR: font change referenced a
910 // non-table'd font number
911 // TODO: log a warning? Throw an exc?
912 groupState.fontCharset = null;
913 }
914 } else if (equals("ls")) {
915 groupState.list = param;
916 } else if (equals("lslvl")) {
917 groupState.listLevel = param;
918 }
919 }
920
921 // Process unicode escape. This can appear in doc
922 // or in header, since the metadata (info) fields
923 // in the header can be unicode escaped as well:
924 if (equals("u")) {
925 // Unicode escape
926 if (!groupState.ignore) {
927 final char utf16CodeUnit = (char) (param & 0xffff);
928 addOutputChar(utf16CodeUnit);
929 }
930
931 // After seeing a unicode escape we must
932 // skip the next ucSkip ansi chars (the
933 // "unicode shadow")
934 ansiSkip = groupState.ucSkip;
935 } else if (equals("uc")) {
936 // Change unicode shadow length
937 groupState.ucSkip = (int) param;
938 } else if (equals("bin")) {
939 if (param >= 0) {
940 int bytesToRead = param;
941 byte[] tmpArray = new byte[Math.min(1024, bytesToRead)];
942 while (bytesToRead > 0) {
943 int r = in.read(tmpArray, 0, Math.min(bytesToRead, tmpArray.length));
944 if (r < 0) {
945 throw new TikaException("unexpected end of file: need " + param + " bytes of binary data, found " + (param-bytesToRead));
946 }
947 bytesToRead -= r;
948 }
949 } else {
950 // log some warning?
951 }
952 }
953 }
954
955 private boolean inList() {
956 return !ignoreLists && groupState.list != 0;
957 }
958
959 /**
960 * Marks the current list as pending to end. This is done to be able to merge list items of
961 * the same list within the same enclosing list tag (ie. either <code>"ul"</code>, or
962 * <code>"ol"</code>).
963 */
964 private void pendingListEnd() {
965 pendingListEnd = groupState.list;
966 groupState.list = 0;
967 }
968
969 /**
970 * Emits the end tag of a list. Uses {@link #isUnorderedList(int)} to determine the list
971 * type for the given <code>listID</code>.
972 * @param listID The ID of the list.
973 * @throws IOException
974 * @throws SAXException
975 * @throws TikaException
976 */
977 private void endList(int listID) throws IOException, SAXException, TikaException {
978 if (!ignoreLists) {
979 out.endElement(isUnorderedList(listID) ? "ul" : "ol");
980 }
981 }
982
983 /**
984 * Emits the start tag of a list. Uses {@link #isUnorderedList(int)} to determine the list
985 * type for the given <code>listID</code>.
986 * @param listID The ID of the list.
987 * @throws IOException
988 * @throws SAXException
989 * @throws TikaException
990 */
991 private void startList(int listID) throws IOException, SAXException, TikaException {
992 if (!ignoreLists) {
993 out.startElement(isUnorderedList(listID) ? "ul" : "ol");
994 }
995 }
996
997 private boolean isUnorderedList(int listID) {
998 ListDescriptor list = listTable.get(listID);
999 if (list != null) {
1000 return list.isUnordered(groupState.listLevel);
1001 }
1002 return true;
1003 }
1004
1005 private void end(String tag) throws IOException, SAXException, TikaException {
1006 out.endElement(tag);
1007 }
1008
1009 private void start(String tag) throws IOException, SAXException, TikaException {
1010 out.startElement(tag);
1011 }
1012
1013 // Handle non-parameter control word:
1014 private void processControlWord() throws IOException, SAXException, TikaException {
1015 if (inHeader) {
1016 if (equals("ansi")) {
1017 globalCharset = WINDOWS_1252;
1018 } else if (equals("pca")) {
1019 globalCharset = CP850;
1020 } else if (equals("pc")) {
1021 globalCharset = CP437;
1022 } else if (equals("mac")) {
1023 globalCharset = MAC_ROMAN;
1024 }
1025
1026 if (equals("colortbl") || equals("stylesheet") || equals("fonttbl")) {
1027 groupState.ignore = true;
1028 } else if (equals("listtable")) {
1029 currentListTable = listTable;
1030 } else if (equals("listoverridetable")) {
1031 currentListTable = listOverrideTable;
1032 }
1033
1034 if (uprState == -1) {
1035 // TODO: we can also parse \creatim, \revtim,
1036 // \printim, \version, etc.
1037 if (equals("author")) {
1038 nextMetaData = TikaCoreProperties.CREATOR;
1039 } else if (equals("title")) {
1040 nextMetaData = TikaCoreProperties.TITLE;
1041 } else if (equals("subject")) {
1042 // TODO: Move to OO subject in Tika 2.0
1043 nextMetaData = TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT;
1044 } else if (equals("keywords")) {
1045 nextMetaData = TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT;
1046 } else if (equals("category")) {
1047 nextMetaData = OfficeOpenXMLCore.CATEGORY;
1048 } else if (equals("comment")) {
1049 nextMetaData = TikaCoreProperties.COMMENTS;
1050 } else if (equals("company")) {
1051 nextMetaData = OfficeOpenXMLExtended.COMPANY;
1052 } else if (equals("manager")) {
1053 nextMetaData = OfficeOpenXMLExtended.MANAGER;
1054 } else if (equals("template")) {
1055 nextMetaData = OfficeOpenXMLExtended.TEMPLATE;
1056 } else if (equals("creatim")) {
1057 nextMetaData = TikaCoreProperties.CREATED;
1058 }
1059 }
1060
1061 if (fontTableState == 0) {
1062 // Didn't see font table yet
1063 if (equals("fonttbl")) {
1064 fontTableState = 1;
1065 fontTableDepth = groupState.depth;
1066 }
1067 } else if (fontTableState == 1) {
1068 // Inside font table
1069 if (groupState.depth < fontTableDepth) {
1070 fontTableState = 2;
1071 }
1072 }
1073
1074 // List table handling
1075 if (currentListTable != null) {
1076 if (equals("list") || equals("listoverride")) {
1077 currentList = new ListDescriptor();
1078 listTableLevel = -1;
1079 } else if (currentList != null) {
1080 if (equals("liststylename")) {
1081 currentList.isStyle = true;
1082 } else if (equals("listlevel")) {
1083 listTableLevel++;
1084 }
1085 }
1086 }
1087
1088 if (!groupState.ignore && (equals("par") || equals("pard") || equals("sect") || equals("sectd") || equals("plain") || equals("ltrch") || equals("rtlch"))) {
1089 inHeader = false;
1090 }
1091 } else {
1092 if (equals("b")) {
1093 if (!groupState.bold) {
1094 pushText();
1095 lazyStartParagraph();
1096 if (groupState.italic) {
1097 // Make sure nesting is always <b><i>
1098 end("i");
1099 }
1100 groupState.bold = true;
1101 start("b");
1102 if (groupState.italic) {
1103 start("i");
1104 }
1105 }
1106 } else if (equals("i")) {
1107 if (!groupState.italic) {
1108 pushText();
1109 lazyStartParagraph();
1110 groupState.italic = true;
1111 start("i");
1112 }
1113 }
1114 }
1115
1116 final boolean ignored = groupState.ignore;
1117
1118 if (equals("pard")) {
1119 // Reset styles
1120 pushText();
1121 if (groupState.italic) {
1122 end("i");
1123 groupState.italic = false;
1124 }
1125 if (groupState.bold) {
1126 end("b");
1127 groupState.bold = false;
1128 }
1129 if (inList()) { // && (groupStates.size() == 1 || groupStates.peekLast().list < 0))
1130 pendingListEnd();
1131 }
1132 } else if (equals("par")) {
1133 if (!ignored) {
1134 endParagraph(true);
1135 }
1136 } else if (equals("shptxt")) {
1137 pushText();
1138 // Text inside a shape
1139 groupState.ignore = false;
1140 } else if (equals("atnid")) {
1141 pushText();
1142 // Annotation ID
1143 groupState.ignore = false;
1144 } else if (equals("atnauthor")) {
1145 pushText();
1146 // Annotation author
1147 groupState.ignore = false;
1148 } else if (equals("annotation")) {
1149 pushText();
1150 // Annotation
1151 groupState.ignore = false;
1152 } else if (equals("listtext")) {
1153 groupState.ignore = true;
1154 } else if (equals("cell")) {
1155 // TODO: we should produce a table output here?
1156 //addOutputChar(' ');
1157 endParagraph(true);
1158 } else if (equals("pict")) {
1159 pushText();
1160 // TODO: create img tag? but can that support
1161 // embedded image data?
1162 groupState.ignore = true;
1163 } else if (equals("line")) {
1164 if (!ignored) {
1165 addOutputChar('\n');
1166 }
1167 } else if (equals("column")) {
1168 if (!ignored) {
1169 addOutputChar(' ');
1170 }
1171 } else if (equals("page")) {
1172 if (!ignored) {
1173 addOutputChar('\n');
1174 }
1175 } else if (equals("softline")) {
1176 if (!ignored) {
1177 addOutputChar('\n');
1178 }
1179 } else if (equals("softcolumn")) {
1180 if (!ignored) {
1181 addOutputChar(' ');
1182 }
1183 } else if (equals("softpage")) {
1184 if (!ignored) {
1185 addOutputChar('\n');
1186 }
1187 } else if (equals("tab")) {
1188 if (!ignored) {
1189 addOutputChar('\t');
1190 }
1191 } else if (equals("upr")) {
1192 uprState = 0;
1193 } else if (equals("ud") && uprState == 1) {
1194 uprState = -1;
1195 // 2nd group inside the upr destination, which
1196 // contains the unicode encoding of the text, so
1197 // we want to keep that:
1198 groupState.ignore = false;
1199 } else if (equals("bullet")) {
1200 if (!ignored) {
1201 // unicode BULLET
1202 addOutputChar('\u2022');
1203 }
1204 } else if (equals("endash")) {
1205 if (!ignored) {
1206 // unicode EN DASH
1207 addOutputChar('\u2013');
1208 }
1209 } else if (equals("emdash")) {
1210 if (!ignored) {
1211 // unicode EM DASH
1212 addOutputChar('\u2014');
1213 }
1214 } else if (equals("enspace")) {
1215 if (!ignored) {
1216 // unicode EN SPACE
1217 addOutputChar('\u2002');
1218 }
1219 } else if (equals("qmspace")) {
1220 if (!ignored) {
1221 // quarter em space -> unicode FOUR-PER-EM SPACE
1222 addOutputChar('\u2005');
1223 }
1224 } else if (equals("emspace")) {
1225 if (!ignored) {
1226 // unicode EM SPACE
1227 addOutputChar('\u2003');
1228 }
1229 } else if (equals("lquote")) {
1230 if (!ignored) {
1231 // unicode LEFT SINGLE QUOTATION MARK
1232 addOutputChar('\u2018');
1233 }
1234 } else if (equals("rquote")) {
1235 if (!ignored) {
1236 // unicode RIGHT SINGLE QUOTATION MARK
1237 addOutputChar('\u2019');
1238 }
1239 } else if (equals("ldblquote")) {
1240 if (!ignored) {
1241 // unicode LEFT DOUBLE QUOTATION MARK
1242 addOutputChar('\u201C');
1243 }
1244 } else if (equals("rdblquote")) {
1245 if (!ignored) {
1246 // unicode RIGHT DOUBLE QUOTATION MARK
1247 addOutputChar('\u201D');
1248 }
1249 } else if (equals("fldinst")) {
1250 fieldState = 1;
1251 groupState.ignore = false;
1252 } else if (equals("fldrslt") && fieldState == 2) {
1253 assert pendingURL != null;
1254 lazyStartParagraph();
1255 out.startElement("a", "href", pendingURL);
1256 pendingURL = null;
1257 fieldState = 3;
1258 groupState.ignore = false;
1259 }
1260 }
1261
1262 // Push new GroupState
1263 private void processGroupStart(PushbackInputStream in) throws IOException {
1264 ansiSkip = 0;
1265 // Push current groupState onto the stack
1266 groupStates.add(groupState);
1267
1268 // Make new GroupState
1269 groupState = new GroupState(groupState);
1270 assert groupStates.size() == groupState.depth: "size=" + groupStates.size() + " depth=" + groupState.depth;
1271
1272 if (uprState == 0) {
1273 uprState = 1;
1274 groupState.ignore = true;
1275 }
1276
1277 // Check for ignorable groups. Note that
1278 // sometimes we un-ignore within this group, eg
1279 // when handling upr escape.
1280 int b2 = in.read();
1281 if (b2 == '\\') {
1282 int b3 = in.read();
1283 if (b3 == '*') {
1284 groupState.ignore = true;
1285 }
1286 in.unread(b3);
1287 }
1288 in.unread(b2);
1289 }
1290
1291 // Pop current GroupState
1292 private void processGroupEnd() throws IOException, SAXException, TikaException {
1293 if (inHeader) {
1294 if (nextMetaData != null) {
1295 if (nextMetaData == TikaCoreProperties.CREATED) {
1296 Calendar cal = Calendar.getInstance();
1297 cal.set(year, month-1, day, hour, minute, 0);
1298 metadata.set(nextMetaData, cal.getTime());
1299 } else if (nextMetaData.isMultiValuePermitted()) {
1300 metadata.add(nextMetaData, pendingBuffer.toString());
1301 } else {
1302 metadata.set(nextMetaData, pendingBuffer.toString());
1303 }
1304 nextMetaData = null;
1305 }
1306 pendingBuffer.setLength(0);
1307 }
1308
1309 assert groupState.depth > 0;
1310 ansiSkip = 0;
1311
1312 // Be robust if RTF doc is corrupt (has too many
1313 // closing }s):
1314 // TODO: log a warning?
1315 if (groupStates.size() > 0) {
1316 // Restore group state:
1317 final GroupState outerGroupState = groupStates.removeLast();
1318
1319 // Close italic, if outer does not have italic or
1320 // bold changed:
1321 if (groupState.italic) {
1322 if (!outerGroupState.italic ||
1323 groupState.bold != outerGroupState.bold) {
1324 end("i");
1325 groupState.italic = false;
1326 }
1327 }
1328
1329 // Close bold
1330 if (groupState.bold && !outerGroupState.bold) {
1331 end("b");
1332 }
1333
1334 // Open bold
1335 if (!groupState.bold && outerGroupState.bold) {
1336 start("b");
1337 }
1338
1339 // Open italic
1340 if (!groupState.italic && outerGroupState.italic) {
1341 start("i");
1342 }
1343 groupState = outerGroupState;
1344 }
1345 assert groupStates.size() == groupState.depth;
1346
1347 if (fieldState == 1) {
1348 String s = pendingBuffer.toString().trim();
1349 pendingBuffer.setLength(0);
1350 if (s.startsWith("HYPERLINK")) {
1351 s = s.substring(9).trim();
1352 // TODO: what other instructions can be in a
1353 // HYPERLINK destination?
1354 final boolean isLocalLink = s.indexOf("\\l ") != -1;
1355 int idx = s.indexOf('"');
1356 if (idx != -1) {
1357 int idx2 = s.indexOf('"', 1+idx);
1358 if (idx2 != -1) {
1359 s = s.substring(1+idx, idx2);
1360 }
1361 }
1362 pendingURL = (isLocalLink ? "#" : "") + s;
1363 fieldState = 2;
1364 } else {
1365 fieldState = 0;
1366 }
1367
1368 // TODO: we could process the other known field
1369 // types. Right now, we will extract their text
1370 // inlined, but fail to record them in metadata
1371 // as a field value.
1372 } else if (fieldState == 3) {
1373 out.endElement("a");
1374 fieldState = 0;
1375 }
1376 }
1377 }
0 /**
1 *******************************************************************************
2 * Copyright (C) 2005-2009, International Business Machines Corporation and *
3 * others. All Rights Reserved. *
4 *******************************************************************************
5 */
6 package org.apache.tika.parser.txt;
7
8 import java.io.InputStream;
9 import java.io.Reader;
10 import java.io.IOException;
11 import java.nio.charset.Charset;
12 import java.util.ArrayList;
13 import java.util.Collections;
14 import java.util.Arrays;
15
16
17 /**
18 * <code>CharsetDetector</code> provides a facility for detecting the
19 * charset or encoding of character data in an unknown format.
20 * The input data can either be from an input stream or an array of bytes.
21 * The result of the detection operation is a list of possibly matching
22 * charsets, or, for simple use, you can just ask for a Java Reader that
23 * will work over the input data.
24 * <p/>
25 * Character set detection is at best an imprecise operation. The detection
26 * process will attempt to identify the charset that best matches the characteristics
27 * of the byte data, but the process is partly statistical in nature, and
28 * the results can not be guaranteed to always be correct.
29 * <p/>
30 * For best accuracy in charset detection, the input data should be primarily
31 * in a single language, and a minimum of a few hundred bytes worth of plain text
32 * in the language are needed. The detection process will attempt to
33 * ignore html or xml style markup that could otherwise obscure the content.
34 * <p/>
35 * @stable ICU 3.4
36 */
37 public class CharsetDetector {
38
39 // Question: Should we have getters corresponding to the setters for input text
40 // and declared encoding?
41
42 // A thought: If we were to create our own type of Java Reader, we could defer
43 // figuring out an actual charset for data that starts out with too much English
44 // only ASCII until the user actually read through to something that didn't look
45 // like 7 bit English. If nothing else ever appeared, we would never need to
46 // actually choose the "real" charset. All assuming that the application just
47 // wants the data, and doesn't care about a char set name.
48
49 /**
50 * Constructor
51 *
52 * @stable ICU 3.4
53 */
54 public CharsetDetector() {
55 }
56
57 /**
58 * Set the declared encoding for charset detection.
59 * The declared encoding of an input text is an encoding obtained
60 * from an http header or xml declaration or similar source that
61 * can be provided as additional information to the charset detector.
62 * A match between a declared encoding and a possible detected encoding
63 * will raise the quality of that detected encoding by a small delta,
64 * and will also appear as a "reason" for the match.
65 * <p/>
66 * A declared encoding that is incompatible with the input data being
67 * analyzed will not be added to the list of possible encodings.
68 *
69 * @param encoding The declared encoding
70 *
71 * @stable ICU 3.4
72 */
73 public CharsetDetector setDeclaredEncoding(String encoding) {
74 setCanonicalDeclaredEncoding(encoding);
75 return this;
76 }
77
78 /**
79 * Set the input text (byte) data whose charset is to be detected.
80 *
81 * @param in the input text of unknown encoding
82 *
83 * @return This CharsetDetector
84 *
85 * @stable ICU 3.4
86 */
87 public CharsetDetector setText(byte [] in) {
88 fRawInput = in;
89 fRawLength = in.length;
90
91 MungeInput();
92
93 return this;
94 }
95
96 private static final int kBufSize = 12000;
97
98 private static final int MAX_CONFIDENCE = 100;
99
100 /**
101 * Set the input text (byte) data whose charset is to be detected.
102 * <p/>
103 * The input stream that supplies the character data must have markSupported()
104 * == true; the charset detection process will read a small amount of data,
105 * then return the stream to its original position via
106 * the InputStream.reset() operation. The exact amount that will
107 * be read depends on the characteristics of the data itself.
108 *
109 * @param in the input text of unknown encoding
110 *
111 * @return This CharsetDetector
112 *
113 * @stable ICU 3.4
114 */
115
116 public CharsetDetector setText(InputStream in) throws IOException {
117 fInputStream = in;
118 fInputStream.mark(kBufSize);
119 fRawInput = new byte[kBufSize]; // Always make a new buffer because the
120 // previous one may have come from the caller,
121 // in which case we can't touch it.
122 fRawLength = 0;
123 int remainingLength = kBufSize;
124 while (remainingLength > 0 ) {
125 // read() may give data in smallish chunks, esp. for remote sources. Hence, this loop.
126 int bytesRead = fInputStream.read(fRawInput, fRawLength, remainingLength);
127 if (bytesRead <= 0) {
128 break;
129 }
130 fRawLength += bytesRead;
131 remainingLength -= bytesRead;
132 }
133 fInputStream.reset();
134
135 MungeInput(); // Strip html markup, collect byte stats.
136 return this;
137 }
138
139
140 /**
141 * Return the charset that best matches the supplied input data.
142 *
143 * Note though, that because the detection
144 * only looks at the start of the input data,
145 * there is a possibility that the returned charset will fail to handle
146 * the full set of input data.
147 * <p/>
148 * Raise an exception if
149 * <ul>
150 * <li>no charset appears to match the data.</li>
151 * <li>no input text has been provided</li>
152 * </ul>
153 *
154 * @return a CharsetMatch object representing the best matching charset, or
155 * <code>null</code> if there are no matches.
156 *
157 * @stable ICU 3.4
158 */
159 public CharsetMatch detect() {
160 // TODO: A better implementation would be to copy the detect loop from
161 // detectAll(), and cut it short as soon as a match with a high confidence
162 // is found. This is something to be done later, after things are otherwise
163 // working.
164 CharsetMatch matches[] = detectAll();
165
166 if (matches == null || matches.length == 0) {
167 return null;
168 }
169
170 return matches[0];
171 }
172
173 /**
174 * Return an array of all charsets that appear to be plausible
175 * matches with the input data. The array is ordered with the
176 * best quality match first.
177 * <p/>
178 * Raise an exception if
179 * <ul>
180 * <li>no charsets appear to match the input data.</li>
181 * <li>no input text has been provided</li>
182 * </ul>
183 *
184 * @return An array of CharsetMatch objects representing possibly matching charsets.
185 *
186 * @stable ICU 3.4
187 */
188 public CharsetMatch[] detectAll() {
189 CharsetRecognizer csr;
190 int i;
191 int detectResults;
192 int confidence;
193 ArrayList<CharsetMatch> matches = new ArrayList<CharsetMatch>();
194
195 // Iterate over all possible charsets, remember all that
196 // give a match quality > 0.
197 for (i=0; i<fCSRecognizers.size(); i++) {
198 csr = fCSRecognizers.get(i);
199 detectResults = csr.match(this);
200 confidence = detectResults & 0x000000ff;
201 if (confidence > 0) {
202 // Just to be safe, constrain
203 confidence = Math.min(confidence, MAX_CONFIDENCE);
204
205 // Apply charset hint.
206 if ((fDeclaredEncoding != null) && (fDeclaredEncoding.equalsIgnoreCase(csr.getName()))) {
207 // Reduce lack of confidence (delta between "sure" and current) by 50%.
208 confidence += (MAX_CONFIDENCE - confidence)/2;
209 }
210
211 CharsetMatch m = new CharsetMatch(this, csr, confidence);
212 matches.add(m);
213 }
214 }
215
216 Collections.sort(matches); // CharsetMatch compares on confidence
217 Collections.reverse(matches); // Put best match first.
218 CharsetMatch [] resultArray = new CharsetMatch[matches.size()];
219 resultArray = (CharsetMatch[]) matches.toArray(resultArray);
220 return resultArray;
221 }
222
223
224 /**
225 * Autodetect the charset of an inputStream, and return a Java Reader
226 * to access the converted input data.
227 * <p/>
228 * This is a convenience method that is equivalent to
229 * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code>
230 * <p/>
231 * For the input stream that supplies the character data, markSupported()
232 * must be true; the charset detection will read a small amount of data,
233 * then return the stream to its original position via
234 * the InputStream.reset() operation. The exact amount that will
235 * be read depends on the characteristics of the data itself.
236 *<p/>
237 * Raise an exception if no charsets appear to match the input data.
238 *
239 * @param in The source of the byte data in the unknown charset.
240 *
241 * @param declaredEncoding A declared encoding for the data, if available,
242 * or null or an empty string if none is available.
243 *
244 * @stable ICU 3.4
245 */
246 public Reader getReader(InputStream in, String declaredEncoding) {
247 setCanonicalDeclaredEncoding(declaredEncoding);
248
249 try {
250 setText(in);
251
252 CharsetMatch match = detect();
253
254 if (match == null) {
255 return null;
256 }
257
258 return match.getReader();
259 } catch (IOException e) {
260 return null;
261 }
262 }
263
264 /**
265 * Autodetect the charset of an inputStream, and return a String
266 * containing the converted input data.
267 * <p/>
268 * This is a convenience method that is equivalent to
269 * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code>
270 *<p/>
271 * Raise an exception if no charsets appear to match the input data.
272 *
273 * @param in The source of the byte data in the unknown charset.
274 *
275 * @param declaredEncoding A declared encoding for the data, if available,
276 * or null or an empty string if none is available.
277 *
278 * @stable ICU 3.4
279 */
280 public String getString(byte[] in, String declaredEncoding) {
281 setCanonicalDeclaredEncoding(declaredEncoding);
282
283 try {
284 setText(in);
285
286 CharsetMatch match = detect();
287
288 if (match == null) {
289 return null;
290 }
291
292 return match.getString(-1);
293 } catch (IOException e) {
294 return null;
295 }
296 }
297
298
299 /**
300 * Get the names of all char sets that can be recognized by the char set detector.
301 *
302 * @return an array of the names of all charsets that can be recognized
303 * by the charset detector.
304 *
305 * @stable ICU 3.4
306 */
307 public static String[] getAllDetectableCharsets() {
308 return fCharsetNames;
309 }
310
311 /**
312 * Test whether or not input filtering is enabled.
313 *
314 * @return <code>true</code> if input text will be filtered.
315 *
316 * @see #enableInputFilter
317 *
318 * @stable ICU 3.4
319 */
320 public boolean inputFilterEnabled()
321 {
322 return fStripTags;
323 }
324
325 /**
326 * Enable filtering of input text. If filtering is enabled,
327 * text within angle brackets ("<" and ">") will be removed
328 * before detection.
329 *
330 * @param filter <code>true</code> to enable input text filtering.
331 *
332 * @return The previous setting.
333 *
334 * @stable ICU 3.4
335 */
336 public boolean enableInputFilter(boolean filter)
337 {
338 boolean previous = fStripTags;
339
340 fStripTags = filter;
341
342 return previous;
343 }
344
345 /**
346 * Try to set fDeclaredEncoding to the canonical name for <encoding>, if it exists.
347 *
348 * @param encoding - name of character encoding
349 */
350 private void setCanonicalDeclaredEncoding(String encoding) {
351 Charset cs = Charset.forName(encoding);
352 if (cs != null) {
353 fDeclaredEncoding = cs.name();
354 }
355 }
356
357 /*
358 * MungeInput - after getting a set of raw input data to be analyzed, preprocess
359 * it by removing what appears to be html markup.
360 */
361 private void MungeInput() {
362 int srci = 0;
363 int dsti = 0;
364 byte b;
365 boolean inMarkup = false;
366 int openTags = 0;
367 int badTags = 0;
368
369 //
370 // html / xml markup stripping.
371 // quick and dirty, not 100% accurate, but hopefully good enough, statistically.
372 // discard everything within < brackets >
373 // Count how many total '<' and illegal (nested) '<' occur, so we can make some
374 // guess as to whether the input was actually marked up at all.
375 if (fStripTags) {
376 for (srci = 0; srci < fRawLength && dsti < fInputBytes.length; srci++) {
377 b = fRawInput[srci];
378 if (b == (byte)'<') {
379 if (inMarkup) {
380 badTags++;
381 }
382 inMarkup = true;
383 openTags++;
384 }
385
386 if (! inMarkup) {
387 fInputBytes[dsti++] = b;
388 }
389
390 if (b == (byte)'>') {
391 inMarkup = false;
392 }
393 }
394
395 fInputLen = dsti;
396 }
397
398 //
399 // If it looks like this input wasn't marked up, or if it looks like it's
400 // essentially nothing but markup abandon the markup stripping.
401 // Detection will have to work on the unstripped input.
402 //
403 if (openTags<5 || openTags/5 < badTags ||
404 (fInputLen < 100 && fRawLength>600)) {
405 int limit = fRawLength;
406
407 if (limit > kBufSize) {
408 limit = kBufSize;
409 }
410
411 for (srci=0; srci<limit; srci++) {
412 fInputBytes[srci] = fRawInput[srci];
413 }
414 fInputLen = srci;
415 }
416
417 //
418 // Tally up the byte occurence statistics.
419 // These are available for use by the various detectors.
420 //
421 Arrays.fill(fByteStats, (short)0);
422 for (srci=0; srci<fInputLen; srci++) {
423 int val = fInputBytes[srci] & 0x00ff;
424 fByteStats[val]++;
425 }
426
427 fC1Bytes = false;
428 for (int i = 0x80; i <= 0x9F; i += 1) {
429 if (fByteStats[i] != 0) {
430 fC1Bytes = true;
431 break;
432 }
433 }
434 }
435
436 /*
437 * The following items are accessed by individual CharsetRecognizers during
438 * the recognition process
439 *
440 */
441 byte[] fInputBytes = // The text to be checked. Markup will have been
442 new byte[kBufSize]; // removed if appropriate.
443
444 int fInputLen; // Length of the byte data in fInputText.
445
446 short fByteStats[] = // byte frequency statistics for the input text.
447 new short[256]; // Value is percent, not absolute.
448 // Value is rounded up, so zero really means zero occurences.
449
450 boolean fC1Bytes = // True if any bytes in the range 0x80 - 0x9F are in the input;
451 false;
452
453 String fDeclaredEncoding;
454
455
456
457 //
458 // Stuff private to CharsetDetector
459 //
460 byte[] fRawInput; // Original, untouched input bytes.
461 // If user gave us a byte array, this is it.
462 // If user gave us a stream, it's read to a
463 // buffer here.
464 int fRawLength; // Length of data in fRawInput array.
465
466 InputStream fInputStream; // User's input stream, or null if the user
467 // gave us a byte array.
468
469 boolean fStripTags = // If true, setText() will strip tags from input text.
470 false;
471
472
473 /*
474 * List of recognizers for all charsets known to the implementation.
475 */
476 private static ArrayList<CharsetRecognizer> fCSRecognizers = createRecognizers();
477 private static String [] fCharsetNames;
478
479 /*
480 * Create the singleton instances of the CharsetRecognizer classes
481 */
482 private static ArrayList<CharsetRecognizer> createRecognizers() {
483 ArrayList<CharsetRecognizer> recognizers = new ArrayList<CharsetRecognizer>();
484
485 recognizers.add(new CharsetRecog_UTF8());
486
487 recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE());
488 recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE());
489 recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE());
490 recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE());
491
492 recognizers.add(new CharsetRecog_mbcs.CharsetRecog_sjis());
493 recognizers.add(new CharsetRecog_2022.CharsetRecog_2022JP());
494 recognizers.add(new CharsetRecog_2022.CharsetRecog_2022CN());
495 recognizers.add(new CharsetRecog_2022.CharsetRecog_2022KR());
496 recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030());
497 recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp());
498 recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr());
499 recognizers.add(new CharsetRecog_mbcs.CharsetRecog_big5());
500
501 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_da());
502 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_de());
503 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_en());
504 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_es());
505 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_fr());
506 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_it());
507 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_nl());
508 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_no());
509 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_pt());
510 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_sv());
511 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_cs());
512 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_hu());
513 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_pl());
514 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_ro());
515 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru());
516 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar());
517 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_7_el());
518 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he());
519 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_he());
520 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1251());
521 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1256());
522 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_KOI8_R());
523 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr());
524
525 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl());
526 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr());
527 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl());
528 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr());
529
530 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_en());
531 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_de());
532 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_es());
533 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_fr());
534 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_it());
535 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_nl());
536
537 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM866_ru());
538
539 // Create an array of all charset names, as a side effect.
540 // Needed for the getAllDetectableCharsets() API.
541 String[] charsetNames = new String [recognizers.size()];
542 int out = 0;
543
544 for (int i = 0; i < recognizers.size(); i++) {
545 String name = ((CharsetRecognizer)recognizers.get(i)).getName();
546
547 if (out == 0 || ! name.equals(charsetNames[out - 1])) {
548 charsetNames[out++] = name;
549 }
550 }
551
552 fCharsetNames = new String[out];
553 System.arraycopy(charsetNames, 0, fCharsetNames, 0, out);
554
555 return recognizers;
556 }
557 }
0 /**
1 *******************************************************************************
2 * Copyright (C) 2005-2007, International Business Machines Corporation and *
3 * others. All Rights Reserved. *
4 *******************************************************************************
5 */
6 package org.apache.tika.parser.txt;
7
8 import java.io.ByteArrayInputStream;
9 import java.io.IOException;
10 import java.io.InputStream;
11 import java.io.InputStreamReader;
12 import java.io.Reader;
13
14
15 /**
16 * This class represents a charset that has been identified by a CharsetDetector
17 * as a possible encoding for a set of input data. From an instance of this
18 * class, you can ask for a confidence level in the charset identification,
19 * or for Java Reader or String to access the original byte data in Unicode form.
20 * <p/>
21 * Instances of this class are created only by CharsetDetectors.
22 * <p/>
23 * Note: this class has a natural ordering that is inconsistent with equals.
24 * The natural ordering is based on the match confidence value.
25 *
26 * @stable ICU 3.4
27 */
28 public class CharsetMatch implements Comparable<CharsetMatch> {
29
30
31 /**
32 * Create a java.io.Reader for reading the Unicode character data corresponding
33 * to the original byte data supplied to the Charset detect operation.
34 * <p/>
35 * CAUTION: if the source of the byte data was an InputStream, a Reader
36 * can be created for only one matching char set using this method. If more
37 * than one charset needs to be tried, the caller will need to reset
38 * the InputStream and create InputStreamReaders itself, based on the charset name.
39 *
40 * @return the Reader for the Unicode character data.
41 *
42 * @stable ICU 3.4
43 */
44 public Reader getReader() {
45 InputStream inputStream = fInputStream;
46
47 if (inputStream == null) {
48 inputStream = new ByteArrayInputStream(fRawInput, 0, fRawLength);
49 }
50
51 try {
52 inputStream.reset();
53 return new InputStreamReader(inputStream, getName());
54 } catch (IOException e) {
55 return null;
56 }
57 }
58
59 /**
60 * Create a Java String from Unicode character data corresponding
61 * to the original byte data supplied to the Charset detect operation.
62 *
63 * @return a String created from the converted input data.
64 *
65 * @stable ICU 3.4
66 */
67 public String getString() throws java.io.IOException {
68 return getString(-1);
69
70 }
71
72 /**
73 * Create a Java String from Unicode character data corresponding
74 * to the original byte data supplied to the Charset detect operation.
75 * The length of the returned string is limited to the specified size;
76 * the string will be truncated to this length if necessary. A limit value of
77 * zero or less is ignored, and treated as no limit.
78 *
79 * @param maxLength The maximum length of the String to be created when the
80 * source of the data is an input stream, or -1 for
81 * unlimited length.
82 * @return a String created from the converted input data.
83 *
84 * @stable ICU 3.4
85 */
86 public String getString(int maxLength) throws java.io.IOException {
87 String result = null;
88 if (fInputStream != null) {
89 StringBuffer sb = new StringBuffer();
90 char[] buffer = new char[1024];
91 Reader reader = getReader();
92 int max = maxLength < 0? Integer.MAX_VALUE : maxLength;
93 int bytesRead = 0;
94
95 while ((bytesRead = reader.read(buffer, 0, Math.min(max, 1024))) >= 0) {
96 sb.append(buffer, 0, bytesRead);
97 max -= bytesRead;
98 }
99
100 reader.close();
101
102 return sb.toString();
103 } else {
104 result = new String(fRawInput, getName());
105 }
106 return result;
107
108 }
109
110 /**
111 * Get an indication of the confidence in the charset detected.
112 * Confidence values range from 0-100, with larger numbers indicating
113 * a better match of the input data to the characteristics of the
114 * charset.
115 *
116 * @return the confidence in the charset match
117 *
118 * @stable ICU 3.4
119 */
120 public int getConfidence() {
121 return fConfidence;
122 }
123
124
125 /**
126 * Bit flag indicating the match is based on the encoding scheme.
127 *
128 * @see #getMatchType
129 * @stable ICU 3.4
130 */
131 static public final int ENCODING_SCHEME = 1;
132
133 /**
134 * Bit flag indicating the match is based on the presence of a BOM.
135 *
136 * @see #getMatchType
137 * @stable ICU 3.4
138 */
139 static public final int BOM = 2;
140
141 /**
142 * Bit flag indicating the match is based on the declared encoding.
143 *
144 * @see #getMatchType
145 * @stable ICU 3.4
146 */
147 static public final int DECLARED_ENCODING = 4;
148
149 /**
150 * Bit flag indicating the match is based on language statistics.
151 *
152 * @see #getMatchType
153 * @stable ICU 3.4
154 */
155 static public final int LANG_STATISTICS = 8;
156
157 /**
158 * Return flags indicating what it was about the input data
159 * that caused this charset to be considered as a possible match.
160 * The result is a bitfield containing zero or more of the flags
161 * ENCODING_SCHEME, BOM, DECLARED_ENCODING, and LANG_STATISTICS.
162 * A result of zero means no information is available.
163 * <p>
164 * Note: currently, this method always returns zero.
165 * <p>
166 *
167 * @return the type of match found for this charset.
168 *
169 * @draft ICU 3.4
170 * @provisional This API might change or be removed in a future release.
171 */
172 public int getMatchType() {
173 // TODO: create a list of enum-like constants for common combinations of types of matches.
174 return 0;
175 }
176
177 /**
178 * Get the name of the detected charset.
179 * The name will be one that can be used with other APIs on the
180 * platform that accept charset names. It is the "Canonical name"
181 * as defined by the class java.nio.charset.Charset; for
182 * charsets that are registered with the IANA charset registry,
183 * this is the MIME-preferred registered name.
184 *
185 * @see java.nio.charset.Charset
186 * @see java.io.InputStreamReader
187 *
188 * @return The name of the charset.
189 *
190 * @stable ICU 3.4
191 */
192 public String getName() {
193 return fRecognizer.getName();
194 }
195
196 /**
197 * Get the ISO code for the language of the detected charset.
198 *
199 * @return The ISO code for the language or <code>null</code> if the language cannot be determined.
200 *
201 * @stable ICU 3.4
202 */
203 public String getLanguage() {
204 return fRecognizer.getLanguage();
205 }
206
207 /**
208 * Compare to other CharsetMatch objects.
209 * Comparison is based on the match confidence value, which
210 * allows CharsetDetector.detectAll() to order its results.
211 *
212 * @param o the CharsetMatch object to compare against.
213 * @return a negative integer, zero, or a positive integer as the
214 * confidence level of this CharsetMatch
215 * is less than, equal to, or greater than that of
216 * the argument.
217 * @throws ClassCastException if the argument is not a CharsetMatch.
218 * @stable ICU 3.4
219 */
220 public int compareTo(CharsetMatch other) {
221 int compareResult = 0;
222 if (this.fConfidence > other.fConfidence) {
223 compareResult = 1;
224 } else if (this.fConfidence < other.fConfidence) {
225 compareResult = -1;
226 }
227 return compareResult;
228 }
229
230 /**
231 * compare this CharsetMatch to another based on confidence value
232 * @param o the CharsetMatch object to compare against
233 * @return true if equal
234 */
235 public boolean equals(Object o) {
236 if (o instanceof CharsetMatch) {
237 CharsetMatch that = (CharsetMatch) o;
238 return (this.fConfidence == that.fConfidence);
239 }
240
241 return false;
242 }
243
244 /**
245 * generates a hashCode based on the confidence value
246 * @return the hashCode
247 */
248 public int hashCode() {
249 return fConfidence;
250 }
251
252 /*
253 * Constructor. Implementation internal
254 */
255 CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
256 fRecognizer = rec;
257 fConfidence = conf;
258
259 // The references to the original aplication input data must be copied out
260 // of the charset recognizer to here, in case the application resets the
261 // recognizer before using this CharsetMatch.
262 if (det.fInputStream == null) {
263 // We only want the existing input byte data if it came straight from the user,
264 // not if is just the head of a stream.
265 fRawInput = det.fRawInput;
266 fRawLength = det.fRawLength;
267 }
268 fInputStream = det.fInputStream;
269 }
270
271
272 //
273 // Private Data
274 //
275 private int fConfidence;
276 private CharsetRecognizer fRecognizer;
277 private byte[] fRawInput = null; // Original, untouched input bytes.
278 // If user gave us a byte array, this is it.
279 private int fRawLength; // Length of data in fRawInput array.
280
281 private InputStream fInputStream = null; // User's input stream, or null if the user
282 // gave us a byte array.
283
284 public String toString() {
285 String s = "Match of " + fRecognizer.getName();
286 if(fRecognizer.getLanguage() != null) {
287 s += " in " + fRecognizer.getLanguage();
288 }
289 s += " with confidence " + fConfidence;
290 return s;
291 }
292 }
0 /*
1 *******************************************************************************
2 * Copyright (C) 2005 - 2008, International Business Machines Corporation and *
3 * others. All Rights Reserved. *
4 *******************************************************************************
5 */
6 package org.apache.tika.parser.txt;
7
8 /**
9 * class CharsetRecog_2022 part of the ICU charset detection imlementation.
10 * This is a superclass for the individual detectors for
11 * each of the detectable members of the ISO 2022 family
12 * of encodings.
13 *
14 * The separate classes are nested within this class.
15 *
16 * @internal
17 */
18 abstract class CharsetRecog_2022 extends CharsetRecognizer {
19
20
21 /**
22 * Matching function shared among the 2022 detectors JP, CN and KR
23 * Counts up the number of legal an unrecognized escape sequences in
24 * the sample of text, and computes a score based on the total number &
25 * the proportion that fit the encoding.
26 *
27 *
28 * @param text the byte buffer containing text to analyse
29 * @param textLen the size of the text in the byte.
30 * @param escapeSequences the byte escape sequences to test for.
31 * @return match quality, in the range of 0-100.
32 */
33 int match(byte [] text, int textLen, byte [][] escapeSequences) {
34 int i, j;
35 int escN;
36 int hits = 0;
37 int misses = 0;
38 int shifts = 0;
39 int quality;
40 scanInput:
41 for (i=0; i<textLen; i++) {
42 if (text[i] == 0x1b) {
43 checkEscapes:
44 for (escN=0; escN<escapeSequences.length; escN++) {
45 byte [] seq = escapeSequences[escN];
46
47 if ((textLen - i) < seq.length) {
48 continue checkEscapes;
49 }
50
51 for (j=1; j<seq.length; j++) {
52 if (seq[j] != text[i+j]) {
53 continue checkEscapes;
54 }
55 }
56
57 hits++;
58 i += seq.length-1;
59 continue scanInput;
60 }
61
62 misses++;
63 }
64
65 if (text[i] == 0x0e || text[i] == 0x0f) {
66 // Shift in/out
67 shifts++;
68 }
69 }
70
71 if (hits == 0) {
72 return 0;
73 }
74
75 //
76 // Initial quality is based on relative proportion of recongized vs.
77 // unrecognized escape sequences.
78 // All good: quality = 100;
79 // half or less good: quality = 0;
80 // linear inbetween.
81 quality = (100*hits - 100*misses) / (hits + misses);
82
83 // Back off quality if there were too few escape sequences seen.
84 // Include shifts in this computation, so that KR does not get penalized
85 // for having only a single Escape sequence, but many shifts.
86 if (hits+shifts < 5) {
87 quality -= (5-(hits+shifts))*10;
88 }
89
90 if (quality < 0) {
91 quality = 0;
92 }
93 return quality;
94 }
95
96
97
98
99 static class CharsetRecog_2022JP extends CharsetRecog_2022 {
100 private byte [] [] escapeSequences = {
101 {0x1b, 0x24, 0x28, 0x43}, // KS X 1001:1992
102 {0x1b, 0x24, 0x28, 0x44}, // JIS X 212-1990
103 {0x1b, 0x24, 0x40}, // JIS C 6226-1978
104 {0x1b, 0x24, 0x41}, // GB 2312-80
105 {0x1b, 0x24, 0x42}, // JIS X 208-1983
106 {0x1b, 0x26, 0x40}, // JIS X 208 1990, 1997
107 {0x1b, 0x28, 0x42}, // ASCII
108 {0x1b, 0x28, 0x48}, // JIS-Roman
109 {0x1b, 0x28, 0x49}, // Half-width katakana
110 {0x1b, 0x28, 0x4a}, // JIS-Roman
111 {0x1b, 0x2e, 0x41}, // ISO 8859-1
112 {0x1b, 0x2e, 0x46} // ISO 8859-7
113 };
114
115 String getName() {
116 return "ISO-2022-JP";
117 }
118
119 int match(CharsetDetector det) {
120 return match(det.fInputBytes, det.fInputLen, escapeSequences);
121 }
122 }
123
124 static class CharsetRecog_2022KR extends CharsetRecog_2022 {
125 private byte [] [] escapeSequences = {
126 {0x1b, 0x24, 0x29, 0x43}
127 };
128
129 String getName() {
130 return "ISO-2022-KR";
131 }
132
133 int match(CharsetDetector det) {
134 return match(det.fInputBytes, det.fInputLen, escapeSequences);
135 }
136
137 }
138
139 static class CharsetRecog_2022CN extends CharsetRecog_2022 {
140 private byte [] [] escapeSequences = {
141 {0x1b, 0x24, 0x29, 0x41}, // GB 2312-80
142 {0x1b, 0x24, 0x29, 0x47}, // CNS 11643-1992 Plane 1
143 {0x1b, 0x24, 0x2A, 0x48}, // CNS 11643-1992 Plane 2
144 {0x1b, 0x24, 0x29, 0x45}, // ISO-IR-165
145 {0x1b, 0x24, 0x2B, 0x49}, // CNS 11643-1992 Plane 3
146 {0x1b, 0x24, 0x2B, 0x4A}, // CNS 11643-1992 Plane 4
147 {0x1b, 0x24, 0x2B, 0x4B}, // CNS 11643-1992 Plane 5
148 {0x1b, 0x24, 0x2B, 0x4C}, // CNS 11643-1992 Plane 6
149 {0x1b, 0x24, 0x2B, 0x4D}, // CNS 11643-1992 Plane 7
150 {0x1b, 0x4e}, // SS2
151 {0x1b, 0x4f}, // SS3
152 };
153
154 String getName() {
155 return "ISO-2022-CN";
156 }
157
158
159 int match(CharsetDetector det) {
160 return match(det.fInputBytes, det.fInputLen, escapeSequences);
161 }
162 }
163
164 }
165
0 /**
1 *******************************************************************************
2 * Copyright (C) 2005 - 2007, International Business Machines Corporation and *
3 * others. All Rights Reserved. *
4 *******************************************************************************
5 */
6 package org.apache.tika.parser.txt;
7
8 /**
9 * Charset recognizer for UTF-8
10 *
11 * @internal
12 */
13 class CharsetRecog_UTF8 extends CharsetRecognizer {
14
15 String getName() {
16 return "UTF-8";
17 }
18
19 /* (non-Javadoc)
20 * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
21 */
22 int match(CharsetDetector det) {
23 boolean hasBOM = false;
24 int numValid = 0;
25 int numInvalid = 0;
26 byte input[] = det.fRawInput;
27 int i;
28 int trailBytes = 0;
29 int confidence;
30
31 if (det.fRawLength >= 3 &&
32 (input[0] & 0xFF) == 0xef && (input[1] & 0xFF) == 0xbb && (input[2] & 0xFF) == 0xbf) {
33 hasBOM = true;
34 }
35
36 // Scan for multi-byte sequences
37 for (i=0; i<det.fRawLength; i++) {
38 int b = input[i];
39 if ((b & 0x80) == 0) {
40 continue; // ASCII
41 }
42
43 // Hi bit on char found. Figure out how long the sequence should be
44 if ((b & 0x0e0) == 0x0c0) {
45 trailBytes = 1;
46 } else if ((b & 0x0f0) == 0x0e0) {
47 trailBytes = 2;
48 } else if ((b & 0x0f8) == 0xf0) {
49 trailBytes = 3;
50 } else {
51 numInvalid++;
52 if (numInvalid > 5) {
53 break;
54 }
55 trailBytes = 0;
56 }
57
58 // Verify that we've got the right number of trail bytes in the sequence
59 for (;;) {
60 i++;
61 if (i>=det.fRawLength) {
62 break;
63 }
64 b = input[i];
65 if ((b & 0xc0) != 0x080) {
66 numInvalid++;
67 break;
68 }
69 if (--trailBytes == 0) {
70 numValid++;
71 break;
72 }
73 }
74
75 }
76
77 // Cook up some sort of confidence score, based on presense of a BOM
78 // and the existence of valid and/or invalid multi-byte sequences.
79 confidence = 0;
80 if (hasBOM && numInvalid==0) {
81 confidence = 100;
82 } else if (hasBOM && numValid > numInvalid*10) {
83 confidence = 80;
84 } else if (numValid > 3 && numInvalid == 0) {
85 confidence = 100;
86 } else if (numValid > 0 && numInvalid == 0) {
87 confidence = 80;
88 } else if (numValid == 0 && numInvalid == 0) {
89 // Plain ASCII.
90 confidence = 10;
91 } else if (numValid > numInvalid*10) {
92 // Probably corruput utf-8 data. Valid sequences aren't likely by chance.
93 confidence = 25;
94 }
95 return confidence;
96 }
97
98 }
0 /*
1 *******************************************************************************
2 * Copyright (C) 1996-2007, International Business Machines Corporation and *
3 * others. All Rights Reserved. *
4 *******************************************************************************
5 *
6 */
7 package org.apache.tika.parser.txt;
8
9 /**
10 * This class matches UTF-16 and UTF-32, both big- and little-endian. The
11 * BOM will be used if it is present.
12 *
13 * @internal
14 */
15 abstract class CharsetRecog_Unicode extends CharsetRecognizer {
16
17 /* (non-Javadoc)
18 * @see com.ibm.icu.text.CharsetRecognizer#getName()
19 */
20 abstract String getName();
21
22 /* (non-Javadoc)
23 * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
24 */
25 abstract int match(CharsetDetector det);
26
27 static class CharsetRecog_UTF_16_BE extends CharsetRecog_Unicode
28 {
29 String getName()
30 {
31 return "UTF-16BE";
32 }
33
34 int match(CharsetDetector det)
35 {
36 byte[] input = det.fRawInput;
37
38 if (input.length>=2 && ((input[0] & 0xFF) == 0xFE && (input[1] & 0xFF) == 0xFF)) {
39 return 100;
40 }
41
42 // TODO: Do some statistics to check for unsigned UTF-16BE
43 return 0;
44 }
45 }
46
47 static class CharsetRecog_UTF_16_LE extends CharsetRecog_Unicode
48 {
49 String getName()
50 {
51 return "UTF-16LE";
52 }
53
54 int match(CharsetDetector det)
55 {
56 byte[] input = det.fRawInput;
57
58 if (input.length >= 2 && ((input[0] & 0xFF) == 0xFF && (input[1] & 0xFF) == 0xFE))
59 {
60 // An LE BOM is present.
61 if (input.length>=4 && input[2] == 0x00 && input[3] == 0x00) {
62 // It is probably UTF-32 LE, not UTF-16
63 return 0;
64 }
65 return 100;
66 }
67
68 // TODO: Do some statistics to check for unsigned UTF-16LE
69 return 0;
70 }
71 }
72
73 static abstract class CharsetRecog_UTF_32 extends CharsetRecog_Unicode
74 {
75 abstract int getChar(byte[] input, int index);
76
77 abstract String getName();
78
79 int match(CharsetDetector det)
80 {
81 byte[] input = det.fRawInput;
82 int limit = (det.fRawLength / 4) * 4;
83 int numValid = 0;
84 int numInvalid = 0;
85 boolean hasBOM = false;
86 int confidence = 0;
87
88 if (limit==0) {
89 return 0;
90 }
91 if (getChar(input, 0) == 0x0000FEFF) {
92 hasBOM = true;
93 }
94
95 for(int i = 0; i < limit; i += 4) {
96 int ch = getChar(input, i);
97
98 if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) {
99 numInvalid += 1;
100 } else {
101 numValid += 1;
102 }
103 }
104
105
106 // Cook up some sort of confidence score, based on presence of a BOM
107 // and the existence of valid and/or invalid multi-byte sequences.
108 if (hasBOM && numInvalid==0) {
109 confidence = 100;
110 } else if (hasBOM && numValid > numInvalid*10) {
111 confidence = 80;
112 } else if (numValid > 3 && numInvalid == 0) {
113 confidence = 100;
114 } else if (numValid > 0 && numInvalid == 0) {
115 confidence = 80;
116 } else if (numValid > numInvalid*10) {
117 // Probably corrupt UTF-32BE data. Valid sequences aren't likely by chance.
118 confidence = 25;
119 }
120
121 return confidence;
122 }
123 }
124
125 static class CharsetRecog_UTF_32_BE extends CharsetRecog_UTF_32
126 {
127 int getChar(byte[] input, int index)
128 {
129 return (input[index + 0] & 0xFF) << 24 | (input[index + 1] & 0xFF) << 16 |
130 (input[index + 2] & 0xFF) << 8 | (input[index + 3] & 0xFF);
131 }
132
133 String getName()
134 {
135 return "UTF-32BE";
136 }
137 }
138
139
140 static class CharsetRecog_UTF_32_LE extends CharsetRecog_UTF_32
141 {
142 int getChar(byte[] input, int index)
143 {
144 return (input[index + 3] & 0xFF) << 24 | (input[index + 2] & 0xFF) << 16 |
145 (input[index + 1] & 0xFF) << 8 | (input[index + 0] & 0xFF);
146 }
147
148 String getName()
149 {
150 return "UTF-32LE";
151 }
152 }
153 }
0 /*
1 ****************************************************************************
2 * Copyright (C) 2005-2008, International Business Machines Corporation and *
3 * others. All Rights Reserved. *
4 ****************************************************************************
5 *
6 */
7 package org.apache.tika.parser.txt;
8
9 import java.util.Arrays;
10
11 /**
12 * CharsetRecognizer implementation for Asian - double or multi-byte - charsets.
13 * Match is determined mostly by the input data adhering to the
14 * encoding scheme for the charset, and, optionally,
15 * frequency-of-occurrence of characters.
16 * <p/>
17 * Instances of this class are singletons, one per encoding
18 * being recognized. They are created in the main
19 * CharsetDetector class and kept in the global list of available
20 * encodings to be checked. The specific encoding being recognized
21 * is determined by subclass.
22 *
23 * @internal
24 */
25 abstract class CharsetRecog_mbcs extends CharsetRecognizer {
26
27 /**
28 * Get the IANA name of this charset.
29 * @return the charset name.
30 */
31 abstract String getName() ;
32
33
34 /**
35 * Test the match of this charset with the input text data
36 * which is obtained via the CharsetDetector object.
37 *
38 * @param det The CharsetDetector, which contains the input text
39 * to be checked for being in this charset.
40 * @return Two values packed into one int (Damn java, anyhow)
41 * <br/>
42 * bits 0-7: the match confidence, ranging from 0-100
43 * <br/>
44 * bits 8-15: The match reason, an enum-like value.
45 */
46 int match(CharsetDetector det, int [] commonChars) {
47 int singleByteCharCount = 0;
48 int doubleByteCharCount = 0;
49 int commonCharCount = 0;
50 int badCharCount = 0;
51 int totalCharCount = 0;
52 int confidence = 0;
53 iteratedChar iter = new iteratedChar();
54
55 detectBlock: {
56 for (iter.reset(); nextChar(iter, det);) {
57 totalCharCount++;
58 if (iter.error) {
59 badCharCount++;
60 } else {
61 long cv = iter.charValue & 0xFFFFFFFFL;
62
63 if (cv <= 0xff) {
64 singleByteCharCount++;
65 } else {
66 doubleByteCharCount++;
67 if (commonChars != null) {
68 // NOTE: This assumes that there are no 4-byte common chars.
69 if (Arrays.binarySearch(commonChars, (int) cv) >= 0) {
70 commonCharCount++;
71 }
72 }
73 }
74 }
75 if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
76 // Bail out early if the byte data is not matching the encoding scheme.
77 break detectBlock;
78 }
79 }
80
81 if (doubleByteCharCount <= 10 && badCharCount== 0) {
82 // Not many multi-byte chars.
83 if (doubleByteCharCount == 0 && totalCharCount < 10) {
84 // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
85 // We don't have enough data to have any confidence.
86 // Statistical analysis of single byte non-ASCII charcters would probably help here.
87 confidence = 0;
88 }
89 else {
90 // ASCII or ISO file? It's probably not our encoding,
91 // but is not incompatible with our encoding, so don't give it a zero.
92 confidence = 10;
93 }
94
95 break detectBlock;
96 }
97
98 //
99 // No match if there are too many characters that don't fit the encoding scheme.
100 // (should we have zero tolerance for these?)
101 //
102 if (doubleByteCharCount < 20*badCharCount) {
103 confidence = 0;
104 break detectBlock;
105 }
106
107 if (commonChars == null) {
108 // We have no statistics on frequently occuring characters.
109 // Assess confidence purely on having a reasonable number of
110 // multi-byte characters (the more the better
111 confidence = 30 + doubleByteCharCount - 20*badCharCount;
112 if (confidence > 100) {
113 confidence = 100;
114 }
115 }else {
116 //
117 // Frequency of occurence statistics exist.
118 //
119 double maxVal = Math.log((float)doubleByteCharCount / 4);
120 double scaleFactor = 90.0 / maxVal;
121 confidence = (int)(Math.log(commonCharCount+1) * scaleFactor + 10);
122 confidence = Math.min(confidence, 100);
123 }
124 } // end of detectBlock:
125
126 return confidence;
127 }
128
129 // "Character" iterated character class.
130 // Recognizers for specific mbcs encodings make their "characters" available
131 // by providing a nextChar() function that fills in an instance of iteratedChar
132 // with the next char from the input.
133 // The returned characters are not converted to Unicode, but remain as the raw
134 // bytes (concatenated into an int) from the codepage data.
135 //
136 // For Asian charsets, use the raw input rather than the input that has been
137 // stripped of markup. Detection only considers multi-byte chars, effectively
138 // stripping markup anyway, and double byte chars do occur in markup too.
139 //
140 static class iteratedChar {
141 int charValue = 0; // 1-4 bytes from the raw input data
142 int index = 0;
143 int nextIndex = 0;
144 boolean error = false;
145 boolean done = false;
146
147 void reset() {
148 charValue = 0;
149 index = -1;
150 nextIndex = 0;
151 error = false;
152 done = false;
153 }
154
155 int nextByte(CharsetDetector det) {
156 if (nextIndex >= det.fRawLength) {
157 done = true;
158 return -1;
159 }
160 int byteValue = (int)det.fRawInput[nextIndex++] & 0x00ff;
161 return byteValue;
162 }
163 }
164
165 /**
166 * Get the next character (however many bytes it is) from the input data
167 * Subclasses for specific charset encodings must implement this function
168 * to get characters according to the rules of their encoding scheme.
169 *
170 * This function is not a method of class iteratedChar only because
171 * that would require a lot of extra derived classes, which is awkward.
172 * @param it The iteratedChar "struct" into which the returned char is placed.
173 * @param det The charset detector, which is needed to get at the input byte data
174 * being iterated over.
175 * @return True if a character was returned, false at end of input.
176 */
177 abstract boolean nextChar(iteratedChar it, CharsetDetector det);
178
179
180
181
182
183 /**
184 * Shift-JIS charset recognizer.
185 *
186 */
187 static class CharsetRecog_sjis extends CharsetRecog_mbcs {
188 static int [] commonChars =
189 // TODO: This set of data comes from the character frequency-
190 // of-occurence analysis tool. The data needs to be moved
191 // into a resource and loaded from there.
192 {0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
193 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
194 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
195 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
196 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
197 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
198
199 boolean nextChar(iteratedChar it, CharsetDetector det) {
200 it.index = it.nextIndex;
201 it.error = false;
202 int firstByte;
203 firstByte = it.charValue = it.nextByte(det);
204 if (firstByte < 0) {
205 return false;
206 }
207
208 if (firstByte <= 0x7f || (firstByte>0xa0 && firstByte<=0xdf)) {
209 return true;
210 }
211
212 int secondByte = it.nextByte(det);
213 if (secondByte < 0) {
214 return false;
215 }
216 it.charValue = (firstByte << 8) | secondByte;
217 if (! ((secondByte>=0x40 && secondByte<=0x7f) || (secondByte>=0x80 && secondByte<=0xff))) {
218 // Illegal second byte value.
219 it.error = true;
220 }
221 return true;
222 }
223
224 int match(CharsetDetector det) {
225 return match(det, commonChars);
226 }
227
228 String getName() {
229 return "Shift_JIS";
230 }
231
232 public String getLanguage()
233 {
234 return "ja";
235 }
236
237
238 }
239
240
241 /**
242 * Big5 charset recognizer.
243 *
244 */
245 static class CharsetRecog_big5 extends CharsetRecog_mbcs {
246 static int [] commonChars =
247 // TODO: This set of data comes from the character frequency-
248 // of-occurence analysis tool. The data needs to be moved
249 // into a resource and loaded from there.
250 {0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
251 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
252 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
253 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
254 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
255 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
256 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
257 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
258 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
259 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
260
261 boolean nextChar(iteratedChar it, CharsetDetector det) {
262 it.index = it.nextIndex;
263 it.error = false;
264 int firstByte;
265 firstByte = it.charValue = it.nextByte(det);
266 if (firstByte < 0) {
267 return false;
268 }
269
270 if (firstByte <= 0x7f || firstByte==0xff) {
271 // single byte character.
272 return true;
273 }
274
275 int secondByte = it.nextByte(det);
276 if (secondByte < 0) {
277 return false;
278 }
279 it.charValue = (it.charValue << 8) | secondByte;
280
281 if (secondByte < 0x40 ||
282 secondByte ==0x7f ||
283 secondByte == 0xff) {
284 it.error = true;
285 }
286 return true;
287 }
288
289 int match(CharsetDetector det) {
290 return match(det, commonChars);
291 }
292
293 String getName() {
294 return "Big5";
295 }
296
297
298 public String getLanguage()
299 {
300 return "zh";
301 }
302 }
303
304
305 /**
306 * EUC charset recognizers. One abstract class that provides the common function
307 * for getting the next character according to the EUC encoding scheme,
308 * and nested derived classes for EUC_KR, EUC_JP, EUC_CN.
309 *
310 */
311 abstract static class CharsetRecog_euc extends CharsetRecog_mbcs {
312
313 /*
314 * (non-Javadoc)
315 * Get the next character value for EUC based encodings.
316 * Character "value" is simply the raw bytes that make up the character
317 * packed into an int.
318 */
319 boolean nextChar(iteratedChar it, CharsetDetector det) {
320 it.index = it.nextIndex;
321 it.error = false;
322 int firstByte = 0;
323 int secondByte = 0;
324 int thirdByte = 0;
325 //int fourthByte = 0;
326
327 buildChar: {
328 firstByte = it.charValue = it.nextByte(det);
329 if (firstByte < 0) {
330 // Ran off the end of the input data
331 it.done = true;
332 break buildChar;
333 }
334 if (firstByte <= 0x8d) {
335 // single byte char
336 break buildChar;
337 }
338
339 secondByte = it.nextByte(det);
340 it.charValue = (it.charValue << 8) | secondByte;
341
342 if (firstByte >= 0xA1 && firstByte <= 0xfe) {
343 // Two byte Char
344 if (secondByte < 0xa1) {
345 it.error = true;
346 }
347 break buildChar;
348 }
349 if (firstByte == 0x8e) {
350 // Code Set 2.
351 // In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
352 // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
353 // We don't know which we've got.
354 // Treat it like EUC-JP. If the data really was EUC-TW, the following two
355 // bytes will look like a well formed 2 byte char.
356 if (secondByte < 0xa1) {
357 it.error = true;
358 }
359 break buildChar;
360 }
361
362 if (firstByte == 0x8f) {
363 // Code set 3.
364 // Three byte total char size, two bytes of actual char value.
365 thirdByte = it.nextByte(det);
366 it.charValue = (it.charValue << 8) | thirdByte;
367 if (thirdByte < 0xa1) {
368 it.error = true;
369 }
370 }
371 }
372
373 return (it.done == false);
374 }
375
376 /**
377 * The charset recognize for EUC-JP. A singleton instance of this class
378 * is created and kept by the public CharsetDetector class
379 */
380 static class CharsetRecog_euc_jp extends CharsetRecog_euc {
381 static int [] commonChars =
382 // TODO: This set of data comes from the character frequency-
383 // of-occurence analysis tool. The data needs to be moved
384 // into a resource and loaded from there.
385 {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
386 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
387 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
388 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
389 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
390 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
391 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
392 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
393 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
394 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
395 String getName() {
396 return "EUC-JP";
397 }
398
399 int match(CharsetDetector det) {
400 return match(det, commonChars);
401 }
402
403 public String getLanguage()
404 {
405 return "ja";
406 }
407 }
408
409 /**
410 * The charset recognize for EUC-KR. A singleton instance of this class
411 * is created and kept by the public CharsetDetector class
412 */
413 static class CharsetRecog_euc_kr extends CharsetRecog_euc {
414 static int [] commonChars =
415 // TODO: This set of data comes from the character frequency-
416 // of-occurence analysis tool. The data needs to be moved
417 // into a resource and loaded from there.
418 {0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
419 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
420 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
421 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
422 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
423 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
424 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
425 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
426 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
427 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
428
429 String getName() {
430 return "EUC-KR";
431 }
432
433 int match(CharsetDetector det) {
434 return match(det, commonChars);
435 }
436
437 public String getLanguage()
438 {
439 return "ko";
440 }
441 }
442 }
443
444 /**
445 *
446 * GB-18030 recognizer. Uses simplified Chinese statistics.
447 *
448 */
449 static class CharsetRecog_gb_18030 extends CharsetRecog_mbcs {
450
451 /*
452 * (non-Javadoc)
453 * Get the next character value for EUC based encodings.
454 * Character "value" is simply the raw bytes that make up the character
455 * packed into an int.
456 */
457 boolean nextChar(iteratedChar it, CharsetDetector det) {
458 it.index = it.nextIndex;
459 it.error = false;
460 int firstByte = 0;
461 int secondByte = 0;
462 int thirdByte = 0;
463 int fourthByte = 0;
464
465 buildChar: {
466 firstByte = it.charValue = it.nextByte(det);
467
468 if (firstByte < 0) {
469 // Ran off the end of the input data
470 it.done = true;
471 break buildChar;
472 }
473
474 if (firstByte <= 0x80) {
475 // single byte char
476 break buildChar;
477 }
478
479 secondByte = it.nextByte(det);
480 it.charValue = (it.charValue << 8) | secondByte;
481
482 if (firstByte >= 0x81 && firstByte <= 0xFE) {
483 // Two byte Char
484 if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <=0xFE)) {
485 break buildChar;
486 }
487
488 // Four byte char
489 if (secondByte >= 0x30 && secondByte <= 0x39) {
490 thirdByte = it.nextByte(det);
491
492 if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
493 fourthByte = it.nextByte(det);
494
495 if (fourthByte >= 0x30 && fourthByte <= 0x39) {
496 it.charValue = (it.charValue << 16) | (thirdByte << 8) | fourthByte;
497 break buildChar;
498 }
499 }
500 }
501
502 it.error = true;
503 break buildChar;
504 }
505 }
506
507 return (it.done == false);
508 }
509
510 static int [] commonChars =
511 // TODO: This set of data comes from the character frequency-
512 // of-occurence analysis tool. The data needs to be moved
513 // into a resource and loaded from there.
514 {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
515 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
516 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
517 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
518 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
519 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
520 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
521 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
522 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
523 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
524
525
526 String getName() {
527 return "GB18030";
528 }
529
530 int match(CharsetDetector det) {
531 return match(det, commonChars);
532 }
533
534 public String getLanguage()
535 {
536 return "zh";
537 }
538 }
539
540
541 }
0 /*
1 ****************************************************************************
2 * Copyright (C) 2005-2009, International Business Machines Corporation and *
3 * others. All Rights Reserved. *
4 ************************************************************************** *
5 *
6 */
7 package org.apache.tika.parser.txt;
8
9 import java.nio.ByteBuffer;
10
11 /**
12 * This class recognizes single-byte encodings. Because the encoding scheme is so
13 * simple, language statistics are used to do the matching.
14 *
15 * The Recognizer works by first mapping from bytes in the encoding under test
16 * into that Recognizer's ngram space. Normally this means lowercasing, and
17 * excluding codepoints that don't correspond to numbers or letters.
18 * (Accented letters may or may not be ignored or normalised, depending
19 * on the needs of the ngrams.)
20 * Then, ngram analysis is run against the transformed text, and a confidence
21 * is calculated.
22 *
23 * For many of our Recognizers, we have one ngram set per language in each
24 * encoding, and do a simultaneous language+charset detection.
25 *
26 * When adding new Recognizers, the easiest way is to byte map to an existing
27 * encoding for which we have ngrams, excluding non text, and re-use the ngrams.
28 *
29 * @internal
30 */
31 abstract class CharsetRecog_sbcs extends CharsetRecognizer {
32
33 /* (non-Javadoc)
34 * @see org.apache.tika.parser.txt.CharsetRecognizer#getName()
35 */
36 abstract String getName();
37
38 /* (non-Javadoc)
39 * @see org.apache.tika.parser.txt.CharsetRecognizer#match(org.apache.tika.parser.txt.CharsetDetector)
40 */
41 abstract int match(CharsetDetector det);
42
43 static class NGramParser
44 {
45 // private static final int N_GRAM_SIZE = 3;
46 private static final int N_GRAM_MASK = 0xFFFFFF;
47
48 private int byteIndex = 0;
49 private int ngram = 0;
50
51 private int[] ngramList;
52 private byte[] byteMap;
53
54 private int ngramCount;
55 private int hitCount;
56
57 private byte spaceChar;
58
59 public NGramParser(int[] theNgramList, byte[] theByteMap)
60 {
61 ngramList = theNgramList;
62 byteMap = theByteMap;
63
64 ngram = 0;
65
66 ngramCount = hitCount = 0;
67 }
68
69 /*
70 * Binary search for value in table, which must have exactly 64 entries.
71 */
72 private static int search(int[] table, int value)
73 {
74 int index = 0;
75
76 if (table[index + 32] <= value) {
77 index += 32;
78 }
79
80 if (table[index + 16] <= value) {
81 index += 16;
82 }
83
84 if (table[index + 8] <= value) {
85 index += 8;
86 }
87
88 if (table[index + 4] <= value) {
89 index += 4;
90 }
91
92 if (table[index + 2] <= value) {
93 index += 2;
94 }
95
96 if (table[index + 1] <= value) {
97 index += 1;
98 }
99
100 if (table[index] > value) {
101 index -= 1;
102 }
103
104 if (index < 0 || table[index] != value) {
105 return -1;
106 }
107
108 return index;
109 }
110
111 private void lookup(int thisNgram)
112 {
113 ngramCount += 1;
114
115 if (search(ngramList, thisNgram) >= 0) {
116 hitCount += 1;
117 }
118
119 }
120
121 private void addByte(int b)
122 {
123 ngram = ((ngram << 8) + (b & 0xFF)) & N_GRAM_MASK;
124 lookup(ngram);
125 }
126
127 private int nextByte(CharsetDetector det)
128 {
129 if (byteIndex >= det.fInputLen) {
130 return -1;
131 }
132
133 return det.fInputBytes[byteIndex++] & 0xFF;
134 }
135
136 public int parse(CharsetDetector det)
137 {
138 return parse (det, (byte)0x20);
139 }
140 public int parse(CharsetDetector det, byte spaceCh)
141 {
142 int b;
143 boolean ignoreSpace = false;
144 this.spaceChar = spaceCh;
145
146 while ((b = nextByte(det)) >= 0) {
147 byte mb = byteMap[b];
148
149 // TODO: 0x20 might not be a space in all character sets...
150 if (mb != 0) {
151 if (!(mb == spaceChar && ignoreSpace)) {
152 addByte(mb);
153 }
154
155 ignoreSpace = (mb == spaceChar);
156 } else if(mb == 0 && b != 0) {
157 // Indicates an invalid character in the charset
158 // Bump the ngram count up a bit to indicate uncertainty
159 ngramCount += 4;
160 }
161 }
162
163 // TODO: Is this OK? The buffer could have ended in the middle of a word...
164 addByte(spaceChar);
165
166 double rawPercent = (double) hitCount / (double) ngramCount;
167
168 // if (rawPercent <= 2.0) {
169 // return 0;
170 // }
171
172 // TODO - This is a bit of a hack to take care of a case
173 // were we were getting a confidence of 135...
174 if (rawPercent > 0.33) {
175 return 98;
176 }
177
178 return (int) (rawPercent * 300.0);
179 }
180 }
181
182 protected boolean haveC1Bytes = false;
183
184 int match(CharsetDetector det, int[] ngrams, byte[] byteMap)
185 {
186 return match (det, ngrams, byteMap, (byte)0x20);
187 }
188
189 int match(CharsetDetector det, int[] ngrams, byte[] byteMap, byte spaceChar)
190 {
191 NGramParser parser = new NGramParser(ngrams, byteMap);
192
193 haveC1Bytes = det.fC1Bytes;
194
195 return parser.parse(det, spaceChar);
196 }
197
198 abstract static class CharsetRecog_8859_1 extends CharsetRecog_sbcs
199 {
200 protected static byte[] byteMap = {
201 /* 0x00-0x07 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
202 /* 0x08-0x0f */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
203 /* 0x10-0x17 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
204 /* 0x18-0x1f */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
205 /* 0x20-0x27 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
206 /* 0x28-0x2f */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
207 /* 0x30-0x37 */ (byte) 0x30, (byte) 0x31, (byte) 0x32, (byte) 0x33, (byte) 0x34, (byte) 0x35, (byte) 0x36, (byte) 0x37,
208 /* 0x38-0x3f */ (byte) 0x38, (byte) 0x39, (byte) 0x40, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
209 /* 0x40-0x47 */ (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
210 /* 0x48-0x4f */ (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
211 /* 0x50-0x57 */ (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
212 /* 0x58-0x0f */ (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
213 /* 0x60-0x67 */ (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
214 /* 0x68-0x6f */ (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
215 /* 0x70-0x77 */ (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
216 /* 0x78-0x7f */ (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
217 /* 0x80-0x87 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
218 /* 0x88-0x8f */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
219 /* 0x90-0x97 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
220 /* 0x98-0x9f */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
221 /* 0xa0-0xa7 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
222 /* 0xa8-0xaf */ (byte) 0x20, (byte) 0x20, (byte) 0xAA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
223 /* 0xb0-0xb7 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xB5, (byte) 0x20, (byte) 0x20,
224 /* 0xb8-0xbf */ (byte) 0x20, (byte) 0x20, (byte) 0xBA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
225 /* 0xc0-0xc7 */ (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
226 /* 0xc8-0xcf */ (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
227 /* 0xd0-0xd7 */ (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0x20,
228 /* 0xd8-0xdf */ (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xDF,
229 /* 0xe0-0xe7 */ (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
230 /* 0xe8-0xef */ (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
231 /* 0xf0-0xf7 */ (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0x20,
232 /* 0xf8-0xff */ (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF,
233 };
234
235 public String getName()
236 {
237 return haveC1Bytes? "windows-1252" : "ISO-8859-1";
238 }
239 }
240
241 static class CharsetRecog_8859_1_da extends CharsetRecog_8859_1
242 {
243 private static int[] ngrams = {
244 0x206166, 0x206174, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207369, 0x207374, 0x207469, 0x207669, 0x616620,
245 0x616E20, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646572, 0x646574, 0x652073, 0x656420, 0x656465, 0x656E20, 0x656E64, 0x657220, 0x657265, 0x657320,
246 0x657420, 0x666F72, 0x676520, 0x67656E, 0x676572, 0x696765, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6572, 0x6C6967, 0x6C6C65, 0x6D6564, 0x6E6465, 0x6E6520,
247 0x6E6720, 0x6E6765, 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722064, 0x722065, 0x722073, 0x726520, 0x737465, 0x742073, 0x746520, 0x746572, 0x74696C, 0x766572,
248 };
249
250 public String getLanguage()
251 {
252 return "da";
253 }
254
255 public int match(CharsetDetector det)
256 {
257 return match(det, ngrams, byteMap);
258 }
259 }
260
261 static class CharsetRecog_8859_1_de extends CharsetRecog_8859_1
262 {
263 private static int[] ngrams = {
264 0x20616E, 0x206175, 0x206265, 0x206461, 0x206465, 0x206469, 0x206569, 0x206765, 0x206861, 0x20696E, 0x206D69, 0x207363, 0x207365, 0x20756E, 0x207665, 0x20766F,
265 0x207765, 0x207A75, 0x626572, 0x636820, 0x636865, 0x636874, 0x646173, 0x64656E, 0x646572, 0x646965, 0x652064, 0x652073, 0x65696E, 0x656974, 0x656E20, 0x657220,
266 0x657320, 0x67656E, 0x68656E, 0x687420, 0x696368, 0x696520, 0x696E20, 0x696E65, 0x697420, 0x6C6963, 0x6C6C65, 0x6E2061, 0x6E2064, 0x6E2073, 0x6E6420, 0x6E6465,
267 0x6E6520, 0x6E6720, 0x6E6765, 0x6E7465, 0x722064, 0x726465, 0x726569, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x756E64, 0x756E67, 0x766572,
268 };
269
270 public String getLanguage()
271 {
272 return "de";
273 }
274
275 public int match(CharsetDetector det)
276 {
277 return match(det, ngrams, byteMap);
278 }
279 }
280
281 static class CharsetRecog_8859_1_en extends CharsetRecog_8859_1
282 {
283 private static int[] ngrams = {
284 0x206120, 0x20616E, 0x206265, 0x20636F, 0x20666F, 0x206861, 0x206865, 0x20696E, 0x206D61, 0x206F66, 0x207072, 0x207265, 0x207361, 0x207374, 0x207468, 0x20746F,
285 0x207768, 0x616964, 0x616C20, 0x616E20, 0x616E64, 0x617320, 0x617420, 0x617465, 0x617469, 0x642061, 0x642074, 0x652061, 0x652073, 0x652074, 0x656420, 0x656E74,
286 0x657220, 0x657320, 0x666F72, 0x686174, 0x686520, 0x686572, 0x696420, 0x696E20, 0x696E67, 0x696F6E, 0x697320, 0x6E2061, 0x6E2074, 0x6E6420, 0x6E6720, 0x6E7420,
287 0x6F6620, 0x6F6E20, 0x6F7220, 0x726520, 0x727320, 0x732061, 0x732074, 0x736169, 0x737420, 0x742074, 0x746572, 0x746861, 0x746865, 0x74696F, 0x746F20, 0x747320,
288 };
289
290 public String getLanguage()
291 {
292 return "en";
293 }
294
295 public int match(CharsetDetector det)
296 {
297 return match(det, ngrams, byteMap);
298 }
299 }
300
301 static class CharsetRecog_8859_1_es extends CharsetRecog_8859_1
302 {
303 private static int[] ngrams = {
304 0x206120, 0x206361, 0x20636F, 0x206465, 0x20656C, 0x20656E, 0x206573, 0x20696E, 0x206C61, 0x206C6F, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
305 0x20756E, 0x207920, 0x612063, 0x612064, 0x612065, 0x61206C, 0x612070, 0x616369, 0x61646F, 0x616C20, 0x617220, 0x617320, 0x6369F3, 0x636F6E, 0x646520, 0x64656C,
306 0x646F20, 0x652064, 0x652065, 0x65206C, 0x656C20, 0x656E20, 0x656E74, 0x657320, 0x657374, 0x69656E, 0x69F36E, 0x6C6120, 0x6C6F73, 0x6E2065, 0x6E7465, 0x6F2064,
307 0x6F2065, 0x6F6E20, 0x6F7220, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732064, 0x732065, 0x732070, 0x736520, 0x746520, 0x746F20, 0x756520, 0xF36E20,
308 };
309
310 public String getLanguage()
311 {
312 return "es";
313 }
314
315 public int match(CharsetDetector det)
316 {
317 return match(det, ngrams, byteMap);
318 }
319 }
320
321 static class CharsetRecog_8859_1_fr extends CharsetRecog_8859_1
322 {
323 private static int[] ngrams = {
324 0x206175, 0x20636F, 0x206461, 0x206465, 0x206475, 0x20656E, 0x206574, 0x206C61, 0x206C65, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207365, 0x20736F, 0x20756E,
325 0x20E020, 0x616E74, 0x617469, 0x636520, 0x636F6E, 0x646520, 0x646573, 0x647520, 0x652061, 0x652063, 0x652064, 0x652065, 0x65206C, 0x652070, 0x652073, 0x656E20,
326 0x656E74, 0x657220, 0x657320, 0x657420, 0x657572, 0x696F6E, 0x697320, 0x697420, 0x6C6120, 0x6C6520, 0x6C6573, 0x6D656E, 0x6E2064, 0x6E6520, 0x6E7320, 0x6E7420,
327 0x6F6E20, 0x6F6E74, 0x6F7572, 0x717565, 0x72206C, 0x726520, 0x732061, 0x732064, 0x732065, 0x73206C, 0x732070, 0x742064, 0x746520, 0x74696F, 0x756520, 0x757220,
328 };
329
330 public String getLanguage()
331 {
332 return "fr";
333 }
334
335 public int match(CharsetDetector det)
336 {
337 return match(det, ngrams, byteMap);
338 }
339 }
340
341 static class CharsetRecog_8859_1_it extends CharsetRecog_8859_1
342 {
343 private static int[] ngrams = {
344 0x20616C, 0x206368, 0x20636F, 0x206465, 0x206469, 0x206520, 0x20696C, 0x20696E, 0x206C61, 0x207065, 0x207072, 0x20756E, 0x612063, 0x612064, 0x612070, 0x612073,
345 0x61746F, 0x636865, 0x636F6E, 0x64656C, 0x646920, 0x652061, 0x652063, 0x652064, 0x652069, 0x65206C, 0x652070, 0x652073, 0x656C20, 0x656C6C, 0x656E74, 0x657220,
346 0x686520, 0x692061, 0x692063, 0x692064, 0x692073, 0x696120, 0x696C20, 0x696E20, 0x696F6E, 0x6C6120, 0x6C6520, 0x6C6920, 0x6C6C61, 0x6E6520, 0x6E6920, 0x6E6F20,
347 0x6E7465, 0x6F2061, 0x6F2064, 0x6F2069, 0x6F2073, 0x6F6E20, 0x6F6E65, 0x706572, 0x726120, 0x726520, 0x736920, 0x746120, 0x746520, 0x746920, 0x746F20, 0x7A696F,
348 };
349
350 public String getLanguage()
351 {
352 return "it";
353 }
354
355 public int match(CharsetDetector det)
356 {
357 return match(det, ngrams, byteMap);
358 }
359 }
360
361 static class CharsetRecog_8859_1_nl extends CharsetRecog_8859_1
362 {
363 private static int[] ngrams = {
364 0x20616C, 0x206265, 0x206461, 0x206465, 0x206469, 0x206565, 0x20656E, 0x206765, 0x206865, 0x20696E, 0x206D61, 0x206D65, 0x206F70, 0x207465, 0x207661, 0x207665,
365 0x20766F, 0x207765, 0x207A69, 0x61616E, 0x616172, 0x616E20, 0x616E64, 0x617220, 0x617420, 0x636874, 0x646520, 0x64656E, 0x646572, 0x652062, 0x652076, 0x65656E,
366 0x656572, 0x656E20, 0x657220, 0x657273, 0x657420, 0x67656E, 0x686574, 0x696520, 0x696E20, 0x696E67, 0x697320, 0x6E2062, 0x6E2064, 0x6E2065, 0x6E2068, 0x6E206F,
367 0x6E2076, 0x6E6465, 0x6E6720, 0x6F6E64, 0x6F6F72, 0x6F7020, 0x6F7220, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x76616E, 0x766572, 0x766F6F,
368 };
369
370 public String getLanguage()
371 {
372 return "nl";
373 }
374
375 public int match(CharsetDetector det)
376 {
377 return match(det, ngrams, byteMap);
378 }
379 }
380
381 static class CharsetRecog_8859_1_no extends CharsetRecog_8859_1
382 {
383 private static int[] ngrams = {
384 0x206174, 0x206176, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207365, 0x20736B, 0x20736F, 0x207374, 0x207469,
385 0x207669, 0x20E520, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646574, 0x652073, 0x656420, 0x656E20, 0x656E65, 0x657220, 0x657265, 0x657420, 0x657474,
386 0x666F72, 0x67656E, 0x696B6B, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6520, 0x6C6C65, 0x6D6564, 0x6D656E, 0x6E2073, 0x6E6520, 0x6E6720, 0x6E6765, 0x6E6E65,
387 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722073, 0x726520, 0x736F6D, 0x737465, 0x742073, 0x746520, 0x74656E, 0x746572, 0x74696C, 0x747420, 0x747465, 0x766572,
388 };
389
390 public String getLanguage()
391 {
392 return "no";
393 }
394
395 public int match(CharsetDetector det)
396 {
397 return match(det, ngrams, byteMap);
398 }
399 }
400
401 static class CharsetRecog_8859_1_pt extends CharsetRecog_8859_1
402 {
403 private static int[] ngrams = {
404 0x206120, 0x20636F, 0x206461, 0x206465, 0x20646F, 0x206520, 0x206573, 0x206D61, 0x206E6F, 0x206F20, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
405 0x20756D, 0x612061, 0x612063, 0x612064, 0x612070, 0x616465, 0x61646F, 0x616C20, 0x617220, 0x617261, 0x617320, 0x636F6D, 0x636F6E, 0x646120, 0x646520, 0x646F20,
406 0x646F73, 0x652061, 0x652064, 0x656D20, 0x656E74, 0x657320, 0x657374, 0x696120, 0x696361, 0x6D656E, 0x6E7465, 0x6E746F, 0x6F2061, 0x6F2063, 0x6F2064, 0x6F2065,
407 0x6F2070, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732061, 0x732064, 0x732065, 0x732070, 0x737461, 0x746520, 0x746F20, 0x756520, 0xE36F20, 0xE7E36F,
408 };
409
410 public String getLanguage()
411 {
412 return "pt";
413 }
414
415 public int match(CharsetDetector det)
416 {
417 return match(det, ngrams, byteMap);
418 }
419 }
420
421 static class CharsetRecog_8859_1_sv extends CharsetRecog_8859_1
422 {
423 private static int[] ngrams = {
424 0x206174, 0x206176, 0x206465, 0x20656E, 0x2066F6, 0x206861, 0x206920, 0x20696E, 0x206B6F, 0x206D65, 0x206F63, 0x2070E5, 0x20736B, 0x20736F, 0x207374, 0x207469,
425 0x207661, 0x207669, 0x20E472, 0x616465, 0x616E20, 0x616E64, 0x617220, 0x617474, 0x636820, 0x646520, 0x64656E, 0x646572, 0x646574, 0x656420, 0x656E20, 0x657220,
426 0x657420, 0x66F672, 0x67656E, 0x696C6C, 0x696E67, 0x6B6120, 0x6C6C20, 0x6D6564, 0x6E2073, 0x6E6120, 0x6E6465, 0x6E6720, 0x6E6765, 0x6E696E, 0x6F6368, 0x6F6D20,
427 0x6F6E20, 0x70E520, 0x722061, 0x722073, 0x726120, 0x736B61, 0x736F6D, 0x742073, 0x746120, 0x746520, 0x746572, 0x74696C, 0x747420, 0x766172, 0xE47220, 0xF67220,
428 };
429
430 public String getLanguage()
431 {
432 return "sv";
433 }
434
435 public int match(CharsetDetector det)
436 {
437 return match(det, ngrams, byteMap);
438 }
439 }
440
441 abstract static class CharsetRecog_8859_2 extends CharsetRecog_sbcs
442 {
443 protected static byte[] byteMap = {
444 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
445 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
446 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
447 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
448 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
449 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
450 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
451 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
452 (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
453 (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
454 (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
455 (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
456 (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
457 (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
458 (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
459 (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
460 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
461 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
462 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
463 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
464 (byte) 0x20, (byte) 0xB1, (byte) 0x20, (byte) 0xB3, (byte) 0x20, (byte) 0xB5, (byte) 0xB6, (byte) 0x20,
465 (byte) 0x20, (byte) 0xB9, (byte) 0xBA, (byte) 0xBB, (byte) 0xBC, (byte) 0x20, (byte) 0xBE, (byte) 0xBF,
466 (byte) 0x20, (byte) 0xB1, (byte) 0x20, (byte) 0xB3, (byte) 0x20, (byte) 0xB5, (byte) 0xB6, (byte) 0xB7,
467 (byte) 0x20, (byte) 0xB9, (byte) 0xBA, (byte) 0xBB, (byte) 0xBC, (byte) 0x20, (byte) 0xBE, (byte) 0xBF,
468 (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
469 (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
470 (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0x20,
471 (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xDF,
472 (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
473 (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
474 (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0x20,
475 (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0x20,
476 };
477
478 public String getName()
479 {
480 return haveC1Bytes? "windows-1250" : "ISO-8859-2";
481 }
482 }
483
484 static class CharsetRecog_8859_2_cs extends CharsetRecog_8859_2
485 {
486 private static int[] ngrams = {
487 0x206120, 0x206279, 0x20646F, 0x206A65, 0x206E61, 0x206E65, 0x206F20, 0x206F64, 0x20706F, 0x207072, 0x2070F8, 0x20726F, 0x207365, 0x20736F, 0x207374, 0x20746F,
488 0x207620, 0x207679, 0x207A61, 0x612070, 0x636520, 0x636820, 0x652070, 0x652073, 0x652076, 0x656D20, 0x656EED, 0x686F20, 0x686F64, 0x697374, 0x6A6520, 0x6B7465,
489 0x6C6520, 0x6C6920, 0x6E6120, 0x6EE920, 0x6EEC20, 0x6EED20, 0x6F2070, 0x6F646E, 0x6F6A69, 0x6F7374, 0x6F7520, 0x6F7661, 0x706F64, 0x706F6A, 0x70726F, 0x70F865,
490 0x736520, 0x736F75, 0x737461, 0x737469, 0x73746E, 0x746572, 0x746EED, 0x746F20, 0x752070, 0xBE6520, 0xE16EED, 0xE9686F, 0xED2070, 0xED2073, 0xED6D20, 0xF86564,
491 };
492
493 public String getLanguage()
494 {
495 return "cs";
496 }
497
498 public int match(CharsetDetector det)
499 {
500 return match(det, ngrams, byteMap);
501 }
502 }
503
504 static class CharsetRecog_8859_2_hu extends CharsetRecog_8859_2
505 {
506 private static int[] ngrams = {
507 0x206120, 0x20617A, 0x206265, 0x206567, 0x20656C, 0x206665, 0x206861, 0x20686F, 0x206973, 0x206B65, 0x206B69, 0x206BF6, 0x206C65, 0x206D61, 0x206D65, 0x206D69,
508 0x206E65, 0x20737A, 0x207465, 0x20E973, 0x612061, 0x61206B, 0x61206D, 0x612073, 0x616B20, 0x616E20, 0x617A20, 0x62616E, 0x62656E, 0x656779, 0x656B20, 0x656C20,
509 0x656C65, 0x656D20, 0x656E20, 0x657265, 0x657420, 0x657465, 0x657474, 0x677920, 0x686F67, 0x696E74, 0x697320, 0x6B2061, 0x6BF67A, 0x6D6567, 0x6D696E, 0x6E2061,
510 0x6E616B, 0x6E656B, 0x6E656D, 0x6E7420, 0x6F6779, 0x732061, 0x737A65, 0x737A74, 0x737AE1, 0x73E967, 0x742061, 0x747420, 0x74E173, 0x7A6572, 0xE16E20, 0xE97320,
511 };
512
513 public String getLanguage()
514 {
515 return "hu";
516 }
517
518 public int match(CharsetDetector det)
519 {
520 return match(det, ngrams, byteMap);
521 }
522 }
523
524 static class CharsetRecog_8859_2_pl extends CharsetRecog_8859_2
525 {
526 private static int[] ngrams = {
527 0x20637A, 0x20646F, 0x206920, 0x206A65, 0x206B6F, 0x206D61, 0x206D69, 0x206E61, 0x206E69, 0x206F64, 0x20706F, 0x207072, 0x207369, 0x207720, 0x207769, 0x207779,
528 0x207A20, 0x207A61, 0x612070, 0x612077, 0x616E69, 0x636820, 0x637A65, 0x637A79, 0x646F20, 0x647A69, 0x652070, 0x652073, 0x652077, 0x65207A, 0x65676F, 0x656A20,
529 0x656D20, 0x656E69, 0x676F20, 0x696120, 0x696520, 0x69656A, 0x6B6120, 0x6B6920, 0x6B6965, 0x6D6965, 0x6E6120, 0x6E6961, 0x6E6965, 0x6F2070, 0x6F7761, 0x6F7769,
530 0x706F6C, 0x707261, 0x70726F, 0x70727A, 0x727A65, 0x727A79, 0x7369EA, 0x736B69, 0x737461, 0x776965, 0x796368, 0x796D20, 0x7A6520, 0x7A6965, 0x7A7920, 0xF37720,
531 };
532
533 public String getLanguage()
534 {
535 return "pl";
536 }
537
538 public int match(CharsetDetector det)
539 {
540 return match(det, ngrams, byteMap);
541 }
542 }
543
544 static class CharsetRecog_8859_2_ro extends CharsetRecog_8859_2
545 {
546 private static int[] ngrams = {
547 0x206120, 0x206163, 0x206361, 0x206365, 0x20636F, 0x206375, 0x206465, 0x206469, 0x206C61, 0x206D61, 0x207065, 0x207072, 0x207365, 0x2073E3, 0x20756E, 0x20BA69,
548 0x20EE6E, 0x612063, 0x612064, 0x617265, 0x617420, 0x617465, 0x617520, 0x636172, 0x636F6E, 0x637520, 0x63E320, 0x646520, 0x652061, 0x652063, 0x652064, 0x652070,
549 0x652073, 0x656120, 0x656920, 0x656C65, 0x656E74, 0x657374, 0x692061, 0x692063, 0x692064, 0x692070, 0x696520, 0x696920, 0x696E20, 0x6C6120, 0x6C6520, 0x6C6F72,
550 0x6C7569, 0x6E6520, 0x6E7472, 0x6F7220, 0x70656E, 0x726520, 0x726561, 0x727520, 0x73E320, 0x746520, 0x747275, 0x74E320, 0x756920, 0x756C20, 0xBA6920, 0xEE6E20,
551 };
552
553 public String getLanguage()
554 {
555 return "ro";
556 }
557
558 public int match(CharsetDetector det)
559 {
560 return match(det, ngrams, byteMap);
561 }
562 }
563
564 abstract static class CharsetRecog_8859_5 extends CharsetRecog_sbcs
565 {
566 protected static byte[] byteMap = {
567 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
568 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
569 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
570 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
571 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
572 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
573 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
574 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
575 (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
576 (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
577 (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
578 (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
579 (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
580 (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
581 (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
582 (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
583 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
584 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
585 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
586 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
587 (byte) 0x20, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
588 (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0x20, (byte) 0xFE, (byte) 0xFF,
589 (byte) 0xD0, (byte) 0xD1, (byte) 0xD2, (byte) 0xD3, (byte) 0xD4, (byte) 0xD5, (byte) 0xD6, (byte) 0xD7,
590 (byte) 0xD8, (byte) 0xD9, (byte) 0xDA, (byte) 0xDB, (byte) 0xDC, (byte) 0xDD, (byte) 0xDE, (byte) 0xDF,
591 (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
592 (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
593 (byte) 0xD0, (byte) 0xD1, (byte) 0xD2, (byte) 0xD3, (byte) 0xD4, (byte) 0xD5, (byte) 0xD6, (byte) 0xD7,
594 (byte) 0xD8, (byte) 0xD9, (byte) 0xDA, (byte) 0xDB, (byte) 0xDC, (byte) 0xDD, (byte) 0xDE, (byte) 0xDF,
595 (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
596 (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
597 (byte) 0x20, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
598 (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0x20, (byte) 0xFE, (byte) 0xFF,
599 };
600
601 public String getName()
602 {
603 return "ISO-8859-5";
604 }
605 }
606
607 static class CharsetRecog_8859_5_ru extends CharsetRecog_8859_5
608 {
609 private static int[] ngrams = {
610 0x20D220, 0x20D2DE, 0x20D4DE, 0x20D7D0, 0x20D820, 0x20DAD0, 0x20DADE, 0x20DDD0, 0x20DDD5, 0x20DED1, 0x20DFDE, 0x20DFE0, 0x20E0D0, 0x20E1DE, 0x20E1E2, 0x20E2DE,
611 0x20E7E2, 0x20EDE2, 0xD0DDD8, 0xD0E2EC, 0xD3DE20, 0xD5DBEC, 0xD5DDD8, 0xD5E1E2, 0xD5E220, 0xD820DF, 0xD8D520, 0xD8D820, 0xD8EF20, 0xDBD5DD, 0xDBD820, 0xDBECDD,
612 0xDDD020, 0xDDD520, 0xDDD8D5, 0xDDD8EF, 0xDDDE20, 0xDDDED2, 0xDE20D2, 0xDE20DF, 0xDE20E1, 0xDED220, 0xDED2D0, 0xDED3DE, 0xDED920, 0xDEDBEC, 0xDEDC20, 0xDEE1E2,
613 0xDFDEDB, 0xDFE0D5, 0xDFE0D8, 0xDFE0DE, 0xE0D0D2, 0xE0D5D4, 0xE1E2D0, 0xE1E2D2, 0xE1E2D8, 0xE1EF20, 0xE2D5DB, 0xE2DE20, 0xE2DEE0, 0xE2EC20, 0xE7E2DE, 0xEBE520,
614 };
615
616 public String getLanguage()
617 {
618 return "ru";
619 }
620
621 public int match(CharsetDetector det)
622 {
623 return match(det, ngrams, byteMap);
624 }
625 }
626
627 abstract static class CharsetRecog_8859_6 extends CharsetRecog_sbcs
628 {
629 protected static byte[] byteMap = {
630 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
631 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
632 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
633 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
634 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
635 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
636 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
637 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
638 (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
639 (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
640 (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
641 (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
642 (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
643 (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
644 (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
645 (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
646 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
647 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
648 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
649 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
650 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
651 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
652 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
653 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
654 (byte) 0x20, (byte) 0xC1, (byte) 0xC2, (byte) 0xC3, (byte) 0xC4, (byte) 0xC5, (byte) 0xC6, (byte) 0xC7,
655 (byte) 0xC8, (byte) 0xC9, (byte) 0xCA, (byte) 0xCB, (byte) 0xCC, (byte) 0xCD, (byte) 0xCE, (byte) 0xCF,
656 (byte) 0xD0, (byte) 0xD1, (byte) 0xD2, (byte) 0xD3, (byte) 0xD4, (byte) 0xD5, (byte) 0xD6, (byte) 0xD7,
657 (byte) 0xD8, (byte) 0xD9, (byte) 0xDA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
658 (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
659 (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
660 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
661 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
662 };
663
664 public String getName()
665 {
666 return "ISO-8859-6";
667 }
668 }
669
670 static class CharsetRecog_8859_6_ar extends CharsetRecog_8859_6
671 {
672 private static int[] ngrams = {
673 0x20C7E4, 0x20C7E6, 0x20C8C7, 0x20D9E4, 0x20E1EA, 0x20E4E4, 0x20E5E6, 0x20E8C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E420, 0xC7E4C3, 0xC7E4C7, 0xC7E4C8,
674 0xC7E4CA, 0xC7E4CC, 0xC7E4CD, 0xC7E4CF, 0xC7E4D3, 0xC7E4D9, 0xC7E4E2, 0xC7E4E5, 0xC7E4E8, 0xC7E4EA, 0xC7E520, 0xC7E620, 0xC7E6CA, 0xC820C7, 0xC920C7, 0xC920E1,
675 0xC920E4, 0xC920E5, 0xC920E8, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xD920C7, 0xD9E4E9, 0xE1EA20, 0xE420C7, 0xE4C920, 0xE4E920, 0xE4EA20,
676 0xE520C7, 0xE5C720, 0xE5C920, 0xE5E620, 0xE620C7, 0xE720C7, 0xE7C720, 0xE8C7E4, 0xE8E620, 0xE920C7, 0xEA20C7, 0xEA20E5, 0xEA20E8, 0xEAC920, 0xEAD120, 0xEAE620,
677 };
678
679 public String getLanguage()
680 {
681 return "ar";
682 }
683
684 public int match(CharsetDetector det)
685 {
686 return match(det, ngrams, byteMap);
687 }
688 }
689
690 abstract static class CharsetRecog_8859_7 extends CharsetRecog_sbcs
691 {
692 protected static byte[] byteMap = {
693 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
694 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
695 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
696 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
697 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
698 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
699 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
700 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
701 (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
702 (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
703 (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
704 (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
705 (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
706 (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
707 (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
708 (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
709 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
710 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
711 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
712 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
713 (byte) 0x20, (byte) 0xA1, (byte) 0xA2, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
714 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
715 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xDC, (byte) 0x20,
716 (byte) 0xDD, (byte) 0xDE, (byte) 0xDF, (byte) 0x20, (byte) 0xFC, (byte) 0x20, (byte) 0xFD, (byte) 0xFE,
717 (byte) 0xC0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
718 (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
719 (byte) 0xF0, (byte) 0xF1, (byte) 0x20, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
720 (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xDC, (byte) 0xDD, (byte) 0xDE, (byte) 0xDF,
721 (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
722 (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
723 (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
724 (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0x20,
725 };
726
727 public String getName()
728 {
729 return haveC1Bytes? "windows-1253" : "ISO-8859-7";
730 }
731 }
732
733 static class CharsetRecog_8859_7_el extends CharsetRecog_8859_7
734 {
735 private static int[] ngrams = {
736 0x20E1ED, 0x20E1F0, 0x20E3E9, 0x20E4E9, 0x20E5F0, 0x20E720, 0x20EAE1, 0x20ECE5, 0x20EDE1, 0x20EF20, 0x20F0E1, 0x20F0EF, 0x20F0F1, 0x20F3F4, 0x20F3F5, 0x20F4E7,
737 0x20F4EF, 0xDFE120, 0xE120E1, 0xE120F4, 0xE1E920, 0xE1ED20, 0xE1F0FC, 0xE1F220, 0xE3E9E1, 0xE5E920, 0xE5F220, 0xE720F4, 0xE7ED20, 0xE7F220, 0xE920F4, 0xE9E120,
738 0xE9EADE, 0xE9F220, 0xEAE1E9, 0xEAE1F4, 0xECE520, 0xED20E1, 0xED20E5, 0xED20F0, 0xEDE120, 0xEFF220, 0xEFF520, 0xF0EFF5, 0xF0F1EF, 0xF0FC20, 0xF220E1, 0xF220E5,
739 0xF220EA, 0xF220F0, 0xF220F4, 0xF3E520, 0xF3E720, 0xF3F4EF, 0xF4E120, 0xF4E1E9, 0xF4E7ED, 0xF4E7F2, 0xF4E9EA, 0xF4EF20, 0xF4EFF5, 0xF4F9ED, 0xF9ED20, 0xFEED20,
740 };
741
742 public String getLanguage()
743 {
744 return "el";
745 }
746
747 public int match(CharsetDetector det)
748 {
749 return match(det, ngrams, byteMap);
750 }
751 }
752
753 abstract static class CharsetRecog_8859_8 extends CharsetRecog_sbcs
754 {
755 protected static byte[] byteMap = {
756 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
757 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
758 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
759 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
760 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
761 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
762 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
763 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
764 (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
765 (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
766 (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
767 (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
768 (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
769 (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
770 (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
771 (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
772 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
773 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
774 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
775 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
776 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
777 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
778 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xB5, (byte) 0x20, (byte) 0x20,
779 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
780 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
781 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
782 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
783 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
784 (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
785 (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
786 (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
787 (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
788 };
789
790 public String getName()
791 {
792 return haveC1Bytes? "windows-1255" : "ISO-8859-8";
793 }
794 }
795
796 static class CharsetRecog_8859_8_I_he extends CharsetRecog_8859_8
797 {
798 private static int[] ngrams = {
799 0x20E0E5, 0x20E0E7, 0x20E0E9, 0x20E0FA, 0x20E1E9, 0x20E1EE, 0x20E4E0, 0x20E4E5, 0x20E4E9, 0x20E4EE, 0x20E4F2, 0x20E4F9, 0x20E4FA, 0x20ECE0, 0x20ECE4, 0x20EEE0,
800 0x20F2EC, 0x20F9EC, 0xE0FA20, 0xE420E0, 0xE420E1, 0xE420E4, 0xE420EC, 0xE420EE, 0xE420F9, 0xE4E5E0, 0xE5E020, 0xE5ED20, 0xE5EF20, 0xE5F820, 0xE5FA20, 0xE920E4,
801 0xE9E420, 0xE9E5FA, 0xE9E9ED, 0xE9ED20, 0xE9EF20, 0xE9F820, 0xE9FA20, 0xEC20E0, 0xEC20E4, 0xECE020, 0xECE420, 0xED20E0, 0xED20E1, 0xED20E4, 0xED20EC, 0xED20EE,
802 0xED20F9, 0xEEE420, 0xEF20E4, 0xF0E420, 0xF0E920, 0xF0E9ED, 0xF2EC20, 0xF820E4, 0xF8E9ED, 0xF9EC20, 0xFA20E0, 0xFA20E1, 0xFA20E4, 0xFA20EC, 0xFA20EE, 0xFA20F9,
803 };
804
805 public String getName()
806 {
807 return haveC1Bytes? "windows-1255" : /*"ISO-8859-8-I"*/ "ISO-8859-8";
808 }
809
810 public String getLanguage()
811 {
812 return "he";
813 }
814
815 public int match(CharsetDetector det)
816 {
817 return match(det, ngrams, byteMap);
818 }
819 }
820
821 static class CharsetRecog_8859_8_he extends CharsetRecog_8859_8
822 {
823 private static int[] ngrams = {
824 0x20E0E5, 0x20E0EC, 0x20E4E9, 0x20E4EC, 0x20E4EE, 0x20E4F0, 0x20E9F0, 0x20ECF2, 0x20ECF9, 0x20EDE5, 0x20EDE9, 0x20EFE5, 0x20EFE9, 0x20F8E5, 0x20F8E9, 0x20FAE0,
825 0x20FAE5, 0x20FAE9, 0xE020E4, 0xE020EC, 0xE020ED, 0xE020FA, 0xE0E420, 0xE0E5E4, 0xE0EC20, 0xE0EE20, 0xE120E4, 0xE120ED, 0xE120FA, 0xE420E4, 0xE420E9, 0xE420EC,
826 0xE420ED, 0xE420EF, 0xE420F8, 0xE420FA, 0xE4EC20, 0xE5E020, 0xE5E420, 0xE7E020, 0xE9E020, 0xE9E120, 0xE9E420, 0xEC20E4, 0xEC20ED, 0xEC20FA, 0xECF220, 0xECF920,
827 0xEDE9E9, 0xEDE9F0, 0xEDE9F8, 0xEE20E4, 0xEE20ED, 0xEE20FA, 0xEEE120, 0xEEE420, 0xF2E420, 0xF920E4, 0xF920ED, 0xF920FA, 0xF9E420, 0xFAE020, 0xFAE420, 0xFAE5E9,
828 };
829
830 public String getLanguage()
831 {
832 return "he";
833 }
834
835 public int match(CharsetDetector det)
836 {
837 return match(det, ngrams, byteMap);
838 }
839 }
840
841 abstract static class CharsetRecog_8859_9 extends CharsetRecog_sbcs
842 {
843 protected static byte[] byteMap = {
844 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
845 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
846 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
847 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
848 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
849 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
850 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
851 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
852 (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
853 (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
854 (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
855 (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
856 (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
857 (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
858 (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
859 (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
860 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
861 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
862 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
863 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
864 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
865 (byte) 0x20, (byte) 0x20, (byte) 0xAA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
866 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xB5, (byte) 0x20, (byte) 0x20,
867 (byte) 0x20, (byte) 0x20, (byte) 0xBA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
868 (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
869 (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
870 (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0x20,
871 (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0x69, (byte) 0xFE, (byte) 0xDF,
872 (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
873 (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
874 (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0x20,
875 (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF,
876 };
877
878 public String getName()
879 {
880 return haveC1Bytes? "windows-1254" : "ISO-8859-9";
881 }
882 }
883
884 static class CharsetRecog_8859_9_tr extends CharsetRecog_8859_9
885 {
886 private static int[] ngrams = {
887 0x206261, 0x206269, 0x206275, 0x206461, 0x206465, 0x206765, 0x206861, 0x20696C, 0x206B61, 0x206B6F, 0x206D61, 0x206F6C, 0x207361, 0x207461, 0x207665, 0x207961,
888 0x612062, 0x616B20, 0x616C61, 0x616D61, 0x616E20, 0x616EFD, 0x617220, 0x617261, 0x6172FD, 0x6173FD, 0x617961, 0x626972, 0x646120, 0x646520, 0x646920, 0x652062,
889 0x65206B, 0x656469, 0x656E20, 0x657220, 0x657269, 0x657369, 0x696C65, 0x696E20, 0x696E69, 0x697220, 0x6C616E, 0x6C6172, 0x6C6520, 0x6C6572, 0x6E2061, 0x6E2062,
890 0x6E206B, 0x6E6461, 0x6E6465, 0x6E6520, 0x6E6920, 0x6E696E, 0x6EFD20, 0x72696E, 0x72FD6E, 0x766520, 0x796120, 0x796F72, 0xFD6E20, 0xFD6E64, 0xFD6EFD, 0xFDF0FD,
891 };
892
893 public String getLanguage()
894 {
895 return "tr";
896 }
897
898 public int match(CharsetDetector det)
899 {
900 return match(det, ngrams, byteMap);
901 }
902 }
903
904 static class CharsetRecog_windows_1251 extends CharsetRecog_sbcs
905 {
906 private static int[] ngrams = {
907 0x20E220, 0x20E2EE, 0x20E4EE, 0x20E7E0, 0x20E820, 0x20EAE0, 0x20EAEE, 0x20EDE0, 0x20EDE5, 0x20EEE1, 0x20EFEE, 0x20EFF0, 0x20F0E0, 0x20F1EE, 0x20F1F2, 0x20F2EE,
908 0x20F7F2, 0x20FDF2, 0xE0EDE8, 0xE0F2FC, 0xE3EE20, 0xE5EBFC, 0xE5EDE8, 0xE5F1F2, 0xE5F220, 0xE820EF, 0xE8E520, 0xE8E820, 0xE8FF20, 0xEBE5ED, 0xEBE820, 0xEBFCED,
909 0xEDE020, 0xEDE520, 0xEDE8E5, 0xEDE8FF, 0xEDEE20, 0xEDEEE2, 0xEE20E2, 0xEE20EF, 0xEE20F1, 0xEEE220, 0xEEE2E0, 0xEEE3EE, 0xEEE920, 0xEEEBFC, 0xEEEC20, 0xEEF1F2,
910 0xEFEEEB, 0xEFF0E5, 0xEFF0E8, 0xEFF0EE, 0xF0E0E2, 0xF0E5E4, 0xF1F2E0, 0xF1F2E2, 0xF1F2E8, 0xF1FF20, 0xF2E5EB, 0xF2EE20, 0xF2EEF0, 0xF2FC20, 0xF7F2EE, 0xFBF520,
911 };
912
913 private static byte[] byteMap = {
914 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
915 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
916 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
917 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
918 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
919 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
920 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
921 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
922 (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
923 (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
924 (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
925 (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
926 (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
927 (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
928 (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
929 (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
930 (byte) 0x90, (byte) 0x83, (byte) 0x20, (byte) 0x83, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
931 (byte) 0x20, (byte) 0x20, (byte) 0x9A, (byte) 0x20, (byte) 0x9C, (byte) 0x9D, (byte) 0x9E, (byte) 0x9F,
932 (byte) 0x90, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
933 (byte) 0x20, (byte) 0x20, (byte) 0x9A, (byte) 0x20, (byte) 0x9C, (byte) 0x9D, (byte) 0x9E, (byte) 0x9F,
934 (byte) 0x20, (byte) 0xA2, (byte) 0xA2, (byte) 0xBC, (byte) 0x20, (byte) 0xB4, (byte) 0x20, (byte) 0x20,
935 (byte) 0xB8, (byte) 0x20, (byte) 0xBA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xBF,
936 (byte) 0x20, (byte) 0x20, (byte) 0xB3, (byte) 0xB3, (byte) 0xB4, (byte) 0xB5, (byte) 0x20, (byte) 0x20,
937 (byte) 0xB8, (byte) 0x20, (byte) 0xBA, (byte) 0x20, (byte) 0xBC, (byte) 0xBE, (byte) 0xBE, (byte) 0xBF,
938 (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
939 (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
940 (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
941 (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF,
942 (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
943 (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
944 (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
945 (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF,
946 };
947
948 public String getName()
949 {
950 return "windows-1251";
951 }
952
953 public String getLanguage()
954 {
955 return "ru";
956 }
957
958 public int match(CharsetDetector det)
959 {
960 return match(det, ngrams, byteMap);
961 }
962 }
963
964 static class CharsetRecog_IBM866_ru extends CharsetRecog_sbcs
965 {
966 private static int[] ngrams = {
967 0x20E220, 0x20E2EE, 0x20E4EE, 0x20E7E0, 0x20E820, 0x20EAE0, 0x20EAEE, 0x20EDE0, 0x20EDE5, 0x20EEE1, 0x20EFEE, 0x20EFF0, 0x20F0E0, 0x20F1EE, 0x20F1F2, 0x20F2EE,
968 0x20F7F2, 0x20FDF2, 0xE0EDE8, 0xE0F2FC, 0xE3EE20, 0xE5EBFC, 0xE5EDE8, 0xE5F1F2, 0xE5F220, 0xE820EF, 0xE8E520, 0xE8E820, 0xE8FF20, 0xEBE5ED, 0xEBE820, 0xEBFCED,
969 0xEDE020, 0xEDE520, 0xEDE8E5, 0xEDE8FF, 0xEDEE20, 0xEDEEE2, 0xEE20E2, 0xEE20EF, 0xEE20F1, 0xEEE220, 0xEEE2E0, 0xEEE3EE, 0xEEE920, 0xEEEBFC, 0xEEEC20, 0xEEF1F2,
970 0xEFEEEB, 0xEFF0E5, 0xEFF0E8, 0xEFF0EE, 0xF0E0E2, 0xF0E5E4, 0xF1F2E0, 0xF1F2E2, 0xF1F2E8, 0xF1FF20, 0xF2E5EB, 0xF2EE20, 0xF2EEF0, 0xF2FC20, 0xF7F2EE, 0xFBF520,
971 };
972
973 // bytemap converts cp866 chars to cp1251 chars, so ngrams are still unchanged
974 private static byte[] byteMap = {
975 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
976 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
977 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
978 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
979 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
980 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
981 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
982 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
983 (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
984 (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
985 (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
986 (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
987 (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
988 (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
989 (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
990 (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
991 (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
992 (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
993 (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
994 (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF,
995 (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
996 (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
997 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
998 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
999 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1000 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1001 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1002 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1003 (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
1004 (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF,
1005 (byte) 0xB8, (byte) 0xB8, (byte) 0xBA, (byte) 0xBA, (byte) 0xBF, (byte) 0xBF, (byte) 0xA2, (byte) 0xA2,
1006 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1007 };
1008
1009 public String getName()
1010 {
1011 return "IBM866";
1012 }
1013
1014 public String getLanguage()
1015 {
1016 return "ru";
1017 }
1018
1019 public int match(CharsetDetector det)
1020 {
1021 return match(det, ngrams, byteMap);
1022 }
1023 }
1024
1025 static class CharsetRecog_windows_1256 extends CharsetRecog_sbcs
1026 {
1027 private static int[] ngrams = {
1028 0x20C7E1, 0x20C7E4, 0x20C8C7, 0x20DAE1, 0x20DDED, 0x20E1E1, 0x20E3E4, 0x20E6C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E120, 0xC7E1C3, 0xC7E1C7, 0xC7E1C8,
1029 0xC7E1CA, 0xC7E1CC, 0xC7E1CD, 0xC7E1CF, 0xC7E1D3, 0xC7E1DA, 0xC7E1DE, 0xC7E1E3, 0xC7E1E6, 0xC7E1ED, 0xC7E320, 0xC7E420, 0xC7E4CA, 0xC820C7, 0xC920C7, 0xC920DD,
1030 0xC920E1, 0xC920E3, 0xC920E6, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xDA20C7, 0xDAE1EC, 0xDDED20, 0xE120C7, 0xE1C920, 0xE1EC20, 0xE1ED20,
1031 0xE320C7, 0xE3C720, 0xE3C920, 0xE3E420, 0xE420C7, 0xE520C7, 0xE5C720, 0xE6C7E1, 0xE6E420, 0xEC20C7, 0xED20C7, 0xED20E3, 0xED20E6, 0xEDC920, 0xEDD120, 0xEDE420,
1032 };
1033
1034 private static byte[] byteMap = {
1035 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1036 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1037 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1038 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1039 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
1040 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1041 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1042 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1043 (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
1044 (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
1045 (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
1046 (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1047 (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
1048 (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
1049 (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
1050 (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1051 (byte) 0x20, (byte) 0x81, (byte) 0x20, (byte) 0x83, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1052 (byte) 0x88, (byte) 0x20, (byte) 0x8A, (byte) 0x20, (byte) 0x9C, (byte) 0x8D, (byte) 0x8E, (byte) 0x8F,
1053 (byte) 0x90, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1054 (byte) 0x98, (byte) 0x20, (byte) 0x9A, (byte) 0x20, (byte) 0x9C, (byte) 0x20, (byte) 0x20, (byte) 0x9F,
1055 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1056 (byte) 0x20, (byte) 0x20, (byte) 0xAA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1057 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xB5, (byte) 0x20, (byte) 0x20,
1058 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1059 (byte) 0xC0, (byte) 0xC1, (byte) 0xC2, (byte) 0xC3, (byte) 0xC4, (byte) 0xC5, (byte) 0xC6, (byte) 0xC7,
1060 (byte) 0xC8, (byte) 0xC9, (byte) 0xCA, (byte) 0xCB, (byte) 0xCC, (byte) 0xCD, (byte) 0xCE, (byte) 0xCF,
1061 (byte) 0xD0, (byte) 0xD1, (byte) 0xD2, (byte) 0xD3, (byte) 0xD4, (byte) 0xD5, (byte) 0xD6, (byte) 0x20,
1062 (byte) 0xD8, (byte) 0xD9, (byte) 0xDA, (byte) 0xDB, (byte) 0xDC, (byte) 0xDD, (byte) 0xDE, (byte) 0xDF,
1063 (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
1064 (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
1065 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xF4, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1066 (byte) 0x20, (byte) 0xF9, (byte) 0x20, (byte) 0xFB, (byte) 0xFC, (byte) 0x20, (byte) 0x20, (byte) 0xFF,
1067 };
1068
1069 public String getName()
1070 {
1071 return "windows-1256";
1072 }
1073
1074 public String getLanguage()
1075 {
1076 return "ar";
1077 }
1078
1079 public int match(CharsetDetector det)
1080 {
1081 return match(det, ngrams, byteMap);
1082 }
1083 }
1084
1085 static class CharsetRecog_KOI8_R extends CharsetRecog_sbcs
1086 {
1087 private static int[] ngrams = {
1088 0x20C4CF, 0x20C920, 0x20CBC1, 0x20CBCF, 0x20CEC1, 0x20CEC5, 0x20CFC2, 0x20D0CF, 0x20D0D2, 0x20D2C1, 0x20D3CF, 0x20D3D4, 0x20D4CF, 0x20D720, 0x20D7CF, 0x20DAC1,
1089 0x20DCD4, 0x20DED4, 0xC1CEC9, 0xC1D4D8, 0xC5CCD8, 0xC5CEC9, 0xC5D3D4, 0xC5D420, 0xC7CF20, 0xC920D0, 0xC9C520, 0xC9C920, 0xC9D120, 0xCCC5CE, 0xCCC920, 0xCCD8CE,
1090 0xCEC120, 0xCEC520, 0xCEC9C5, 0xCEC9D1, 0xCECF20, 0xCECFD7, 0xCF20D0, 0xCF20D3, 0xCF20D7, 0xCFC7CF, 0xCFCA20, 0xCFCCD8, 0xCFCD20, 0xCFD3D4, 0xCFD720, 0xCFD7C1,
1091 0xD0CFCC, 0xD0D2C5, 0xD0D2C9, 0xD0D2CF, 0xD2C1D7, 0xD2C5C4, 0xD3D120, 0xD3D4C1, 0xD3D4C9, 0xD3D4D7, 0xD4C5CC, 0xD4CF20, 0xD4CFD2, 0xD4D820, 0xD9C820, 0xDED4CF,
1092 };
1093
1094 private static byte[] byteMap = {
1095 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1096 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1097 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1098 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1099 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
1100 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1101 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1102 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1103 (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
1104 (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
1105 (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
1106 (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1107 (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
1108 (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
1109 (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
1110 (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1111 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1112 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1113 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1114 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1115 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xA3, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1116 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1117 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xA3, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1118 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
1119 (byte) 0xC0, (byte) 0xC1, (byte) 0xC2, (byte) 0xC3, (byte) 0xC4, (byte) 0xC5, (byte) 0xC6, (byte) 0xC7,
1120 (byte) 0xC8, (byte) 0xC9, (byte) 0xCA, (byte) 0xCB, (byte) 0xCC, (byte) 0xCD, (byte) 0xCE, (byte) 0xCF,
1121 (byte) 0xD0, (byte) 0xD1, (byte) 0xD2, (byte) 0xD3, (byte) 0xD4, (byte) 0xD5, (byte) 0xD6, (byte) 0xD7,
1122 (byte) 0xD8, (byte) 0xD9, (byte) 0xDA, (byte) 0xDB, (byte) 0xDC, (byte) 0xDD, (byte) 0xDE, (byte) 0xDF,
1123 (byte) 0xC0, (byte) 0xC1, (byte) 0xC2, (byte) 0xC3, (byte) 0xC4, (byte) 0xC5, (byte) 0xC6, (byte) 0xC7,
1124 (byte) 0xC8, (byte) 0xC9, (byte) 0xCA, (byte) 0xCB, (byte) 0xCC, (byte) 0xCD, (byte) 0xCE, (byte) 0xCF,
1125 (byte) 0xD0, (byte) 0xD1, (byte) 0xD2, (byte) 0xD3, (byte) 0xD4, (byte) 0xD5, (byte) 0xD6, (byte) 0xD7,
1126 (byte) 0xD8, (byte) 0xD9, (byte) 0xDA, (byte) 0xDB, (byte) 0xDC, (byte) 0xDD, (byte) 0xDE, (byte) 0xDF,
1127 };
1128
1129 public String getName()
1130 {
1131 return "KOI8-R";
1132 }
1133
1134 public String getLanguage()
1135 {
1136 return "ru";
1137 }
1138
1139 public int match(CharsetDetector det)
1140 {
1141 return match(det, ngrams, byteMap);
1142 }
1143 }
1144
1145 abstract static class CharsetRecog_IBM424_he extends CharsetRecog_sbcs
1146 {
1147 protected static byte[] byteMap = {
1148 /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
1149 /* 0- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
1150 /* 1- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
1151 /* 2- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
1152 /* 3- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
1153 /* 4- */ (byte) 0x40, (byte) 0x41, (byte) 0x42, (byte) 0x43, (byte) 0x44, (byte) 0x45, (byte) 0x46, (byte) 0x47, (byte) 0x48, (byte) 0x49, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
1154 /* 5- */ (byte) 0x40, (byte) 0x51, (byte) 0x52, (byte) 0x53, (byte) 0x54, (byte) 0x55, (byte) 0x56, (byte) 0x57, (byte) 0x58, (byte) 0x59, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
1155 /* 6- */ (byte) 0x40, (byte) 0x40, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, (byte) 0x68, (byte) 0x69, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
1156 /* 7- */ (byte) 0x40, (byte) 0x71, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x00, (byte) 0x40, (byte) 0x40,
1157 /* 8- */ (byte) 0x40, (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, (byte) 0x85, (byte) 0x86, (byte) 0x87, (byte) 0x88, (byte) 0x89, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
1158 /* 9- */ (byte) 0x40, (byte) 0x91, (byte) 0x92, (byte) 0x93, (byte) 0x94, (byte) 0x95, (byte) 0x96, (byte) 0x97, (byte) 0x98, (byte) 0x99, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
1159 /* A- */ (byte) 0xA0, (byte) 0x40, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5, (byte) 0xA6, (byte) 0xA7, (byte) 0xA8, (byte) 0xA9, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
1160 /* B- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
1161 /* C- */ (byte) 0x40, (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, (byte) 0x85, (byte) 0x86, (byte) 0x87, (byte) 0x88, (byte) 0x89, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
1162 /* D- */ (byte) 0x40, (byte) 0x91, (byte) 0x92, (byte) 0x93, (byte) 0x94, (byte) 0x95, (byte) 0x96, (byte) 0x97, (byte) 0x98, (byte) 0x99, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
1163 /* E- */ (byte) 0x40, (byte) 0x40, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5, (byte) 0xA6, (byte) 0xA7, (byte) 0xA8, (byte) 0xA9, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
1164 /* F- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
1165 };
1166
1167 public String getLanguage()
1168 {
1169 return "he";
1170 }
1171 }
1172 static class CharsetRecog_IBM424_he_rtl extends CharsetRecog_IBM424_he
1173 {
1174 public String getName()
1175 {
1176 return "IBM424_rtl";
1177 }
1178 private static int[] ngrams = {
1179 0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641,
1180 0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045,
1181 0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x544041, 0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x554056,
1182 0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x684045, 0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x714069,
1183 };
1184 public int match(CharsetDetector det)
1185 {
1186 return match(det, ngrams, byteMap, (byte)0x40);
1187 }
1188 }
1189 static class CharsetRecog_IBM424_he_ltr extends CharsetRecog_IBM424_he
1190 {
1191 public String getName()
1192 {
1193 return "IBM424_ltr";
1194 }
1195 private static int[] ngrams = {
1196 0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x405462, 0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x407141,
1197 0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x414645, 0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x454054,
1198 0x454055, 0x454057, 0x454068, 0x454071, 0x455440, 0x464140, 0x464540, 0x484140, 0x514140, 0x514240, 0x514540, 0x544045, 0x544055, 0x544071, 0x546240, 0x546940,
1199 0x555151, 0x555158, 0x555168, 0x564045, 0x564055, 0x564071, 0x564240, 0x564540, 0x624540, 0x694045, 0x694055, 0x694071, 0x694540, 0x714140, 0x714540, 0x714651
1200
1201 };
1202 public int match(CharsetDetector det)
1203 {
1204 return match(det, ngrams, byteMap, (byte)0x40);
1205 }
1206 }
1207
1208 abstract static class CharsetRecog_IBM420_ar extends CharsetRecog_sbcs
1209 {
1210 //arabic shaping class, method shape/unshape
1211 //protected static ArabicShaping as = new ArabicShaping(ArabicShaping.LETTERS_UNSHAPE);
1212 protected byte[] prev_fInputBytes = null;
1213
1214 protected static byte[] byteMap = {
1215 /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
1216 /* 0- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
1217 /* 1- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
1218 /* 2- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
1219 /* 3- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
1220 /* 4- */ (byte) 0x40, (byte) 0x40, (byte) 0x42, (byte) 0x43, (byte) 0x44, (byte) 0x45, (byte) 0x46, (byte) 0x47, (byte) 0x48, (byte) 0x49, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
1221 /* 5- */ (byte) 0x40, (byte) 0x51, (byte) 0x52, (byte) 0x40, (byte) 0x40, (byte) 0x55, (byte) 0x56, (byte) 0x57, (byte) 0x58, (byte) 0x59, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
1222 /* 6- */ (byte) 0x40, (byte) 0x40, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, (byte) 0x68, (byte) 0x69, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
1223 /* 7- */ (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, (byte) 0x78, (byte) 0x79, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
1224 /* 8- */ (byte) 0x80, (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, (byte) 0x85, (byte) 0x86, (byte) 0x87, (byte) 0x88, (byte) 0x89, (byte) 0x8A, (byte) 0x8B, (byte) 0x8C, (byte) 0x8D, (byte) 0x8E, (byte) 0x8F,
1225 /* 9- */ (byte) 0x90, (byte) 0x91, (byte) 0x92, (byte) 0x93, (byte) 0x94, (byte) 0x95, (byte) 0x96, (byte) 0x97, (byte) 0x98, (byte) 0x99, (byte) 0x9A, (byte) 0x9B, (byte) 0x9C, (byte) 0x9D, (byte) 0x9E, (byte) 0x9F,
1226 /* A- */ (byte) 0xA0, (byte) 0x40, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5, (byte) 0xA6, (byte) 0xA7, (byte) 0xA8, (byte) 0xA9, (byte) 0xAA, (byte) 0xAB, (byte) 0xAC, (byte) 0xAD, (byte) 0xAE, (byte) 0xAF,
1227 /* B- */ (byte) 0xB0, (byte) 0xB1, (byte) 0xB2, (byte) 0xB3, (byte) 0xB4, (byte) 0xB5, (byte) 0x40, (byte) 0x40, (byte) 0xB8, (byte) 0xB9, (byte) 0xBA, (byte) 0xBB, (byte) 0xBC, (byte) 0xBD, (byte) 0xBE, (byte) 0xBF,
1228 /* C- */ (byte) 0x40, (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, (byte) 0x85, (byte) 0x86, (byte) 0x87, (byte) 0x88, (byte) 0x89, (byte) 0x40, (byte) 0xCB, (byte) 0x40, (byte) 0xCD, (byte) 0x40, (byte) 0xCF,
1229 /* D- */ (byte) 0x40, (byte) 0x91, (byte) 0x92, (byte) 0x93, (byte) 0x94, (byte) 0x95, (byte) 0x96, (byte) 0x97, (byte) 0x98, (byte) 0x99, (byte) 0xDA, (byte) 0xDB, (byte) 0xDC, (byte) 0xDD, (byte) 0xDE, (byte) 0xDF,
1230 /* E- */ (byte) 0x40, (byte) 0x40, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5, (byte) 0xA6, (byte) 0xA7, (byte) 0xA8, (byte) 0xA9, (byte) 0xEA, (byte) 0xEB, (byte) 0x40, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
1231 /* F- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0x40,
1232 };
1233
1234 protected static byte[] unshapeMap = {
1235 /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
1236 /* 0- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
1237 /* 1- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
1238 /* 2- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
1239 /* 3- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
1240 /* 4- */ (byte) 0x40, (byte) 0x40, (byte) 0x42, (byte) 0x42, (byte) 0x44, (byte) 0x45, (byte) 0x46, (byte) 0x47, (byte) 0x47, (byte) 0x49, (byte) 0x4A, (byte) 0x4B, (byte) 0x4C, (byte) 0x4D, (byte) 0x4E, (byte) 0x4F,
1241 /* 5- */ (byte) 0x50, (byte) 0x49, (byte) 0x52, (byte) 0x53, (byte) 0x54, (byte) 0x55, (byte) 0x56, (byte) 0x56, (byte) 0x58, (byte) 0x58, (byte) 0x5A, (byte) 0x5B, (byte) 0x5C, (byte) 0x5D, (byte) 0x5E, (byte) 0x5F,
1242 /* 6- */ (byte) 0x60, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x63, (byte) 0x65, (byte) 0x65, (byte) 0x67, (byte) 0x67, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
1243 /* 7- */ (byte) 0x69, (byte) 0x71, (byte) 0x71, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, (byte) 0x77, (byte) 0x79, (byte) 0x7A, (byte) 0x7B, (byte) 0x7C, (byte) 0x7D, (byte) 0x7E, (byte) 0x7F,
1244 /* 8- */ (byte) 0x80, (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, (byte) 0x85, (byte) 0x86, (byte) 0x87, (byte) 0x88, (byte) 0x89, (byte) 0x80, (byte) 0x8B, (byte) 0x8B, (byte) 0x8D, (byte) 0x8D, (byte) 0x8F,
1245 /* 9- */ (byte) 0x90, (byte) 0x91, (byte) 0x92, (byte) 0x93, (byte) 0x94, (byte) 0x95, (byte) 0x96, (byte) 0x97, (byte) 0x98, (byte) 0x99, (byte) 0x9A, (byte) 0x9A, (byte) 0x9A, (byte) 0x9A, (byte) 0x9E, (byte) 0x9E,
1246 /* A- */ (byte) 0x9E, (byte) 0xA1, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5, (byte) 0xA6, (byte) 0xA7, (byte) 0xA8, (byte) 0xA9, (byte) 0x9E, (byte) 0xAB, (byte) 0xAB, (byte) 0xAD, (byte) 0xAD, (byte) 0xAF,
1247 /* B- */ (byte) 0xAF, (byte) 0xB1, (byte) 0xB2, (byte) 0xB3, (byte) 0xB4, (byte) 0xB5, (byte) 0xB6, (byte) 0xB7, (byte) 0xB8, (byte) 0xB9, (byte) 0xB1, (byte) 0xBB, (byte) 0xBB, (byte) 0xBD, (byte) 0xBD, (byte) 0xBF,
1248 /* C- */ (byte) 0xC0, (byte) 0xC1, (byte) 0xC2, (byte) 0xC3, (byte) 0xC4, (byte) 0xC5, (byte) 0xC6, (byte) 0xC7, (byte) 0xC8, (byte) 0xC9, (byte) 0xCA, (byte) 0xBF, (byte) 0xCC, (byte) 0xBF, (byte) 0xCE, (byte) 0xCF,
1249 /* D- */ (byte) 0xD0, (byte) 0xD1, (byte) 0xD2, (byte) 0xD3, (byte) 0xD4, (byte) 0xD5, (byte) 0xD6, (byte) 0xD7, (byte) 0xD8, (byte) 0xD9, (byte) 0xDA, (byte) 0xDA, (byte) 0xDC, (byte) 0xDC, (byte) 0xDC, (byte) 0xDF,
1250 /* E- */ (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7, (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
1251 /* F- */ (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7, (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF,
1252 };
1253
1254 public String getLanguage()
1255 {
1256 return "ar";
1257 }
1258 protected void matchInit(CharsetDetector det)
1259 {
1260 prev_fInputBytes = (byte[])det.fInputBytes.clone();
1261 byte bb[] = unshape(det.fInputBytes);
1262 det.setText(bb);
1263 }
1264
1265 /*
1266 * Arabic shaping needs to be done manually. Cannot call ArabicShaping class
1267 * because CharsetDetector is dealing with bytes not Unicode code points. We could
1268 * convert the bytes to Unicode code points but that would leave us dependent
1269 * on CharsetICU which we try to avoid. IBM420 converter amongst different versions
1270 * of JDK can produce different results and therefore is also avoided.
1271 */
1272 private byte[] unshape(byte[] inputBytes) {
1273 byte resultByteArr[] = unshapeLamAlef(inputBytes);
1274
1275 for (int i=0; i<inputBytes.length; i++){
1276 resultByteArr[i] = unshapeMap[resultByteArr[i]& 0xFF];
1277 }
1278 return resultByteArr;
1279 }
1280
1281 private byte[] unshapeLamAlef(byte[] inputBytes) {
1282 ByteBuffer resultBigBuffer = ByteBuffer.allocate(inputBytes.length*2);
1283 ByteBuffer resultBuffer;
1284 byte unshapedLamAlef[] = {(byte)0xb1, (byte)0x56};
1285
1286
1287 for (int i=0; i<inputBytes.length; i++){
1288 if (isLamAlef(inputBytes[i]))
1289 resultBigBuffer.put(unshapedLamAlef);
1290 else
1291 resultBigBuffer.put(inputBytes[i]);
1292 }
1293 resultBuffer = ByteBuffer.allocate(resultBigBuffer.position());
1294 resultBuffer.put(resultBigBuffer.array(),0, resultBigBuffer.position());
1295 return resultBuffer.array();
1296 }
1297
1298 private boolean isLamAlef(byte b) {
1299 // Return true if byte is any of these:
1300 //
1301 // {(byte)0xb2,(byte)0xb3,(byte)0xb4,(byte)0xb5,(byte)0xb7,(byte)0xb8}
1302 //
1303 // NOTE: 0xb2 is -78; 0xb8 is -72:
1304 return (b <= (byte)0xb8) && (b >= (byte)0xb2) && (b != (byte)0xb6);
1305 }
1306
1307 protected void matchFinish(CharsetDetector det) {
1308 if (prev_fInputBytes != null)
1309 det.setText(prev_fInputBytes);
1310 }
1311
1312 }
1313 static class CharsetRecog_IBM420_ar_rtl extends CharsetRecog_IBM420_ar
1314 {
1315 private static int[] ngrams = {
1316 0x4056B1, 0x4056BD, 0x405856, 0x409AB1, 0x40ABDC, 0x40B1B1, 0x40BBBD, 0x40CF56, 0x564056, 0x564640, 0x566340, 0x567540, 0x56B140, 0x56B149, 0x56B156, 0x56B158,
1317 0x56B163, 0x56B167, 0x56B169, 0x56B173, 0x56B178, 0x56B19A, 0x56B1AD, 0x56B1BB, 0x56B1CF, 0x56B1DC, 0x56BB40, 0x56BD40, 0x56BD63, 0x584056, 0x624056, 0x6240AB,
1318 0x6240B1, 0x6240BB, 0x6240CF, 0x634056, 0x734056, 0x736240, 0x754056, 0x756240, 0x784056, 0x9A4056, 0x9AB1DA, 0xABDC40, 0xB14056, 0xB16240, 0xB1DA40, 0xB1DC40,
1319 0xBB4056, 0xBB5640, 0xBB6240, 0xBBBD40, 0xBD4056, 0xBF4056, 0xBF5640, 0xCF56B1, 0xCFBD40, 0xDA4056, 0xDC4056, 0xDC40BB, 0xDC40CF, 0xDC6240, 0xDC7540, 0xDCBD40,
1320 };
1321
1322 public String getName()
1323 {
1324 return "IBM420_rtl";
1325 }
1326 public int match(CharsetDetector det)
1327 {
1328 matchInit(det);
1329 int result = match(det, ngrams, byteMap, (byte)0x40);
1330 matchFinish(det);
1331 return result;
1332 }
1333
1334 }
1335 static class CharsetRecog_IBM420_ar_ltr extends CharsetRecog_IBM420_ar
1336 {
1337 private static int[] ngrams = {
1338 0x404656, 0x4056BB, 0x4056BF, 0x406273, 0x406275, 0x4062B1, 0x4062BB, 0x4062DC, 0x406356, 0x407556, 0x4075DC, 0x40B156, 0x40BB56, 0x40BD56, 0x40BDBB, 0x40BDCF,
1339 0x40BDDC, 0x40DAB1, 0x40DCAB, 0x40DCB1, 0x49B156, 0x564056, 0x564058, 0x564062, 0x564063, 0x564073, 0x564075, 0x564078, 0x56409A, 0x5640B1, 0x5640BB, 0x5640BD,
1340 0x5640BF, 0x5640DA, 0x5640DC, 0x565840, 0x56B156, 0x56CF40, 0x58B156, 0x63B156, 0x63BD56, 0x67B156, 0x69B156, 0x73B156, 0x78B156, 0x9AB156, 0xAB4062, 0xADB156,
1341 0xB14062, 0xB15640, 0xB156CF, 0xB19A40, 0xB1B140, 0xBB4062, 0xBB40DC, 0xBBB156, 0xBD5640, 0xBDBB40, 0xCF4062, 0xCF40DC, 0xCFB156, 0xDAB19A, 0xDCAB40, 0xDCB156
1342 };
1343
1344 public String getName()
1345 {
1346 return "IBM420_ltr";
1347 }
1348 public int match(CharsetDetector det)
1349 {
1350 matchInit(det);
1351 int result = match(det, ngrams, byteMap, (byte)0x40);
1352 matchFinish(det);
1353 return result;
1354 }
1355 }
1356
1357 static abstract class CharsetRecog_EBCDIC_500 extends CharsetRecog_sbcs
1358 {
1359 // This maps EBCDIC 500 codepoints onto either space (not of interest), or a lower
1360 // case ISO_8859_1 number/letter/accented-letter codepoint for ngram matching
1361 // Because we map to ISO_8859_1, we can re-use the ngrams from those detectors
1362 // To avoid mis-detection, we skip many of the control characters in the 0x00-0x3f range
1363 protected static byte[] byteMap = {
1364 /* 0x00-0x07 */ (byte)0x00, (byte)0x00, (byte)0x00, (byte)0x00, (byte)0x00, (byte)0x00, (byte)0x00, (byte)0x00,
1365 /* 0x08-0x0f */ (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20,
1366 /* 0x10-0x17 */ (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20,
1367 /* 0x18-0x1f */ (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20,
1368 /* 0x20-0x27 */ (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20,
1369 /* 0x28-0x2f */ (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x00, (byte)0x00,
1370 /* 0x30-0x37 */ (byte)0x00, (byte)0x00, (byte)0x00, (byte)0x00, (byte)0x00, (byte)0x00, (byte)0x00, (byte)0x00,
1371 /* 0x38-0x3f */ (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x00, (byte)0x00, (byte)0x00, (byte)0x00,
1372 /* 0x40-0x47 */ (byte)0x20, (byte)0x20, (byte)0xe2, (byte)0xe4, (byte)0xe0, (byte)0xe1, (byte)0xe3, (byte)0xe5,
1373 /* 0x48-0x4f */ (byte)0xe7, (byte)0xf1, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20,
1374 /* 0x50-0x57 */ (byte)0x20, (byte)0xe9, (byte)0xea, (byte)0xeb, (byte)0xe8, (byte)0xed, (byte)0xee, (byte)0xef,
1375 /* 0x58-0x5f */ (byte)0xec, (byte)0xdf, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20,
1376 /* 0x60-0x67 */ (byte)0x20, (byte)0x20, (byte)0xe2, (byte)0xe4, (byte)0xe0, (byte)0xe1, (byte)0xe3, (byte)0xe5,
1377 /* 0x68-0x6f */ (byte)0xe7, (byte)0xf1, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20,
1378 /* 0x70-0x77 */ (byte)0xf8, (byte)0xe9, (byte)0xea, (byte)0xeb, (byte)0xe8, (byte)0xed, (byte)0xee, (byte)0xef,
1379 /* 0x78-0x7f */ (byte)0xec, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20,
1380 /* 0x80-0x87 */ (byte)0xd8, (byte)'a', (byte)'b', (byte)'c', (byte)'d', (byte)'e', (byte)'f', (byte)'g',
1381 /* 0x88-0x8f */ (byte)'h', (byte)'i', (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20,
1382 /* 0x90-0x97 */ (byte)0x20, (byte)'j', (byte)'k', (byte)'l', (byte)'m', (byte)'n', (byte)'o', (byte)'p',
1383 /* 0x98-0x9f */ (byte)'q', (byte)'r', (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20,
1384 /* 0xa0-0xa7 */ (byte)0x20, (byte)0x20, (byte)'s', (byte)'t', (byte)'u', (byte)'v', (byte)'w', (byte)'x',
1385 /* 0xa8-0xaf */ (byte)'y', (byte)'z', (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20,
1386 /* 0xb0-0xb7 */ (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20,
1387 /* 0xb8-0xbf */ (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20,
1388 /* 0xc0-0xc7 */ (byte)0x20, (byte)'a', (byte)'b', (byte)'c', (byte)'d', (byte)'e', (byte)'f', (byte)'g',
1389 /* 0xc8-0xcf */ (byte)'h', (byte)'i', (byte)0x20, (byte)0xf4, (byte)0xf6, (byte)0xf2, (byte)0xf3, (byte)0xf5,
1390 /* 0xd0-0xd7 */ (byte)0x20, (byte)'j', (byte)'k', (byte)'l', (byte)'m', (byte)'n', (byte)'o', (byte)'p',
1391 /* 0xd8-0xdf */ (byte)'q', (byte)'r', (byte)0x20, (byte)0xfb, (byte)0xfc, (byte)0xf9, (byte)0xfa, (byte)0xff,
1392 /* 0xe0-0xe7 */ (byte)0x20, (byte)0x20, (byte)'s', (byte)'t', (byte)'u', (byte)'v', (byte)'w', (byte)'x',
1393 /* 0xe8-0xef */ (byte)'y', (byte)'z', (byte)0x20, (byte)0xf4, (byte)0xf6, (byte)0xf2, (byte)0xf3, (byte)0xf5,
1394 /* 0xf0-0xf7 */ (byte)'0', (byte)'1', (byte)'2', (byte)'3', (byte)'4', (byte)'5', (byte)'6', (byte)'7',
1395 /* 0xf8-0xff */ (byte)'8', (byte)'9', (byte)0x20, (byte)0xfb, (byte)0xfc, (byte)0xf9, (byte)0xfa, (byte)0x20,
1396 };
1397
1398 public String getName()
1399 {
1400 return "IBM500";
1401 }
1402 }
1403
1404 static class CharsetRecog_EBCDIC_500_en extends CharsetRecog_EBCDIC_500
1405 {
1406 public String getLanguage()
1407 {
1408 return "en";
1409 }
1410 public int match(CharsetDetector det)
1411 {
1412 return match(det, CharsetRecog_8859_1_en.ngrams, byteMap);
1413 }
1414 }
1415
1416 static class CharsetRecog_EBCDIC_500_de extends CharsetRecog_EBCDIC_500
1417 {
1418 public String getLanguage()
1419 {
1420 return "de";
1421 }
1422 public int match(CharsetDetector det)
1423 {
1424 return match(det, CharsetRecog_8859_1_de.ngrams, byteMap);
1425 }
1426 }
1427
1428 static class CharsetRecog_EBCDIC_500_fr extends CharsetRecog_EBCDIC_500
1429 {
1430 public String getLanguage()
1431 {
1432 return "fr";
1433 }
1434 public int match(CharsetDetector det)
1435 {
1436 return match(det, CharsetRecog_8859_1_fr.ngrams, byteMap);
1437 }
1438 }
1439
1440 static class CharsetRecog_EBCDIC_500_es extends CharsetRecog_EBCDIC_500
1441 {
1442 public String getLanguage()
1443 {
1444 return "es";
1445 }
1446 public int match(CharsetDetector det)
1447 {
1448 return match(det, CharsetRecog_8859_1_es.ngrams, byteMap);
1449 }
1450 }
1451
1452 static class CharsetRecog_EBCDIC_500_it extends CharsetRecog_EBCDIC_500
1453 {
1454 public String getLanguage()
1455 {
1456 return "it";
1457 }
1458 public int match(CharsetDetector det)
1459 {
1460 return match(det, CharsetRecog_8859_1_it.ngrams, byteMap);
1461 }
1462 }
1463
1464 static class CharsetRecog_EBCDIC_500_nl extends CharsetRecog_EBCDIC_500
1465 {
1466 public String getLanguage()
1467 {
1468 return "nl";
1469 }
1470 public int match(CharsetDetector det)
1471 {
1472 return match(det, CharsetRecog_8859_1_nl.ngrams, byteMap);
1473 }
1474 }
1475 }
0 /**
1 *******************************************************************************
2 * Copyright (C) 2005, International Business Machines Corporation and *
3 * others. All Rights Reserved. *
4 *******************************************************************************
5 */
6 package org.apache.tika.parser.txt;
7
8 /**
9 * Abstract class for recognizing a single charset.
10 * Part of the implementation of ICU's CharsetDetector.
11 *
12 * Each specific charset that can be recognized will have an instance
13 * of some subclass of this class. All interaction between the overall
14 * CharsetDetector and the stuff specific to an individual charset happens
15 * via the interface provided here.
16 *
17 * Instances of CharsetDetector DO NOT have or maintain
18 * state pertaining to a specific match or detect operation.
19 * The WILL be shared by multiple instances of CharsetDetector.
20 * They encapsulate const charset-specific information.
21 *
22 * @internal
23 */
24 abstract class CharsetRecognizer {
25 /**
26 * Get the IANA name of this charset.
27 * @return the charset name.
28 */
29 abstract String getName();
30
31 /**
32 * Get the ISO language code for this charset.
33 * @return the language code, or <code>null</code> if the language cannot be determined.
34 */
35 public String getLanguage()
36 {
37 return null;
38 }
39
40 /**
41 * Test the match of this charset with the input text data
42 * which is obtained via the CharsetDetector object.
43 *
44 * @param det The CharsetDetector, which contains the input text
45 * to be checked for being in this charset.
46 * @return Two values packed into one int (Damn java, anyhow)
47 * <br/>
48 * bits 0-7: the match confidence, ranging from 0-100
49 * <br/>
50 * bits 8-15: The match reason, an enum-like value.
51 */
52 abstract int match(CharsetDetector det);
53
54 }
0 /**
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.txt;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.nio.charset.Charset;
21
22 import org.apache.tika.detect.EncodingDetector;
23 import org.apache.tika.metadata.Metadata;
24 import org.apache.tika.mime.MediaType;
25 import org.apache.tika.utils.CharsetUtils;
26
27 public class Icu4jEncodingDetector implements EncodingDetector {
28
29 public Charset detect(InputStream input, Metadata metadata)
30 throws IOException {
31 if (input == null) {
32 return null;
33 }
34
35 CharsetDetector detector = new CharsetDetector();
36
37 String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
38 String incomingType = metadata.get(Metadata.CONTENT_TYPE);
39 if (incomingCharset == null && incomingType != null) {
40 // TIKA-341: Use charset in content-type
41 MediaType mt = MediaType.parse(incomingType);
42 if (mt != null) {
43 incomingCharset = mt.getParameters().get("charset");
44 }
45 }
46
47 if (incomingCharset != null) {
48 String cleaned = CharsetUtils.clean(incomingCharset);
49 if (cleaned != null) {
50 detector.setDeclaredEncoding(cleaned);
51 } else {
52 // TODO: log a warning?
53 }
54 }
55
56 // TIKA-341 without enabling input filtering (stripping of tags)
57 // short HTML tests don't work well
58 detector.enableInputFilter(true);
59
60 detector.setText(input);
61
62 for (CharsetMatch match : detector.detectAll()) {
63 try {
64 return CharsetUtils.forName(match.getName());
65 } catch (Exception e) {
66 // ignore
67 }
68 }
69
70 return null;
71 }
72
73 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.txt;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.nio.charset.Charset;
21 import java.util.Collections;
22 import java.util.Set;
23
24 import org.apache.tika.config.ServiceLoader;
25 import org.apache.tika.detect.AutoDetectReader;
26 import org.apache.tika.exception.TikaException;
27 import org.apache.tika.io.CloseShieldInputStream;
28 import org.apache.tika.metadata.Metadata;
29 import org.apache.tika.mime.MediaType;
30 import org.apache.tika.parser.AbstractParser;
31 import org.apache.tika.parser.ParseContext;
32 import org.apache.tika.sax.XHTMLContentHandler;
33 import org.xml.sax.ContentHandler;
34 import org.xml.sax.SAXException;
35
36 /**
37 * Plain text parser. The text encoding of the document stream is
38 * automatically detected based on the byte patterns found at the
39 * beginning of the stream and the given document metadata, most
40 * notably the <code>charset</code> parameter of a
41 * {@link org.apache.tika.metadata.HttpHeaders#CONTENT_TYPE} value.
42 * <p>
43 * This parser sets the following output metadata entries:
44 * <dl>
45 * <dt>{@link org.apache.tika.metadata.HttpHeaders#CONTENT_TYPE}</dt>
46 * <dd><code>text/plain; charset=...</code></dd>
47 * </dl>
48 */
49 public class TXTParser extends AbstractParser {
50
51 /** Serial version UID */
52 private static final long serialVersionUID = -6656102320836888910L;
53
54 private static final Set<MediaType> SUPPORTED_TYPES =
55 Collections.singleton(MediaType.TEXT_PLAIN);
56
57 private static final ServiceLoader LOADER =
58 new ServiceLoader(TXTParser.class.getClassLoader());
59
60 public Set<MediaType> getSupportedTypes(ParseContext context) {
61 return SUPPORTED_TYPES;
62 }
63
64 public void parse(
65 InputStream stream, ContentHandler handler,
66 Metadata metadata, ParseContext context)
67 throws IOException, SAXException, TikaException {
68 // Automatically detect the character encoding
69 AutoDetectReader reader = new AutoDetectReader(
70 new CloseShieldInputStream(stream), metadata,
71 context.get(ServiceLoader.class, LOADER));
72 try {
73 Charset charset = reader.getCharset();
74 MediaType type = new MediaType(MediaType.TEXT_PLAIN, charset);
75 metadata.set(Metadata.CONTENT_TYPE, type.toString());
76 // deprecated, see TIKA-431
77 metadata.set(Metadata.CONTENT_ENCODING, charset.name());
78
79 XHTMLContentHandler xhtml =
80 new XHTMLContentHandler(handler, metadata);
81 xhtml.startDocument();
82
83 xhtml.startElement("p");
84 char[] buffer = new char[4096];
85 int n = reader.read(buffer);
86 while (n != -1) {
87 xhtml.characters(buffer, 0, n);
88 n = reader.read(buffer);
89 }
90 xhtml.endElement("p");
91
92 xhtml.endDocument();
93 } finally {
94 reader.close();
95 }
96 }
97
98 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.txt;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.nio.charset.Charset;
21
22 import org.apache.tika.detect.EncodingDetector;
23 import org.apache.tika.metadata.Metadata;
24
25 public class UniversalEncodingDetector implements EncodingDetector {
26
27 private static final int BUFSIZE = 1024;
28
29 private static final int LOOKAHEAD = 16 * BUFSIZE;
30
31 public Charset detect(InputStream input, Metadata metadata)
32 throws IOException {
33 if (input == null) {
34 return null;
35 }
36
37 input.mark(LOOKAHEAD);
38 try {
39 UniversalEncodingListener listener =
40 new UniversalEncodingListener(metadata);
41
42 byte[] b = new byte[BUFSIZE];
43 int n = 0;
44 int m = input.read(b);
45 while (m != -1 && n < LOOKAHEAD && !listener.isDone()) {
46 n += m;
47 listener.handleData(b, 0, m);
48 m = input.read(b, 0, Math.min(b.length, LOOKAHEAD - n));
49 }
50
51 return listener.dataEnd();
52 } catch (IOException e) {
53 throw e;
54 } catch (LinkageError e) {
55 return null; // juniversalchardet is not available
56 } finally {
57 input.reset();
58 }
59 }
60
61 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.txt;
17
18 import java.nio.charset.Charset;
19
20 import org.apache.tika.detect.TextStatistics;
21 import org.apache.tika.metadata.Metadata;
22 import org.apache.tika.mime.MediaType;
23 import org.apache.tika.utils.CharsetUtils;
24 import org.mozilla.universalchardet.CharsetListener;
25 import org.mozilla.universalchardet.Constants;
26 import org.mozilla.universalchardet.UniversalDetector;
27
28 /**
29 * Helper class used by {@link UniversalEncodingDetector} to access the
30 * <code>juniversalchardet</code> detection logic.
31 */
32 class UniversalEncodingListener implements CharsetListener {
33
34 private static final String CHARSET_ISO_8859_1 = "ISO-8859-1";
35
36 private static final String CHARSET_ISO_8859_15 = "ISO-8859-15";
37
38 private final TextStatistics statistics = new TextStatistics();
39
40 private final UniversalDetector detector = new UniversalDetector(this);
41
42 private String hint = null;
43
44 private Charset charset = null;
45
46 public UniversalEncodingListener(Metadata metadata) {
47 MediaType type = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
48 if (type != null) {
49 hint = type.getParameters().get("charset");
50 }
51 if (hint == null) {
52 hint = metadata.get(Metadata.CONTENT_ENCODING);
53 }
54 }
55
56 public void report(String name) {
57 if (Constants.CHARSET_WINDOWS_1252.equals(name)) {
58 if (hint != null) {
59 // Use the encoding hint when available
60 name = hint;
61 } else if (statistics.count('\r') == 0) {
62 // If there are no CR(LF)s, then the encoding is more
63 // likely to be ISO-8859-1(5) than windows-1252
64 if (statistics.count(0xa4) > 0) { // currency/euro sign
65 // The general currency sign is hardly ever used in
66 // ISO-8859-1, so it's more likely that we're dealing
67 // with ISO-8859-15, where the character is used for
68 // the euro symbol, which is more commonly used.
69 name = CHARSET_ISO_8859_15;
70 } else {
71 name = CHARSET_ISO_8859_1;
72 }
73 }
74 }
75 try {
76 this.charset = CharsetUtils.forName(name);
77 } catch (Exception e) {
78 // ignore
79 }
80 }
81
82 public boolean isDone() {
83 return detector.isDone();
84 }
85
86 public void handleData(byte[] buf, int offset, int length) {
87 statistics.addData(buf, offset, length);
88 detector.handleData(buf, offset, length);
89 }
90
91 public Charset dataEnd() {
92 detector.dataEnd();
93 if (charset == null && statistics.isMostlyAscii()) {
94 report(Constants.CHARSET_WINDOWS_1252);
95 }
96 return charset;
97 }
98
99 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.video;
17
18 import java.io.ByteArrayInputStream;
19 import java.io.DataInputStream;
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.util.ArrayList;
23 import java.util.Collections;
24 import java.util.Date;
25 import java.util.HashMap;
26 import java.util.Map;
27 import java.util.Map.Entry;
28 import java.util.Set;
29
30 import org.apache.tika.exception.TikaException;
31 import org.apache.tika.metadata.Metadata;
32 import org.apache.tika.mime.MediaType;
33 import org.apache.tika.parser.AbstractParser;
34 import org.apache.tika.parser.ParseContext;
35 import org.apache.tika.sax.XHTMLContentHandler;
36 import org.xml.sax.ContentHandler;
37 import org.xml.sax.SAXException;
38
39 /**
40 * <p>
41 * Parser for metadata contained in Flash Videos (.flv). Resources:
42 * http://osflash.org/flv and for AMF:
43 * http://download.macromedia.com/pub/labs/amf/amf0_spec_121207.pdf
44 * <p>
45 * This parser is capable of extracting the general metadata from header as well
46 * as embedded metadata.
47 * <p>
48 * Known keys for metadata (from file header):
49 * <ol>
50 * <li>hasVideo: true|false
51 * <li>hasSound: true|false
52 * </ol>
53 * <p>
54 * In addition to the above values also metadata that is inserted in to the
55 * actual stream will be picked. Usually there are keys like:
56 * hasKeyframes, lastkeyframetimestamp, audiocodecid, keyframes, filepositions,
57 * hasMetadata, audiosamplerate, videodatarate metadatadate, videocodecid,
58 * metadatacreator, audiosize, hasVideo, height, audiosamplesize, framerate,
59 * hasCuePoints width, cuePoints, lasttimestamp, canSeekToEnd, datasize,
60 * duration, videosize, filesize, audiodatarate, hasAudio, stereo audiodelay
61 */
62 public class FLVParser extends AbstractParser {
63
64 /** Serial version UID */
65 private static final long serialVersionUID = -8718013155719197679L;
66
67 private static int TYPE_METADATA = 0x12;
68 private static byte MASK_AUDIO = 1;
69 private static byte MASK_VIDEO = 4;
70
71 private static final Set<MediaType> SUPPORTED_TYPES =
72 Collections.singleton(MediaType.video("x-flv"));
73
74 public Set<MediaType> getSupportedTypes(ParseContext context) {
75 return SUPPORTED_TYPES;
76 }
77
78 private long readUInt32(DataInputStream input) throws IOException {
79 return input.readInt() & 0xFFFFFFFFL;
80 }
81
82 private int readUInt24(DataInputStream input) throws IOException {
83 int uint = input.read()<<16;
84 uint += input.read()<<8;
85 uint += input.read();
86 return uint;
87 }
88
89 private Object readAMFData(DataInputStream input, int type)
90 throws IOException {
91 if (type == -1) {
92 type = input.readUnsignedByte();
93 }
94 switch (type) {
95 case 0:
96 return input.readDouble();
97 case 1:
98 return input.readUnsignedByte() == 1;
99 case 2:
100 return readAMFString(input);
101 case 3:
102 return readAMFObject(input);
103 case 8:
104 return readAMFEcmaArray(input);
105 case 10:
106 return readAMFStrictArray(input);
107 case 11:
108 final Date date = new Date((long) input.readDouble());
109 input.readShort(); // time zone
110 return date;
111 case 13:
112 return "UNDEFINED";
113 default:
114 return null;
115 }
116 }
117
118 private Object readAMFStrictArray(DataInputStream input) throws IOException {
119 long count = readUInt32(input);
120 ArrayList<Object> list = new ArrayList<Object>();
121 for (int i = 0; i < count; i++) {
122 list.add(readAMFData(input, -1));
123 }
124 return list;
125 }
126
127
128 private String readAMFString(DataInputStream input) throws IOException {
129 int size = input.readUnsignedShort();
130 byte[] chars = new byte[size];
131 input.readFully(chars);
132 return new String(chars);
133 }
134
135 private Object readAMFObject(DataInputStream input) throws IOException {
136 HashMap<String, Object> array = new HashMap<String, Object>();
137 while (true) {
138 String key = readAMFString(input);
139 int dataType = input.read();
140 if (dataType == 9) { // object end marker
141 break;
142 }
143 array.put(key, readAMFData(input, dataType));
144 }
145 return array;
146 }
147
148 private Object readAMFEcmaArray(DataInputStream input) throws IOException {
149 long size = readUInt32(input);
150 HashMap<String, Object> array = new HashMap<String, Object>();
151 for (int i = 0; i < size; i++) {
152 String key = readAMFString(input);
153 int dataType = input.read();
154 array.put(key, readAMFData(input, dataType));
155 }
156 return array;
157 }
158
159 private boolean checkSignature(DataInputStream fis) throws IOException {
160 return fis.read() == 'F' && fis.read() == 'L' && fis.read() == 'V';
161 }
162
163 public void parse(
164 InputStream stream, ContentHandler handler,
165 Metadata metadata, ParseContext context)
166 throws IOException, SAXException, TikaException {
167 DataInputStream datainput = new DataInputStream(stream);
168 if (!checkSignature(datainput)) {
169 throw new TikaException("FLV signature not detected");
170 }
171
172 // header
173 int version = datainput.readUnsignedByte();
174 if (version != 1) {
175 // should be 1, perhaps this is not flv?
176 throw new TikaException("Unpexpected FLV version: " + version);
177 }
178
179 int typeFlags = datainput.readUnsignedByte();
180
181 long len = readUInt32(datainput);
182 if (len != 9) {
183 // we only know about format with header of 9 bytes
184 throw new TikaException("Unpexpected FLV header length: " + len);
185 }
186
187 long sizePrev = readUInt32(datainput);
188 if (sizePrev != 0) {
189 // should be 0, perhaps this is not flv?
190 throw new TikaException(
191 "Unpexpected FLV first previous block size: " + sizePrev);
192 }
193
194 metadata.set(Metadata.CONTENT_TYPE, "video/x-flv");
195 metadata.set("hasVideo", Boolean.toString((typeFlags & MASK_VIDEO) != 0));
196 metadata.set("hasAudio", Boolean.toString((typeFlags & MASK_AUDIO) != 0));
197
198 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
199 xhtml.startDocument();
200
201 // flv tag stream follows...
202 while (true) {
203 int type = datainput.read();
204 if (type == -1) {
205 // EOF
206 break;
207 }
208
209 int datalen = readUInt24(datainput); //body length
210 readUInt32(datainput); // timestamp
211 readUInt24(datainput); // streamid
212
213 if (type == TYPE_METADATA) {
214 // found metadata Tag, read content to buffer
215 byte[] metaBytes = new byte[datalen];
216 for (int readCount = 0; readCount < datalen;) {
217 int r = stream.read(metaBytes, readCount, datalen - readCount);
218 if(r!=-1) {
219 readCount += r;
220
221 } else {
222 break;
223 }
224 }
225
226 ByteArrayInputStream is = new ByteArrayInputStream(metaBytes);
227
228 DataInputStream dis = new DataInputStream(is);
229
230 Object data = null;
231
232 for (int i = 0; i < 2; i++) {
233 data = readAMFData(dis, -1);
234 }
235
236 if (data instanceof Map) {
237 // TODO if there are multiple metadata values with same key (in
238 // separate AMF blocks, we currently loose previous values)
239 Map<String, Object> extractedMetadata = (Map<String, Object>) data;
240 for (Entry<String, Object> entry : extractedMetadata.entrySet()) {
241 if (entry.getValue() == null) {
242 continue;
243 }
244 metadata.set(entry.getKey(), entry.getValue().toString());
245 }
246 }
247
248 } else {
249 // Tag was not metadata, skip over data we cannot handle
250 for (int i = 0; i < datalen; i++) {
251 datainput.readByte();
252 }
253 }
254
255 sizePrev = readUInt32(datainput); // previous block size
256 if (sizePrev != datalen + 11) {
257 // file was corrupt or we could not parse it...
258 break;
259 }
260 }
261
262 xhtml.endDocument();
263 }
264
265 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.xml;
17
18 import java.util.Arrays;
19 import java.util.List;
20
21 import org.apache.tika.metadata.Metadata;
22 import org.apache.tika.metadata.Property;
23 import org.xml.sax.helpers.DefaultHandler;
24
25 /**
26 * Base class for SAX handlers that map SAX events into document metadata.
27 *
28 * @since Apache Tika 0.10
29 */
30 class AbstractMetadataHandler extends DefaultHandler {
31
32 private final Metadata metadata;
33 private final Property property;
34 private final String name;
35
36 protected AbstractMetadataHandler(Metadata metadata, String name) {
37 this.metadata = metadata;
38 this.property = null;
39 this.name = name;
40 }
41 protected AbstractMetadataHandler(Metadata metadata, Property property) {
42 this.metadata = metadata;
43 this.property = property;
44 this.name = property.getName();
45 }
46
47 /**
48 * Adds the given metadata value. The value is ignored if it is
49 * <code>null</code> or empty. If the metadata entry already exists,
50 * then the given value is appended to it with a comma as the separator.
51 *
52 * @param value metadata value
53 */
54 protected void addMetadata(String value) {
55 if (value != null && value.length() > 0) {
56 if (metadata.isMultiValued(name)) {
57 // Add the value, assuming it's not already there
58 List<String> previous = Arrays.asList(metadata.getValues(name));
59 if (!previous.contains(value)) {
60 if (property != null) {
61 metadata.add(property, value);
62 } else {
63 metadata.add(name, value);
64 }
65 }
66 } else {
67 // Set the value, assuming it's not already there
68 String previous = metadata.get(name);
69 if (previous != null && previous.length() > 0) {
70 if (!previous.equals(value)) {
71 if (property != null) {
72 if (property.isMultiValuePermitted()) {
73 metadata.add(property, value);
74 } else {
75 // Replace the existing value if isMultiValuePermitted is false
76 metadata.set(property, value);
77 }
78 } else {
79 metadata.add(name, value);
80 }
81 }
82 } else {
83 if (property != null) {
84 metadata.set(property, value);
85 } else {
86 metadata.set(name, value);
87 }
88 }
89 }
90 }
91 }
92 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.xml;
17
18 import org.apache.tika.metadata.Metadata;
19 import org.xml.sax.Attributes;
20 import org.xml.sax.helpers.DefaultHandler;
21
22 /**
23 * This adds a Metadata entry for a given node.
24 * The textual content of the node is used as the
25 * value, and the Metadata name is taken from
26 * an attribute, with a prefix if required.
27 */
28 public class AttributeDependantMetadataHandler extends DefaultHandler {
29
30 private final Metadata metadata;
31
32 private final String nameHoldingAttribute;
33 private final String namePrefix;
34 private String name;
35
36 private final StringBuilder buffer = new StringBuilder();
37
38 public AttributeDependantMetadataHandler(Metadata metadata, String nameHoldingAttribute, String namePrefix) {
39 this.metadata = metadata;
40 this.nameHoldingAttribute = nameHoldingAttribute;
41 this.namePrefix = namePrefix;
42 }
43
44 public void addMetadata(String value) {
45 if(name == null || name.length() == 0) {
46 // We didn't find the attribute which holds the name
47 return;
48 }
49 if (value.length() > 0) {
50 String previous = metadata.get(name);
51 if (previous != null && previous.length() > 0) {
52 value = previous + ", " + value;
53 }
54 metadata.set(name, value);
55 }
56 }
57
58 public void endElement(String uri, String localName, String name) {
59 addMetadata(buffer.toString());
60 buffer.setLength(0);
61 }
62
63 public void startElement(
64 String uri, String localName, String name, Attributes attributes) {
65 String rawName = attributes.getValue(nameHoldingAttribute);
66 if (rawName != null) {
67 if (namePrefix == null) {
68 this.name = rawName;
69 } else {
70 this.name = namePrefix + rawName;
71 }
72 }
73 // All other attributes are ignored
74 }
75
76
77 public void characters(char[] ch, int start, int length) {
78 buffer.append(ch, start, length);
79 }
80
81 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.xml;
17
18 import org.apache.tika.metadata.Metadata;
19 import org.apache.tika.metadata.Property;
20 import org.xml.sax.Attributes;
21 import org.xml.sax.SAXException;
22
23 /**
24 * SAX event handler that maps the contents of an XML attribute into
25 * a metadata field.
26 *
27 * @since Apache Tika 0.10
28 */
29 public class AttributeMetadataHandler extends AbstractMetadataHandler {
30
31 private final String uri;
32
33 private final String localName;
34
35 public AttributeMetadataHandler(
36 String uri, String localName, Metadata metadata, String name) {
37 super(metadata, name);
38 this.uri = uri;
39 this.localName = localName;
40 }
41 public AttributeMetadataHandler(
42 String uri, String localName, Metadata metadata, Property property) {
43 super(metadata, property);
44 this.uri = uri;
45 this.localName = localName;
46 }
47
48 @Override
49 public void startElement(
50 String uri, String localName, String qName, Attributes attributes)
51 throws SAXException {
52 for (int i = 0; i < attributes.getLength(); i++) {
53 if (attributes.getURI(i).equals(this.uri)
54 && attributes.getLocalName(i).equals(this.localName)) {
55 addMetadata(attributes.getValue(i).trim());
56 }
57 }
58 }
59
60 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.xml;
17
18 import org.apache.tika.metadata.DublinCore;
19 import org.apache.tika.metadata.Metadata;
20 import org.apache.tika.metadata.Property;
21 import org.apache.tika.metadata.TikaCoreProperties;
22 import org.apache.tika.parser.ParseContext;
23 import org.apache.tika.sax.TeeContentHandler;
24 import org.xml.sax.ContentHandler;
25
26 /**
27 * Dublin Core metadata parser
28 */
29 public class DcXMLParser extends XMLParser {
30
31 /** Serial version UID */
32 private static final long serialVersionUID = 4905318835463880819L;
33
34 private static ContentHandler getDublinCoreHandler(
35 Metadata metadata, Property property, String element) {
36 return new ElementMetadataHandler(
37 DublinCore.NAMESPACE_URI_DC, element,
38 metadata, property);
39 }
40
41 protected ContentHandler getContentHandler(
42 ContentHandler handler, Metadata metadata, ParseContext context) {
43 return new TeeContentHandler(
44 super.getContentHandler(handler, metadata, context),
45 getDublinCoreHandler(metadata, TikaCoreProperties.TITLE, "title"),
46 getDublinCoreHandler(metadata, TikaCoreProperties.KEYWORDS, "subject"),
47 getDublinCoreHandler(metadata, TikaCoreProperties.CREATOR, "creator"),
48 getDublinCoreHandler(metadata, TikaCoreProperties.DESCRIPTION, "description"),
49 getDublinCoreHandler(metadata, TikaCoreProperties.PUBLISHER, "publisher"),
50 getDublinCoreHandler(metadata, TikaCoreProperties.CONTRIBUTOR, "contributor"),
51 getDublinCoreHandler(metadata, TikaCoreProperties.CREATED, "date"),
52 getDublinCoreHandler(metadata, TikaCoreProperties.TYPE, "type"),
53 getDublinCoreHandler(metadata, TikaCoreProperties.FORMAT, "format"),
54 getDublinCoreHandler(metadata, TikaCoreProperties.IDENTIFIER, "identifier"),
55 getDublinCoreHandler(metadata, TikaCoreProperties.LANGUAGE, "language"),
56 getDublinCoreHandler(metadata, TikaCoreProperties.RIGHTS, "rights"));
57 }
58
59 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.xml;
17
18 import java.util.Arrays;
19
20 import org.apache.commons.logging.Log;
21 import org.apache.commons.logging.LogFactory;
22 import org.apache.tika.metadata.Metadata;
23 import org.apache.tika.metadata.Property;
24 import org.xml.sax.Attributes;
25
26 /**
27 * SAX event handler that maps the contents of an XML element into
28 * a metadata field.
29 *
30 * @since Apache Tika 0.10
31 */
32 public class ElementMetadataHandler extends AbstractMetadataHandler {
33 /**
34 * Logger for this class
35 */
36 private static final Log logger = LogFactory
37 .getLog(ElementMetadataHandler.class);
38
39 private static final String LOCAL_NAME_RDF_BAG = "Bag";
40 private static final String LOCAL_NAME_RDF_LI = "li";
41 private static final String URI_RDF = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
42
43 private final String uri;
44
45 private final String localName;
46
47 private final Metadata metadata;
48
49 private final String name;
50 private Property targetProperty;
51
52 private final boolean allowDuplicateValues;
53 private final boolean allowEmptyValues;
54
55 /**
56 * The buffer used to capture characters when inside a bag li element.
57 */
58 private final StringBuilder bufferBagged = new StringBuilder();
59
60 /**
61 * The buffer used to capture characters inside standard elements.
62 */
63 private final StringBuilder bufferBagless = new StringBuilder();
64
65 /**
66 * Whether or not the value was found in a standard element structure or inside a bag.
67 */
68 private boolean isBagless = true;
69
70 private int matchLevel = 0;
71 private int parentMatchLevel = 0;
72
73 /**
74 * Constructor for string metadata keys.
75 *
76 * @param uri the uri of the namespace of the element
77 * @param localName the local name of the element
78 * @param metadata the Tika metadata object to populate
79 * @param name the Tika metadata field key
80 */
81 public ElementMetadataHandler(
82 String uri, String localName, Metadata metadata, String name) {
83 super(metadata, name);
84 this.uri = uri;
85 this.localName = localName;
86 this.metadata = metadata;
87 this.name = name;
88 this.allowDuplicateValues = false;
89 this.allowEmptyValues = false;
90 if (logger.isTraceEnabled()) {
91 logger.trace("created simple handler for " + this.name);
92 }
93 }
94
95 /**
96 * Constructor for string metadata keys which allows change of behavior
97 * for duplicate and empty entry values.
98 *
99 * @param uri the uri of the namespace of the element
100 * @param localName the local name of the element
101 * @param metadata the Tika metadata object to populate
102 * @param name the Tika metadata field key
103 * @param allowDuplicateValues add duplicate values to the Tika metadata
104 * @param allowEmptyValues add empty values to the Tika metadata
105 */
106 public ElementMetadataHandler(
107 String uri, String localName, Metadata metadata, String name, boolean allowDuplicateValues, boolean allowEmptyValues) {
108 super(metadata, name);
109 this.uri = uri;
110 this.localName = localName;
111 this.metadata = metadata;
112 this.name = name;
113 this.allowDuplicateValues = allowDuplicateValues;
114 this.allowEmptyValues = allowEmptyValues;
115 if (logger.isTraceEnabled()) {
116 logger.trace("created simple handler for " + this.name);
117 }
118 }
119
120 /**
121 * Constructor for Property metadata keys.
122 *
123 * @param uri the uri of the namespace of the element
124 * @param localName the local name of the element
125 * @param metadata the Tika metadata object to populate
126 * @param targetProperty the Tika metadata Property key
127 */
public ElementMetadataHandler(
        String uri, String localName, Metadata metadata, Property targetProperty) {
    // Identical to the six-argument Property constructor with both
    // duplicate values and empty values disallowed.
    this(uri, localName, metadata, targetProperty, false, false);
}
142
143 /**
144 * Constructor for Property metadata keys which allows change of behavior
145 * for duplicate and empty entry values.
146 *
147 * @param uri the uri of the namespace of the element
148 * @param localName the local name of the element
149 * @param metadata the Tika metadata object to populate
150 * @param targetProperty the Tika metadata Property key
151 * @param allowDuplicateValues add duplicate values to the Tika metadata
152 * @param allowEmptyValues add empty values to the Tika metadata
153 */
public ElementMetadataHandler(
        String uri, String localName, Metadata metadata, Property targetProperty,
        boolean allowDuplicateValues, boolean allowEmptyValues) {
    super(metadata, targetProperty);

    // Identity of the XML element this handler reacts to
    this.localName = localName;
    this.uri = uri;

    // Where collected values are stored; the string key is derived from the property
    this.targetProperty = targetProperty;
    this.name = targetProperty.getName();
    this.metadata = metadata;

    // Caller-selected policy for empty and repeated values
    this.allowEmptyValues = allowEmptyValues;
    this.allowDuplicateValues = allowDuplicateValues;

    if (logger.isTraceEnabled()) {
        logger.trace("created property handler for " + this.name);
    }
}
168
169 protected boolean isMatchingParentElement(String uri, String localName) {
170 return (uri.equals(this.uri) && localName.equals(this.localName));
171 }
172
173 protected boolean isMatchingElement(String uri, String localName) {
174 // match if we're inside the parent element or within some bag element
175 return (uri.equals(this.uri) && localName.equals(this.localName)) ||
176 (parentMatchLevel > 0 &&
177 ((uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_BAG)) ||
178 (uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_LI))
179 )
180 );
181 }
182
183 @Override
184 public void startElement(
185 String uri, String localName, String name, Attributes attributes) {
186 if (isMatchingElement(uri, localName)) {
187 matchLevel++;
188 }
189 if (isMatchingParentElement(uri, localName)) {
190 parentMatchLevel++;
191 }
192 }
193
194 @Override
195 public void endElement(String uri, String localName, String name) {
196 if (isMatchingParentElement(uri, localName)) {
197 parentMatchLevel--;
198 }
199 if (isMatchingElement(uri, localName)) {
200 matchLevel--;
201 if (matchLevel == 2) {
202 // we're inside a bag li element, add the bagged buffer
203 addMetadata(bufferBagged.toString().trim());
204 bufferBagged.setLength(0);
205 isBagless = false;
206 }
207 if (matchLevel == 0 && isBagless) {
208 String valueBagless = bufferBagless.toString();
209 if (valueBagless.length() > 0 && !valueBagless.contains(LOCAL_NAME_RDF_BAG)) {
210 // we're in a standard element, add the bagless buffer
211 addMetadata(valueBagless.trim());
212 bufferBagless.setLength(0);
213 }
214 isBagless = true;
215 }
216 }
217 }
218
219 @Override
220 public void characters(char[] ch, int start, int length) {
221 // We need to append to both buffers since we don't if we're inside a bag until we're done
222 if (parentMatchLevel > 0 && matchLevel > 2) {
223 bufferBagged.append(ch, start, length);
224 }
225 if (parentMatchLevel > 0 && matchLevel > 0) {
226 bufferBagless.append(ch, start, length);
227 }
228 }
229
230 @Override
231 public void ignorableWhitespace(char[] ch, int start, int length) {
232 characters(ch, start, length);
233 }
234
235 @Override
236 protected void addMetadata(String value) {
237 if (logger.isTraceEnabled()) {
238 logger.trace("adding " + name + "=" + value);
239 }
240 if (targetProperty != null && targetProperty.isMultiValuePermitted()) {
241 if ((value != null && value.length() > 0) || allowEmptyValues) {
242 if (value == null || value.length() == 0 && allowEmptyValues) {
243 value = "";
244 }
245 String[] previous = metadata.getValues(name);
246 if (previous == null || !Arrays.asList(previous).contains(value) || allowDuplicateValues) {
247 metadata.add(targetProperty, value);
248 }
249 }
250 } else {
251 super.addMetadata(value);
252 }
253 }
254 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.xml;
17
18 import org.apache.commons.codec.binary.Base64;
19 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
20 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
21 import org.apache.tika.metadata.Metadata;
22 import org.apache.tika.metadata.TikaMetadataKeys;
23 import org.apache.tika.mime.MediaType;
24 import org.apache.tika.parser.ParseContext;
25 import org.xml.sax.Attributes;
26 import org.xml.sax.ContentHandler;
27 import org.xml.sax.SAXException;
28 import org.xml.sax.helpers.DefaultHandler;
29
30 import java.io.ByteArrayInputStream;
31 import java.io.IOException;
32 import java.util.Collections;
33 import java.util.Set;
34
35 public class FictionBookParser extends XMLParser {
36 private static final long serialVersionUID = 4195954546491524374L;
37
38 @Override
39 public Set<MediaType> getSupportedTypes(ParseContext context) {
40 return Collections.singleton(MediaType.application("x-fictionbook+xml"));
41 }
42
43 @Override
44 protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata, ParseContext context) {
45 EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
46
47 if (ex == null) {
48 ex = new ParsingEmbeddedDocumentExtractor(context);
49 }
50
51 return new BinaryElementsDataHandler(ex, handler);
52 }
53
54 private static class BinaryElementsDataHandler extends DefaultHandler {
55 private static final String ELEMENT_BINARY = "binary";
56
57 private boolean binaryMode = false;
58 private static final String ATTRIBUTE_ID = "id";
59
60 private final EmbeddedDocumentExtractor partExtractor;
61 private final ContentHandler handler;
62 private final StringBuilder binaryData = new StringBuilder();
63 private Metadata metadata;
64 private static final String ATTRIBUTE_CONTENT_TYPE = "content-type";
65
66 private BinaryElementsDataHandler(EmbeddedDocumentExtractor partExtractor, ContentHandler handler) {
67 this.partExtractor = partExtractor;
68 this.handler = handler;
69 }
70
71 @Override
72 public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
73 binaryMode = ELEMENT_BINARY.equals(localName);
74 if (binaryMode) {
75 binaryData.setLength(0);
76 metadata = new Metadata();
77
78 metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, attributes.getValue(ATTRIBUTE_ID));
79 metadata.set(Metadata.CONTENT_TYPE, attributes.getValue(ATTRIBUTE_CONTENT_TYPE));
80 }
81 }
82
83 @Override
84 public void endElement(String uri, String localName, String qName) throws SAXException {
85 if (binaryMode) {
86 try {
87 partExtractor.parseEmbedded(
88 new ByteArrayInputStream(Base64.decodeBase64(binaryData.toString())),
89 handler,
90 metadata,
91 true
92 );
93 } catch (IOException e) {
94 throw new SAXException("IOException in parseEmbedded", e);
95 }
96
97 binaryMode = false;
98 binaryData.setLength(0);
99 }
100 }
101
102 @Override
103 public void characters(char[] ch, int start, int length) throws SAXException {
104 if (!binaryMode) {
105 handler.characters(ch, start, length);
106 } else {
107 binaryData.append(ch, start, length);
108 }
109 }
110
111 @Override
112 public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
113 handler.ignorableWhitespace(ch, start, length);
114 }
115 }
116 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.xml;
17
18 import org.apache.tika.metadata.Metadata;
19 import org.apache.tika.metadata.Property;
20 import org.xml.sax.Attributes;
21 import org.xml.sax.helpers.DefaultHandler;
22
23 /**
24 * This adds Metadata entries with a specified name for
25 * the textual content of a node (if present), and
26 * all attribute values passed through the matcher
27 * (but not their names).
28 *
29 * @deprecated Use the {@link AttributeMetadataHandler} and
30 * {@link ElementMetadataHandler} classes instead
31 */
32 public class MetadataHandler extends DefaultHandler {
33
34 private final Metadata metadata;
35
36 private final Property property;
37 private final String name;
38
39 private final StringBuilder buffer = new StringBuilder();
40
41 public MetadataHandler(Metadata metadata, String name) {
42 this.metadata = metadata;
43 this.property = null;
44 this.name = name;
45 }
46 public MetadataHandler(Metadata metadata, Property property) {
47 this.metadata = metadata;
48 this.property = property;
49 this.name = property.getName();
50 }
51
52 public void addMetadata(String value) {
53 if (value.length() > 0) {
54 String previous = metadata.get(name);
55 if (previous != null && previous.length() > 0) {
56 value = previous + ", " + value;
57 }
58
59 if (this.property != null) {
60 metadata.set(property, value);
61 } else {
62 metadata.set(name, value);
63 }
64 }
65 }
66
67 public void endElement(String uri, String localName, String name) {
68 addMetadata(buffer.toString());
69 buffer.setLength(0);
70 }
71
72 public void startElement(
73 String uri, String localName, String name, Attributes attributes) {
74 for (int i = 0; i < attributes.getLength(); i++) {
75 addMetadata(attributes.getValue(i));
76 }
77 }
78
79
80 public void characters(char[] ch, int start, int length) {
81 buffer.append(ch, start, length);
82 }
83
84 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.xml;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.util.Arrays;
21 import java.util.Collections;
22 import java.util.HashSet;
23 import java.util.Set;
24
25 import org.apache.tika.exception.TikaException;
26 import org.apache.tika.io.CloseShieldInputStream;
27 import org.apache.tika.metadata.Metadata;
28 import org.apache.tika.mime.MediaType;
29 import org.apache.tika.parser.AbstractParser;
30 import org.apache.tika.parser.ParseContext;
31 import org.apache.tika.sax.EmbeddedContentHandler;
32 import org.apache.tika.sax.OfflineContentHandler;
33 import org.apache.tika.sax.TaggedContentHandler;
34 import org.apache.tika.sax.TextContentHandler;
35 import org.apache.tika.sax.XHTMLContentHandler;
36 import org.xml.sax.ContentHandler;
37 import org.xml.sax.SAXException;
38
39 /**
40 * XML parser.
41 */
42 public class XMLParser extends AbstractParser {
43
44 /** Serial version UID */
45 private static final long serialVersionUID = -6028836725280212837L;
46
47 private static final Set<MediaType> SUPPORTED_TYPES =
48 Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
49 MediaType.application("xml"),
50 MediaType.image("svg+xml"))));
51
52 public Set<MediaType> getSupportedTypes(ParseContext context) {
53 return SUPPORTED_TYPES;
54 }
55
56 public void parse(
57 InputStream stream, ContentHandler handler,
58 Metadata metadata, ParseContext context)
59 throws IOException, SAXException, TikaException {
60 if (metadata.get(Metadata.CONTENT_TYPE) == null) {
61 metadata.set(Metadata.CONTENT_TYPE, "application/xml");
62 }
63
64 final XHTMLContentHandler xhtml =
65 new XHTMLContentHandler(handler, metadata);
66 xhtml.startDocument();
67 xhtml.startElement("p");
68
69 TaggedContentHandler tagged = new TaggedContentHandler(handler);
70 try {
71 context.getSAXParser().parse(
72 new CloseShieldInputStream(stream),
73 new OfflineContentHandler(new EmbeddedContentHandler(
74 getContentHandler(tagged, metadata, context))));
75 } catch (SAXException e) {
76 tagged.throwIfCauseOf(e);
77 throw new TikaException("XML parse error", e);
78 }
79
80 xhtml.endElement("p");
81 xhtml.endDocument();
82 }
83
84 protected ContentHandler getContentHandler(
85 ContentHandler handler, Metadata metadata, ParseContext context) {
86 return new TextContentHandler(handler, true);
87 }
88 }
0 # Licensed to the Apache Software Foundation (ASF) under one or more
1 # contributor license agreements. See the NOTICE file distributed with
2 # this work for additional information regarding copyright ownership.
3 # The ASF licenses this file to You under the Apache License, Version 2.0
4 # (the "License"); you may not use this file except in compliance with
5 # the License. You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 org.apache.tika.parser.microsoft.POIFSContainerDetector
16 org.apache.tika.parser.pkg.ZipContainerDetector
0 # Licensed to the Apache Software Foundation (ASF) under one or more
1 # contributor license agreements. See the NOTICE file distributed with
2 # this work for additional information regarding copyright ownership.
3 # The ASF licenses this file to You under the Apache License, Version 2.0
4 # (the "License"); you may not use this file except in compliance with
5 # the License. You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 org.apache.tika.parser.html.HtmlEncodingDetector
16 org.apache.tika.parser.txt.UniversalEncodingDetector
17 org.apache.tika.parser.txt.Icu4jEncodingDetector
0 # Licensed to the Apache Software Foundation (ASF) under one or more
1 # contributor license agreements. See the NOTICE file distributed with
2 # this work for additional information regarding copyright ownership.
3 # The ASF licenses this file to You under the Apache License, Version 2.0
4 # (the "License"); you may not use this file except in compliance with
5 # the License. You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 org.apache.tika.parser.asm.ClassParser
16 org.apache.tika.parser.audio.AudioParser
17 org.apache.tika.parser.audio.MidiParser
18 org.apache.tika.parser.crypto.Pkcs7Parser
19 org.apache.tika.parser.dwg.DWGParser
20 org.apache.tika.parser.epub.EpubParser
21 org.apache.tika.parser.executable.ExecutableParser
22 org.apache.tika.parser.feed.FeedParser
23 org.apache.tika.parser.font.AdobeFontMetricParser
24 org.apache.tika.parser.font.TrueTypeParser
25 org.apache.tika.parser.html.HtmlParser
26 org.apache.tika.parser.image.ImageParser
27 org.apache.tika.parser.image.PSDParser
28 org.apache.tika.parser.image.TiffParser
29 org.apache.tika.parser.iptc.IptcAnpaParser
30 org.apache.tika.parser.iwork.IWorkPackageParser
31 org.apache.tika.parser.jpeg.JpegParser
32 org.apache.tika.parser.mail.RFC822Parser
33 org.apache.tika.parser.mbox.MboxParser
34 org.apache.tika.parser.microsoft.OfficeParser
35 org.apache.tika.parser.microsoft.TNEFParser
36 org.apache.tika.parser.microsoft.ooxml.OOXMLParser
37 org.apache.tika.parser.mp3.Mp3Parser
38 org.apache.tika.parser.mp4.MP4Parser
39 org.apache.tika.parser.hdf.HDFParser
40 org.apache.tika.parser.netcdf.NetCDFParser
41 org.apache.tika.parser.odf.OpenDocumentParser
42 org.apache.tika.parser.pdf.PDFParser
43 org.apache.tika.parser.pkg.CompressorParser
44 org.apache.tika.parser.pkg.PackageParser
45 org.apache.tika.parser.rtf.RTFParser
46 org.apache.tika.parser.txt.TXTParser
47 org.apache.tika.parser.video.FLVParser
48 org.apache.tika.parser.xml.DcXMLParser
49 org.apache.tika.parser.xml.FictionBookParser
50 org.apache.tika.parser.chm.ChmParser
51 org.apache.tika.parser.code.SourceCodeParser
0 <?xml version="1.0" encoding="UTF-8"?>
1 <!--
2 Licensed to the Apache Software Foundation (ASF) under one or more
3 contributor license agreements. See the NOTICE file distributed with
4 this work for additional information regarding copyright ownership.
5 The ASF licenses this file to You under the Apache License, Version 2.0
6 (the "License"); you may not use this file except in compliance with
7 the License. You may obtain a copy of the License at
8
9 http://www.apache.org/licenses/LICENSE-2.0
10
11 Unless required by applicable law or agreed to in writing, software
12 distributed under the License is distributed on an "AS IS" BASIS,
13 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 See the License for the specific language governing permissions and
15 limitations under the License.
16 -->
17 <!--
18 Description: This xml file defines external commands to be run by Tika
19 as parsers.
20 -->
21 <external-parsers>
22 <!-- This example uses ffmpeg for video metadata extraction -->
23 <parser>
24 <check>
25 <command>ffmpeg -version</command>
26 <error-codes>126,127</error-codes>
27 </check>
28 <command>ffmpeg -i ${INPUT}</command>
29 <mime-types>
30 <mime-type>video/avi</mime-type>
31 <mime-type>video/mpeg</mime-type>
32 </mime-types>
33 <metadata>
34 <match key="xmpDM:audioChannelType">Stream.*? Audio:.*? Hz, (\w+),</match>
35 <match key="xmpDM:audioCompressor">Stream.*? Audio: (\w+),</match>
36 </metadata>
37 </parser>
38 </external-parsers>
0 # Licensed to the Apache Software Foundation (ASF) under one or more
1 # contributor license agreements. See the NOTICE file distributed with
2 # this work for additional information regarding copyright ownership.
3 # The ASF licenses this file to You under the Apache License, Version 2.0
4 # (the "License"); you may not use this file except in compliance with
5 # the License. You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 enableAutospace true
16 extractAnnotationText true
17 sortByPosition false
18 suppressDuplicateOverlappingText false
19 useNonSequentialParser false
20 extractAcroFormContent true
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertTrue;
20
21 import java.io.File;
22 import java.io.FileInputStream;
23 import java.io.InputStream;
24
25 import org.apache.tika.config.TikaConfig;
26 import org.apache.tika.metadata.Metadata;
27 import org.apache.tika.metadata.TikaCoreProperties;
28 import org.apache.tika.parser.ParseContext;
29 import org.apache.tika.parser.Parser;
30 import org.junit.Before;
31 import org.junit.Test;
32 import org.xml.sax.helpers.DefaultHandler;
33
34 /**
35 * Junit test class for Tika {@link Parser}s.
36 */
37 public class TestParsers extends TikaTest {
38
39 private TikaConfig tc;
40
41 private Tika tika;
42
43 @Before
44 public void setUp() throws Exception {
45 tc = TikaConfig.getDefaultConfig();
46 tika = new Tika(tc);
47 }
48
49 @Test
50 public void testWORDxtraction() throws Exception {
51 File file = getResourceAsFile("/test-documents/testWORD.doc");
52 Parser parser = tika.getParser();
53 Metadata metadata = new Metadata();
54 InputStream stream = new FileInputStream(file);
55 try {
56 parser.parse(
57 stream, new DefaultHandler(), metadata, new ParseContext());
58 } finally {
59 stream.close();
60 }
61 assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE));
62 }
63
64 @Test
65 public void testEXCELExtraction() throws Exception {
66 final String expected = "Numbers and their Squares";
67 File file = getResourceAsFile("/test-documents/testEXCEL.xls");
68 String s1 = tika.parseToString(file);
69 assertTrue("Text does not contain '" + expected + "'", s1
70 .contains(expected));
71 Parser parser = tika.getParser();
72 Metadata metadata = new Metadata();
73 InputStream stream = new FileInputStream(file);
74 try {
75 parser.parse(
76 stream, new DefaultHandler(), metadata, new ParseContext());
77 } finally {
78 stream.close();
79 }
80 assertEquals("Simple Excel document", metadata.get(TikaCoreProperties.TITLE));
81 }
82
83 @Test
84 public void testOptionalHyphen() throws Exception {
85 String[] extensions =
86 new String[] { "ppt", "pptx", "doc", "docx", "rtf", "pdf"};
87 for (String extension : extensions) {
88 File file = getResourceAsFile("/test-documents/testOptionalHyphen." + extension);
89 String content = tika.parseToString(file);
90 assertTrue("optional hyphen was not handled for '" + extension + "' file type: " + content,
91 content.contains("optionalhyphen") ||
92 content.contains("optional\u00adhyphen") || // soft hyphen
93 content.contains("optional\u200bhyphen") || // zero width space
94 content.contains("optional\u2027")); // hyphenation point
95
96 }
97 }
98
99 private void verifyComment(String extension, String fileName) throws Exception {
100 File file = getResourceAsFile("/test-documents/" + fileName + "." + extension);
101 String content = tika.parseToString(file);
102 assertTrue(extension + ": content=" + content + " did not extract text",
103 content.contains("Here is some text"));
104 assertTrue(extension + ": content=" + content + " did not extract comment",
105 content.contains("Here is a comment"));
106 }
107
108 @Test
109 public void testComment() throws Exception {
110 final String[] extensions = new String[] {"ppt", "pptx", "doc", "docx", "pdf", "rtf"};
111 for(String extension : extensions) {
112 verifyComment(extension, "testComment");
113 }
114 }
115 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika;
17
18 import static org.junit.Assert.assertTrue;
19 import static org.junit.Assert.fail;
20
21 import java.io.File;
22 import java.io.InputStream;
23 import java.net.URISyntaxException;
24 import java.net.URL;
25
26 import org.apache.tika.metadata.Metadata;
27 import org.apache.tika.parser.AutoDetectParser;
28 import org.apache.tika.parser.ParseContext;
29 import org.apache.tika.parser.Parser;
30 import org.apache.tika.sax.BodyContentHandler;
31 import org.apache.tika.sax.ToXMLContentHandler;
32 import org.xml.sax.ContentHandler;
33
34 /**
35 * Parent class of Tika tests
36 */
37 public abstract class TikaTest {
38 /**
39 * This method will give you back the filename incl. the absolute path name
40 * to the resource. If the resource does not exist it will give you back the
41 * resource name incl. the path.
42 *
43 * @param name
44 * The named resource to search for.
45 * @return an absolute path incl. the name which is in the same directory as
46 * the the class you've called it from.
47 */
48 public File getResourceAsFile(String name) throws URISyntaxException {
49 URL url = this.getClass().getResource(name);
50 if (url != null) {
51 return new File(url.toURI());
52 } else {
53 // We have a file which does not exists
54 // We got the path
55 url = this.getClass().getResource(".");
56 File file = new File(new File(url.toURI()), name);
57 if (file == null) {
58 fail("Unable to find requested file " + name);
59 }
60 return file;
61 }
62 }
63
64 public InputStream getResourceAsStream(String name) {
65 InputStream stream = this.getClass().getResourceAsStream(name);
66 if (stream == null) {
67 fail("Unable to find requested resource " + name);
68 }
69 return stream;
70 }
71
72 public void assertContains(String needle, String haystack) {
73 assertTrue(needle + " not found in:\n" + haystack, haystack.contains(needle));
74 }
75
76 protected static class XMLResult {
77 public final String xml;
78 public final Metadata metadata;
79
80 public XMLResult(String xml, Metadata metadata) {
81 this.xml = xml;
82 this.metadata = metadata;
83 }
84 }
85
86 protected XMLResult getXML(String filePath) throws Exception {
87 return getXML(getResourceAsStream("/test-documents/" + filePath), new AutoDetectParser(), new Metadata());
88 }
89
90 protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata) throws Exception {
91 ParseContext context = new ParseContext();
92 context.set(Parser.class, parser);
93
94 try {
95 ContentHandler handler = new ToXMLContentHandler();
96 parser.parse(input, handler, metadata, context);
97 return new XMLResult(handler.toString(), metadata);
98 } finally {
99 input.close();
100 }
101 }
102
103 /**
104 * Basic text extraction.
105 * <p>
106 * Tries to close input stream after processing.
107 */
108 public String getText(InputStream is, Parser parser, ParseContext context, Metadata metadata) throws Exception{
109 ContentHandler handler = new BodyContentHandler(1000000);
110 try {
111 parser.parse(is, handler, metadata, context);
112 } finally {
113 is.close();
114 }
115 return handler.toString();
116 }
117
118 public String getText(InputStream is, Parser parser, Metadata metadata) throws Exception{
119 return getText(is, parser, new ParseContext(), metadata);
120 }
121
122 public String getText(InputStream is, Parser parser, ParseContext context) throws Exception{
123 return getText(is, parser, context, new Metadata());
124 }
125
126 public String getText(InputStream is, Parser parser) throws Exception{
127 return getText(is, parser, new ParseContext(), new Metadata());
128 }
129
130
131 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.detect;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertNull;
20 import static org.junit.Assert.assertTrue;
21
22 import java.io.File;
23 import java.io.FilenameFilter;
24 import java.io.IOException;
25 import java.io.InputStream;
26
27 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
28 import org.apache.tika.io.TikaInputStream;
29 import org.apache.tika.metadata.Metadata;
30 import org.apache.tika.mime.MediaType;
31 import org.junit.Test;
32
33 /**
34 * Junit test class for {@link ContainerAwareDetector}
35 */
36 public class TestContainerAwareDetector {
37
38 private final Detector detector = new DefaultDetector();
39
40 private void assertTypeByData(String file, String type) throws Exception {
41 assertTypeByNameAndData(file, null, type);
42 }
43 private void assertTypeByNameAndData(String file, String type) throws Exception {
44 assertTypeByNameAndData(file, file, type);
45 }
46 private void assertType(String file, String byData, String byNameAndData) throws Exception {
47 assertTypeByData(file, byData);
48 assertTypeByNameAndData(file, byNameAndData);
49 }
50 private void assertTypeByNameAndData(String dataFile, String name, String type) throws Exception {
51 TikaInputStream stream = TikaInputStream.get(
52 TestContainerAwareDetector.class.getResource(
53 "/test-documents/" + dataFile));
54 try {
55 Metadata m = new Metadata();
56 if (name != null)
57 m.add(Metadata.RESOURCE_NAME_KEY, name);
58
59 assertEquals(
60 MediaType.parse(type),
61 detector.detect(stream, m));
62 } finally {
63 stream.close();
64 }
65 }
66
67 @Test
68 public void testDetectOLE2() throws Exception {
69 // Microsoft office types known by POI
70 assertTypeByData("testEXCEL.xls", "application/vnd.ms-excel");
71 assertTypeByData("testWORD.doc", "application/msword");
72 assertTypeByData("testPPT.ppt", "application/vnd.ms-powerpoint");
73
74 assertTypeByData("test-outlook.msg", "application/vnd.ms-outlook");
75 assertTypeByData("test-outlook2003.msg", "application/vnd.ms-outlook");
76 assertTypeByData("testVISIO.vsd", "application/vnd.visio");
77 assertTypeByData("testPUBLISHER.pub", "application/x-mspublisher");
78 assertTypeByData("testWORKS.wps", "application/vnd.ms-works");
79 assertTypeByData("testWORKS2000.wps", "application/vnd.ms-works");
80 // older Works Word Processor files can't be recognized
81 // they were created with Works Word Processor 7.0 (hence the text inside)
82 // and exported to the older formats with the "Save As" feature
83 assertTypeByData("testWORKSWordProcessor3.0.wps","application/vnd.ms-works");
84 assertTypeByData("testWORKSWordProcessor4.0.wps","application/vnd.ms-works");
85 assertTypeByData("testWORKSSpreadsheet7.0.xlr", "application/x-tika-msworks-spreadsheet");
86 assertTypeByData("testPROJECT2003.mpp", "application/vnd.ms-project");
87 assertTypeByData("testPROJECT2007.mpp", "application/vnd.ms-project");
88 // Excel95 can be detected by not parsed
89 assertTypeByData("testEXCEL_95.xls", "application/vnd.ms-excel");
90
91 // Try some ones that POI doesn't handle, that are still OLE2 based
92 assertTypeByData("testCOREL.shw", "application/x-corelpresentations");
93 assertTypeByData("testQUATTRO.qpw", "application/x-quattro-pro");
94 assertTypeByData("testQUATTRO.wb3", "application/x-quattro-pro");
95
96
97 // With the filename and data
98 assertTypeByNameAndData("testEXCEL.xls", "application/vnd.ms-excel");
99 assertTypeByNameAndData("testWORD.doc", "application/msword");
100 assertTypeByNameAndData("testPPT.ppt", "application/vnd.ms-powerpoint");
101
102 // With the wrong filename supplied, data will trump filename
103 assertTypeByNameAndData("testEXCEL.xls", "notWord.doc", "application/vnd.ms-excel");
104 assertTypeByNameAndData("testWORD.doc", "notExcel.xls", "application/msword");
105 assertTypeByNameAndData("testPPT.ppt", "notWord.doc", "application/vnd.ms-powerpoint");
106
107 // With a filename of a totally different type, data will trump filename
108 assertTypeByNameAndData("testEXCEL.xls", "notPDF.pdf", "application/vnd.ms-excel");
109 assertTypeByNameAndData("testEXCEL.xls", "notPNG.png", "application/vnd.ms-excel");
110 }
111
112 /**
113 * There is no way to distinguish "proper" StarOffice files from templates.
114 * All templates have the same extension but their actual type depends on
115 * the magic. Our current MimeTypes class doesn't allow us to use the same
116 * glob pattern in more than one mimetype.
117 *
118 * @throws Exception
119 */
120 @Test
121 public void testDetectStarOfficeFiles() throws Exception {
122 assertType("testStarOffice-5.2-calc.sdc",
123 "application/vnd.stardivision.calc",
124 "application/vnd.stardivision.calc");
125 assertType("testVORCalcTemplate.vor",
126 "application/vnd.stardivision.calc",
127 "application/vnd.stardivision.calc");
128 assertType("testStarOffice-5.2-draw.sda",
129 "application/vnd.stardivision.draw",
130 "application/vnd.stardivision.draw");
131 assertType("testVORDrawTemplate.vor",
132 "application/vnd.stardivision.draw",
133 "application/vnd.stardivision.draw");
134 assertType("testStarOffice-5.2-impress.sdd",
135 "application/vnd.stardivision.impress",
136 "application/vnd.stardivision.impress");
137 assertType("testVORImpressTemplate.vor",
138 "application/vnd.stardivision.impress",
139 "application/vnd.stardivision.impress");
140 assertType("testStarOffice-5.2-writer.sdw",
141 "application/vnd.stardivision.writer",
142 "application/vnd.stardivision.writer");
143 assertType("testVORWriterTemplate.vor",
144 "application/vnd.stardivision.writer",
145 "application/vnd.stardivision.writer");
146
147 }
148
149 @Test
150 public void testOpenContainer() throws Exception {
151 TikaInputStream stream = TikaInputStream.get(
152 TestContainerAwareDetector.class.getResource(
153 "/test-documents/testPPT.ppt"));
154 try {
155 assertNull(stream.getOpenContainer());
156 assertEquals(
157 MediaType.parse("application/vnd.ms-powerpoint"),
158 detector.detect(stream, new Metadata()));
159 assertTrue(stream.getOpenContainer() instanceof NPOIFSFileSystem);
160 } finally {
161 stream.close();
162 }
163 }
164
165 /**
166 * EPub uses a similar mimetype entry to OpenDocument for storing
167 * the mimetype within the parent zip file
168 */
169 @Test
170 public void testDetectEPub() throws Exception {
171 assertTypeByData("testEPUB.epub", "application/epub+zip");
172 assertTypeByData("testiBooks.ibooks", "application/x-ibooks+zip");
173 }
174
175 @Test
176 public void testDetectLotusNotesEml() throws Exception {
177 // Lotus .eml files aren't guaranteed to have any of the magic
178 // matches as the first line, but should have X-Notes-Item and Message-ID
179 assertTypeByData("testLotusEml.eml", "message/rfc822");
180 }
181
182 @Test
183 public void testDetectODF() throws Exception {
184 assertTypeByData("testODFwithOOo3.odt", "application/vnd.oasis.opendocument.text");
185 assertTypeByData("testOpenOffice2.odf", "application/vnd.oasis.opendocument.formula");
186 }
187
188 @Test
189 public void testDetectOOXML() throws Exception {
190 assertTypeByData("testEXCEL.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
191 assertTypeByData("testWORD.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
192 assertTypeByData("testPPT.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation");
193
194 // Check some of the less common OOXML types
195 assertTypeByData("testPPT.pptm", "application/vnd.ms-powerpoint.presentation.macroenabled.12");
196 assertTypeByData("testPPT.ppsx", "application/vnd.openxmlformats-officedocument.presentationml.slideshow");
197 assertTypeByData("testPPT.ppsm", "application/vnd.ms-powerpoint.slideshow.macroEnabled.12");
198 assertTypeByData("testDOTM.dotm", "application/vnd.ms-word.template.macroEnabled.12");
199
200 // .xlsb is an OOXML file containing the binary parts, and not
201 // an OLE2 file as you might initially expect!
202 assertTypeByData("testEXCEL.xlsb", "application/vnd.ms-excel.sheet.binary.macroEnabled.12");
203
204 // With the filename and data
205 assertTypeByNameAndData("testEXCEL.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
206 assertTypeByNameAndData("testWORD.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
207 assertTypeByNameAndData("testPPT.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation");
208
209 // With the wrong filename supplied, data will trump filename
210 assertTypeByNameAndData("testEXCEL.xlsx", "notWord.docx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
211 assertTypeByNameAndData("testWORD.docx", "notExcel.xlsx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
212 assertTypeByNameAndData("testPPT.pptx", "notWord.docx", "application/vnd.openxmlformats-officedocument.presentationml.presentation");
213
214 // With an incorrect filename of a different container type, data trumps filename
215 assertTypeByNameAndData("testEXCEL.xlsx", "notOldExcel.xls", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
216 }
217
218 /**
219 * Password Protected OLE2 files are fairly straightforward to detect, as they
220 * have the same structure as regular OLE2 files. (Core streams may be encrypted
221 * however)
222 */
223 @Test
224 public void testDetectProtectedOLE2() throws Exception {
225 assertTypeByData("testEXCEL_protected_passtika.xls", "application/vnd.ms-excel");
226 assertTypeByData("testWORD_protected_passtika.doc", "application/msword");
227 assertTypeByData("testPPT_protected_passtika.ppt", "application/vnd.ms-powerpoint");
228 assertTypeByNameAndData("testEXCEL_protected_passtika.xls", "application/vnd.ms-excel");
229 assertTypeByNameAndData("testWORD_protected_passtika.doc", "application/msword");
230 assertTypeByNameAndData("testPPT_protected_passtika.ppt", "application/vnd.ms-powerpoint");
231 }
232
233 /**
234 * Password Protected OOXML files are much more tricky beasts to work with.
235 * They have a very different structure to regular OOXML files, and instead
236 * of being ZIP based they are actually an OLE2 file which contains the
237 * OOXML structure within an encrypted stream.
238 * This makes detecting them much harder...
239 */
240 @Test
241 public void testDetectProtectedOOXML() throws Exception {
242 // Encrypted Microsoft Office OOXML files have OLE magic but
243 // special streams, so we can tell they're Protected OOXML
244 assertTypeByData("testEXCEL_protected_passtika.xlsx",
245 "application/x-tika-ooxml-protected");
246 assertTypeByData("testWORD_protected_passtika.docx",
247 "application/x-tika-ooxml-protected");
248 assertTypeByData("testPPT_protected_passtika.pptx",
249 "application/x-tika-ooxml-protected");
250
251 // At the moment, we can't use the name to specialise
252 // See discussions on TIKA-790 for details
253 assertTypeByNameAndData("testEXCEL_protected_passtika.xlsx",
254 "application/x-tika-ooxml-protected");
255 assertTypeByNameAndData("testWORD_protected_passtika.docx",
256 "application/x-tika-ooxml-protected");
257 assertTypeByNameAndData("testPPT_protected_passtika.pptx",
258 "application/x-tika-ooxml-protected");
259 }
260
261 /**
262 * Check that temporary files created by Tika are removed after
263 * closing TikaInputStream.
264 */
265 @Test
266 public void testRemovalTempfiles() throws Exception {
267 assertRemovalTempfiles("testWORD.docx");
268 assertRemovalTempfiles("test-documents.zip");
269 }
270
271 private int countTemporaryFiles() {
272 return new File(System.getProperty("java.io.tmpdir")).listFiles(
273 new FilenameFilter() {
274 public boolean accept(File dir, String name) {
275 return name.startsWith("apache-tika-");
276 }
277 }).length;
278 }
279
280 private void assertRemovalTempfiles(String fileName) throws Exception {
281 int numberOfTempFiles = countTemporaryFiles();
282
283 TikaInputStream stream = TikaInputStream.get(
284 TestContainerAwareDetector.class.getResource(
285 "/test-documents/" + fileName));
286 try {
287 detector.detect(stream, new Metadata());
288 } finally {
289 stream.close();
290 }
291
292 assertEquals(numberOfTempFiles, countTemporaryFiles());
293 }
294
295 @Test
296 public void testDetectIWork() throws Exception {
297 assertTypeByData("testKeynote.key", "application/vnd.apple.keynote");
298 assertTypeByData("testNumbers.numbers", "application/vnd.apple.numbers");
299 assertTypeByData("testPages.pages", "application/vnd.apple.pages");
300 }
301
302 @Test
303 public void testDetectKMZ() throws Exception {
304 assertTypeByData("testKMZ.kmz", "application/vnd.google-earth.kmz");
305 }
306
307 @Test
308 public void testDetectIPA() throws Exception {
309 assertTypeByNameAndData("testIPA.ipa", "application/x-itunes-ipa");
310 assertTypeByData("testIPA.ipa", "application/x-itunes-ipa");
311 }
312
313 @Test
314 public void testDetectZip() throws Exception {
315 assertTypeByData("test-documents.zip", "application/zip");
316 assertTypeByData("test-zip-of-zip.zip", "application/zip");
317
318 // JAR based formats
319 assertTypeByData("testJAR.jar", "application/java-archive");
320 assertTypeByData("testWAR.war", "application/x-tika-java-web-archive");
321 assertTypeByData("testEAR.ear", "application/x-tika-java-enterprise-archive");
322 assertTypeByData("testAPK.apk", "application/vnd.android.package-archive");
323 }
324
325 private TikaInputStream getTruncatedFile(String name, int n)
326 throws IOException {
327 InputStream input =
328 TestContainerAwareDetector.class.getResourceAsStream(
329 "/test-documents/" + name);
330 try {
331 byte[] bytes = new byte[n];
332 int m = 0;
333 while (m < bytes.length) {
334 int i = input.read(bytes, m, bytes.length - m);
335 if (i != -1) {
336 m += i;
337 } else {
338 throw new IOException("Unexpected end of stream");
339 }
340 }
341 return TikaInputStream.get(bytes);
342 } finally {
343 input.close();
344 }
345 }
346
347 @Test
348 public void testTruncatedFiles() throws Exception {
349 // First up a truncated OOXML (zip) file
350
351 // With only the data supplied, the best we can do is the container
352 TikaInputStream xlsx = getTruncatedFile("testEXCEL.xlsx", 300);
353 Metadata m = new Metadata();
354 try {
355 assertEquals(
356 MediaType.application("x-tika-ooxml"),
357 detector.detect(xlsx, m));
358 } finally {
359 xlsx.close();
360 }
361
362 // With truncated data + filename, we can use the filename to specialise
363 xlsx = getTruncatedFile("testEXCEL.xlsx", 300);
364 m = new Metadata();
365 m.add(Metadata.RESOURCE_NAME_KEY, "testEXCEL.xlsx");
366 try {
367 assertEquals(
368 MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
369 detector.detect(xlsx, m));
370 } finally {
371 xlsx.close();
372 }
373
374
375 // Now a truncated OLE2 file
376 TikaInputStream xls = getTruncatedFile("testEXCEL.xls", 400);
377 m = new Metadata();
378 try {
379 assertEquals(
380 MediaType.application("x-tika-msoffice"),
381 detector.detect(xls, m));
382 } finally {
383 xls.close();
384 }
385
386 // Finally a truncated OLE2 file, with a filename available
387 xls = getTruncatedFile("testEXCEL.xls", 400);
388 m = new Metadata();
389 m.add(Metadata.RESOURCE_NAME_KEY, "testEXCEL.xls");
390 try {
391 assertEquals(
392 MediaType.application("vnd.ms-excel"),
393 detector.detect(xls, m));
394 } finally {
395 xls.close();
396 }
397 }
398
399 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.embedder;
17
18 import static org.junit.Assert.assertNotNull;
19 import static org.junit.Assert.assertTrue;
20 import static org.junit.Assert.fail;
21
22 import java.io.ByteArrayOutputStream;
23 import java.io.File;
24 import java.io.FileInputStream;
25 import java.io.FileNotFoundException;
26 import java.io.FileOutputStream;
27 import java.io.IOException;
28 import java.io.InputStream;
29 import java.io.OutputStreamWriter;
30 import java.net.URISyntaxException;
31 import java.net.URL;
32 import java.text.DateFormat;
33 import java.text.SimpleDateFormat;
34 import java.util.Date;
35 import java.util.HashMap;
36 import java.util.Map;
37
38 import org.apache.tika.embedder.Embedder;
39 import org.apache.tika.embedder.ExternalEmbedder;
40 import org.apache.tika.exception.TikaException;
41 import org.apache.tika.io.TemporaryResources;
42 import org.apache.tika.io.TikaInputStream;
43 import org.apache.tika.metadata.Metadata;
44 import org.apache.tika.metadata.Property;
45 import org.apache.tika.metadata.TikaCoreProperties;
46 import org.apache.tika.parser.ParseContext;
47 import org.apache.tika.parser.Parser;
48 import org.apache.tika.parser.txt.TXTParser;
49 import org.apache.tika.sax.BodyContentHandler;
50 import org.junit.Test;
51 import org.xml.sax.ContentHandler;
52 import org.xml.sax.SAXException;
53
54 /**
55 * Unit test for {@link ExternalEmbedder}s.
56 */
57 public class ExternalEmbedderTest {
58
59 protected static final DateFormat EXPECTED_METADATA_DATE_FORMATTER =
60 new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss");
61 protected static final String DEFAULT_CHARSET = "UTF-8";
62 private static final String COMMAND_METADATA_ARGUMENT_DESCRIPTION = "dc:description";
63 private static final String TEST_TXT_PATH = "/test-documents/testTXT.txt";
64
65 private TemporaryResources tmp = new TemporaryResources();
66
67 /**
68 * Gets the expected returned metadata value for the given field
69 *
70 * @param fieldName
71 * @return a prefix added to the field name
72 */
73 protected String getExpectedMetadataValueString(String fieldName, Date timestamp) {
74 return this.getClass().getSimpleName() + " embedded " + fieldName +
75 " on " + EXPECTED_METADATA_DATE_FORMATTER.format(timestamp);
76 }
77
78 /**
79 * Gets the tika <code>Metadata</code> object containing data to be
80 * embedded.
81 *
82 * @return the populated tika metadata object
83 */
84 protected Metadata getMetadataToEmbed(Date timestamp) {
85 Metadata metadata = new Metadata();
86 metadata.add(TikaCoreProperties.DESCRIPTION,
87 getExpectedMetadataValueString(TikaCoreProperties.DESCRIPTION.toString(), timestamp));
88 return metadata;
89 }
90
91 /**
92 * Gets the <code>Embedder</code> to test.
93 *
94 * @return the embedder under test
95 */
96 protected Embedder getEmbedder() {
97 ExternalEmbedder embedder = new ExternalEmbedder();
98 Map<Property, String[]> metadataCommandArguments = new HashMap<Property, String[]>(1);
99 metadataCommandArguments.put(TikaCoreProperties.DESCRIPTION,
100 new String[] { COMMAND_METADATA_ARGUMENT_DESCRIPTION });
101 embedder.setMetadataCommandArguments(metadataCommandArguments);
102 return embedder;
103 }
104
105 /**
106 * Gets the source input stream through standard Java resource loaders
107 * before metadata has been embedded.
108 *
109 * @return a fresh input stream
110 */
111 protected InputStream getSourceStandardInputStream() {
112 return this.getClass().getResourceAsStream(TEST_TXT_PATH);
113 }
114
115 /**
116 * Gets the source input stream via {@link TikaInputStream}
117 * before metadata has been embedded.
118 *
119 * @return a fresh input stream
120 * @throws FileNotFoundException
121 */
122 protected InputStream getSourceTikaInputStream() throws FileNotFoundException {
123 return TikaInputStream.get(getSourceInputFile());
124 }
125
126 /**
127 * Gets the source input file through standard Java resource loaders
128 * before metadata has been embedded.
129 *
130 * @return a fresh input stream
131 * @throws FileNotFoundException
132 */
133 protected File getSourceInputFile() throws FileNotFoundException {
134 URL origUrl = this.getClass().getResource(TEST_TXT_PATH);
135 if (origUrl == null) {
136 throw new FileNotFoundException("could not load " + TEST_TXT_PATH);
137 }
138 try {
139 return new File(origUrl.toURI());
140 } catch (URISyntaxException e) {
141 throw new FileNotFoundException(e.getMessage());
142 }
143 }
144
145 /**
146 * Gets the parser to use to verify the result of the embed operation.
147 *
148 * @return the parser to read embedded metadata
149 */
150 protected Parser getParser() {
151 return new TXTParser();
152 }
153
154 /**
155 * Whether or not the final result of reading the now embedded metadata is
156 * expected in the output of the external tool
157 *
158 * @return whether or not results are expected in command line output
159 */
160 protected boolean getIsMetadataExpectedInOutput() {
161 return true;
162 }
163
164 /**
165 * Tests embedding metadata then reading metadata to verify the results.
166 *
167 * @param isResultExpectedInOutput whether or not results are expected in command line output
168 */
169 protected void embedInTempFile(InputStream sourceInputStream, boolean isResultExpectedInOutput) {
170 Embedder embedder = getEmbedder();
171
172 // TODO Move this check to ExternalEmbedder
173 String os = System.getProperty("os.name", "");
174 if (os.contains("Windows")) {
175 // Skip test on Windows
176 return;
177 }
178
179 Date timestamp = new Date();
180 Metadata metadataToEmbed = getMetadataToEmbed(timestamp);
181
182 try {
183 File tempOutputFile = tmp.createTemporaryFile();
184 FileOutputStream tempFileOutputStream = new FileOutputStream(tempOutputFile);
185
186 // Embed the metadata into a copy of the original output stream
187 embedder.embed(metadataToEmbed, sourceInputStream, tempFileOutputStream, null);
188
189 ParseContext context = new ParseContext();
190 Parser parser = getParser();
191 context.set(Parser.class, parser);
192
193 // Setup the extracting content handler
194 ByteArrayOutputStream result = new ByteArrayOutputStream();
195 OutputStreamWriter outputWriter = new OutputStreamWriter(result,DEFAULT_CHARSET);
196 ContentHandler handler = new BodyContentHandler(outputWriter);
197
198 // Create a new metadata object to read the new metadata into
199 Metadata embeddedMetadata = new Metadata();
200
201 // Setup a re-read of the now embeded temp file
202 FileInputStream embeddedFileInputStream = new FileInputStream(tempOutputFile);
203
204 parser.parse(embeddedFileInputStream, handler, embeddedMetadata,
205 context);
206
207 tmp.dispose();
208
209 String outputString = null;
210 if (isResultExpectedInOutput) {
211 outputString = result.toString(DEFAULT_CHARSET);
212 } else {
213 assertTrue("no metadata found", embeddedMetadata.size() > 0);
214 }
215
216 // Check each metadata property for the expected value
217 for (String metadataName : metadataToEmbed.names()) {
218 if (metadataToEmbed.get(metadataName) != null) {
219 String expectedValue = metadataToEmbed.get(metadataName);
220 boolean foundExpectedValue = false;
221 if (isResultExpectedInOutput) {
222 // just check that the entire output contains the expected string
223 foundExpectedValue = outputString.contains(expectedValue);
224 } else {
225 if (embeddedMetadata.isMultiValued(metadataName)) {
226 for (String embeddedValue : embeddedMetadata.getValues(metadataName)) {
227 if (embeddedValue != null) {
228 if (embeddedValue.contains(expectedValue)) {
229 foundExpectedValue = true;
230 break;
231 }
232 }
233 }
234 } else {
235 String embeddedValue = embeddedMetadata.get(metadataName);
236 assertNotNull("expected metadata for "
237 + metadataName + " not found",
238 embeddedValue);
239 foundExpectedValue = embeddedValue.contains(expectedValue);
240 }
241 }
242 assertTrue(
243 "result did not contain expected appended metadata "
244 + metadataName + "="
245 + expectedValue,
246 foundExpectedValue);
247 }
248 }
249 } catch (IOException e) {
250 fail(e.getMessage());
251 } catch (TikaException e) {
252 fail(e.getMessage());
253 } catch (SAXException e) {
254 fail(e.getMessage());
255 }
256 }
257
258 protected void checkSourceFileExists() {
259 String message = "the original input file was deleted";
260 try {
261 File origInputFile = getSourceInputFile();
262 assertNotNull(message, origInputFile);
263 assertTrue(message, origInputFile.exists());
264 } catch (FileNotFoundException e) {
265 fail(message + ": " + e.getMessage());
266 }
267 }
268
269 /**
270 * Tests embedding using an input stream obtained via {@link ExternalEmbedderTest#getSourceStandardInputStream()}
271 *
272 * @throws IOException
273 */
274 @Test
275 public void testEmbedStandardInputStream() throws IOException {
276 embedInTempFile(getSourceStandardInputStream(), getIsMetadataExpectedInOutput());
277 checkSourceFileExists();
278 }
279
280 /**
281 * Tests embedding using an input stream obtained via {@link ExternalEmbedderTest#getSourceTikaInputStream()}
282 *
283 * @throws IOException
284 */
285 @Test
286 public void testEmbedTikaInputStream() throws IOException {
287 embedInTempFile(getSourceTikaInputStream(), getIsMetadataExpectedInOutput());
288 checkSourceFileExists();
289 }
290
291 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.mime;
17
18 import static org.junit.Assert.assertFalse;
19 import static org.junit.Assert.assertTrue;
20 import static org.junit.Assert.fail;
21
22 import org.junit.Before;
23 import org.junit.Test;
24
25 public class MimeTypeTest {
26
27 private MimeTypes types;
28 private MimeType text;
29
30 @Before
31 public void setUp() throws MimeTypeException {
32 types = new MimeTypes();
33 text = types.forName("text/plain");
34 }
35
36 /** Test MimeType constructor */
37 @Test
38 public void testConstrctor() {
39 // Missing name
40 try {
41 new MimeType(null);
42 fail("Expected IllegalArgumentException");
43 } catch (IllegalArgumentException e) {
44 // expected result
45 }
46 }
47
48 @Test
49 public void testIsValidName() {
50 assertTrue(MimeType.isValid("application/octet-stream"));
51 assertTrue(MimeType.isValid("text/plain"));
52 assertTrue(MimeType.isValid("foo/bar"));
53 assertTrue(MimeType.isValid("a/b"));
54
55 assertFalse(MimeType.isValid("application"));
56 assertFalse(MimeType.isValid("application/"));
57 assertFalse(MimeType.isValid("/"));
58 assertFalse(MimeType.isValid("/octet-stream"));
59 assertFalse(MimeType.isValid("application//octet-stream"));
60 assertFalse(MimeType.isValid("application/octet=stream"));
61 assertFalse(MimeType.isValid("application/\u00f6ctet-stream"));
62 assertFalse(MimeType.isValid("text/plain;"));
63 assertFalse(MimeType.isValid("text/plain; charset=UTF-8"));
64 try {
65 MimeType.isValid(null);
66 fail("Expected IllegalArgumentException");
67 } catch (IllegalArgumentException e) {
68 // expected result
69 }
70 }
71
72 /** Test MimeType setDescription() */
73 @Test
74 public void testSetEmptyValues() {
75 try {
76 text.setDescription(null);
77 fail("Expected IllegalArgumentException");
78 } catch (IllegalArgumentException e) {
79 // expected result
80 }
81
82 try {
83 text.setAcronym(null);
84 fail("Expected IllegalArgumentException");
85 } catch (IllegalArgumentException e) {
86 // expected result
87 }
88
89 try {
90 text.addLink(null);
91 fail("Expected IllegalArgumentException");
92 } catch (IllegalArgumentException e) {
93 // expected result
94 }
95
96 try {
97 text.setUniformTypeIdentifier(null);
98 fail("Expected IllegalArgumentException");
99 } catch (IllegalArgumentException e) {
100 // expected result
101 }
102 }
103
104 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.mime;
17
18 import static org.apache.tika.mime.MediaType.OCTET_STREAM;
19 import static org.apache.tika.mime.MediaType.TEXT_PLAIN;
20 import static org.junit.Assert.assertEquals;
21 import static org.junit.Assert.assertFalse;
22 import static org.junit.Assert.assertNotNull;
23 import static org.junit.Assert.assertNull;
24 import static org.junit.Assert.assertTrue;
25 import static org.junit.Assert.fail;
26
27 import org.junit.Before;
28 import org.junit.Test;
29
30 public class MimeTypesTest {
31
32 private MimeTypes types;
33
34 private MediaTypeRegistry registry;
35
36 private MimeType binary;
37
38 private MimeType text;
39
40 private MimeType html;
41
42 @Before
43 public void setUp() throws MimeTypeException {
44 types = new MimeTypes();
45 registry = types.getMediaTypeRegistry();
46 binary = types.forName("application/octet-stream");
47 text = types.forName("text/plain");
48 types.addAlias(text, MediaType.parse("text/x-plain"));
49 html = types.forName("text/html");
50 types.setSuperType(html, TEXT_PLAIN);
51 }
52
53 @Test
54 public void testForName() throws MimeTypeException {
55 assertEquals(text, types.forName("text/plain"));
56 assertEquals(text, types.forName("TEXT/PLAIN"));
57
58 try {
59 types.forName("invalid");
60 fail("MimeTypeException not thrown on invalid type name");
61 } catch (MimeTypeException e) {
62 // expected
63 }
64 }
65
66 @Test
67 public void testRegisteredMimes() throws MimeTypeException {
68 String dummy = "text/xxxxx";
69 assertEquals(text, types.getRegisteredMimeType("text/plain"));
70 assertNull(types.getRegisteredMimeType(dummy));
71 assertNotNull(types.forName(dummy));
72 assertEquals(dummy, types.forName("text/xxxxx").getType().toString());
73 assertEquals(dummy, types.getRegisteredMimeType("text/xxxxx").getType().toString());
74
75 try {
76 types.forName("invalid");
77 fail("MimeTypeException not thrown on invalid type name");
78 } catch (MimeTypeException e) {
79 // expected
80 }
81 }
82
83 @Test
84 public void testSuperType() throws MimeTypeException {
85 assertNull(registry.getSupertype(OCTET_STREAM));
86 assertEquals(OCTET_STREAM, registry.getSupertype(TEXT_PLAIN));
87 assertEquals(TEXT_PLAIN, registry.getSupertype(html.getType()));
88 }
89
90 @Test
91 public void testIsDescendantOf() {
92 assertFalse(registry.isSpecializationOf(OCTET_STREAM, OCTET_STREAM));
93 assertFalse(registry.isSpecializationOf(TEXT_PLAIN, TEXT_PLAIN));
94 assertFalse(registry.isSpecializationOf(html.getType(), html.getType()));
95
96 assertTrue(registry.isSpecializationOf(html.getType(), OCTET_STREAM));
97 assertFalse(registry.isSpecializationOf(OCTET_STREAM, html.getType()));
98
99 assertTrue(registry.isSpecializationOf(html.getType(), TEXT_PLAIN));
100 assertFalse(registry.isSpecializationOf(TEXT_PLAIN, html.getType()));
101
102 assertTrue(registry.isSpecializationOf(TEXT_PLAIN, OCTET_STREAM));
103 assertFalse(registry.isSpecializationOf(OCTET_STREAM, TEXT_PLAIN));
104 }
105
106 @Test
107 public void testCompareTo() {
108 assertTrue(binary.compareTo(binary) == 0);
109 assertTrue(binary.compareTo(text) != 0);
110 assertTrue(binary.compareTo(html) != 0);
111
112 assertTrue(text.compareTo(binary) != 0);
113 assertTrue(text.compareTo(text) == 0);
114 assertTrue(text.compareTo(html) != 0);
115
116 assertTrue(html.compareTo(binary) != 0);
117 assertTrue(html.compareTo(text) != 0);
118 assertTrue(html.compareTo(html) == 0);
119 }
120
121 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.mime;
17
// Junit imports
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNotSame;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;

import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.junit.Before;
import org.junit.Test;
34
35 /**
36 *
37 * Test Suite for the {@link MimeTypes} repository.
38 *
39 */
40 public class TestMimeTypes {
41
42 private Tika tika;
43
44 private MimeTypes repo;
45
46 private URL u;
47
48 private static final File f = new File("/a/b/c/x.pdf");
49
50 @Before
51 public void setUp() throws Exception{
52 TikaConfig config = TikaConfig.getDefaultConfig();
53 repo = config.getMimeRepository();
54 tika = new Tika(config);
55 u = new URL("http://mydomain.com/x.pdf?x=y");
56 }
57
58 @Test
59 public void testCaseSensitivity() {
60 String type = tika.detect("test.PDF");
61 assertNotNull(type);
62 assertEquals(type, tika.detect("test.pdf"));
63 assertEquals(type, tika.detect("test.PdF"));
64 assertEquals(type, tika.detect("test.pdF"));
65 }
66
67 @Test
68 public void testLoadMimeTypes() throws MimeTypeException {
69 assertNotNull(repo.forName("application/octet-stream"));
70 assertNotNull(repo.forName("text/x-tex"));
71 }
72
73 /**
74 * Tests MIME type determination based solely on the URL's extension.
75 */
76 @Test
77 public void testGuessMimeTypes() throws Exception {
78 assertTypeByName("application/pdf", "x.pdf");
79 assertEquals("application/pdf", tika.detect(u.toExternalForm()));
80 assertEquals("application/pdf", tika.detect(f.getPath()));
81 assertTypeByName("text/plain", "x.txt");
82 assertTypeByName("text/html", "x.htm");
83 assertTypeByName("text/html", "x.html");
84 assertTypeByName("application/xhtml+xml", "x.xhtml");
85 assertTypeByName("application/xml", "x.xml");
86 assertTypeByName("application/zip", "x.zip");
87 assertTypeByName("application/vnd.oasis.opendocument.text", "x.odt");
88 assertTypeByName("application/octet-stream", "x.unknown");
89
90 // Test for the MS Office media types and file extensions listed in
91 // http://blogs.msdn.com/vsofficedeveloper/pages/Office-2007-Open-XML-MIME-Types.aspx
92 assertTypeByName("application/msword", "x.doc");
93 assertTypeByName("application/msword", "x.dot");
94 assertTypeByName("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "x.docx");
95 assertTypeByName("application/vnd.openxmlformats-officedocument.wordprocessingml.template", "x.dotx");
96 assertTypeByName("application/vnd.ms-word.document.macroenabled.12", "x.docm");
97 assertTypeByName("application/vnd.ms-word.template.macroenabled.12", "x.dotm");
98 assertTypeByName("application/vnd.ms-excel", "x.xls");
99 assertTypeByName("application/vnd.ms-excel", "x.xlt");
100 assertTypeByName("application/vnd.ms-excel", "x.xla");
101 assertTypeByName("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "x.xlsx");
102 assertTypeByName("application/vnd.openxmlformats-officedocument.spreadsheetml.template", "x.xltx");
103 assertTypeByName("application/vnd.ms-excel.sheet.macroenabled.12", "x.xlsm");
104 assertTypeByName("application/vnd.ms-excel.template.macroenabled.12", "x.xltm");
105 assertTypeByName("application/vnd.ms-excel.addin.macroenabled.12", "x.xlam");
106 assertTypeByName("application/vnd.ms-excel.sheet.binary.macroenabled.12", "x.xlsb");
107 assertTypeByName("application/vnd.ms-powerpoint", "x.ppt");
108 assertTypeByName("application/vnd.ms-powerpoint", "x.pot");
109 assertTypeByName("application/vnd.ms-powerpoint", "x.pps");
110 assertTypeByName("application/vnd.ms-powerpoint", "x.ppa");
111 assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.presentation", "x.pptx");
112 assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.template", "x.potx");
113 assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.slideshow", "x.ppsx");
114 assertTypeByName("application/vnd.ms-powerpoint.addin.macroenabled.12", "x.ppam");
115 assertTypeByName("application/vnd.ms-powerpoint.presentation.macroenabled.12", "x.pptm");
116 assertTypeByName("application/vnd.ms-powerpoint.template.macroenabled.12", "x.potm");
117 assertTypeByName("application/vnd.ms-powerpoint.slideshow.macroenabled.12", "x.ppsm");
118 }
119
120 /**
121 * Note - detecting container formats by mime magic is very very
122 * iffy, as we can't be sure where things will end up.
123 * People really ought to use the container aware detection...
124 */
125 @Test
126 public void testOLE2Detection() throws Exception {
127 // These have the properties block near the start, so our mime
128 // magic will spot them
129 assertTypeByData("application/vnd.ms-excel", "testEXCEL.xls");
130
131 // This one quite legitimately doesn't have its properties block
132 // as one of the first couple of entries
133 // As such, our mime magic can't figure it out...
134 assertTypeByData("application/x-tika-msoffice", "testWORD.doc");
135 assertTypeByData("application/x-tika-msoffice", "testPPT.ppt");
136
137
138 // By name + data:
139
140 // Those we got right to start with are fine
141 assertTypeByNameAndData("application/vnd.ms-excel","testEXCEL.xls");
142
143 // And the name lets us specialise the generic OOXML
144 // ones to their actual type
145 assertTypeByNameAndData("application/vnd.ms-powerpoint", "testPPT.ppt");
146 assertTypeByNameAndData("application/msword", "testWORD.doc");
147 }
148
149 /**
150 * Files generated by Works 7.0 Spreadsheet application use the OLE2
151 * structure and resemble Excel files (they contain a "Workbook"). They are
152 * not Excel though. They are distinguished from Excel files with an
153 * additional top-level entry in below the root of the POI filesystem.
154 *
155 * @throws Exception
156 */
157 @Test
158 public void testWorksSpreadsheetDetection() throws Exception {
159 assertTypeDetection("testWORKSSpreadsheet7.0.xlr",
160 // with name-only, everything should be all right
161 "application/x-tika-msworks-spreadsheet",
162 // this is possible due to MimeTypes guessing the type
163 // based on the WksSSWorkBook near the beginning of the
164 // file
165 "application/x-tika-msworks-spreadsheet",
166 // this is right, the magic-based detection works, there is
167 // no need for the name-based detection to refine it
168 "application/x-tika-msworks-spreadsheet");
169 }
170
171 @Test
172 public void testStarOfficeDetection() throws Exception {
173 assertTypeDetection("testVORCalcTemplate.vor",
174 "application/x-staroffice-template",
175 "application/vnd.stardivision.calc",
176 "application/vnd.stardivision.calc");
177 assertTypeDetection("testVORDrawTemplate.vor",
178 "application/x-staroffice-template",
179 "application/vnd.stardivision.draw",
180 "application/vnd.stardivision.draw");
181 assertTypeDetection("testVORImpressTemplate.vor",
182 "application/x-staroffice-template",
183 "application/vnd.stardivision.impress",
184 "application/vnd.stardivision.impress");
185 assertTypeDetection("testVORWriterTemplate.vor",
186 "application/x-staroffice-template",
187 "application/vnd.stardivision.writer",
188 "application/vnd.stardivision.writer");
189
190 assertTypeDetection("testStarOffice-5.2-calc.sdc",
191 "application/vnd.stardivision.calc",
192 "application/vnd.stardivision.calc",
193 "application/vnd.stardivision.calc");
194 assertTypeDetection("testStarOffice-5.2-draw.sda",
195 "application/vnd.stardivision.draw",
196 "application/vnd.stardivision.draw",
197 "application/vnd.stardivision.draw");
198 assertTypeDetection("testStarOffice-5.2-impress.sdd",
199 "application/vnd.stardivision.impress",
200 "application/vnd.stardivision.impress",
201 "application/vnd.stardivision.impress");
202 assertTypeDetection("testStarOffice-5.2-writer.sdw",
203 "application/vnd.stardivision.writer",
204 "application/vnd.stardivision.writer",
205 "application/vnd.stardivision.writer");
206 }
207
208 /**
209 * Files generated by Works Word Processor versions 3.0 and 4.0 use the
210 * OLE2 structure. They don't resemble Word though.
211 *
212 * @throws Exception
213 */
214 @Test
215 public void testOldWorksWordProcessorDetection() throws Exception {
216 assertTypeDetection(
217 "testWORKSWordProcessor3.0.wps",
218 // .wps is just like any other works extension
219 "application/vnd.ms-works",
220 // this is due to MatOST substring
221 "application/vnd.ms-works",
222 // magic-based detection works, no need to refine it
223 "application/vnd.ms-works");
224
225 // files in version 4.0 are no different from those in version 3.0
226 assertTypeDetection(
227 "testWORKSWordProcessor4.0.wps",
228 "application/vnd.ms-works",
229 "application/vnd.ms-works",
230 "application/vnd.ms-works");
231 }
232
233 /**
234 * Note - detecting container formats by mime magic is very very
235 * iffy, as we can't be sure where things will end up.
236 * People really ought to use the container aware detection...
237 */
238 @Test
239 public void testOoxmlDetection() throws Exception {
240 // These two do luckily have [Content_Types].xml near the start,
241 // so our mime magic will spot them
242 assertTypeByData("application/x-tika-ooxml", "testEXCEL.xlsx");
243 assertTypeByData("application/x-tika-ooxml", "testPPT.pptx");
244
245 // This one quite legitimately doesn't have its [Content_Types].xml
246 // file as one of the first couple of entries
247 // As such, our mime magic can't figure it out...
248 assertTypeByData("application/zip", "testWORD.docx");
249
250 // If we give the filename as well as the data, we can
251 // specialise the ooxml generic one to the correct type
252 assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "testEXCEL.xlsx");
253 assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.presentationml.presentation", "testPPT.pptx");
254 assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "testWORD.docx");
255
256 // Test a few of the less usual ones
257 assertTypeByNameAndData("application/vnd.ms-excel.sheet.binary.macroenabled.12","testEXCEL.xlsb");
258 assertTypeByNameAndData("application/vnd.ms-powerpoint.presentation.macroenabled.12", "testPPT.pptm");
259 assertTypeByNameAndData("application/vnd.ms-powerpoint.template.macroenabled.12", "testPPT.potm");
260 assertTypeByNameAndData("application/vnd.ms-powerpoint.slideshow.macroenabled.12", "testPPT.ppsm");
261 }
262
263 /**
264 * Note - detecting container formats by mime magic is very very
265 * iffy, as we can't be sure where things will end up.
266 * People really ought to use the container aware detection...
267 */
268 @Test
269 public void testIWorkDetection() throws Exception {
270 // By name is easy
271 assertTypeByName("application/vnd.apple.keynote", "testKeynote.key");
272 assertTypeByName("application/vnd.apple.numbers", "testNumbers.numbers");
273 assertTypeByName("application/vnd.apple.pages", "testPages.pages");
274
275 // We can't do it by data, as we'd need to unpack
276 // the zip file to check the XML
277 assertTypeByData("application/zip", "testKeynote.key");
278
279 assertTypeByNameAndData("application/vnd.apple.keynote", "testKeynote.key");
280 assertTypeByNameAndData("application/vnd.apple.numbers", "testNumbers.numbers");
281 assertTypeByNameAndData("application/vnd.apple.pages", "testPages.pages");
282 }
283
284 @Test
285 public void testArchiveDetection() throws Exception {
286 assertTypeByName("application/x-archive", "test.ar");
287 assertTypeByName("application/zip", "test.zip");
288 assertTypeByName("application/x-tar", "test.tar");
289 assertTypeByName("application/x-gzip", "test.tgz"); // See GZIP, not tar contents of it
290 assertTypeByName("application/x-cpio", "test.cpio");
291
292 // TODO Add an example .deb and .udeb, then check these
293
294 // Check the mime magic patterns for them work too
295 assertTypeByData("application/x-archive", "testARofText.ar");
296 assertTypeByData("application/x-archive", "testARofSND.ar");
297 assertTypeByData("application/zip", "test-documents.zip");
298 assertTypeByData("application/x-gtar", "test-documents.tar"); // GNU TAR
299 assertTypeByData("application/x-gzip", "test-documents.tgz"); // See GZIP, not tar contents of it
300 assertTypeByData("application/x-cpio", "test-documents.cpio");
301 }
302
303 @Test
304 public void testFitsDetection() throws Exception {
305 // FITS image created using imagemagick convert of testJPEG.jpg
306 assertType("application/fits", "testFITS.fits");
307 assertTypeByData("application/fits", "testFITS.fits");
308 assertTypeByName("application/fits", "testFITS.fits");
309 }
310
311 @Test
312 public void testJpegDetection() throws Exception {
313 assertType("image/jpeg", "testJPEG.jpg");
314 assertTypeByData("image/jpeg", "testJPEG.jpg");
315 assertTypeByName("image/jpeg", "x.jpg");
316 assertTypeByName("image/jpeg", "x.JPG");
317 assertTypeByName("image/jpeg", "x.jpeg");
318 assertTypeByName("image/jpeg", "x.JPEG");
319 assertTypeByName("image/jpeg", "x.jpe");
320 assertTypeByName("image/jpeg", "x.jif");
321 assertTypeByName("image/jpeg", "x.jfif");
322 assertTypeByName("image/jpeg", "x.jfi");
323 }
324
325 @Test
326 public void testTiffDetection() throws Exception {
327 assertType("image/tiff", "testTIFF.tif");
328 assertTypeByData("image/tiff", "testTIFF.tif");
329 assertTypeByName("image/tiff", "x.tiff");
330 assertTypeByName("image/tiff", "x.tif");
331 assertTypeByName("image/tiff", "x.TIF");
332 }
333
334 @Test
335 public void testGifDetection() throws Exception {
336 assertType("image/gif", "testGIF.gif");
337 assertTypeByData("image/gif", "testGIF.gif");
338 assertTypeByName("image/gif", "x.gif");
339 assertTypeByName("image/gif", "x.GIF");
340 }
341
342 @Test
343 public void testPngDetection() throws Exception {
344 assertType("image/png", "testPNG.png");
345 assertTypeByData("image/png", "testPNG.png");
346 assertTypeByName("image/png", "x.png");
347 assertTypeByName("image/png", "x.PNG");
348 }
349
350 @Test
351 public void testBmpDetection() throws Exception {
352 assertType("image/x-ms-bmp", "testBMP.bmp");
353 assertTypeByData("image/x-ms-bmp", "testBMP.bmp");
354 assertTypeByName("image/x-ms-bmp", "x.bmp");
355 assertTypeByName("image/x-ms-bmp", "x.BMP");
356 assertTypeByName("image/x-ms-bmp", "x.dib");
357 assertTypeByName("image/x-ms-bmp", "x.DIB");
358 //false positive check -- contains part of BMP signature
359 assertType("text/plain", "testBMPfp.txt");
360 }
361
362 @Test
363 public void testPnmDetection() throws Exception {
364 assertType("image/x-portable-bitmap", "testPBM.pbm");
365 assertType("image/x-portable-graymap", "testPGM.pgm");
366 assertType("image/x-portable-pixmap", "testPPM.ppm");
367 assertTypeByData("image/x-portable-bitmap", "testPBM.pbm");
368 assertTypeByData("image/x-portable-graymap", "testPGM.pgm");
369 assertTypeByData("image/x-portable-pixmap", "testPPM.ppm");
370 assertTypeByName("image/x-portable-anymap", "x.pnm");
371 assertTypeByName("image/x-portable-anymap", "x.PNM");
372 assertTypeByName("image/x-portable-bitmap", "x.pbm");
373 assertTypeByName("image/x-portable-bitmap", "x.PBM");
374 assertTypeByName("image/x-portable-graymap", "x.pgm");
375 assertTypeByName("image/x-portable-graymap", "x.PGM");
376 assertTypeByName("image/x-portable-pixmap", "x.ppm");
377 assertTypeByName("image/x-portable-pixmap", "x.PPM");
378 }
379
380 @Test
381 public void testPictDetection() throws Exception {
382 assertType("image/x-pict", "testPICT.pct");
383 assertTypeByData("image/x-pict", "testPICT.pct");
384 assertTypeByName("image/x-pict", "x.pic");
385 assertTypeByName("image/x-pict", "x.PCT");
386 }
387
388 @Test
389 public void testCgmDetection() throws Exception {
390 // TODO: Need a test image file
391 assertTypeByName("image/cgm", "x.cgm");
392 assertTypeByName("image/cgm", "x.CGM");
393 }
394
395 @Test
396 public void testRdfXmlDetection() throws Exception {
397 assertTypeByName("application/rdf+xml", "x.rdf");
398 assertTypeByName("application/rdf+xml", "x.owl");
399 }
400
401 @Test
402 public void testSvgDetection() throws Exception {
403 assertType("image/svg+xml", "testSVG.svg");
404 assertTypeByData("image/svg+xml", "testSVG.svg");
405 assertTypeByName("image/svg+xml", "x.svg");
406 assertTypeByName("image/svg+xml", "x.SVG");
407
408 // Should *.svgz be svg or gzip
409 assertType("application/x-gzip", "testSVG.svgz");
410 assertTypeByData("application/x-gzip", "testSVG.svgz");
411 assertTypeByName("image/svg+xml", "x.svgz");
412 assertTypeByName("image/svg+xml", "x.SVGZ");
413 }
414
415 @Test
416 public void testPdfDetection() throws Exception {
417 assertType("application/pdf", "testPDF.pdf");
418 assertTypeByData("application/pdf", "testPDF.pdf");
419 assertTypeByName("application/pdf", "x.pdf");
420 assertTypeByName("application/pdf", "x.PDF");
421 }
422
423 @Test
424 public void testSwfDetection() throws Exception {
425 assertTypeByName("application/x-shockwave-flash", "x.swf");
426 assertTypeByName("application/x-shockwave-flash", "x.SWF");
427 assertTypeByName("application/x-shockwave-flash", "test1.swf");
428 assertTypeByName("application/x-shockwave-flash", "test2.swf");
429 assertTypeByName("application/x-shockwave-flash", "test3.swf");
430 }
431
432 @Test
433 public void testDwgDetection() throws Exception {
434 assertTypeByName("image/vnd.dwg", "x.dwg");
435 assertTypeByData("image/vnd.dwg", "testDWG2004.dwg");
436 assertTypeByData("image/vnd.dwg", "testDWG2007.dwg");
437 assertTypeByData("image/vnd.dwg", "testDWG2010.dwg");
438 }
439
440 @Test
441 public void testprtDetection() throws Exception {
442 assertTypeByName("application/x-prt", "x.prt");
443 assertTypeByData("application/x-prt", "testCADKEY.prt");
444 }
445
446 /**
447 * Formats which are based on plain text
448 */
449 @Test
450 public void testTextBasedFormatsDetection() throws Exception {
451 assertTypeByName("text/plain", "testTXT.txt");
452 assertType( "text/plain", "testTXT.txt");
453
454 assertTypeByName("text/css", "testCSS.css");
455 assertType( "text/css", "testCSS.css");
456
457 assertTypeByName("text/html", "testHTML.html");
458 assertType( "text/html", "testHTML.html");
459
460 assertTypeByName("application/javascript", "testJS.js");
461 assertType( "application/javascript", "testJS.js");
462 }
463
464 @Test
465 public void testJavaDetection() throws Exception {
466 // TODO Classloader doesn't seem to find the .class file in test-documents
467 //assertTypeDetection("AutoDetectParser.class", "application/java-vm");
468
469 // OSX Native Extension
470 assertTypeDetection("testJNILIB.jnilib", "application/x-java-jnilib");
471 }
472
473 @Test
474 public void testWmfDetection() throws Exception {
475 assertTypeByName("application/x-msmetafile", "x.wmf");
476 assertTypeByData("application/x-msmetafile", "testWMF.wmf");
477 assertTypeByName("application/x-msmetafile", "x.WMF");
478
479 assertTypeByName("application/x-emf", "x.emf");
480 assertTypeByData("application/x-emf","testEMF.emf");
481 assertTypeByName("application/x-emf", "x.EMF");
482 // TODO: Need a test wmz file
483 assertTypeByName("application/x-ms-wmz", "x.wmz");
484 assertTypeByName("application/x-ms-wmz", "x.WMZ");
485 // TODO: Need a test emz file
486 assertTypeByName("application/x-gzip", "x.emz");
487 assertTypeByName("application/x-gzip", "x.EMZ");
488 }
489
490 @Test
491 public void testPsDetection() throws Exception {
492 // TODO: Need a test postscript file
493 assertTypeByName("application/postscript", "x.ps");
494 assertTypeByName("application/postscript", "x.PS");
495 assertTypeByName("application/postscript", "x.eps");
496 assertTypeByName("application/postscript", "x.epsf");
497 assertTypeByName("application/postscript", "x.epsi");
498 }
499
500 @Test
501 public void testMicrosoftMultiMediaDetection() throws Exception {
502 assertTypeByName("video/x-ms-asf", "x.asf");
503 assertTypeByName("video/x-ms-wmv", "x.wmv");
504 assertTypeByName("audio/x-ms-wma", "x.wma");
505
506 assertTypeByData("video/x-ms-asf", "testASF.asf");
507 assertTypeByData("video/x-ms-wmv", "testWMV.wmv");
508 assertTypeByData("audio/x-ms-wma", "testWMA.wma");
509 }
510
511 /**
512 * All 3 DITA types are in theory handled by the same mimetype,
513 * but we specialise them
514 */
515 @Test
516 public void testDITADetection() throws Exception {
517 assertTypeByName("application/dita+xml; format=topic", "test.dita");
518 assertTypeByName("application/dita+xml; format=map", "test.ditamap");
519 assertTypeByName("application/dita+xml; format=val", "test.ditaval");
520
521 assertTypeByData("application/dita+xml; format=task", "testDITA.dita");
522 assertTypeByData("application/dita+xml; format=concept", "testDITA2.dita");
523 assertTypeByData("application/dita+xml; format=map", "testDITA.ditamap");
524
525 assertTypeByNameAndData("application/dita+xml; format=task", "testDITA.dita");
526 assertTypeByNameAndData("application/dita+xml; format=concept", "testDITA2.dita");
527 assertTypeByNameAndData("application/dita+xml; format=map", "testDITA.ditamap");
528
529 // These are all children of the official type
530 assertEquals("application/dita+xml",
531 repo.getMediaTypeRegistry().getSupertype(getTypeByNameAndData("testDITA.ditamap")).toString());
532 assertEquals("application/dita+xml",
533 repo.getMediaTypeRegistry().getSupertype(getTypeByNameAndData("testDITA.dita")).toString());
534 assertEquals("application/dita+xml",
535 repo.getMediaTypeRegistry().getSupertype(getTypeByNameAndData("testDITA2.dita")).toString());
536 }
537
538 /**
539 * @since TIKA-194
540 */
541 @Test
542 public void testJavaRegex() throws Exception{
543 MimeType testType = new MimeType(MediaType.parse("foo/bar"));
544 this.repo.add(testType);
545 assertNotNull(repo.forName("foo/bar"));
546 String pattern = "rtg_sst_grb_0\\.5\\.\\d{8}";
547 this.repo.addPattern(testType, pattern, true);
548 String testFileName = "rtg_sst_grb_0.5.12345678";
549 assertEquals("foo/bar", tika.detect(testFileName));
550
551 MimeType testType2 = new MimeType(MediaType.parse("foo/bar2"));
552 this.repo.add(testType2);
553 assertNotNull(repo.forName("foo/bar2"));
554 this.repo.addPattern(testType2, pattern, false);
555 assertNotSame("foo/bar2", tika.detect(testFileName));
556 }
557
558 @Test
559 public void testRawDetection() throws Exception {
560 assertTypeByName("image/x-raw-adobe", "x.dng");
561 assertTypeByName("image/x-raw-adobe", "x.DNG");
562 assertTypeByName("image/x-raw-hasselblad", "x.3fr");
563 assertTypeByName("image/x-raw-fuji", "x.raf");
564 assertTypeByName("image/x-raw-canon", "x.crw");
565 assertTypeByName("image/x-raw-canon", "x.cr2");
566 assertTypeByName("image/x-raw-kodak", "x.k25");
567 assertTypeByName("image/x-raw-kodak", "x.kdc");
568 assertTypeByName("image/x-raw-kodak", "x.dcs");
569 assertTypeByName("image/x-raw-kodak", "x.drf");
570 assertTypeByName("image/x-raw-minolta", "x.mrw");
571 assertTypeByName("image/x-raw-nikon", "x.nef");
572 assertTypeByName("image/x-raw-nikon", "x.nrw");
573 assertTypeByName("image/x-raw-olympus", "x.orf");
574 assertTypeByName("image/x-raw-pentax", "x.ptx");
575 assertTypeByName("image/x-raw-pentax", "x.pef");
576 assertTypeByName("image/x-raw-sony", "x.arw");
577 assertTypeByName("image/x-raw-sony", "x.srf");
578 assertTypeByName("image/x-raw-sony", "x.sr2");
579 assertTypeByName("image/x-raw-sigma", "x.x3f");
580 assertTypeByName("image/x-raw-epson", "x.erf");
581 assertTypeByName("image/x-raw-mamiya", "x.mef");
582 assertTypeByName("image/x-raw-leaf", "x.mos");
583 assertTypeByName("image/x-raw-panasonic", "x.raw");
584 assertTypeByName("image/x-raw-panasonic", "x.rw2");
585 assertTypeByName("image/x-raw-phaseone", "x.iiq");
586 assertTypeByName("image/x-raw-red", "x.r3d");
587 assertTypeByName("image/x-raw-imacon", "x.fff");
588 assertTypeByName("image/x-raw-logitech", "x.pxn");
589 assertTypeByName("image/x-raw-casio", "x.bay");
590 assertTypeByName("image/x-raw-rawzor", "x.rwz");
591 }
592
593 /**
594 * Tests that we correctly detect the font types
595 */
596 @Test
597 public void testFontDetection() throws Exception {
598 assertTypeByName("application/x-font-adobe-metric", "x.afm");
599 assertTypeByData("application/x-font-adobe-metric", "testAFM.afm");
600
601 assertTypeByName("application/x-font-printer-metric", "x.pfm");
602 // TODO Get a sample .pfm file
603 assertTypeByData(
604 "application/x-font-printer-metric",
605 new byte[] {0x00, 0x01, 256-0xb1, 0x0a, 0x00, 0x00, 0x43, 0x6f,
606 0x70, 0x79, 0x72, 0x69, 0x67, 0x68, 0x74, 0x20}
607 );
608
609 assertTypeByName("application/x-font-type1", "x.pfa");
610 // TODO Get a sample .pfa file
611 assertTypeByData(
612 "application/x-font-type1",
613 new byte[] {0x25, 0x21, 0x50, 0x53, 0x2d, 0x41, 0x64, 0x6f,
614 0x62, 0x65, 0x46, 0x6f, 0x6e, 0x74, 0x2d, 0x31,
615 0x2e, 0x30, 0x20, 0x20, 0x2d, 0x2a, 0x2d, 0x20}
616 );
617
618 assertTypeByName("application/x-font-type1", "x.pfb");
619 // TODO Get a sample .pfm file
620 assertTypeByData(
621 "application/x-font-type1",
622 new byte[] {-0x80, 0x01, 0x09, 0x05, 0x00, 0x00, 0x25, 0x21,
623 0x50, 0x53, 0x2d, 0x41, 0x64, 0x6f, 0x62, 0x65,
624 0x46, 0x6f, 0x6e, 0x74, 0x2d, 0x31, 0x2e, 0x30 }
625 );
626 }
627
628 /**
629 * Tests MimeTypes.getMimeType(URL), which examines both the byte header
630 * and, if necessary, the URL's extension.
631 */
632 @Test
633 public void testMimeDeterminationForTestDocuments() throws Exception {
634 assertType("text/html", "testHTML.html");
635 assertType("application/zip", "test-documents.zip");
636
637 assertType("text/html", "testHTML_utf8.html");
638 assertType(
639 "application/vnd.oasis.opendocument.text",
640 "testOpenOffice2.odt");
641 assertType("application/pdf", "testPDF.pdf");
642 assertType("application/rtf", "testRTF.rtf");
643 assertType("text/plain", "testTXT.txt");
644 assertType("application/xml", "testXML.xml");
645 assertType("audio/basic", "testAU.au");
646 assertType("audio/x-aiff", "testAIFF.aif");
647 assertType("audio/x-wav", "testWAV.wav");
648 assertType("audio/midi", "testMID.mid");
649 assertType("application/x-msaccess", "testACCESS.mdb");
650 assertType("application/x-font-ttf", "testTrueType.ttf");
651 }
652
653 @Test
654 public void test7ZipDetection() throws Exception {
655 assertTypeByName("application/x-7z-compressed","test-documents.7z");
656 assertTypeByData("application/x-7z-compressed","test-documents.7z");
657 assertTypeByNameAndData("application/x-7z-compressed", "test-documents.7z");
658 }
659
660 @Test
661 public void testWebArchiveDetection() throws Exception {
662 assertTypeByName("application/x-webarchive","x.webarchive");
663 assertTypeByData("application/x-bplist","testWEBARCHIVE.webarchive");
664 assertTypeByNameAndData("application/x-webarchive", "testWEBARCHIVE.webarchive");
665 }
666
667 /**
668 * KML, and KMZ (zipped KML)
669 */
670 @Test
671 public void testKMLZDetection() throws Exception {
672 assertTypeByName("application/vnd.google-earth.kml+xml","testKML.kml");
673 assertTypeByData("application/vnd.google-earth.kml+xml","testKML.kml");
674 assertTypeByNameAndData("application/vnd.google-earth.kml+xml", "testKML.kml");
675
676 assertTypeByName("application/vnd.google-earth.kmz","testKMZ.kmz");
677 assertTypeByNameAndData("application/vnd.google-earth.kmz", "testKMZ.kmz");
678
679 // By data only, mimetype magic only gets us to a .zip
680 // We need to use the Zip Aware detector to get the full type
681 assertTypeByData("application/zip","testKMZ.kmz");
682 }
683
684 @Test
685 public void testCreativeSuite() throws IOException {
686 assertTypeDetection("testINDD.indd", "application/x-adobe-indesign");
687 assertTypeDetection("testPSD.psd", "image/vnd.adobe.photoshop");
688 }
689
690 @Test
691 public void testAMR() throws IOException {
692 // AMR matches on name, data or both
693 assertTypeDetection("testAMR.amr", "audio/amr");
694
695 // AMR-WB subtype shares extension, so needs data to identify
696 assertTypeDetection("testAMR-WB.amr", "audio/amr", "audio/amr-wb", "audio/amr-wb");
697
698 // Ditto for the AMR-WB+ subtype, which we don't have a sample file of yet
699 //assertTypeDetection("testAMR-WB+.amr", "audio/amr", "audio/amr-wb+", "audio/amr-wb+");
700 }
701
702 @Test
703 public void testEmlx() throws IOException {
704 assertTypeDetection("testEMLX.emlx", "message/x-emlx");
705 }
706
707 @Test
708 public void testGroupWiseEml() throws Exception {
709 assertTypeDetection("testGroupWiseEml.eml", "message/rfc822");
710 }
711
712 @Test
713 public void testMatroskaDetection() throws Exception {
714 assertType("video/x-matroska", "testMKV.mkv");
715 // TODO: Need custom detector data detection, see TIKA-1180
716 assertTypeByData("application/x-matroska", "testMKV.mkv");
717 assertTypeByNameAndData("video/x-matroska", "testMKV.mkv");
718 assertTypeByName("video/x-matroska", "x.mkv");
719 assertTypeByName("video/x-matroska", "x.MKV");
720 assertTypeByName("audio/x-matroska", "x.mka");
721 assertTypeByName("audio/x-matroska", "x.MKA");
722 }
723
724 @Test
725 public void testWebMDetection() throws Exception {
726 assertType("video/webm", "testWEBM.webm");
727 // TODO: Need custom detector data detection, see TIKA-1180
728 assertTypeByData("application/x-matroska", "testWEBM.webm");
729 assertTypeByNameAndData("video/webm", "testWEBM.webm");
730 assertTypeByName("video/webm", "x.webm");
731 assertTypeByName("video/webm", "x.WEBM");
732 }
733
734 /** Test getMimeType(byte[]) */
735 @Test
736 public void testGetMimeType_byteArray() throws IOException {
737 // Plain text detection
738 assertText(new byte[] { (byte) 0xFF, (byte) 0xFE });
739 assertText(new byte[] { (byte) 0xFF, (byte) 0xFE });
740 assertText(new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF });
741 assertText(new byte[] { 'a', 'b', 'c' });
742 assertText(new byte[] { '\t', '\r', '\n', 0x0C, 0x1B });
743 assertNotText(new byte[] { '\t', '\r', '\n', 0x0E, 0x1C });
744 }
745
746 private void assertText(byte[] prefix) throws IOException {
747 assertMagic("text/plain", prefix);
748 }
749
750 private void assertNotText(byte[] prefix) throws IOException {
751 assertMagic("application/octet-stream", prefix);
752 }
753
754 private void assertMagic(String expected, byte[] prefix) throws IOException {
755 MediaType type =
756 repo.detect(new ByteArrayInputStream(prefix), new Metadata());
757 assertNotNull(type);
758 assertEquals(expected, type.toString());
759 }
760
761 private void assertType(String expected, String filename) throws Exception {
762 InputStream stream = TestMimeTypes.class.getResourceAsStream(
763 "/test-documents/" + filename);
764 try {
765 Metadata metadata = new Metadata();
766 metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
767 assertEquals(expected, repo.detect(stream, metadata).toString());
768 } finally {
769 stream.close();
770 }
771 }
772
773 private void assertTypeByName(String expected, String filename)
774 throws IOException {
775 Metadata metadata = new Metadata();
776 metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
777 assertEquals(expected, repo.detect(null, metadata).toString());
778 }
779
780 private void assertTypeByData(String expected, String filename)
781 throws IOException {
782 InputStream stream = TestMimeTypes.class.getResourceAsStream(
783 "/test-documents/" + filename);
784 assertNotNull("Test file not found: " + filename, stream);
785 try {
786 Metadata metadata = new Metadata();
787 assertEquals(expected, repo.detect(stream, metadata).toString());
788 } finally {
789 stream.close();
790 }
791 }
792
793 private void assertTypeByData(String expected, byte[] data)
794 throws IOException {
795 InputStream stream = new ByteArrayInputStream(data);
796 try {
797 Metadata metadata = new Metadata();
798 assertEquals(expected, repo.detect(stream, metadata).toString());
799 } finally {
800 stream.close();
801 }
802 }
803
804 private void assertTypeDetection(String filename, String type)
805 throws IOException {
806 assertTypeDetection(filename, type, type, type);
807 }
808
809 private void assertTypeDetection(String filename, String byName, String byData,
810 String byNameAndData) throws IOException {
811 assertTypeByName(byName, filename);
812 assertTypeByData(byData, filename);
813 assertTypeByNameAndData(byNameAndData, filename);
814 }
815
816 private void assertTypeByNameAndData(String expected, String filename)
817 throws IOException {
818 assertEquals(expected, getTypeByNameAndData(filename).toString());
819 }
820
821 private MediaType getTypeByNameAndData(String filename) throws IOException {
822 InputStream stream = TestMimeTypes.class.getResourceAsStream(
823 "/test-documents/" + filename);
824 assertNotNull("Test document not found: " + filename, stream);
825 try {
826 Metadata metadata = new Metadata();
827 metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
828 return repo.detect(stream, metadata);
829 } finally {
830 stream.close();
831 }
832 }
833 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertNotNull;
20 import static org.junit.Assert.assertTrue;
21 import static org.junit.Assert.fail;
22
23 import java.io.ByteArrayInputStream;
24 import java.io.IOException;
25 import java.io.InputStream;
26 import java.util.HashSet;
27 import java.util.Set;
28
29 import org.apache.tika.config.TikaConfig;
30 import org.apache.tika.detect.Detector;
31 import org.apache.tika.exception.TikaException;
32 import org.apache.tika.metadata.Metadata;
33 import org.apache.tika.metadata.XMPDM;
34 import org.apache.tika.mime.MediaType;
35 import org.apache.tika.sax.BodyContentHandler;
36 import org.junit.Test;
37 import org.xml.sax.ContentHandler;
38
39 public class AutoDetectParserTest {
40 private TikaConfig tika = TikaConfig.getDefaultConfig();
41
42 // Easy to read constants for the MIME types:
43 private static final String RAW = "application/octet-stream";
44 private static final String EXCEL = "application/vnd.ms-excel";
45 private static final String HTML = "text/html; charset=ISO-8859-1";
46 private static final String PDF = "application/pdf";
47 private static final String POWERPOINT = "application/vnd.ms-powerpoint";
48 private static final String KEYNOTE = "application/vnd.apple.keynote";
49 private static final String PAGES = "application/vnd.apple.pages";
50 private static final String NUMBERS = "application/vnd.apple.numbers";
51 private static final String CHM = "application/vnd.ms-htmlhelp";
52 private static final String RTF = "application/rtf";
53 private static final String PLAINTEXT = "text/plain; charset=ISO-8859-1";
54 private static final String UTF8TEXT = "text/plain; charset=UTF-8";
55 private static final String WORD = "application/msword";
56 private static final String XML = "application/xml";
57 private static final String RSS = "application/rss+xml";
58 private static final String BMP = "image/x-ms-bmp";
59 private static final String GIF = "image/gif";
60 private static final String JPEG = "image/jpeg";
61 private static final String PNG = "image/png";
62 private static final String OGG_VORBIS = "audio/vorbis";
63 private static final String OGG_FLAC = "audio/x-flac";
64 private static final String FLAC_NATIVE= "audio/x-flac";
65 private static final String OPENOFFICE
66 = "application/vnd.oasis.opendocument.text";
67
68
69 /**
70 * This is where a single test is done.
71 * @param tp the parameters encapsulated in a TestParams instance
72 * @throws IOException
73 */
74 private void assertAutoDetect(TestParams tp) throws Exception {
75
76 InputStream input =
77 AutoDetectParserTest.class.getResourceAsStream(tp.resourceRealName);
78
79 if (input == null) {
80 fail("Could not open stream from specified resource: "
81 + tp.resourceRealName);
82 }
83
84 try {
85 Metadata metadata = new Metadata();
86 metadata.set(Metadata.RESOURCE_NAME_KEY, tp.resourceStatedName);
87 metadata.set(Metadata.CONTENT_TYPE, tp.statedType);
88 ContentHandler handler = new BodyContentHandler();
89 new AutoDetectParser(tika).parse(input, handler, metadata);
90
91 assertEquals("Bad content type: " + tp,
92 tp.realType, metadata.get(Metadata.CONTENT_TYPE));
93
94 if (tp.expectedContentFragment != null) {
95 assertTrue("Expected content not found: " + tp,
96 handler.toString().contains(tp.expectedContentFragment));
97 }
98 } finally {
99 input.close();
100 }
101 }
102
103 /**
104 * Convenience method -- its sole purpose of existence is to make the
105 * call to it more readable than it would be if a TestParams instance
106 * would need to be instantiated there.
107 *
108 * @param resourceRealName real name of resource
109 * @param resourceStatedName stated name -- will a bad name fool us?
110 * @param realType - the real MIME type
111 * @param statedType - stated MIME type - will a wrong one fool us?
112 * @param expectedContentFragment - something expected in the text
113 * @throws Exception
114 */
115 private void assertAutoDetect(String resourceRealName,
116 String resourceStatedName,
117 String realType,
118 String statedType,
119 String expectedContentFragment)
120 throws Exception {
121
122 assertAutoDetect(new TestParams(resourceRealName, resourceStatedName,
123 realType, statedType, expectedContentFragment));
124 }
125
126 private void assertAutoDetect(
127 String resource, String type, String content) throws Exception {
128
129 resource = "/test-documents/" + resource;
130
131 // TODO !!!! The disabled tests below should work!
132 // The correct MIME type should be determined regardless of the
133 // stated type (ContentType hint) and the stated URL name.
134
135
136 // Try different combinations of correct and incorrect arguments:
137 final String wrongMimeType = RAW;
138 assertAutoDetect(resource, resource, type, type, content);
139 assertAutoDetect(resource, resource, type, null, content);
140 assertAutoDetect(resource, resource, type, wrongMimeType, content);
141
142 assertAutoDetect(resource, null, type, type, content);
143 assertAutoDetect(resource, null, type, null, content);
144 assertAutoDetect(resource, null, type, wrongMimeType, content);
145
146 final String badResource = "a.xyz";
147 assertAutoDetect(resource, badResource, type, type, content);
148 assertAutoDetect(resource, badResource, type, null, content);
149 assertAutoDetect(resource, badResource, type, wrongMimeType, content);
150 }
151
152 @Test
153 public void testKeynote() throws Exception {
154 assertAutoDetect("testKeynote.key", KEYNOTE, "A sample presentation");
155 }
156
157 @Test
158 public void testPages() throws Exception {
159 assertAutoDetect("testPages.pages", PAGES, "Sample pages document");
160 }
161
162 @Test
163 public void testNumbers() throws Exception {
164 assertAutoDetect("testNumbers.numbers", NUMBERS, "Checking Account: 300545668");
165 }
166
167 @Test
168 public void testChm() throws Exception {
169 assertAutoDetect("testChm.chm", CHM, "If you do not specify a window type or a window name, the main window is used.");
170 }
171
172 @Test
173 public void testEpub() throws Exception {
174 assertAutoDetect(
175 "testEPUB.epub", "application/epub+zip",
176 "The previous headings were subchapters");
177 }
178
179 @Test
180 public void testExcel() throws Exception {
181 assertAutoDetect("testEXCEL.xls", EXCEL, "Sample Excel Worksheet");
182 }
183
184 @Test
185 public void testHTML() throws Exception {
186 assertAutoDetect("testHTML.html", HTML, "Test Indexation Html");
187 }
188
189 @Test
190 public void testOpenOffice() throws Exception {
191 assertAutoDetect("testOpenOffice2.odt", OPENOFFICE,
192 "This is a sample Open Office document");
193 }
194
195 @Test
196 public void testPDF() throws Exception {
197 assertAutoDetect("testPDF.pdf", PDF, "Content Analysis Toolkit");
198
199 }
200
201 @Test
202 public void testPowerpoint() throws Exception {
203 assertAutoDetect("testPPT.ppt", POWERPOINT, "Sample Powerpoint Slide");
204 }
205
206 @Test
207 public void testRdfXml() throws Exception {
208 assertAutoDetect("testRDF.rdf", "application/rdf+xml", "");
209 }
210
211 @Test
212 public void testRTF() throws Exception {
213 assertAutoDetect("testRTF.rtf", RTF, "indexation Word");
214 }
215
216 @Test
217 public void testText() throws Exception {
218 assertAutoDetect("testTXT.txt", PLAINTEXT, "indexation de Txt");
219 }
220
221 @Test
222 public void testTextNonASCIIUTF8() throws Exception {
223 assertAutoDetect("testTXTNonASCIIUTF8.txt", UTF8TEXT, "The quick brown fox jumps over the lazy dog");
224 }
225
226 @Test
227 public void testWord() throws Exception {
228 assertAutoDetect("testWORD.doc", WORD, "Sample Word Document");
229 }
230
231 @Test
232 public void testXML() throws Exception {
233 assertAutoDetect("testXML.xml", XML, "Lius");
234 }
235
236 @Test
237 public void testRss() throws Exception {
238 assertAutoDetect("/test-documents/rsstest.rss", "feed", RSS, "application/rss+xml", "Sample RSS File for Junit test");
239 }
240
241 @Test
242 public void testImages() throws Exception {
243 assertAutoDetect("testBMP.bmp", BMP, null);
244 assertAutoDetect("testGIF.gif", GIF, null);
245 assertAutoDetect("testJPEG.jpg", JPEG, null);
246 assertAutoDetect("testPNG.png", PNG, null);
247 }
248
249 /**
250 * Make sure that zip bomb attacks are prevented.
251 *
252 * @see <a href="https://issues.apache.org/jira/browse/TIKA-216">TIKA-216</a>
253 */
254 @Test
255 public void testZipBombPrevention() throws Exception {
256 InputStream tgz = AutoDetectParserTest.class.getResourceAsStream(
257 "/test-documents/TIKA-216.tgz");
258 try {
259 Metadata metadata = new Metadata();
260 ContentHandler handler = new BodyContentHandler(-1);
261 new AutoDetectParser(tika).parse(tgz, handler, metadata);
262 fail("Zip bomb was not detected");
263 } catch (TikaException e) {
264 // expected
265 } finally {
266 tgz.close();
267 }
268
269 }
270
271 /**
272 * Test to ensure that the Vorbis and FLAC parsers have been correctly
273 * included, and are available
274 */
275 @SuppressWarnings("deprecation")
276 @Test
277 public void testVorbisFlac() throws Exception {
278 // The three test files should all have similar test data
279 String[] testFiles = new String[] {
280 "testVORBIS.ogg", "testFLAC.oga", "testFLAC.flac"
281 };
282 String[] mimetypes = new String[] {
283 OGG_VORBIS, OGG_FLAC, FLAC_NATIVE
284 };
285
286 // Check we found the parser
287 CompositeParser parser = (CompositeParser)tika.getParser();
288 for (String type : mimetypes) {
289 MediaType mt = MediaType.parse(type);
290 assertNotNull("Parser not found for " + type, parser.getParsers().get(mt) );
291 }
292
293 // Have each file parsed, and check
294 for (int i=0; i<testFiles.length; i++) {
295 String file = testFiles[i];
296 InputStream input = AutoDetectParserTest.class.getResourceAsStream(
297 "/test-documents/"+file);
298
299 if (input == null) {
300 fail("Could not find test file " + file);
301 }
302
303 try {
304 Metadata metadata = new Metadata();
305 ContentHandler handler = new BodyContentHandler();
306 new AutoDetectParser(tika).parse(input, handler, metadata);
307
308 assertEquals("Incorrect content type for " + file,
309 mimetypes[i], metadata.get(Metadata.CONTENT_TYPE));
310
311 // Check some of the common metadata
312 assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
313 assertEquals("Test Title", metadata.get(Metadata.TITLE));
314 // assertEquals("Test Artist", metadata.get(TikaCoreProperties.AUTHOR));
315 // assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
316
317 // Check some of the XMPDM metadata
318 assertEquals("Test Album", metadata.get(XMPDM.ALBUM));
319 assertEquals("Test Artist", metadata.get(XMPDM.ARTIST));
320 assertEquals("Stereo", metadata.get(XMPDM.AUDIO_CHANNEL_TYPE));
321 assertEquals("44100", metadata.get(XMPDM.AUDIO_SAMPLE_RATE));
322
323 // Check some of the text
324 String content = handler.toString();
325 assertTrue(content.contains("Test Title"));
326 assertTrue(content.contains("Test Artist"));
327 } finally {
328 input.close();
329 }
330 }
331 }
332
333 /**
334 * Test case for TIKA-514. Provide constructor for AutoDetectParser that has explicit
335 * list of supported parsers.
336 * @see <a href="https://issues.apache.org/jira/browse/TIKA-514">TIKA-514</a>
337 */
338 @Test
339 public void testSpecificParserList() throws Exception {
340 AutoDetectParser parser = new AutoDetectParser(new MyDetector(), new MyParser());
341
342 InputStream is = new ByteArrayInputStream("test".getBytes());
343 Metadata metadata = new Metadata();
344 parser.parse(is, new BodyContentHandler(), metadata, new ParseContext());
345
346 assertEquals("value", metadata.get("MyParser"));
347 }
348
349 private static final MediaType MY_MEDIA_TYPE = new MediaType("application", "x-myparser");
350
351 /**
352 * A test detector which always returns the type supported
353 * by the test parser
354 */
355 @SuppressWarnings("serial")
356 private static class MyDetector implements Detector {
357 public MediaType detect(InputStream input, Metadata metadata) throws IOException {
358 return MY_MEDIA_TYPE;
359 }
360 }
361
362 @SuppressWarnings("serial")
363 private static class MyParser extends AbstractParser {
364 public Set<MediaType> getSupportedTypes(ParseContext context) {
365 Set<MediaType> supportedTypes = new HashSet<MediaType>();
366 supportedTypes.add(MY_MEDIA_TYPE);
367 return supportedTypes;
368 }
369
370 public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) {
371 metadata.add("MyParser", "value");
372 }
373
374 }
375
376 /**
377 * Minimal class to encapsulate all parameters -- the main reason for
378 * its existence is to aid in debugging via its toString() method.
379 *
380 * Getters and setters intentionally not provided.
381 */
382 private static class TestParams {
383
384 public String resourceRealName;
385 public String resourceStatedName;
386 public String realType;
387 public String statedType;
388 public String expectedContentFragment;
389
390
391 private TestParams(String resourceRealName,
392 String resourceStatedName,
393 String realType,
394 String statedType,
395 String expectedContentFragment) {
396 this.resourceRealName = resourceRealName;
397 this.resourceStatedName = resourceStatedName;
398 this.realType = realType;
399 this.statedType = statedType;
400 this.expectedContentFragment = expectedContentFragment;
401 }
402
403
404 /**
405 * Produces a string like the following:
406 *
407 * <pre>
408 * Test parameters:
409 * resourceRealName = /test-documents/testEXCEL.xls
410 * resourceStatedName = null
411 * realType = application/vnd.ms-excel
412 * statedType = null
413 * expectedContentFragment = Sample Excel Worksheet
414 * </pre>
415 */
416 public String toString() {
417 return "Test parameters:\n"
418 + " resourceRealName = " + resourceRealName + "\n"
419 + " resourceStatedName = " + resourceStatedName + "\n"
420 + " realType = " + realType + "\n"
421 + " statedType = " + statedType + "\n"
422 + " expectedContentFragment = " + expectedContentFragment + "\n";
423 }
424 }
425 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser;
17
18 import static org.junit.Assert.assertEquals;
19
20 import java.io.ByteArrayInputStream;
21 import java.io.InputStream;
22 import java.io.Reader;
23
24 import org.apache.tika.metadata.Metadata;
25 import org.apache.tika.metadata.TikaCoreProperties;
26 import org.junit.Test;
27
28 public class ParsingReaderTest {
29
30 @Test
31 public void testPlainText() throws Exception {
32 String data = "test content";
33 InputStream stream = new ByteArrayInputStream(data.getBytes("UTF-8"));
34 Reader reader = new ParsingReader(stream, "test.txt");
35 assertEquals('t', reader.read());
36 assertEquals('e', reader.read());
37 assertEquals('s', reader.read());
38 assertEquals('t', reader.read());
39 assertEquals(' ', reader.read());
40 assertEquals('c', reader.read());
41 assertEquals('o', reader.read());
42 assertEquals('n', reader.read());
43 assertEquals('t', reader.read());
44 assertEquals('e', reader.read());
45 assertEquals('n', reader.read());
46 assertEquals('t', reader.read());
47 assertEquals('\n', reader.read());
48 assertEquals(-1, reader.read());
49 reader.close();
50 assertEquals(-1, stream.read());
51 }
52
53 @Test
54 public void testXML() throws Exception {
55 String data = "<p>test <span>content</span></p>";
56 InputStream stream = new ByteArrayInputStream(data.getBytes("UTF-8"));
57 Reader reader = new ParsingReader(stream, "test.xml");
58 assertEquals(' ', (char) reader.read());
59 assertEquals('t', (char) reader.read());
60 assertEquals('e', (char) reader.read());
61 assertEquals('s', (char) reader.read());
62 assertEquals('t', (char) reader.read());
63 assertEquals(' ', (char) reader.read());
64 assertEquals(' ', (char) reader.read());
65 assertEquals('c', (char) reader.read());
66 assertEquals('o', (char) reader.read());
67 assertEquals('n', (char) reader.read());
68 assertEquals('t', (char) reader.read());
69 assertEquals('e', (char) reader.read());
70 assertEquals('n', (char) reader.read());
71 assertEquals('t', (char) reader.read());
72 assertEquals('\n', (char) reader.read());
73 assertEquals(-1, reader.read());
74 reader.close();
75 assertEquals(-1, stream.read());
76 }
77
78 /**
79 * Test case for TIKA-203
80 *
81 * @see <a href="https://issues.apache.org/jira/browse/TIKA-203">TIKA-203</a>
82 */
83 @Test
84 public void testMetadata() throws Exception {
85 Metadata metadata = new Metadata();
86 InputStream stream = ParsingReaderTest.class.getResourceAsStream(
87 "/test-documents/testEXCEL.xls");
88 Reader reader = new ParsingReader(
89 new AutoDetectParser(), stream, metadata, new ParseContext());
90 try {
91 // Metadata should already be available
92 assertEquals("Simple Excel document", metadata.get(TikaCoreProperties.TITLE));
93 // Check that the internal buffering isn't broken
94 assertEquals('F', (char) reader.read());
95 assertEquals('e', (char) reader.read());
96 assertEquals('u', (char) reader.read());
97 assertEquals('i', (char) reader.read());
98 assertEquals('l', (char) reader.read());
99 assertEquals('1', (char) reader.read());
100 } finally {
101 reader.close();
102 }
103 }
104
105 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.asm;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertTrue;
20 import org.apache.tika.Tika;
21 import org.apache.tika.metadata.Metadata;
22 import org.apache.tika.metadata.TikaCoreProperties;
23 import org.junit.Test;
24
25 /**
26 * Test case for parsing Java class files.
27 */
28 public class ClassParserTest {
29
30 @Test
31 public void testClassParsing() throws Exception {
32 String path = "/test-documents/AutoDetectParser.class";
33 Metadata metadata = new Metadata();
34 String content = new Tika().parseToString(
35 ClassParserTest.class.getResourceAsStream(path), metadata);
36
37 assertEquals("AutoDetectParser", metadata.get(TikaCoreProperties.TITLE));
38 assertEquals(
39 "AutoDetectParser.class",
40 metadata.get(Metadata.RESOURCE_NAME_KEY));
41
42 assertTrue(content.contains("package org.apache.tika.parser;"));
43 assertTrue(content.contains(
44 "class AutoDetectParser extends CompositeParser"));
45 assertTrue(content.contains(
46 "private org.apache.tika.mime.MimeTypes types"));
47 assertTrue(content.contains(
48 "public void parse("
49 + "java.io.InputStream, org.xml.sax.ContentHandler,"
50 + " org.apache.tika.metadata.Metadata) throws"
51 + " java.io.IOException, org.xml.sax.SAXException,"
52 + " org.apache.tika.exception.TikaException;"));
53 assertTrue(content.contains(
54 "private byte[] getPrefix(java.io.InputStream, int)"
55 + " throws java.io.IOException;"));
56 }
57
58 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.audio;
17
18 import static org.junit.Assert.assertEquals;
19
20 import org.apache.tika.Tika;
21 import org.apache.tika.metadata.Metadata;
22 import org.junit.Test;
23
24 public class AudioParserTest {
25
26 @Test
27 public void testWAV() throws Exception {
28 String path = "/test-documents/testWAV.wav";
29 Metadata metadata = new Metadata();
30 String content = new Tika().parseToString(
31 AudioParserTest.class.getResourceAsStream(path), metadata);
32
33 assertEquals("audio/x-wav", metadata.get(Metadata.CONTENT_TYPE));
34 assertEquals("44100.0", metadata.get("samplerate"));
35 assertEquals("2", metadata.get("channels"));
36 assertEquals("16", metadata.get("bits"));
37 assertEquals("PCM_SIGNED", metadata.get("encoding"));
38
39 assertEquals("", content);
40 }
41
42 @Test
43 public void testAIFF() throws Exception {
44 String path = "/test-documents/testAIFF.aif";
45 Metadata metadata = new Metadata();
46 String content = new Tika().parseToString(
47 AudioParserTest.class.getResourceAsStream(path), metadata);
48
49 assertEquals("audio/x-aiff", metadata.get(Metadata.CONTENT_TYPE));
50 assertEquals("44100.0", metadata.get("samplerate"));
51 assertEquals("2", metadata.get("channels"));
52 assertEquals("16", metadata.get("bits"));
53 assertEquals("PCM_SIGNED", metadata.get("encoding"));
54
55 assertEquals("", content);
56 }
57
58 @Test
59 public void testAU() throws Exception {
60 String path = "/test-documents/testAU.au";
61 Metadata metadata = new Metadata();
62 String content = new Tika().parseToString(
63 AudioParserTest.class.getResourceAsStream(path), metadata);
64
65 assertEquals("audio/basic", metadata.get(Metadata.CONTENT_TYPE));
66 assertEquals("44100.0", metadata.get("samplerate"));
67 assertEquals("2", metadata.get("channels"));
68 assertEquals("16", metadata.get("bits"));
69 assertEquals("PCM_SIGNED", metadata.get("encoding"));
70
71 assertEquals("", content);
72 }
73
74 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.audio;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertTrue;
20
21 import org.apache.tika.Tika;
22 import org.apache.tika.metadata.Metadata;
23 import org.junit.Test;
24
25 public class MidiParserTest {
26
27 @Test
28 public void testMID() throws Exception {
29 String path = "/test-documents/testMID.mid";
30 Metadata metadata = new Metadata();
31 String content = new Tika().parseToString(
32 MidiParserTest.class.getResourceAsStream(path), metadata);
33
34 assertEquals("audio/midi", metadata.get(Metadata.CONTENT_TYPE));
35 assertEquals("2", metadata.get("tracks"));
36 assertEquals("0", metadata.get("patches"));
37 assertEquals("PPQ", metadata.get("divisionType"));
38
39 assertTrue(content.contains("Untitled"));
40 }
41 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.chm;
17
18 import static org.junit.Assert.assertTrue;
19
20 import java.util.Iterator;
21
22 import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
23 import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
24 import org.apache.tika.parser.chm.accessor.ChmItspHeader;
25 import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
26 import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
27 import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
28 import org.apache.tika.parser.chm.core.ChmCommons;
29 import org.apache.tika.parser.chm.core.ChmConstants;
30 import org.apache.tika.parser.chm.lzx.ChmBlockInfo;
31 import org.junit.After;
32 import org.junit.Before;
33 import org.junit.Test;
34
35 /**
36 * Tests major functionality of ChmBlockInfo
37 *
38 */
39 public class TestChmBlockInfo {
40 private byte[] data;
41 private ChmBlockInfo chmBlockInfo;
42 private ChmDirectoryListingSet chmDirListCont = null;
43 private ChmLzxcResetTable clrt = null;
44 private ChmLzxcControlData chmLzxcControlData = null;
45
46 @Before
47 public void setUp() throws Exception {
48 data = TestParameters.chmData;
49 /* Creates and parses itsf header */
50 ChmItsfHeader chmItsHeader = new ChmItsfHeader();
51 // chmItsHeader.parse(Arrays.copyOfRange(data, 0,
52 // ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsHeader);
53 chmItsHeader.parse(ChmCommons.copyOfRange(data, 0,
54 ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsHeader);
55 /* Creates and parses itsp block */
56 ChmItspHeader chmItspHeader = new ChmItspHeader();
57 // chmItspHeader.parse(Arrays.copyOfRange( data, (int)
58 // chmItsHeader.getDirOffset(),
59 // (int) chmItsHeader.getDirOffset()
60 // + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
61 chmItspHeader.parse(ChmCommons.copyOfRange(data,
62 (int) chmItsHeader.getDirOffset(),
63 (int) chmItsHeader.getDirOffset()
64 + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
65 /* Creating instance of ChmDirListingContainer */
66 chmDirListCont = new ChmDirectoryListingSet(data, chmItsHeader,
67 chmItspHeader);
68 int indexOfControlData = chmDirListCont.getControlDataIndex();
69
70 int indexOfResetTable = ChmCommons.indexOfResetTableBlock(data,
71 ChmConstants.LZXC.getBytes());
72 byte[] dir_chunk = null;
73 if (indexOfResetTable > 0) {
74 // dir_chunk = Arrays.copyOfRange( data, indexOfResetTable,
75 // indexOfResetTable
76 // +
77 // chmDirListCont.getDirectoryListingEntryList().get(indexOfControlData).getLength());
78 dir_chunk = ChmCommons.copyOfRange(data, indexOfResetTable,
79 indexOfResetTable
80 + chmDirListCont.getDirectoryListingEntryList()
81 .get(indexOfControlData).getLength());
82 }
83
84 /* Creates and parses control block */
85 chmLzxcControlData = new ChmLzxcControlData();
86 chmLzxcControlData.parse(dir_chunk, chmLzxcControlData);
87
88 int indexOfFeList = chmDirListCont.getResetTableIndex();
89 int startIndex = (int) chmDirListCont.getDataOffset()
90 + chmDirListCont.getDirectoryListingEntryList()
91 .get(indexOfFeList).getOffset();
92 // dir_chunk = Arrays.copyOfRange(data, startIndex , startIndex +
93 // chmDirListCont.getDirectoryListingEntryList().get(indexOfFeList).getLength());
94 dir_chunk = ChmCommons.copyOfRange(data, startIndex, startIndex
95 + chmDirListCont.getDirectoryListingEntryList().get(indexOfFeList).getLength());
96 clrt = new ChmLzxcResetTable();
97 clrt.parse(dir_chunk, clrt);
98 }
99
100 @Test
101 public void testToString() {
102 if (chmBlockInfo == null)
103 testGetChmBlockInfo();
104 assertTrue(chmBlockInfo.toString().length() > 0);
105 }
106
107 @Test
108 public void testGetChmBlockInfo() {
109 for (Iterator<DirectoryListingEntry> it = chmDirListCont
110 .getDirectoryListingEntryList().iterator(); it.hasNext();) {
111 DirectoryListingEntry directoryListingEntry = it.next();
112 chmBlockInfo = ChmBlockInfo.getChmBlockInfoInstance(
113 directoryListingEntry, (int) clrt.getBlockLen(),
114 chmLzxcControlData);
115 // Assert.assertTrue(!directoryListingEntry.getName().isEmpty() &&
116 // chmBlockInfo.toString() != null);
117 assertTrue(!ChmCommons.isEmpty(directoryListingEntry
118 .getName()) && chmBlockInfo.toString() != null);
119 }
120 }
121
122 @After
123 public void tearDown() throws Exception {
124 data = null;
125 chmBlockInfo = null;
126 }
127 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.chm;
17
18 import static org.junit.Assert.assertTrue;
19
20 import java.io.ByteArrayInputStream;
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.util.Arrays;
24 import java.util.List;
25 import java.util.concurrent.ExecutorService;
26 import java.util.concurrent.Executors;
27
28 import org.apache.tika.metadata.Metadata;
29 import org.apache.tika.parser.ParseContext;
30 import org.apache.tika.parser.Parser;
31 import org.apache.tika.sax.BodyContentHandler;
32 import org.junit.Test;
33
34 public class TestChmExtraction {
35
36 private final Parser parser = new ChmParser();
37
38 private final List<String> files = Arrays.asList(
39 "/test-documents/testChm.chm",
40 "/test-documents/testChm3.chm");
41
42 @Test
43 public void testGetText() throws Exception {
44 BodyContentHandler handler = new BodyContentHandler();
45 new ChmParser().parse(
46 new ByteArrayInputStream(TestParameters.chmData),
47 handler, new Metadata(), new ParseContext());
48 assertTrue(handler.toString().contains(
49 "The TCard method accepts only numeric arguments"));
50 }
51
52 @Test
53 public void testChmParser() throws Exception{
54 for (String fileName : files) {
55 InputStream stream =
56 TestChmExtraction.class.getResourceAsStream(fileName);
57 try {
58 BodyContentHandler handler = new BodyContentHandler(-1);
59 parser.parse(stream, handler, new Metadata(), new ParseContext());
60 assertTrue(!handler.toString().isEmpty());
61 } finally {
62 stream.close();
63 }
64 }
65 }
66
67
68 @Test
69 public void testMultiThreadedChmExtraction() throws InterruptedException {
70 ExecutorService executor = Executors.newFixedThreadPool(TestParameters.NTHREADS);
71 for (int i = 0; i < TestParameters.NTHREADS; i++) {
72 executor.execute(new Runnable() {
73 public void run() {
74 for (String fileName : files) {
75 InputStream stream = null;
76 try {
77 stream = TestChmExtraction.class.getResourceAsStream(fileName);
78 BodyContentHandler handler = new BodyContentHandler(-1);
79 parser.parse(stream, handler, new Metadata(), new ParseContext());
80 assertTrue(!handler.toString().isEmpty());
81 } catch (Exception e) {
82 e.printStackTrace();
83 } finally {
84 try {
85 stream.close();
86 } catch (IOException e) {
87 e.printStackTrace();
88 }
89 }
90 }
91 }
92 });
93 }
94 executor.shutdown();
95 // Waits until all threads will have finished
96 while (!executor.isTerminated()) {
97 Thread.sleep(500);
98 }
99 }
100 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.chm;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertNotNull;
20
21 import java.io.ByteArrayInputStream;
22 import java.util.Iterator;
23 import java.util.List;
24
25 import org.apache.tika.exception.TikaException;
26 import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
27 import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
28 import org.apache.tika.parser.chm.core.ChmExtractor;
29 import org.junit.Before;
30 import org.junit.Test;
31
32 public class TestChmExtractor {
33 private ChmExtractor chmExtractor = null;
34
35 @Before
36 public void setUp() throws Exception {
37 chmExtractor = new ChmExtractor(
38 new ByteArrayInputStream(TestParameters.chmData));
39 }
40
41 @Test
42 public void testEnumerateChm() {
43 List<String> chmEntries = chmExtractor.enumerateChm();
44 assertEquals(TestParameters.VP_CHM_ENTITIES_NUMBER,
45 chmEntries.size());
46 }
47
48 @Test
49 public void testGetChmDirList() {
50 assertNotNull(chmExtractor.getChmDirList());
51 }
52
53 @Test
54 public void testExtractChmEntry() throws TikaException{
55 ChmDirectoryListingSet entries = chmExtractor.getChmDirList();
56 int count = 0;
57 for (Iterator<DirectoryListingEntry> it = entries
58 .getDirectoryListingEntryList().iterator(); it.hasNext();) {
59 chmExtractor.extractChmEntry(it.next());
60 ++count;
61 }
62 assertEquals(TestParameters.VP_CHM_ENTITIES_NUMBER, count);
63 }
64
65 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.chm;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertNotNull;
20 import static org.junit.Assert.assertTrue;
21
22 import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
23 import org.apache.tika.parser.chm.core.ChmCommons;
24 import org.apache.tika.parser.chm.core.ChmConstants;
25 import org.junit.After;
26 import org.junit.Before;
27 import org.junit.Test;
28
29 /**
30 * Tests all public functions of ChmItsfHeader
31 *
32 */
33 public class TestChmItsfHeader {
34 private ChmItsfHeader chmItsfHeader = null;
35
36 @Before
37 public void setUp() throws Exception {
38 chmItsfHeader = new ChmItsfHeader();
39 byte[] data = TestParameters.chmData;
40 // chmItsfHeader.parse(Arrays.copyOfRange(data, 0,
41 // ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsfHeader);
42 chmItsfHeader.parse(ChmCommons.copyOfRange(data, 0,
43 ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsfHeader);
44 }
45
46 @Test
47 public void getDataOffset() {
48 assertEquals(TestParameters.VP_DATA_OFFSET_LENGTH,
49 chmItsfHeader.getDataOffset());
50 }
51
52 @Test
53 public void getDir_uuid() {
54 assertNotNull(chmItsfHeader.getDir_uuid());
55 }
56
57 @Test
58 public void getDirLen() {
59 assertEquals(TestParameters.VP_DIRECTORY_LENGTH,
60 chmItsfHeader.getDirLen());
61 }
62
63 @Test
64 public void getDirOffset() {
65 assertEquals(TestParameters.VP_DIRECTORY_OFFSET,
66 chmItsfHeader.getDirOffset());
67 }
68
69 @Test
70 public void getHeaderLen() {
71 assertEquals(TestParameters.VP_ITSF_HEADER_LENGTH,
72 chmItsfHeader.getHeaderLen());
73 }
74
75 @Test
76 public void getLangId() {
77 assertEquals(TestParameters.VP_LANGUAGE_ID,
78 chmItsfHeader.getLangId());
79 }
80
81 @Test
82 public void getLastModified() {
83 assertEquals(TestParameters.VP_LAST_MODIFIED,
84 chmItsfHeader.getLastModified());
85 }
86
87 @Test
88 public void getUnknown_000c() {
89 assertEquals(TestParameters.VP_UNKNOWN_000C,
90 chmItsfHeader.getUnknown_000c());
91 }
92
93 @Test
94 public void getUnknownLen() {
95 assertEquals(TestParameters.VP_UNKNOWN_LEN,
96 chmItsfHeader.getUnknownLen());
97 }
98
99 @Test
100 public void getUnknownOffset() {
101 assertEquals(TestParameters.VP_UNKNOWN_OFFSET,
102 chmItsfHeader.getUnknownOffset());
103 }
104
105 @Test
106 public void getVersion() {
107 assertEquals(TestParameters.VP_VERSION,
108 chmItsfHeader.getVersion());
109 }
110
111 @Test
112 public void testToString() {
113 assertTrue(chmItsfHeader.toString().contains(
114 TestParameters.VP_ISTF_SIGNATURE));
115 }
116
117 @After
118 public void tearDown() throws Exception {
119 chmItsfHeader = null;
120 }
121 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.chm;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertTrue;
20
21 import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
22 import org.apache.tika.parser.chm.accessor.ChmItspHeader;
23 import org.apache.tika.parser.chm.core.ChmCommons;
24 import org.apache.tika.parser.chm.core.ChmConstants;
25 import org.junit.After;
26 import org.junit.Before;
27 import org.junit.Test;
28
29 /**
30 * Tests all public methods of the ChmItspHeader
31 *
32 */
33 public class TestChmItspHeader {
34 private ChmItspHeader chmItspHeader = null;
35
36 @Before
37 public void setUp() throws Exception {
38 byte[] data = TestParameters.chmData;
39
40 ChmItsfHeader chmItsfHeader = new ChmItsfHeader();
41 // chmItsfHeader.parse(Arrays.copyOfRange(data, 0,
42 // ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsfHeader);
43 chmItsfHeader.parse(ChmCommons.copyOfRange(data, 0,
44 ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsfHeader);
45
46 chmItspHeader = new ChmItspHeader();
47 // chmItspHeader.parse(Arrays.copyOfRange( data, (int)
48 // chmItsfHeader.getDirOffset(),
49 // (int) chmItsfHeader.getDirOffset()
50 // + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
51 chmItspHeader.parse(ChmCommons.copyOfRange(data,
52 (int) chmItsfHeader.getDirOffset(),
53 (int) chmItsfHeader.getDirOffset()
54 + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
55 }
56
57 @Test
58 public void testGetBlock_len() {
59 assertEquals(TestParameters.VP_BLOCK_LENGTH,
60 chmItspHeader.getBlock_len());
61 }
62
63 @Test
64 public void testGetBlockidx_intvl() {
65 assertEquals(TestParameters.VP_BLOCK_INDEX_INTERVAL,
66 chmItspHeader.getBlockidx_intvl());
67 }
68
69 @Test
70 public void testGetHeader_len() {
71 assertEquals(TestParameters.VP_ITSP_HEADER_LENGTH,
72 chmItspHeader.getHeader_len());
73 }
74
75 @Test
76 public void testGetIndex_depth() {
77 assertEquals(TestParameters.VP_INDEX_DEPTH,
78 chmItspHeader.getIndex_depth());
79 }
80
81 @Test
82 public void testGetIndex_head() {
83 assertEquals(TestParameters.VP_INDEX_HEAD,
84 chmItspHeader.getIndex_head());
85 }
86
87 @Test
88 public void testGetIndex_root() {
89 assertEquals(TestParameters.VP_INDEX_ROOT,
90 chmItspHeader.getIndex_root());
91 }
92
93 @Test
94 public void testGetLang_id() {
95 assertEquals(TestParameters.VP_LANGUAGE_ID,
96 chmItspHeader.getLang_id());
97 }
98
99 @Test
100 public void testGetNum_blocks() {
101 assertEquals(TestParameters.VP_UNKNOWN_NUM_BLOCKS,
102 chmItspHeader.getNum_blocks());
103 }
104
105 @Test
106 public void testGetUnknown_000c() {
107 assertEquals(TestParameters.VP_ITSP_UNKNOWN_000C,
108 chmItspHeader.getUnknown_000c());
109 }
110
111 @Test
112 public void testGetUnknown_0024() {
113 assertEquals(TestParameters.VP_ITSP_UNKNOWN_0024,
114 chmItspHeader.getUnknown_0024());
115 }
116
117 @Test
118 public void testGetUnknown_002() {
119 assertEquals(TestParameters.VP_ITSP_UNKNOWN_002C,
120 chmItspHeader.getUnknown_002c());
121 }
122
123 @Test
124 public void testGetUnknown_0044() {
125 assertEquals(TestParameters.VP_ITSP_BYTEARR_LEN,
126 chmItspHeader.getUnknown_0044().length);
127 }
128
129 @Test
130 public void testGetVersion() {
131 assertEquals(TestParameters.VP_ITSP_VERSION,
132 chmItspHeader.getVersion());
133 }
134
135 @Test
136 public void testGetSignature() {
137 assertEquals(TestParameters.VP_ISTP_SIGNATURE, new String(
138 chmItspHeader.getSignature()));
139 }
140
141 @Test
142 public void testGetSystem_uuid() {
143 assertEquals(TestParameters.VP_ITSP_BYTEARR_LEN,
144 chmItspHeader.getSystem_uuid().length);
145 }
146
147 @Test
148 public void testToString() {
149 assertTrue(chmItspHeader.toString().contains(
150 TestParameters.VP_ISTP_SIGNATURE));
151 }
152
153 @After
154 public void tearDown() throws Exception {
155 chmItspHeader = null;
156 }
157
158 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.chm;
17
18
19 import static org.junit.Assert.assertNotNull;
20 import static org.junit.Assert.assertTrue;
21
22 import org.apache.tika.exception.TikaException;
23 import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
24 import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
25 import org.apache.tika.parser.chm.accessor.ChmItspHeader;
26 import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
27 import org.apache.tika.parser.chm.core.ChmCommons;
28 import org.apache.tika.parser.chm.core.ChmConstants;
29 import org.apache.tika.parser.chm.lzx.ChmLzxState;
30 import org.junit.Before;
31 import org.junit.Test;
32
33 public class TestChmLzxState {
34 private ChmLzxState chmLzxState;
35 private int windowSize;
36
37 @Before
38 public void setUp() throws Exception {
39 byte[] data = TestParameters.chmData;
40
41 /* Creates and parses itsf header */
42 ChmItsfHeader chmItsHeader = new ChmItsfHeader();
43 // chmItsHeader.parse(Arrays.copyOfRange(data, 0,
44 // ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsHeader);
45 chmItsHeader.parse(ChmCommons.copyOfRange(data, 0,
46 ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsHeader);
47 /* Creates and parses itsp block */
48 ChmItspHeader chmItspHeader = new ChmItspHeader();
49 // chmItspHeader.parse(Arrays.copyOfRange( data, (int)
50 // chmItsHeader.getDirOffset(),
51 // (int) chmItsHeader.getDirOffset()
52 // + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
53 chmItspHeader.parse(ChmCommons.copyOfRange(data,
54 (int) chmItsHeader.getDirOffset(),
55 (int) chmItsHeader.getDirOffset()
56 + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
57
58 /* Creating instance of ChmDirListingContainer */
59 ChmDirectoryListingSet chmDirListCont = new ChmDirectoryListingSet(
60 data, chmItsHeader, chmItspHeader);
61 int indexOfControlData = ChmCommons.indexOf(
62 chmDirListCont.getDirectoryListingEntryList(),
63 ChmConstants.CONTROL_DATA);
64
65 int indexOfResetTable = ChmCommons.indexOfResetTableBlock(data,
66 ChmConstants.LZXC.getBytes());
67 byte[] dir_chunk = null;
68 if (indexOfResetTable > 0) {
69 // dir_chunk = Arrays.copyOfRange( data, indexOfResetTable,
70 // indexOfResetTable
71 // +
72 // chmDirListCont.getDirectoryListingEntryList().get(indexOfControlData).getLength());
73 dir_chunk = ChmCommons.copyOfRange(data, indexOfResetTable,
74 indexOfResetTable
75 + chmDirListCont.getDirectoryListingEntryList()
76 .get(indexOfControlData).getLength());
77 }
78
79 ChmLzxcControlData clcd = new ChmLzxcControlData();
80 clcd.parse(dir_chunk, clcd);
81 windowSize = (int) clcd.getWindowSize();
82 }
83
84 @Test
85 public void testChmLzxStateConstructor() throws TikaException {
86 chmLzxState = new ChmLzxState(windowSize);
87 assertNotNull(chmLzxState);
88 }
89
90 @Test
91 public void testToString() throws TikaException {
92 if (chmLzxState == null)
93 testChmLzxStateConstructor();
94 assertTrue(chmLzxState.toString().length() > 20);
95 }
96
97 // TODO add more tests
98
99 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.chm;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertNotNull;
20 import static org.junit.Assert.assertTrue;
21
22 import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
23 import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
24 import org.apache.tika.parser.chm.accessor.ChmItspHeader;
25 import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
26 import org.apache.tika.parser.chm.core.ChmCommons;
27 import org.apache.tika.parser.chm.core.ChmConstants;
28 import org.junit.Before;
29 import org.junit.Test;
30
31 /**
32 * Tests all public methods of ChmLzxcControlData block
33 */
34 public class TestChmLzxcControlData {
35 private ChmLzxcControlData chmLzxcControlData = null;
36
37 @Before
38 public void setUp() throws Exception {
39 byte[] data = TestParameters.chmData;
40 /* Creates and parses itsf header */
41 ChmItsfHeader chmItsHeader = new ChmItsfHeader();
42 // chmItsHeader.parse(Arrays.copyOfRange(data, 0,
43 // ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsHeader);
44 chmItsHeader.parse(ChmCommons.copyOfRange(data, 0,
45 ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsHeader);
46 /* Creates and parses itsp block */
47 ChmItspHeader chmItspHeader = new ChmItspHeader();
48 // chmItspHeader.parse(Arrays.copyOfRange( data, (int)
49 // chmItsHeader.getDirOffset(),
50 // (int) chmItsHeader.getDirOffset()
51 // + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
52 chmItspHeader.parse(ChmCommons.copyOfRange(data,
53 (int) chmItsHeader.getDirOffset(),
54 (int) chmItsHeader.getDirOffset()
55 + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
56 /* Creating instance of ChmDirListingContainer */
57 ChmDirectoryListingSet chmDirListCont = new ChmDirectoryListingSet(
58 data, chmItsHeader, chmItspHeader);
59 int indexOfControlData = chmDirListCont.getControlDataIndex();
60
61 int indexOfResetTable = ChmCommons.indexOfResetTableBlock(data,
62 ChmConstants.LZXC.getBytes());
63 byte[] dir_chunk = null;
64 if (indexOfResetTable > 0) {
65 // dir_chunk = Arrays.copyOfRange( data, indexOfResetTable,
66 // indexOfResetTable
67 // +
68 // chmDirListCont.getDirectoryListingEntryList().get(indexOfControlData).getLength());
69 dir_chunk = ChmCommons.copyOfRange(data, indexOfResetTable,
70 indexOfResetTable
71 + chmDirListCont.getDirectoryListingEntryList()
72 .get(indexOfControlData).getLength());
73 }
74
75 /* Creates and parses control block */
76 chmLzxcControlData = new ChmLzxcControlData();
77 chmLzxcControlData.parse(dir_chunk, chmLzxcControlData);
78
79 }
80
81 @Test
82 public void testConstructorNotNull() {
83 assertNotNull(chmLzxcControlData);
84 }
85
86 @Test
87 public void testGetResetInterval() {
88 assertEquals(TestParameters.VP_RESET_INTERVAL,
89 chmLzxcControlData.getResetInterval());
90 }
91
92 @Test
93 public void testGetSize() {
94 assertEquals(TestParameters.VP_CONTROL_DATA_SIZE,
95 chmLzxcControlData.getSize());
96 }
97
98 @Test
99 public void testGetUnknown_18() {
100 assertEquals(TestParameters.VP_UNKNOWN_18,
101 chmLzxcControlData.getUnknown_18());
102 }
103
104 @Test
105 public void testGetVersion() {
106 assertEquals(TestParameters.VP_CONTROL_DATA_VERSION,
107 chmLzxcControlData.getVersion());
108 }
109
110 @Test
111 public void testGetWindowSize() {
112 assertEquals(TestParameters.VP_WINDOW_SIZE,
113 chmLzxcControlData.getWindowSize());
114 }
115
116 @Test
117 public void testGetWindowsPerReset() {
118 assertEquals(TestParameters.VP_WINDOWS_PER_RESET,
119 chmLzxcControlData.getWindowsPerReset());
120 }
121
122 @Test
123 public void testGetToString() {
124 assertTrue(chmLzxcControlData.toString().contains(
125 TestParameters.VP_CONTROL_DATA_SIGNATURE));
126 }
127
128 @Test
129 public void testGetSignature() {
130 assertEquals(
131 TestParameters.VP_CONTROL_DATA_SIGNATURE.getBytes().length,
132 chmLzxcControlData.getSignature().length);
133 }
134
135 @Test
136 public void testGetSignaure() {
137 assertEquals(
138 TestParameters.VP_CONTROL_DATA_SIGNATURE.getBytes().length,
139 chmLzxcControlData.getSignature().length);
140 }
141
142 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.parser.chm;
18
19 import static org.junit.Assert.assertEquals;
20 import static org.junit.Assert.assertTrue;
21
22 import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
23 import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
24 import org.apache.tika.parser.chm.accessor.ChmItspHeader;
25 import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
26 import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
27 import org.apache.tika.parser.chm.assertion.ChmAssert;
28 import org.apache.tika.parser.chm.core.ChmCommons;
29 import org.apache.tika.parser.chm.core.ChmConstants;
30 import org.junit.Before;
31 import org.junit.Test;
32
33 public class TestChmLzxcResetTable {
34 private ChmLzxcResetTable chmLzxcResetTable = null;
35
36 @Before
37 public void setUp() throws Exception {
38 byte[] data = TestParameters.chmData;
39 /* Creates and parses itsf header */
40 ChmItsfHeader chmItsfHeader = new ChmItsfHeader();
41 // chmItsfHeader.parse(Arrays.copyOfRange(data, 0,
42 // ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsfHeader);
43 chmItsfHeader.parse(ChmCommons.copyOfRange(data, 0,
44 ChmConstants.CHM_ITSF_V3_LEN - 1), chmItsfHeader);
45 /* Creates and parses itsp block */
46 ChmItspHeader chmItspHeader = new ChmItspHeader();
47 // chmItspHeader.parse(Arrays.copyOfRange( data, (int)
48 // chmItsfHeader.getDirOffset(),
49 // (int) chmItsfHeader.getDirOffset()
50 // + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
51 chmItspHeader.parse(ChmCommons.copyOfRange(data,
52 (int) chmItsfHeader.getDirOffset(),
53 (int) chmItsfHeader.getDirOffset()
54 + ChmConstants.CHM_ITSP_V1_LEN), chmItspHeader);
55 /* Creating instance of ChmDirListingContainer */
56 ChmDirectoryListingSet chmDirListCont = new ChmDirectoryListingSet(
57 data, chmItsfHeader, chmItspHeader);
58 int indexOfControlData = chmDirListCont.getControlDataIndex();
59
60 int indexOfResetTable = ChmCommons.indexOfResetTableBlock(data,
61 ChmConstants.LZXC.getBytes());
62 byte[] dir_chunk = null;
63 if (indexOfResetTable > 0) {
64 // dir_chunk = Arrays.copyOfRange( data, indexOfResetTable,
65 // indexOfResetTable
66 // +
67 // chmDirListCont.getDirectoryListingEntryList().get(indexOfControlData).getLength());
68 dir_chunk = ChmCommons.copyOfRange(data, indexOfResetTable,
69 indexOfResetTable
70 + chmDirListCont.getDirectoryListingEntryList()
71 .get(indexOfControlData).getLength());
72 }
73
74 /* Creates and parses control block */
75 ChmLzxcControlData chmLzxcControlData = new ChmLzxcControlData();
76 chmLzxcControlData.parse(dir_chunk, chmLzxcControlData);
77
78 indexOfResetTable = chmDirListCont.getResetTableIndex();
79 chmLzxcResetTable = new ChmLzxcResetTable();
80
81 int startIndex = (int) chmDirListCont.getDataOffset()
82 + chmDirListCont.getDirectoryListingEntryList()
83 .get(indexOfResetTable).getOffset();
84
85 ChmAssert.assertCopyingDataIndex(startIndex, data.length);
86
87 // dir_chunk = Arrays.copyOfRange(data, startIndex, startIndex
88 // +
89 // chmDirListCont.getDirectoryListingEntryList().get(indexOfResetTable).getLength());
90 dir_chunk = ChmCommons.copyOfRange(
91 data,
92 startIndex,
93 startIndex
94 + chmDirListCont.getDirectoryListingEntryList()
95 .get(indexOfResetTable).getLength());
96
97 chmLzxcResetTable.parse(dir_chunk, chmLzxcResetTable);
98 }
99
100 @Test
101 public void testGetBlockAddress() {
102 assertEquals(TestParameters.VP_RESET_TABLE_BA,
103 chmLzxcResetTable.getBlockAddress().length);
104 }
105
106 @Test
107 public void testGetBlockCount() {
108 assertEquals(TestParameters.VP_RESET_TABLE_BA,
109 chmLzxcResetTable.getBlockCount());
110 }
111
112 @Test
113 public void testGetBlockLen() {
114 assertEquals(TestParameters.VP_RES_TBL_BLOCK_LENGTH,
115 chmLzxcResetTable.getBlockLen());
116 }
117
118 @Test
119 public void testGetCompressedLen() {
120 assertEquals(TestParameters.VP_RES_TBL_COMPR_LENGTH,
121 chmLzxcResetTable.getCompressedLen());
122 }
123
124 @Test
125 public void testGetTableOffset() {
126 assertEquals(TestParameters.VP_TBL_OFFSET,
127 chmLzxcResetTable.getTableOffset());
128 }
129
130 @Test
131 public void testGetUncompressedLen() {
132 assertEquals(TestParameters.VP_RES_TBL_UNCOMP_LENGTH,
133 chmLzxcResetTable.getUncompressedLen());
134 }
135
136 @Test
137 public void testGetUnknown() {
138 assertEquals(TestParameters.VP_RES_TBL_UNKNOWN,
139 chmLzxcResetTable.getUnknown());
140 }
141
142 @Test
143 public void testGetVersion() {
144 assertEquals(TestParameters.VP_RES_TBL_VERSION,
145 chmLzxcResetTable.getVersion());
146 }
147
148 @Test
149 public void testToString() {
150 assertTrue(chmLzxcResetTable.toString().length() > 0);
151 }
152
153 // TODO: add setters to be tested
154 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.chm;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertNotNull;
20
21 import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
22 import org.junit.Before;
23 import org.junit.Test;
24
25 /**
26 * Tests public methods of the DirectoryListingEntry class
27 *
28 * @author olegt
29 *
30 */
31 public class TestDirectoryListingEntry {
32 private DirectoryListingEntry dle = null;
33
34 @Before
35 public void setUp() throws Exception {
36 dle = new DirectoryListingEntry(TestParameters.nameLength,
37 TestParameters.entryName, TestParameters.entryType,
38 TestParameters.offset, TestParameters.length);
39 }
40
41 @Test
42 public void testDefaultConstructor() {
43 assertNotNull(dle);
44 }
45
46 @Test
47 public void testParamConstructor() {
48 assertEquals(TestParameters.nameLength, dle.getNameLength());
49 assertEquals(TestParameters.entryName, dle.getName());
50 assertEquals(TestParameters.entryType, dle.getEntryType());
51 assertEquals(TestParameters.offset, dle.getOffset());
52 assertEquals(TestParameters.length, dle.getLength());
53 }
54
55 @Test
56 public void testToString() {
57 assertNotNull(dle.toString());
58 }
59
60 @Test
61 public void testGetNameLength() {
62 assertEquals(TestParameters.nameLength, dle.getNameLength());
63 }
64
65 @Test
66 public void testGetName() {
67 assertEquals(TestParameters.entryName, dle.getName());
68 }
69
70 @Test
71 public void testGetEntryType() {
72 assertEquals(TestParameters.entryType, dle.getEntryType());
73 }
74
75 @Test
76 public void testGetOffset() {
77 assertEquals(TestParameters.offset, dle.getOffset());
78 }
79
80 @Test
81 public void testGetLength() {
82 assertEquals(TestParameters.length, dle.getLength());
83 }
84 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.chm;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20
21 import org.apache.tika.io.IOUtils;
22 import org.apache.tika.parser.chm.core.ChmCommons.EntryType;
23
24 /**
25 * Holds test parameters such as verification points
26 */
27 public class TestParameters {
28 /* Prevents instantiation */
29 private TestParameters() {
30 }
31
32 /* Test values */
33 static final int nameLength = 5;
34 static final String entryName = TestParameters.class.getName();
35 static EntryType entryType = EntryType.COMPRESSED;
36 static final int offset = 3;
37 static final int length = 20;
38 static final int NTHREADS = 2;
39
40 static final int BUFFER_SIZE = 16384;
41
42 static final byte[] chmData = readResource("/test-documents/testChm.chm");
43
44 private static byte[] readResource(String name) {
45 try {
46 InputStream stream = TestParameters.class.getResourceAsStream(name);
47 try {
48 return IOUtils.toByteArray(stream);
49 } finally {
50 stream.close();
51 }
52 } catch (IOException e) {
53 throw new RuntimeException(e);
54 }
55 }
56
57 /* Verification points */
58 static final String VP_CHM_MIME_TYPE = "Content-Type=application/x-chm";
59 static final String VP_EXTRACTED_TEXT = "The TCard method accepts only numeric arguments";
60 static final String VP_ISTF_SIGNATURE = "ITSF";
61 static final String VP_ISTP_SIGNATURE = "ITSP";
62 static final String VP_PMGL_SIGNATURE = "PMGL";
63 static final String VP_CONTROL_DATA_SIGNATURE = "LZXC";
64
65 static final int VP_DIRECTORY_LENGTH = 4180;
66 static final int VP_DATA_OFFSET_LENGTH = 4300;
67 static final int VP_DIRECTORY_OFFSET = 120;
68 static final int VP_ITSF_HEADER_LENGTH = 96;
69 static final int VP_LANGUAGE_ID = 1033;
70 static final int VP_LAST_MODIFIED = 1042357880;
71 static final int VP_UNKNOWN_000C = 1;
72 static final int VP_UNKNOWN_LEN = 24;
73 static final int VP_UNKNOWN_OFFSET = 96;
74 static final int VP_VERSION = 3;
75 static final int VP_BLOCK_LENGTH = 4096;
76 static final int VP_BLOCK_INDEX_INTERVAL = 2;
77 static final int VP_ITSP_HEADER_LENGTH = 84;
78 static final int VP_INDEX_DEPTH = 1;
79 static final int VP_INDEX_HEAD = 0;
80 static final int VP_INDEX_ROOT = -1;
81 static final int VP_UNKNOWN_NUM_BLOCKS = -1;
82 static final int VP_ITSP_UNKNOWN_000C = 10;
83 static final int VP_ITSP_UNKNOWN_0024 = 0;
84 static final int VP_ITSP_UNKNOWN_002C = 1;
85 static final int VP_ITSP_BYTEARR_LEN = 16;
86 static final int VP_ITSP_VERSION = 1;
87 static final int VP_RESET_INTERVAL = 2;
88 static final int VP_CONTROL_DATA_SIZE = 6;
89 static final int VP_UNKNOWN_18 = 0;
90 static final int VP_CONTROL_DATA_VERSION = 2;
91 static final int VP_WINDOW_SIZE = 65536;
92 static final int VP_WINDOWS_PER_RESET = 1;
93 static final int VP_CHM_ENTITIES_NUMBER = 101;
94 static final int VP_PMGI_FREE_SPACE = 3;
95 static final int VP_PMGL_BLOCK_NEXT = -1;
96 static final int VP_PMGL_BLOCK_PREV = -1;
97 static final int VP_PMGL_FREE_SPACE = 1644;
98 static final int VP_PMGL_UNKNOWN_008 = 0;
99 static final int VP_RESET_TABLE_BA = 12;
100 static final int VP_RES_TBL_BLOCK_LENGTH = 32768;
101 static final int VP_RES_TBL_COMPR_LENGTH = 177408;
102 static final int VP_RES_TBL_UNCOMP_LENGTH = 383786;
103 static final int VP_TBL_OFFSET = 40;
104 static final int VP_RES_TBL_UNKNOWN = 8;
105 static final int VP_RES_TBL_VERSION = 2;
106 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.chm;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertTrue;
20
21 import org.apache.tika.parser.chm.accessor.ChmPmgiHeader;
22 import org.junit.Before;
23 import org.junit.Test;
24
25 public class TestPmgiHeader {
26 ChmPmgiHeader chmPmgiHeader = null;
27
28 @Before
29 public void setUp() throws Exception {
30 byte[] data = TestParameters.chmData;
31 chmPmgiHeader = new ChmPmgiHeader();
32 chmPmgiHeader.parse(data, chmPmgiHeader);
33 }
34
35 @Test
36 public void testToString() {
37 assertTrue((chmPmgiHeader != null) && (chmPmgiHeader.toString().length() > 0));
38 }
39
40 @Test
41 public void testGetFreeSpace() {
42 assertEquals(TestParameters.VP_PMGI_FREE_SPACE, chmPmgiHeader.getFreeSpace());
43 }
44 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.chm;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertTrue;
20
21 import org.apache.tika.parser.chm.accessor.ChmPmglHeader;
22 import org.apache.tika.parser.chm.core.ChmCommons;
23 import org.apache.tika.parser.chm.core.ChmConstants;
24 import org.junit.Before;
25 import org.junit.Test;
26
27 public class TestPmglHeader {
28 ChmPmglHeader chmPmglHeader = null;
29
30 @Before
31 public void setUp() throws Exception {
32 byte[] data = TestParameters.chmData;
33 chmPmglHeader = new ChmPmglHeader();
34 chmPmglHeader.parse(ChmCommons.copyOfRange(data,
35 ChmConstants.START_PMGL, ChmConstants.START_PMGL
36 + ChmConstants.CHM_PMGL_LEN + 10), chmPmglHeader);
37 }
38
39 @Test
40 public void testToString() {
41 assertTrue((chmPmglHeader != null)
42 && chmPmglHeader.toString().length() > 0);
43 }
44
45 @Test
46 public void testChmPmglHeaderGet() {
47 assertEquals(TestParameters.VP_PMGL_SIGNATURE, new String(
48 chmPmglHeader.getSignature()));
49 }
50
51 @Test
52 public void testGetBlockNext() {
53 assertEquals(TestParameters.VP_PMGL_BLOCK_NEXT,
54 chmPmglHeader.getBlockNext());
55 }
56
57 @Test
58 public void testGetBlockPrev() {
59 assertEquals(TestParameters.VP_PMGL_BLOCK_PREV,
60 chmPmglHeader.getBlockPrev());
61 }
62
63 @Test
64 public void testGetFreeSpace() {
65 assertEquals(TestParameters.VP_PMGL_FREE_SPACE,
66 chmPmglHeader.getFreeSpace());
67 }
68
69 @Test
70 public void testGetUnknown0008() {
71 assertEquals(TestParameters.VP_PMGL_UNKNOWN_008,
72 chmPmglHeader.getUnknown0008());
73 }
74 }
0 /**
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.parser.crypto;
18
19 import static org.junit.Assert.assertTrue;
20 import static org.junit.Assert.fail;
21
22 import java.io.InputStream;
23
24 import org.apache.tika.TikaTest;
25 import org.apache.tika.exception.TikaException;
26 import org.apache.tika.metadata.Metadata;
27 import org.apache.tika.parser.ParseContext;
28 import org.apache.tika.sax.BodyContentHandler;
29 import org.xml.sax.ContentHandler;
30
31 public class Pkcs7ParserTest extends TikaTest {
32 public void testDetachedSignature() throws Exception {
33 InputStream input = Pkcs7ParserTest.class.getResourceAsStream(
34 "/test-documents/testDetached.p7s");
35 try {
36 ContentHandler handler = new BodyContentHandler();
37 Metadata metadata = new Metadata();
38 new Pkcs7Parser().parse(input, handler, metadata, new ParseContext());
39 } catch (NullPointerException npe) {
40 fail("should not get NPE");
41 } catch (TikaException te) {
42 assertTrue(te.toString().indexOf("cannot parse detached pkcs7 signature") != -1);
43 } finally {
44 input.close();
45 }
46 }
47 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.dwg;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertNull;
20 import static org.junit.Assert.assertTrue;
21
22 import java.io.InputStream;
23
24 import org.apache.tika.metadata.Metadata;
25 import org.apache.tika.metadata.TikaCoreProperties;
26 import org.apache.tika.sax.BodyContentHandler;
27 import org.junit.Test;
28 import org.xml.sax.ContentHandler;
29
30 public class DWGParserTest {
31
32 @Test
33 public void testDWG2000Parser() throws Exception {
34 InputStream input = DWGParserTest.class.getResourceAsStream(
35 "/test-documents/testDWG2000.dwg");
36 testParserAlt(input);
37 }
38
39 @Test
40 public void testDWG2004Parser() throws Exception {
41 InputStream input = DWGParserTest.class.getResourceAsStream(
42 "/test-documents/testDWG2004.dwg");
43 testParser(input);
44 }
45
46 @Test
47 public void testDWG2004ParserNoHeaderAddress() throws Exception {
48 InputStream input = DWGParserTest.class.getResourceAsStream(
49 "/test-documents/testDWG2004_no_header.dwg");
50 testParserNoHeader(input);
51 }
52
53 @Test
54 public void testDWG2007Parser() throws Exception {
55 InputStream input = DWGParserTest.class.getResourceAsStream(
56 "/test-documents/testDWG2007.dwg");
57 testParser(input);
58 }
59
60 @Test
61 public void testDWG2010Parser() throws Exception {
62 InputStream input = DWGParserTest.class.getResourceAsStream(
63 "/test-documents/testDWG2010.dwg");
64 testParser(input);
65 }
66
67 @Test
68 public void testDWG2010CustomPropertiesParser() throws Exception {
69 // Check that standard parsing works
70 InputStream input = DWGParserTest.class.getResourceAsStream(
71 "/test-documents/testDWG2010_custom_props.dwg");
72 testParser(input);
73
74 // Check that custom properties with alternate padding work
75 input = DWGParserTest.class.getResourceAsStream(
76 "/test-documents/testDWG2010_custom_props.dwg");
77 try {
78 Metadata metadata = new Metadata();
79 ContentHandler handler = new BodyContentHandler();
80 new DWGParser().parse(input, handler, metadata, null);
81
82 assertEquals("valueforcustomprop1",
83 metadata.get("customprop1"));
84 assertEquals("valueforcustomprop2",
85 metadata.get("customprop2"));
86 } finally {
87 input.close();
88 }
89 }
90
91 @Test
92 public void testDWGMechParser() throws Exception {
93 String[] types = new String[] {
94 "6", "2004", "2004DX", "2005", "2006",
95 "2007", "2008", "2009", "2010", "2011"
96 };
97 for (String type : types) {
98 InputStream input = DWGParserTest.class.getResourceAsStream(
99 "/test-documents/testDWGmech"+type+".dwg");
100 testParserAlt(input);
101 }
102 }
103
104 @SuppressWarnings("deprecation")
105 private void testParser(InputStream input) throws Exception {
106 try {
107 Metadata metadata = new Metadata();
108 ContentHandler handler = new BodyContentHandler();
109 new DWGParser().parse(input, handler, metadata);
110
111 assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));
112
113 assertEquals("The quick brown fox jumps over the lazy dog",
114 metadata.get(TikaCoreProperties.TITLE));
115 assertEquals("Gym class featuring a brown fox and lazy dog",
116 metadata.get(TikaCoreProperties.DESCRIPTION));
117 assertEquals("Gym class featuring a brown fox and lazy dog",
118 metadata.get(Metadata.SUBJECT));
119 assertEquals("Nevin Nollop",
120 metadata.get(TikaCoreProperties.CREATOR));
121 assertEquals("Pangram, fox, dog",
122 metadata.get(TikaCoreProperties.KEYWORDS));
123 assertEquals("Lorem ipsum",
124 metadata.get(TikaCoreProperties.COMMENTS).substring(0,11));
125 assertEquals("http://www.alfresco.com",
126 metadata.get(TikaCoreProperties.RELATION));
127
128 // Check some of the old style metadata too
129 assertEquals("The quick brown fox jumps over the lazy dog",
130 metadata.get(Metadata.TITLE));
131 assertEquals("Gym class featuring a brown fox and lazy dog",
132 metadata.get(Metadata.SUBJECT));
133
134 String content = handler.toString();
135 assertTrue(content.contains("The quick brown fox jumps over the lazy dog"));
136 assertTrue(content.contains("Gym class"));
137 assertTrue(content.contains("www.alfresco.com"));
138 } finally {
139 input.close();
140 }
141 }
142
143 @SuppressWarnings("deprecation")
144 private void testParserNoHeader(InputStream input) throws Exception {
145 try {
146 Metadata metadata = new Metadata();
147 ContentHandler handler = new BodyContentHandler();
148 new DWGParser().parse(input, handler, metadata);
149
150 assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));
151
152 assertNull(metadata.get(TikaCoreProperties.TITLE));
153 assertNull(metadata.get(TikaCoreProperties.DESCRIPTION));
154 assertNull(metadata.get(Metadata.SUBJECT));
155 assertNull(metadata.get(TikaCoreProperties.CREATOR));
156 assertNull(metadata.get(TikaCoreProperties.KEYWORDS));
157 assertNull(metadata.get(TikaCoreProperties.COMMENTS));
158 assertNull(metadata.get(TikaCoreProperties.RELATION));
159
160 String content = handler.toString();
161 assertTrue(content.contains(""));
162 } finally {
163 input.close();
164 }
165 }
166
167 @SuppressWarnings("deprecation")
168 private void testParserAlt(InputStream input) throws Exception {
169 try {
170 Metadata metadata = new Metadata();
171 ContentHandler handler = new BodyContentHandler();
172 new DWGParser().parse(input, handler, metadata);
173
174 assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));
175
176 assertEquals("Test Title",
177 metadata.get(TikaCoreProperties.TITLE));
178 assertEquals("Test Subject",
179 metadata.get(TikaCoreProperties.DESCRIPTION));
180 assertEquals("Test Subject",
181 metadata.get(Metadata.SUBJECT));
182 assertEquals("My Author",
183 metadata.get(TikaCoreProperties.CREATOR));
184 assertEquals("My keyword1, MyKeyword2",
185 metadata.get(TikaCoreProperties.KEYWORDS));
186 assertEquals("This is a comment",
187 metadata.get(TikaCoreProperties.COMMENTS));
188 assertEquals("bejanpol",
189 metadata.get(TikaCoreProperties.MODIFIER));
190 assertEquals("bejanpol",
191 metadata.get(Metadata.LAST_AUTHOR));
192 assertEquals("http://mycompany/drawings",
193 metadata.get(TikaCoreProperties.RELATION));
194 assertEquals("MyCustomPropertyValue",
195 metadata.get("MyCustomProperty"));
196
197 String content = handler.toString();
198 assertTrue(content.contains("This is a comment"));
199 assertTrue(content.contains("mycompany"));
200 } finally {
201 input.close();
202 }
203 }
204 }
0 /**
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.epub;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertTrue;
20
21 import java.io.InputStream;
22
23 import org.apache.tika.metadata.Metadata;
24 import org.apache.tika.metadata.TikaCoreProperties;
25 import org.apache.tika.parser.ParseContext;
26 import org.apache.tika.sax.BodyContentHandler;
27 import org.junit.Test;
28 import org.xml.sax.ContentHandler;
29
30 public class EpubParserTest {
31
32 @Test
33 public void testXMLParser() throws Exception {
34 InputStream input = EpubParserTest.class.getResourceAsStream(
35 "/test-documents/testEPUB.epub");
36 try {
37 Metadata metadata = new Metadata();
38 ContentHandler handler = new BodyContentHandler();
39 new EpubParser().parse(input, handler, metadata, new ParseContext());
40
41 assertEquals("application/epub+zip",
42 metadata.get(Metadata.CONTENT_TYPE));
43 assertEquals("en",
44 metadata.get(TikaCoreProperties.LANGUAGE));
45 assertEquals("This is an ePub test publication for Tika.",
46 metadata.get(TikaCoreProperties.DESCRIPTION));
47 assertEquals("Apache",
48 metadata.get(TikaCoreProperties.PUBLISHER));
49
50 String content = handler.toString();
51 assertTrue(content.contains("Plus a simple div"));
52 assertTrue(content.contains("First item"));
53 assertTrue(content.contains("The previous headings were subchapters"));
54 assertTrue(content.contains("Table data"));
55 } finally {
56 input.close();
57 }
58 }
59
60 }
0 /**
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.executable;
17
18 import static org.junit.Assert.assertEquals;
19
20 import java.io.InputStream;
21
22 import org.apache.tika.metadata.Metadata;
23 import org.apache.tika.parser.ParseContext;
24 import org.apache.tika.sax.BodyContentHandler;
25 import org.junit.Test;
26 import org.xml.sax.ContentHandler;
27
28 public class ExecutableParserTest {
29
30 @Test
31 public void testWin32Parser() throws Exception {
32 InputStream input = ExecutableParserTest.class.getResourceAsStream(
33 "/test-documents/testWindows-x86-32.exe");
34 try {
35 Metadata metadata = new Metadata();
36 ContentHandler handler = new BodyContentHandler();
37 new ExecutableParser().parse(input, handler, metadata, new ParseContext());
38
39 assertEquals("application/x-msdownload",
40 metadata.get(Metadata.CONTENT_TYPE));
41 assertEquals("2012-05-13T13:40:11Z",
42 metadata.get(Metadata.CREATION_DATE));
43
44 assertEquals(ExecutableParser.MACHINE_x86_32,
45 metadata.get(ExecutableParser.MACHINE_TYPE));
46 assertEquals("Little",
47 metadata.get(ExecutableParser.ENDIAN));
48 assertEquals("32",
49 metadata.get(ExecutableParser.ARCHITECTURE_BITS));
50 assertEquals("Windows",
51 metadata.get(ExecutableParser.PLATFORM));
52
53 String content = handler.toString();
54 assertEquals("", content); // No text yet
55 } finally {
56 input.close();
57 }
58 }
59
60 @Test
61 public void testElfParser_x86_32() throws Exception {
62 InputStream input = ExecutableParserTest.class.getResourceAsStream(
63 "/test-documents/testLinux-x86-32");
64 try {
65 Metadata metadata = new Metadata();
66 ContentHandler handler = new BodyContentHandler();
67 new ExecutableParser().parse(input, handler, metadata, new ParseContext());
68
69 assertEquals("application/x-executable",
70 metadata.get(Metadata.CONTENT_TYPE));
71
72 assertEquals(ExecutableParser.MACHINE_x86_32,
73 metadata.get(ExecutableParser.MACHINE_TYPE));
74 assertEquals("Little",
75 metadata.get(ExecutableParser.ENDIAN));
76 assertEquals("32",
77 metadata.get(ExecutableParser.ARCHITECTURE_BITS));
78 // assertEquals("Linux",
79 // metadata.get(ExecutableParser.PLATFORM));
80
81 String content = handler.toString();
82 assertEquals("", content); // No text yet
83 } finally {
84 input.close();
85 }
86 }
87
88 }
0 /**
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.feed;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertFalse;
20
21 import java.io.InputStream;
22
23 import org.apache.tika.metadata.Metadata;
24 import org.apache.tika.metadata.TikaCoreProperties;
25 import org.apache.tika.parser.ParseContext;
26 import org.apache.tika.sax.BodyContentHandler;
27 import org.junit.Test;
28 import org.xml.sax.ContentHandler;
29
30 public class FeedParserTest {
31
32 @Test
33 public void testXMLParser() throws Exception {
34 InputStream input = FeedParserTest.class
35 .getResourceAsStream("/test-documents/rsstest.rss");
36 try {
37 Metadata metadata = new Metadata();
38 ContentHandler handler = new BodyContentHandler();
39 ParseContext context = new ParseContext();
40
41 new FeedParser().parse(input, handler, metadata, context);
42
43 String content = handler.toString();
44 assertFalse(content == null);
45
46 assertEquals("Sample RSS File for Junit test",
47 metadata.get(TikaCoreProperties.DESCRIPTION));
48 assertEquals("TestChannel", metadata.get(TikaCoreProperties.TITLE));
49
50 // TODO find a way of testing the paragraphs and anchors
51
52 } finally {
53 input.close();
54 }
55 }
56
57 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.font;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertTrue;
20
21 import org.apache.tika.metadata.Metadata;
22 import org.apache.tika.metadata.TikaCoreProperties;
23 import org.apache.tika.parser.AutoDetectParser;
24 import org.apache.tika.parser.ParseContext;
25 import org.apache.tika.parser.Parser;
26 import org.apache.tika.sax.BodyContentHandler;
27 import org.xml.sax.ContentHandler;
28 import org.apache.tika.io.TikaInputStream;
29 import org.junit.Test;
30
31 /**
32 * Test case for parsing afm files.
33 */
34 public class AdobeFontMetricParserTest {
35
36 @Test
37 public void testAdobeFontMetricParsing() throws Exception {
38 Parser parser = new AutoDetectParser(); // Should auto-detect!
39 ContentHandler handler = new BodyContentHandler();
40 Metadata metadata = new Metadata();
41 ParseContext context = new ParseContext();
42 TikaInputStream stream = TikaInputStream.get(
43 AdobeFontMetricParserTest.class.getResource(
44 "/test-documents/testAFM.afm"));
45
46 try {
47 parser.parse(stream, handler, metadata, context);
48 } finally {
49 stream.close();
50 }
51
52 assertEquals("application/x-font-adobe-metric", metadata.get(Metadata.CONTENT_TYPE));
53 assertEquals("TestFullName", metadata.get(TikaCoreProperties.TITLE));
54 assertEquals("Fri Jul 15 17:50:51 2011", metadata.get(Metadata.CREATION_DATE));
55
56 assertEquals("TestFontName", metadata.get("FontName"));
57 assertEquals("TestFullName", metadata.get("FontFullName"));
58 assertEquals("TestSymbol", metadata.get("FontFamilyName"));
59
60 assertEquals("Medium", metadata.get("FontWeight"));
61 assertEquals("001.008", metadata.get("FontVersion"));
62
63 String content = handler.toString();
64
65 // Test that the comments got extracted
66 assertTrue(content.contains("Comments"));
67 assertTrue(content.contains("This is a comment in a sample file"));
68 assertTrue(content.contains("UniqueID 12345"));
69 }
70 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.fork;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertNotNull;
20 import static org.junit.Assert.assertTrue;
21 import static org.junit.Assert.fail;
22
23 import java.io.IOException;
24 import java.io.InputStream;
25 import java.io.NotSerializableException;
26 import java.util.Arrays;
27 import java.util.HashSet;
28 import java.util.Set;
29
30 import org.apache.tika.Tika;
31 import org.apache.tika.config.TikaConfig;
32 import org.apache.tika.detect.DefaultDetector;
33 import org.apache.tika.detect.Detector;
34 import org.apache.tika.exception.TikaException;
35 import org.apache.tika.fork.ForkParser;
36 import org.apache.tika.metadata.Metadata;
37 import org.apache.tika.mime.MediaType;
38 import org.apache.tika.parser.ParseContext;
39 import org.apache.tika.parser.Parser;
40 import org.apache.tika.sax.BodyContentHandler;
41 import org.junit.Test;
42 import org.xml.sax.ContentHandler;
43 import org.xml.sax.SAXException;
44
45 /**
46 * Test that the ForkParser correctly behaves when
47 * wired in to the regular Parsers and their test data
48 */
49 public class ForkParserIntegrationTest {
50
51 private Tika tika = new Tika(); // TODO Use TikaConfig instead, when it works
52
53 /**
54 * Simple text parsing
55 */
56 @Test
57 public void testForkedTextParsing() throws Exception {
58 ForkParser parser = new ForkParser(
59 ForkParserIntegrationTest.class.getClassLoader(),
60 tika.getParser());
61
62 try {
63 ContentHandler output = new BodyContentHandler();
64 InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream(
65 "/test-documents/testTXT.txt");
66 ParseContext context = new ParseContext();
67 parser.parse(stream, output, new Metadata(), context);
68
69 String content = output.toString();
70 assertTrue(content.contains("Test d'indexation"));
71 assertTrue(content.contains("http://www.apache.org"));
72 } finally {
73 parser.close();
74 }
75 }
76
77 /**
78 * This error has a message and an equals() implementation as to be able
79 * to match it against the serialized version of itself.
80 */
81 static class AnError extends Error {
82 private static final long serialVersionUID = -6197267350768803348L;
83 private String message;
84 AnError(String message) {
85 super(message);
86 this.message = message;
87 }
88
89 @Override
90 public boolean equals(Object o) {
91 if (this == o) return true;
92 if (o == null || getClass() != o.getClass()) return false;
93
94 AnError anError = (AnError) o;
95
96 if (!message.equals(anError.message)) return false;
97
98 return true;
99 }
100
101 @Override
102 public int hashCode() {
103 return message.hashCode();
104 }
105 }
106
107 /**
108 * This error isn't serializable on the server, so can't be sent back
109 * to the Fork Client once it has occured
110 */
111 static class WontBeSerializedError extends RuntimeException {
112 private static final long serialVersionUID = 1L;
113
114 WontBeSerializedError(String message) {
115 super(message);
116 }
117
118 private void writeObject(java.io.ObjectOutputStream out) {
119 RuntimeException e = new RuntimeException("Bang!");
120 boolean found = false;
121 for (StackTraceElement ste : e.getStackTrace()) {
122 if (ste.getClassName().equals(ForkParser.class.getName())) {
123 found = true;
124 }
125 }
126 if (!found) {
127 throw e;
128 }
129 }
130 }
131
132 static class BrokenParser implements Parser {
133 private static final long serialVersionUID = 995871497930817839L;
134 public Error err = new AnError("Simulated fail");
135 public RuntimeException re = null;
136
137 public Set<MediaType> getSupportedTypes(ParseContext context) {
138 return new HashSet<MediaType>(Arrays.asList(MediaType.TEXT_PLAIN));
139 }
140
141 public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
142 if (re != null) throw re;
143 throw err;
144 }
145 }
146
147 /**
148 * TIKA-831 Parsers throwing errors should be caught and
149 * properly reported
150 */
151 @Test
152 public void testParsingErrorInForkedParserShouldBeReported() throws Exception {
153 BrokenParser brokenParser = new BrokenParser();
154 Parser parser = new ForkParser(ForkParser.class.getClassLoader(), brokenParser);
155 InputStream stream = getClass().getResourceAsStream("/test-documents/testTXT.txt");
156
157 // With a serializable error, we'll get that back
158 try {
159 ContentHandler output = new BodyContentHandler();
160 ParseContext context = new ParseContext();
161 parser.parse(stream, output, new Metadata(), context);
162 fail("Expected TikaException caused by Error");
163 } catch (TikaException e) {
164 assertEquals(brokenParser.err, e.getCause());
165 }
166
167 // With a non serializable one, we'll get something else
168 // TODO Fix this test
169 brokenParser = new BrokenParser();
170 brokenParser.re= new WontBeSerializedError("Can't Serialize");
171 parser = new ForkParser(ForkParser.class.getClassLoader(), brokenParser);
172 // try {
173 // ContentHandler output = new BodyContentHandler();
174 // ParseContext context = new ParseContext();
175 // parser.parse(stream, output, new Metadata(), context);
176 // fail("Expected TikaException caused by Error");
177 // } catch (TikaException e) {
178 // assertEquals(TikaException.class, e.getCause().getClass());
179 // assertEquals("Bang!", e.getCause().getMessage());
180 // }
181 }
182
183 /**
184 * If we supply a non serializable object on the ParseContext,
185 * check we get a helpful exception back
186 */
187 @Test
188 public void testParserHandlingOfNonSerializable() throws Exception {
189 ForkParser parser = new ForkParser(
190 ForkParserIntegrationTest.class.getClassLoader(),
191 tika.getParser());
192
193 ParseContext context = new ParseContext();
194 context.set(Detector.class, new Detector() {
195 public MediaType detect(InputStream input, Metadata metadata) {
196 return MediaType.OCTET_STREAM;
197 }
198 });
199
200 try {
201 ContentHandler output = new BodyContentHandler();
202 InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream(
203 "/test-documents/testTXT.txt");
204 parser.parse(stream, output, new Metadata(), context);
205 fail("Should have blown up with a non serializable ParseContext");
206 } catch(TikaException e) {
207 // Check the right details
208 assertNotNull(e.getCause());
209 assertEquals(NotSerializableException.class, e.getCause().getClass());
210 assertEquals("Unable to serialize ParseContext to pass to the Forked Parser", e.getMessage());
211 } finally {
212 parser.close();
213 }
214 }
215
216 /**
217 * TIKA-832
218 */
219 @Test
220 public void testAttachingADebuggerOnTheForkedParserShouldWork()
221 throws Exception {
222 ParseContext context = new ParseContext();
223 context.set(Parser.class, tika.getParser());
224
225 ForkParser parser = new ForkParser(
226 ForkParserIntegrationTest.class.getClassLoader(),
227 tika.getParser());
228 parser.setJavaCommand(
229 "java -Xmx32m -Xdebug -Xrunjdwp:"
230 + "transport=dt_socket,address=54321,server=y,suspend=n");
231 try {
232 ContentHandler body = new BodyContentHandler();
233 InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream(
234 "/test-documents/testTXT.txt");
235 parser.parse(stream, body, new Metadata(), context);
236 String content = body.toString();
237 assertTrue(content.contains("Test d'indexation"));
238 assertTrue(content.contains("http://www.apache.org"));
239 } finally {
240 parser.close();
241 }
242 }
243
244 /**
245 * TIKA-808 - Ensure that parsing of our test PDFs work under
246 * the Fork Parser, to ensure that complex parsing behaves
247 */
248 @Test
249 public void testForkedPDFParsing() throws Exception {
250 ForkParser parser = new ForkParser(
251 ForkParserIntegrationTest.class.getClassLoader(),
252 tika.getParser());
253 try {
254 ContentHandler output = new BodyContentHandler();
255 InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream(
256 "/test-documents/testPDF.pdf");
257 ParseContext context = new ParseContext();
258 parser.parse(stream, output, new Metadata(), context);
259
260 String content = output.toString();
261 assertTrue(content.contains("Apache Tika"));
262 assertTrue(content.contains("Tika - Content Analysis Toolkit"));
263 assertTrue(content.contains("incubator"));
264 assertTrue(content.contains("Apache Software Foundation"));
265 } finally {
266 parser.close();
267 }
268 }
269 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.hdf;
17
18 //JDK imports
19 import static org.junit.Assert.assertEquals;
20 import static org.junit.Assert.assertNotNull;
21
22 import java.io.InputStream;
23
24
25
26
27 //TIKA imports
28 import org.apache.tika.metadata.Metadata;
29 import org.apache.tika.parser.ParseContext;
30 import org.apache.tika.parser.Parser;
31 import org.apache.tika.parser.hdf.HDFParser;
32 import org.apache.tika.sax.BodyContentHandler;
33 import org.junit.Test;
34 import org.xml.sax.ContentHandler;
35
36 /**
37 *
38 * Test suite for the {@link HDFParser}.
39 *
40 */
41 public class HDFParserTest {
42
43 @Test
44 public void testParseGlobalMetadata() throws Exception {
45 if(System.getProperty("java.version").startsWith("1.5")) {
46 return;
47 }
48 Parser parser = new HDFParser();
49 ContentHandler handler = new BodyContentHandler();
50 Metadata metadata = new Metadata();
51
52 /*
53 * this is a publicly available HDF5 file from the MLS mission:
54 *
55 *
56 * ftp://acdisc.gsfc.nasa.gov/data/s4pa///Aura_MLS_Level2/ML2O3.002//2009
57 * /MLS-Aura_L2GP-O3_v02-23-c01_2009d122.he5
58 */
59 InputStream stream = HDFParser.class
60 .getResourceAsStream("/test-documents/test.he5");
61 try {
62 parser.parse(stream, handler, metadata, new ParseContext());
63 } finally {
64 stream.close();
65 }
66
67 assertNotNull(metadata);
68 assertEquals("5", metadata.get("GranuleMonth"));
69 }
70
71 @Test
72 public void testHDF4() throws Exception {
73 if(System.getProperty("java.version").startsWith("1.5")) {
74 return;
75 }
76 Parser parser = new HDFParser();
77 ContentHandler handler = new BodyContentHandler();
78 Metadata metadata = new Metadata();
79
80 /*
81 * this is a publicly available HDF4 file from the HD4 examples:
82 *
83 * http://www.hdfgroup.org/training/hdf4_chunking/Chunkit/bin/input54kmdata.hdf
84 */
85 InputStream stream = HDFParser.class
86 .getResourceAsStream("/test-documents/test.hdf");
87 try {
88 parser.parse(stream, handler, metadata, new ParseContext());
89 } finally {
90 stream.close();
91 }
92
93 assertNotNull(metadata);
94 assertEquals("Direct read of HDF4 file through CDM library", metadata.get("_History"));
95 assertEquals("Ascending", metadata.get("Pass"));
96 }
97 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.html;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertFalse;
20 import static org.junit.Assert.assertNotNull;
21 import static org.junit.Assert.assertTrue;
22
23 import java.io.ByteArrayInputStream;
24 import java.io.IOException;
25 import java.io.InputStream;
26 import java.io.StringWriter;
27 import java.io.Writer;
28 import java.util.ArrayList;
29 import java.util.List;
30 import java.util.regex.Pattern;
31
32 import javax.xml.transform.OutputKeys;
33 import javax.xml.transform.sax.SAXTransformerFactory;
34 import javax.xml.transform.sax.TransformerHandler;
35 import javax.xml.transform.stream.StreamResult;
36
37 import org.apache.tika.Tika;
38 import org.apache.tika.exception.TikaException;
39 import org.apache.tika.metadata.Geographic;
40 import org.apache.tika.metadata.Metadata;
41 import org.apache.tika.metadata.TikaCoreProperties;
42 import org.apache.tika.parser.ParseContext;
43 import org.apache.tika.sax.BodyContentHandler;
44 import org.apache.tika.sax.LinkContentHandler;
45 import org.apache.tika.sax.TeeContentHandler;
46 import org.apache.tika.sax.TextContentHandler;
47 import org.ccil.cowan.tagsoup.HTMLSchema;
48 import org.ccil.cowan.tagsoup.Schema;
49 import org.junit.Ignore;
50 import org.junit.Test;
51 import org.xml.sax.Attributes;
52 import org.xml.sax.ContentHandler;
53 import org.xml.sax.Locator;
54 import org.xml.sax.SAXException;
55 import org.xml.sax.helpers.DefaultHandler;
56
57 public class HtmlParserTest {
58
59 @Test
60 public void testParseAscii() throws Exception {
61 String path = "/test-documents/testHTML.html";
62 final StringWriter href = new StringWriter();
63 final StringWriter name = new StringWriter();
64 ContentHandler body = new BodyContentHandler();
65 Metadata metadata = new Metadata();
66 InputStream stream = HtmlParserTest.class.getResourceAsStream(path);
67 try {
68 ContentHandler link = new DefaultHandler() {
69 @Override
70 public void startElement(
71 String u, String l, String n, Attributes a)
72 throws SAXException {
73 if ("a".equals(l)) {
74 if (a.getValue("href") != null) {
75 href.append(a.getValue("href"));
76 } else if (a.getValue("name") != null) {
77 name.append(a.getValue("name"));
78 }
79 }
80 }
81 };
82 new HtmlParser().parse(
83 stream, new TeeContentHandler(body, link),
84 metadata, new ParseContext());
85 } finally {
86 stream.close();
87 }
88
89 assertEquals(
90 "Title : Test Indexation Html", metadata.get(TikaCoreProperties.TITLE));
91 assertEquals("Tika Developers", metadata.get("Author"));
92 assertEquals("5", metadata.get("refresh"));
93
94 assertEquals("51.2312", metadata.get(Geographic.LATITUDE));
95 assertEquals("-5.1987", metadata.get(Geographic.LONGITUDE));
96
97 assertEquals("http://www.apache.org/", href.toString());
98 assertEquals("test-anchor", name.toString());
99
100 String content = body.toString();
101 assertTrue(
102 "Did not contain expected text:" + "Test Indexation Html",
103 content.contains("Test Indexation Html"));
104 assertTrue(
105 "Did not contain expected text:" + "Indexation du fichier",
106 content.contains("Indexation du fichier"));
107 }
108
109 @Test
110 @Ignore("The file 'testXHTML_utf8.html' is not available fo testing")
111 public void XtestParseUTF8() throws IOException, SAXException, TikaException {
112 String path = "/test-documents/testXHTML_utf8.html";
113 Metadata metadata = new Metadata();
114 String content = new Tika().parseToString(
115 HtmlParserTest.class.getResourceAsStream(path), metadata);
116
117 assertTrue("Did not contain expected text:"
118 + "Title : Tilte with UTF-8 chars öäå", content
119 .contains("Title : Tilte with UTF-8 chars öäå"));
120
121 assertTrue("Did not contain expected text:"
122 + "Content with UTF-8 chars", content
123 .contains("Content with UTF-8 chars"));
124
125 assertTrue("Did not contain expected text:" + "åäö", content
126 .contains("åäö"));
127 }
128
129 @Test
130 public void testXhtmlParsing() throws Exception {
131 String path = "/test-documents/testXHTML.html";
132 Metadata metadata = new Metadata();
133 String content = new Tika().parseToString(
134 HtmlParserTest.class.getResourceAsStream(path), metadata);
135
136 assertEquals("application/xhtml+xml", metadata.get(Metadata.CONTENT_TYPE));
137 assertEquals("XHTML test document", metadata.get(TikaCoreProperties.TITLE));
138
139 assertEquals("Tika Developers", metadata.get("Author"));
140 assertEquals("5", metadata.get("refresh"));
141 assertTrue(content.contains("ability of Apache Tika"));
142 assertTrue(content.contains("extract content"));
143 assertTrue(content.contains("an XHTML document"));
144 }
145
146 @Test
147 public void testParseEmpty() throws Exception {
148 ContentHandler handler = new BodyContentHandler();
149 new HtmlParser().parse(
150 new ByteArrayInputStream(new byte[0]),
151 handler, new Metadata(), new ParseContext());
152 assertEquals("", handler.toString());
153 }
154
155 /**
156 * Test case for TIKA-210
157 * @see <a href="https://issues.apache.org/jira/browse/TIKA-210">TIKA-210</a>
158 */
159 @Test
160 public void testCharactersDirectlyUnderBodyElement() throws Exception {
161 String test = "<html><body>test</body></html>";
162 String content = new Tika().parseToString(
163 new ByteArrayInputStream(test.getBytes("UTF-8")));
164 assertEquals("test", content);
165 }
166
167 /**
168 * Test case for TIKA-287
169 * @see <a href="https://issues.apache.org/jira/browse/TIKA-287">TIKA-287</a>
170 */
171 @Test
172 public void testBaseHref() throws Exception {
173 assertRelativeLink(
174 "http://lucene.apache.org/tika/",
175 "http://lucene.apache.org/", "tika/");
176
177 assertRelativeLink(
178 "http://domain.com/?pid=1",
179 "http://domain.com", "?pid=1");
180 assertRelativeLink(
181 "http://domain.com/?pid=2",
182 "http://domain.com?pid=1", "?pid=2");
183
184 assertRelativeLink(
185 "http://domain.com/file.html",
186 "http://domain.com/path/", "/file.html");
187 assertRelativeLink(
188 "http://domain.com/path/file.html",
189 "http://domain.com/path/", "./file.html");
190 assertRelativeLink(
191 "http://domain.com/path/file.html",
192 "http://domain.com/path/", "file.html");
193
194 assertRelativeLink(
195 "http://domain2.com/newpath",
196 "http://domain.com/path/to/file", "http://domain2.com/newpath");
197
198 // See http://www.communities.hp.com/securitysoftware/blogs/jeff/archive/2007/12/19/RFC-1808-vs-2396-vs-3986_3A00_-Browsers-vs.-programing-languages.aspx
199 // Also http://www.ietf.org/rfc/rfc3986.txt
200 // Also http://issues.apache.org/jira/browse/NUTCH-566
201 // Also http://issues.apache.org/jira/browse/NUTCH-436
202 assertRelativeLink(
203 "http://domain.com/path/?pid=1",
204 "http://domain.com/path/", "?pid=1");
205 assertRelativeLink(
206 "http://domain.com/file?pid=1",
207 "http://domain.com/file", "?pid=1");
208 assertRelativeLink(
209 "http://domain.com/path/d;p?pid=1",
210 "http://domain.com/path/d;p?q#f", "?pid=1");
211 }
212
213 private void assertRelativeLink(String url, String base, String relative)
214 throws Exception {
215 String test =
216 "<html><head><base href=\"" + base + "\"></head>"
217 + "<body><a href=\"" + relative + "\">test</a></body></html>";
218 final List<String> links = new ArrayList<String>();
219 new HtmlParser().parse(
220 new ByteArrayInputStream(test.getBytes("UTF-8")),
221 new DefaultHandler() {
222 @Override
223 public void startElement(
224 String u, String l, String name, Attributes atts) {
225 if (name.equals("a") && atts.getValue("", "href") != null) {
226 links.add(atts.getValue("", "href"));
227 }
228 }
229 },
230 new Metadata(),
231 new ParseContext());
232 assertEquals(1, links.size());
233 assertEquals(url, links.get(0));
234 }
235
236 /**
237 * Test case for TIKA-268
238 * @see <a href="https://issues.apache.org/jira/browse/TIKA-268">TIKA-268</a>
239 */
240 @Test
241 public void testWhitespaceBetweenTableCells() throws Exception {
242 String test =
243 "<html><body><table><tr><td>a</td><td>b</td></table></body></html>";
244 String content = new Tika().parseToString(
245 new ByteArrayInputStream(test.getBytes("UTF-8")));
246 assertTrue(content.contains("a"));
247 assertTrue(content.contains("b"));
248 assertFalse(content.contains("ab"));
249 }
250
251 /**
252 * Test case for TIKA-332
253 * @see <a href="https://issues.apache.org/jira/browse/TIKA-332">TIKA-332</a>
254 */
255 @Test
256 public void testHttpEquivCharset() throws Exception {
257 String test =
258 "<html><head><meta http-equiv=\"content-type\""
259 + " content=\"text/html; charset=ISO-8859-1\" />"
260 + "<title>the name is \u00e1ndre</title>"
261 + "</head><body></body></html>";
262 Metadata metadata = new Metadata();
263 new HtmlParser().parse (
264 new ByteArrayInputStream(test.getBytes("ISO-8859-1")),
265 new BodyContentHandler(), metadata, new ParseContext());
266 assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
267 }
268
269 /**
270 * Test case for TIKA-892
271 * @see <a href="https://issues.apache.org/jira/browse/TIKA-892">TIKA-892</a>
272 */
273 @Test
274 public void testHtml5Charset() throws Exception {
275 String test =
276 "<html><head><meta charset=\"ISO-8859-15\" />"
277 + "<title>the name is \u00e1ndre</title>"
278 + "</head><body></body></html>";
279 Metadata metadata = new Metadata();
280 new HtmlParser().parse(
281 new ByteArrayInputStream(test.getBytes("ISO-8859-1")),
282 new BodyContentHandler(), metadata, new ParseContext());
283 assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING));
284 }
285
286 /**
287 * Test case for TIKA-334
288 * @see <a href="https://issues.apache.org/jira/browse/TIKA-334">TIKA-334</a>
289 */
290 @Test
291 public void testDetectOfCharset() throws Exception {
292 String test =
293 "<html><head><title>\u017d</title></head><body></body></html>";
294 Metadata metadata = new Metadata();
295 new HtmlParser().parse (
296 new ByteArrayInputStream(test.getBytes("UTF-8")),
297 new BodyContentHandler(), metadata, new ParseContext());
298 assertEquals("\u017d", metadata.get(TikaCoreProperties.TITLE));
299 }
300
301 /**
302 * Test case for TIKA-341
303 * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a>
304 */
305 @Test
306 public void testUsingCharsetInContentTypeHeader() throws Exception {
307 final String test =
308 "<html><head><title>the name is \u00e1ndre</title></head>"
309 + "<body></body></html>";
310
311 Metadata metadata = new Metadata();
312 new HtmlParser().parse (
313 new ByteArrayInputStream(test.getBytes("UTF-8")),
314 new BodyContentHandler(), metadata, new ParseContext());
315 assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
316
317 metadata = new Metadata();
318 metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1");
319 new HtmlParser().parse (
320 new ByteArrayInputStream(test.getBytes("ISO-8859-1")),
321 new BodyContentHandler(), metadata, new ParseContext());
322 assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
323 }
324
325 /**
326 * Test case for HTML content like
327 * "&gt;div&lt;foo&gt;br&lt;bar&gt;/div&gt;" that should result
328 * in three whitespace-separated tokens "foo", "bar" and "baz" instead
329 * of a single token "foobarbaz".
330 *
331 * @see <a href="https://issues.apache.org/jira/browse/TIKA-343">TIKA-343</a>
332 */
333 @Test
334 public void testLineBreak() throws Exception {
335 String test = "<html><body><div>foo<br>bar</div>baz</body></html>";
336 String text = new Tika().parseToString(
337 new ByteArrayInputStream(test.getBytes("US-ASCII")));
338 String[] parts = text.trim().split("\\s+");
339 assertEquals(3, parts.length);
340 assertEquals("foo", parts[0]);
341 assertEquals("bar", parts[1]);
342 assertEquals("baz", parts[2]);
343 }
344
345 /**
346 * Test case for TIKA-339: Don't use language returned by CharsetDetector
347 * @see <a href="https://issues.apache.org/jira/browse/TIKA-339">TIKA-339</a>
348 */
349 @Test
350 public void testIgnoreCharsetDetectorLanguage() throws Exception {
351 String test = "<html><title>Simple Content</title><body></body></html>";
352 Metadata metadata = new Metadata();
353 metadata.add(Metadata.CONTENT_LANGUAGE, "en");
354 new HtmlParser().parse (
355 new ByteArrayInputStream(test.getBytes("UTF-8")),
356 new BodyContentHandler(), metadata, new ParseContext());
357
358 assertEquals("en", metadata.get(Metadata.CONTENT_LANGUAGE));
359 }
360
361 /**
362 * Test case for TIKA-349
363 * @see <a href="https://issues.apache.org/jira/browse/TIKA-349">TIKA-349</a>
364 */
365 @Test
366 public void testHttpEquivCharsetFunkyAttributes() throws Exception {
367 String test1 =
368 "<html><head><meta http-equiv=\"content-type\""
369 + " content=\"text/html; charset=ISO-8859-15; charset=iso-8859-15\" />"
370 + "<title>the name is \u00e1ndre</title>"
371 + "</head><body></body></html>";
372 Metadata metadata = new Metadata();
373 new HtmlParser().parse (
374 new ByteArrayInputStream(test1.getBytes("ISO-8859-1")),
375 new BodyContentHandler(), metadata, new ParseContext());
376 assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING));
377
378 // Some HTML pages have errors like ';;' versus '; ' as separator
379 String test2 =
380 "<html><head><meta http-equiv=\"content-type\""
381 + " content=\"text/html;;charset=ISO-8859-15\" />"
382 + "<title>the name is \u00e1ndre</title>"
383 + "</head><body></body></html>";
384 metadata = new Metadata();
385 new HtmlParser().parse (
386 new ByteArrayInputStream(test2.getBytes("ISO-8859-1")),
387 new BodyContentHandler(), metadata, new ParseContext());
388 assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING));
389 }
390
391 /**
392 * Test case for TIKA-350
393 * @see <a href="https://issues.apache.org/jira/browse/TIKA-350">TIKA-350</a>
394 */
395 @Test
396 public void testUsingFunkyCharsetInContentTypeHeader() throws Exception {
397 final String test =
398 "<html><head><title>the name is \u00e1ndre</title></head>"
399 + "<body></body></html>";
400
401 Metadata metadata = new Metadata();
402 new HtmlParser().parse (
403 new ByteArrayInputStream(test.getBytes("UTF-8")),
404 new BodyContentHandler(), metadata, new ParseContext());
405 assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
406
407 metadata = new Metadata();
408 metadata.set(Metadata.CONTENT_TYPE, "charset=ISO-8859-1;text/html");
409 new HtmlParser().parse (
410 new ByteArrayInputStream(test.getBytes("ISO-8859-1")),
411 new BodyContentHandler(), metadata, new ParseContext());
412 assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
413 }
414
415
416 /**
417 * Test case for TIKA-357
418 * @see <a href="https://issues.apache.org/jira/browse/TIKA-357">TIKA-357</a>
419 */
420 @Test
421 public void testMetaHttpEquivWithLotsOfPreambleText() throws Exception {
422 String path = "/test-documents/big-preamble.html";
423 Metadata metadata = new Metadata();
424 new HtmlParser().parse(
425 HtmlParserTest.class.getResourceAsStream(path),
426 new BodyContentHandler(), metadata, new ParseContext());
427
428 assertEquals("windows-1251", metadata.get(Metadata.CONTENT_ENCODING));
429 }
430
431 /**
432 * Test case for TIKA-420
433 * @see <a href="https://issues.apache.org/jira/browse/TIKA-420">TIKA-420</a>
434 */
435 @Test
436 public void testBoilerplateRemoval() throws Exception {
437 String path = "/test-documents/boilerplate.html";
438
439 Metadata metadata = new Metadata();
440 BodyContentHandler handler = new BodyContentHandler();
441 new HtmlParser().parse(
442 HtmlParserTest.class.getResourceAsStream(path),
443 new BoilerpipeContentHandler(handler), metadata, new ParseContext());
444
445 String content = handler.toString();
446 assertTrue(content.startsWith("This is the real meat"));
447 assertTrue(content.endsWith("This is the end of the text.\n"));
448 assertFalse(content.contains("boilerplate"));
449 assertFalse(content.contains("footer"));
450 }
451
452 /**
453 * Test case for TIKA-478. Don't emit <head> sub-elements inside of <body>.
454 * @see <a href="https://issues.apache.org/jira/browse/TIKA-478">TIKA-478</a>
455 */
456 @Test
457 public void testElementOrdering() throws Exception {
458 final String test = "<html><head><title>Title</title>" +
459 "<meta http-equiv=\"content-type\" content=\"text/html\">" +
460 "<link rel=\"next\" href=\"next.html\" />" +
461 "</head><body><p>Simple Content</p></body></html>";
462
463 StringWriter sw = new StringWriter();
464 new HtmlParser().parse(
465 new ByteArrayInputStream(test.getBytes("UTF-8")),
466 makeHtmlTransformer(sw), new Metadata(), new ParseContext());
467
468 String result = sw.toString();
469
470 // Title element in <head> section
471 assertTrue(Pattern.matches("(?s)<html.*<head>.*<title>Title</title>.*</head>.*$", result));
472
473 // No meta elements in body
474 assertFalse(Pattern.matches("(?s).*<body>.*<meta. *</body>.*$", result));
475
476 // meta elements should show up in <head> section
477 assertTrue(Pattern.matches("(?s)<html.*<head>.*<meta .*</head>.*$", result));
478
479 // No link elements in body
480 assertFalse(Pattern.matches("(?s).*<body>.*<link .*</body>.*$", result));
481
482 // link element should be in <head> section
483 assertTrue(Pattern.matches("(?s)<html.*<head>.*<link .*</head>.*$", result));
484
485 // There should be ending elements.
486 assertTrue(Pattern.matches("(?s).*</body>.*</html>$", result));
487
488 }
489
490 /**
491 * Test case for TIKA-463. Don't skip elements that have URLs.
492 * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
493 */
494 @Test
495 public void testImgUrlExtraction() throws Exception {
496 final String test = "<html><head><title>Title</title>" +
497 "<base href=\"http://domain.com\" />" +
498 "</head><body><img src=\"image.jpg\" /></body></html>";
499
500 StringWriter sw = new StringWriter();
501 new HtmlParser().parse(
502 new ByteArrayInputStream(test.getBytes("UTF-8")),
503 makeHtmlTransformer(sw), new Metadata(), new ParseContext());
504
505 String result = sw.toString();
506
507 // <img> tag should exist, with fully resolved URL
508 assertTrue(Pattern.matches("(?s).*src=\"http://domain.com/image.jpg\".*$", result));
509 }
510
511 /**
512 * Test case for TIKA-463. Don't skip elements that have URLs.
513 * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
514 */
515 @Test
516 public void testFrameSrcExtraction() throws Exception {
517 final String test = "<html><head><title>Title</title>" +
518 "<base href=\"http://domain.com\" />" +
519 "</head><frameset><frame src=\"frame.html\" /></frameset></html>";
520
521 StringWriter sw = new StringWriter();
522 new HtmlParser().parse(
523 new ByteArrayInputStream(test.getBytes("UTF-8")),
524 makeHtmlTransformer(sw), new Metadata(), new ParseContext());
525
526 String result = sw.toString();
527
528 // <frame> tag should exist, with fully resolved URL
529 assertTrue(Pattern.matches("(?s).*<frame .* src=\"http://domain.com/frame.html\"/>.*$", result));
530 }
531
532 /**
533 * Test case for TIKA-463. Don't skip elements that have URLs.
534 * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
535 */
536 @Test
537 public void testIFrameSrcExtraction() throws Exception {
538 final String test = "<html><head><title>Title</title>" +
539 "<base href=\"http://domain.com\" />" +
540 "</head><body><iframe src =\"framed.html\" width=\"100%\" height=\"300\">" +
541 "<p>Your browser doesn't support iframes!</p></body></html>";
542
543 StringWriter sw = new StringWriter();
544 new HtmlParser().parse(
545 new ByteArrayInputStream(test.getBytes("UTF-8")),
546 makeHtmlTransformer(sw), new Metadata(), new ParseContext());
547
548 String result = sw.toString();
549
550 // <iframe> tag should exist, with fully resolved URL
551 assertTrue(Pattern.matches("(?s).*<iframe .* src=\"http://domain.com/framed.html\".*$", result));
552 }
553
554 /**
555 * Test case for TIKA-463. Don't skip elements that have URLs.
556 * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
557 */
558 @Test
559 public void testAreaExtraction() throws Exception {
560 final String test = "<html><head><title>Title</title>" +
561 "<base href=\"http://domain.com\" />" +
562 "</head><body><p><map name=\"map\" id=\"map\">" +
563 "<area shape=\"rect\" href=\"map.html\" alt=\"\" />" +
564 "</map></p></body></html>";
565
566 StringWriter sw = new StringWriter();
567 new HtmlParser().parse(
568 new ByteArrayInputStream(test.getBytes("UTF-8")),
569 makeHtmlTransformer(sw), new Metadata(), new ParseContext());
570
571 String result = sw.toString();
572
573 // <map> tag should exist, with <area> tag with fully resolved URL
574 assertTrue(Pattern.matches("(?s).*<map .*<area .* href=\"http://domain.com/map.html\".*</map>.*$", result));
575 }
576
577 /**
578 * Test case for TIKA-463. Don't skip elements that have URLs.
579 * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
580 */
581 @Test
582 public void testObjectExtraction() throws Exception {
583 final String test = "<html><head><title>Title</title>" +
584 "<base href=\"http://domain.com\" />" +
585 "</head><body><p><object data=\"object.data\" type=\"text/html\">" +
586 "<param name=\"name\" value=\"value\" />" +
587 "</object></p></body></html>";
588
589 StringWriter sw = new StringWriter();
590 new HtmlParser().parse(
591 new ByteArrayInputStream(test.getBytes("UTF-8")),
592 makeHtmlTransformer(sw), new Metadata(), new ParseContext());
593
594 String result = sw.toString();
595
596 // <object> tag should exist with fully resolved URLs
597 assertTrue(
598 "<object> tag not correctly found in:\n" + result,
599 Pattern.matches("(?s).*<object data=\"http://domain.com/object.data\".*<param .* name=\"name\" value=\"value\"/>.*</object>.*$", result)
600 );
601 }
602
603 /**
604 * Test case for change related to TIKA-463. Verify proper handling of <meta> tags.
605 * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
606 */
607 @Test
608 public void testMetaTagHandling() throws Exception {
609 final String test = "<html><body><h1>header</h1><p>some text</p></body></html>";
610
611 Metadata metadata = new Metadata();
612 metadata.add("Content-Type", "text/html; charset=utf-8");
613 metadata.add("Language", null);
614
615 StringWriter sw = new StringWriter();
616 new HtmlParser().parse(
617 new ByteArrayInputStream(test.getBytes("UTF-8")),
618 makeHtmlTransformer(sw), metadata, new ParseContext());
619
620 String result = sw.toString();
621
622 // <meta> tag for Content-Type should exist, but nothing for Language
623 assertTrue(Pattern.matches("(?s).*<meta name=\"Content-Type\" content=\"text/html; charset=UTF-8\"/>.*$", result));
624 assertFalse(Pattern.matches("(?s).*<meta name=\"Language\".*$", result));
625 }
626
627 /**
628 * Test case for TIKA-457. Better handling for broken HTML that has <frameset> inside of <body>.
629 * @see <a href="https://issues.apache.org/jira/browse/TIKA-457">TIKA-457</a>
630 */
631 @Test
632 public void testBrokenFrameset() throws Exception {
633 final String test1 = "<html><head><title>Title</title>" +
634 "<base href=\"http://domain.com\" />" +
635 "</head><body><frameset><frame src=\"frame.html\" /></frameset></body></html>";
636
637 StringWriter sw1 = new StringWriter();
638 new HtmlParser().parse(
639 new ByteArrayInputStream(test1.getBytes("UTF-8")),
640 makeHtmlTransformer(sw1), new Metadata(), new ParseContext());
641
642 String result = sw1.toString();
643
644 // <frame> tag should exist, with fully resolved URL
645 assertTrue(Pattern.matches("(?s).*<frame .* src=\"http://domain.com/frame.html\"/>.*$", result));
646
647 // <body> tag should not exist.
648 assertFalse(Pattern.matches("(?s).*<body>.*$", result));
649
650 // Test the example from the Nutch project.
651 final String test2 = "<html><head><title> my title </title></head><body>" +
652 "<frameset rows=\"20,*\"><frame src=\"top.html\"></frame>" +
653 "<frameset cols=\"20,*\"><frame src=\"left.html\"></frame>" +
654 "<frame src=\"invalid.html\"/></frame>" +
655 "<frame src=\"right.html\"></frame>" +
656 "</frameset></frameset></body></html>";
657
658 StringWriter sw2 = new StringWriter();
659 new HtmlParser().parse(
660 new ByteArrayInputStream(test2.getBytes("UTF-8")),
661 makeHtmlTransformer(sw2), new Metadata(), new ParseContext());
662
663 result = sw2.toString();
664
665 // <frame> tags should exist, with relative URL (no base element specified)
666 assertTrue(Pattern.matches("(?s).*<frame .* src=\"top.html\"/>.*$", result));
667 assertTrue(Pattern.matches("(?s).*<frame .* src=\"left.html\"/>.*$", result));
668 assertTrue(Pattern.matches("(?s).*<frame .* src=\"invalid.html\"/>.*$", result));
669 assertTrue(Pattern.matches("(?s).*<frame .* src=\"right.html\"/>.*$", result));
670
671 // <body> tag should not exist.
672 assertFalse(Pattern.matches("(?s).*<body>.*$", result));
673 }
674
675 /**
676 * Test case for TIKA-480: fix NPE when using BodyContentHandler or HtmlTransformer
677 * as delegate for BoilerpipeContentHandler
678 * @see <a href="https://issues.apache.org/jira/browse/TIKA-480">TIKA-480</a>
679 */
680 @Test
681 public void testBoilerplateDelegation() throws Exception {
682 String path = "/test-documents/boilerplate.html";
683
684 Metadata metadata = new Metadata();
685 StringWriter sw = new StringWriter();
686 new HtmlParser().parse(
687 HtmlParserTest.class.getResourceAsStream(path),
688 makeHtmlTransformer(sw), metadata, new ParseContext());
689
690 String content = sw.toString();
691
692 // Should have <html>, <head>, <title>, <body> elements
693 assertTrue(Pattern.matches("(?s).*<html xmlns=\"http://www.w3.org/1999/xhtml\">.*</html>.*$", content));
694 assertTrue(Pattern.matches("(?s).*<head>.*</head>.*$", content));
695 assertTrue(Pattern.matches("(?s).*<title>Title</title>.*$", content));
696 assertTrue(Pattern.matches("(?s).*<body>.*</body>.*$", content));
697 }
698
699 /**
700 * Test case for TIKA-481. Verify href in <link> is resolved.
701 * @see <a href="https://issues.apache.org/jira/browse/TIKA-481">TIKA-481</a>
702 */
703 @Test
704 public void testLinkHrefResolution() throws Exception {
705 final String test = "<html><head><title>Title</title>" +
706 "<base href=\"http://domain.com\" />" +
707 "<link rel=\"next\" href=\"next.html\" />" +
708 "</head><body></body></html>";
709
710 StringWriter sw = new StringWriter();
711 new HtmlParser().parse(
712 new ByteArrayInputStream(test.getBytes("UTF-8")),
713 makeHtmlTransformer(sw), new Metadata(), new ParseContext());
714
715 String result = sw.toString();
716
717 // <link> tag should exist in <head>, with fully resolved URL
718 assertTrue(Pattern.matches("(?s).*<head>.*<link rel=\"next\" href=\"http://domain.com/next.html\"/>.*</head>.*$", result));
719 }
720
721
722 /**
723 * Create ContentHandler that transforms SAX events into textual HTML output,
724 * and writes it out to <writer> - typically this is a StringWriter.
725 *
726 * @param writer Where to write resulting HTML text.
727 * @return ContentHandler suitable for passing to parse() methods.
728 * @throws Exception
729 */
730 private ContentHandler makeHtmlTransformer(Writer writer) throws Exception {
731 SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance();
732 TransformerHandler handler = factory.newTransformerHandler();
733 handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
734 handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
735 handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "utf-8");
736 handler.setResult(new StreamResult(writer));
737 return handler;
738 }
739
740 /**
741 * Test case for TIKA-564. Support returning markup from BoilerpipeContentHandler.
742 * @see <a href="https://issues.apache.org/jira/browse/TIKA-564">TIKA-564</a>
743 */
744 @Test
745 public void testBoilerplateWithMarkup() throws Exception {
746 String path = "/test-documents/boilerplate.html";
747
748 Metadata metadata = new Metadata();
749 StringWriter sw = new StringWriter();
750 ContentHandler ch = makeHtmlTransformer(sw);
751 BoilerpipeContentHandler bpch = new BoilerpipeContentHandler(ch);
752 bpch.setIncludeMarkup(true);
753
754 new HtmlParser().parse(
755 HtmlParserTest.class.getResourceAsStream(path),
756 bpch, metadata, new ParseContext());
757
758 String content = sw.toString();
759 assertTrue("Has empty table elements", content.contains("<body><table><tr><td><table><tr><td>"));
760 assertTrue("Has empty a element", content.contains("<a shape=\"rect\" href=\"Main.php\"/>"));
761 assertTrue("Has real content", content.contains("<p>This is the real meat"));
762 assertTrue("Ends with appropriate HTML", content.endsWith("</p></body></html>"));
763 assertFalse(content.contains("boilerplate"));
764 assertFalse(content.contains("footer"));
765 }
766
767 /**
768 * Test case for TIKA-434 - Pushback buffer overflow in TagSoup
769 */
770 @Test
771 public void testPushback() throws IOException, TikaException {
772 String content = new Tika().parseToString(
773 HtmlParserTest.class.getResourceAsStream("/test-documents/tika434.html"), new Metadata());
774 assertNotNull(content);
775 }
776
777 /**
778 * Test case for TIKA-869
779 * IdentityHtmlMapper needs to lower-case tag names.
780 *
781 * @see <a href="https://issues.apache.org/jira/browse/TIKA-869">TIKA-869</a>
782 */
783 @Test
784 public void testIdentityMapper() throws Exception {
785 final String html = "<html><head><title>Title</title></head>" +
786 "<body></body></html>";
787 Metadata metadata = new Metadata();
788 ParseContext parseContext = new ParseContext();
789 parseContext.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
790
791 StringWriter sw = new StringWriter();
792
793 new HtmlParser().parse (
794 new ByteArrayInputStream(html.getBytes("UTF-8")),
795 makeHtmlTransformer(sw), metadata, parseContext);
796
797 String result = sw.toString();
798 // Make sure we don't get <body><BODY/></body>
799 assertTrue(Pattern.matches("(?s).*<body/>.*$", result));
800 }
801
802 /**
803 * Test case for TIKA-889
804 * XHTMLContentHandler wont emit newline when html element matches ENDLINE set.
805 *
806 * @see <a href="https://issues.apache.org/jira/browse/TIKA-889">TIKA-889</a>
807 */
808 @Test
809 public void testNewlineAndIndent() throws Exception {
810 final String html = "<html><head><title>Title</title></head>" +
811 "<body><ul><li>one</li></ul></body></html>";
812
813 BodyContentHandler handler = new BodyContentHandler();
814 new HtmlParser().parse(
815 new ByteArrayInputStream(html.getBytes("UTF-8")),
816 handler, new Metadata(), new ParseContext());
817
818 // Make sure we get <tab>, "one", newline, newline
819 String result = handler.toString();
820
821 assertTrue(Pattern.matches("\tone\n\n", result));
822 }
823
824 /**
825 * Test case for TIKA-961
826 * @see <a href="https://issues.apache.org/jira/browse/TIKA-961">TIKA-961</a>
827 */
828 @Test
829 public void testBoilerplateWhitespace() throws Exception {
830 String path = "/test-documents/boilerplate-whitespace.html";
831
832 Metadata metadata = new Metadata();
833 BodyContentHandler handler = new BodyContentHandler();
834
835 BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler(handler);
836 bpHandler.setIncludeMarkup(true);
837
838 new HtmlParser().parse(
839 HtmlParserTest.class.getResourceAsStream(path),
840 bpHandler, metadata, new ParseContext());
841
842 String content = handler.toString();
843
844 // Should not contain item_aitem_b
845 assertFalse(content.contains("item_aitem_b"));
846
847 // Should contain the two list items with a newline in between.
848 assertTrue(content.contains("item_a\nitem_b"));
849
850 // Should contain 有什么需要我帮你的 (can i help you) without whitespace
851 assertTrue(content.contains("有什么需要我帮你的"));
852 }
853
854 /**
855 * Test case for TIKA-983: HTML parser should add Open Graph meta tag data to Metadata returned by parser
856 *
857 * @see <a href="https://issues.apache.org/jira/browse/TIKA-983">TIKA-983</a>
858 */
859 @Test
860 public void testOpenGraphMetadata() throws Exception {
861 String test1 =
862 "<html><head><meta property=\"og:description\""
863 + " content=\"some description\" />"
864 + "<meta property=\"og:image\" content=\"http://example.com/image1.jpg\" />"
865 + "<meta property=\"og:image\" content=\"http://example.com/image2.jpg\" />"
866 + "<title>hello</title>"
867 + "</head><body></body></html>";
868 Metadata metadata = new Metadata();
869 new HtmlParser().parse (
870 new ByteArrayInputStream(test1.getBytes("ISO-8859-1")),
871 new BodyContentHandler(), metadata, new ParseContext());
872 assertEquals("some description", metadata.get("og:description"));
873 assertTrue(metadata.isMultiValued("og:image"));
874 }
875
876 // TIKA-1011
877 @Test
878 public void testUserDefinedCharset() throws Exception {
879 String content = new Tika().parseToString(
880 HtmlParserTest.class.getResourceAsStream("/test-documents/testUserDefinedCharset.mhtml"), new Metadata());
881 assertNotNull(content);
882 }
883
884 //TIKA-1001
885 @Test
886 public void testNoisyMetaCharsetHeaders() throws Exception {
887 Tika tika = new Tika();
888 String hit = "\u0623\u0639\u0631\u0628";
889
890 for (int i = 1; i <=4; i++){
891 String fileName = "/test-documents/testHTMLNoisyMetaEncoding_"+i+".html";
892 String content = tika.parseToString(
893 HtmlParserTest.class.getResourceAsStream(fileName));
894 assertTrue("testing: " +fileName, content.contains(hit));
895 }
896 }
897
898 // TIKA-1193
899 @Test
900 public void testCustomHtmlSchema() throws Exception {
901 // Default schema does not allow tables inside anchors
902 String test = "<html><body><a><table><tr><td>text</tr></tr></table></a></body></html>";
903
904 Metadata metadata = new Metadata();
905 LinkContentHandler linkContentHandler = new LinkContentHandler();
906
907 new HtmlParser().parse (
908 new ByteArrayInputStream(test.getBytes("ISO-8859-1")),
909 linkContentHandler, metadata, new ParseContext());
910
911 // Expect no anchor text
912 assertEquals("", linkContentHandler.getLinks().get(0).getText());
913
914 // We'll change the schema to allow tables inside anchors!
915 Schema schema = new HTMLSchema();
916 schema.elementType("a", HTMLSchema.M_ANY, 65535, 0);
917
918 ParseContext parseContext = new ParseContext();
919 parseContext.set(Schema.class, schema);
920 linkContentHandler = new LinkContentHandler();
921 new HtmlParser().parse (
922 new ByteArrayInputStream(test.getBytes("ISO-8859-1")),
923 linkContentHandler, metadata, parseContext);
924
925 // Expect anchor text
926 assertEquals("\ttext\n\n", linkContentHandler.getLinks().get(0).getText());
927 }
928
929 /**
930 * Test case for TIKA-820: Locator is unset for HTML parser
931 *
932 * @see <a href="https://issues.apache.org/jira/browse/TIKA-820">TIKA-820</a>
933 */
934 @Test
935 public void testLocator() throws Exception {
936 final int line = 0;
937 final int col = 1;
938 final int[] textPosition = new int[2];
939
940 new HtmlParser().parse(HtmlParserTest.class.getResourceAsStream("/test-documents/testHTML.html"),
941 new ContentHandler(){
942 Locator locator;
943
944 public void setDocumentLocator(Locator locator) {
945 this.locator = locator;
946 }
947
948 public void startDocument() throws SAXException {
949 }
950
951 public void endDocument() throws SAXException {
952 }
953
954 public void startPrefixMapping(String prefix, String uri)
955 throws SAXException {
956 }
957
958 public void endPrefixMapping(String prefix)
959 throws SAXException {
960 }
961
962 public void startElement(String uri, String localName,
963 String qName, Attributes atts) throws SAXException {
964 }
965
966 public void endElement(String uri, String localName,
967 String qName) throws SAXException {
968 }
969
970 public void characters(char[] ch, int start, int length)
971 throws SAXException {
972 String text = new String(ch, start, length);
973 if (text.equals("Test Indexation Html") && locator != null) {
974 textPosition[line] = locator.getLineNumber();
975 textPosition[col] = locator.getColumnNumber();
976 }
977 }
978
979 public void ignorableWhitespace(char[] ch, int start,
980 int length) throws SAXException {
981 }
982
983 public void processingInstruction(String target, String data)
984 throws SAXException {
985 }
986
987 public void skippedEntity(String name) throws SAXException {
988 }},
989 new Metadata(),
990 new ParseContext());
991
992 // The text occurs at line 24 (if lines start at 0) or 25 (if lines start at 1).
993 assertEquals(24, textPosition[line]);
994 // The column reported seems fuzzy, just test it is close enough.
995 assertTrue(Math.abs(textPosition[col]-47) < 10);
996 }
997
998 }
0 /**
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.ibooks;
17
18 import static org.junit.Assert.assertEquals;
19
20 import java.io.InputStream;
21
22 import org.apache.tika.metadata.Metadata;
23 import org.apache.tika.metadata.TikaCoreProperties;
24 import org.apache.tika.parser.ParseContext;
25 import org.apache.tika.parser.epub.EpubParser;
26 import org.apache.tika.sax.BodyContentHandler;
27 import org.junit.Test;
28 import org.xml.sax.ContentHandler;
29
30 public class iBooksParserTest {
31
32 @Test
33 public void testiBooksParser() throws Exception {
34 InputStream input = iBooksParserTest.class.getResourceAsStream(
35 "/test-documents/testiBooks.ibooks");
36 try {
37 Metadata metadata = new Metadata();
38 ContentHandler handler = new BodyContentHandler();
39 new EpubParser().parse(input, handler, metadata, new ParseContext());
40
41 assertEquals("application/x-ibooks+zip",
42 metadata.get(Metadata.CONTENT_TYPE));
43 assertEquals("en-GB",
44 metadata.get(TikaCoreProperties.LANGUAGE));
45 assertEquals("iBooks Author v1.0",
46 metadata.get(TikaCoreProperties.CONTRIBUTOR));
47 assertEquals("Apache",
48 metadata.get(TikaCoreProperties.CREATOR));
49
50 /* TODO For some reason, the xhtml files in iBooks-style ePub are not parsed properly, and the content comes back empty.git che
51 String content = handler.toString();
52 System.out.println("content="+content);
53 assertTrue(content.contains("Plus a simple div"));
54 assertTrue(content.contains("First item"));
55 assertTrue(content.contains("The previous headings were subchapters"));
56 assertTrue(content.contains("Table data"));
57 assertTrue(content.contains("Lorem ipsum dolor rutur amet"));
58 */
59 } finally {
60 input.close();
61 }
62 }
63
64 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.image;
17
18 import java.util.Arrays;
19 import java.util.GregorianCalendar;
20 import java.util.Iterator;
21 import java.util.List;
22
23 import org.apache.tika.metadata.Metadata;
24 import org.apache.tika.metadata.TikaCoreProperties;
25 import org.junit.Test;
26
27 import com.drew.metadata.Directory;
28 import com.drew.metadata.MetadataException;
29 import com.drew.metadata.Tag;
30 import com.drew.metadata.exif.ExifIFD0Directory;
31 import com.drew.metadata.exif.ExifSubIFDDirectory;
32 import com.drew.metadata.jpeg.JpegCommentDirectory;
33
34 import static org.junit.Assert.assertEquals;
35 import static org.junit.Assert.assertFalse;
36 import static org.junit.Assert.assertNull;
37 import static org.junit.Assert.assertTrue;
38 import static org.mockito.Mockito.*;
39
40 public class ImageMetadataExtractorTest {
41
42 @SuppressWarnings({ "rawtypes", "unchecked" })
43 @Test
44 public void testHandleDirectories() throws MetadataException {
45 Metadata metadata = mock(Metadata.class);
46 ImageMetadataExtractor.DirectoryHandler handler1 = mock(ImageMetadataExtractor.DirectoryHandler.class);
47 ImageMetadataExtractor e = new ImageMetadataExtractor(metadata, handler1);
48
49 Directory directory = new JpegCommentDirectory();
50 Iterator directories = mock(Iterator.class);
51 when(directories.hasNext()).thenReturn(true, false);
52 when(directories.next()).thenReturn(directory);
53 when(handler1.supports(JpegCommentDirectory.class)).thenReturn(true);
54
55 e.handle(directories);
56 verify(handler1).supports(JpegCommentDirectory.class);
57 verify(handler1).handle(directory, metadata);
58 }
59
60 @Test
61 public void testExifHandlerSupports() {
62 assertTrue(new ImageMetadataExtractor.ExifHandler().supports(ExifIFD0Directory.class));
63 assertTrue(new ImageMetadataExtractor.ExifHandler().supports(ExifSubIFDDirectory.class));
64 assertFalse(new ImageMetadataExtractor.ExifHandler().supports(Directory.class));
65 assertFalse(new ImageMetadataExtractor.ExifHandler().supports(JpegCommentDirectory.class));
66 }
67
68 @Test
69 public void testExifHandlerParseDate() throws MetadataException {
70 ExifSubIFDDirectory exif = mock(ExifSubIFDDirectory.class);
71 when(exif.containsTag(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(true);
72 when(exif.getDate(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(
73 new GregorianCalendar(2000, 0, 1, 0, 0, 0).getTime()); // jvm default timezone as in Metadata Extractor
74 Metadata metadata = new Metadata();
75
76 new ImageMetadataExtractor.ExifHandler().handle(exif, metadata);
77 assertEquals("Should be ISO date without time zone", "2000-01-01T00:00:00",
78 metadata.get(TikaCoreProperties.CREATED));
79 }
80
81 @Test
82 public void testExifHandlerParseDateFallback() throws MetadataException {
83 ExifIFD0Directory exif = mock(ExifIFD0Directory.class);
84 when(exif.containsTag(ExifIFD0Directory.TAG_DATETIME)).thenReturn(true);
85 when(exif.getDate(ExifIFD0Directory.TAG_DATETIME)).thenReturn(
86 new GregorianCalendar(1999, 0, 1, 0, 0, 0).getTime()); // jvm default timezone as in Metadata Extractor
87 Metadata metadata = new Metadata();
88
89 new ImageMetadataExtractor.ExifHandler().handle(exif, metadata);
90 assertEquals("Should try EXIF Date/Time if Original is not set", "1999-01-01T00:00:00",
91 metadata.get(TikaCoreProperties.CREATED));
92 }
93
94 @Test
95 public void testExifHandlerParseDateError() throws MetadataException {
96 ExifIFD0Directory exif = mock(ExifIFD0Directory.class);
97 when(exif.containsTag(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(true);
98 when(exif.getDate(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(null);
99 Metadata metadata = new Metadata();
100
101 new ImageMetadataExtractor.ExifHandler().handle(exif, metadata);
102 assertEquals("Parsing should proceed without date", null,
103 metadata.get(TikaCoreProperties.CREATED));
104 }
105
106 @Test
107 public void testCopyUnknownFieldsHandler() throws MetadataException {
108 Directory d = mock(Directory.class);
109 Tag t1 = mock(Tag.class);
110 when(t1.getTagName()).thenReturn("Image Description");
111 when(t1.getDescription()).thenReturn("t1");
112 Tag t2 = mock(Tag.class);
113 when(t2.getTagName()).thenReturn(Metadata.KEYWORDS);
114 when(t2.getDescription()).thenReturn("known");
115 Tag t3 = mock(Tag.class);
116 when(t3.getTagName()).thenReturn(TikaCoreProperties.DESCRIPTION.getName());
117 when(t3.getDescription()).thenReturn("known");
118 List<Tag> tags = Arrays.asList(t1, t2, t3);
119 when(d.getTags()).thenReturn(tags);
120 Metadata metadata = new Metadata();
121 new ImageMetadataExtractor.CopyUnknownFieldsHandler().handle(d, metadata);
122 assertEquals("t1", metadata.get("Image Description"));
123 assertNull("keywords should be excluded from bulk copy because it is a defined field",
124 metadata.get(Metadata.KEYWORDS));
125 assertNull(metadata.get(TikaCoreProperties.DESCRIPTION));
126 }
127
128 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.image;
17
18 import static org.junit.Assert.assertEquals;
19
20 import java.io.InputStream;
21
22 import org.apache.tika.metadata.Metadata;
23 import org.apache.tika.metadata.TikaCoreProperties;
24 import org.apache.tika.parser.ParseContext;
25 import org.apache.tika.parser.Parser;
26 import org.junit.Test;
27 import org.xml.sax.helpers.DefaultHandler;
28
29 public class ImageParserTest {
30
31 private final Parser parser = new ImageParser();
32
33 @Test
34 public void testBMP() throws Exception {
35 Metadata metadata = new Metadata();
36 metadata.set(Metadata.CONTENT_TYPE, "image/bmp");
37 InputStream stream =
38 getClass().getResourceAsStream("/test-documents/testBMP.bmp");
39 parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
40
41 assertEquals("75", metadata.get("height"));
42 assertEquals("100", metadata.get("width"));
43 assertEquals("8 8 8", metadata.get("Data BitsPerSample"));
44 assertEquals("1.0", metadata.get("Dimension PixelAspectRatio"));
45 assertEquals("0", metadata.get("Dimension VerticalPhysicalPixelSpacing"));
46 assertEquals("0", metadata.get("Dimension HorizontalPhysicalPixelSpacing"));
47 assertEquals("BI_RGB", metadata.get("Compression CompressionTypeName"));
48 assertEquals("image/bmp", metadata.get("Content-Type"));
49
50 assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
51 assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
52 assertEquals("8 8 8", metadata.get(Metadata.BITS_PER_SAMPLE));
53 }
54
55 @Test
56 public void testGIF() throws Exception {
57 Metadata metadata = new Metadata();
58 metadata.set(Metadata.CONTENT_TYPE, "image/gif");
59 InputStream stream =
60 getClass().getResourceAsStream("/test-documents/testGIF.gif");
61 parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
62
63 assertEquals("75", metadata.get("height"));
64 assertEquals("100", metadata.get("width"));
65 assertEquals("true", metadata.get("Compression Lossless"));
66 assertEquals("Normal", metadata.get("Dimension ImageOrientation"));
67 assertEquals("lzw", metadata.get("Compression CompressionTypeName"));
68 assertEquals("0", metadata.get("Dimension HorizontalPixelOffset"));
69 assertEquals("imageLeftPosition=0, imageTopPosition=0, imageWidth=100, imageHeight=75, interlaceFlag=false", metadata.get("ImageDescriptor"));
70 assertEquals("Index", metadata.get("Data SampleFormat"));
71 assertEquals("3", metadata.get("Chroma NumChannels"));
72 assertEquals("1", metadata.get("Compression NumProgressiveScans"));
73 assertEquals("RGB", metadata.get("Chroma ColorSpaceType"));
74 assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get("CommentExtensions CommentExtension"));
75 assertEquals("value=Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership., encoding=ISO-8859-1, compression=none", metadata.get("Text TextEntry"));
76 assertEquals("true", metadata.get("Chroma BlackIsZero"));
77 assertEquals("disposalMethod=none, userInputFlag=false, transparentColorFlag=false, delayTime=0, transparentColorIndex=0", metadata.get("GraphicControlExtension"));
78 assertEquals("0", metadata.get("Dimension VerticalPixelOffset"));
79 assertEquals("image/gif", metadata.get("Content-Type"));
80
81 assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
82 assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
83 assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get(TikaCoreProperties.COMMENTS));
84 }
85
86 @Test
87 public void testJPEG() throws Exception {
88 Metadata metadata = new Metadata();
89 metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
90 InputStream stream =
91 getClass().getResourceAsStream("/test-documents/testJPEG.jpg");
92 parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
93
94 assertEquals("75", metadata.get("height"));
95 assertEquals("100", metadata.get("width"));
96 assertEquals("0.35277778", metadata.get("Dimension VerticalPixelSize"));
97 assertEquals("false", metadata.get("Compression Lossless"));
98 assertEquals("class=0, htableId=0", metadata.get("markerSequence dht dhtable"));
99 assertEquals("majorVersion=1, minorVersion=1, resUnits=1, Xdensity=72, Ydensity=72, thumbWidth=0, thumbHeight=0", metadata.get("JPEGvariety app0JFIF"));
100 assertEquals("225", metadata.get("markerSequence unknown"));
101 assertEquals("componentSelector=1, dcHuffTable=0, acHuffTable=0", metadata.get("markerSequence sos scanComponentSpec"));
102 assertEquals("normal", metadata.get("Dimension ImageOrientation"));
103 assertEquals("1.0", metadata.get("Dimension PixelAspectRatio"));
104 assertEquals("elementPrecision=0, qtableId=0", metadata.get("markerSequence dqt dqtable"));
105 assertEquals("numScanComponents=3, startSpectralSelection=0, endSpectralSelection=63, approxHigh=0, approxLow=0", metadata.get("markerSequence sos"));
106 assertEquals("componentId=1, HsamplingFactor=1, VsamplingFactor=1, QtableSelector=0", metadata.get("markerSequence sof componentSpec"));
107 assertEquals("JPEG", metadata.get("Compression CompressionTypeName"));
108 assertEquals("0.35277778", metadata.get("Dimension HorizontalPixelSize"));
109 assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get("markerSequence com"));
110 assertEquals("3", metadata.get("Chroma NumChannels"));
111 assertEquals("1", metadata.get("Compression NumProgressiveScans"));
112 assertEquals("YCbCr", metadata.get("Chroma ColorSpaceType"));
113 assertEquals("keyword=comment, value=Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get("Text TextEntry"));
114 assertEquals("image/jpeg", metadata.get("Content-Type"));
115 assertEquals("process=0, samplePrecision=8, numLines=75, samplesPerLine=100, numFrameComponents=3", metadata.get("markerSequence sof"));
116
117 assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
118 assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
119 assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get(TikaCoreProperties.COMMENTS));
120 }
121
122 @Test
123 public void testPNG() throws Exception {
124 Metadata metadata = new Metadata();
125 metadata.set(Metadata.CONTENT_TYPE, "image/png");
126 InputStream stream =
127 getClass().getResourceAsStream("/test-documents/testPNG.png");
128 parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
129
130 assertEquals("75", metadata.get("height"));
131 assertEquals("100", metadata.get("width"));
132 assertEquals("0.35273367", metadata.get("Dimension VerticalPixelSize"));
133 assertEquals("8 8 8", metadata.get("Data BitsPerSample"));
134 assertEquals("Perceptual", metadata.get("sRGB"));
135 assertEquals("true", metadata.get("Compression Lossless"));
136 assertEquals("year=2008, month=5, day=6, hour=6, minute=18, second=47", metadata.get("tIME"));
137 assertEquals("Normal", metadata.get("Dimension ImageOrientation"));
138 assertEquals("1.0", metadata.get("Dimension PixelAspectRatio"));
139 assertEquals("keyword=Comment, value=Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get("tEXt tEXtEntry"));
140 assertEquals("deflate", metadata.get("Compression CompressionTypeName"));
141 assertEquals("UnsignedIntegral", metadata.get("Data SampleFormat"));
142 assertEquals("0.35273367", metadata.get("Dimension HorizontalPixelSize"));
143 assertEquals("none", metadata.get("Transparency Alpha"));
144 assertEquals("pixelsPerUnitXAxis=2835, pixelsPerUnitYAxis=2835, unitSpecifier=meter", metadata.get("pHYs"));
145 assertEquals("3", metadata.get("Chroma NumChannels"));
146 assertEquals("1", metadata.get("Compression NumProgressiveScans"));
147 assertEquals("RGB", metadata.get("Chroma ColorSpaceType"));
148 assertEquals("keyword=Comment, value=Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership., encoding=ISO-8859-1, compression=none", metadata.get("Text TextEntry"));
149 assertEquals("PixelInterleaved", metadata.get("Data PlanarConfiguration"));
150 assertEquals("width=100, height=75, bitDepth=8, colorType=RGB, compressionMethod=deflate, filterMethod=adaptive, interlaceMethod=none", metadata.get("IHDR"));
151 assertEquals("true", metadata.get("Chroma BlackIsZero"));
152 assertEquals("year=2008, month=5, day=6, hour=6, minute=18, second=47", metadata.get("Document ImageModificationTime"));
153 assertEquals("image/png", metadata.get("Content-Type"));
154
155 assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
156 assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
157 assertEquals("8 8 8", metadata.get(Metadata.BITS_PER_SAMPLE));
158 }
159
160 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.image;
17
18 import static org.junit.Assert.assertFalse;
19 import static org.junit.Assert.assertTrue;
20
21 import org.apache.tika.metadata.TIFF;
22 import org.apache.tika.metadata.TikaCoreProperties;
23 import org.junit.Test;
24
25 public class MetadataFieldsTest {
26
27 @Test
28 public void testIsMetadataField() {
29 assertFalse(MetadataFields.isMetadataField("random string that is not a field"));
30 assertFalse(MetadataFields.isMetadataField("xyz"));
31 assertTrue(MetadataFields.isMetadataField(TikaCoreProperties.KEYWORDS));
32 assertTrue(MetadataFields.isMetadataField(TIFF.F_NUMBER.getName()));
33 }
34
35 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.image;
17
18 import java.io.InputStream;
19
20 import org.apache.tika.metadata.Metadata;
21 import org.apache.tika.parser.ParseContext;
22 import org.apache.tika.parser.Parser;
23 import org.junit.Test;
24 import org.xml.sax.helpers.DefaultHandler;
25
26 import static junit.framework.Assert.assertEquals;
27
28 public class PSDParserTest {
29
30 private final Parser parser = new PSDParser();
31
32 /**
33 * Tests a very basic file, without much metadata
34 */
35 @Test
36 public void testPSD() throws Exception {
37 Metadata metadata = new Metadata();
38 metadata.set(Metadata.CONTENT_TYPE, "image/x-psd");
39 InputStream stream =
40 getClass().getResourceAsStream("/test-documents/testPSD.psd");
41 parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
42
43 assertEquals("537", metadata.get(Metadata.IMAGE_WIDTH));
44 assertEquals("51", metadata.get(Metadata.IMAGE_LENGTH));
45 assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
46 }
47
48 /**
49 * Tests a very basic file, without much metadata,
50 * where some of the data lengths are padded to be even
51 */
52 @Test
53 public void testOddPSD() throws Exception {
54 Metadata metadata = new Metadata();
55 metadata.set(Metadata.CONTENT_TYPE, "image/x-psd");
56 InputStream stream =
57 getClass().getResourceAsStream("/test-documents/testPSD2.psd");
58 parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
59 assertEquals("69", metadata.get(Metadata.IMAGE_WIDTH));
60 assertEquals("70", metadata.get(Metadata.IMAGE_LENGTH));
61 assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
62 }
63 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.image;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertTrue;
20
21 import org.apache.tika.parser.ParseContext;
22 import org.apache.tika.parser.Parser;
23 import org.apache.tika.parser.image.TiffParser;
24 import org.apache.tika.metadata.Metadata;
25 import org.apache.tika.metadata.TikaCoreProperties;
26 import org.junit.Test;
27 import org.xml.sax.helpers.DefaultHandler;
28
29 import java.io.InputStream;
30 import java.util.Arrays;
31 import java.util.List;
32
33 public class TiffParserTest {
34 private final Parser parser = new TiffParser();
35
36 @Test
37 public void testTIFF() throws Exception {
38 Metadata metadata = new Metadata();
39 metadata.set(Metadata.CONTENT_TYPE, "image/tiff");
40 InputStream stream =
41 getClass().getResourceAsStream("/test-documents/testTIFF.tif");
42 parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
43
44 assertEquals("Licensed to the Apache Software Foundation (ASF) under one or " +
45 "more contributor license agreements. See the NOTICE file " +
46 "distributed with this work for additional information regarding " +
47 "copyright ownership.", metadata.get(TikaCoreProperties.DESCRIPTION));
48
49 // All EXIF/TIFF tags
50 assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
51
52 // Core EXIF/TIFF tags
53 assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
54 assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
55 assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
56 assertEquals("3", metadata.get(Metadata.SAMPLES_PER_PIXEL));
57
58 // Embedded XMP
59 List<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS));
60 assertTrue("got " + keywords, keywords.contains("cat"));
61 assertTrue("got " + keywords, keywords.contains("garden"));
62 List<String> subject = Arrays.asList(metadata.getValues(Metadata.SUBJECT));
63 assertTrue("got " + subject, subject.contains("cat"));
64 assertTrue("got " + subject, subject.contains("garden"));
65 }
66 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.image.xmp;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertTrue;
20
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.util.Arrays;
24 import java.util.Collection;
25
26 import org.apache.tika.exception.TikaException;
27 import org.apache.tika.metadata.Metadata;
28 import org.apache.tika.metadata.TikaCoreProperties;
29 import org.apache.tika.parser.image.xmp.JempboxExtractor;
30 import org.junit.Test;
31
32 public class JempboxExtractorTest {
33
34 @Test
35 public void testParseJpeg() throws IOException, TikaException {
36 Metadata metadata = new Metadata();
37 InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_commented.jpg");
38 // set some values before extraction to see that they are overridden
39 metadata.set(TikaCoreProperties.TITLE, "old title");
40 metadata.set(TikaCoreProperties.DESCRIPTION, "old description");
41 metadata.set(TikaCoreProperties.CREATOR, "previous author");
42 // ... or kept in case the field is multi-value
43 metadata.add(TikaCoreProperties.KEYWORDS, "oldkeyword");
44
45 JempboxExtractor extractor = new JempboxExtractor(metadata);
46 extractor.parse(stream);
47
48 // DublinCore fields
49 assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE));
50 assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION));
51 assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR));
52 Collection<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS));
53 assertTrue(keywords.contains("oldkeyword"));
54 assertTrue(keywords.contains("grazelands"));
55 assertTrue(keywords.contains("nature reserve"));
56 assertTrue(keywords.contains("bird watching"));
57 assertTrue(keywords.contains("coast"));
58 Collection<String> subject = Arrays.asList(metadata.getValues(Metadata.SUBJECT));
59 assertTrue(subject.contains("oldkeyword"));
60 assertTrue(subject.contains("grazelands"));
61 assertTrue(subject.contains("nature reserve"));
62 assertTrue(subject.contains("bird watching"));
63 assertTrue(subject.contains("coast"));
64 }
65
66 @Test
67 public void testParseJpegPhotoshop() throws IOException, TikaException {
68 Metadata metadata = new Metadata();
69 InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_commented_pspcs2mac.jpg");
70
71 JempboxExtractor extractor = new JempboxExtractor(metadata);
72 extractor.parse(stream);
73
74 // DublinCore fields
75 assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE));
76 assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION));
77 assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR));
78 Collection<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS));
79 assertTrue(keywords.contains("bird watching"));
80 assertTrue(keywords.contains("coast"));
81 }
82
83 @Test
84 public void testParseJpegXnviewmp() throws IOException, TikaException {
85 Metadata metadata = new Metadata();
86 InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_commented_xnviewmp026.jpg");
87
88 JempboxExtractor extractor = new JempboxExtractor(metadata);
89 extractor.parse(stream);
90
91 // XnViewMp fields not understood by Jempbox
92 assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION));
93 Collection<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS));
94 assertTrue(keywords.contains("coast"));
95 assertTrue(keywords.contains("nature reserve"));
96 }
97
98 @Test
99 public void testJoinCreators() {
100 assertEquals("Mr B", new JempboxExtractor(null).joinCreators(
101 Arrays.asList("Mr B")));
102 // TODO use multi-value property instead?
103 assertEquals("Mr B, Mr A", new JempboxExtractor(null).joinCreators(
104 Arrays.asList("Mr B", "Mr A")));
105 }
106
107 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.iwork;
17
18 import static org.junit.Assert.assertEquals;
19
20 import org.junit.Test;
21
22 /**
23 * Test class for the <code>AutoPageNumberUtils</code> helper class.
24 */
25 public class AutoPageNumberUtilsTest {
26
27 /**
28 * Check upper-case alpha-numeric numbers are generated based on the
29 * input page number.
30 */
31 @Test
32 public void testAlphaUpper() {
33 assertEquals("A", AutoPageNumberUtils.asAlphaNumeric(1));
34 assertEquals("Z", AutoPageNumberUtils.asAlphaNumeric(26));
35 assertEquals("AA", AutoPageNumberUtils.asAlphaNumeric(27));
36 assertEquals("ZZ", AutoPageNumberUtils.asAlphaNumeric(52));
37 assertEquals("AAA", AutoPageNumberUtils.asAlphaNumeric(53));
38 assertEquals("ZZZ", AutoPageNumberUtils.asAlphaNumeric(78));
39 }
40
41 /**
42 * Check lower-case alpha-numeric numbers are generated based on the
43 * input page number.
44 */
45 @Test
46 public void testAlphaLower() {
47 assertEquals("a", AutoPageNumberUtils.asAlphaNumericLower(1));
48 assertEquals("z", AutoPageNumberUtils.asAlphaNumericLower(26));
49 assertEquals("aa", AutoPageNumberUtils.asAlphaNumericLower(27));
50 assertEquals("zz", AutoPageNumberUtils.asAlphaNumericLower(52));
51 assertEquals("aaa", AutoPageNumberUtils.asAlphaNumericLower(53));
52 assertEquals("zzz", AutoPageNumberUtils.asAlphaNumericLower(78));
53 }
54
55 /**
56 * Check upper-case Roman numerals numbers are generated based on the
57 * input page number.
58 */
59 @Test
60 public void testRomanUpper() {
61 assertEquals("I", AutoPageNumberUtils.asRomanNumerals(1));
62 assertEquals("XXVI", AutoPageNumberUtils.asRomanNumerals(26));
63 assertEquals("XXVII", AutoPageNumberUtils.asRomanNumerals(27));
64 }
65
66 /**
67 * Check lower-case Roman numerals numbers are generated based on the
68 * input page number.
69 */
70 @Test
71 public void testRomanLower() {
72 assertEquals("i", AutoPageNumberUtils.asRomanNumeralsLower(1));
73 assertEquals("xxvi", AutoPageNumberUtils.asRomanNumeralsLower(26));
74 assertEquals("xxvii", AutoPageNumberUtils.asRomanNumeralsLower(27));
75 }
76
77 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.iwork;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertTrue;
20
21 import java.io.InputStream;
22 import java.util.Arrays;
23 import java.util.List;
24
25 import org.apache.tika.metadata.Metadata;
26 import org.apache.tika.metadata.TikaCoreProperties;
27 import org.apache.tika.parser.AutoDetectParser;
28 import org.apache.tika.parser.ParseContext;
29 import org.apache.tika.parser.Parser;
30 import org.apache.tika.sax.BodyContentHandler;
31 import org.junit.Before;
32 import org.junit.Test;
33 import org.xml.sax.ContentHandler;
34
35 /**
36 * Tests if the IWork parser parses the content and metadata properly of the supported formats.
37 */
38 public class IWorkParserTest {
39
40 private IWorkPackageParser iWorkParser;
41 private ParseContext parseContext;
42
43 @Before
44 public void setUp() {
45 iWorkParser = new IWorkPackageParser();
46 parseContext = new ParseContext();
47 parseContext.set(Parser.class, new AutoDetectParser());
48 }
49
50 @Test
51 public void testParseKeynote() throws Exception {
52 InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testKeynote.key");
53 Metadata metadata = new Metadata();
54 ContentHandler handler = new BodyContentHandler();
55 iWorkParser.parse(input, handler, metadata, parseContext);
56
57 // Make sure enough keys came through
58 // (Exact numbers will vary based on composites)
59 assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 6);
60 List<String> metadataKeys = Arrays.asList(metadata.names());
61 assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
62 assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.SLIDE_COUNT.getName()));
63 // assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Office.SLIDE_COUNT.getName()));
64 assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
65 assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
66
67 // Check the metadata values
68 assertEquals("application/vnd.apple.keynote", metadata.get(Metadata.CONTENT_TYPE));
69 assertEquals("3", metadata.get(Metadata.SLIDE_COUNT));
70 assertEquals("1024", metadata.get(KeynoteContentHandler.PRESENTATION_WIDTH));
71 assertEquals("768", metadata.get(KeynoteContentHandler.PRESENTATION_HEIGHT));
72 assertEquals("Tika user", metadata.get(TikaCoreProperties.CREATOR));
73 assertEquals("Apache tika", metadata.get(TikaCoreProperties.TITLE));
74
75 String content = handler.toString();
76 assertTrue(content.contains("A sample presentation"));
77 assertTrue(content.contains("For the Apache Tika project"));
78 assertTrue(content.contains("Slide 1"));
79 assertTrue(content.contains("Some random text for the sake of testability."));
80 assertTrue(content.contains("A nice comment"));
81 assertTrue(content.contains("A nice note"));
82
83 // test table data
84 assertTrue(content.contains("Cell one"));
85 assertTrue(content.contains("Cell two"));
86 assertTrue(content.contains("Cell three"));
87 assertTrue(content.contains("Cell four"));
88 assertTrue(content.contains("Cell 5"));
89 assertTrue(content.contains("Cell six"));
90 assertTrue(content.contains("7"));
91 assertTrue(content.contains("Cell eight"));
92 assertTrue(content.contains("5/5/1985"));
93 }
94
95 // TIKA-910
96 @Test
97 public void testKeynoteTextBoxes() throws Exception {
98 InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testTextBoxes.key");
99 Metadata metadata = new Metadata();
100 ContentHandler handler = new BodyContentHandler();
101 iWorkParser.parse(input, handler, metadata, parseContext);
102
103 String content = handler.toString();
104 assertTrue(content.replaceAll("\\s+", " ").contains("text1 text2 text3"));
105 }
106
107 // TIKA-910
108 @Test
109 public void testKeynoteBulletPoints() throws Exception {
110 InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testBulletPoints.key");
111 Metadata metadata = new Metadata();
112 ContentHandler handler = new BodyContentHandler();
113 iWorkParser.parse(input, handler, metadata, parseContext);
114
115 String content = handler.toString();
116 assertTrue(content.replaceAll("\\s+", " ").contains("bullet point 1 bullet point 2 bullet point 3"));
117 }
118
119 // TIKA-923
120 @Test
121 public void testKeynoteTables() throws Exception {
122 InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testTables.key");
123 Metadata metadata = new Metadata();
124 ContentHandler handler = new BodyContentHandler();
125 iWorkParser.parse(input, handler, metadata, parseContext);
126
127 String content = handler.toString();
128 content = content.replaceAll("\\s+", " ");
129 assertTrue(content.contains("row 1 row 2 row 3"));
130 }
131
132 // TIKA-923
133 @Test
134 public void testKeynoteMasterSlideTable() throws Exception {
135 InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testMasterSlideTable.key");
136 Metadata metadata = new Metadata();
137 ContentHandler handler = new BodyContentHandler();
138 iWorkParser.parse(input, handler, metadata, parseContext);
139
140 String content = handler.toString();
141 content = content.replaceAll("\\s+", " ");
142 assertTrue(content.contains("master row 1"));
143 assertTrue(content.contains("master row 2"));
144 assertTrue(content.contains("master row 3"));
145 }
146
147 @Test
148 public void testParsePages() throws Exception {
149 InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPages.pages");
150 Metadata metadata = new Metadata();
151 ContentHandler handler = new BodyContentHandler();
152 iWorkParser.parse(input, handler, metadata, parseContext);
153
154 // Make sure enough keys came through
155 // (Exact numbers will vary based on composites)
156 assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 50);
157 List<String> metadataKeys = Arrays.asList(metadata.names());
158 assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
159 assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.PAGE_COUNT.getName()));
160 assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
161 assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
162 assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.LAST_MODIFIED.getName()));
163 assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.LANGUAGE));
164
165 // Check the metadata values
166 assertEquals("application/vnd.apple.pages", metadata.get(Metadata.CONTENT_TYPE));
167 assertEquals("Tika user", metadata.get(TikaCoreProperties.CREATOR));
168 assertEquals("Apache tika", metadata.get(TikaCoreProperties.TITLE));
169 assertEquals("2010-05-09T21:34:38+0200", metadata.get(Metadata.CREATION_DATE));
170 assertEquals("2010-05-09T23:50:36+0200", metadata.get(Metadata.LAST_MODIFIED));
171 assertEquals("en", metadata.get(TikaCoreProperties.LANGUAGE));
172 assertEquals("2", metadata.get(Metadata.PAGE_COUNT));
173
174 String content = handler.toString();
175
176 // text on page 1
177 assertTrue(content.contains("Sample pages document"));
178 assertTrue(content.contains("Some plain text to parse."));
179 assertTrue(content.contains("Cell one"));
180 assertTrue(content.contains("Cell two"));
181 assertTrue(content.contains("Cell three"));
182 assertTrue(content.contains("Cell four"));
183 assertTrue(content.contains("Cell five"));
184 assertTrue(content.contains("Cell six"));
185 assertTrue(content.contains("Cell seven"));
186 assertTrue(content.contains("Cell eight"));
187 assertTrue(content.contains("Cell nine"));
188 assertTrue(content.contains("Both Pages 1.x and Keynote 2.x")); // ...
189
190 // text on page 2
191 assertTrue(content.contains("A second page...."));
192 assertTrue(content.contains("Extensible Markup Language")); // ...
193 }
194
195 // TIKA-904
196 @Test
197 public void testPagesLayoutMode() throws Exception {
198 InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesLayout.pages");
199 Metadata metadata = new Metadata();
200 ContentHandler handler = new BodyContentHandler();
201
202 iWorkParser.parse(input, handler, metadata, parseContext);
203
204 String content = handler.toString();
205 assertTrue(content.contains("text box 1 - here is some text"));
206 assertTrue(content.contains("created in a text box in layout mode"));
207 assertTrue(content.contains("text box 2 - more text!@!$@#"));
208 assertTrue(content.contains("this is text inside of a green box"));
209 assertTrue(content.contains("text inside of a green circle"));
210 }
211
212 @Test
213 public void testParseNumbers() throws Exception {
214 InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testNumbers.numbers");
215 Metadata metadata = new Metadata();
216 ContentHandler handler = new BodyContentHandler();
217
218 iWorkParser.parse(input, handler, metadata, parseContext);
219
220 // Make sure enough keys came through
221 // (Exact numbers will vary based on composites)
222 assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 8);
223 List<String> metadataKeys = Arrays.asList(metadata.names());
224 assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
225 assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.PAGE_COUNT.getName()));
226 assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
227 assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.COMMENTS.getName()));
228 assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.TITLE));
229 assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
230
231 // Check the metadata values
232 assertEquals("2", metadata.get(Metadata.PAGE_COUNT));
233 assertEquals("Tika User", metadata.get(TikaCoreProperties.CREATOR));
234 assertEquals("Account checking", metadata.get(TikaCoreProperties.TITLE));
235 assertEquals("a comment", metadata.get(TikaCoreProperties.COMMENTS));
236
237 String content = handler.toString();
238 assertTrue(content.contains("Category"));
239 assertTrue(content.contains("Home"));
240 assertTrue(content.contains("-226"));
241 assertTrue(content.contains("-137.5"));
242 assertTrue(content.contains("Checking Account: 300545668"));
243 assertTrue(content.contains("4650"));
244 assertTrue(content.contains("Credit Card"));
245 assertTrue(content.contains("Groceries"));
246 assertTrue(content.contains("-210"));
247 assertTrue(content.contains("Food"));
248 assertTrue(content.contains("Try adding your own account transactions to this table."));
249 }
250
251 // TIKA- 924
252 @Test
253 public void testParseNumbersTableNames() throws Exception {
254 InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/tableNames.numbers");
255 Metadata metadata = new Metadata();
256 ContentHandler handler = new BodyContentHandler();
257 iWorkParser.parse(input, handler, metadata, parseContext);
258 String content = handler.toString();
259 assertTrue(content.contains("This is the main table"));
260 }
261
262 @Test
263 public void testParseNumbersTableHeaders() throws Exception {
264 InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/tableHeaders.numbers");
265 Metadata metadata = new Metadata();
266 ContentHandler handler = new BodyContentHandler();
267 iWorkParser.parse(input, handler, metadata, parseContext);
268
269 String content = handler.toString();
270 for(int header=1;header<=5;header++) {
271 assertTrue(content.contains("header" + header));
272 }
273 for(int row=1;row<=3;row++) {
274 assertTrue(content.contains("row" + row));
275 }
276 }
277
278 /**
279 * We don't currently support password protected Pages files, as
280 * we don't know how the encryption works (it's not regular Zip
281 * Encryption). See TIKA-903 for details
282 */
283 @Test
284 public void testParsePagesPasswordProtected() throws Exception {
285 // Document password is "tika", but we can't use that yet...
286 InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesPwdProtected.pages");
287 Metadata metadata = new Metadata();
288 ContentHandler handler = new BodyContentHandler();
289
290 iWorkParser.parse(input, handler, metadata, parseContext);
291
292 // Content will be empty
293 String content = handler.toString();
294 assertEquals("", content);
295
296 // Will have been identified as encrypted
297 assertEquals("application/x-tika-iworks-protected", metadata.get(Metadata.CONTENT_TYPE));
298 }
299
300 /**
301 * Check we get headers, footers and footnotes from Pages
302 */
303 @Test
304 public void testParsePagesHeadersFootersFootnotes() throws Exception {
305 String footnote = "Footnote: Do a lot of people really use iWork?!?!";
306 String header = "THIS IS SOME HEADER TEXT";
307 String footer = "THIS IS SOME FOOTER TEXT\t1";
308 String footer2 = "THIS IS SOME FOOTER TEXT\t2";
309
310 InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersFootnotes.pages");
311 Metadata metadata = new Metadata();
312 ContentHandler handler = new BodyContentHandler();
313
314 iWorkParser.parse(input, handler, metadata, parseContext);
315 String contents = handler.toString();
316
317 // Check regular text
318 assertContains(contents, "Both Pages 1.x"); // P1
319 assertContains(contents, "understanding the Pages document"); // P1
320 assertContains(contents, "should be page 2"); // P2
321
322 // Check for headers, footers and footnotes
323 assertContains(contents, header);
324 assertContains(contents, footer);
325 assertContains(contents, footer2);
326 assertContains(contents, footnote);
327 }
328
329 /**
330 * Check we get upper-case Roman numerals within the footer for AutoPageNumber.
331 */
332 @Test
333 public void testParsePagesHeadersFootersRomanUpper() throws Exception {
334 String header = "THIS IS SOME HEADER TEXT";
335 String footer = "THIS IS SOME FOOTER TEXT\tI";
336 String footer2 = "THIS IS SOME FOOTER TEXT\tII";
337
338 InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersRomanUpper.pages");
339 ContentHandler handler = new BodyContentHandler();
340
341 iWorkParser.parse(input, handler, new Metadata(), parseContext);
342 String contents = handler.toString();
343
344 // Check for headers, footers and footnotes
345 assertContains(contents, header);
346 assertContains(contents, footer);
347 assertContains(contents, footer2);
348 }
349
350 /**
351 * Check we get lower-case Roman numerals within the footer for AutoPageNumber.
352 */
353 @Test
354 public void testParsePagesHeadersFootersRomanLower() throws Exception {
355 String header = "THIS IS SOME HEADER TEXT";
356 String footer = "THIS IS SOME FOOTER TEXT\ti";
357 String footer2 = "THIS IS SOME FOOTER TEXT\tii";
358
359 InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersRomanLower.pages");
360 ContentHandler handler = new BodyContentHandler();
361
362 iWorkParser.parse(input, handler, new Metadata(), parseContext);
363 String contents = handler.toString();
364
365 // Check for headers, footers and footnotes
366 assertContains(contents, header);
367 assertContains(contents, footer);
368 assertContains(contents, footer2);
369 }
370
371 /**
372 * Check we get upper-case alpha-numeric letters within the footer for AutoPageNumber.
373 */
374 @Test
375 public void testParsePagesHeadersAlphaUpper() throws Exception {
376 String header = "THIS IS SOME HEADER TEXT\tA";
377 String footer = "THIS IS SOME FOOTER TEXT\tA";
378 String footer2 = "THIS IS SOME FOOTER TEXT\tB";
379
380 InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersAlphaUpper.pages");
381 ContentHandler handler = new BodyContentHandler();
382
383 iWorkParser.parse(input, handler, new Metadata(), parseContext);
384 String contents = handler.toString();
385
386 // Check for headers, footers and footnotes
387 assertContains(contents, header);
388 assertContains(contents, footer);
389 assertContains(contents, footer2);
390 }
391
392 /**
393 * Check we get lower-case alpha-numeric letters within the footer for AutoPageNumber.
394 */
395 @Test
396 public void testParsePagesHeadersAlphaLower() throws Exception {
397 String header = "THIS IS SOME HEADER TEXT";
398 String footer = "THIS IS SOME FOOTER TEXT\ta";
399 String footer2 = "THIS IS SOME FOOTER TEXT\tb";
400
401 InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersAlphaLower.pages");
402 ContentHandler handler = new BodyContentHandler();
403
404 iWorkParser.parse(input, handler, new Metadata(), parseContext);
405 String contents = handler.toString();
406
407 // Check for headers, footers and footnotes
408 assertContains(contents, header);
409 assertContains(contents, footer);
410 assertContains(contents, footer2);
411 }
412
413 /**
414 * Check we get annotations (eg comments) from Pages
415 */
416 @Test
417 public void testParsePagesAnnotations() throws Exception {
418 String commentA = "comment about the APXL file";
419 String commentB = "comment about UIMA";
420
421 InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesComments.pages");
422 Metadata metadata = new Metadata();
423 ContentHandler handler = new BodyContentHandler();
424
425 iWorkParser.parse(input, handler, metadata, parseContext);
426 String contents = handler.toString();
427
428 // Check regular text
429 assertContains(contents, "Both Pages 1.x"); // P1
430 assertContains(contents, "understanding the Pages document"); // P1
431 assertContains(contents, "should be page 2"); // P2
432
433 // Check for comments
434 assertContains(contents, commentA);
435 assertContains(contents, commentB);
436 }
437
438 // TIKA-918
439 @Test
440 public void testNumbersExtractChartNames() throws Exception {
441 InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testNumbersCharts.numbers");
442 Metadata metadata = new Metadata();
443 ContentHandler handler = new BodyContentHandler();
444 iWorkParser.parse(input, handler, metadata, parseContext);
445 String contents = handler.toString();
446 assertContains(contents, "Expenditure by Category");
447 assertContains(contents, "Currency Chart name");
448 assertContains(contents, "Chart 2");
449 }
450
451 public void assertContains(String haystack, String needle) {
452 assertTrue(needle + " not found in:\n" + haystack, haystack.contains(needle));
453 }
454 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.jpeg;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertFalse;
20 import static org.junit.Assert.assertTrue;
21
22 import java.io.InputStream;
23 import java.util.Arrays;
24 import java.util.List;
25
26 import org.apache.tika.metadata.Metadata;
27 import org.apache.tika.metadata.TIFF;
28 import org.apache.tika.metadata.TikaCoreProperties;
29 import org.apache.tika.parser.ParseContext;
30 import org.apache.tika.parser.Parser;
31 import org.junit.Test;
32 import org.xml.sax.helpers.DefaultHandler;
33
34 public class JpegParserTest {
35
36 private final Parser parser = new JpegParser();
37
38 @Test
39 public void testJPEG() throws Exception {
40 Metadata metadata = new Metadata();
41 metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
42 InputStream stream =
43 getClass().getResourceAsStream("/test-documents/testJPEG_EXIF.jpg");
44 parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
45
46 // Core EXIF/TIFF tags
47 assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
48 assertEquals("68", metadata.get(Metadata.IMAGE_LENGTH));
49 assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
50 assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL));
51
52 assertEquals("6.25E-4", metadata.get(Metadata.EXPOSURE_TIME)); // 1/1600
53 assertEquals("5.6", metadata.get(Metadata.F_NUMBER));
54 assertEquals("false", metadata.get(Metadata.FLASH_FIRED));
55 assertEquals("194.0", metadata.get(Metadata.FOCAL_LENGTH));
56 assertEquals("400", metadata.get(Metadata.ISO_SPEED_RATINGS));
57 assertEquals("Canon", metadata.get(Metadata.EQUIPMENT_MAKE));
58 assertEquals("Canon EOS 40D", metadata.get(Metadata.EQUIPMENT_MODEL));
59 assertEquals("Adobe Photoshop CS3 Macintosh", metadata.get(Metadata.SOFTWARE));
60 assertEquals(null, metadata.get(Metadata.ORIENTATION)); // Not present
61 assertEquals("240.0", metadata.get(Metadata.RESOLUTION_HORIZONTAL));
62 assertEquals("240.0", metadata.get(Metadata.RESOLUTION_VERTICAL));
63 assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
64
65 // Check that EXIF/TIFF tags come through with their raw values too
66 // (This may be removed for Tika 1.0, as we support more of them
67 // with explicit Metadata entries)
68 assertEquals("Canon EOS 40D", metadata.get("Model"));
69
70 // Common tags
71 //assertEquals("2009-10-02T23:02:49", metadata.get(Metadata.LAST_MODIFIED));
72 assertEquals("Date/Time Original for when the photo was taken, unspecified time zone",
73 "2009-08-11T09:09:45", metadata.get(TikaCoreProperties.CREATED));
74 List<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS));
75 assertTrue("'canon-55-250' expected in " + keywords, keywords.contains("canon-55-250"));
76 assertTrue("'moscow-birds' expected in " + keywords, keywords.contains("moscow-birds"));
77 assertTrue("'serbor' expected in " + keywords, keywords.contains("serbor"));
78 assertFalse(keywords.contains("canon-55-250 moscow-birds serbor"));
79 List<String> subject = Arrays.asList(metadata.getValues(Metadata.SUBJECT));
80 assertTrue("'canon-55-250' expected in " + subject, subject.contains("canon-55-250"));
81 assertTrue("'moscow-birds' expected in " + subject, subject.contains("moscow-birds"));
82 assertTrue("'serbor' expected in " + subject, subject.contains("serbor"));
83 assertFalse(subject.contains("canon-55-250 moscow-birds serbor"));
84 }
85
86 /**
87 * Test for a file with Geographic information (lat, long etc) in it
88 */
89 @Test
90 public void testJPEGGeo() throws Exception {
91 Metadata metadata = new Metadata();
92 metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
93 InputStream stream =
94 getClass().getResourceAsStream("/test-documents/testJPEG_GEO.jpg");
95 parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
96
97 // Geo tags
98 assertEquals("12.54321", metadata.get(Metadata.LATITUDE));
99 assertEquals("-54.1234", metadata.get(Metadata.LONGITUDE));
100
101 // Core EXIF/TIFF tags
102 assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
103 assertEquals("68", metadata.get(Metadata.IMAGE_LENGTH));
104 assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
105 assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL));
106
107 assertEquals("6.25E-4", metadata.get(Metadata.EXPOSURE_TIME)); // 1/1600
108 assertEquals("5.6", metadata.get(Metadata.F_NUMBER));
109 assertEquals("false", metadata.get(Metadata.FLASH_FIRED));
110 assertEquals("194.0", metadata.get(Metadata.FOCAL_LENGTH));
111 assertEquals("400", metadata.get(Metadata.ISO_SPEED_RATINGS));
112 assertEquals("Canon", metadata.get(Metadata.EQUIPMENT_MAKE));
113 assertEquals("Canon EOS 40D", metadata.get(Metadata.EQUIPMENT_MODEL));
114 assertEquals("Adobe Photoshop CS3 Macintosh", metadata.get(Metadata.SOFTWARE));
115 assertEquals(null, metadata.get(Metadata.ORIENTATION)); // Not present
116 assertEquals("240.0", metadata.get(Metadata.RESOLUTION_HORIZONTAL));
117 assertEquals("240.0", metadata.get(Metadata.RESOLUTION_VERTICAL));
118 assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
119
120 // Common tags
121 assertEquals("Date/Time Original for when the photo was taken, unspecified time zone",
122 "2009-08-11T09:09:45", metadata.get(TikaCoreProperties.CREATED));
123 assertEquals("This image has different Date/Time than Date/Time Original, so it is probably modification date",
124 "2009-10-02T23:02:49", metadata.get(Metadata.LAST_MODIFIED));
125 assertEquals("Date/Time Original should be stored in EXIF field too",
126 "2009-08-11T09:09:45", metadata.get(TIFF.ORIGINAL_DATE));
127 assertEquals("canon-55-250", metadata.getValues(TikaCoreProperties.KEYWORDS)[0]);
128 assertEquals("canon-55-250", metadata.getValues(Metadata.KEYWORDS)[0]);
129 }
130
131 /**
132 * Test for an image with the geographic information stored in a slightly
133 * different way, see TIKA-915 for details
134 * Disabled for now, pending a fix to the underlying library
135 */
136 @Test
137 public void testJPEGGeo2() throws Exception {
138 Metadata metadata = new Metadata();
139 metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
140 InputStream stream =
141 getClass().getResourceAsStream("/test-documents/testJPEG_GEO_2.jpg");
142 parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
143
144 // Geo tags should be there with 5dp, and not rounded
145 assertEquals("51.575762", metadata.get(Metadata.LATITUDE));
146 assertEquals("-1.567886", metadata.get(Metadata.LONGITUDE));
147 }
148
149 @Test
150 public void testJPEGTitleAndDescription() throws Exception {
151 Metadata metadata = new Metadata();
152 metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
153 InputStream stream =
154 getClass().getResourceAsStream("/test-documents/testJPEG_commented.jpg");
155 parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
156
157 // embedded comments with non-ascii characters
158 assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE));
159 assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION));
160 assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR)); // Dublin Core
161 // xmp handles spaces in keywords, returns "bird watching, nature reserve, coast, grazelands"
162 // but we have to replace them with underscore
163
164 List<String> keywords = Arrays.asList(metadata.getValues(Metadata.KEYWORDS));
165 assertTrue(keywords.contains("coast"));
166 assertTrue(keywords.contains("bird watching"));
167 assertEquals(keywords, Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS)));
168
169 // Core EXIF/TIFF tags
170 assertEquals("103", metadata.get(Metadata.IMAGE_WIDTH));
171 assertEquals("77", metadata.get(Metadata.IMAGE_LENGTH));
172 assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
173 assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL));
174
175 assertEquals("1.0E-6", metadata.get(Metadata.EXPOSURE_TIME)); // 1/1000000
176 assertEquals("2.8", metadata.get(Metadata.F_NUMBER));
177 assertEquals("4.6", metadata.get(Metadata.FOCAL_LENGTH));
178 assertEquals("114", metadata.get(Metadata.ISO_SPEED_RATINGS));
179 assertEquals(null, metadata.get(Metadata.EQUIPMENT_MAKE));
180 assertEquals(null, metadata.get(Metadata.EQUIPMENT_MODEL));
181 assertEquals(null, metadata.get(Metadata.SOFTWARE));
182 assertEquals("1", metadata.get(Metadata.ORIENTATION)); // Not present
183 assertEquals("300.0", metadata.get(Metadata.RESOLUTION_HORIZONTAL));
184 assertEquals("300.0", metadata.get(Metadata.RESOLUTION_VERTICAL));
185 assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
186 }
187
188 @Test
189 public void testJPEGTitleAndDescriptionPhotoshop() throws Exception {
190 Metadata metadata = new Metadata();
191 metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
192 InputStream stream =
193 getClass().getResourceAsStream("/test-documents/testJPEG_commented_pspcs2mac.jpg");
194 parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
195
196 // embedded comments with non-ascii characters
197 assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE));
198 assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION));
199 assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR));
200 List<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS));
201 assertTrue("got " + keywords, keywords.contains("bird watching"));
202 List<String> subject = Arrays.asList(metadata.getValues(Metadata.SUBJECT));
203 assertTrue("got " + subject, subject.contains("bird watching"));
204 }
205
206 @Test
207 public void testJPEGTitleAndDescriptionXnviewmp() throws Exception {
208 Metadata metadata = new Metadata();
209 metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
210 InputStream stream =
211 getClass().getResourceAsStream("/test-documents/testJPEG_commented_xnviewmp026.jpg");
212 parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
213
214 // XnViewMp's default comment dialog has only comment, not headline.
215 // Comment is embedded only if "Write comments in XMP" is enabled in settings
216 assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION));
217 // xmp handles spaces in keywords, returns "bird watching, nature reserve, coast, grazelands"
218 // but we have to replace them with underscore
219 String[] subject = metadata.getValues(TikaCoreProperties.KEYWORDS);
220 List<String> keywords = Arrays.asList(subject);
221 assertTrue("'coast'" + " not in " + keywords, keywords.contains("coast"));
222 assertTrue("'nature reserve'" + " not in " + keywords, keywords.contains("nature reserve"));
223 }
224
225 @Test
226 public void testJPEGoddTagComponent() throws Exception {
227 Metadata metadata = new Metadata();
228 metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
229 InputStream stream =
230 getClass().getResourceAsStream("/test-documents/testJPEG_oddTagComponent.jpg");
231 parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
232
233 assertEquals(null, metadata.get(TikaCoreProperties.TITLE));
234 assertEquals(null, metadata.get(TikaCoreProperties.DESCRIPTION));
235 assertEquals("251", metadata.get(Metadata.IMAGE_WIDTH));
236 assertEquals("384", metadata.get(Metadata.IMAGE_LENGTH));
237 }
238
239 @Test
240 public void testJPEGEmptyEXIFDateTime() throws Exception {
241 Metadata metadata = new Metadata();
242 metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
243 InputStream stream =
244 getClass().getResourceAsStream("/test-documents/testJPEG_EXIF_emptyDateTime.jpg");
245 parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
246 assertEquals("300.0", metadata.get(TIFF.RESOLUTION_HORIZONTAL));
247 assertEquals("300.0", metadata.get(TIFF.RESOLUTION_VERTICAL));
248 }
249 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.mail;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertFalse;
20 import static org.junit.Assert.assertTrue;
21 import static org.junit.Assert.fail;
22 import static org.mockito.Matchers.any;
23 import static org.mockito.Matchers.eq;
24 import static org.mockito.Mockito.mock;
25 import static org.mockito.Mockito.never;
26 import static org.mockito.Mockito.times;
27 import static org.mockito.Mockito.verify;
28
29 import java.io.ByteArrayInputStream;
30 import java.io.InputStream;
31
32 import org.apache.james.mime4j.stream.MimeConfig;
33 import org.apache.tika.exception.TikaException;
34 import org.apache.tika.metadata.Metadata;
35 import org.apache.tika.metadata.TikaCoreProperties;
36 import org.apache.tika.parser.ParseContext;
37 import org.apache.tika.parser.Parser;
38 import org.apache.tika.sax.BodyContentHandler;
39 import org.apache.tika.sax.XHTMLContentHandler;
40 import org.junit.Test;
41 import org.xml.sax.Attributes;
42 import org.xml.sax.ContentHandler;
43 import org.xml.sax.helpers.DefaultHandler;
44
45 public class RFC822ParserTest {
46
47 @Test
48 public void testSimple() {
49 Parser parser = new RFC822Parser();
50 Metadata metadata = new Metadata();
51 InputStream stream = getStream("test-documents/testRFC822");
52 ContentHandler handler = mock(DefaultHandler.class);
53
54 try {
55 parser.parse(stream, handler, metadata, new ParseContext());
56 verify(handler).startDocument();
57 //just one body
58 verify(handler).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
59 verify(handler).endElement(XHTMLContentHandler.XHTML, "p", "p");
60 //no multi-part body parts
61 verify(handler, never()).startElement(eq(XHTMLContentHandler.XHTML), eq("div"), eq("div"), any(Attributes.class));
62 verify(handler, never()).endElement(XHTMLContentHandler.XHTML, "div", "div");
63 verify(handler).endDocument();
64 //note no leading spaces, and no quotes
65 assertEquals("Julien Nioche (JIRA) <jira@apache.org>", metadata.get(TikaCoreProperties.CREATOR));
66 assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed",
67 metadata.get(TikaCoreProperties.TITLE));
68 assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed",
69 metadata.get(Metadata.SUBJECT));
70 } catch (Exception e) {
71 fail("Exception thrown: " + e.getMessage());
72 }
73 }
74
75 @Test
76 public void testMultipart() {
77 Parser parser = new RFC822Parser();
78 Metadata metadata = new Metadata();
79 InputStream stream = getStream("test-documents/testRFC822-multipart");
80 ContentHandler handler = mock(XHTMLContentHandler.class);
81
82 try {
83 parser.parse(stream, handler, metadata, new ParseContext());
84 verify(handler).startDocument();
85 //4 body-part divs -- two outer bodies and two inner bodies
86 verify(handler, times(4)).startElement(eq(XHTMLContentHandler.XHTML), eq("div"), eq("div"), any(Attributes.class));
87 verify(handler, times(4)).endElement(XHTMLContentHandler.XHTML, "div", "div");
88 //5 paragraph elements, 4 for body-parts and 1 for encompassing message
89 verify(handler, times(5)).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
90 verify(handler, times(5)).endElement(XHTMLContentHandler.XHTML, "p", "p");
91 verify(handler).endDocument();
92 } catch (Exception e) {
93 fail("Exception thrown: " + e.getMessage());
94 }
95
96 //repeat, this time looking at content
97 parser = new RFC822Parser();
98 metadata = new Metadata();
99 stream = getStream("test-documents/testRFC822-multipart");
100 handler = new BodyContentHandler();
101 try {
102 parser.parse(stream, handler, metadata, new ParseContext());
103 //tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode
104 String bodyText = handler.toString();
105 assertTrue(bodyText.contains("body 1"));
106 assertTrue(bodyText.contains("body 2"));
107 assertFalse(bodyText.contains("R0lGODlhNgE8AMQAA")); //part of encoded gif
108 } catch (Exception e) {
109 fail("Exception thrown: " + e.getMessage());
110 }
111 }
112
113 @Test
114 public void testQuotedPrintable() {
115 Parser parser = new RFC822Parser();
116 Metadata metadata = new Metadata();
117 InputStream stream = getStream("test-documents/testRFC822_quoted");
118 ContentHandler handler = new BodyContentHandler();
119
120 try {
121 parser.parse(stream, handler, metadata, new ParseContext());
122 //tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode
123 String bodyText = handler.toString();
124 assertTrue(bodyText.contains("D\u00FCsseldorf has non-ascii."));
125 assertTrue(bodyText.contains("Lines can be split like this."));
126 assertTrue(bodyText.contains("Spaces at the end of a line \r\nmust be encoded.\r\n"));
127 assertFalse(bodyText.contains("=")); //there should be no escape sequences
128 } catch (Exception e) {
129 fail("Exception thrown: " + e.getMessage());
130 }
131 }
132
133 @Test
134 public void testBase64() {
135 Parser parser = new RFC822Parser();
136 Metadata metadata = new Metadata();
137 InputStream stream = getStream("test-documents/testRFC822_base64");
138 ContentHandler handler = new BodyContentHandler();
139
140 try {
141 parser.parse(stream, handler, metadata, new ParseContext());
142 //tests correct decoding of base64 text, including ISO-8859-1 bytes into Unicode
143 assertTrue(handler.toString().contains("Here is some text, with international characters, voil\u00E0!"));
144 } catch (Exception e) {
145 fail("Exception thrown: " + e.getMessage());
146 }
147 }
148
149 @Test
150 public void testI18NHeaders() {
151 Parser parser = new RFC822Parser();
152 Metadata metadata = new Metadata();
153 InputStream stream = getStream("test-documents/testRFC822_i18nheaders");
154 ContentHandler handler = mock(DefaultHandler.class);
155
156 try {
157 parser.parse(stream, handler, metadata, new ParseContext());
158 //tests correct decoding of internationalized headers, both
159 //quoted-printable (Q) and Base64 (B).
160 assertEquals("Keld J\u00F8rn Simonsen <keld@dkuug.dk>",
161 metadata.get(TikaCoreProperties.CREATOR));
162 assertEquals("If you can read this you understand the example.",
163 metadata.get(TikaCoreProperties.TITLE));
164 assertEquals("If you can read this you understand the example.",
165 metadata.get(Metadata.SUBJECT));
166 } catch (Exception e) {
167 fail("Exception thrown: " + e.getMessage());
168 }
169 }
170
171 /**
172 * The from isn't in the usual form.
173 * See TIKA-618
174 */
175 @Test
176 public void testUnusualFromAddress() throws Exception {
177 Parser parser = new RFC822Parser();
178 Metadata metadata = new Metadata();
179 InputStream stream = getStream("test-documents/testRFC822_oddfrom");
180 ContentHandler handler = mock(DefaultHandler.class);
181
182 parser.parse(stream, handler, metadata, new ParseContext());
183 assertEquals("Saved by Windows Internet Explorer 7",
184 metadata.get(TikaCoreProperties.CREATOR));
185 assertEquals("Air Permit Programs | Air & Radiation | US EPA",
186 metadata.get(TikaCoreProperties.TITLE));
187 assertEquals("Air Permit Programs | Air & Radiation | US EPA",
188 metadata.get(Metadata.SUBJECT));
189 }
190
191 /**
192 * Test for TIKA-640, increase header max beyond 10k bytes
193 */
194 @Test
195 public void testLongHeader() throws Exception {
196 StringBuilder inputBuilder = new StringBuilder();
197 for (int i = 0; i < 2000; ++i) {
198 inputBuilder.append( //len > 50
199 "really really really really really really long name ");
200 }
201 String name = inputBuilder.toString();
202 byte[] data = ("From: " + name + "\r\n\r\n").getBytes("US-ASCII");
203
204 Parser parser = new RFC822Parser();
205 ContentHandler handler = new DefaultHandler();
206 Metadata metadata = new Metadata();
207 ParseContext context = new ParseContext();
208
209 try {
210 parser.parse(
211 new ByteArrayInputStream(data), handler, metadata, context);
212 fail();
213 } catch (TikaException expected) {
214 }
215
216 MimeConfig config = new MimeConfig();
217 config.setMaxHeaderLen(-1);
218 config.setMaxLineLen(-1);
219 context.set(MimeConfig.class, config);
220 parser.parse(
221 new ByteArrayInputStream(data), handler, metadata, context);
222 assertEquals(name.trim(), metadata.get(TikaCoreProperties.CREATOR));
223 }
224
225 /**
226 * Test for TIKA-678 - not all headers may be present
227 */
228 @Test
229 public void testSomeMissingHeaders() throws Exception {
230 Parser parser = new RFC822Parser();
231 Metadata metadata = new Metadata();
232 InputStream stream = getStream("test-documents/testRFC822-limitedheaders");
233 ContentHandler handler = new BodyContentHandler();
234
235 parser.parse(stream, handler, metadata, new ParseContext());
236 assertEquals(true, metadata.isMultiValued(TikaCoreProperties.CREATOR));
237 assertEquals("xyz", metadata.getValues(TikaCoreProperties.CREATOR)[0]);
238 assertEquals("abc", metadata.getValues(TikaCoreProperties.CREATOR)[1]);
239 assertEquals(true, metadata.isMultiValued(Metadata.MESSAGE_FROM));
240 assertEquals("xyz", metadata.getValues(Metadata.MESSAGE_FROM)[0]);
241 assertEquals("abc", metadata.getValues(Metadata.MESSAGE_FROM)[1]);
242 assertEquals(true, metadata.isMultiValued(Metadata.MESSAGE_TO));
243 assertEquals("abc", metadata.getValues(Metadata.MESSAGE_TO)[0]);
244 assertEquals("def", metadata.getValues(Metadata.MESSAGE_TO)[1]);
245 assertEquals("abcd", metadata.get(TikaCoreProperties.TITLE));
246 assertEquals("abcd", metadata.get(Metadata.SUBJECT));
247 assertTrue(handler.toString().contains("bar biz bat"));
248 }
249
250 private static InputStream getStream(String name) {
251 return Thread.currentThread().getContextClassLoader()
252 .getResourceAsStream(name);
253 }
254
255 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.mbox;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.fail;
20 import static org.mockito.Matchers.any;
21 import static org.mockito.Matchers.eq;
22 import static org.mockito.Mockito.mock;
23 import static org.mockito.Mockito.times;
24 import static org.mockito.Mockito.verify;
25
26 import java.io.InputStream;
27
28 import org.apache.tika.metadata.Metadata;
29 import org.apache.tika.metadata.TikaCoreProperties;
30 import org.apache.tika.parser.ParseContext;
31 import org.apache.tika.parser.Parser;
32 import org.apache.tika.sax.XHTMLContentHandler;
33 import org.junit.Test;
34 import org.xml.sax.Attributes;
35 import org.xml.sax.ContentHandler;
36 import org.xml.sax.helpers.DefaultHandler;
37
38 public class MboxParserTest {
39
40 @Test
41 public void testSimple() {
42 Parser parser = new MboxParser();
43 Metadata metadata = new Metadata();
44 InputStream stream = getStream("test-documents/simple.mbox");
45 ContentHandler handler = mock(DefaultHandler.class);
46
47 try {
48 parser.parse(stream, handler, metadata, new ParseContext());
49 verify(handler).startDocument();
50 verify(handler, times(2)).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
51 verify(handler, times(2)).endElement(XHTMLContentHandler.XHTML, "p", "p");
52 verify(handler).characters(new String("Test content 1").toCharArray(), 0, 14);
53 verify(handler).characters(new String("Test content 2").toCharArray(), 0, 14);
54 verify(handler).endDocument();
55 } catch (Exception e) {
56 fail("Exception thrown: " + e.getMessage());
57 }
58 }
59
60 @Test
61 public void testHeaders() {
62 Parser parser = new MboxParser();
63 Metadata metadata = new Metadata();
64 InputStream stream = getStream("test-documents/headers.mbox");
65 ContentHandler handler = mock(DefaultHandler.class);
66
67 try {
68 parser.parse(stream, handler, metadata, new ParseContext());
69
70 verify(handler).startDocument();
71 verify(handler).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
72 verify(handler).characters(new String("Test content").toCharArray(), 0, 12);
73 verify(handler).endDocument();
74
75 assertEquals("subject", metadata.get(TikaCoreProperties.TITLE));
76 assertEquals("subject", metadata.get(Metadata.SUBJECT));
77 assertEquals("<author@domain.com>", metadata.get(Metadata.AUTHOR));
78 assertEquals("<author@domain.com>", metadata.get(TikaCoreProperties.CREATOR));
79 assertEquals(null, metadata.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
80 assertEquals("<name@domain.com>", metadata.get("MboxParser-return-path"));
81 assertEquals("Should be ISO date in UTC, converted from 'Tue, 9 Jun 2009 23:58:45 -0400'",
82 "2009-06-10T03:58:45Z", metadata.get(TikaCoreProperties.CREATED));
83 } catch (Exception e) {
84 fail("Exception thrown: " + e.getMessage());
85 }
86 }
87
88 @Test
89 public void testMultilineHeader() {
90 Parser parser = new MboxParser();
91 Metadata metadata = new Metadata();
92 InputStream stream = getStream("test-documents/multiline.mbox");
93 ContentHandler handler = mock(DefaultHandler.class);
94
95 try {
96 parser.parse(stream, handler, metadata, new ParseContext());
97
98 verify(handler).startDocument();
99 verify(handler).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
100 verify(handler).characters(new String("Test content").toCharArray(), 0, 12);
101 verify(handler).endDocument();
102
103 assertEquals("from xxx by xxx with xxx; date", metadata.get("MboxParser-received"));
104 } catch (Exception e) {
105 fail("Exception thrown: " + e.getMessage());
106 }
107 }
108
109 @Test
110 public void testQuoted() {
111 Parser parser = new MboxParser();
112 Metadata metadata = new Metadata();
113 InputStream stream = getStream("test-documents/quoted.mbox");
114 ContentHandler handler = mock(DefaultHandler.class);
115
116 try {
117 parser.parse(stream, handler, metadata, new ParseContext());
118
119 verify(handler).startDocument();
120 verify(handler).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
121 verify(handler).startElement(eq(XHTMLContentHandler.XHTML), eq("q"), eq("q"), any(Attributes.class));
122 verify(handler).endElement(eq(XHTMLContentHandler.XHTML), eq("q"), eq("q"));
123 verify(handler).endElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"));
124 verify(handler).characters(new String("Test content").toCharArray(), 0, 12);
125 verify(handler).characters(new String("> quoted stuff").toCharArray(), 0, 14);
126 verify(handler).endDocument();
127 } catch (Exception e) {
128 fail("Exception thrown: " + e.getMessage());
129 }
130 }
131
132 @Test
133 public void testComplex() {
134 Parser parser = new MboxParser();
135 Metadata metadata = new Metadata();
136 InputStream stream = getStream("test-documents/complex.mbox");
137 ContentHandler handler = mock(DefaultHandler.class);
138
139 try {
140 parser.parse(stream, handler, metadata, new ParseContext());
141
142 // TODO: Remove subject and author in Tika 2.0
143 assertEquals("Re: question about when shuffle/sort start working", metadata.get(Metadata.SUBJECT));
144 assertEquals("Re: question about when shuffle/sort start working", metadata.get(TikaCoreProperties.TITLE));
145 assertEquals("Jothi Padmanabhan <jothipn@yahoo-inc.com>", metadata.get(Metadata.AUTHOR));
146 assertEquals("Jothi Padmanabhan <jothipn@yahoo-inc.com>", metadata.get(TikaCoreProperties.CREATOR));
147 assertEquals("core-user@hadoop.apache.org", metadata.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
148
149 verify(handler).startDocument();
150 verify(handler, times(3)).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
151 verify(handler, times(3)).endElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"));
152 verify(handler, times(3)).startElement(eq(XHTMLContentHandler.XHTML), eq("q"), eq("q"), any(Attributes.class));
153 verify(handler, times(3)).endElement(eq(XHTMLContentHandler.XHTML), eq("q"), eq("q"));
154 verify(handler).endDocument();
155 } catch (Exception e) {
156 fail("Exception thrown: " + e.getMessage());
157 }
158 }
159
160 private static InputStream getStream(String name) {
161 return Thread.currentThread().getContextClassLoader()
162 .getResourceAsStream(name);
163 }
164
165
166 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertNotNull;
20
21 import java.io.InputStream;
22 import java.net.URL;
23 import java.util.ArrayList;
24 import java.util.List;
25
26 import org.apache.tika.extractor.ContainerExtractor;
27 import org.apache.tika.extractor.EmbeddedResourceHandler;
28 import org.apache.tika.io.TikaInputStream;
29 import org.apache.tika.mime.MediaType;
30
31 /**
32 * Parent class of tests that the various POI powered parsers are
33 * able to extract their embedded contents.
34 */
35 public abstract class AbstractPOIContainerExtractionTest {
36 public static final MediaType TYPE_DOC = MediaType.application("msword");
37 public static final MediaType TYPE_PPT = MediaType.application("vnd.ms-powerpoint");
38 public static final MediaType TYPE_XLS = MediaType.application("vnd.ms-excel");
39 public static final MediaType TYPE_DOCX = MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document");
40 public static final MediaType TYPE_PPTX = MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation");
41 public static final MediaType TYPE_XLSX = MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet");
42 public static final MediaType TYPE_MSG = MediaType.application("vnd.ms-outlook");
43
44 public static final MediaType TYPE_TXT = MediaType.text("plain");
45 public static final MediaType TYPE_PDF = MediaType.application("pdf");
46
47 public static final MediaType TYPE_JPG = MediaType.image("jpeg");
48 public static final MediaType TYPE_GIF = MediaType.image("gif");
49 public static final MediaType TYPE_PNG = MediaType.image("png");
50 public static final MediaType TYPE_EMF = MediaType.application("x-emf");
51 public static final MediaType TYPE_WMF = MediaType.application("x-msmetafile");
52
53 protected TrackingHandler process(String filename, ContainerExtractor extractor, boolean recurse) throws Exception {
54 TikaInputStream stream = getTestFile(filename);
55 try {
56 assertEquals(true, extractor.isSupported(stream));
57
58 // Process it
59 TrackingHandler handler = new TrackingHandler();
60 if(recurse) {
61 extractor.extract(stream, extractor, handler);
62 } else {
63 extractor.extract(stream, null, handler);
64 }
65
66 // So they can check what happened
67 return handler;
68 } finally {
69 stream.close();
70 }
71 }
72
73 protected TikaInputStream getTestFile(String filename) throws Exception {
74 URL input = AbstractPOIContainerExtractionTest.class.getResource(
75 "/test-documents/" + filename);
76 assertNotNull(filename + " not found", input);
77
78 return TikaInputStream.get(input);
79 }
80
81 public static class TrackingHandler implements EmbeddedResourceHandler {
82 public List<String> filenames = new ArrayList<String>();
83 public List<MediaType> mediaTypes = new ArrayList<MediaType>();
84
85 public void handle(String filename, MediaType mediaType,
86 InputStream stream) {
87 filenames.add(filename);
88 mediaTypes.add(mediaType);
89 }
90 }
91 }
0 /**
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertFalse;
20 import static org.junit.Assert.assertTrue;
21
22 import java.io.InputStream;
23 import java.util.Locale;
24
25 import org.apache.tika.detect.DefaultDetector;
26 import org.apache.tika.detect.Detector;
27 import org.apache.tika.metadata.Metadata;
28 import org.apache.tika.metadata.OfficeOpenXMLExtended;
29 import org.apache.tika.metadata.TikaCoreProperties;
30 import org.apache.tika.mime.MediaType;
31 import org.apache.tika.parser.AutoDetectParser;
32 import org.apache.tika.parser.ParseContext;
33 import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
34 import org.apache.tika.sax.BodyContentHandler;
35 import org.junit.Test;
36 import org.xml.sax.ContentHandler;
37
38 public class ExcelParserTest {
39
40 @Test
41 public void testExcelParser() throws Exception {
42 InputStream input = ExcelParserTest.class.getResourceAsStream(
43 "/test-documents/testEXCEL.xls");
44 try {
45 Metadata metadata = new Metadata();
46 ContentHandler handler = new BodyContentHandler();
47 ParseContext context = new ParseContext();
48 context.set(Locale.class, Locale.US);
49 new OfficeParser().parse(input, handler, metadata, context);
50
51 assertEquals(
52 "application/vnd.ms-excel",
53 metadata.get(Metadata.CONTENT_TYPE));
54 assertEquals("Simple Excel document", metadata.get(TikaCoreProperties.TITLE));
55 assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
56 assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
57
58 // Mon Oct 01 17:13:56 BST 2007
59 assertEquals("2007-10-01T16:13:56Z", metadata.get(TikaCoreProperties.CREATED));
60 assertEquals("2007-10-01T16:13:56Z", metadata.get(Metadata.CREATION_DATE));
61
62 // Mon Oct 01 17:31:43 BST 2007
63 assertEquals("2007-10-01T16:31:43Z", metadata.get(TikaCoreProperties.MODIFIED));
64 assertEquals("2007-10-01T16:31:43Z", metadata.get(Metadata.DATE));
65
66 String content = handler.toString();
67 assertTrue(content.contains("Sample Excel Worksheet"));
68 assertTrue(content.contains("Numbers and their Squares"));
69 assertTrue(content.contains("\t\tNumber\tSquare"));
70 assertTrue(content.contains("9"));
71 assertFalse(content.contains("9.0"));
72 assertTrue(content.contains("196"));
73 assertFalse(content.contains("196.0"));
74 } finally {
75 input.close();
76 }
77 }
78
79 @Test
80 public void testExcelParserFormatting() throws Exception {
81 InputStream input = ExcelParserTest.class.getResourceAsStream(
82 "/test-documents/testEXCEL-formats.xls");
83 try {
84 Metadata metadata = new Metadata();
85 ParseContext context = new ParseContext();
86 context.set(Locale.class, Locale.US);
87 ContentHandler handler = new BodyContentHandler();
88 new OfficeParser().parse(input, handler, metadata, context);
89
90 assertEquals(
91 "application/vnd.ms-excel",
92 metadata.get(Metadata.CONTENT_TYPE));
93
94 String content = handler.toString();
95
96 // Number #,##0.00
97 assertTrue(content.contains("1,599.99"));
98 assertTrue(content.contains("-1,599.99"));
99
100 // Currency $#,##0.00;[Red]($#,##0.00)
101 assertTrue(content.contains("$1,599.99"));
102 assertTrue(content.contains("($1,599.99)"));
103
104 // Scientific 0.00E+00
105 // poi <=3.8beta1 returns 1.98E08, newer versions return 1.98+E08
106 assertTrue(content.contains("1.98E08") || content.contains("1.98E+08"));
107 assertTrue(content.contains("-1.98E08") || content.contains("-1.98E+08"));
108
109 // Percentage.
110 assertTrue(content.contains("2.50%"));
111 // Excel rounds up to 3%, but that requires Java 1.6 or later
112 if(System.getProperty("java.version").startsWith("1.5")) {
113 assertTrue(content.contains("2%"));
114 } else {
115 assertTrue(content.contains("3%"));
116 }
117
118 // Time Format: h:mm
119 assertTrue(content.contains("6:15"));
120 assertTrue(content.contains("18:15"));
121
122 // Date Format: d-mmm-yy
123 assertTrue(content.contains("17-May-07"));
124
125 // Date Format: m/d/yy
126 assertTrue(content.contains("10/3/09"));
127
128 // Date/Time Format: m/d/yy h:mm
129 assertTrue(content.contains("1/19/08 4:35"));
130
131
132 // Below assertions represent outstanding formatting issues to be addressed
133 // they are included to allow the issues to be progressed with the Apache POI
134 // team - See TIKA-103.
135
136 /*************************************************************************
137 // Custom Number (0 "dollars and" .00 "cents")
138 assertTrue(content.contains("19 dollars and .99 cents"));
139
140 // Custom Number ("At" h:mm AM/PM "on" dddd mmmm d"," yyyy)
141 assertTrue(content.contains("At 4:20 AM on Thursday May 17, 2007"));
142
143 // Fraction (2.5): # ?/? (TODO Coming in POI 3.8 beta 6)
144 assertTrue(content.contains("2 1 / 2"));
145 **************************************************************************/
146
147 } finally {
148 input.close();
149 }
150 }
151
152 /**
153 * TIKA-214 - Ensure we extract labels etc from Charts
154 */
155 @Test
156 public void testExcelParserCharts() throws Exception {
157 InputStream input = ExcelParserTest.class.getResourceAsStream(
158 "/test-documents/testEXCEL-charts.xls");
159 try {
160 Metadata metadata = new Metadata();
161 ParseContext context = new ParseContext();
162 context.set(Locale.class, Locale.US);
163 ContentHandler handler = new BodyContentHandler();
164 new OfficeParser().parse(input, handler, metadata, context);
165
166 assertEquals(
167 "application/vnd.ms-excel",
168 metadata.get(Metadata.CONTENT_TYPE));
169
170 String content = handler.toString();
171
172 // The first sheet has a pie chart
173 assertTrue(content.contains("charttabyodawg"));
174 assertTrue(content.contains("WhamPuff"));
175
176 // The second sheet has a bar chart and some text
177 assertTrue(content.contains("Sheet1"));
178 assertTrue(content.contains("Test Excel Spreasheet"));
179 assertTrue(content.contains("foo"));
180 assertTrue(content.contains("bar"));
181 assertTrue(content.contains("fizzlepuff"));
182 assertTrue(content.contains("whyaxis"));
183 assertTrue(content.contains("eksaxis"));
184
185 // The third sheet has some text
186 assertTrue(content.contains("Sheet2"));
187 assertTrue(content.contains("dingdong"));
188 } finally {
189 input.close();
190 }
191 }
192
193 @Test
194 public void testJXL() throws Exception {
195 InputStream input = ExcelParserTest.class.getResourceAsStream(
196 "/test-documents/jxl.xls");
197 try {
198 Metadata metadata = new Metadata();
199 ContentHandler handler = new BodyContentHandler(-1);
200 ParseContext context = new ParseContext();
201 context.set(Locale.class, Locale.US);
202 new OfficeParser().parse(input, handler, metadata, context);
203
204 assertEquals(
205 "application/vnd.ms-excel",
206 metadata.get(Metadata.CONTENT_TYPE));
207 String content = handler.toString();
208 assertTrue(content.contains("Number Formats"));
209 } finally {
210 input.close();
211 }
212 }
213
214 @Test
215 public void testWorksSpreadsheet70() throws Exception {
216 InputStream input = ExcelParserTest.class.getResourceAsStream(
217 "/test-documents/testWORKSSpreadsheet7.0.xlr");
218 try {
219 Metadata metadata = new Metadata();
220 ContentHandler handler = new BodyContentHandler(-1);
221 ParseContext context = new ParseContext();
222 context.set(Locale.class, Locale.US);
223 new OfficeParser().parse(input, handler, metadata, context);
224
225 String content = handler.toString();
226 assertTrue(content.contains("Microsoft Works"));
227 } finally {
228 input.close();
229 }
230 }
231
232 /**
233 * We don't currently support the .xlsb file format
234 * (an OOXML container with binary blobs), but we
235 * shouldn't break on these files either (TIKA-826)
236 */
237 @Test
238 public void testExcelXLSB() throws Exception {
239 Detector detector = new DefaultDetector();
240 AutoDetectParser parser = new AutoDetectParser();
241
242 InputStream input = ExcelParserTest.class.getResourceAsStream(
243 "/test-documents/testEXCEL.xlsb");
244 Metadata m = new Metadata();
245 m.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb");
246
247 // Should be detected correctly
248 MediaType type = null;
249 try {
250 type = detector.detect(input, m);
251 assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", type.toString());
252 } finally {
253 input.close();
254 }
255
256 // OfficeParser won't handle it
257 assertEquals(false, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type));
258
259 // OOXMLParser won't handle it
260 assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));
261
262 // AutoDetectParser doesn't break on it
263 input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL.xlsb");
264
265 try {
266 ContentHandler handler = new BodyContentHandler(-1);
267 ParseContext context = new ParseContext();
268 context.set(Locale.class, Locale.US);
269 parser.parse(input, handler, m, context);
270
271 String content = handler.toString();
272 assertEquals("", content);
273 } finally {
274 input.close();
275 }
276 }
277
278 /**
279 * We don't currently support the old Excel 95 .xls file format,
280 * but we shouldn't break on these files either (TIKA-976)
281 */
282 @Test
283 public void testExcel95() throws Exception {
284 Detector detector = new DefaultDetector();
285 AutoDetectParser parser = new AutoDetectParser();
286
287 InputStream input = ExcelParserTest.class.getResourceAsStream(
288 "/test-documents/testEXCEL_95.xls");
289 Metadata m = new Metadata();
290 m.add(Metadata.RESOURCE_NAME_KEY, "excel_95.xls");
291
292 // Should be detected correctly
293 MediaType type = null;
294 try {
295 type = detector.detect(input, m);
296 assertEquals("application/vnd.ms-excel", type.toString());
297 } finally {
298 input.close();
299 }
300
301 // OfficeParser will claim to handle it
302 assertEquals(true, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type));
303
304 // OOXMLParser won't handle it
305 assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));
306
307 // AutoDetectParser doesn't break on it
308 input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_95.xls");
309
310 try {
311 ContentHandler handler = new BodyContentHandler(-1);
312 ParseContext context = new ParseContext();
313 context.set(Locale.class, Locale.US);
314 parser.parse(input, handler, m, context);
315
316 String content = handler.toString();
317 assertEquals("", content);
318 } finally {
319 input.close();
320 }
321 }
322
323 /**
324 * Ensures that custom OLE2 (HPSF) properties are extracted
325 */
326 @Test
327 public void testCustomProperties() throws Exception {
328 InputStream input = ExcelParserTest.class.getResourceAsStream(
329 "/test-documents/testEXCEL_custom_props.xls");
330 Metadata metadata = new Metadata();
331
332 try {
333 ContentHandler handler = new BodyContentHandler(-1);
334 ParseContext context = new ParseContext();
335 context.set(Locale.class, Locale.US);
336 new OfficeParser().parse(input, handler, metadata, context);
337 } finally {
338 input.close();
339 }
340
341 assertEquals("application/vnd.ms-excel", metadata.get(Metadata.CONTENT_TYPE));
342 assertEquals("", metadata.get(TikaCoreProperties.CREATOR));
343 assertEquals("", metadata.get(TikaCoreProperties.MODIFIER));
344 assertEquals("2011-08-22T13:45:54Z", metadata.get(TikaCoreProperties.MODIFIED));
345 assertEquals("2006-09-12T15:06:44Z", metadata.get(TikaCoreProperties.CREATED));
346 assertEquals("Microsoft Excel", metadata.get(OfficeOpenXMLExtended.APPLICATION));
347 assertEquals("true", metadata.get("custom:myCustomBoolean"));
348 assertEquals("3", metadata.get("custom:myCustomNumber"));
349 assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
350 assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
351 assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
352 }
353 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertFalse;
20 import static org.junit.Assert.assertTrue;
21
22 import java.io.InputStream;
23 import java.io.StringWriter;
24 import java.util.regex.Matcher;
25 import java.util.regex.Pattern;
26
27 import javax.xml.transform.OutputKeys;
28 import javax.xml.transform.sax.SAXTransformerFactory;
29 import javax.xml.transform.sax.TransformerHandler;
30 import javax.xml.transform.stream.StreamResult;
31
32 import org.apache.tika.metadata.Metadata;
33 import org.apache.tika.metadata.TikaCoreProperties;
34 import org.apache.tika.parser.AutoDetectParser;
35 import org.apache.tika.parser.ParseContext;
36 import org.apache.tika.parser.Parser;
37 import org.apache.tika.sax.BodyContentHandler;
38 import org.junit.Test;
39 import org.xml.sax.ContentHandler;
40
41 /**
42 * Test case for parsing Outlook files.
43 */
44 public class OutlookParserTest {
45
46 @Test
47 public void testOutlookParsing() throws Exception {
48 Parser parser = new AutoDetectParser(); // Should auto-detect!
49 ContentHandler handler = new BodyContentHandler();
50 Metadata metadata = new Metadata();
51
52 InputStream stream = OutlookParserTest.class.getResourceAsStream(
53 "/test-documents/test-outlook.msg");
54 try {
55 parser.parse(stream, handler, metadata, new ParseContext());
56 } finally {
57 stream.close();
58 }
59
60 assertEquals(
61 "application/vnd.ms-outlook",
62 metadata.get(Metadata.CONTENT_TYPE));
63 assertEquals(
64 "Microsoft Outlook Express 6",
65 metadata.get(TikaCoreProperties.TITLE));
66 assertEquals(
67 "Nouvel utilisateur de Outlook Express",
68 metadata.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
69 assertEquals(
70 "L'\u00C9quipe Microsoft Outlook Express",
71 metadata.get(TikaCoreProperties.CREATOR));
72 assertEquals(
73 "L'\u00C9quipe Microsoft Outlook Express",
74 metadata.get(Metadata.AUTHOR));
75
76 // Stored as Thu, 5 Apr 2007 09:26:06 -0700
77 assertEquals(
78 "2007-04-05T16:26:06Z",
79 metadata.get(TikaCoreProperties.CREATED));
80
81 String content = handler.toString();
82 assertTrue(content.contains(""));
83 assertTrue(content.contains("Microsoft Outlook Express 6"));
84 assertTrue(content.contains("L'\u00C9quipe Microsoft Outlook Express"));
85 assertTrue(content.contains("Nouvel utilisateur de Outlook Express"));
86 assertTrue(content.contains("Messagerie et groupes de discussion"));
87 }
88
89 /**
90 * Test case for TIKA-197
91 *
92 * @see <a href="https://issues.apache.org/jira/browse/TIKA-197">TIKA-197</a>
93 */
94 @Test
95 public void testMultipleCopies() throws Exception {
96 Parser parser = new AutoDetectParser();
97 ContentHandler handler = new BodyContentHandler();
98 Metadata metadata = new Metadata();
99
100 InputStream stream = OutlookParserTest.class.getResourceAsStream(
101 "/test-documents/testMSG.msg");
102 try {
103 parser.parse(stream, handler, metadata, new ParseContext());
104 } finally {
105 stream.close();
106 }
107
108 assertEquals(
109 "application/vnd.ms-outlook",
110 metadata.get(Metadata.CONTENT_TYPE));
111
112 String content = handler.toString();
113 Pattern pattern = Pattern.compile("From");
114 Matcher matcher = pattern.matcher(content);
115 assertTrue(matcher.find());
116 assertFalse(matcher.find());
117 }
118
119 /**
120 * Test case for TIKA-395, to ensure parser works for new Outlook formats.
121 *
122 * @see <a href="https://issues.apache.org/jira/browse/TIKA-395">TIKA-395</a>
123 */
124 @Test
125 public void testOutlookNew() throws Exception {
126 Parser parser = new AutoDetectParser();
127 ContentHandler handler = new BodyContentHandler();
128 Metadata metadata = new Metadata();
129
130 InputStream stream = OutlookParserTest.class.getResourceAsStream(
131 "/test-documents/test-outlook2003.msg");
132 try {
133 parser.parse(stream, handler, metadata, new ParseContext());
134 } finally {
135 stream.close();
136 }
137
138 assertEquals(
139 "application/vnd.ms-outlook",
140 metadata.get(Metadata.CONTENT_TYPE));
141 assertEquals(
142 "Welcome to Microsoft Office Outlook 2003",
143 metadata.get(TikaCoreProperties.TITLE));
144
145 String content = handler.toString();
146 assertTrue(content.contains("Outlook 2003"));
147 assertTrue(content.contains("Streamlined Mail Experience"));
148 assertTrue(content.contains("Navigation Pane"));
149 }
150
151 @Test
152 public void testOutlookHTMLVersion() throws Exception {
153 Parser parser = new AutoDetectParser();
154 Metadata metadata = new Metadata();
155
156 // Check the HTML version
157 StringWriter sw = new StringWriter();
158 SAXTransformerFactory factory = (SAXTransformerFactory)
159 SAXTransformerFactory.newInstance();
160 TransformerHandler handler = factory.newTransformerHandler();
161 handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
162 handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
163 handler.setResult(new StreamResult(sw));
164
165 InputStream stream = OutlookParserTest.class.getResourceAsStream(
166 "/test-documents/testMSG_chinese.msg");
167 try {
168 parser.parse(stream, handler, metadata, new ParseContext());
169 } finally {
170 stream.close();
171 }
172
173 // As the HTML version should have been processed, ensure
174 // we got some of the links
175 String content = sw.toString();
176 assertTrue(content.contains("<dd>tests.chang@fengttt.com</dd>"));
177 assertTrue(content.contains("<p>Alfresco MSG format testing"));
178 assertTrue(content.contains("<li>1"));
179 assertTrue(content.contains("<li>2"));
180
181 // Make sure we don't have nested html docs
182 assertEquals(2, content.split("<body>").length);
183 assertEquals(2, content.split("<\\/body>").length);
184 }
185
186 @Test
187 public void testOutlookForwarded() throws Exception {
188 Parser parser = new AutoDetectParser();
189 Metadata metadata = new Metadata();
190
191 // Check the HTML version
192 StringWriter sw = new StringWriter();
193 SAXTransformerFactory factory = (SAXTransformerFactory)
194 SAXTransformerFactory.newInstance();
195 TransformerHandler handler = factory.newTransformerHandler();
196 handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
197 handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
198 handler.setResult(new StreamResult(sw));
199
200 InputStream stream = OutlookParserTest.class.getResourceAsStream(
201 "/test-documents/testMSG_forwarded.msg");
202 try {
203 parser.parse(stream, handler, metadata, new ParseContext());
204 } finally {
205 stream.close();
206 }
207
208 // Make sure we don't have nested docs
209 String content = sw.toString();
210 assertEquals(2, content.split("<body>").length);
211 assertEquals(2, content.split("<\\/body>").length);
212 }
213
214 @Test
215 public void testOutlookHTMLfromRTF() throws Exception {
216 Parser parser = new AutoDetectParser();
217 Metadata metadata = new Metadata();
218
219 // Check the HTML version
220 StringWriter sw = new StringWriter();
221 SAXTransformerFactory factory = (SAXTransformerFactory)
222 SAXTransformerFactory.newInstance();
223 TransformerHandler handler = factory.newTransformerHandler();
224 handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
225 handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
226 handler.setResult(new StreamResult(sw));
227
228 InputStream stream = OutlookParserTest.class.getResourceAsStream(
229 "/test-documents/test-outlook2003.msg");
230 try {
231 parser.parse(stream, handler, metadata, new ParseContext());
232 } finally {
233 stream.close();
234 }
235
236 // As the HTML version should have been processed, ensure
237 // we got some of the links
238 String content = sw.toString().replaceAll("<p>\\s+","<p>");
239 assertTrue(content.contains("<dd>New Outlook User</dd>"));
240 assertTrue(content.contains("designed <i>to help you"));
241 assertTrue(content.contains("<p><a href=\"http://r.office.microsoft.com/r/rlidOutlookWelcomeMail10?clid=1033\">Cached Exchange Mode</a>"));
242
243 // Link - check text around it, and the link itself
244 assertTrue(content.contains("sign up for a free subscription"));
245 assertTrue(content.contains("Office Newsletter"));
246 assertTrue(content.contains("newsletter will be sent to you"));
247 assertTrue(content.contains("http://r.office.microsoft.com/r/rlidNewsletterSignUp?clid=1033"));
248
249 // Make sure we don't have nested html docs
250 assertEquals(2, content.split("<body>").length);
251 //assertEquals(2, content.split("<\\/body>").length); // TODO Fix
252 }
253 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertNull;
20 import static org.junit.Assert.assertTrue;
21
22 import org.apache.tika.extractor.ContainerExtractor;
23 import org.apache.tika.extractor.ParserContainerExtractor;
24 import org.apache.tika.mime.MediaType;
25 import org.junit.Test;
26
27 /**
28 * Tests that the various POI powered parsers are
29 * able to extract their embedded contents.
30 */
31 public class POIContainerExtractionTest extends AbstractPOIContainerExtractionTest {
32
33 /**
34 * For office files which don't have anything embedded in them
35 */
36 @Test
37 public void testWithoutEmbedded() throws Exception {
38 ContainerExtractor extractor = new ParserContainerExtractor();
39
40 String[] files = new String[] {
41 "testEXCEL.xls", "testWORD.doc", "testPPT.ppt",
42 "testVISIO.vsd", "test-outlook.msg"
43 };
44 for(String file : files) {
45 // Process it without recursing
46 TrackingHandler handler = process(file, extractor, false);
47
48 // Won't have fired
49 assertEquals(0, handler.filenames.size());
50 assertEquals(0, handler.mediaTypes.size());
51
52 // Ditto with recursing
53 handler = process(file, extractor, true);
54 assertEquals(0, handler.filenames.size());
55 assertEquals(0, handler.mediaTypes.size());
56 }
57 }
58
59 /**
60 * Office files with embedded images, but no other
61 * office files in them
62 */
63 @Test
64 public void testEmbeddedImages() throws Exception {
65 ContainerExtractor extractor = new ParserContainerExtractor();
66 TrackingHandler handler;
67
68 // Excel with 1 image
69 handler = process("testEXCEL_1img.xls", extractor, false);
70 assertEquals(1, handler.filenames.size());
71 assertEquals(1, handler.mediaTypes.size());
72
73 assertEquals(null, handler.filenames.get(0));
74 assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
75
76
77 // PowerPoint with 2 images + sound
78 // TODO
79
80
81 // Word with 1 image
82 handler = process("testWORD_1img.doc", extractor, false);
83 assertEquals(1, handler.filenames.size());
84 assertEquals(1, handler.mediaTypes.size());
85
86 assertEquals("image1.png", handler.filenames.get(0));
87 assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
88
89
90 // Word with 3 images
91 handler = process("testWORD_3imgs.doc", extractor, false);
92 assertEquals(3, handler.filenames.size());
93 assertEquals(3, handler.mediaTypes.size());
94
95 assertEquals("image1.png", handler.filenames.get(0));
96 assertEquals("image2.jpg", handler.filenames.get(1));
97 assertEquals("image3.png", handler.filenames.get(2));
98 assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
99 assertEquals(TYPE_JPG, handler.mediaTypes.get(1));
100 assertEquals(TYPE_PNG, handler.mediaTypes.get(2));
101 }
102
103 /**
104 * Office files which have other office files
105 * embedded into them. The embedded office files
106 * will sometimes have images in them.
107 *
108 * eg xls
109 * -> word
110 * -> image
111 * -> image
112 * -> powerpoint
113 * -> excel
114 * -> image
115 */
116 @Test
117 public void testEmbeddedOfficeFiles() throws Exception {
118 ContainerExtractor extractor = new ParserContainerExtractor();
119 TrackingHandler handler;
120
121
122 // Excel with a word doc and a powerpoint doc, both of which have images in them
123 // Without recursion, should see both documents + the images
124 handler = process("testEXCEL_embeded.xls", extractor, false);
125 assertEquals(5, handler.filenames.size());
126 assertEquals(5, handler.mediaTypes.size());
127
128 // We don't know their filenames
129 assertEquals(null, handler.filenames.get(0));
130 assertEquals(null, handler.filenames.get(1));
131 assertEquals(null, handler.filenames.get(2));
132 assertEquals("MBD0003271D.ppt", handler.filenames.get(3));
133 assertEquals("MBD00032A24.doc", handler.filenames.get(4));
134 // But we do know their types
135 assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc
136 assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded office doc
137 assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image
138 assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embedded office doc
139 assertEquals(TYPE_DOC, handler.mediaTypes.get(4)); // Embedded office doc
140
141
142 // With recursion, should get the images embedded in the office files too
143 handler = process("testEXCEL_embeded.xls", extractor, true);
144 assertEquals(17, handler.filenames.size());
145 assertEquals(17, handler.mediaTypes.size());
146
147 assertEquals(null, handler.filenames.get(0));
148 assertEquals(null, handler.filenames.get(1));
149 assertEquals(null, handler.filenames.get(2));
150 assertEquals("MBD0003271D.ppt", handler.filenames.get(3));
151 assertEquals("1", handler.filenames.get(4));
152 assertEquals(null, handler.filenames.get(5));
153 assertEquals("2", handler.filenames.get(6));
154 assertEquals("image1.png", handler.filenames.get(7));
155 assertEquals("image2.jpg", handler.filenames.get(8));
156 assertEquals("image3.png", handler.filenames.get(9));
157 assertEquals("image1.png", handler.filenames.get(16));
158
159 assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc
160 assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded office doc
161 assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image
162 assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embedded presentation
163 assertEquals(TYPE_XLS, handler.mediaTypes.get(4)); // Embedded XLS
164 assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // Embedded image
165 assertEquals(TYPE_DOC, handler.mediaTypes.get(6)); // Embedded office doc
166 assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); // Embedded image
167 assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); // Embedded image
168 assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // Embedded image
169 assertEquals(TYPE_DOC, handler.mediaTypes.get(15)); // Embedded office doc
170 assertEquals(TYPE_PNG, handler.mediaTypes.get(16)); // Embedded image
171
172 // Word with .docx, powerpoint and excel
173 handler = process("testWORD_embeded.doc", extractor, false);
174 assertEquals(9, handler.filenames.size());
175 assertEquals(9, handler.mediaTypes.size());
176
177 // Filenames are a bit iffy...
178 // Should really be 3*embedded pictures then 3*icons then embedded docs
179 assertEquals("image1.emf", handler.filenames.get(0));
180 assertEquals("image4.png", handler.filenames.get(1));
181 assertEquals("image5.jpg", handler.filenames.get(2));
182 assertEquals("image6.png", handler.filenames.get(3));
183 assertEquals("image2.emf", handler.filenames.get(4));
184 assertEquals("image3.emf", handler.filenames.get(5));
185 assertEquals(null, handler.filenames.get(6));
186 assertEquals("_1345471035.ppt", handler.filenames.get(7));
187 assertEquals("_1345470949.xls", handler.filenames.get(8));
188
189 // But we do know their types
190 assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc?
191 assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // Embedded image - logo
192 assertEquals(TYPE_JPG, handler.mediaTypes.get(2)); // Embedded image - safe
193 assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image - try
194 assertEquals(TYPE_EMF, handler.mediaTypes.get(4)); // Icon of embedded office doc?
195 assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded office doc?
196 assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); // Embedded office doc
197 assertEquals(TYPE_PPT, handler.mediaTypes.get(7)); // Embedded office doc
198 assertEquals(TYPE_XLS, handler.mediaTypes.get(8)); // Embedded office doc
199
200
201 // With recursion, should get their images too
202 handler = process("testWORD_embeded.doc", extractor, true);
203 assertEquals(16, handler.filenames.size());
204 assertEquals(16, handler.mediaTypes.size());
205
206 // We don't know their filenames, except for doc images + docx
207 assertEquals("image1.emf", handler.filenames.get(0));
208 assertEquals("image4.png", handler.filenames.get(1));
209 assertEquals("image5.jpg", handler.filenames.get(2));
210 assertEquals("image6.png", handler.filenames.get(3));
211 assertEquals("image2.emf", handler.filenames.get(4));
212 assertEquals("image3.emf", handler.filenames.get(5));
213 assertEquals(null, handler.filenames.get(6));
214 assertEquals("image2.png", handler.filenames.get(7));
215 assertEquals("image3.jpeg", handler.filenames.get(8));
216 assertEquals("image4.png", handler.filenames.get(9));
217 for(int i=11; i<14; i++) {
218 assertNull(handler.filenames.get(i));
219 }
220 // But we do know their types
221 assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc
222 assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // Embedded image - logo
223 assertEquals(TYPE_JPG, handler.mediaTypes.get(2)); // Embedded image - safe
224 assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image - try
225 assertEquals(TYPE_EMF, handler.mediaTypes.get(4)); // Icon of embedded office doc
226 assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded office doc
227 assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); // Embedded office doc
228 assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); // PNG inside .docx
229 assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); // JPG inside .docx
230 assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // PNG inside .docx
231 assertEquals(TYPE_PPT, handler.mediaTypes.get(10)); // Embedded office doc
232 assertEquals(TYPE_XLS, handler.mediaTypes.get(14)); // Embedded office doc
233 assertEquals(TYPE_PNG, handler.mediaTypes.get(15)); // PNG inside .xls
234
235
236 // PowerPoint with excel and word
237 // TODO
238
239
240 // Word, with a non-office file (PDF)
241 handler = process("testWORD_embedded_pdf.doc", extractor, true);
242 assertEquals(2, handler.filenames.size());
243 assertEquals(2, handler.mediaTypes.size());
244
245 assertEquals("image1.emf", handler.filenames.get(0));
246 assertEquals("_1402837031.pdf", handler.filenames.get(1));
247
248 assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded pdf
249 assertEquals(TYPE_PDF, handler.mediaTypes.get(1)); // The embedded PDF itself
250
251
252
253 // Outlook with a text file and a word document
254 handler = process("testMSG_att_doc.msg", extractor, true);
255 assertEquals(2, handler.filenames.size());
256 assertEquals(2, handler.mediaTypes.size());
257
258 assertEquals("test-unicode.doc", handler.filenames.get(0));
259 assertEquals(TYPE_DOC, handler.mediaTypes.get(0));
260
261 assertEquals("pj1.txt", handler.filenames.get(1));
262 assertEquals(TYPE_TXT, handler.mediaTypes.get(1));
263
264
265 // Outlook with a pdf and another outlook message
266 handler = process("testMSG_att_msg.msg", extractor, true);
267 assertEquals(2, handler.filenames.size());
268 assertEquals(2, handler.mediaTypes.size());
269
270 assertEquals("__substg1.0_3701000D.msg", handler.filenames.get(0));
271 assertEquals(TYPE_MSG, handler.mediaTypes.get(0));
272
273 assertEquals("smbprn.00009008.KdcPjl.pdf", handler.filenames.get(1));
274 assertEquals(TYPE_PDF, handler.mediaTypes.get(1));
275 }
276
277 @Test
278 public void testEmbeddedOfficeFilesXML() throws Exception {
279 ContainerExtractor extractor = new ParserContainerExtractor();
280 TrackingHandler handler;
281
282 handler = process("EmbeddedDocument.docx", extractor, false);
283 assertTrue(handler.filenames.contains("Microsoft_Office_Excel_97-2003_Worksheet1.bin"));
284 assertEquals(2, handler.filenames.size());
285 }
286
287 @Test
288 public void testPowerpointImages() throws Exception {
289 ContainerExtractor extractor = new ParserContainerExtractor();
290 TrackingHandler handler;
291
292 handler = process("pictures.ppt", extractor, false);
293 assertTrue(handler.mediaTypes.contains(new MediaType("image", "jpeg")));
294 assertTrue(handler.mediaTypes.contains(new MediaType("image", "png")));
295 }
296 }
0 /**
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertTrue;
20
21 import java.io.InputStream;
22 import java.util.Locale;
23
24 import org.apache.tika.TikaTest;
25 import org.apache.tika.metadata.Metadata;
26 import org.apache.tika.metadata.Office;
27 import org.apache.tika.metadata.OfficeOpenXMLCore;
28 import org.apache.tika.metadata.TikaCoreProperties;
29 import org.apache.tika.parser.ParseContext;
30 import org.apache.tika.sax.BodyContentHandler;
31 import org.junit.Test;
32 import org.xml.sax.ContentHandler;
33
34 public class PowerPointParserTest extends TikaTest {
35
36 @Test
37 public void testPowerPointParser() throws Exception {
38 InputStream input = PowerPointParserTest.class.getResourceAsStream(
39 "/test-documents/testPPT.ppt");
40 try {
41 Metadata metadata = new Metadata();
42 ContentHandler handler = new BodyContentHandler();
43 new OfficeParser().parse(input, handler, metadata, new ParseContext());
44
45 assertEquals(
46 "application/vnd.ms-powerpoint",
47 metadata.get(Metadata.CONTENT_TYPE));
48 assertEquals("Sample Powerpoint Slide", metadata.get(TikaCoreProperties.TITLE));
49 assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
50 assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
51 String content = handler.toString();
52 assertTrue(content.contains("Sample Powerpoint Slide"));
53 assertTrue(content.contains("Powerpoint X for Mac"));
54 } finally {
55 input.close();
56 }
57 }
58
59 @Test
60 public void testVarious() throws Exception {
61 ContentHandler handler = new BodyContentHandler();
62 Metadata metadata = new Metadata();
63
64 InputStream stream = PowerPointParserTest.class.getResourceAsStream(
65 "/test-documents/testPPT_various.ppt");
66 try {
67 new OfficeParser().parse(stream, handler, metadata, new ParseContext());
68 } finally {
69 stream.close();
70 }
71
72 String content = handler.toString();
73 //content = content.replaceAll("\\s+"," ");
74 assertContains("Footnote appears here", content);
75 assertContains("This is a footnote.", content);
76 assertContains("This is the header text.", content);
77 assertContains("This is the footer text.", content);
78 assertContains("Here is a text box", content);
79 assertContains("Bold", content);
80 assertContains("italic", content);
81 assertContains("underline", content);
82 assertContains("superscript", content);
83 assertContains("subscript", content);
84 assertContains("Here is a citation:", content);
85 assertContains("Figure 1 This is a caption for Figure 1", content);
86 assertContains("(Kramer)", content);
87 assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+"," "));
88 assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+"," "));
89 assertContains("This is a hyperlink", content);
90 assertContains("Here is a list:", content);
91 for(int row=1;row<=3;row++) {
92 //assertContains("·\tBullet " + row, content);
93 //assertContains("\u00b7\tBullet " + row, content);
94 assertContains("Bullet " + row, content);
95 }
96 assertContains("Here is a numbered list:", content);
97 for(int row=1;row<=3;row++) {
98 //assertContains(row + ")\tNumber bullet " + row, content);
99 //assertContains(row + ") Number bullet " + row, content);
100 // TODO: OOXMLExtractor fails to number the bullets:
101 assertContains("Number bullet " + row, content);
102 }
103
104 for(int row=1;row<=2;row++) {
105 for(int col=1;col<=3;col++) {
106 // TODO Work out why the upgrade to POI 3.9 broke this test (table text)
107 // assertContains("Row " + row + " Col " + col, content);
108 }
109 }
110
111 assertContains("Keyword1 Keyword2", content);
112 assertEquals("Keyword1 Keyword2",
113 metadata.get(TikaCoreProperties.KEYWORDS));
114
115 assertContains("Subject is here", content);
116 assertEquals("Subject is here",
117 metadata.get(OfficeOpenXMLCore.SUBJECT));
118 // TODO: Remove subject in Tika 2.0
119 assertEquals("Subject is here",
120 metadata.get(Metadata.SUBJECT));
121
122 assertContains("Suddenly some Japanese text:", content);
123 // Special version of (GHQ)
124 assertContains("\uff08\uff27\uff28\uff31\uff09", content);
125 // 6 other characters
126 assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", content);
127
128 assertContains("And then some Gothic text:", content);
129 assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
130 }
131
132 @Test
133 public void testMasterFooter() throws Exception {
134 ContentHandler handler = new BodyContentHandler();
135 Metadata metadata = new Metadata();
136
137 InputStream stream = PowerPointParserTest.class.getResourceAsStream(
138 "/test-documents/testPPT_masterFooter.ppt");
139 try {
140 new OfficeParser().parse(stream, handler, metadata, new ParseContext());
141 } finally {
142 stream.close();
143 }
144
145 String content = handler.toString();
146 assertContains("Master footer is here", content);
147
148 // Make sure boilerplate text didn't come through:
149 assertEquals(-1, content.indexOf("Click to edit Master"));
150
151 //TIKA-1171
152 assertEquals(-1, content.indexOf("*"));
153 }
154
155 // TODO: once we fix TIKA-712, re-enable this
156 @Test
157 public void testMasterText() throws Exception {
158 ContentHandler handler = new BodyContentHandler();
159 Metadata metadata = new Metadata();
160
161 InputStream stream = PowerPointParserTest.class.getResourceAsStream(
162 "/test-documents/testPPT_masterText.ppt");
163 try {
164 new OfficeParser().parse(stream, handler, metadata, new ParseContext());
165 } finally {
166 stream.close();
167 }
168
169 String content = handler.toString();
170 assertContains("Text that I added to the master slide", content);
171
172 // Make sure boilerplate text didn't come through:
173 assertEquals(-1, content.indexOf("Click to edit Master"));
174
175 //TIKA-1171
176 assertEquals(-1, content.indexOf("*"));
177 }
178
179 // TODO: once we fix TIKA-712, re-enable this
180 @Test
181 public void testMasterText2() throws Exception {
182 ContentHandler handler = new BodyContentHandler();
183 Metadata metadata = new Metadata();
184
185 InputStream stream = PowerPointParserTest.class.getResourceAsStream(
186 "/test-documents/testPPT_masterText2.ppt");
187 try {
188 new OfficeParser().parse(stream, handler, metadata, new ParseContext());
189 } finally {
190 stream.close();
191 }
192
193 String content = handler.toString();
194 assertContains("Text that I added to the master slide", content);
195
196 // Make sure boilerplate text didn't come through:
197 assertEquals(-1, content.indexOf("Click to edit Master"));
198 //TIKA-1171
199 assertEquals(-1, content.indexOf("*"));
200 }
201
202 /**
203 * Ensures that custom OLE2 (HPSF) properties are extracted
204 */
205 @Test
206 public void testCustomProperties() throws Exception {
207 InputStream input = PowerPointParserTest.class.getResourceAsStream(
208 "/test-documents/testPPT_custom_props.ppt");
209 Metadata metadata = new Metadata();
210
211 try {
212 ContentHandler handler = new BodyContentHandler(-1);
213 ParseContext context = new ParseContext();
214 context.set(Locale.class, Locale.US);
215 new OfficeParser().parse(input, handler, metadata, context);
216 } finally {
217 input.close();
218 }
219
220 assertEquals("application/vnd.ms-powerpoint", metadata.get(Metadata.CONTENT_TYPE));
221 assertEquals("JOUVIN ETIENNE", metadata.get(TikaCoreProperties.CREATOR));
222 assertEquals("EJ04325S", metadata.get(TikaCoreProperties.MODIFIER));
223 assertEquals("EJ04325S", metadata.get(Metadata.LAST_AUTHOR));
224 assertEquals("2011-08-22T13:32:58Z", metadata.get(TikaCoreProperties.MODIFIED));
225 assertEquals("2011-08-22T13:32:58Z", metadata.get(Metadata.DATE));
226 assertEquals("2011-08-22T13:30:53Z", metadata.get(TikaCoreProperties.CREATED));
227 assertEquals("2011-08-22T13:30:53Z", metadata.get(Metadata.CREATION_DATE));
228 assertEquals("1", metadata.get(Office.SLIDE_COUNT));
229 assertEquals("3", metadata.get(Office.WORD_COUNT));
230 assertEquals("Test extraction properties pptx", metadata.get(TikaCoreProperties.TITLE));
231 assertEquals("true", metadata.get("custom:myCustomBoolean"));
232 assertEquals("3", metadata.get("custom:myCustomNumber"));
233 assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
234 assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
235 assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
236 }
237
238 // TIKA-1025
239 @Test
240 public void testEmbeddedPlacedholder() throws Exception {
241 XMLResult result = getXML("testPPT_embedded2.ppt");
242 assertContains("<div class=\"embedded\" id=\"1\" />", result.xml);
243 assertContains("<div class=\"embedded\" id=\"14\" />", result.xml);
244 }
245
246 // TIKA-817
247 @Test
248 public void testAutoDatePPT() throws Exception {
249 //decision was made in POI-52367 not to generate
250 //autodate automatically. For pptx, where value is stored,
251 //value is extracted. For ppt, however, no date is extracted.
252 XMLResult result = getXML("testPPT_autodate.ppt");
253 assertContains(
254 "<p class=\"slide-content\">Now<br />\n*<br />\n*<br />",
255 result.xml);
256 }
257 }
0 /**
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft;
17
18 import static org.junit.Assert.assertEquals;
19
20 import java.io.InputStream;
21
22 import org.apache.tika.metadata.Metadata;
23 import org.apache.tika.metadata.Office;
24 import org.apache.tika.metadata.OfficeOpenXMLCore;
25 import org.apache.tika.metadata.OfficeOpenXMLExtended;
26 import org.apache.tika.metadata.TikaCoreProperties;
27 import org.apache.tika.parser.ParseContext;
28 import org.apache.tika.sax.BodyContentHandler;
29 import org.junit.Test;
30 import org.xml.sax.ContentHandler;
31
32 /**
33 * Tests for Microsoft Project (MPP) Files.
34 *
35 * Note - we don't currently have a dedicated Project
36 * Parser, all we have is the common office metadata
37 */
38 public class ProjectParserTest {
39
40 @Test
41 public void testProject2003() throws Exception {
42 InputStream input = ProjectParserTest.class.getResourceAsStream(
43 "/test-documents/testPROJECT2003.mpp");
44 try {
45 doTestProject(input);
46 } finally {
47 input.close();
48 }
49 }
50
51 @Test
52 public void testProject2007() throws Exception {
53 InputStream input = ProjectParserTest.class.getResourceAsStream(
54 "/test-documents/testPROJECT2007.mpp");
55 try {
56 doTestProject(input);
57 } finally {
58 input.close();
59 }
60 }
61
62 private void doTestProject(InputStream input) throws Exception {
63 Metadata metadata = new Metadata();
64 ContentHandler handler = new BodyContentHandler();
65 new OfficeParser().parse(input, handler, metadata, new ParseContext());
66
67 assertEquals(
68 "application/vnd.ms-project",
69 metadata.get(Metadata.CONTENT_TYPE));
70
71 assertEquals("The quick brown fox jumps over the lazy dog", metadata.get(TikaCoreProperties.TITLE));
72 assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(OfficeOpenXMLCore.SUBJECT));
73 assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(Metadata.SUBJECT));
74 assertEquals("Nevin Nollop", metadata.get(TikaCoreProperties.CREATOR));
75 assertEquals("", metadata.get(TikaCoreProperties.MODIFIER));
76 assertEquals("Pangram, fox, dog", metadata.get(TikaCoreProperties.KEYWORDS));
77 assertEquals("Comment Vulpes vulpes comment", metadata.get(TikaCoreProperties.COMMENTS));
78
79 assertEquals("Category1", metadata.get(OfficeOpenXMLCore.CATEGORY));
80 assertEquals("Mr Burns", metadata.get(OfficeOpenXMLExtended.MANAGER));
81 assertEquals("CompanyA", metadata.get(OfficeOpenXMLExtended.COMPANY));
82
83 assertEquals("2011-11-24T10:58:00Z", metadata.get(TikaCoreProperties.CREATED));
84 assertEquals("2011-11-24T10:58:00Z", metadata.get(Metadata.CREATION_DATE));
85 assertEquals("2011-11-24T11:31:00Z", metadata.get(TikaCoreProperties.MODIFIED));
86 assertEquals("2011-11-24T11:31:00Z", metadata.get(Metadata.DATE));
87
88 // Custom Project metadata is present with prefix
89 assertEquals("0%", metadata.get("custom:% Complete"));
90 assertEquals("0%", metadata.get("custom:% Work Complete"));
91 assertEquals("\u00a3"+"0.00", metadata.get("custom:Cost"));
92 assertEquals("2d?", metadata.get("custom:Duration"));
93 assertEquals("16h", metadata.get("custom:Work"));
94
95 // Currently, we don't do textual contents of the file
96 String content = handler.toString();
97 assertEquals("", content);
98 }
99 }
0 /**
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertTrue;
20
21 import java.io.InputStream;
22
23 import org.apache.tika.metadata.Metadata;
24 import org.apache.tika.metadata.TikaCoreProperties;
25 import org.apache.tika.parser.ParseContext;
26 import org.apache.tika.sax.BodyContentHandler;
27 import org.junit.Test;
28 import org.xml.sax.ContentHandler;
29
30 public class PublisherParserTest {
31
32 @Test
33 public void testPublisherParser() throws Exception {
34 InputStream input = PublisherParserTest.class.getResourceAsStream(
35 "/test-documents/testPUBLISHER.pub");
36 try {
37 Metadata metadata = new Metadata();
38 ContentHandler handler = new BodyContentHandler();
39 new OfficeParser().parse(input, handler, metadata, new ParseContext());
40
41 assertEquals(
42 "application/x-mspublisher",
43 metadata.get(Metadata.CONTENT_TYPE));
44 assertEquals(null, metadata.get(TikaCoreProperties.TITLE));
45 assertEquals("Nick Burch", metadata.get(TikaCoreProperties.CREATOR));
46 assertEquals("Nick Burch", metadata.get(Metadata.AUTHOR));
47 String content = handler.toString();
48 assertTrue(content.contains("0123456789"));
49 assertTrue(content.contains("abcdef"));
50 } finally {
51 input.close();
52 }
53 }
54
55 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft;
17
18 import static org.junit.Assert.assertEquals;
19
20 import org.apache.tika.detect.DefaultDetector;
21 import org.apache.tika.detect.Detector;
22 import org.apache.tika.extractor.ContainerExtractor;
23 import org.apache.tika.extractor.ParserContainerExtractor;
24 import org.apache.tika.io.TikaInputStream;
25 import org.apache.tika.metadata.Metadata;
26 import org.apache.tika.metadata.TikaCoreProperties;
27 import org.apache.tika.mime.MediaType;
28 import org.apache.tika.parser.ParseContext;
29 import org.apache.tika.sax.BodyContentHandler;
30 import org.junit.Test;
31 import org.xml.sax.ContentHandler;
32
33 /**
34 * Tests for the TNEF (winmail.dat) parser
35 */
36 public class TNEFParserTest extends AbstractPOIContainerExtractionTest {
37 private static final String file = "testWINMAIL.dat";
38
39 @Test
40 public void testBasics() throws Exception {
41 TikaInputStream stream = getTestFile(file);
42 Detector detector = new DefaultDetector();
43 try {
44 assertEquals(
45 MediaType.application("vnd.ms-tnef"),
46 detector.detect(stream, new Metadata()));
47 } finally {
48 stream.close();
49 }
50 }
51
52 @Test
53 public void testMetadata() throws Exception {
54 TikaInputStream stream = getTestFile(file);
55
56 Metadata metadata = new Metadata();
57 ContentHandler handler = new BodyContentHandler();
58
59 TNEFParser tnef = new TNEFParser();
60 tnef.parse(stream, handler, metadata, new ParseContext());
61
62 assertEquals("This is a test message", metadata.get(TikaCoreProperties.TITLE));
63 assertEquals("This is a test message", metadata.get(Metadata.SUBJECT));
64 }
65
66 /**
67 * Check the Rtf and Attachments are returned
68 * as expected
69 */
70 @Test
71 public void testBodyAndAttachments() throws Exception {
72 ContainerExtractor extractor = new ParserContainerExtractor();
73
74 // Process it with recursing
75 // Will have the message body RTF and the attachments
76 TrackingHandler handler = process(file, extractor, true);
77 assertEquals(6, handler.filenames.size());
78 assertEquals(6, handler.mediaTypes.size());
79
80 // We know the filenames for all of them
81 assertEquals("message.rtf", handler.filenames.get(0));
82 assertEquals(MediaType.application("rtf"), handler.mediaTypes.get(0));
83
84 assertEquals("quick.doc", handler.filenames.get(1));
85 assertEquals(MediaType.application("msword"), handler.mediaTypes.get(1));
86
87 assertEquals("quick.html", handler.filenames.get(2));
88 assertEquals(MediaType.text("html"), handler.mediaTypes.get(2));
89
90 assertEquals("quick.pdf", handler.filenames.get(3));
91 assertEquals(MediaType.application("pdf"), handler.mediaTypes.get(3));
92
93 assertEquals("quick.txt", handler.filenames.get(4));
94 assertEquals(MediaType.text("plain"), handler.mediaTypes.get(4));
95
96 assertEquals("quick.xml", handler.filenames.get(5));
97 assertEquals(MediaType.application("xml"), handler.mediaTypes.get(5));
98 }
99 }
0 /**
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertTrue;
20
21 import java.io.InputStream;
22
23 import org.apache.tika.metadata.Metadata;
24 import org.apache.tika.metadata.TikaCoreProperties;
25 import org.apache.tika.parser.ParseContext;
26 import org.apache.tika.sax.BodyContentHandler;
27 import org.junit.Test;
28 import org.xml.sax.ContentHandler;
29
30 public class VisioParserTest {
31
32 @Test
33 public void testVisioParser() throws Exception {
34 InputStream input = VisioParserTest.class.getResourceAsStream(
35 "/test-documents/testVISIO.vsd");
36 try {
37 Metadata metadata = new Metadata();
38 ContentHandler handler = new BodyContentHandler();
39 new OfficeParser().parse(input, handler, metadata, new ParseContext());
40
41 assertEquals(
42 "application/vnd.visio",
43 metadata.get(Metadata.CONTENT_TYPE));
44 assertEquals("", metadata.get(TikaCoreProperties.TITLE));
45 assertEquals("Hogwarts", metadata.get(TikaCoreProperties.CREATOR));
46 String content = handler.toString();
47 assertTrue(content.contains("Some random text, on a page"));
48 } finally {
49 input.close();
50 }
51 }
52
53 }
0 /**
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertTrue;
20 import static org.junit.Assert.assertFalse;
21
22 import java.io.InputStream;
23 import java.util.Locale;
24
25 import org.apache.log4j.Level;
26 import org.apache.log4j.Logger;
27 import org.apache.tika.TikaTest;
28 import org.apache.tika.metadata.Metadata;
29 import org.apache.tika.metadata.Office;
30 import org.apache.tika.metadata.OfficeOpenXMLCore;
31 import org.apache.tika.metadata.OfficeOpenXMLExtended;
32 import org.apache.tika.metadata.TikaCoreProperties;
33 import org.apache.tika.parser.ParseContext;
34 import org.apache.tika.sax.BodyContentHandler;
35 import org.junit.Test;
36 import org.xml.sax.ContentHandler;
37
38 public class WordParserTest extends TikaTest {
39
40 @Test
41 public void testWordParser() throws Exception {
42 InputStream input = WordParserTest.class.getResourceAsStream(
43 "/test-documents/testWORD.doc");
44 try {
45 ContentHandler handler = new BodyContentHandler();
46 Metadata metadata = new Metadata();
47 new OfficeParser().parse(input, handler, metadata, new ParseContext());
48
49 assertEquals(
50 "application/msword",
51 metadata.get(Metadata.CONTENT_TYPE));
52 assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE));
53 assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
54 assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
55 assertTrue(handler.toString().contains("Sample Word Document"));
56 } finally {
57 input.close();
58 }
59 }
60
61 @Test
62 public void testWordWithWAV() throws Exception {
63 InputStream input = WordParserTest.class.getResourceAsStream(
64 "/test-documents/Doc1_ole.doc");
65 try {
66 ContentHandler handler = new BodyContentHandler();
67 Metadata metadata = new Metadata();
68 new OfficeParser().parse(input, handler, metadata, new ParseContext());
69
70 assertTrue(handler.toString().contains("MSj00974840000[1].wav"));
71 } finally {
72 input.close();
73 }
74 }
75
76 /**
77 * Test that the word converter is able to generate the
78 * correct HTML for the document
79 */
80 @Test
81 public void testWordHTML() throws Exception {
82
83 // Try with a document containing various tables and
84 // formattings
85 XMLResult result = getXML("testWORD.doc");
86 String xml = result.xml;
87 Metadata metadata = result.metadata;
88
89 assertEquals(
90 "application/msword",
91 metadata.get(Metadata.CONTENT_TYPE));
92 assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE));
93 assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
94 assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
95 assertTrue(xml.contains("Sample Word Document"));
96
97 // Check that custom headings came through
98 assertTrue(xml.contains("<h1 class=\"title\">"));
99 // Regular headings
100 assertTrue(xml.contains("<h1>Heading Level 1</h1>"));
101 assertTrue(xml.contains("<h3>Heading Level 3</h3>"));
102 // Bold and italic
103 assertTrue(xml.contains("<b>BOLD</b>"));
104 assertTrue(xml.contains("<i>ITALIC</i>"));
105 // Table
106 assertTrue(xml.contains("<table>"));
107 assertTrue(xml.contains("<td>"));
108 // TODO - Check for the nested table
109 // Links
110 assertTrue(xml.contains("<a href=\"http://tika.apache.org/\">Tika</a>"));
111 // Paragraphs with other styles
112 assertTrue(xml.contains("<p class=\"signature\">This one"));
113
114 // Try with a document that contains images
115 xml = getXML("testWORD_3imgs.doc").xml;
116
117 // Images 1-3
118 assertTrue("Image not found in:\n"+xml, xml.contains("src=\"embedded:image1.png\""));
119 assertTrue("Image not found in:\n"+xml, xml.contains("src=\"embedded:image2.jpg\""));
120 assertTrue("Image not found in:\n"+xml, xml.contains("src=\"embedded:image3.png\""));
121
122 // Text too
123 assertTrue(xml.contains("<p>The end!"));
124
125 // TIKA-692: test document containing multiple
126 // character runs within a bold tag:
127 xml = getXML("testWORD_bold_character_runs.doc").xml;
128
129 // Make sure bold text arrived as single
130 // contiguous string even though Word parser
131 // handled this as 3 character runs
132 assertTrue("Bold text wasn't contiguous: "+xml, xml.contains("F<b>oob</b>a<b>r</b>"));
133
134 // TIKA-692: test document containing multiple
135 // character runs within a bold tag:
136 xml = getXML("testWORD_bold_character_runs2.doc").xml;
137
138 // Make sure bold text arrived as single
139 // contiguous string even though Word parser
140 // handled this as 3 character runs
141 assertTrue("Bold text wasn't contiguous: "+xml, xml.contains("F<b>oob</b>a<b>r</b>"));
142 }
143
144 @Test
145 public void testEmbeddedNames() throws Exception {
146 String result = getXML("testWORD_embedded_pdf.doc").xml;
147
148 // Make sure the embedded div comes out after "Here
149 // is the pdf file" and before "Bye Bye":
150 int i = result.indexOf("Here is the pdf file:");
151 assertTrue(i != -1);
152 int j = result.indexOf("<div class=\"embedded\" id=\"_1402837031\" />");
153 assertTrue(j != -1);
154 int k = result.indexOf("Bye Bye");
155 assertTrue(k != -1);
156
157 assertTrue(i < j);
158 assertTrue(j < k);
159 }
160
161 // TIKA-982
162 @Test
163 public void testEmbeddedRTF() throws Exception {
164 String result = getXML("testWORD_embedded_rtf.doc").xml;
165 assertTrue(result.indexOf("<div class=\"embedded\" id=\"_1404039792\" />") != -1);
166 assertTrue(result.indexOf("_1404039792.rtf") != -1);
167 }
168
169 // TIKA-1019
170 @Test
171 public void testDocumentLink() throws Exception {
172 String result = getXML("testDocumentLink.doc").xml;
173 assertTrue(result.indexOf("<div class=\"embedded\" id=\"_1327495610\" />") != -1);
174 assertTrue(result.indexOf("_1327495610.unknown") != -1);
175 }
176
177 @Test
178 public void testWord6Parser() throws Exception {
179 InputStream input = WordParserTest.class.getResourceAsStream(
180 "/test-documents/testWORD6.doc");
181 try {
182 ContentHandler handler = new BodyContentHandler();
183 Metadata metadata = new Metadata();
184 new OfficeParser().parse(input, handler, metadata, new ParseContext());
185
186 assertEquals(
187 "application/msword",
188 metadata.get(Metadata.CONTENT_TYPE));
189 assertEquals("The quick brown fox jumps over the lazy dog", metadata.get(TikaCoreProperties.TITLE));
190 assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(OfficeOpenXMLCore.SUBJECT));
191 assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(Metadata.SUBJECT));
192 assertEquals("Nevin Nollop", metadata.get(TikaCoreProperties.CREATOR));
193 assertEquals("Nevin Nollop", metadata.get(Metadata.AUTHOR));
194 assertTrue(handler.toString().contains("The quick brown fox jumps over the lazy dog"));
195 } finally {
196 input.close();
197 }
198 }
199
200 @Test
201 public void testVarious() throws Exception {
202 ContentHandler handler = new BodyContentHandler();
203 Metadata metadata = new Metadata();
204
205 InputStream stream = WordParserTest.class.getResourceAsStream(
206 "/test-documents/testWORD_various.doc");
207 try {
208 new OfficeParser().parse(stream, handler, metadata, new ParseContext());
209 } finally {
210 stream.close();
211 }
212
213 String content = handler.toString();
214 //content = content.replaceAll("\\s+"," ");
215 assertContains("Footnote appears here", content);
216 assertContains("This is a footnote.", content);
217 assertContains("This is the header text.", content);
218 assertContains("This is the footer text.", content);
219 assertContains("Here is a text box", content);
220 assertContains("Bold", content);
221 assertContains("italic", content);
222 assertContains("underline", content);
223 assertContains("superscript", content);
224 assertContains("subscript", content);
225 assertContains("Here is a citation:", content);
226 assertContains("Figure 1 This is a caption for Figure 1", content);
227 assertContains("(Kramer)", content);
228 assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+"," "));
229 assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+"," "));
230 assertContains("This is a hyperlink", content);
231 assertContains("Here is a list:", content);
232 for(int row=1;row<=3;row++) {
233 //assertContains("·\tBullet " + row, content);
234 //assertContains("\u00b7\tBullet " + row, content);
235 assertContains("Bullet " + row, content);
236 }
237 assertContains("Here is a numbered list:", content);
238 for(int row=1;row<=3;row++) {
239 //assertContains(row + ")\tNumber bullet " + row, content);
240 //assertContains(row + ") Number bullet " + row, content);
241 // TODO: WordExtractor fails to number the bullets:
242 assertContains("Number bullet " + row, content);
243 }
244
245 for(int row=1;row<=2;row++) {
246 for(int col=1;col<=3;col++) {
247 assertContains("Row " + row + " Col " + col, content);
248 }
249 }
250
251 assertContains("Keyword1 Keyword2", content);
252 assertEquals("Keyword1 Keyword2",
253 metadata.get(TikaCoreProperties.KEYWORDS));
254
255 assertContains("Subject is here", content);
256 // TODO: Move to OO subject in Tika 2.0
257 assertEquals("Subject is here",
258 metadata.get(Metadata.SUBJECT));
259 assertEquals("Subject is here",
260 metadata.get(OfficeOpenXMLCore.SUBJECT));
261
262 assertContains("Suddenly some Japanese text:", content);
263 // Special version of (GHQ)
264 assertContains("\uff08\uff27\uff28\uff31\uff09", content);
265 // 6 other characters
266 assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", content);
267
268 assertContains("And then some Gothic text:", content);
269 assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
270 }
271
272 /**
273 * TIKA-1044 - Handle documents where parts of the
274 * text have no formatting or styles applied to them
275 */
276 @Test
277 public void testNoFormat() throws Exception {
278 ContentHandler handler = new BodyContentHandler();
279 Metadata metadata = new Metadata();
280
281 InputStream stream = WordParserTest.class.getResourceAsStream(
282 "/test-documents/testWORD_no_format.doc");
283 try {
284 new OfficeParser().parse(stream, handler, metadata, new ParseContext());
285 } finally {
286 stream.close();
287 }
288
289 String content = handler.toString();
290 assertContains("Will generate an exception", content);
291 }
292
293 /**
294 * Ensures that custom OLE2 (HPSF) properties are extracted
295 */
296 @Test
297 public void testCustomProperties() throws Exception {
298 InputStream input = WordParserTest.class.getResourceAsStream(
299 "/test-documents/testWORD_custom_props.doc");
300 Metadata metadata = new Metadata();
301
302 try {
303 ContentHandler handler = new BodyContentHandler(-1);
304 ParseContext context = new ParseContext();
305 context.set(Locale.class, Locale.US);
306 new OfficeParser().parse(input, handler, metadata, context);
307 } finally {
308 input.close();
309 }
310
311 assertEquals("application/msword", metadata.get(Metadata.CONTENT_TYPE));
312 assertEquals("EJ04325S", metadata.get(TikaCoreProperties.CREATOR));
313 assertEquals("Etienne Jouvin", metadata.get(TikaCoreProperties.MODIFIER));
314 assertEquals("Etienne Jouvin", metadata.get(Metadata.LAST_AUTHOR));
315 assertEquals("2012-01-03T22:14:00Z", metadata.get(TikaCoreProperties.MODIFIED));
316 assertEquals("2012-01-03T22:14:00Z", metadata.get(Metadata.DATE));
317 assertEquals("2010-10-05T09:03:00Z", metadata.get(TikaCoreProperties.CREATED));
318 assertEquals("2010-10-05T09:03:00Z", metadata.get(Metadata.CREATION_DATE));
319 assertEquals("Microsoft Office Word",metadata.get(OfficeOpenXMLExtended.APPLICATION));
320 assertEquals("1", metadata.get(Office.PAGE_COUNT));
321 assertEquals("2", metadata.get(Office.WORD_COUNT));
322 assertEquals("My Title", metadata.get(TikaCoreProperties.TITLE));
323 assertEquals("My Keyword", metadata.get(TikaCoreProperties.KEYWORDS));
324 assertEquals("Normal.dotm", metadata.get(OfficeOpenXMLExtended.TEMPLATE));
325 assertEquals("My Comments", metadata.get(TikaCoreProperties.COMMENTS));
326 // TODO: Move to OO subject in Tika 2.0
327 assertEquals("My subject", metadata.get(Metadata.SUBJECT));
328 assertEquals("My subject", metadata.get(OfficeOpenXMLCore.SUBJECT));
329 assertEquals("EDF-DIT", metadata.get(OfficeOpenXMLExtended.COMPANY));
330 assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
331 assertEquals("2010-12-30T23:00:00Z", metadata.get("custom:MyCustomDate"));
332 }
333
334 @Test
335 public void testExceptions1() throws Exception {
336 XMLResult xml;
337 Level logLevelStart = Logger.getRootLogger().getLevel();
338 Logger.getRootLogger().setLevel(Level.ERROR);
339 try {
340 xml = getXML("testException1.doc");
341 assertContains("total population", xml.xml);
342 xml = getXML("testException2.doc");
343 assertContains("electric charge", xml.xml);
344 } finally {
345 Logger.getRootLogger().setLevel(logLevelStart);
346 }
347 }
348
349 @Test
350 public void testTabularSymbol() throws Exception {
351 assertContains("one two", getXML("testWORD_tabular_symbol.doc").xml.replaceAll("\\s+", " "));
352 }
353
354 /**
355 * TIKA-1229 Hyperlinks in Headers should be output as such,
356 * not plain text with control characters
357 */
358 @Test
359 public void testHeaderHyperlinks() throws Exception {
360 XMLResult result = getXML("testWORD_header_hyperlink.doc");
361 String xml = result.xml;
362 Metadata metadata = result.metadata;
363
364 assertEquals(
365 "application/msword",
366 metadata.get(Metadata.CONTENT_TYPE));
367 assertEquals("Lutz Theurer", metadata.get(TikaCoreProperties.CREATOR));
368 assertContains("example.com", xml);
369
370 // Check we don't have the special text HYPERLINK
371 assertFalse(xml.contains("HYPERLINK"));
372
373 // Check we do have the link
374 assertContains("<a href=\"http://tw-systemhaus.de\">http:", xml);
375
376 // Check we do have the email
377 assertContains("<a href=\"mailto:ab@example.com\">ab@", xml);
378 }
379 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft;
17
18 import static org.junit.Assert.assertTrue;
19
20 import org.apache.tika.metadata.Metadata;
21 import org.apache.tika.parser.ParseContext;
22 import org.apache.tika.sax.BodyContentHandler;
23 import org.junit.Test;
24 import org.xml.sax.ContentHandler;
25
26 import java.io.InputStream;
27
28 public class WriteProtectedParserTest {
29
30 @Test
31 public void testWriteProtected() throws Exception {
32 InputStream input = ExcelParserTest.class.getResourceAsStream(
33 "/test-documents/protect.xlsx");
34
35 Metadata metadata = new Metadata();
36 ContentHandler handler = new BodyContentHandler();
37 new OfficeParser().parse(input, handler, metadata, new ParseContext());
38 String content = handler.toString();
39 assertTrue(content.contains("Office"));
40 }
41 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft.ooxml;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertNull;
20
21 import org.apache.tika.Tika;
22 import org.apache.tika.extractor.ContainerExtractor;
23 import org.apache.tika.extractor.ParserContainerExtractor;
24 import org.apache.tika.parser.microsoft.AbstractPOIContainerExtractionTest;
25 import org.junit.Before;
26 import org.junit.Test;
27
28 /**
29 * Tests that the various POI OOXML powered parsers are
30 * able to extract their embedded contents.
31 */
32 public class OOXMLContainerExtractionTest extends AbstractPOIContainerExtractionTest {
33 private ContainerExtractor extractor;
34
35 @Before
36 public void setUp() {
37 Tika tika = new Tika();
38 extractor = new ParserContainerExtractor(
39 tika.getParser(), tika.getDetector());
40 }
41
42 /**
43 * For office files which don't have anything embedded in them
44 */
45 @Test
46 public void testWithoutEmbedded() throws Exception {
47 String[] files = new String[] {
48 "testEXCEL.xlsx", "testWORD.docx", "testPPT.pptx",
49 };
50 for(String file : files) {
51 // Process it without recursing
52 TrackingHandler handler = process(file, extractor, false);
53
54 // Won't have fired
55 assertEquals(0, handler.filenames.size());
56 assertEquals(0, handler.mediaTypes.size());
57
58 // Ditto with recursing
59 handler = process(file, extractor, true);
60 assertEquals(0, handler.filenames.size());
61 assertEquals(0, handler.mediaTypes.size());
62 }
63 }
64
65 /**
66 * Office files with embedded images, but no other
67 * office files in them
68 */
69 @Test
70 public void testEmbeddedImages() throws Exception {
71 TrackingHandler handler;
72
73 // Excel with 1 image
74 handler = process("testEXCEL_1img.xlsx", extractor, false);
75 assertEquals(1, handler.filenames.size());
76 assertEquals(1, handler.mediaTypes.size());
77
78 assertEquals("image1.png", handler.filenames.get(0));
79 assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
80
81
82 // PowerPoint with 2 images + sound
83 // TODO Figure out why we can't find the sound anywhere...
84 handler = process("testPPT_2imgs.pptx", extractor, false);
85 assertEquals(3, handler.filenames.size());
86 assertEquals(3, handler.mediaTypes.size());
87
88 assertEquals("image1.png", handler.filenames.get(0));
89 assertEquals("image2.gif", handler.filenames.get(1));
90 assertEquals("image3.png", handler.filenames.get(2));
91 assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
92 assertEquals(TYPE_GIF, handler.mediaTypes.get(1)); // icon of sound
93 assertEquals(TYPE_PNG, handler.mediaTypes.get(2));
94
95
96 // Word with 1 image
97 handler = process("testWORD_1img.docx", extractor, false);
98 assertEquals(1, handler.filenames.size());
99 assertEquals(1, handler.mediaTypes.size());
100
101 assertEquals("image1.png", handler.filenames.get(0));
102 assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
103
104
105 // Word with 3 images
106 handler = process("testWORD_3imgs.docx", extractor, false);
107 assertEquals(3, handler.filenames.size());
108 assertEquals(3, handler.mediaTypes.size());
109
110 assertEquals("image2.png", handler.filenames.get(0));
111 assertEquals("image3.jpeg", handler.filenames.get(1));
112 assertEquals("image4.png", handler.filenames.get(2));
113 assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
114 assertEquals(TYPE_JPG, handler.mediaTypes.get(1));
115 assertEquals(TYPE_PNG, handler.mediaTypes.get(2));
116 }
117
118 /**
119 * Office files which have other office files
120 * embedded into them. The embedded office files
121 * will sometimes have images in them.
122 *
123 * eg xls
124 * -> word
125 * -> image
126 * -> image
127 * -> powerpoint
128 * -> excel
129 * -> image
130 */
131 @Test
132 public void testEmbeddedOfficeFiles() throws Exception {
133 TrackingHandler handler;
134
135
136 // Excel with a word doc and a powerpoint doc, both of which have images in them
137 // Without recursion, should see both documents + the images
138 handler = process("testEXCEL_embeded.xlsx", extractor, false);
139 assertEquals(7, handler.filenames.size());
140 assertEquals(7, handler.mediaTypes.size());
141
142 // We know the rough filenames
143 assertEquals("Microsoft_Office_PowerPoint_Presentation1.pptx", handler.filenames.get(0));
144 assertEquals("Microsoft_Office_Word_97_-_2003_Document1.doc", handler.filenames.get(1));
145 assertEquals("Microsoft_Office_Word_Document2.docx", handler.filenames.get(2));
146 assertEquals("image1.png", handler.filenames.get(3));
147 assertEquals("image2.emf", handler.filenames.get(4));
148 assertEquals("image3.emf", handler.filenames.get(5));
149 assertEquals("image4.emf", handler.filenames.get(6));
150 // But we do know their types
151 assertEquals(TYPE_PPTX, handler.mediaTypes.get(0)); // Embedded office doc
152 assertEquals(TYPE_DOC, handler.mediaTypes.get(1)); // Embedded office doc
153 assertEquals(TYPE_DOCX, handler.mediaTypes.get(2)); // Embedded office doc
154 assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image
155 assertEquals(TYPE_EMF, handler.mediaTypes.get(4)); // Icon of embedded office doc
156 assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded office doc
157 assertEquals(TYPE_EMF, handler.mediaTypes.get(6)); // Icon of embedded office doc
158
159
160 // With recursion, should get the images embedded in the office files too
161 handler = process("testEXCEL_embeded.xlsx", extractor, true);
162 assertEquals(23, handler.filenames.size());
163 assertEquals(23, handler.mediaTypes.size());
164
165 assertEquals(TYPE_PPTX, handler.mediaTypes.get(0)); // Embedded office doc
166 assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // PNG inside .pptx
167 assertEquals(TYPE_GIF, handler.mediaTypes.get(2)); // PNG inside .pptx
168 assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // PNG inside .pptx
169 assertEquals(TYPE_XLSX, handler.mediaTypes.get(4)); // .xlsx inside .pptx
170 assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // PNG inside .xlsx inside .pptx
171 assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); // .docx inside .pptx
172 assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); // PNG inside .docx inside .pptx
173 assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); // JPG inside .docx inside .pptx
174 assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // PNG inside .docx inside .pptx
175 assertEquals(TYPE_DOC, handler.mediaTypes.get(10)); // .doc inside .pptx
176 assertEquals(TYPE_PNG, handler.mediaTypes.get(11)); // PNG inside .doc inside .pptx
177 assertEquals(TYPE_EMF, handler.mediaTypes.get(12)); // Icon of item inside .pptx
178 assertEquals(TYPE_EMF, handler.mediaTypes.get(13)); // Icon of item inside .pptx
179 assertEquals(TYPE_EMF, handler.mediaTypes.get(14)); // Icon of item inside .pptx
180 assertEquals(TYPE_DOC, handler.mediaTypes.get(15)); // Embedded office doc
181 assertEquals(TYPE_PNG, handler.mediaTypes.get(16)); // PNG inside .doc
182 assertEquals(TYPE_DOCX, handler.mediaTypes.get(17)); // Embedded office doc
183 assertEquals(TYPE_PNG, handler.mediaTypes.get(18)); // PNG inside .docx
184 assertEquals(TYPE_PNG, handler.mediaTypes.get(19)); // Embedded image
185 assertEquals(TYPE_EMF, handler.mediaTypes.get(20)); // Icon of embedded office doc
186 assertEquals(TYPE_EMF, handler.mediaTypes.get(21)); // Icon of embedded office doc
187 assertEquals(TYPE_EMF, handler.mediaTypes.get(22)); // Icon of embedded office doc
188
189
190 // Word with .docx, powerpoint and excel
191 handler = process("testWORD_embeded.docx", extractor, false);
192 assertEquals(9, handler.filenames.size());
193 assertEquals(9, handler.mediaTypes.size());
194
195 // We know their rough filenames
196 assertEquals("Microsoft_Office_PowerPoint_Presentation2.pptx", handler.filenames.get(0));
197 assertEquals("image6.emf", handler.filenames.get(1));
198 assertEquals("Microsoft_Office_Word_97_-_2003_Document1.doc", handler.filenames.get(2));
199 assertEquals("image1.png", handler.filenames.get(3));
200 assertEquals("image2.jpeg", handler.filenames.get(4));
201 assertEquals("image3.png", handler.filenames.get(5));
202 assertEquals("image4.emf", handler.filenames.get(6));
203 assertEquals("Microsoft_Office_Excel_Worksheet1.xlsx", handler.filenames.get(7));
204 assertEquals("image5.emf", handler.filenames.get(8));
205 // But we do know their types
206 assertEquals(TYPE_PPTX, handler.mediaTypes.get(0)); // Embedded office doc
207 assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded office doc
208 assertEquals(TYPE_DOC, handler.mediaTypes.get(2)); // Embedded office doc
209 assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image
210 assertEquals(TYPE_JPG, handler.mediaTypes.get(4)); // Embedded image
211 assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // Embedded image
212 assertEquals(TYPE_EMF, handler.mediaTypes.get(6)); // Icon of embedded office doc
213 assertEquals(TYPE_XLSX, handler.mediaTypes.get(7)); // Embeded office doc
214 assertEquals(TYPE_EMF, handler.mediaTypes.get(8)); // Icon of embedded office doc
215
216
217 // With recursion, should get their images too
218 handler = process("testWORD_embeded.docx", extractor, true);
219 assertEquals(14, handler.filenames.size());
220 assertEquals(14, handler.mediaTypes.size());
221
222 // But we do know their types
223 assertEquals(TYPE_PPTX, handler.mediaTypes.get(0)); // Embedded office doc
224 assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // PNG inside .pptx
225 assertEquals(TYPE_GIF, handler.mediaTypes.get(2)); // GIF inside .pptx
226 assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // PNG inside .pptx
227 assertEquals(TYPE_EMF, handler.mediaTypes.get(4)); // Icon of embedded office doc
228 assertEquals(TYPE_DOC, handler.mediaTypes.get(5)); // Embedded office doc
229 assertEquals(TYPE_PNG, handler.mediaTypes.get(6)); // PNG inside .doc
230 assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); // Embedded image
231 assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); // Embedded image
232 assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // Embedded image
233 assertEquals(TYPE_EMF, handler.mediaTypes.get(10)); // Icon of embedded office doc
234 assertEquals(TYPE_XLSX, handler.mediaTypes.get(11)); // Embeded office doc
235 assertEquals(TYPE_PNG, handler.mediaTypes.get(12)); // PNG inside .xlsx
236 assertEquals(TYPE_EMF, handler.mediaTypes.get(13)); // Icon of embedded office doc
237
238
239 // PowerPoint with excel and word
240 handler = process("testPPT_embeded.pptx", extractor, false);
241 assertEquals(9, handler.filenames.size());
242 assertEquals(9, handler.mediaTypes.size());
243
244 // We don't know their exact filenames
245 assertEquals("image4.png", handler.filenames.get(0));
246 assertEquals("image5.gif", handler.filenames.get(1));
247 assertEquals("image6.png", handler.filenames.get(2));
248 assertEquals("Microsoft_Office_Excel_Worksheet1.xlsx", handler.filenames.get(3));
249 assertEquals("Microsoft_Office_Word_Document2.docx", handler.filenames.get(4));
250 assertEquals("Microsoft_Office_Word_97_-_2003_Document1.doc", handler.filenames.get(5));
251 assertEquals("image1.emf", handler.filenames.get(6));
252 assertEquals("image2.emf", handler.filenames.get(7));
253 assertEquals("image3.emf", handler.filenames.get(8));
254 // But we do know their types
255 assertEquals(TYPE_PNG, handler.mediaTypes.get(0)); // Embedded image
256 assertEquals(TYPE_GIF, handler.mediaTypes.get(1)); // Embedded image
257 assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image
258 assertEquals(TYPE_XLSX, handler.mediaTypes.get(3)); // Embedded office doc
259 assertEquals(TYPE_DOCX, handler.mediaTypes.get(4)); // Embedded office doc
260 assertEquals(TYPE_DOC, handler.mediaTypes.get(5)); // Embedded office doc
261 assertEquals(TYPE_EMF, handler.mediaTypes.get(6)); // Icon of embedded office doc
262 assertEquals(TYPE_EMF, handler.mediaTypes.get(7)); // Icon of embedded office doc
263 assertEquals(TYPE_EMF, handler.mediaTypes.get(8)); // Icon of embedded office doc
264 }
265
266 @Test
267 public void testEmbeddedOutlook() throws Exception {
268 TrackingHandler handler =
269 process("EmbeddedOutlook.docx", extractor, false);
270
271 assertEquals(2, handler.filenames.size());
272 assertEquals(2, handler.mediaTypes.size());
273
274 assertEquals("image1.emf", handler.filenames.get(0));
275 assertEquals(TYPE_EMF, handler.mediaTypes.get(0));
276
277 assertEquals("licensedTestMsgwAtt.msg", handler.filenames.get(1));
278 assertEquals(TYPE_MSG, handler.mediaTypes.get(1));
279 }
280
281 @Test
282 public void testEmbeddedPDF() throws Exception {
283 TrackingHandler handler =
284 process("EmbeddedPDF.docx", extractor, false);
285
286 assertEquals(2, handler.filenames.size());
287 assertEquals(2, handler.mediaTypes.size());
288
289 assertEquals("image1.emf", handler.filenames.get(0));
290 assertEquals(TYPE_EMF, handler.mediaTypes.get(0));
291
292 assertNull(handler.filenames.get(1));
293 assertEquals(TYPE_PDF, handler.mediaTypes.get(1));
294 }
295
296 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft.ooxml;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertFalse;
20 import static org.junit.Assert.assertTrue;
21
22 import java.io.ByteArrayOutputStream;
23 import java.io.InputStream;
24 import java.io.PrintStream;
25 import java.io.StringWriter;
26 import java.util.Locale;
27
28 import javax.xml.transform.OutputKeys;
29 import javax.xml.transform.sax.SAXTransformerFactory;
30 import javax.xml.transform.sax.TransformerHandler;
31 import javax.xml.transform.stream.StreamResult;
32
33 import org.apache.tika.TikaTest;
34 import org.apache.tika.io.TikaInputStream;
35 import org.apache.tika.metadata.Metadata;
36 import org.apache.tika.metadata.Office;
37 import org.apache.tika.metadata.OfficeOpenXMLCore;
38 import org.apache.tika.metadata.OfficeOpenXMLExtended;
39 import org.apache.tika.metadata.TikaCoreProperties;
40 import org.apache.tika.metadata.TikaMetadataKeys;
41 import org.apache.tika.parser.AutoDetectParser;
42 import org.apache.tika.parser.ParseContext;
43 import org.apache.tika.parser.Parser;
44 import org.apache.tika.parser.microsoft.WordParserTest;
45 import org.apache.tika.sax.BodyContentHandler;
46 import org.junit.Test;
47 import org.xml.sax.ContentHandler;
48
49 public class OOXMLParserTest extends TikaTest {
50
/** Auto-detecting parser used by the tests in this class that do not create their own. */
private Parser parser = new AutoDetectParser();
52
53 private InputStream getTestDocument(String name) {
54 return TikaInputStream.get(OOXMLParserTest.class.getResourceAsStream(
55 "/test-documents/" + name));
56 }
57
58 @Test
59 public void testExcel() throws Exception {
60 Metadata metadata = new Metadata();
61 ContentHandler handler = new BodyContentHandler();
62 ParseContext context = new ParseContext();
63 context.set(Locale.class, Locale.US);
64
65 InputStream input = getTestDocument("testEXCEL.xlsx");
66 try {
67 parser.parse(input, handler, metadata, context);
68
69 assertEquals(
70 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
71 metadata.get(Metadata.CONTENT_TYPE));
72 assertEquals("Simple Excel document", metadata.get(TikaCoreProperties.TITLE));
73 assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
74 assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
75 String content = handler.toString();
76 assertTrue(content.contains("Sample Excel Worksheet"));
77 assertTrue(content.contains("Numbers and their Squares"));
78 assertTrue(content.contains("9"));
79 assertFalse(content.contains("9.0"));
80 assertTrue(content.contains("196"));
81 assertFalse(content.contains("196.0"));
82 assertEquals("false", metadata.get(TikaMetadataKeys.PROTECTED));
83 } finally {
84 input.close();
85 }
86 }
87
88 @Test
89 public void testExcelFormats() throws Exception {
90 Metadata metadata = new Metadata();
91 ContentHandler handler = new BodyContentHandler();
92 ParseContext context = new ParseContext();
93 context.set(Locale.class, Locale.US);
94
95 InputStream input = getTestDocument("testEXCEL-formats.xlsx");
96 try {
97 parser.parse(input, handler, metadata, context);
98
99 assertEquals(
100 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
101 metadata.get(Metadata.CONTENT_TYPE));
102
103 String content = handler.toString();
104
105 // Number #,##0.00
106 assertTrue(content.contains("1,599.99"));
107 assertTrue(content.contains("-1,599.99"));
108
109 // Currency $#,##0.00;[Red]($#,##0.00)
110 assertTrue(content.contains("$1,599.99"));
111 assertTrue(content.contains("$1,599.99)"));
112
113 // Scientific 0.00E+00
114 // poi <=3.8beta1 returns 1.98E08, newer versions return 1.98+E08
115 assertTrue(content.contains("1.98E08") || content.contains("1.98E+08"));
116 assertTrue(content.contains("-1.98E08") || content.contains("-1.98E+08"));
117
118 // Percentage
119 assertTrue(content.contains("2.50%"));
120 // Excel rounds up to 3%, but that requires Java 1.6 or later
121 if(System.getProperty("java.version").startsWith("1.5")) {
122 assertTrue(content.contains("2%"));
123 } else {
124 assertTrue(content.contains("3%"));
125 }
126
127 // Time Format: h:mm
128 assertTrue(content.contains("6:15"));
129 assertTrue(content.contains("18:15"));
130
131 // Date Format: d-mmm-yy
132 assertTrue(content.contains("17-May-07"));
133
134 // Currency $#,##0.00;[Red]($#,##0.00)
135 assertTrue(content.contains("$1,599.99"));
136 assertTrue(content.contains("($1,599.99)"));
137
138 // Below assertions represent outstanding formatting issues to be addressed
139 // they are included to allow the issues to be progressed with the Apache POI
140 // team - See TIKA-103.
141
142 /*************************************************************************
143 // Date Format: m/d/yy
144 assertTrue(content.contains("03/10/2009"));
145
146 // Date/Time Format
147 assertTrue(content.contains("19/01/2008 04:35"));
148
149 // Custom Number (0 "dollars and" .00 "cents")
150 assertTrue(content.contains("19 dollars and .99 cents"));
151
152 // Custom Number ("At" h:mm AM/PM "on" dddd mmmm d"," yyyy)
153 assertTrue(content.contains("At 4:20 AM on Thursday May 17, 2007"));
154
155 // Fraction (2.5): # ?/?
156 assertTrue(content.contains("2 1 / 2"));
157 **************************************************************************/
158 } finally {
159 input.close();
160 }
161 }
162
163 /**
164 * We have a number of different powerpoint files,
165 * such as presentation, macro-enabled etc
166 */
167 @Test
168 public void testPowerPoint() throws Exception {
169 String[] extensions = new String[] {
170 "pptx", "pptm", "ppsm", "ppsx", "potm"
171 //"thmx", // TIKA-418: Will be supported in POI 3.7 beta 2
172 //"xps" // TIKA-418: Not yet supported by POI
173 };
174
175 String[] mimeTypes = new String[] {
176 "application/vnd.openxmlformats-officedocument.presentationml.presentation",
177 "application/vnd.ms-powerpoint.presentation.macroenabled.12",
178 "application/vnd.ms-powerpoint.slideshow.macroenabled.12",
179 "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
180 "application/vnd.ms-powerpoint.template.macroenabled.12"
181 };
182
183 for (int i=0; i<extensions.length; i++) {
184 String extension = extensions[i];
185 String filename = "testPPT." + extension;
186
187 Parser parser = new AutoDetectParser();
188 Metadata metadata = new Metadata();
189 // TODO: should auto-detect without the resource name
190 metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
191 ContentHandler handler = new BodyContentHandler();
192 ParseContext context = new ParseContext();
193
194 InputStream input = getTestDocument(filename);
195 try {
196 parser.parse(input, handler, metadata, context);
197
198 assertEquals(
199 "Mime-type checking for " + filename,
200 mimeTypes[i],
201 metadata.get(Metadata.CONTENT_TYPE));
202 assertEquals("Attachment Test", metadata.get(TikaCoreProperties.TITLE));
203 assertEquals("Rajiv", metadata.get(TikaCoreProperties.CREATOR));
204 assertEquals("Rajiv", metadata.get(Metadata.AUTHOR));
205
206 String content = handler.toString();
207 // Theme files don't have the text in them
208 if(extension.equals("thmx")) {
209 assertEquals("", content);
210 } else {
211 assertTrue(
212 "Text missing for " + filename + "\n" + content,
213 content.contains("Attachment Test")
214 );
215 assertTrue(
216 "Text missing for " + filename + "\n" + content,
217 content.contains("This is a test file data with the same content")
218 );
219 assertTrue(
220 "Text missing for " + filename + "\n" + content,
221 content.contains("content parsing")
222 );
223 assertTrue(
224 "Text missing for " + filename + "\n" + content,
225 content.contains("Different words to test against")
226 );
227 assertTrue(
228 "Text missing for " + filename + "\n" + content,
229 content.contains("Mystery")
230 );
231 }
232 } finally {
233 input.close();
234 }
235 }
236 }
237
238 /**
239 * Test that the metadata is already extracted when the body is processed.
240 * See TIKA-1109
241 */
242 @Test
243 public void testPowerPointMetadataEarly() throws Exception {
244 String[] extensions = new String[] {
245 "pptx", "pptm", "ppsm", "ppsx", "potm"
246 //"thmx", // TIKA-418: Will be supported in POI 3.7 beta 2
247 //"xps" // TIKA-418: Not yet supported by POI
248 };
249
250 final String[] mimeTypes = new String[] {
251 "application/vnd.openxmlformats-officedocument.presentationml.presentation",
252 "application/vnd.ms-powerpoint.presentation.macroenabled.12",
253 "application/vnd.ms-powerpoint.slideshow.macroenabled.12",
254 "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
255 "application/vnd.ms-powerpoint.template.macroenabled.12"
256 };
257
258 for (int i=0; i<extensions.length; i++) {
259 String extension = extensions[i];
260 final String filename = "testPPT." + extension;
261
262 Parser parser = new AutoDetectParser();
263 final Metadata metadata = new Metadata();
264 // TODO: should auto-detect without the resource name
265 metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
266
267 // Allow the value to be access from the inner class
268 final int currentI = i;
269 ContentHandler handler = new BodyContentHandler()
270 {
271 public void startDocument ()
272 {
273 assertEquals(
274 "Mime-type checking for " + filename,
275 mimeTypes[currentI],
276 metadata.get(Metadata.CONTENT_TYPE));
277 assertEquals("Attachment Test", metadata.get(TikaCoreProperties.TITLE));
278 assertEquals("Rajiv", metadata.get(TikaCoreProperties.CREATOR));
279 assertEquals("Rajiv", metadata.get(Metadata.AUTHOR));
280
281 }
282
283 };
284 ParseContext context = new ParseContext();
285
286 InputStream input = getTestDocument(filename);
287 try {
288 parser.parse(input, handler, metadata, context);
289 } finally {
290 input.close();
291 }
292 }
293 }
294
295 /**
296 * For the PowerPoint formats we don't currently support, ensure that
297 * we don't break either
298 */
299 @Test
300 public void testUnsupportedPowerPoint() throws Exception {
301 String[] extensions = new String[] { "xps", "thmx" };
302 String[] mimeTypes = new String[] {
303 "application/vnd.ms-xpsdocument",
304 "application/vnd.openxmlformats-officedocument" // Is this right?
305 };
306
307 for (int i=0; i<extensions.length; i++) {
308 String extension = extensions[i];
309 String filename = "testPPT." + extension;
310
311 Parser parser = new AutoDetectParser();
312 Metadata metadata = new Metadata();
313 metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
314 ContentHandler handler = new BodyContentHandler();
315 ParseContext context = new ParseContext();
316
317 InputStream input = getTestDocument(filename);
318 try {
319 parser.parse(input, handler, metadata, context);
320
321 // Should get the metadata
322 assertEquals(
323 "Mime-type checking for " + filename,
324 mimeTypes[i],
325 metadata.get(Metadata.CONTENT_TYPE));
326
327 // But that's about it
328 } finally {
329 input.close();
330 }
331 }
332 }
333
334 /**
335 * Test the plain text output of the Word converter
336 * @throws Exception
337 */
338 @Test
339 public void testWord() throws Exception {
340 Metadata metadata = new Metadata();
341 ContentHandler handler = new BodyContentHandler();
342 ParseContext context = new ParseContext();
343
344 InputStream input = getTestDocument("testWORD.docx");
345 try {
346 parser.parse(input, handler, metadata, context);
347 assertEquals(
348 "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
349 metadata.get(Metadata.CONTENT_TYPE));
350 assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE));
351 assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
352 assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
353 assertTrue(handler.toString().contains("Sample Word Document"));
354 } finally {
355 input.close();
356 }
357 }
358
359 /**
360 * Test the plain text output of the Word converter
361 * @throws Exception
362 */
363 @Test
364 public void testWordFootnote() throws Exception {
365 Metadata metadata = new Metadata();
366 ContentHandler handler = new BodyContentHandler();
367 ParseContext context = new ParseContext();
368
369 InputStream input = getTestDocument("footnotes.docx");
370 try {
371 parser.parse(input, handler, metadata, context);
372 assertEquals(
373 "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
374 metadata.get(Metadata.CONTENT_TYPE));
375 assertTrue(handler.toString().contains("snoska"));
376 } finally {
377 input.close();
378 }
379 }
380
381 /**
382 * Test that the word converter is able to generate the
383 * correct HTML for the document
384 */
385 @Test
386 public void testWordHTML() throws Exception {
387
388 XMLResult result = getXML("testWORD.docx");
389 String xml = result.xml;
390 Metadata metadata = result.metadata;
391 assertEquals(
392 "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
393 metadata.get(Metadata.CONTENT_TYPE));
394 assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE));
395 assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
396 assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
397 assertTrue(xml.contains("Sample Word Document"));
398
399 // Check that custom headings came through
400 assertTrue(xml.contains("<h1 class=\"title\">"));
401 // Regular headings
402 assertTrue(xml.contains("<h1>Heading Level 1</h1>"));
403 assertTrue(xml.contains("<h2>Heading Level 2</h2>"));
404 // Headings with anchor tags in them
405 assertTrue(xml.contains("<h3><a name=\"OnLevel3\" />Heading Level 3</h3>"));
406 // Bold and italic
407 assertTrue(xml.contains("<b>BOLD</b>"));
408 assertTrue(xml.contains("<i>ITALIC</i>"));
409 // Table
410 assertTrue(xml.contains("<table>"));
411 assertTrue(xml.contains("<td>"));
412 // Links
413 assertTrue(xml.contains("<a href=\"http://tika.apache.org/\">Tika</a>"));
414 // Anchor links
415 assertTrue(xml.contains("<a href=\"#OnMainHeading\">The Main Heading Bookmark</a>"));
416 // Paragraphs with other styles
417 assertTrue(xml.contains("<p class=\"signature\">This one"));
418
419 result = getXML("testWORD_3imgs.docx");
420 xml = result.xml;
421
422 // Images 2-4 (there is no 1!)
423 assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image2.png\" alt=\"A description...\" />"));
424 assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image3.jpeg\" alt=\"A description...\" />"));
425 assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image4.png\" alt=\"A description...\" />"));
426
427 // Text too
428 assertTrue(xml.contains("<p>The end!</p>"));
429
430 // TIKA-692: test document containing multiple
431 // character runs within a bold tag:
432 xml = getXML("testWORD_bold_character_runs.docx").xml;
433
434 // Make sure bold text arrived as single
435 // contiguous string even though Word parser
436 // handled this as 3 character runs
437 assertTrue("Bold text wasn't contiguous: "+xml, xml.contains("F<b>oob</b>a<b>r</b>"));
438
439 // TIKA-692: test document containing multiple
440 // character runs within a bold tag:
441 xml = getXML("testWORD_bold_character_runs2.docx").xml;
442
443 // Make sure bold text arrived as single
444 // contiguous string even though Word parser
445 // handled this as 3 character runs
446 assertTrue("Bold text wasn't contiguous: "+xml, xml.contains("F<b>oob</b>a<b>r</b>"));
447 }
448
449 /**
450 * Test that we can extract image from docx header
451 */
452 @Test
453 public void testWordPicturesInHeader() throws Exception {
454 Metadata metadata = new Metadata();
455 ParseContext context = new ParseContext();
456
457 StringWriter sw = new StringWriter();
458 SAXTransformerFactory factory = (SAXTransformerFactory)
459 SAXTransformerFactory.newInstance();
460 TransformerHandler handler = factory.newTransformerHandler();
461 handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
462 handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
463 handler.setResult(new StreamResult(sw));
464
465 // Try with a document containing various tables and formattings
466 InputStream input = getTestDocument("headerPic.docx");
467 try {
468 parser.parse(input, handler, metadata, context);
469 String xml = sw.toString();
470 assertEquals(
471 "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
472 metadata.get(Metadata.CONTENT_TYPE));
473 // Check that custom headings came through
474 assertTrue(xml.contains("<img"));
475 } finally {
476 input.close();
477 }
478 }
479
480 /**
481 * Documents with some sheets are protected, but not all.
482 * See TIKA-364.
483 */
484 @Test
485 public void testProtectedExcelSheets() throws Exception {
486 InputStream input = OOXMLParserTest.class
487 .getResourceAsStream("/test-documents/protectedSheets.xlsx");
488
489 Parser parser = new AutoDetectParser();
490 Metadata metadata = new Metadata();
491 ContentHandler handler = new BodyContentHandler();
492 ParseContext context = new ParseContext();
493
494 try {
495 parser.parse(input, handler, metadata, context);
496
497 assertEquals(
498 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
499 metadata.get(Metadata.CONTENT_TYPE));
500
501 assertEquals("true", metadata.get(TikaMetadataKeys.PROTECTED));
502 } finally {
503 input.close();
504 }
505 }
506
507 /**
508 * An excel document which is password protected.
509 * See TIKA-437.
510 */
511 @Test
512 public void testProtectedExcelFile() throws Exception {
513
514 Parser parser = new AutoDetectParser();
515 Metadata metadata = new Metadata();
516 ContentHandler handler = new BodyContentHandler();
517 ParseContext context = new ParseContext();
518
519 InputStream input = getTestDocument("protectedFile.xlsx");
520 try {
521 parser.parse(input, handler, metadata, context);
522
523 assertEquals(
524 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
525 metadata.get(Metadata.CONTENT_TYPE));
526
527 assertEquals("true", metadata.get(TikaMetadataKeys.PROTECTED));
528
529 String content = handler.toString();
530 assertTrue(content.contains("Office"));
531 } finally {
532 input.close();
533 }
534 }
535
536 /**
537 * Test docx without headers
538 * TIKA-633
539 */
540 @Test
541 public void testNullHeaders() throws Exception {
542 Parser parser = new AutoDetectParser();
543 Metadata metadata = new Metadata();
544 ContentHandler handler = new BodyContentHandler();
545 ParseContext context = new ParseContext();
546
547 InputStream input = getTestDocument("NullHeader.docx");
548 try {
549 parser.parse(input, handler, metadata, context);
550 assertFalse(handler.toString().length()==0);
551 } finally {
552 input.close();
553 }
554 }
555
556 @Test
557 public void testVarious() throws Exception {
558 ContentHandler handler = new BodyContentHandler();
559 Metadata metadata = new Metadata();
560
561 InputStream stream = OOXMLParserTest.class.getResourceAsStream(
562 "/test-documents/testWORD_various.docx");
563 try {
564 new AutoDetectParser().parse(stream, handler, metadata, new ParseContext());
565 } finally {
566 stream.close();
567 }
568
569 String content = handler.toString();
570 //content = content.replaceAll("\\s+"," ");
571 assertContains("Footnote appears here", content);
572 assertContains("This is a footnote.", content);
573 assertContains("This is the header text.", content);
574 assertContains("This is the footer text.", content);
575 assertContains("Here is a text box", content);
576 assertContains("Bold", content);
577 assertContains("italic", content);
578 assertContains("underline", content);
579 assertContains("superscript", content);
580 assertContains("subscript", content);
581 assertContains("Here is a citation:", content);
582 assertContains("Figure 1 This is a caption for Figure 1", content);
583 assertContains("(Kramer)", content);
584 assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+"," "));
585 assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+"," "));
586 assertContains("This is a hyperlink", content);
587 assertContains("Here is a list:", content);
588 for(int row=1;row<=3;row++) {
589 //assertContains("·\tBullet " + row, content);
590 //assertContains("\u00b7\tBullet " + row, content);
591 assertContains("Bullet " + row, content);
592 }
593 assertContains("Here is a numbered list:", content);
594 for(int row=1;row<=3;row++) {
595 //assertContains(row + ")\tNumber bullet " + row, content);
596 //assertContains(row + ") Number bullet " + row, content);
597 // TODO: OOXMLExtractor fails to number the bullets:
598 assertContains("Number bullet " + row, content);
599 }
600
601 for(int row=1;row<=2;row++) {
602 for(int col=1;col<=3;col++) {
603 assertContains("Row " + row + " Col " + col, content);
604 }
605 }
606
607 assertContains("Keyword1 Keyword2", content);
608 assertEquals("Keyword1 Keyword2",
609 metadata.get(Metadata.KEYWORDS));
610
611 assertContains("Subject is here", content);
612 // TODO: Remove subject in Tika 2.0
613 assertEquals("Subject is here",
614 metadata.get(Metadata.SUBJECT));
615 assertEquals("Subject is here",
616 metadata.get(OfficeOpenXMLCore.SUBJECT));
617
618 assertContains("Suddenly some Japanese text:", content);
619 // Special version of (GHQ)
620 assertContains("\uff08\uff27\uff28\uff31\uff09", content);
621 // 6 other characters
622 assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", content);
623
624 assertContains("And then some Gothic text:", content);
625 assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
626 }
627
628 @Test
629 public void testVariousPPTX() throws Exception {
630 ContentHandler handler = new BodyContentHandler();
631 Metadata metadata = new Metadata();
632
633 InputStream stream = OOXMLParserTest.class.getResourceAsStream(
634 "/test-documents/testPPT_various.pptx");
635 try {
636 new AutoDetectParser().parse(stream, handler, metadata, new ParseContext());
637 } finally {
638 stream.close();
639 }
640
641 String content = handler.toString();
642 //content = content.replaceAll("\\s+"," ");
643 assertContains("Footnote appears here", content);
644 assertContains("This is a footnote.", content);
645 assertContains("This is the header text.", content);
646 assertContains("This is the footer text.", content);
647 assertContains("Here is a text box", content);
648 assertContains("Bold", content);
649 assertContains("italic", content);
650 assertContains("underline", content);
651 assertContains("superscript", content);
652 assertContains("subscript", content);
653 assertContains("Here is a citation:", content);
654 assertContains("Figure 1 This is a caption for Figure 1", content);
655 assertContains("(Kramer)", content);
656 assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+"," "));
657 assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+"," "));
658 assertContains("This is a hyperlink", content);
659 assertContains("Here is a list:", content);
660 for(int row=1;row<=3;row++) {
661 //assertContains("·\tBullet " + row, content);
662 //assertContains("\u00b7\tBullet " + row, content);
663 assertContains("Bullet " + row, content);
664 }
665 assertContains("Here is a numbered list:", content);
666 for(int row=1;row<=3;row++) {
667 //assertContains(row + ")\tNumber bullet " + row, content);
668 //assertContains(row + ") Number bullet " + row, content);
669 // TODO: OOXMLExtractor fails to number the bullets:
670 assertContains("Number bullet " + row, content);
671 }
672
673 for(int row=1;row<=2;row++) {
674 for(int col=1;col<=3;col++) {
675 assertContains("Row " + row + " Col " + col, content);
676 }
677 }
678
679 assertContains("Keyword1 Keyword2", content);
680 assertEquals("Keyword1 Keyword2",
681 metadata.get(Metadata.KEYWORDS));
682
683 assertContains("Subject is here", content);
684 // TODO: Remove subject in Tika 2.0
685 assertEquals("Subject is here",
686 metadata.get(Metadata.SUBJECT));
687 assertEquals("Subject is here",
688 metadata.get(OfficeOpenXMLCore.SUBJECT));
689
690 assertContains("Suddenly some Japanese text:", content);
691 // Special version of (GHQ)
692 assertContains("\uff08\uff27\uff28\uff31\uff09", content);
693 // 6 other characters
694 assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", content);
695
696 assertContains("And then some Gothic text:", content);
697 assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
698 }
699
700 @Test
701 public void testMasterFooter() throws Exception {
702 ContentHandler handler = new BodyContentHandler();
703 Metadata metadata = new Metadata();
704
705 InputStream stream = OOXMLParserTest.class.getResourceAsStream(
706 "/test-documents/testPPT_masterFooter.pptx");
707 try {
708 new AutoDetectParser().parse(stream, handler, metadata, new ParseContext());
709 } finally {
710 stream.close();
711 }
712
713 String content = handler.toString();
714 assertContains("Master footer is here", content);
715 }
716
717 // TODO: once we fix TIKA-712, re-enable this
718 /*
719 public void testMasterText() throws Exception {
720 ContentHandler handler = new BodyContentHandler();
721 Metadata metadata = new Metadata();
722
723 InputStream stream = OOXMLParserTest.class.getResourceAsStream(
724 "/test-documents/testPPT_masterText.pptx");
725 try {
726 new AutoDetectParser().parse(stream, handler, metadata, new ParseContext());
727 } finally {
728 stream.close();
729 }
730
731 String content = handler.toString();
732 assertContains("Text that I added to the master slide", content);
733 }
734 */
735
736 // TODO: once we fix TIKA-712, re-enable this
737 /*
738 public void testMasterText2() throws Exception {
739 ContentHandler handler = new BodyContentHandler();
740 Metadata metadata = new Metadata();
741
742 InputStream stream = OOXMLParserTest.class.getResourceAsStream(
743 "/test-documents/testPPT_masterText2.pptx");
744 try {
745 new AutoDetectParser().parse(stream, handler, metadata, new ParseContext());
746 } finally {
747 stream.close();
748 }
749
750 String content = handler.toString();
751 assertContains("Text that I added to the master slide", content);
752 }
753 */
754
755 @Test
756 public void testWordArt() throws Exception {
757 ContentHandler handler = new BodyContentHandler();
758 Metadata metadata = new Metadata();
759
760 InputStream stream = OOXMLParserTest.class.getResourceAsStream(
761 "/test-documents/testWordArt.pptx");
762 try {
763 new AutoDetectParser().parse(stream, handler, metadata, new ParseContext());
764 } finally {
765 stream.close();
766 }
767 String content = handler.toString();
768 assertContains("Here is some red word Art", content);
769 }
770
771 /**
772 * Ensures that custom OOXML properties are extracted
773 */
774 @Test
775 public void testExcelCustomProperties() throws Exception {
776 InputStream input = OOXMLParserTest.class.getResourceAsStream(
777 "/test-documents/testEXCEL_custom_props.xlsx");
778 Metadata metadata = new Metadata();
779
780 try {
781 ContentHandler handler = new BodyContentHandler(-1);
782 ParseContext context = new ParseContext();
783 context.set(Locale.class, Locale.US);
784 new OOXMLParser().parse(input, handler, metadata, context);
785 } finally {
786 input.close();
787 }
788
789 assertEquals(
790 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
791 metadata.get(Metadata.CONTENT_TYPE));
792 assertEquals(null, metadata.get(TikaCoreProperties.CREATOR));
793 assertEquals(null, metadata.get(TikaCoreProperties.MODIFIER));
794 assertEquals("2006-09-12T15:06:44Z", metadata.get(TikaCoreProperties.CREATED));
795 assertEquals("2006-09-12T15:06:44Z", metadata.get(Metadata.CREATION_DATE));
796 assertEquals("2011-08-22T14:24:38Z", metadata.get(Metadata.LAST_MODIFIED));
797 assertEquals("2011-08-22T14:24:38Z", metadata.get(TikaCoreProperties.MODIFIED));
798 assertEquals("2011-08-22T14:24:38Z", metadata.get(Metadata.DATE));
799 assertEquals("Microsoft Excel", metadata.get(Metadata.APPLICATION_NAME));
800 assertEquals("Microsoft Excel", metadata.get(OfficeOpenXMLExtended.APPLICATION));
801 assertEquals("true", metadata.get("custom:myCustomBoolean"));
802 assertEquals("3", metadata.get("custom:myCustomNumber"));
803 assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
804 assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
805 assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
806 }
807
808 @Test
809 public void testWordCustomProperties() throws Exception {
810 InputStream input = OOXMLParserTest.class.getResourceAsStream(
811 "/test-documents/testWORD_custom_props.docx");
812 Metadata metadata = new Metadata();
813
814 try {
815 ContentHandler handler = new BodyContentHandler(-1);
816 ParseContext context = new ParseContext();
817 context.set(Locale.class, Locale.US);
818 new OOXMLParser().parse(input, handler, metadata, context);
819 } finally {
820 input.close();
821 }
822
823 assertEquals(
824 "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
825 metadata.get(Metadata.CONTENT_TYPE));
826 assertEquals("EJ04325S", metadata.get(TikaCoreProperties.CREATOR));
827 assertEquals("Etienne Jouvin", metadata.get(TikaCoreProperties.MODIFIER));
828 assertEquals("Etienne Jouvin", metadata.get(Metadata.LAST_AUTHOR));
829 assertEquals("2011-07-29T16:52:00Z", metadata.get(TikaCoreProperties.CREATED));
830 assertEquals("2011-07-29T16:52:00Z", metadata.get(Metadata.CREATION_DATE));
831 assertEquals("2012-01-03T22:14:00Z", metadata.get(TikaCoreProperties.MODIFIED));
832 assertEquals("2012-01-03T22:14:00Z", metadata.get(Metadata.DATE));
833 assertEquals("Microsoft Office Word",metadata.get(Metadata.APPLICATION_NAME));
834 assertEquals("Microsoft Office Word",metadata.get(OfficeOpenXMLExtended.APPLICATION));
835 assertEquals("1", metadata.get(Office.PAGE_COUNT));
836 assertEquals("2", metadata.get(Office.WORD_COUNT));
837 assertEquals("My Title", metadata.get(TikaCoreProperties.TITLE));
838 assertEquals("My Keyword", metadata.get(TikaCoreProperties.KEYWORDS));
839 assertEquals("Normal.dotm", metadata.get(Metadata.TEMPLATE));
840 assertEquals("Normal.dotm", metadata.get(OfficeOpenXMLExtended.TEMPLATE));
841 // TODO: Remove subject in Tika 2.0
842 assertEquals("My subject", metadata.get(Metadata.SUBJECT));
843 assertEquals("My subject", metadata.get(OfficeOpenXMLCore.SUBJECT));
844 assertEquals("EDF-DIT", metadata.get(TikaCoreProperties.PUBLISHER));
845 assertEquals("true", metadata.get("custom:myCustomBoolean"));
846 assertEquals("3", metadata.get("custom:myCustomNumber"));
847 assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
848 assertEquals("2010-12-30T23:00:00Z", metadata.get("custom:MyCustomDate"));
849 assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
850 }
851
852 @Test
853 public void testPowerPointCustomProperties() throws Exception {
854 InputStream input = OOXMLParserTest.class.getResourceAsStream(
855 "/test-documents/testPPT_custom_props.pptx");
856 Metadata metadata = new Metadata();
857
858 try {
859 ContentHandler handler = new BodyContentHandler(-1);
860 ParseContext context = new ParseContext();
861 context.set(Locale.class, Locale.US);
862 new OOXMLParser().parse(input, handler, metadata, context);
863 } finally {
864 input.close();
865 }
866
867 assertEquals(
868 "application/vnd.openxmlformats-officedocument.presentationml.presentation",
869 metadata.get(Metadata.CONTENT_TYPE));
870 assertEquals("JOUVIN ETIENNE", metadata.get(TikaCoreProperties.CREATOR));
871 assertEquals("EJ04325S", metadata.get(TikaCoreProperties.MODIFIER));
872 assertEquals("EJ04325S", metadata.get(Metadata.LAST_AUTHOR));
873 assertEquals("2011-08-22T13:30:53Z", metadata.get(TikaCoreProperties.CREATED));
874 assertEquals("2011-08-22T13:30:53Z", metadata.get(Metadata.CREATION_DATE));
875 assertEquals("2011-08-22T13:32:49Z", metadata.get(TikaCoreProperties.MODIFIED));
876 assertEquals("2011-08-22T13:32:49Z", metadata.get(Metadata.DATE));
877 assertEquals("1", metadata.get(Office.SLIDE_COUNT));
878 assertEquals("3", metadata.get(Office.WORD_COUNT));
879 assertEquals("Test extraction properties pptx", metadata.get(TikaCoreProperties.TITLE));
880 assertEquals("true", metadata.get("custom:myCustomBoolean"));
881 assertEquals("3", metadata.get("custom:myCustomNumber"));
882 assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
883 assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
884 assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
885 }
886
887 // TIKA-989:
888 @Test
889 public void testEmbeddedPDF() throws Exception {
890 InputStream input = OOXMLParserTest.class.getResourceAsStream(
891 "/test-documents/testWORD_embedded_pdf.docx");
892 Metadata metadata = new Metadata();
893 StringWriter sw = new StringWriter();
894 SAXTransformerFactory factory = (SAXTransformerFactory)
895 SAXTransformerFactory.newInstance();
896 TransformerHandler handler = factory.newTransformerHandler();
897 handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
898 handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
899 handler.setResult(new StreamResult(sw));
900
901 try {
902 new OOXMLParser().parse(input, handler, metadata, new ParseContext());
903 } finally {
904 input.close();
905 }
906 String xml = sw.toString();
907 int i = xml.indexOf("Here is the pdf file:");
908 int j = xml.indexOf("<div class=\"embedded\" id=\"rId5\"/>");
909 int k = xml.indexOf("Bye Bye");
910 int l = xml.indexOf("<div class=\"embedded\" id=\"rId6\"/>");
911 int m = xml.indexOf("Bye for real.");
912 assertTrue(i != -1);
913 assertTrue(j != -1);
914 assertTrue(k != -1);
915 assertTrue(l != -1);
916 assertTrue(m != -1);
917 assertTrue(i < j);
918 assertTrue(j < k);
919 assertTrue(k < l);
920 assertTrue(l < m);
921 }
922
923 // TIKA-997:
924 @Test
925 public void testEmbeddedZipInPPTX() throws Exception {
926 String xml = getXML("test_embedded_zip.pptx").xml;
927 int h = xml.indexOf("<div class=\"embedded\" id=\"slide1_rId3\" />");
928 int i = xml.indexOf("Send me a note");
929 int j = xml.indexOf("<div class=\"embedded\" id=\"slide2_rId4\" />");
930 int k = xml.indexOf("<p>No title</p>");
931 assertTrue(h != -1);
932 assertTrue(i != -1);
933 assertTrue(j != -1);
934 assertTrue(k != -1);
935 assertTrue(h < i);
936 assertTrue(i < j);
937 assertTrue(j < k);
938 }
939
940 // TIKA-1006
941 @Test
942 public void testWordNullStyle() throws Exception {
943 String xml = getXML("testWORD_null_style.docx").xml;
944 assertContains("Test av styrt dokument", xml);
945 }
946
947 /**
948 * TIKA-1044 - Handle word documents where parts of the
949 * text have no formatting or styles applied to them
950 */
951 @Test
952 public void testNoFormat() throws Exception {
953 ContentHandler handler = new BodyContentHandler();
954 Metadata metadata = new Metadata();
955
956 InputStream stream = WordParserTest.class.getResourceAsStream(
957 "/test-documents/testWORD_no_format.docx");
958 try {
959 new OOXMLParser().parse(stream, handler, metadata, new ParseContext());
960 } finally {
961 stream.close();
962 }
963
964 String content = handler.toString();
965 assertContains("This is a piece of text that causes an exception", content);
966 }
967
968 // TIKA-1005:
969 @Test
970 public void testTextInsideTextBox() throws Exception {
971 String xml = getXML("testWORD_text_box.docx").xml;
972 assertContains("This text is directly in the body of the document.", xml);
973 assertContains("This text is inside of a text box in the body of the document.", xml);
974 assertContains("This text is inside of a text box in the header of the document.", xml);
975 assertContains("This text is inside of a text box in the footer of the document.", xml);
976 }
977
978 // TIKA-1032:
979 @Test
980 public void testEmbeddedPPTXTwoSlides() throws Exception {
981 String xml = getXML("testPPT_embedded_two_slides.pptx").xml;
982 assertContains("<div class=\"embedded\" id=\"slide1_rId7\" />" , xml);
983 assertContains("<div class=\"embedded\" id=\"slide2_rId7\" />" , xml);
984 }
985
986 /**
987 * Test for missing text described in
988 * <a href="https://issues.apache.org/jira/browse/TIKA-1130">TIKA-1130</a>.
989 */
990 @Test
991 public void testMissingText() throws Exception {
992 Metadata metadata = new Metadata();
993 ContentHandler handler = new BodyContentHandler();
994 ParseContext context = new ParseContext();
995
996 InputStream input = getTestDocument("testWORD_missing_text.docx");
997 try {
998 parser.parse(input, handler, metadata, context);
999 assertEquals(
1000 "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
1001 metadata.get(Metadata.CONTENT_TYPE));
1002 assertTrue(handler.toString().contains("BigCompany"));
1003 assertTrue(handler.toString().contains("Seasoned"));
1004 } finally {
1005 input.close();
1006 }
1007 }
1008
1009 //TIKA-1100:
1010 @Test
1011 public void testExcelTextBox() throws Exception {
1012 Metadata metadata = new Metadata();
1013 ContentHandler handler = new BodyContentHandler();
1014 ParseContext context = new ParseContext();
1015 InputStream input = getTestDocument("testEXCEL_textbox.xlsx");
1016 parser.parse(input, handler, metadata, context);
1017 String content = handler.toString();
1018 assertContains("some autoshape", content);
1019 }
1020
1021 //TIKA-792; with room for future missing bean tests
1022 @Test
1023 public void testWordMissingOOXMLBeans() throws Exception{
1024 //If a bean is missing, POI prints stack trace to stderr
1025 String[] fileNames = new String[]{
1026 "testWORD_missing_ooxml_bean1.docx",//TIKA-792
1027 };
1028 PrintStream origErr = System.err;
1029 for (String fileName : fileNames){
1030 Metadata metadata = new Metadata();
1031 ContentHandler handler = new BodyContentHandler();
1032 ParseContext context = new ParseContext();
1033 InputStream input = getTestDocument(fileName);
1034
1035 //grab stderr
1036 ByteArrayOutputStream errContent = new ByteArrayOutputStream();
1037 System.setErr(new PrintStream(errContent));
1038 parser.parse(input, handler, metadata, context);
1039
1040 //return stderr
1041 System.setErr(origErr);
1042
1043 String err = errContent.toString();
1044 assertTrue(err.length() == 0);
1045 input.close();
1046 }
1047 }
1048
1049 //TIKA-817
1050 @Test
1051 public void testPPTXAutodate() throws Exception {
1052 //Following POI-52368, the stored date is extracted,
1053 //not the auto-generated date.
1054
1055 XMLResult result = getXML("testPPT_autodate.pptx");
1056 assertContains("<p>Now</p>\n"+
1057 "<p>2011-12-19 10:20:04 AM</p>\n", result.xml);
1058
1059 }
1060 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.mp3;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertTrue;
20
21 import java.io.ByteArrayInputStream;
22 import java.io.InputStream;
23
24 import org.apache.tika.metadata.Metadata;
25 import org.apache.tika.metadata.TikaCoreProperties;
26 import org.apache.tika.metadata.XMPDM;
27 import org.apache.tika.parser.AutoDetectParser;
28 import org.apache.tika.parser.ParseContext;
29 import org.apache.tika.parser.Parser;
30 import org.apache.tika.sax.BodyContentHandler;
31 import org.junit.Test;
32 import org.xml.sax.ContentHandler;
33
34 /**
35 * Test case for parsing mp3 files.
36 */
37 public class Mp3ParserTest {
38
39 /**
40 * Checks the duration of an MP3 file.
41 * @param metadata the metadata object
42 * @param expected the expected duration, rounded as seconds
43 */
44 private static void checkDuration(Metadata metadata, int expected) {
45 assertEquals("Wrong duration", expected,
46 Math.round(Float.valueOf(metadata.get(XMPDM.DURATION)) / 1000));
47 }
48
49 /**
50 * Test that with only ID3v1 tags, we get some information out
51 */
52 @Test
53 public void testMp3ParsingID3v1() throws Exception {
54 Parser parser = new AutoDetectParser(); // Should auto-detect!
55 ContentHandler handler = new BodyContentHandler();
56 Metadata metadata = new Metadata();
57
58 InputStream stream = Mp3ParserTest.class.getResourceAsStream(
59 "/test-documents/testMP3id3v1.mp3");
60 try {
61 parser.parse(stream, handler, metadata, new ParseContext());
62 } finally {
63 stream.close();
64 }
65
66 assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
67 assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
68 assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
69 assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
70
71 String content = handler.toString();
72 assertTrue(content.contains("Test Title"));
73 assertTrue(content.contains("Test Artist"));
74 assertTrue(content.contains("Test Album"));
75 assertTrue(content.contains("2008"));
76 assertTrue(content.contains("Test Comment"));
77 assertTrue(content.contains("Rock"));
78
79 assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
80 assertEquals("44100", metadata.get("samplerate"));
81 assertEquals("1", metadata.get("channels"));
82 checkDuration(metadata, 2);
83 }
84
85 /**
86 * Test that with only ID3v2 tags, we get the full
87 * set of information out.
88 */
89 @Test
90 public void testMp3ParsingID3v2() throws Exception {
91 Parser parser = new AutoDetectParser(); // Should auto-detect!
92 ContentHandler handler = new BodyContentHandler();
93 Metadata metadata = new Metadata();
94
95 InputStream stream = Mp3ParserTest.class.getResourceAsStream(
96 "/test-documents/testMP3id3v2.mp3");
97 try {
98 parser.parse(stream, handler, metadata, new ParseContext());
99 } finally {
100 stream.close();
101 }
102
103 // Check core properties
104 assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
105 assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
106 assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
107 assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
108
109 // Check the textual contents
110 String content = handler.toString();
111 assertTrue(content.contains("Test Title"));
112 assertTrue(content.contains("Test Artist"));
113 assertTrue(content.contains("Test Album"));
114 assertTrue(content.contains("2008"));
115 assertTrue(content.contains("Test Comment"));
116 assertTrue(content.contains("Rock"));
117
118 // Check un-typed audio properties
119 assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
120 assertEquals("44100", metadata.get("samplerate"));
121 assertEquals("1", metadata.get("channels"));
122
123 // Check XMPDM-typed audio properties
124 assertEquals("Test Album", metadata.get(XMPDM.ALBUM));
125 assertEquals("Test Artist", metadata.get(XMPDM.ARTIST));
126 assertEquals(null, metadata.get(XMPDM.COMPOSER));
127 assertEquals("2008", metadata.get(XMPDM.RELEASE_DATE));
128 assertEquals("Rock", metadata.get(XMPDM.GENRE));
129 assertEquals("XXX - ID3v1 Comment\nTest Comment", metadata.get(XMPDM.LOG_COMMENT.getName()));
130 assertEquals("1", metadata.get(XMPDM.TRACK_NUMBER));
131
132 assertEquals("44100", metadata.get(XMPDM.AUDIO_SAMPLE_RATE));
133 assertEquals("Mono", metadata.get(XMPDM.AUDIO_CHANNEL_TYPE));
134 assertEquals("MP3", metadata.get(XMPDM.AUDIO_COMPRESSOR));
135 checkDuration(metadata, 2);
136 }
137
138 /**
139 * Test that with both id3v2 and id3v1, we prefer the
140 * details from id3v2
141 */
142 @Test
143 public void testMp3ParsingID3v1v2() throws Exception {
144 Parser parser = new AutoDetectParser(); // Should auto-detect!
145 ContentHandler handler = new BodyContentHandler();
146 Metadata metadata = new Metadata();
147
148 InputStream stream = Mp3ParserTest.class.getResourceAsStream(
149 "/test-documents/testMP3id3v1_v2.mp3");
150 try {
151 parser.parse(stream, handler, metadata, new ParseContext());
152 } finally {
153 stream.close();
154 }
155
156 assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
157 assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
158 assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
159 assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
160
161 String content = handler.toString();
162 assertTrue(content.contains("Test Title"));
163 assertTrue(content.contains("Test Artist"));
164 assertTrue(content.contains("Test Album"));
165 assertTrue(content.contains("2008"));
166 assertTrue(content.contains("Test Comment"));
167 assertTrue(content.contains("Rock"));
168
169 assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
170 assertEquals("44100", metadata.get("samplerate"));
171 assertEquals("1", metadata.get("channels"));
172 checkDuration(metadata, 2);
173 }
174
175 /**
176 * Test that with only ID3v2 tags, of version 2.4, we get the full
177 * set of information out.
178 */
179 @Test
180 public void testMp3ParsingID3v24() throws Exception {
181 Parser parser = new AutoDetectParser(); // Should auto-detect!
182 ContentHandler handler = new BodyContentHandler();
183 Metadata metadata = new Metadata();
184
185 InputStream stream = Mp3ParserTest.class.getResourceAsStream(
186 "/test-documents/testMP3id3v24.mp3");
187 try {
188 parser.parse(stream, handler, metadata, new ParseContext());
189 } finally {
190 stream.close();
191 }
192
193 assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
194 assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
195 assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
196 assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
197
198 String content = handler.toString();
199 assertTrue(content.contains("Test Title"));
200 assertTrue(content.contains("Test Artist"));
201 assertTrue(content.contains("Test Album"));
202 assertTrue(content.contains("2008"));
203 assertTrue(content.contains("Test Comment"));
204 assertTrue(content.contains("Rock"));
205
206 assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
207 assertEquals("44100", metadata.get("samplerate"));
208 assertEquals("1", metadata.get("channels"));
209 checkDuration(metadata, 2);
210 }
211
212 /**
213 * Tests that a file with characters not in the ISO 8859-1
214 * range is correctly handled
215 */
216 @Test
217 public void testMp3ParsingID3i18n() throws Exception {
218 Parser parser = new AutoDetectParser(); // Should auto-detect!
219 ContentHandler handler = new BodyContentHandler();
220 Metadata metadata = new Metadata();
221
222 InputStream stream = Mp3ParserTest.class.getResourceAsStream(
223 "/test-documents/testMP3i18n.mp3");
224 try {
225 parser.parse(stream, handler, metadata, new ParseContext());
226 } finally {
227 stream.close();
228 }
229
230 assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
231 assertEquals("Une chason en Fran\u00e7ais", metadata.get(TikaCoreProperties.TITLE));
232 assertEquals("Test Artist \u2468\u2460", metadata.get(TikaCoreProperties.CREATOR));
233 assertEquals("Test Artist \u2468\u2460", metadata.get(Metadata.AUTHOR));
234 assertEquals("Test Artist \u2468\u2460", metadata.get(XMPDM.ARTIST));
235 assertEquals("Test Album \u2460\u2468", metadata.get(XMPDM.ALBUM));
236
237 assertEquals(
238 "Eng - Comment Desc\nThis is a \u1357\u2468\u2460 Comment",
239 metadata.get(XMPDM.LOG_COMMENT)
240 );
241
242 assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
243 assertEquals("44100", metadata.get("samplerate"));
244 assertEquals("1", metadata.get("channels"));
245 checkDuration(metadata, 2);
246 }
247
248
249 /**
250 * Tests that a file with both lyrics and
251 * ID3v2 tags gets both extracted correctly
252 */
253 @Test
254 public void testMp3ParsingLyrics() throws Exception {
255 Parser parser = new AutoDetectParser(); // Should auto-detect!
256 ContentHandler handler = new BodyContentHandler();
257 Metadata metadata = new Metadata();
258
259 // Note - our test file has a lyrics tag, but lacks any
260 // lyrics in the tags, so we can't test that bit
261 // TODO Find a better sample file
262
263 InputStream stream = Mp3ParserTest.class.getResourceAsStream(
264 "/test-documents/testMP3lyrics.mp3");
265 try {
266 parser.parse(stream, handler, metadata, new ParseContext());
267 } finally {
268 stream.close();
269 }
270
271 assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
272 assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
273 assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
274 assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
275
276 String content = handler.toString();
277 assertTrue(content.contains("Test Title"));
278 assertTrue(content.contains("Test Artist"));
279 assertTrue(content.contains("Test Album"));
280 assertTrue(content.contains("2008"));
281 assertTrue(content.contains("Test Comment"));
282 assertTrue(content.contains("Rock"));
283
284 assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
285 assertEquals("44100", metadata.get("samplerate"));
286 assertEquals("2", metadata.get("channels"));
287 checkDuration(metadata, 1);
288 }
289
290 @Test
291 public void testID3v2Frame() throws Exception {
292 byte[] empty = new byte[] {
293 0x49, 0x44, 0x33, 3, 1, 0,
294 0, 0, 0, 0
295 };
296
297 assertEquals(11, ID3v2Frame.getInt(new byte[] {0,0,0,0x0b}));
298 assertEquals(257, ID3v2Frame.getInt(new byte[] {0,0,1,1}));
299
300 ID3v2Frame f = (ID3v2Frame)
301 ID3v2Frame.createFrameIfPresent(new ByteArrayInputStream(empty));
302 assertEquals(3, f.getMajorVersion());
303 assertEquals(1, f.getMinorVersion());
304 assertEquals(0, f.getFlags());
305 assertEquals(0, f.getLength());
306 assertEquals(0, f.getData().length);
307
308 assertEquals("", ID3v2Frame.getTagString(f.getData(), 0, 0));
309 assertEquals("", ID3v2Frame.getTagString(new byte[] {0,0,0,0}, 0, 3));
310 assertEquals("A", ID3v2Frame.getTagString(new byte[] {(byte)'A',0,0,0}, 0, 3));
311 }
312
313 /**
314 * This test will do nothing, unless you've downloaded the
315 * mp3 file from TIKA-424 - the file cannot be
316 * distributed with Tika.
317 * This test will check for the complicated set of ID3v2.4
318 * tags.
319 */
320 @Test
321 public void testTIKA424() throws Exception {
322 Parser parser = new AutoDetectParser(); // Should auto-detect!
323 ContentHandler handler = new BodyContentHandler();
324 Metadata metadata = new Metadata();
325
326 InputStream stream = Mp3ParserTest.class.getResourceAsStream(
327 "/test-documents/test2.mp3");
328 if(stream == null) {
329 // You haven't downloaded the file
330 // Skip the test
331 return;
332 }
333
334 try {
335 parser.parse(stream, handler, metadata, new ParseContext());
336 } finally {
337 stream.close();
338 }
339
340 assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
341 assertEquals("Plus loin vers l'ouest", metadata.get(TikaCoreProperties.TITLE));
342 assertEquals("Merzhin", metadata.get(TikaCoreProperties.CREATOR));
343 assertEquals("Merzhin", metadata.get(Metadata.AUTHOR));
344
345 String content = handler.toString();
346 assertTrue(content.contains("Plus loin vers l'ouest"));
347
348 assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
349 assertEquals("44100", metadata.get("samplerate"));
350 assertEquals("2", metadata.get("channels"));
351 }
352
353 /**
354 * This tests that we can handle without errors (but perhaps not
355 * all content) a file with a very very large ID3 frame that
356 * has been truncated before the end of the ID3 tags.
357 * In this case, it is a file with JPEG data in the ID3, which
358 * is trunacted before the end of the JPEG bit of the ID3 frame.
359 */
360 @Test
361 public void testTIKA474() throws Exception {
362 Parser parser = new AutoDetectParser(); // Should auto-detect!
363 ContentHandler handler = new BodyContentHandler();
364 Metadata metadata = new Metadata();
365
366 InputStream stream = Mp3ParserTest.class.getResourceAsStream(
367 "/test-documents/testMP3truncated.mp3");
368
369
370 try {
371 parser.parse(stream, handler, metadata, new ParseContext());
372 } finally {
373 stream.close();
374 }
375
376 // Check we could get the headers from the start
377 assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
378 assertEquals("Girl you have no faith in medicine", metadata.get(TikaCoreProperties.TITLE));
379 assertEquals("The White Stripes", metadata.get(TikaCoreProperties.CREATOR));
380 assertEquals("The White Stripes", metadata.get(Metadata.AUTHOR));
381
382 String content = handler.toString();
383 assertTrue(content.contains("Girl you have no faith in medicine"));
384 assertTrue(content.contains("The White Stripes"));
385 assertTrue(content.contains("Elephant"));
386 assertTrue(content.contains("2003"));
387
388 // File lacks any audio frames, so we can't know these
389 assertEquals(null, metadata.get("version"));
390 assertEquals(null, metadata.get("samplerate"));
391 assertEquals(null, metadata.get("channels"));
392 }
393
394 // TIKA-1024
395 @Test
396 public void testNakedUTF16BOM() throws Exception {
397 Parser parser = new AutoDetectParser(); // Should auto-detect!
398 ContentHandler handler = new BodyContentHandler();
399 Metadata metadata = new Metadata();
400
401 InputStream stream = Mp3ParserTest.class.getResourceAsStream(
402 "/test-documents/testNakedUTF16BOM.mp3");
403
404 try {
405 parser.parse(stream, handler, metadata, new ParseContext());
406 } finally {
407 stream.close();
408 }
409 assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
410 assertEquals("", metadata.get(XMPDM.GENRE));
411 }
412 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.mp3;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertFalse;
20 import static org.junit.Assert.assertNotNull;
21 import static org.junit.Assert.assertNull;
22
23 import java.io.ByteArrayInputStream;
24 import java.io.ByteArrayOutputStream;
25 import java.io.IOException;
26 import java.io.OutputStream;
27
28 import org.junit.After;
29 import org.junit.Test;
30
31 /**
32 * Test class for {@code MpegStream}.
33 */
34 public class MpegStreamTest
35 {
36 /** The stream to be tested. */
37 private MpegStream stream;
38
39 @After
40 public void tearDown() throws Exception
41 {
42 if (stream != null)
43 {
44 stream.close();
45 }
46 }
47
48 /**
49 * Tests whether the default test header can be found in a stream.
50 *
51 * @param bos the stream
52 * @throws IOException if an error occurs
53 */
54 private void checkDefaultHeader(ByteArrayOutputStream bos)
55 throws IOException
56 {
57 ByteArrayInputStream in = new ByteArrayInputStream(bos.toByteArray());
58 stream = new MpegStream(in);
59 AudioFrame header = stream.nextFrame();
60 assertNotNull("No header found", header);
61 assertEquals("Wrong MPEG version", AudioFrame.MPEG_V2,
62 header.getVersionCode());
63 assertEquals("Wrong layer", AudioFrame.LAYER_3, header.getLayer());
64 assertEquals("Wrong bit rate", 80000, header.getBitRate());
65 assertEquals("Wrong sample rate", 24000, header.getSampleRate());
66 }
67
68 /**
69 * Writes the given byte the given number of times into an output stream.
70 *
71 * @param out the output stream
72 * @param value the value to write
73 * @param count the number of bytes to write
74 * @throws IOException if an error occurs
75 */
76 private static void writeBytes(OutputStream out, int value, int count)
77 throws IOException
78 {
79 for (int i = 0; i < count; i++)
80 {
81 out.write(value);
82 }
83 }
84
85 /**
86 * Writes a frame header in the given output stream.
87 *
88 * @param out the output stream
89 * @param b2 byte 2 of the header
90 * @param b3 byte 3 of the header
91 * @param b4 byte 4 of the header
92 * @throws IOException if an error occurs
93 */
94 private static void writeFrame(OutputStream out, int b2, int b3, int b4)
95 throws IOException
96 {
97 out.write(0xFF);
98 out.write(b2);
99 out.write(b3);
100 out.write(b4);
101 }
102
103 /**
104 * Tests whether an audio frame header can be found somewhere in a stream.
105 */
106 @Test
107 public void testSearchNextFrame() throws IOException
108 {
109 ByteArrayOutputStream bos = new ByteArrayOutputStream();
110 writeBytes(bos, 0xFF, 32);
111 writeBytes(bos, 0, 16);
112 writeBytes(bos, 0xFF, 8);
113 bos.write(0xF3);
114 bos.write(0x96);
115 bos.write(0);
116 checkDefaultHeader(bos);
117 }
118
119 /**
120 * Tests whether invalid frame headers are detected and skipped.
121 */
122 @Test
123 public void testSearchNextFrameInvalid() throws IOException
124 {
125 ByteArrayOutputStream bos = new ByteArrayOutputStream();
126 writeFrame(bos, 0xEB, 0x96, 0);
127 writeFrame(bos, 0xF9, 0x96, 0);
128 writeFrame(bos, 0xF3, 0, 0);
129 writeFrame(bos, 0xF3, 0xF0, 0);
130 writeFrame(bos, 0xF3, 0x7C, 0);
131 writeFrame(bos, 0xF3, 0x96, 0);
132 checkDefaultHeader(bos);
133 }
134
135 /**
136 * Tests a search for another frame which is interrupted because the stream
137 * ends.
138 */
139 @Test
140 public void testSeachNextFrameEOS() throws IOException
141 {
142 ByteArrayOutputStream bos = new ByteArrayOutputStream();
143 bos.write(0xFF);
144 bos.write(0xFF);
145 bos.write(0xF3);
146 bos.write(0x96);
147 ByteArrayInputStream in = new ByteArrayInputStream(bos.toByteArray());
148 stream = new MpegStream(in);
149 assertNull("Got a frame", stream.nextFrame());
150 }
151
152 /**
153 * Tries to skip a frame if no current header is available.
154 */
155 @Test
156 public void testSkipNoCurrentHeader() throws IOException
157 {
158 ByteArrayOutputStream bos = new ByteArrayOutputStream();
159 bos.write("This is a test".getBytes());
160 ByteArrayInputStream in = new ByteArrayInputStream(bos.toByteArray());
161 stream = new MpegStream(in);
162 assertFalse("Wrong result", stream.skipFrame());
163 }
164 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.mp4;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertTrue;
20
21 import java.io.InputStream;
22
23 import org.apache.tika.io.TikaInputStream;
24 import org.apache.tika.metadata.Metadata;
25 import org.apache.tika.metadata.TikaCoreProperties;
26 import org.apache.tika.metadata.XMPDM;
27 import org.apache.tika.parser.AutoDetectParser;
28 import org.apache.tika.parser.ParseContext;
29 import org.apache.tika.parser.Parser;
30 import org.apache.tika.sax.BodyContentHandler;
31 import org.junit.Test;
32 import org.xml.sax.ContentHandler;
33
34 /**
35 * Test case for parsing mp4 files.
36 */
37 public class MP4ParserTest {
38 /**
39 * Test that we can extract information from
40 * a M4A MP4 Audio file
41 */
42 @Test
43 public void testMP4ParsingAudio() throws Exception {
44 Parser parser = new AutoDetectParser(); // Should auto-detect!
45 ContentHandler handler = new BodyContentHandler();
46 Metadata metadata = new Metadata();
47
48 InputStream stream = MP4ParserTest.class.getResourceAsStream(
49 "/test-documents/testMP4.m4a");
50 try {
51 parser.parse(stream, handler, metadata, new ParseContext());
52 } finally {
53 stream.close();
54 }
55
56 // Check core properties
57 assertEquals("audio/mp4", metadata.get(Metadata.CONTENT_TYPE));
58 assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
59 assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
60 assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
61 assertEquals("2012-01-28T18:39:18Z", metadata.get(TikaCoreProperties.CREATED));
62 assertEquals("2012-01-28T18:39:18Z", metadata.get(Metadata.CREATION_DATE));
63 assertEquals("2012-01-28T18:40:25Z", metadata.get(TikaCoreProperties.MODIFIED));
64 assertEquals("2012-01-28T18:40:25Z", metadata.get(Metadata.DATE));
65
66 // Check the textual contents
67 String content = handler.toString();
68 assertTrue(content.contains("Test Title"));
69 assertTrue(content.contains("Test Artist"));
70 assertTrue(content.contains("Test Album"));
71 assertTrue(content.contains("2008"));
72 assertTrue(content.contains("Test Comment"));
73 assertTrue(content.contains("Test Genre"));
74
75 // Check XMPDM-typed audio properties
76 assertEquals("Test Album", metadata.get(XMPDM.ALBUM));
77 assertEquals("Test Artist", metadata.get(XMPDM.ARTIST));
78 assertEquals("Test Composer", metadata.get(XMPDM.COMPOSER));
79 assertEquals("2008", metadata.get(XMPDM.RELEASE_DATE));
80 assertEquals("Test Genre", metadata.get(XMPDM.GENRE));
81 assertEquals("Test Comments", metadata.get(XMPDM.LOG_COMMENT.getName()));
82 assertEquals("1", metadata.get(XMPDM.TRACK_NUMBER));
83
84 assertEquals("44100", metadata.get(XMPDM.AUDIO_SAMPLE_RATE));
85 //assertEquals("Stereo", metadata.get(XMPDM.AUDIO_CHANNEL_TYPE)); // TODO Extract
86 assertEquals("M4A", metadata.get(XMPDM.AUDIO_COMPRESSOR));
87
88
89 // Check again by file, rather than stream
90 TikaInputStream tstream = TikaInputStream.get(
91 MP4ParserTest.class.getResourceAsStream("/test-documents/testMP4.m4a"));
92 tstream.getFile();
93 try {
94 parser.parse(tstream, handler, metadata, new ParseContext());
95 } finally {
96 tstream.close();
97 }
98 }
99
100 // TODO Test a MP4 Video file
101 // TODO Test an old QuickTime Video File
102 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.netcdf;
17
18 //JDK imports
19 import static org.junit.Assert.assertEquals;
20
21 import java.io.InputStream;
22
23
24
25 //TIKA imports
26 import org.apache.tika.metadata.Metadata;
27 import org.apache.tika.metadata.TikaCoreProperties;
28 import org.apache.tika.parser.ParseContext;
29 import org.apache.tika.parser.Parser;
30 import org.apache.tika.sax.BodyContentHandler;
31 import org.junit.Test;
32 import org.xml.sax.ContentHandler;
33
34 /**
35 * Test cases to exercise the {@link NetCDFParser}.
36 *
37 */
38 public class NetCDFParserTest {
39
40 @Test
41 public void testParseGlobalMetadata() throws Exception {
42 if(System.getProperty("java.version").startsWith("1.5")) {
43 return;
44 }
45
46 Parser parser = new NetCDFParser();
47 ContentHandler handler = new BodyContentHandler();
48 Metadata metadata = new Metadata();
49
50 InputStream stream = NetCDFParser.class
51 .getResourceAsStream("/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc");
52 try {
53 parser.parse(stream, handler, metadata, new ParseContext());
54 } finally {
55 stream.close();
56 }
57
58 assertEquals(metadata.get(TikaCoreProperties.TITLE),
59 "model output prepared for IPCC AR4");
60 assertEquals(metadata.get(Metadata.CONTACT), "ccsm@ucar.edu");
61 assertEquals(metadata.get(Metadata.PROJECT_ID),
62 "IPCC Fourth Assessment");
63 assertEquals(metadata.get(Metadata.CONVENTIONS), "CF-1.0");
64 assertEquals(metadata.get(Metadata.REALIZATION), "1");
65 assertEquals(metadata.get(Metadata.EXPERIMENT_ID),
66 "720 ppm stabilization experiment (SRESA1B)");
67
68 }
69
70 }
0 /**
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.odf;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertTrue;
20
21 import java.io.InputStream;
22
23 import org.apache.tika.TikaTest;
24 import org.apache.tika.metadata.Metadata;
25 import org.apache.tika.metadata.Office;
26 import org.apache.tika.metadata.OfficeOpenXMLCore;
27 import org.apache.tika.metadata.TikaCoreProperties;
28 import org.apache.tika.parser.AutoDetectParser;
29 import org.apache.tika.parser.ParseContext;
30 import org.apache.tika.parser.Parser;
31 import org.apache.tika.parser.opendocument.OpenOfficeParser;
32 import org.apache.tika.sax.BodyContentHandler;
33 import org.junit.Test;
34 import org.xml.sax.ContentHandler;
35
36 public class ODFParserTest extends TikaTest {
37 /**
38 * For now, allow us to run some tests against both
39 * the old and the new parser
40 */
41 private Parser[] getParsers() {
42 return new Parser[] {
43 new OpenDocumentParser(),
44 new OpenOfficeParser()
45 };
46 }
47
48 @Test
49 public void testOO3() throws Exception {
50 for (Parser parser : getParsers()) {
51 InputStream input = ODFParserTest.class.getResourceAsStream(
52 "/test-documents/testODFwithOOo3.odt");
53 try {
54 Metadata metadata = new Metadata();
55 ContentHandler handler = new BodyContentHandler();
56 parser.parse(input, handler, metadata, new ParseContext());
57
58 assertEquals(
59 "application/vnd.oasis.opendocument.text",
60 metadata.get(Metadata.CONTENT_TYPE));
61
62 String content = handler.toString();
63 assertTrue(content.contains("Tika is part of the Lucene project."));
64 assertTrue(content.contains("Solr"));
65 assertTrue(content.contains("one embedded"));
66 assertTrue(content.contains("Rectangle Title"));
67 assertTrue(content.contains("a blue background and dark border"));
68 } finally {
69 input.close();
70 }
71 }
72 }
73
74 @Test
75 public void testOO2() throws Exception {
76 for (Parser parser : getParsers()) {
77 InputStream input = ODFParserTest.class.getResourceAsStream(
78 "/test-documents/testOpenOffice2.odt");
79 try {
80 Metadata metadata = new Metadata();
81 ContentHandler handler = new BodyContentHandler();
82 parser.parse(input, handler, metadata, new ParseContext());
83
84 assertEquals(
85 "application/vnd.oasis.opendocument.text",
86 metadata.get(Metadata.CONTENT_TYPE));
87 assertEquals("en-US", metadata.get(Metadata.LANGUAGE));
88 assertEquals("PT1M7S", metadata.get(Metadata.EDIT_TIME));
89 assertEquals(
90 "NeoOffice/2.2$Unix OpenOffice.org_project/680m18$Build-9161",
91 metadata.get("generator"));
92
93 // Check date metadata, both old-style and new-style
94 assertEquals("2007-09-14T11:07:10", metadata.get(TikaCoreProperties.MODIFIED));
95 assertEquals("2007-09-14T11:07:10", metadata.get(Metadata.MODIFIED));
96 assertEquals("2007-09-14T11:07:10", metadata.get(Metadata.DATE));
97 assertEquals("2007-09-14T11:06:08", metadata.get(TikaCoreProperties.CREATED));
98 assertEquals("2007-09-14T11:06:08", metadata.get(Metadata.CREATION_DATE));
99
100 // Check the document statistics
101 assertEquals("1", metadata.get(Office.PAGE_COUNT));
102 assertEquals("1", metadata.get(Office.PARAGRAPH_COUNT));
103 assertEquals("14", metadata.get(Office.WORD_COUNT));
104 assertEquals("78", metadata.get(Office.CHARACTER_COUNT));
105 assertEquals("0", metadata.get(Office.TABLE_COUNT));
106 assertEquals("0", metadata.get(Office.OBJECT_COUNT));
107 assertEquals("0", metadata.get(Office.IMAGE_COUNT));
108
109 // Check the Tika-1.0 style document statistics
110 assertEquals("1", metadata.get(Metadata.PAGE_COUNT));
111 assertEquals("1", metadata.get(Metadata.PARAGRAPH_COUNT));
112 assertEquals("14", metadata.get(Metadata.WORD_COUNT));
113 assertEquals("78", metadata.get(Metadata.CHARACTER_COUNT));
114 assertEquals("0", metadata.get(Metadata.TABLE_COUNT));
115 assertEquals("0", metadata.get(Metadata.OBJECT_COUNT));
116 assertEquals("0", metadata.get(Metadata.IMAGE_COUNT));
117
118 // Check the very old style statistics (these will be removed shortly)
119 assertEquals("0", metadata.get("nbTab"));
120 assertEquals("0", metadata.get("nbObject"));
121 assertEquals("0", metadata.get("nbImg"));
122 assertEquals("1", metadata.get("nbPage"));
123 assertEquals("1", metadata.get("nbPara"));
124 assertEquals("14", metadata.get("nbWord"));
125 assertEquals("78", metadata.get("nbCharacter"));
126
127 // Custom metadata tags present but without values
128 assertEquals(null, metadata.get("custom:Info 1"));
129 assertEquals(null, metadata.get("custom:Info 2"));
130 assertEquals(null, metadata.get("custom:Info 3"));
131 assertEquals(null, metadata.get("custom:Info 4"));
132
133 String content = handler.toString();
134 assertTrue(content.contains(
135 "This is a sample Open Office document,"
136 + " written in NeoOffice 2.2.1 for the Mac."));
137 } finally {
138 input.close();
139 }
140 }
141 }
142
143 /**
144 * Similar to {@link #testXMLParser()}, but using a different
145 * OO2 file with different metadata in it
146 */
147 @Test
148 public void testOO2Metadata() throws Exception {
149 InputStream input = ODFParserTest.class.getResourceAsStream(
150 "/test-documents/testOpenOffice2.odf");
151 try {
152 Metadata metadata = new Metadata();
153 ContentHandler handler = new BodyContentHandler();
154 new OpenDocumentParser().parse(input, handler, metadata);
155
156 assertEquals(
157 "application/vnd.oasis.opendocument.formula",
158 metadata.get(Metadata.CONTENT_TYPE));
159 assertEquals(null, metadata.get(TikaCoreProperties.MODIFIED));
160 assertEquals("2006-01-27T11:55:22", metadata.get(Metadata.CREATION_DATE));
161 assertEquals("The quick brown fox jumps over the lazy dog",
162 metadata.get(TikaCoreProperties.TITLE));
163 assertEquals("Gym class featuring a brown fox and lazy dog",
164 metadata.get(TikaCoreProperties.DESCRIPTION));
165 assertEquals("Gym class featuring a brown fox and lazy dog",
166 metadata.get(OfficeOpenXMLCore.SUBJECT));
167 assertEquals("Gym class featuring a brown fox and lazy dog",
168 metadata.get(Metadata.SUBJECT));
169 assertEquals("PT0S", metadata.get(Metadata.EDIT_TIME));
170 assertEquals("1", metadata.get("editing-cycles"));
171 assertEquals(
172 "OpenOffice.org/2.2$Win32 OpenOffice.org_project/680m14$Build-9134",
173 metadata.get("generator"));
174 assertEquals("Pangram, fox, dog", metadata.get(Metadata.KEYWORDS));
175
176 // User defined metadata
177 assertEquals("Text 1", metadata.get("custom:Info 1"));
178 assertEquals("2", metadata.get("custom:Info 2"));
179 assertEquals("false", metadata.get("custom:Info 3"));
180 assertEquals("true", metadata.get("custom:Info 4"));
181
182 // No statistics present
183 assertEquals(null, metadata.get(Metadata.PAGE_COUNT));
184 assertEquals(null, metadata.get(Metadata.PARAGRAPH_COUNT));
185 assertEquals(null, metadata.get(Metadata.WORD_COUNT));
186 assertEquals(null, metadata.get(Metadata.CHARACTER_COUNT));
187 assertEquals(null, metadata.get(Metadata.TABLE_COUNT));
188 assertEquals(null, metadata.get(Metadata.OBJECT_COUNT));
189 assertEquals(null, metadata.get(Metadata.IMAGE_COUNT));
190 assertEquals(null, metadata.get("nbTab"));
191 assertEquals(null, metadata.get("nbObject"));
192 assertEquals(null, metadata.get("nbImg"));
193 assertEquals(null, metadata.get("nbPage"));
194 assertEquals(null, metadata.get("nbPara"));
195 assertEquals(null, metadata.get("nbWord"));
196 assertEquals(null, metadata.get("nbCharacter"));
197
198 // Note - contents of maths files not currently supported
199 String content = handler.toString();
200 assertEquals("", content);
201 } finally {
202 input.close();
203 }
204 }
205
206 /**
207 * Similar to {@link #testXMLParser()}, but using an OO3 file
208 */
209 @Test
210 public void testOO3Metadata() throws Exception {
211 InputStream input = ODFParserTest.class.getResourceAsStream(
212 "/test-documents/testODFwithOOo3.odt");
213 try {
214 Metadata metadata = new Metadata();
215 ContentHandler handler = new BodyContentHandler();
216 new OpenDocumentParser().parse(input, handler, metadata);
217
218 assertEquals(
219 "application/vnd.oasis.opendocument.text",
220 metadata.get(Metadata.CONTENT_TYPE));
221 assertEquals("2009-10-05T21:22:38", metadata.get(TikaCoreProperties.MODIFIED));
222 assertEquals("2009-10-05T19:04:01", metadata.get(TikaCoreProperties.CREATED));
223 assertEquals("2009-10-05T19:04:01", metadata.get(Metadata.CREATION_DATE));
224 assertEquals("Apache Tika", metadata.get(TikaCoreProperties.TITLE));
225 assertEquals("Test document", metadata.get(OfficeOpenXMLCore.SUBJECT));
226 assertEquals("Test document", metadata.get(Metadata.SUBJECT));
227 assertEquals("A rather complex document", metadata.get(TikaCoreProperties.DESCRIPTION));
228 assertEquals("Bart Hanssens", metadata.get(TikaCoreProperties.CREATOR));
229 assertEquals("Bart Hanssens", metadata.get("initial-creator"));
230 assertEquals("2", metadata.get("editing-cycles"));
231 assertEquals("PT02H03M24S", metadata.get(Metadata.EDIT_TIME));
232 assertEquals(
233 "OpenOffice.org/3.1$Unix OpenOffice.org_project/310m19$Build-9420",
234 metadata.get("generator"));
235 assertEquals("Apache, Lucene, Tika", metadata.get(Metadata.KEYWORDS));
236
237 // User defined metadata
238 assertEquals("Bart Hanssens", metadata.get("custom:Editor"));
239 assertEquals(null, metadata.get("custom:Info 2"));
240 assertEquals(null, metadata.get("custom:Info 3"));
241 assertEquals(null, metadata.get("custom:Info 4"));
242
243 // Check the document statistics
244 assertEquals("2", metadata.get(Office.PAGE_COUNT));
245 assertEquals("13", metadata.get(Office.PARAGRAPH_COUNT));
246 assertEquals("54", metadata.get(Office.WORD_COUNT));
247 assertEquals("351", metadata.get(Office.CHARACTER_COUNT));
248 assertEquals("0", metadata.get(Office.TABLE_COUNT));
249 assertEquals("2", metadata.get(Office.OBJECT_COUNT));
250 assertEquals("0", metadata.get(Office.IMAGE_COUNT));
251
252 // Check the Tika-1.0 style document statistics
253 assertEquals("2", metadata.get(Metadata.PAGE_COUNT));
254 assertEquals("13", metadata.get(Metadata.PARAGRAPH_COUNT));
255 assertEquals("54", metadata.get(Metadata.WORD_COUNT));
256 assertEquals("351", metadata.get(Metadata.CHARACTER_COUNT));
257 assertEquals("0", metadata.get(Metadata.TABLE_COUNT));
258 assertEquals("2", metadata.get(Metadata.OBJECT_COUNT));
259 assertEquals("0", metadata.get(Metadata.IMAGE_COUNT));
260
261 // Check the old style statistics (these will be removed shortly)
262 assertEquals("0", metadata.get("nbTab"));
263 assertEquals("2", metadata.get("nbObject"));
264 assertEquals("0", metadata.get("nbImg"));
265 assertEquals("2", metadata.get("nbPage"));
266 assertEquals("13", metadata.get("nbPara"));
267 assertEquals("54", metadata.get("nbWord"));
268 assertEquals("351", metadata.get("nbCharacter"));
269
270 String content = handler.toString();
271 assertTrue(content.contains(
272 "Apache Tika Tika is part of the Lucene project."
273 ));
274 } finally {
275 input.close();
276 }
277 }
278
279 @Test
280 public void testODPMasterFooter() throws Exception {
281 InputStream input = ODFParserTest.class.getResourceAsStream(
282 "/test-documents/testMasterFooter.odp");
283 try {
284 Metadata metadata = new Metadata();
285 ContentHandler handler = new BodyContentHandler();
286 new AutoDetectParser().parse(input, handler, metadata);
287
288 String content = handler.toString();
289 assertContains("Master footer is here", content);
290 } finally {
291 input.close();
292 }
293 }
294
295 @Test
296 public void testODTFooter() throws Exception {
297 InputStream input = ODFParserTest.class.getResourceAsStream(
298 "/test-documents/testFooter.odt");
299 try {
300 Metadata metadata = new Metadata();
301 ContentHandler handler = new BodyContentHandler();
302 new AutoDetectParser().parse(input, handler, metadata);
303
304 String content = handler.toString();
305 assertContains("Here is some text...", content);
306 assertContains("Here is some text on page 2", content);
307 assertContains("Here is footer text", content);
308 } finally {
309 input.close();
310 }
311 }
312
313 @Test
314 public void testODSFooter() throws Exception {
315 InputStream input = ODFParserTest.class.getResourceAsStream(
316 "/test-documents/testFooter.ods");
317 try {
318 Metadata metadata = new Metadata();
319 ContentHandler handler = new BodyContentHandler();
320 new AutoDetectParser().parse(input, handler, metadata);
321
322 String content = handler.toString();
323 assertContains("Here is a footer in the center area", content);
324 } finally {
325 input.close();
326 }
327 }
328 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.pdf;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertFalse;
20 import static org.junit.Assert.assertNull;
21 import static org.junit.Assert.assertTrue;
22
23 import java.io.File;
24 import java.io.FileInputStream;
25 import java.io.InputStream;
26 import java.util.HashSet;
27 import java.util.Set;
28
29 import org.apache.tika.TikaTest;
30 import org.apache.tika.extractor.ContainerExtractor;
31 import org.apache.tika.extractor.ParserContainerExtractor;
32 import org.apache.tika.io.TikaInputStream;
33 import org.apache.tika.metadata.Metadata;
34 import org.apache.tika.metadata.OfficeOpenXMLCore;
35 import org.apache.tika.metadata.TikaCoreProperties;
36 import org.apache.tika.mime.MediaType;
37 import org.apache.tika.parser.AutoDetectParser;
38 import org.apache.tika.parser.ParseContext;
39 import org.apache.tika.parser.Parser;
40 import org.apache.tika.parser.PasswordProvider;
41 import org.apache.tika.parser.microsoft.AbstractPOIContainerExtractionTest.TrackingHandler;
42 import org.apache.tika.sax.BodyContentHandler;
43 import org.junit.Test;
44 import org.xml.sax.ContentHandler;
45 /**
46 * Test case for parsing pdf files.
47 */
48 public class PDFParserTest extends TikaTest {
49
50 public static final MediaType TYPE_TEXT = MediaType.TEXT_PLAIN;
51 public static final MediaType TYPE_EMF = MediaType.application("x-emf");
52 public static final MediaType TYPE_PDF = MediaType.application("pdf");
53 public static final MediaType TYPE_DOCX = MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document");
54 public static final MediaType TYPE_DOC = MediaType.application("msword");
55
56 @Test
57 public void testPdfParsing() throws Exception {
58 Parser parser = new AutoDetectParser(); // Should auto-detect!
59 Metadata metadata = new Metadata();
60
61 InputStream stream = PDFParserTest.class.getResourceAsStream(
62 "/test-documents/testPDF.pdf");
63
64 String content = getText(stream, parser, metadata);
65
66 assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
67 assertEquals("Bertrand Delacr\u00e9taz", metadata.get(TikaCoreProperties.CREATOR));
68 assertEquals("Bertrand Delacr\u00e9taz", metadata.get(Metadata.AUTHOR));
69 assertEquals("Firefox", metadata.get(TikaCoreProperties.CREATOR_TOOL));
70 assertEquals("Apache Tika - Apache Tika", metadata.get(TikaCoreProperties.TITLE));
71
72 // Can't reliably test dates yet - see TIKA-451
73 // assertEquals("Sat Sep 15 10:02:31 BST 2007", metadata.get(Metadata.CREATION_DATE));
74 // assertEquals("Sat Sep 15 10:02:31 BST 2007", metadata.get(Metadata.LAST_MODIFIED));
75
76 assertTrue(content.contains("Apache Tika"));
77 assertTrue(content.contains("Tika - Content Analysis Toolkit"));
78 assertTrue(content.contains("incubator"));
79 assertTrue(content.contains("Apache Software Foundation"));
80 // testing how the end of one paragraph is separated from start of the next one
81 assertTrue("should have word boundary after headline",
82 !content.contains("ToolkitApache"));
83 assertTrue("should have word boundary between paragraphs",
84 !content.contains("libraries.Apache"));
85 }
86
87 @Test
88 public void testCustomMetadata() throws Exception {
89 Parser parser = new AutoDetectParser(); // Should auto-detect!
90 Metadata metadata = new Metadata();
91
92 InputStream stream = PDFParserTest.class.getResourceAsStream(
93 "/test-documents/testPDF-custommetadata.pdf");
94
95 String content = getText(stream, parser, metadata);
96
97 assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
98 assertEquals("Document author", metadata.get(TikaCoreProperties.CREATOR));
99 assertEquals("Document author", metadata.get(Metadata.AUTHOR));
100 assertEquals("Document title", metadata.get(TikaCoreProperties.TITLE));
101
102 assertEquals("Custom Value", metadata.get("Custom Property"));
103
104 assertEquals("Array Entry 1", metadata.get("Custom Array"));
105 assertEquals(2, metadata.getValues("Custom Array").length);
106 assertEquals("Array Entry 1", metadata.getValues("Custom Array")[0]);
107 assertEquals("Array Entry 2", metadata.getValues("Custom Array")[1]);
108
109 assertTrue(content.contains("Hello World!"));
110 }
111
112 /**
113 * PDFs can be "protected" with the default password. This means
114 * they're encrypted (potentially both text and metadata),
115 * but we can decrypt them easily.
116 */
117 @Test
118 public void testProtectedPDF() throws Exception {
119 Parser parser = new AutoDetectParser(); // Should auto-detect!
120 ContentHandler handler = new BodyContentHandler();
121 Metadata metadata = new Metadata();
122 ParseContext context = new ParseContext();
123
124 InputStream stream = PDFParserTest.class.getResourceAsStream(
125 "/test-documents/testPDF_protected.pdf");
126 try {
127 parser.parse(stream, handler, metadata, context);
128 } finally {
129 stream.close();
130 }
131
132 assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
133 assertEquals("The Bank of England", metadata.get(TikaCoreProperties.CREATOR));
134 assertEquals("The Bank of England", metadata.get(Metadata.AUTHOR));
135 assertEquals("Speeches by Andrew G Haldane", metadata.get(OfficeOpenXMLCore.SUBJECT));
136 assertEquals("Speeches by Andrew G Haldane", metadata.get(Metadata.SUBJECT));
137 assertEquals("Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009", metadata.get(TikaCoreProperties.TITLE));
138
139 String content = handler.toString();
140 assertTrue(content.contains("RETHINKING THE FINANCIAL NETWORK"));
141 assertTrue(content.contains("On 16 November 2002"));
142 assertTrue(content.contains("In many important respects"));
143
144
145 // Try again with an explicit empty password
146 handler = new BodyContentHandler();
147 metadata = new Metadata();
148
149 context = new ParseContext();
150 context.set(PasswordProvider.class, new PasswordProvider() {
151 public String getPassword(Metadata metadata) {
152 return "";
153 }
154 });
155
156 stream = PDFParserTest.class.getResourceAsStream(
157 "/test-documents/testPDF_protected.pdf");
158 try {
159 parser.parse(stream, handler, metadata, context);
160 } finally {
161 stream.close();
162 }
163
164 assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
165 assertEquals("The Bank of England", metadata.get(TikaCoreProperties.CREATOR));
166 assertEquals("Speeches by Andrew G Haldane", metadata.get(OfficeOpenXMLCore.SUBJECT));
167 assertEquals("Speeches by Andrew G Haldane", metadata.get(Metadata.SUBJECT));
168 assertEquals("Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009", metadata.get(TikaCoreProperties.TITLE));
169
170 assertTrue(content.contains("RETHINKING THE FINANCIAL NETWORK"));
171 assertTrue(content.contains("On 16 November 2002"));
172 assertTrue(content.contains("In many important respects"));
173 }
174
175 @Test
176 public void testTwoTextBoxes() throws Exception {
177 Parser parser = new AutoDetectParser(); // Should auto-detect!
178 InputStream stream = PDFParserTest.class.getResourceAsStream(
179 "/test-documents/testPDFTwoTextBoxes.pdf");
180 String content = getText(stream, parser);
181 content = content.replaceAll("\\s+"," ");
182 assertTrue(content.contains("Left column line 1 Left column line 2 Right column line 1 Right column line 2"));
183 }
184
185 @Test
186 public void testVarious() throws Exception {
187 Parser parser = new AutoDetectParser(); // Should auto-detect!
188 Metadata metadata = new Metadata();
189 InputStream stream = PDFParserTest.class.getResourceAsStream(
190 "/test-documents/testPDFVarious.pdf");
191
192 String content = getText(stream, parser, metadata);
193 //content = content.replaceAll("\\s+"," ");
194 assertContains("Footnote appears here", content);
195 assertContains("This is a footnote.", content);
196 assertContains("This is the header text.", content);
197 assertContains("This is the footer text.", content);
198 assertContains("Here is a text box", content);
199 assertContains("Bold", content);
200 assertContains("italic", content);
201 assertContains("underline", content);
202 assertContains("superscript", content);
203 assertContains("subscript", content);
204 assertContains("Here is a citation:", content);
205 assertContains("Figure 1 This is a caption for Figure 1", content);
206 assertContains("(Kramer)", content);
207 assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+"," "));
208 assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+"," "));
209 assertContains("This is a hyperlink", content);
210 assertContains("Here is a list:", content);
211 for(int row=1;row<=3;row++) {
212 //assertContains("·\tBullet " + row, content);
213 //assertContains("\u00b7\tBullet " + row, content);
214 assertContains("Bullet " + row, content);
215 }
216 assertContains("Here is a numbered list:", content);
217 for(int row=1;row<=3;row++) {
218 //assertContains(row + ")\tNumber bullet " + row, content);
219 assertContains(row + ") Number bullet " + row, content);
220 }
221
222 for(int row=1;row<=2;row++) {
223 for(int col=1;col<=3;col++) {
224 assertContains("Row " + row + " Col " + col, content);
225 }
226 }
227
228 assertContains("Keyword1 Keyword2", content);
229 assertEquals("Keyword1 Keyword2",
230 metadata.get(Metadata.KEYWORDS));
231
232 assertContains("Subject is here", content);
233 assertEquals("Subject is here",
234 metadata.get(OfficeOpenXMLCore.SUBJECT));
235 assertEquals("Subject is here",
236 metadata.get(Metadata.SUBJECT));
237
238 assertContains("Suddenly some Japanese text:", content);
239 // Special version of (GHQ)
240 assertContains("\uff08\uff27\uff28\uff31\uff09", content);
241 // 6 other characters
242 assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", content);
243
244 assertContains("And then some Gothic text:", content);
245 // TODO: I saved the word doc as a PDF, but that
246 // process somehow, apparently lost the gothic
247 // chars, so we cannot test this here:
248 //assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
249 }
250
251 @Test
252 public void testAnnotations() throws Exception {
253 Parser parser = new AutoDetectParser(); // Should auto-detect!
254 InputStream stream = getResourceAsStream("/test-documents/testAnnotations.pdf");
255 String content = getText(stream, parser);
256 content = content.replaceAll("[\\s\u00a0]+"," ");
257 assertContains("Here is some text", content);
258 assertContains("Here is a comment", content);
259
260 // Test w/ annotation text disabled:
261 PDFParser pdfParser = new PDFParser();
262 pdfParser.getPDFParserConfig().setExtractAnnotationText(false);
263 stream = getResourceAsStream("/test-documents/testAnnotations.pdf");
264 content = getText(stream, pdfParser);
265 content = content.replaceAll("[\\s\u00a0]+"," ");
266 assertContains("Here is some text", content);
267 assertEquals(-1, content.indexOf("Here is a comment"));
268
269 // annotation text disabled through parsecontext
270 ParseContext context = new ParseContext();
271 PDFParserConfig config = new PDFParserConfig();
272 config.setExtractAnnotationText(false);
273 context.set(PDFParserConfig.class, config);
274 stream = getResourceAsStream("/test-documents/testAnnotations.pdf");
275 content = getText(stream, parser, context);
276 content = content.replaceAll("[\\s\u00a0]+"," ");
277 assertContains("Here is some text", content);
278 assertEquals(-1, content.indexOf("Here is a comment"));
279
280
281 // TIKA-738: make sure no extra </p> tags
282 String xml = getXML("testAnnotations.pdf").xml;
283 assertEquals(substringCount("<p>", xml),
284 substringCount("</p>", xml));
285 }
286
287 // TIKA-981
288 @Test
289 public void testPopupAnnotation() throws Exception {
290 Parser parser = new AutoDetectParser(); // Should auto-detect!
291 InputStream stream = getResourceAsStream("/test-documents/testPopupAnnotation.pdf");
292 String content = getText(stream, parser);
293 assertContains("this is the note", content);
294 assertContains("igalsh", content);
295 }
296
297 @Test
298 public void testEmbeddedPDFs() throws Exception {
299 String xml = getXML("testPDFPackage.pdf").xml;
300 assertContains("PDF1", xml);
301 assertContains("PDF2", xml);
302 }
303
304 private static int substringCount(String needle, String haystack) {
305 int upto = -1;
306 int count = 0;
307 while(true) {
308 final int next = haystack.indexOf(needle, upto);
309 if (next == -1) {
310 break;
311 }
312 count++;
313 upto = next+1;
314 }
315
316 return count;
317 }
318
319 @Test
320 public void testPageNumber() throws Exception {
321 final XMLResult result = getXML("testPageNumber.pdf");
322 final String content = result.xml.replaceAll("\\s+","");
323 assertContains("<p>1</p>", content);
324 }
325
326 /**
327 * Test to ensure that Links are extracted from the text
328 *
329 * Note - the PDF contains the text "This is a hyperlink" which
330 * a hyperlink annotation, linking to the tika site, on it. This
331 * test will need updating when we're able to apply the annotation
332 * to the text itself, rather than following on afterwards as now
333 */
334 @Test
335 public void testLinks() throws Exception {
336 final XMLResult result = getXML("testPDFVarious.pdf");
337 assertContains("<div class=\"annotation\"><a href=\"http://tika.apache.org/\" /></div>", result.xml);
338 }
339
340 @Test
341 public void testDisableAutoSpace() throws Exception {
342 PDFParser parser = new PDFParser();
343 parser.getPDFParserConfig().setEnableAutoSpace(false);
344 InputStream stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf");
345 String content = getText(stream, parser);
346 content = content.replaceAll("[\\s\u00a0]+"," ");
347 // Text is correct when autoSpace is off:
348 assertContains("Here is some formatted text", content);
349
350 parser.getPDFParserConfig().setEnableAutoSpace(true);
351 stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf");
352 content = getText(stream, parser);
353 content = content.replaceAll("[\\s\u00a0]+"," ");
354 // Text is correct when autoSpace is off:
355
356 // Text has extra spaces when autoSpace is on
357 assertEquals(-1, content.indexOf("Here is some formatted text"));
358
359 //now try with autodetect
360 Parser autoParser = new AutoDetectParser();
361 ParseContext context = new ParseContext();
362 PDFParserConfig config = new PDFParserConfig();
363 context.set(PDFParserConfig.class, config);
364 //default is true
365 stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf");
366 content = getText(stream, autoParser, context);
367 content = content.replaceAll("[\\s\u00a0]+"," ");
368 // Text has extra spaces when autoSpace is on
369 assertEquals(-1, content.indexOf("Here is some formatted text"));
370
371 config.setEnableAutoSpace(false);
372
373 stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf");
374 content = getText(stream, parser, context);
375 content = content.replaceAll("[\\s\u00a0]+"," ");
376 // Text is correct when autoSpace is off:
377 assertContains("Here is some formatted text", content);
378
379 }
380
381 @Test
382 public void testDuplicateOverlappingText() throws Exception {
383 PDFParser parser = new PDFParser();
384 InputStream stream = getResourceAsStream("/test-documents/testOverlappingText.pdf");
385 // Default is false (keep overlapping text):
386 String content = getText(stream, parser);
387 assertContains("Text the first timeText the second time", content);
388
389 parser.getPDFParserConfig().setSuppressDuplicateOverlappingText(true);
390 stream = getResourceAsStream("/test-documents/testOverlappingText.pdf");
391 content = getText(stream, parser);
392 // "Text the first" was dedup'd:
393 assertContains("Text the first timesecond time", content);
394
395 //now try with autodetect
396 Parser autoParser = new AutoDetectParser();
397 ParseContext context = new ParseContext();
398 PDFParserConfig config = new PDFParserConfig();
399 context.set(PDFParserConfig.class, config);
400 stream = getResourceAsStream("/test-documents/testOverlappingText.pdf");
401 // Default is false (keep overlapping text):
402 content = getText(stream, autoParser, context);
403 assertContains("Text the first timeText the second time", content);
404
405 config.setSuppressDuplicateOverlappingText(true);
406 stream = getResourceAsStream("/test-documents/testOverlappingText.pdf");
407 content = getText(stream, autoParser, context);
408 // "Text the first" was dedup'd:
409 assertContains("Text the first timesecond time", content);
410
411 }
412
413 @Test
414 public void testSortByPosition() throws Exception {
415 PDFParser parser = new PDFParser();
416 parser.getPDFParserConfig().setEnableAutoSpace(false);
417 InputStream stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
418 // Default is false (do not sort):
419 String content = getText(stream, parser);
420 content = content.replaceAll("\\s+", " ");
421 assertContains("Left column line 1 Left column line 2 Right column line 1 Right column line 2", content);
422
423 parser.getPDFParserConfig().setSortByPosition(true);
424 stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
425 content = getText(stream, parser);
426 content = content.replaceAll("\\s+", " ");
427 // Column text is now interleaved:
428 assertContains("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2", content);
429
430 //now try setting autodetect via parsecontext
431 AutoDetectParser autoParser = new AutoDetectParser();
432 ParseContext context = new ParseContext();
433 PDFParserConfig config = new PDFParserConfig();
434 context.set(PDFParserConfig.class, config);
435 stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
436 // Default is false (do not sort):
437 content = getText(stream, autoParser, context);
438 content = content.replaceAll("\\s+", " ");
439 assertContains("Left column line 1 Left column line 2 Right column line 1 Right column line 2", content);
440
441 config.setSortByPosition(true);
442 context.set(PDFParserConfig.class, config);
443 stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
444 content = getText(stream, parser);
445 content = content.replaceAll("\\s+", " ");
446 // Column text is now interleaved:
447 assertContains("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2", content);
448
449 }
450
451 // TIKA-1035
452 @Test
453 public void testBookmarks() throws Exception {
454 String xml = getXML("testPDF_bookmarks.pdf").xml;
455 int i = xml.indexOf("Denmark bookmark is here");
456 int j = xml.indexOf("</body>");
457 assertTrue(i != -1);
458 assertTrue(j != -1);
459 assertTrue(i < j);
460 }
461
462 //TIKA-1124
463 @Test
464 public void testEmbeddedPDFEmbeddingAnotherDocument() throws Exception {
465 /* format of test doc:
466 docx/
467 pdf/
468 docx
469 */
470 Parser parser = new AutoDetectParser(); // Should auto-detect!
471 ContentHandler handler = new BodyContentHandler();
472 Metadata metadata = new Metadata();
473 ParseContext context = new ParseContext();
474 String content = "";
475 InputStream stream = null;
476 try{
477 context.set(org.apache.tika.parser.Parser.class, parser);
478 stream = getResourceAsStream("/test-documents/testPDFEmbeddingAndEmbedded.docx");
479 parser.parse(stream, handler, metadata, context);
480 content = handler.toString();
481 } finally {
482 stream.close();
483 }
484 int outerHaystack = content.indexOf("Outer_haystack");
485 int pdfHaystack = content.indexOf("pdf_haystack");
486 int needle = content.indexOf("Needle");
487 assertTrue(outerHaystack > -1);
488 assertTrue(pdfHaystack > -1);
489 assertTrue(needle > -1);
490 assertTrue(needle > pdfHaystack && pdfHaystack > outerHaystack);
491
492 //plagiarized from POIContainerExtractionTest. Thank you!
493 TrackingHandler tracker = new TrackingHandler();
494 TikaInputStream tis;
495 ContainerExtractor ex = new ParserContainerExtractor();
496 try{
497 tis= TikaInputStream.get(getResourceAsStream("/test-documents/testPDFEmbeddingAndEmbedded.docx"));
498 ex.extract(tis, ex, tracker);
499 } finally {
500 stream.close();
501 }
502 assertEquals(true, ex.isSupported(tis));
503 assertEquals(3, tracker.filenames.size());
504 assertEquals(3, tracker.mediaTypes.size());
505 assertEquals("image1.emf", tracker.filenames.get(0));
506 assertNull(tracker.filenames.get(1));
507 assertEquals("My first attachment", tracker.filenames.get(2));
508 assertEquals(TYPE_EMF, tracker.mediaTypes.get(0));
509 assertEquals(TYPE_PDF, tracker.mediaTypes.get(1));
510 assertEquals(TYPE_DOCX, tracker.mediaTypes.get(2));
511 }
512
513 /**
514 * tests for equality between traditional sequential parser
515 * and newer nonsequential parser.
516 *
517 * TODO: more testing
518 */
519 @Test
520 public void testSequentialParser() throws Exception{
521 Parser defaultParser = new AutoDetectParser();
522 Parser sequentialParser = new AutoDetectParser();
523 ParseContext context = new ParseContext();
524 PDFParserConfig config = new PDFParserConfig();
525 config.setUseNonSequentialParser(true);
526 context.set(PDFParserConfig.class, config);
527
528 File testDocs = new File(this.getClass().getResource("/test-documents").toURI());
529 int pdfs = 0;
530 Set<String> knownMetadataDiffs = new HashSet<String>();
531 //PDFBox-1792/Tika-1203
532 knownMetadataDiffs.add("testAnnotations.pdf");
533 //PDFBox-1806
534 knownMetadataDiffs.add("test_acroForm2.pdf");
535
536 //empty for now
537 Set<String> knownContentDiffs = new HashSet<String>();
538
539 for (File f : testDocs.listFiles()){
540 if (! f.getName().toLowerCase().endsWith(".pdf")){
541 continue;
542 }
543
544 pdfs++;
545 Metadata defaultMetadata = new Metadata();
546 String defaultContent = getText(new FileInputStream(f), defaultParser, defaultMetadata);
547
548 Metadata sequentialMetadata = new Metadata();
549 String sequentialContent = getText(new FileInputStream(f), sequentialParser, context, sequentialMetadata);
550
551 if (knownContentDiffs.contains(f.getName())){
552 assertFalse(f.getName(), defaultContent.equals(sequentialContent));
553 } else {
554 assertEquals(f.getName(), defaultContent, sequentialContent);
555 }
556
557 //skip this one file.
558 if (knownMetadataDiffs.contains(f.getName())){
559 assertFalse(f.getName(), defaultMetadata.equals(sequentialMetadata));
560 } else {
561 assertEquals(f.getName(), defaultMetadata, sequentialMetadata);
562 }
563 }
564 //make sure nothing went wrong with getting the resource to test-documents
565 //This will require modification with each new pdf test.
566 //If this is too annoying, we can turn it off.
567 assertEquals("Number of pdf files tested", 16, pdfs);
568 }
569
570
571 // TIKA-973
572 //commented out until test documents that are unambiguously
573 //consistent with Apache License v2.0 are contributed.
574 //TODO: add back test for AcroForm extraction; test document should include
575 //recursive forms
576 /* public void testAcroForm() throws Exception{
577 Parser p = new AutoDetectParser();
578 ParseContext context = new ParseContext();
579 InputStream stream = getResourceAsStream("/test-documents/testPDF_acroForm1.pdf");
580 String txt = getText(stream, p, context);
581 stream.close();
582
583 //simple first level form contents
584 assertContains("to: John Doe", txt);
585 //checkbox
586 assertContains("xpackaging: Yes", txt);
587
588 //this guarantees that the form processor
589 //worked recursively at least once...i.e. it didn't just
590 //take the first form
591 stream = getResourceAsStream("/test-documents/testPDF_acroForm2.pdf");
592 txt = getText(stream, p, context);
593 stream.close();
594 assertContains("123 Main St.", txt);
595
596
597 //now test with nonsequential parser
598 PDFParserConfig config = new PDFParserConfig();
599 config.setUseNonSequentialParser(true);
600 context.set(PDFParserConfig.class, config);
601 stream = getResourceAsStream("/test-documents/testPDF_acroForm1.pdf");
602 txt = getText(stream, p, context);
603 stream.close();
604
605 //simple first level form contents
606 assertContains("to: John Doe", txt);
607 //checkbox
608 assertContains("xpackaging: Yes", txt);
609
610 //this guarantees that the form processor
611 //worked recursively at least once...i.e. it didn't just
612 //take the first form
613 stream = getResourceAsStream("/test-documents/testPDF_acroForm2.pdf");
614 txt = getText(stream, p, context);
615 assertContains("123 Main St.", txt);
616 stream.close();
617 }
618 */
619
620 //TIKA-1226
621 public void testSignatureInAcroForm() throws Exception{
622 //The current test doc does not contain any content in the signature area.
623 //This just tests that a RuntimeException is not thrown.
624 //TODO: find a better test file for this issue.
625 String xml = getXML("/testPDF_acroform3.pdf").xml;
626 assertTrue("found", (xml.indexOf("<li>aTextField: TIKA-1226</li>") > -1));
627 }
628
629 //TIKA-1228
630 public void testEmbeddedFilesInChildren() throws Exception {
631 String xml = getXML("/testPDF_childAttachments.pdf").xml;
632 //"regressiveness" exists only in Unit10.doc not in the container pdf document
633 assertTrue(xml.contains("regressiveness"));
634
635 TrackingHandler tracker = new TrackingHandler();
636 TikaInputStream tis = null;
637 ContainerExtractor ex = new ParserContainerExtractor();
638 try{
639 tis= TikaInputStream.get(
640 getResourceAsStream("/test-documents/testPDF_childAttachments.pdf"));
641 ex.extract(tis, ex, tracker);
642 } finally {
643 if (tis != null){
644 tis.close();
645 }
646 }
647 assertEquals(2, tracker.filenames.size());
648 assertEquals(2, tracker.mediaTypes.size());
649 assertEquals("Press Quality(1).joboptions", tracker.filenames.get(0));
650 assertEquals("Unit10.doc", tracker.filenames.get(1));
651 assertEquals(TYPE_TEXT, tracker.mediaTypes.get(0));
652 assertEquals(TYPE_DOC, tracker.mediaTypes.get(1));
653 }
654 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.pkg;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.util.ArrayList;
21 import java.util.List;
22 import java.util.Set;
23
24 import org.apache.tika.TikaTest;
25 import org.apache.tika.exception.TikaException;
26 import org.apache.tika.metadata.Metadata;
27 import org.apache.tika.mime.MediaType;
28 import org.apache.tika.parser.AbstractParser;
29 import org.apache.tika.parser.AutoDetectParser;
30 import org.apache.tika.parser.ParseContext;
31 import org.apache.tika.parser.Parser;
32 import org.junit.Before;
33 import org.xml.sax.ContentHandler;
34 import org.xml.sax.SAXException;
35
36 /**
37 * Parent class for all Package based Test cases
38 */
39 public abstract class AbstractPkgTest extends TikaTest {
40 protected ParseContext trackingContext;
41 protected ParseContext recursingContext;
42
43 protected Parser autoDetectParser;
44 protected EmbeddedTrackingParser tracker;
45
46 @Before
47 public void setUp() throws Exception {
48 tracker = new EmbeddedTrackingParser();
49 trackingContext = new ParseContext();
50 trackingContext.set(Parser.class, tracker);
51
52 autoDetectParser = new AutoDetectParser();
53 recursingContext = new ParseContext();
54 recursingContext.set(Parser.class, autoDetectParser);
55 }
56
57
58 @SuppressWarnings("serial")
59 protected static class EmbeddedTrackingParser extends AbstractParser {
60 protected List<String> filenames = new ArrayList<String>();
61 protected List<String> mediatypes = new ArrayList<String>();
62 protected byte[] lastSeenStart;
63
64 public void reset() {
65 filenames.clear();
66 mediatypes.clear();
67 }
68
69 public Set<MediaType> getSupportedTypes(ParseContext context) {
70 // Cheat!
71 return (new AutoDetectParser()).getSupportedTypes(context);
72 }
73
74 public void parse(InputStream stream, ContentHandler handler,
75 Metadata metadata, ParseContext context) throws IOException,
76 SAXException, TikaException {
77 filenames.add(metadata.get(Metadata.RESOURCE_NAME_KEY));
78 mediatypes.add(metadata.get(Metadata.CONTENT_TYPE));
79
80 lastSeenStart = new byte[32];
81 stream.read(lastSeenStart);
82 }
83
84 }
85 }
0 /**
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.parser.pkg;
18
19 import static org.junit.Assert.assertEquals;
20 import static org.junit.Assert.assertNull;
21 import static org.junit.Assert.assertTrue;
22
23 import java.io.InputStream;
24
25 import org.apache.tika.metadata.Metadata;
26 import org.apache.tika.parser.AutoDetectParser;
27 import org.apache.tika.parser.Parser;
28 import org.apache.tika.sax.BodyContentHandler;
29 import org.junit.Test;
30 import org.xml.sax.ContentHandler;
31
32 public class ArParserTest extends AbstractPkgTest {
33 @Test
34 public void testArParsing() throws Exception {
35 Parser parser = new AutoDetectParser();
36
37 ContentHandler handler = new BodyContentHandler();
38 Metadata metadata = new Metadata();
39
40 InputStream stream = ArParserTest.class
41 .getResourceAsStream("/test-documents/testARofText.ar");
42 try {
43 parser.parse(stream, handler, metadata, recursingContext);
44 } finally {
45 stream.close();
46 }
47
48 assertEquals("application/x-archive",
49 metadata.get(Metadata.CONTENT_TYPE));
50 String content = handler.toString();
51 assertTrue(content.contains("testTXT.txt"));
52 assertTrue(content.contains("Test d'indexation de Txt"));
53 assertTrue(content.contains("http://www.apache.org"));
54
55 stream = ArParserTest.class
56 .getResourceAsStream("/test-documents/testARofSND.ar");
57 try {
58 parser.parse(stream, handler, metadata, recursingContext);
59 } finally {
60 stream.close();
61 }
62
63 assertEquals("application/x-archive",
64 metadata.get(Metadata.CONTENT_TYPE));
65 content = handler.toString();
66 assertTrue(content.contains("testAU.au"));
67 }
68
69 /**
70 * Tests that the ParseContext parser is correctly fired for all the
71 * embedded entries.
72 */
73 @Test
74 public void testEmbedded() throws Exception {
75 Parser parser = new AutoDetectParser(); // Should auto-detect!
76 ContentHandler handler = new BodyContentHandler();
77 Metadata metadata = new Metadata();
78
79 InputStream stream = ArParserTest.class
80 .getResourceAsStream("/test-documents/testARofText.ar");
81 try {
82 parser.parse(stream, handler, metadata, trackingContext);
83 } finally {
84 stream.close();
85 }
86
87 assertEquals(1, tracker.filenames.size());
88 assertEquals(1, tracker.mediatypes.size());
89
90 assertEquals("testTXT.txt", tracker.filenames.get(0));
91
92 for (String type : tracker.mediatypes) {
93 assertNull(type);
94 }
95
96 tracker.reset();
97 stream = ArParserTest.class
98 .getResourceAsStream("/test-documents/testARofSND.ar");
99 try {
100 parser.parse(stream, handler, metadata, trackingContext);
101 } finally {
102 stream.close();
103 }
104
105 assertEquals(1, tracker.filenames.size());
106 assertEquals(1, tracker.mediatypes.size());
107 assertEquals("testAU.au", tracker.filenames.get(0));
108
109 for (String type : tracker.mediatypes) {
110 assertNull(type);
111 }
112 }
113 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.pkg;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertTrue;
20
21 import java.io.InputStream;
22
23 import org.apache.tika.metadata.Metadata;
24 import org.apache.tika.parser.AutoDetectParser;
25 import org.apache.tika.parser.Parser;
26 import org.apache.tika.sax.BodyContentHandler;
27 import org.junit.Test;
28 import org.xml.sax.ContentHandler;
29
30 /**
31 * Test case for parsing bzip2 files.
32 */
33 public class Bzip2ParserTest extends AbstractPkgTest {
34
35 @Test
36 public void testBzip2Parsing() throws Exception {
37 Parser parser = new AutoDetectParser(); // Should auto-detect!
38 ContentHandler handler = new BodyContentHandler();
39 Metadata metadata = new Metadata();
40
41 InputStream stream = Bzip2ParserTest.class.getResourceAsStream(
42 "/test-documents/test-documents.tbz2");
43 try {
44 parser.parse(stream, handler, metadata, recursingContext);
45 } finally {
46 stream.close();
47 }
48
49 assertEquals("application/x-bzip2", metadata.get(Metadata.CONTENT_TYPE));
50 String content = handler.toString();
51 assertTrue(content.contains("test-documents/testEXCEL.xls"));
52 assertTrue(content.contains("Sample Excel Worksheet"));
53 assertTrue(content.contains("test-documents/testHTML.html"));
54 assertTrue(content.contains("Test Indexation Html"));
55 assertTrue(content.contains("test-documents/testOpenOffice2.odt"));
56 assertTrue(content.contains("This is a sample Open Office document"));
57 assertTrue(content.contains("test-documents/testPDF.pdf"));
58 assertTrue(content.contains("Apache Tika"));
59 assertTrue(content.contains("test-documents/testPPT.ppt"));
60 assertTrue(content.contains("Sample Powerpoint Slide"));
61 assertTrue(content.contains("test-documents/testRTF.rtf"));
62 assertTrue(content.contains("indexation Word"));
63 assertTrue(content.contains("test-documents/testTXT.txt"));
64 assertTrue(content.contains("Test d'indexation de Txt"));
65 assertTrue(content.contains("test-documents/testWORD.doc"));
66 assertTrue(content.contains("This is a sample Microsoft Word Document"));
67 assertTrue(content.contains("test-documents/testXML.xml"));
68 assertTrue(content.contains("Rida Benjelloun"));
69 }
70
71
72 /**
73 * Tests that the ParseContext parser is correctly
74 * fired for all the embedded entries.
75 */
76 @Test
77 public void testEmbedded() throws Exception {
78 Parser parser = new AutoDetectParser(); // Should auto-detect!
79 ContentHandler handler = new BodyContentHandler();
80 Metadata metadata = new Metadata();
81
82 InputStream stream = ZipParserTest.class.getResourceAsStream(
83 "/test-documents/test-documents.tbz2");
84 try {
85 parser.parse(stream, handler, metadata, trackingContext);
86 } finally {
87 stream.close();
88 }
89
90 // Should find a single entry, for the (compressed) tar file
91 assertEquals(1, tracker.filenames.size());
92 assertEquals(1, tracker.mediatypes.size());
93
94 assertEquals(null, tracker.filenames.get(0));
95 assertEquals(null, tracker.mediatypes.get(0));
96
97 // Tar file starts with the directory name
98 assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, "ASCII"));
99 }
100 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.pkg;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertTrue;
20
21 import java.io.InputStream;
22
23 import org.apache.tika.metadata.Metadata;
24 import org.apache.tika.parser.AutoDetectParser;
25 import org.apache.tika.parser.Parser;
26 import org.apache.tika.sax.BodyContentHandler;
27 import org.junit.Test;
28 import org.xml.sax.ContentHandler;
29
30 /**
31 * Test case for parsing gzip files.
32 */
33 public class GzipParserTest extends AbstractPkgTest {
34
35 @Test
36 public void testGzipParsing() throws Exception {
37 Parser parser = new AutoDetectParser(); // Should auto-detect!
38 ContentHandler handler = new BodyContentHandler();
39 Metadata metadata = new Metadata();
40
41 InputStream stream = GzipParserTest.class.getResourceAsStream(
42 "/test-documents/test-documents.tgz");
43 try {
44 parser.parse(stream, handler, metadata, recursingContext);
45 } finally {
46 stream.close();
47 }
48
49 assertEquals("application/x-gzip", metadata.get(Metadata.CONTENT_TYPE));
50 String content = handler.toString();
51 assertTrue(content.contains("test-documents/testEXCEL.xls"));
52 assertTrue(content.contains("Sample Excel Worksheet"));
53 assertTrue(content.contains("test-documents/testHTML.html"));
54 assertTrue(content.contains("Test Indexation Html"));
55 assertTrue(content.contains("test-documents/testOpenOffice2.odt"));
56 assertTrue(content.contains("This is a sample Open Office document"));
57 assertTrue(content.contains("test-documents/testPDF.pdf"));
58 assertTrue(content.contains("Apache Tika"));
59 assertTrue(content.contains("test-documents/testPPT.ppt"));
60 assertTrue(content.contains("Sample Powerpoint Slide"));
61 assertTrue(content.contains("test-documents/testRTF.rtf"));
62 assertTrue(content.contains("indexation Word"));
63 assertTrue(content.contains("test-documents/testTXT.txt"));
64 assertTrue(content.contains("Test d'indexation de Txt"));
65 assertTrue(content.contains("test-documents/testWORD.doc"));
66 assertTrue(content.contains("This is a sample Microsoft Word Document"));
67 assertTrue(content.contains("test-documents/testXML.xml"));
68 assertTrue(content.contains("Rida Benjelloun"));
69 }
70
71 /**
72 * Tests that the ParseContext parser is correctly
73 * fired for all the embedded entries.
74 */
75 @Test
76 public void testEmbedded() throws Exception {
77 Parser parser = new AutoDetectParser(); // Should auto-detect!
78 ContentHandler handler = new BodyContentHandler();
79 Metadata metadata = new Metadata();
80
81 InputStream stream = ZipParserTest.class.getResourceAsStream(
82 "/test-documents/test-documents.tgz");
83 try {
84 parser.parse(stream, handler, metadata, trackingContext);
85 } finally {
86 stream.close();
87 }
88
89 // Should find a single entry, for the (compressed) tar file
90 assertEquals(1, tracker.filenames.size());
91 assertEquals(1, tracker.mediatypes.size());
92
93 assertEquals(null, tracker.filenames.get(0));
94 assertEquals(null, tracker.mediatypes.get(0));
95
96 // Tar file starts with the directory name
97 assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, "ASCII"));
98 }
99
100 @Test
101 public void testSvgzParsing() throws Exception {
102 Parser parser = new AutoDetectParser(); // Should auto-detect!
103 ContentHandler handler = new BodyContentHandler();
104 Metadata metadata = new Metadata();
105
106 InputStream stream = GzipParserTest.class.getResourceAsStream(
107 "/test-documents/testSVG.svgz");
108 try {
109 parser.parse(stream, handler, metadata, recursingContext);
110 } finally {
111 stream.close();
112 }
113
114 assertEquals("application/x-gzip", metadata.get(Metadata.CONTENT_TYPE));
115 String content = handler.toString();
116 assertTrue(content.contains("Test SVG image"));
117 }
118
119 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.pkg;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertNull;
20 import static org.junit.Assert.assertTrue;
21
22 import java.io.InputStream;
23
24 import org.apache.tika.metadata.Metadata;
25 import org.apache.tika.parser.AutoDetectParser;
26 import org.apache.tika.parser.Parser;
27 import org.apache.tika.sax.BodyContentHandler;
28 import org.junit.Test;
29 import org.xml.sax.ContentHandler;
30
31 /**
32 * Test case for parsing tar files.
33 */
34 public class TarParserTest extends AbstractPkgTest {
35
36 @Test
37 public void testTarParsing() throws Exception {
38 Parser parser = new AutoDetectParser(); // Should auto-detect!
39 ContentHandler handler = new BodyContentHandler();
40 Metadata metadata = new Metadata();
41
42 InputStream stream = TarParserTest.class.getResourceAsStream(
43 "/test-documents/test-documents.tar");
44 try {
45 parser.parse(stream, handler, metadata, recursingContext);
46 } finally {
47 stream.close();
48 }
49
50 assertEquals("application/x-tar", metadata.get(Metadata.CONTENT_TYPE));
51 String content = handler.toString();
52 assertTrue(content.contains("test-documents/testEXCEL.xls"));
53 assertTrue(content.contains("Sample Excel Worksheet"));
54 assertTrue(content.contains("test-documents/testHTML.html"));
55 assertTrue(content.contains("Test Indexation Html"));
56 assertTrue(content.contains("test-documents/testOpenOffice2.odt"));
57 assertTrue(content.contains("This is a sample Open Office document"));
58 assertTrue(content.contains("test-documents/testPDF.pdf"));
59 assertTrue(content.contains("Apache Tika"));
60 assertTrue(content.contains("test-documents/testPPT.ppt"));
61 assertTrue(content.contains("Sample Powerpoint Slide"));
62 assertTrue(content.contains("test-documents/testRTF.rtf"));
63 assertTrue(content.contains("indexation Word"));
64 assertTrue(content.contains("test-documents/testTXT.txt"));
65 assertTrue(content.contains("Test d'indexation de Txt"));
66 assertTrue(content.contains("test-documents/testWORD.doc"));
67 assertTrue(content.contains("This is a sample Microsoft Word Document"));
68 assertTrue(content.contains("test-documents/testXML.xml"));
69 assertTrue(content.contains("Rida Benjelloun"));
70 }
71
72 /**
73 * Tests that the ParseContext parser is correctly
74 * fired for all the embedded entries.
75 */
76 @Test
77 public void testEmbedded() throws Exception {
78 Parser parser = new AutoDetectParser(); // Should auto-detect!
79 ContentHandler handler = new BodyContentHandler();
80 Metadata metadata = new Metadata();
81
82 InputStream stream = ZipParserTest.class.getResourceAsStream(
83 "/test-documents/test-documents.tar");
84 try {
85 parser.parse(stream, handler, metadata, trackingContext);
86 } finally {
87 stream.close();
88 }
89
90 // Should have found all 9 documents, but not the directory
91 assertEquals(9, tracker.filenames.size());
92 assertEquals(9, tracker.mediatypes.size());
93
94 // Should have names but not content types, as tar doesn't
95 // store the content types
96 assertEquals("test-documents/testEXCEL.xls", tracker.filenames.get(0));
97 assertEquals("test-documents/testHTML.html", tracker.filenames.get(1));
98 assertEquals("test-documents/testOpenOffice2.odt", tracker.filenames.get(2));
99 assertEquals("test-documents/testPDF.pdf", tracker.filenames.get(3));
100 assertEquals("test-documents/testPPT.ppt", tracker.filenames.get(4));
101 assertEquals("test-documents/testRTF.rtf", tracker.filenames.get(5));
102 assertEquals("test-documents/testTXT.txt", tracker.filenames.get(6));
103 assertEquals("test-documents/testWORD.doc", tracker.filenames.get(7));
104 assertEquals("test-documents/testXML.xml", tracker.filenames.get(8));
105
106 for(String type : tracker.mediatypes) {
107 assertNull(type);
108 }
109 }
110 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.pkg;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertNull;
20 import static org.junit.Assert.assertTrue;
21
22 import java.io.InputStream;
23 import java.util.HashSet;
24 import java.util.Set;
25
26 import org.apache.tika.Tika;
27 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
28 import org.apache.tika.metadata.Metadata;
29 import org.apache.tika.parser.AutoDetectParser;
30 import org.apache.tika.parser.ParseContext;
31 import org.apache.tika.parser.Parser;
32 import org.apache.tika.sax.BodyContentHandler;
33 import org.junit.Test;
34 import org.xml.sax.ContentHandler;
35
36 /**
37 * Test case for parsing zip files.
38 */
39 public class ZipParserTest extends AbstractPkgTest {
40
41 @Test
42 public void testZipParsing() throws Exception {
43 Parser parser = new AutoDetectParser(); // Should auto-detect!
44 ContentHandler handler = new BodyContentHandler();
45 Metadata metadata = new Metadata();
46
47 InputStream stream = ZipParserTest.class.getResourceAsStream(
48 "/test-documents/test-documents.zip");
49 try {
50 parser.parse(stream, handler, metadata, recursingContext);
51 } finally {
52 stream.close();
53 }
54
55 assertEquals("application/zip", metadata.get(Metadata.CONTENT_TYPE));
56 String content = handler.toString();
57 assertTrue(content.contains("testEXCEL.xls"));
58 assertTrue(content.contains("Sample Excel Worksheet"));
59 assertTrue(content.contains("testHTML.html"));
60 assertTrue(content.contains("Test Indexation Html"));
61 assertTrue(content.contains("testOpenOffice2.odt"));
62 assertTrue(content.contains("This is a sample Open Office document"));
63 assertTrue(content.contains("testPDF.pdf"));
64 assertTrue(content.contains("Apache Tika"));
65 assertTrue(content.contains("testPPT.ppt"));
66 assertTrue(content.contains("Sample Powerpoint Slide"));
67 assertTrue(content.contains("testRTF.rtf"));
68 assertTrue(content.contains("indexation Word"));
69 assertTrue(content.contains("testTXT.txt"));
70 assertTrue(content.contains("Test d'indexation de Txt"));
71 assertTrue(content.contains("testWORD.doc"));
72 assertTrue(content.contains("This is a sample Microsoft Word Document"));
73 assertTrue(content.contains("testXML.xml"));
74 assertTrue(content.contains("Rida Benjelloun"));
75 }
76
77 /**
78 * Tests that the ParseContext parser is correctly
79 * fired for all the embedded entries.
80 */
81 @Test
82 public void testEmbedded() throws Exception {
83 Parser parser = new AutoDetectParser(); // Should auto-detect!
84 ContentHandler handler = new BodyContentHandler();
85 Metadata metadata = new Metadata();
86
87 InputStream stream = ZipParserTest.class.getResourceAsStream(
88 "/test-documents/test-documents.zip");
89 try {
90 parser.parse(stream, handler, metadata, trackingContext);
91 } finally {
92 stream.close();
93 }
94
95 // Should have found all 9 documents
96 assertEquals(9, tracker.filenames.size());
97 assertEquals(9, tracker.mediatypes.size());
98
99 // Should have names but not content types, as zip doesn't
100 // store the content types
101 assertEquals("testEXCEL.xls", tracker.filenames.get(0));
102 assertEquals("testHTML.html", tracker.filenames.get(1));
103 assertEquals("testOpenOffice2.odt", tracker.filenames.get(2));
104 assertEquals("testPDF.pdf", tracker.filenames.get(3));
105 assertEquals("testPPT.ppt", tracker.filenames.get(4));
106 assertEquals("testRTF.rtf", tracker.filenames.get(5));
107 assertEquals("testTXT.txt", tracker.filenames.get(6));
108 assertEquals("testWORD.doc", tracker.filenames.get(7));
109 assertEquals("testXML.xml", tracker.filenames.get(8));
110
111 for(String type : tracker.mediatypes) {
112 assertNull(type);
113 }
114 }
115
116 /**
117 * Test case for the ability of the ZIP parser to extract the name of
118 * a ZIP entry even if the content of the entry is unreadable due to an
119 * unsupported compression method.
120 *
121 * @see <a href="https://issues.apache.org/jira/browse/TIKA-346">TIKA-346</a>
122 */
123 @Test
124 public void testUnsupportedZipCompressionMethod() throws Exception {
125 String content = new Tika().parseToString(
126 ZipParserTest.class.getResourceAsStream(
127 "/test-documents/moby.zip"));
128 assertTrue(content.contains("README"));
129 }
130
131 private class GatherRelIDsDocumentExtractor implements EmbeddedDocumentExtractor {
132 public Set<String> allRelIDs = new HashSet<String>();
133 public boolean shouldParseEmbedded(Metadata metadata) {
134 String relID = metadata.get(Metadata.EMBEDDED_RELATIONSHIP_ID);
135 if (relID != null) {
136 allRelIDs.add(relID);
137 }
138 return false;
139 }
140
141 public void parseEmbedded(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, boolean outputHtml) {
142 throw new UnsupportedOperationException("should never be called");
143 }
144 }
145
146 // TIKA-1036
147 @Test
148 public void testPlaceholders() throws Exception {
149 String xml = getXML("testEmbedded.zip").xml;
150 assertContains("<div class=\"embedded\" id=\"test1.txt\" />", xml);
151 assertContains("<div class=\"embedded\" id=\"test2.txt\" />", xml);
152
153 // Also make sure EMBEDDED_RELATIONSHIP_ID was
154 // passed when parsing the embedded docs:
155 Parser parser = new AutoDetectParser();
156 ParseContext context = new ParseContext();
157 context.set(Parser.class, parser);
158 GatherRelIDsDocumentExtractor relIDs = new GatherRelIDsDocumentExtractor();
159 context.set(EmbeddedDocumentExtractor.class, relIDs);
160 InputStream input = getResourceAsStream("/test-documents/testEmbedded.zip");
161 try {
162 parser.parse(input,
163 new BodyContentHandler(),
164 new Metadata(),
165 context);
166 } finally {
167 input.close();
168 }
169
170 assertTrue(relIDs.allRelIDs.contains("test1.txt"));
171 assertTrue(relIDs.allRelIDs.contains("test2.txt"));
172 }
173 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.prt;
17
18 import static org.junit.Assert.assertEquals;
19
20 import java.io.InputStream;
21
22 import org.apache.tika.TikaTest;
23 import org.apache.tika.metadata.Metadata;
24 import org.apache.tika.metadata.TikaCoreProperties;
25 import org.apache.tika.sax.BodyContentHandler;
26 import org.junit.Test;
27 import org.xml.sax.ContentHandler;
28
29 public class PRTParserTest extends TikaTest {
30 /**
31 * Try with a simple file
32 */
33 @Test
34 public void testPRTParserBasics() throws Exception {
35 InputStream input = getResourceAsStream("/test-documents/testCADKEY.prt");
36 try {
37 Metadata metadata = new Metadata();
38 ContentHandler handler = new BodyContentHandler();
39 new PRTParser().parse(input, handler, metadata);
40
41 assertEquals("application/x-prt", metadata.get(Metadata.CONTENT_TYPE));
42
43 // This file has a date
44 assertEquals("2011-06-20T16:54:00",
45 metadata.get(TikaCoreProperties.CREATED));
46 assertEquals("2011-06-20T16:54:00",
47 metadata.get(Metadata.CREATION_DATE));
48 // But no description
49 assertEquals(null, metadata.get(TikaCoreProperties.DESCRIPTION));
50
51 String contents = handler.toString();
52
53 assertContains("Front View", contents);
54 assertContains("Back View", contents);
55 assertContains("Bottom View", contents);
56 assertContains("Right View", contents);
57 assertContains("Left View", contents);
58 //assertContains("Isometric View", contents); // Can't detect yet
59 assertContains("Axonometric View", contents);
60
61 assertContains("You've managed to extract all the text!", contents);
62 assertContains("This is more text", contents);
63 assertContains("Text Inside a PRT file", contents);
64 } finally {
65 input.close();
66 }
67 }
68
69 /**
70 * Now a more complex one
71 */
72 @Test
73 public void testPRTParserComplex() throws Exception {
74 InputStream input = getResourceAsStream("/test-documents/testCADKEY2.prt");
75 try {
76 Metadata metadata = new Metadata();
77 ContentHandler handler = new BodyContentHandler();
78 new PRTParser().parse(input, handler, metadata);
79
80 assertEquals("application/x-prt", metadata.get(Metadata.CONTENT_TYPE));
81
82 // File has both a date and a description
83 assertEquals("1997-04-01T08:59:00",
84 metadata.get(Metadata.DATE));
85 assertEquals("1997-04-01T08:59:00",
86 metadata.get(Metadata.CREATION_DATE));
87 assertEquals("TIKA TEST PART DESCRIPTION INFORMATION\r\n",
88 metadata.get(TikaCoreProperties.DESCRIPTION));
89
90 String contents = handler.toString();
91
92 assertContains("ITEM", contents);
93 assertContains("REQ.", contents);
94 assertContains("DESCRIPTION", contents);
95 assertContains("MAT'L", contents);
96 assertContains("TOLERANCES UNLESS", contents);
97 assertContains("FRACTIONS", contents);
98 assertContains("ANGLES", contents);
99 assertContains("Acme Corporation", contents);
100
101 assertContains("DATE", contents);
102 assertContains("CHANGE", contents);
103 assertContains("DRAWN BY", contents);
104 assertContains("SCALE", contents);
105 assertContains("TIKA TEST DRAWING", contents);
106 assertContains("TIKA LETTERS", contents);
107 assertContains("5.82", contents);
108 assertContains("112"+'\u00b0', contents); // Degrees
109 assertContains("TIKA TEST LETTER", contents);
110 assertContains("17.11", contents);
111 assertContains('\u00d8'+"\ufffd2.000", contents); // Diameter
112 assertContains("Diameter", contents);
113 assertContains("The Apache Tika toolkit", contents);
114 } finally {
115 input.close();
116 }
117 }
118 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.rtf;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertFalse;
20 import static org.junit.Assert.assertTrue;
21
22 import java.io.File;
23 import java.io.FileInputStream;
24 import java.io.InputStream;
25 import java.io.StringWriter;
26
27 import org.apache.tika.Tika;
28 import org.apache.tika.TikaTest;
29 import org.apache.tika.io.TikaInputStream;
30 import org.apache.tika.metadata.Metadata;
31 import org.apache.tika.metadata.Office;
32 import org.apache.tika.metadata.OfficeOpenXMLCore;
33 import org.apache.tika.metadata.TikaCoreProperties;
34 import org.apache.tika.parser.ParseContext;
35 import org.apache.tika.sax.WriteOutContentHandler;
36 import org.junit.Test;
37
38 /**
39 * Junit test class for the Tika {@link RTFParser}
40 */
41 public class RTFParserTest extends TikaTest {
42
43 private Tika tika = new Tika();
44
45 private static class Result {
46 public final String text;
47 public final Metadata metadata;
48
49 public Result(String text, Metadata metadata) {
50 this.text = text;
51 this.metadata = metadata;
52 }
53 }
54
55 @Test
56 public void testBasicExtraction() throws Exception {
57 File file = getResourceAsFile("/test-documents/testRTF.rtf");
58
59 Metadata metadata = new Metadata();
60 StringWriter writer = new StringWriter();
61 tika.getParser().parse(
62 new FileInputStream(file),
63 new WriteOutContentHandler(writer),
64 metadata,
65 new ParseContext());
66 String content = writer.toString();
67
68 assertEquals("application/rtf", metadata.get(Metadata.CONTENT_TYPE));
69 assertContains("Test", content);
70 assertContains("indexation Word", content);
71 }
72
73 @Test
74 public void testUmlautSpacesExtraction2() throws Exception {
75 String content = getText("testRTFUmlautSpaces2.rtf");
76 content = content.replaceAll("\\s+", "");
77 assertEquals("\u00DCbersicht", content);
78 }
79
80 @Test
81 public void testUnicodeUCNControlWordCharacterDoublingExtraction() throws Exception {
82 String content = getText("testRTFUnicodeUCNControlWordCharacterDoubling.rtf");
83
84 assertContains("\u5E74", content);
85 assertContains("\u5ff5", content);
86 assertContains("0 ", content);
87 assertContains("abc", content);
88 assertFalse("Doubled character \u5E74", content.contains("\u5E74\u5E74"));
89 }
90
91 @Test
92 public void testHexEscapeInsideWord() throws Exception {
93 String content = getText("testRTFHexEscapeInsideWord.rtf");
94 assertContains("ESP\u00cdRITO", content);
95 }
96
97 @Test
98 public void testWindowsCodepage1250() throws Exception {
99 String content = getText("testRTFWindowsCodepage1250.rtf");
100 assertContains("za\u017c\u00f3\u0142\u0107 g\u0119\u015bl\u0105 ja\u017a\u0144", content);
101 assertContains("ZA\u017b\u00d3\u0141\u0106 G\u0118\u015aL\u0104 JA\u0179\u0143", content);
102 }
103
104 @Test
105 public void testTableCellSeparation() throws Exception {
106 File file = getResourceAsFile("/test-documents/testRTFTableCellSeparation.rtf");
107 String content = tika.parseToString(file);
108 content = content.replaceAll("\\s+"," ");
109 assertTrue(content.contains("a b c d \u00E4 \u00EB \u00F6 \u00FC"));
110 assertContains("a b c d \u00E4 \u00EB \u00F6 \u00FC", content);
111 }
112
113 @Test
114 public void testTableCellSeparation2() throws Exception {
115 String content = getText("testRTFTableCellSeparation2.rtf");
116 // TODO: why do we insert extra whitespace...?
117 content = content.replaceAll("\\s+"," ");
118 assertContains("Station Fax", content);
119 }
120
121 @Test
122 public void testWordPadCzechCharactersExtraction() throws Exception {
123 File file = getResourceAsFile("/test-documents/testRTFWordPadCzechCharacters.rtf");
124 String s1 = tika.parseToString(file);
125 assertTrue(s1.contains("\u010Cl\u00E1nek t\u00FDdne"));
126 assertTrue(s1.contains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty"));
127 }
128
129 @Test
130 public void testWord2010CzechCharactersExtraction() throws Exception {
131 File file = getResourceAsFile("/test-documents/testRTFWord2010CzechCharacters.rtf");
132 String s1 = tika.parseToString(file);
133 assertTrue(s1.contains("\u010Cl\u00E1nek t\u00FDdne"));
134 assertTrue(s1.contains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty"));
135 }
136
137 @Test
138 public void testMS932Extraction() throws Exception {
139 File file = getResourceAsFile("/test-documents/testRTF-ms932.rtf");
140 String s1 = tika.parseToString(file);
141
142 // Hello in Japanese
143 assertTrue(s1.contains("\u3053\u3093\u306b\u3061\u306f"));
144
145 // Verify title, since it was also encoded with MS932:
146 Result r = getResult("testRTF-ms932.rtf");
147 assertEquals("\u30bf\u30a4\u30c8\u30eb", r.metadata.get(TikaCoreProperties.TITLE));
148 }
149
150 @Test
151 public void testUmlautSpacesExtraction() throws Exception {
152 File file = getResourceAsFile("/test-documents/testRTFUmlautSpaces.rtf");
153 String s1 = tika.parseToString(file);
154 assertTrue(s1.contains("\u00DCbersicht"));
155 }
156
157 @Test
158 public void testGothic() throws Exception {
159 String content = getText("testRTFUnicodeGothic.rtf");
160 assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
161 }
162
163 @Test
164 public void testJapaneseText() throws Exception {
165 Result r = getResult("testRTFJapanese.rtf");
166 String content = r.text;
167
168 // Verify title -- this title uses upr escape inside
169 // title info field:
170 assertEquals("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f\u3000",
171 r.metadata.get(TikaCoreProperties.TITLE));
172 assertEquals("VMazel", r.metadata.get(TikaCoreProperties.CREATOR));
173 assertEquals("VMazel", r.metadata.get(Metadata.AUTHOR));
174 assertEquals("StarWriter", r.metadata.get(TikaCoreProperties.COMMENTS));
175
176 // Special version of (GHQ)
177 assertContains("\uff08\uff27\uff28\uff31\uff09", content);
178
179 // 6 other characters
180 assertContains("\u6771\u4eac\u90fd\u4e09\u9df9\u5e02", content);
181 }
182
183 @Test
184 public void testMaxLength() throws Exception {
185 File file = getResourceAsFile("/test-documents/testRTFJapanese.rtf");
186 Metadata metadata = new Metadata();
187 InputStream stream = TikaInputStream.get(file, metadata);
188
189 // Test w/ default limit:
190 Tika localTika = new Tika();
191 String content = localTika.parseToString(stream, metadata);
192 // parseToString closes for convenience:
193 //stream.close();
194 assertTrue(content.length() > 500);
195
196 // Test setting max length on the instance:
197 localTika.setMaxStringLength(200);
198 stream = TikaInputStream.get(file, metadata);
199 content = localTika.parseToString(stream, metadata);
200
201 // parseToString closes for convenience:
202 //stream.close();
203 assertTrue(content.length() <= 200);
204
205 // Test setting max length per-call:
206 stream = TikaInputStream.get(file, metadata);
207 content = localTika.parseToString(stream, metadata, 100);
208 // parseToString closes for convenience:
209 //stream.close();
210 assertTrue(content.length() <= 100);
211 }
212
213 @Test
214 public void testTextWithCurlyBraces() throws Exception {
215 String content = getText("testRTFWithCurlyBraces.rtf");
216 assertContains("{ some text inside curly brackets }", content);
217 }
218
219 @Test
220 public void testControls() throws Exception {
221 Result r = getResult("testRTFControls.rtf");
222 String content = r.text;
223 assertContains("Thiswordhasanem\u2014dash", content);
224 assertContains("Thiswordhasanen\u2013dash", content);
225 assertContains("Thiswordhasanon\u2011breakinghyphen", content);
226 assertContains("Thiswordhasanonbreaking\u00a0space", content);
227 assertContains("Thiswordhasanoptional\u00adhyphen", content);
228 assertContains("\u2018Single quoted text\u2019", content);
229 assertContains("\u201cDouble quoted text\u201d", content);
230 assertContains("\u201cDouble quoted text again\u201d", content);
231 }
232
233 @Test
234 public void testInvalidUnicode() throws Exception {
235 Result r = getResult("testRTFInvalidUnicode.rtf");
236 String content = r.text;
237 assertContains("Unpaired hi \ufffd here", content);
238 assertContains("Unpaired lo \ufffd here", content);
239 assertContains("Mismatched pair \ufffd\ufffd here", content);
240 }
241
242 @Test
243 public void testVarious() throws Exception {
244 Result r = getResult("testRTFVarious.rtf");
245 String content = r.text;
246 assertContains("Footnote appears here", content);
247 assertContains("This is a footnote.", content);
248 assertContains("This is the header text.", content);
249 assertContains("This is the footer text.", content);
250 assertContains("Here is a text box", content);
251 assertContains("Bold", content);
252 assertContains("italic", content);
253 assertContains("underline", content);
254 assertContains("superscript", content);
255 assertContains("subscript", content);
256 assertContains("Here is a citation:", content);
257 assertContains("Figure 1 This is a caption for Figure 1", content);
258 assertContains("(Kramer)", content);
259
260 // Table
261 assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+"," "));
262
263 // 2-columns
264 assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+"," "));
265 assertContains("This is a hyperlink", content);
266 assertContains("Here is a list:", content);
267 for(int row=1;row<=3;row++) {
268 assertContains("Bullet " + row, content);
269 }
270 assertContains("Here is a numbered list:", content);
271 for(int row=1;row<=3;row++) {
272 assertContains("Number bullet " + row, content);
273 }
274
275 for(int row=1;row<=2;row++) {
276 for(int col=1;col<=3;col++) {
277 assertContains("Row " + row + " Col " + col, content);
278 }
279 }
280
281 assertContains("Keyword1 Keyword2", content);
282 assertEquals("Keyword1 Keyword2",
283 r.metadata.get(TikaCoreProperties.KEYWORDS));
284
285 assertContains("Subject is here", content);
286 assertEquals("Subject is here",
287 r.metadata.get(OfficeOpenXMLCore.SUBJECT));
288 assertEquals("Subject is here",
289 r.metadata.get(Metadata.SUBJECT));
290
291 assertContains("Suddenly some Japanese text:", content);
292 // Special version of (GHQ)
293 assertContains("\uff08\uff27\uff28\uff31\uff09", content);
294 // 6 other characters
295 assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", content);
296
297 assertContains("And then some Gothic text:", content);
298 assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
299 }
300
301 @Test
302 public void testVariousStyle() throws Exception {
303 String content = getXML("testRTFVarious.rtf").xml;
304 assertContains("<b>Bold</b>", content);
305 assertContains("<i>italic</i>", content);
306 }
307
308 @Test
309 public void testBoldItalic() throws Exception {
310 String content = getXML("testRTFBoldItalic.rtf").xml;
311 assertContains("<b>bold</b>", content);
312 assertContains("<b>bold </b><b><i>italic</i></b>", content);
313 assertContains("<b><i>italic </i></b><b>bold</b>", content);
314 assertContains("<i>italic</i>", content);
315 assertContains("<b>bold then </b><b><i>italic then</i></b><i> not bold</i>", content);
316 assertContains("<i>italic then </i><b><i>bold then</i></b><b> not italic</b>", content);
317 }
318
319 @Test
320 public void testHyperlink() throws Exception {
321 String content = getXML("testRTFHyperlink.rtf").xml;
322 assertContains("our most <a href=\"http://r.office.microsoft.com/r/rlidwelcomeFAQ?clid=1033\">frequently asked questions</a>", content);
323 assertEquals(-1, content.indexOf("<p>\t\t</p>"));
324 }
325
326 @Test
327 public void testIgnoredControlWord() throws Exception {
328 assertContains("<p>The quick brown fox jumps over the lazy dog</p>", getXML("testRTFIgnoredControlWord.rtf").xml);
329 }
330
331 @Test
332 public void testFontAfterBufferedText() throws Exception {
333 assertContains("\u0423\u0432\u0430\u0436\u0430\u0435\u043c\u044b\u0439 \u043a\u043b\u0438\u0435\u043d\u0442!",
334 getXML("testFontAfterBufferedText.rtf").xml);
335 }
336
337 @Test
338 public void testListMicrosoftWord() throws Exception {
339 String content = getXML("testRTFListMicrosoftWord.rtf").xml;
340 assertContains("<ol>\t<li>one</li>", content);
341 assertContains("</ol>", content);
342 assertContains("<ul>\t<li>first</li>", content);
343 assertContains("</ul>", content);
344 }
345
346 @Test
347 public void testListLibreOffice() throws Exception {
348 String content = getXML("testRTFListLibreOffice.rtf").xml;
349 assertContains("<ol>\t<li>one</li>", content);
350 assertContains("</ol>", content);
351 assertContains("<ul>\t<li>first</li>", content);
352 assertContains("</ul>", content);
353 }
354
355 // TIKA-782
356 @Test
357 public void testBinControlWord() throws Exception {
358 assertTrue(getXML("testBinControlWord.rtf").xml.indexOf("\u00ff\u00ff\u00ff\u00ff") == -1);
359 }
360
361 // TIKA-999
362 @Test
363 public void testMetaDataCounts() throws Exception {
364 XMLResult xml = getXML("test_embedded_package.rtf");
365 assertEquals("1", xml.metadata.get(Office.PAGE_COUNT));
366 assertEquals("7", xml.metadata.get(Office.WORD_COUNT));
367 assertEquals("36", xml.metadata.get(Office.CHARACTER_COUNT));
368 assertTrue(xml.metadata.get(Office.CREATION_DATE).startsWith("2012-09-02T"));
369 }
370
371 // TIKA-1192
372 @Test
373 public void testListOverride() throws Exception {
374 Result r = getResult("testRTFListOverride.rtf");
375 String content = r.text;
376 assertContains("Body", content);
377 }
378
379 private Result getResult(String filename) throws Exception {
380 File file = getResourceAsFile("/test-documents/" + filename);
381
382 Metadata metadata = new Metadata();
383 StringWriter writer = new StringWriter();
384 tika.getParser().parse(
385 new FileInputStream(file),
386 new WriteOutContentHandler(writer),
387 metadata,
388 new ParseContext());
389 String content = writer.toString();
390 return new Result(content, metadata);
391 }
392
393 private String getText(String filename) throws Exception {
394 return getResult(filename).text;
395 }
396 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.solidworks;
17
18 import static junit.framework.Assert.assertEquals;
19
20 import java.io.InputStream;
21
22 import org.apache.tika.TikaTest;
23 import org.apache.tika.metadata.Metadata;
24 import org.apache.tika.metadata.TikaCoreProperties;
25 import org.apache.tika.parser.ParseContext;
26 import org.apache.tika.parser.microsoft.OfficeParser;
27 import org.apache.tika.sax.BodyContentHandler;
28 import org.junit.Test;
29 import org.xml.sax.ContentHandler;
30
31 public class SolidworksParserTest extends TikaTest {
32
33 /**
34 * Test the parsing of an solidWorks part in version 2013SP2
35 */
36 @Test
37 public void testPart2013SP2Parser() throws Exception {
38 InputStream input = SolidworksParserTest.class.getResourceAsStream(
39 "/test-documents/testsolidworksPart2013SP2.SLDPRT");
40 try {
41 ContentHandler handler = new BodyContentHandler();
42 Metadata metadata = new Metadata();
43 new OfficeParser().parse(input, handler, metadata, new ParseContext());
44
45 //Check content type
46 assertEquals("application/sldworks",metadata.get(Metadata.CONTENT_TYPE));
47
48 //Check properties
49 assertEquals("2012-04-18T10:27:29Z", metadata.get(TikaCoreProperties.CREATED));
50 assertEquals(null, metadata.get(TikaCoreProperties.CONTRIBUTOR));
51 assertEquals("2013-09-06T08:12:12Z", metadata.get(Metadata.MODIFIED));
52 assertEquals("solidworks-dcom_dev", metadata.get(TikaCoreProperties.MODIFIER));
53 assertEquals(null, metadata.get(TikaCoreProperties.RELATION));
54 assertEquals(null, metadata.get(TikaCoreProperties.RIGHTS));
55 assertEquals(null, metadata.get(TikaCoreProperties.SOURCE));
56 assertEquals("", metadata.get(TikaCoreProperties.TITLE));
57 assertEquals("", metadata.get(TikaCoreProperties.KEYWORDS));
58 } finally {
59 input.close();
60 }
61 }
62
63 /**
64 * Test the parsing of an solidWorks part in version 2014SP0
65 */
66 @Test
67 public void testPart2014SP0Parser() throws Exception {
68 InputStream input = SolidworksParserTest.class.getResourceAsStream(
69 "/test-documents/testsolidworksPart2014SP0.SLDPRT");
70 try {
71 ContentHandler handler = new BodyContentHandler();
72 Metadata metadata = new Metadata();
73 new OfficeParser().parse(input, handler, metadata, new ParseContext());
74
75 //Check content type
76 assertEquals("application/sldworks",metadata.get(Metadata.CONTENT_TYPE));
77
78 //Check properties
79 assertEquals("2012-04-18T10:27:29Z", metadata.get(TikaCoreProperties.CREATED));
80 assertEquals(null, metadata.get(TikaCoreProperties.CONTRIBUTOR));
81 assertEquals("2013-11-28T12:38:28Z", metadata.get(Metadata.MODIFIED));
82 assertEquals("solidworks-dcom_dev", metadata.get(TikaCoreProperties.MODIFIER));
83 assertEquals(null, metadata.get(TikaCoreProperties.RELATION));
84 assertEquals(null, metadata.get(TikaCoreProperties.RIGHTS));
85 assertEquals(null, metadata.get(TikaCoreProperties.SOURCE));
86 assertEquals("", metadata.get(TikaCoreProperties.TITLE));
87 assertEquals("", metadata.get(TikaCoreProperties.KEYWORDS));
88 } finally {
89 input.close();
90 }
91 }
92
93 /**
94 * Test the parsing of an solidWorks assembly in version 2013SP2
95 */
96 @Test
97 public void testAssembly2013SP2Parser() throws Exception {
98 InputStream input = SolidworksParserTest.class.getResourceAsStream(
99 "/test-documents/testsolidworksAssembly2013SP2.SLDASM");
100 try {
101 ContentHandler handler = new BodyContentHandler();
102 Metadata metadata = new Metadata();
103 new OfficeParser().parse(input, handler, metadata, new ParseContext());
104
105 //Check content type
106 assertEquals("application/sldworks",metadata.get(Metadata.CONTENT_TYPE));
107
108 //Check properties
109 assertEquals("2012-04-25T09:51:38Z", metadata.get(TikaCoreProperties.CREATED));
110 assertEquals(null, metadata.get(TikaCoreProperties.CONTRIBUTOR));
111 assertEquals("2013-09-06T08:11:08Z", metadata.get(Metadata.MODIFIED));
112 assertEquals("solidworks-dcom_dev", metadata.get(TikaCoreProperties.MODIFIER));
113 assertEquals(null, metadata.get(TikaCoreProperties.RELATION));
114 assertEquals(null, metadata.get(TikaCoreProperties.RIGHTS));
115 assertEquals(null, metadata.get(TikaCoreProperties.SOURCE));
116 assertEquals("", metadata.get(TikaCoreProperties.TITLE));
117 assertEquals("", metadata.get(TikaCoreProperties.KEYWORDS));
118 } finally {
119 input.close();
120 }
121 }
122
123 /**
124 * Test the parsing of an solidWorks assembly in version 2014SP0
125 */
126 @Test
127 public void testAssembly2014SP0Parser() throws Exception {
128 InputStream input = SolidworksParserTest.class.getResourceAsStream(
129 "/test-documents/testsolidworksAssembly2014SP0.SLDASM");
130 try {
131 ContentHandler handler = new BodyContentHandler();
132 Metadata metadata = new Metadata();
133 new OfficeParser().parse(input, handler, metadata, new ParseContext());
134
135 //Check content type
136 assertEquals("application/sldworks",metadata.get(Metadata.CONTENT_TYPE));
137
138 //Check properties
139 assertEquals("2012-04-25T09:51:38Z", metadata.get(TikaCoreProperties.CREATED));
140 assertEquals(null, metadata.get(TikaCoreProperties.CONTRIBUTOR));
141 assertEquals("2013-11-28T12:41:49Z", metadata.get(Metadata.MODIFIED));
142 assertEquals("solidworks-dcom_dev", metadata.get(TikaCoreProperties.MODIFIER));
143 assertEquals(null, metadata.get(TikaCoreProperties.RELATION));
144 assertEquals(null, metadata.get(TikaCoreProperties.RIGHTS));
145 assertEquals(null, metadata.get(TikaCoreProperties.SOURCE));
146 assertEquals("", metadata.get(TikaCoreProperties.TITLE));
147 assertEquals("", metadata.get(TikaCoreProperties.KEYWORDS));
148 } finally {
149 input.close();
150 }
151 }
152
153 /*
154 * Test the parsing of an solidWorks drawing in version 2013SP2
155 */
156 @Test
157 public void testDrawing2013SP2Parser() throws Exception {
158 InputStream input = SolidworksParserTest.class.getResourceAsStream(
159 "/test-documents/testsolidworksDrawing2013SP2.SLDDRW");
160 try {
161 ContentHandler handler = new BodyContentHandler();
162 Metadata metadata = new Metadata();
163 new OfficeParser().parse(input, handler, metadata, new ParseContext());
164
165 //Check content type
166 assertEquals("application/sldworks",metadata.get(Metadata.CONTENT_TYPE));
167
168 //Check properties
169 assertEquals("2012-07-03T12:05:29Z", metadata.get(TikaCoreProperties.CREATED));
170 assertEquals(null, metadata.get(TikaCoreProperties.CONTRIBUTOR));
171 assertEquals("2013-09-06T08:06:57Z", metadata.get(Metadata.MODIFIED));
172 assertEquals("solidworks-dcom_dev", metadata.get(TikaCoreProperties.MODIFIER));
173 assertEquals(null, metadata.get(TikaCoreProperties.RELATION));
174 assertEquals(null, metadata.get(TikaCoreProperties.RIGHTS));
175 assertEquals(null, metadata.get(TikaCoreProperties.SOURCE));
176 assertEquals("", metadata.get(TikaCoreProperties.TITLE));
177 assertEquals("", metadata.get(TikaCoreProperties.KEYWORDS));
178 } finally {
179 input.close();
180 }
181 }
182
183 /**
184 * Test the parsing of an solidWorks drawing in version 2014SP0
185 */
186 @Test
187 public void testDrawing2014SP0Parser() throws Exception {
188 InputStream input = SolidworksParserTest.class.getResourceAsStream(
189 "/test-documents/testsolidworksDrawing2014SP0.SLDDRW");
190 try {
191 ContentHandler handler = new BodyContentHandler();
192 Metadata metadata = new Metadata();
193 new OfficeParser().parse(input, handler, metadata, new ParseContext());
194
195 //Check content type
196 assertEquals("application/sldworks",metadata.get(Metadata.CONTENT_TYPE));
197
198 //Check properties
199 assertEquals("2012-07-03T12:05:29Z", metadata.get(TikaCoreProperties.CREATED));
200 assertEquals(null, metadata.get(TikaCoreProperties.CONTRIBUTOR));
201 assertEquals("2013-11-28T12:41:49Z", metadata.get(Metadata.MODIFIED));
202 assertEquals("solidworks-dcom_dev", metadata.get(TikaCoreProperties.MODIFIER));
203 assertEquals(null, metadata.get(TikaCoreProperties.RELATION));
204 assertEquals(null, metadata.get(TikaCoreProperties.RIGHTS));
205 assertEquals(null, metadata.get(TikaCoreProperties.SOURCE));
206 assertEquals("", metadata.get(TikaCoreProperties.TITLE));
207 assertEquals("", metadata.get(TikaCoreProperties.KEYWORDS));
208 } finally {
209 input.close();
210 }
211 }
212 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.txt;
17
18 import org.junit.Test;
19 import static org.junit.Assert.assertEquals;
20 import static org.junit.Assert.assertTrue;
21
22 import java.io.IOException;
23 import java.io.InputStream;
24
25 public class CharsetDetectorTest {
26
27 @Test
28 public void testTagDropper() throws IOException {
29 InputStream in = CharsetDetectorTest.class.getResourceAsStream( "/test-documents/resume.html" );
30
31 try {
32 CharsetDetector detector = new CharsetDetector();
33 detector.enableInputFilter(true);
34 detector.setText(in);
35 CharsetMatch [] matches = detector.detectAll();
36 CharsetMatch mm = null;
37 for ( CharsetMatch m : matches ) {
38 if ( mm == null || mm.getConfidence() < m.getConfidence() ) {
39 mm = m;
40 }
41 }
42 assertTrue( mm != null );
43 assertEquals( "UTF-8", mm.getName() );
44 } finally {
45 in.close();
46 }
47 }
48 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.txt;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertNull;
20 import static org.junit.Assert.assertTrue;
21
22 import java.io.ByteArrayInputStream;
23 import java.io.StringWriter;
24
25 import org.apache.tika.metadata.Metadata;
26 import org.apache.tika.metadata.TikaCoreProperties;
27 import org.apache.tika.parser.ParseContext;
28 import org.apache.tika.parser.Parser;
29 import org.apache.tika.sax.BodyContentHandler;
30 import org.apache.tika.sax.WriteOutContentHandler;
31 import org.junit.Test;
32 import org.xml.sax.ContentHandler;
33 import org.xml.sax.helpers.DefaultHandler;
34
35 public class TXTParserTest {
36 // Exercises TXTParser: charset detection, BOM stripping, and handling of incoming charset/language metadata.
37 private Parser parser = new TXTParser(); // parser under test, shared by every test method
38
39 @Test
40 public void testEnglishText() throws Exception {
41 String text =
42 "Hello, World! This is simple UTF-8 text content written"
43 + " in English to test autodetection of both the character"
44 + " encoding and the language of the input stream.";
45
46 Metadata metadata = new Metadata();
47 StringWriter writer = new StringWriter();
48 parser.parse(
49 new ByteArrayInputStream(text.getBytes("ISO-8859-1")), // text is pure ASCII, so these bytes equal its UTF-8 encoding
50 new WriteOutContentHandler(writer),
51 metadata,
52 new ParseContext());
53 String content = writer.toString();
54
55 assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
56
57 // TIKA-501: Remove language detection from TXTParser
58 assertNull(metadata.get(Metadata.CONTENT_LANGUAGE));
59 assertNull(metadata.get(TikaCoreProperties.LANGUAGE));
60
61 assertTrue(content.contains("Hello"));
62 assertTrue(content.contains("World"));
63 assertTrue(content.contains("autodetection"));
64 assertTrue(content.contains("stream"));
65 }
66
67 @Test
68 public void testUTF8Text() throws Exception {
69 String text = "I\u00F1t\u00EBrn\u00E2ti\u00F4n\u00E0liz\u00E6ti\u00F8n";
70
71 ContentHandler handler = new BodyContentHandler();
72 Metadata metadata = new Metadata();
73 parser.parse(
74 new ByteArrayInputStream(text.getBytes("UTF-8")),
75 handler, metadata, new ParseContext());
76 assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
77 assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
78
79 assertTrue(handler.toString().contains(text));
80 }
81
82 @Test
83 public void testEmptyText() throws Exception {
84 ContentHandler handler = new BodyContentHandler();
85 Metadata metadata = new Metadata();
86 parser.parse(
87 new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
88 assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
89 assertEquals("\n", handler.toString());
90 }
91
92 /**
93 * Test for the heuristics that we use to assign an eight-bit character
94 * encoding to mostly ASCII sequences. If a more specific match cannot
95 * be made, a string with a CR(LF) in it is most probably windows-1252,
96 * otherwise ISO-8859-1, except if it contains the currency/euro symbol
97 * (byte 0xa4) in which case it's more likely to be ISO-8859-15.
98 */
99 @Test
100 public void testLatinDetectionHeuristics() throws Exception {
101 String windows = "test\r\n";
102 String unix = "test\n";
103 String euro = "test \u20ac\n";
104
105 Metadata metadata;
106
107 metadata = new Metadata();
108 parser.parse(
109 new ByteArrayInputStream(windows.getBytes("ISO-8859-15")),
110 new DefaultHandler(), metadata, new ParseContext());
111 assertEquals(
112 "text/plain; charset=windows-1252",
113 metadata.get(Metadata.CONTENT_TYPE));
114
115 metadata = new Metadata();
116 parser.parse(
117 new ByteArrayInputStream(unix.getBytes("ISO-8859-15")),
118 new DefaultHandler(), metadata, new ParseContext());
119 assertEquals(
120 "text/plain; charset=ISO-8859-1",
121 metadata.get(Metadata.CONTENT_TYPE));
122
123 metadata = new Metadata();
124 parser.parse(
125 new ByteArrayInputStream(euro.getBytes("ISO-8859-15")),
126 new DefaultHandler(), metadata, new ParseContext());
127 assertEquals(
128 "text/plain; charset=ISO-8859-15",
129 metadata.get(Metadata.CONTENT_TYPE));
130 }
131
132 /**
133 * Test case for TIKA-240: Drop the BOM when extracting plain text
134 *
135 * @see <a href="https://issues.apache.org/jira/browse/TIKA-240">TIKA-240</a>
136 */
137 @Test
138 public void testDropByteOrderMark() throws Exception {
139 assertExtractText("UTF-8 BOM", "test", new byte[] {
140 (byte) 0xEF, (byte) 0xBB, (byte) 0xBF, 't', 'e', 's', 't' });
141 assertExtractText("UTF-16 BE BOM", "test", new byte[] {
142 (byte) 0xFE, (byte) 0xFF, 0, 't', 0, 'e', 0, 's', 0, 't'});
143 assertExtractText("UTF-16 LE BOM", "test", new byte[] {
144 (byte) 0xFF, (byte) 0xFE, 't', 0, 'e', 0, 's', 0, 't', 0});
145 }
146
147 /**
148 * Test case for TIKA-335: using incoming charset
149 *
150 * @see <a href="https://issues.apache.org/jira/browse/TIKA-335">TIKA-335</a>
151 */
152 @Test
153 public void testUseIncomingCharsetAsHint() throws Exception {
154 // Could be ISO 8859-1 or ISO 8859-15 or ...
155 // u00e1 is latin small letter a with acute
156 final String test2 = "the name is \u00e1ndre";
157
158 Metadata metadata = new Metadata();
159 parser.parse(
160 new ByteArrayInputStream(test2.getBytes("ISO-8859-1")),
161 new BodyContentHandler(), metadata, new ParseContext());
162 assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
163 assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
164
165 metadata.set(Metadata.CONTENT_TYPE, "text/plain; charset=ISO-8859-15"); // same Metadata object is reused, now carrying an explicit charset
166 parser.parse(
167 new ByteArrayInputStream(test2.getBytes("ISO-8859-1")),
168 new BodyContentHandler(), metadata, new ParseContext());
169 assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
170 assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
171 }
172
173 /**
174 * Test case for TIKA-341: using charset in content-type
175 *
176 * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a>
177 */
178 @Test
179 public void testUsingCharsetInContentTypeHeader() throws Exception {
180 // Could be ISO 8859-1 or ISO 8859-15 or ...
181 // u00e1 is latin small letter a with acute
182 final String test2 = "the name is \u00e1ndre";
183
184 Metadata metadata = new Metadata();
185 parser.parse(
186 new ByteArrayInputStream(test2.getBytes("ISO-8859-1")),
187 new BodyContentHandler(), metadata, new ParseContext());
188 assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
189 assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
190
191 metadata = new Metadata();
192 metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-15"); // the charset is expected to be used even though the declared type is text/html
193 parser.parse(
194 new ByteArrayInputStream(test2.getBytes("ISO-8859-1")),
195 new BodyContentHandler(), metadata, new ParseContext());
196 assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
197 assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
198 }
199
200 private void assertExtractText(String msg, String expected, byte[] input)
201 throws Exception {
202 ContentHandler handler = new BodyContentHandler() {
203 public void ignorableWhitespace(char[] ch, int off, int len) {
204 // Ignore the whitespace added by XHTMLContentHandler
205 }
206 };
207 Metadata metadata = new Metadata();
208 parser.parse(new ByteArrayInputStream(input), handler, metadata, new ParseContext());
209 assertEquals(msg, expected, handler.toString());
210 }
211
212 /**
213 * Test case for TIKA-339: don't override incoming language
214 *
215 * @see <a href="https://issues.apache.org/jira/browse/TIKA-339">TIKA-339</a>
216 */
217 @Test
218 public void testRetainIncomingLanguage() throws Exception {
219 final String test = "Simple Content";
220
221 Metadata metadata = new Metadata();
222 metadata.set(TikaCoreProperties.LANGUAGE, "en");
223
224 parser.parse(
225 new ByteArrayInputStream(test.getBytes("UTF-8")),
226 new BodyContentHandler(), metadata, new ParseContext());
227
228 assertEquals("en", metadata.get(TikaCoreProperties.LANGUAGE));
229 }
230
231 @Test
232 public void testCP866() throws Exception {
233 Metadata metadata = new Metadata();
234 StringWriter writer = new StringWriter();
235 parser.parse(
236 TXTParserTest.class.getResourceAsStream("/test-documents/russian.cp866.txt"),
237 new WriteOutContentHandler(writer),
238 metadata,
239 new ParseContext());
240
241 assertEquals("text/plain; charset=IBM866", metadata.get(Metadata.CONTENT_TYPE));
242 }
243
244 @Test
245 public void testEBCDIC_CP500() throws Exception {
246 Metadata metadata = new Metadata();
247 StringWriter writer = new StringWriter();
248 parser.parse(
249 TXTParserTest.class.getResourceAsStream("/test-documents/english.cp500.txt"),
250 new WriteOutContentHandler(writer),
251 metadata,
252 new ParseContext());
253
254 assertEquals("text/plain; charset=IBM500", metadata.get(Metadata.CONTENT_TYPE));
255
256 // Additional check that EBCDIC detection isn't too eager on short blocks of text
257 metadata = new Metadata();
258 writer = new StringWriter();
259 parser.parse(
260 new ByteArrayInputStream("<html><body>hello world</body></html>".getBytes("ISO-8859-1")),
261 new WriteOutContentHandler(writer),
262 metadata,
263 new ParseContext());
264
265 assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
266 }
267
268 /**
269 * Test case for TIKA-771: "Hello, World!" in UTF-8/ASCII gets detected as IBM500
270 *
271 * @see <a href="https://issues.apache.org/jira/browse/TIKA-771">TIKA-771</a>
272 */
273 @Test
274 public void testCharsetDetectionWithShortSnipet() throws Exception {
275 final String text = "Hello, World!";
276
277 Metadata metadata = new Metadata();
278 parser.parse(
279 new ByteArrayInputStream(text.getBytes("UTF-8")),
280 new BodyContentHandler(), metadata, new ParseContext());
281 assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
282
283 // Now verify that if we tell the parser the encoding is UTF-8, that's what
284 // we get back (see TIKA-868)
285 metadata.set(Metadata.CONTENT_TYPE, "application/binary; charset=UTF-8"); // only the charset is expected to survive; the type is reported as text/plain below
286 parser.parse(
287 new ByteArrayInputStream(text.getBytes("UTF-8")),
288 new BodyContentHandler(), metadata, new ParseContext());
289 assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
290 }
291
292 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.video;
17
18 import static org.junit.Assert.assertEquals;
19
20 import org.apache.tika.Tika;
21 import org.apache.tika.metadata.Metadata;
22 import org.junit.Test;
23
24 /**
25 * Checks the metadata Tika reports for a small Flash Video sample file.
26 */
27 public class FLVParserTest {
28
29 /**
30 * Runs testFLV.flv through the Tika facade: no text content is expected,
31 * while the content type and several stream properties must be reported.
32 */
33 @Test
34 public void testFLV() throws Exception {
35 Metadata flvMetadata = new Metadata();
36 Tika facade = new Tika();
37
38 String extractedText = facade.parseToString(
39 FLVParserTest.class.getResourceAsStream("/test-documents/testFLV.flv"),
40 flvMetadata);
41
42 // The sample is expected to yield no text, only metadata.
43 assertEquals("", extractedText);
44 assertEquals("video/x-flv", flvMetadata.get(Metadata.CONTENT_TYPE));
45
46 // Stream properties are compared in their string form.
47 assertEquals("true", flvMetadata.get("hasVideo"));
48 assertEquals("false", flvMetadata.get("stereo"));
49 assertEquals("true", flvMetadata.get("hasAudio"));
50 assertEquals("120.0", flvMetadata.get("height"));
51 assertEquals("16.0", flvMetadata.get("audiosamplesize"));
52 }
53
54 }
0 /**
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.xml;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertFalse;
20 import static org.junit.Assert.assertTrue;
21
22 import java.io.InputStream;
23
24 import org.apache.tika.TikaTest;
25 import org.apache.tika.metadata.Metadata;
26 import org.apache.tika.metadata.TikaCoreProperties;
27 import org.apache.tika.sax.BodyContentHandler;
28 import org.junit.Test;
29 import org.xml.sax.ContentHandler;
30 import org.xml.sax.helpers.DefaultHandler;
31
32 public class DcXMLParserTest extends TikaTest {
33
34 @Test
35 public void testXMLParserAsciiChars() throws Exception {
36 InputStream input = DcXMLParserTest.class.getResourceAsStream(
37 "/test-documents/testXML.xml");
38 try {
39 Metadata metadata = new Metadata();
40 ContentHandler handler = new BodyContentHandler();
41 new DcXMLParser().parse(input, handler, metadata);
42
43 assertEquals(
44 "application/xml",
45 metadata.get(Metadata.CONTENT_TYPE));
46 assertEquals("Tika test document", metadata.get(TikaCoreProperties.TITLE));
47 assertEquals("Rida Benjelloun", metadata.get(TikaCoreProperties.CREATOR));
48
49 // The file contains 5 dc:subject tags, which come through as
50 // a multi-valued Tika Metadata entry in file order
51 assertEquals(true, metadata.isMultiValued(TikaCoreProperties.KEYWORDS));
52 assertEquals(5, metadata.getValues(TikaCoreProperties.KEYWORDS).length);
53 assertEquals("Java", metadata.getValues(TikaCoreProperties.KEYWORDS)[0]);
54 assertEquals("XML", metadata.getValues(TikaCoreProperties.KEYWORDS)[1]);
55 assertEquals("XSLT", metadata.getValues(TikaCoreProperties.KEYWORDS)[2]);
56 assertEquals("JDOM", metadata.getValues(TikaCoreProperties.KEYWORDS)[3]);
57 assertEquals("Indexation", metadata.getValues(TikaCoreProperties.KEYWORDS)[4]);
58 assertEquals(true, metadata.isMultiValued(Metadata.SUBJECT));
59 assertEquals(5, metadata.getValues(Metadata.SUBJECT).length);
60 assertEquals("Java", metadata.getValues(Metadata.SUBJECT)[0]);
61 assertEquals("XML", metadata.getValues(Metadata.SUBJECT)[1]);
62 assertEquals("XSLT", metadata.getValues(Metadata.SUBJECT)[2]);
63 assertEquals("JDOM", metadata.getValues(Metadata.SUBJECT)[3]);
64 assertEquals("Indexation", metadata.getValues(Metadata.SUBJECT)[4]);
65
66 assertEquals(
67 "Framework d\'indexation des documents XML, HTML, PDF etc..",
68 metadata.get(TikaCoreProperties.DESCRIPTION));
69 assertEquals(
70 "http://www.apache.org",
71 metadata.get(TikaCoreProperties.IDENTIFIER));
72 assertEquals("test", metadata.get(TikaCoreProperties.TYPE));
73 assertEquals("application/msword", metadata.get(TikaCoreProperties.FORMAT));
74 assertEquals("Fr", metadata.get(TikaCoreProperties.LANGUAGE));
75 assertTrue(metadata.get(TikaCoreProperties.RIGHTS).contains("testing chars"));
76
77 String content = handler.toString();
78 assertTrue(content.contains("Tika test document"));
79
80 assertEquals("2000-12-01T00:00:00.000Z", metadata.get(TikaCoreProperties.CREATED));
81 } finally {
82 input.close();
83 }
84 }
85
86 @Test
87 public void testXMLParserNonAsciiChars() throws Exception {
88 InputStream input = DcXMLParserTest.class.getResourceAsStream("/test-documents/testXML.xml");
89 try {
90 Metadata metadata = new Metadata();
91 new DcXMLParser().parse(input, new DefaultHandler(), metadata);
92
93 final String expected = "Archim\u00E8de et Lius \u00E0 Ch\u00E2teauneuf testing chars en \u00E9t\u00E9";
94 assertEquals(expected,metadata.get(TikaCoreProperties.RIGHTS));
95 } finally {
96 input.close();
97 }
98 }
99
100 // TIKA-1048
101 @Test
102 public void testNoSpaces() throws Exception {
103 String text = getXML("testXML2.xml").xml;
104 assertFalse(text.contains("testSubject"));
105 }
106 }
0 /**
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.xml;
17
18 import static org.junit.Assert.assertEquals;
19
20 import java.io.InputStream;
21
22 import org.apache.tika.TikaTest;
23 import org.apache.tika.metadata.Metadata;
24 import org.apache.tika.metadata.Property;
25 import org.apache.tika.parser.ParseContext;
26 import org.apache.tika.sax.BodyContentHandler;
27 import org.apache.tika.sax.TeeContentHandler;
28 import org.junit.Test;
29 import org.xml.sax.ContentHandler;
30
31 /**
32 * Checks how ElementMetadataHandler records empty and repeated XML elements:
33 * once with the four-argument constructor and once with both optional boolean
34 * switches of the six-argument constructor set to true.
35 */
36 public class EmptyAndDuplicateElementsXMLParserTest extends TikaTest {
37
38 // Shared constants: created once per class, which also lets the nested
39 // parsers below be static instead of capturing the test instance.
40 private static final Property FIRST_NAME = Property.internalTextBag(
41 "custom" + Metadata.NAMESPACE_PREFIX_DELIMITER + "FirstName");
42 private static final Property LAST_NAME = Property.internalTextBag(
43 "custom" + Metadata.NAMESPACE_PREFIX_DELIMITER + "LastName");
44
45 /**
46 * With the default handler, testXML3.xml yields four first names but only
47 * two last names: the empty and the repeated value are not recorded.
48 */
49 @Test
50 public void testDefaultBehavior() throws Exception {
51 InputStream input = EmptyAndDuplicateElementsXMLParserTest.class.getResourceAsStream(
52 "/test-documents/testXML3.xml");
53 try {
54 Metadata metadata = new Metadata();
55 ContentHandler handler = new BodyContentHandler();
56 new DefaultCustomXMLTestParser().parse(input, handler, metadata, new ParseContext());
57
58 assertEquals(4, metadata.getValues(FIRST_NAME).length);
59 assertEquals(2, metadata.getValues(LAST_NAME).length);
60
61 assertEquals("John", metadata.getValues(FIRST_NAME)[0]);
62 assertEquals("Smith", metadata.getValues(LAST_NAME)[0]);
63
64 assertEquals("Jane", metadata.getValues(FIRST_NAME)[1]);
65 assertEquals("Doe", metadata.getValues(LAST_NAME)[1]);
66
67 // We didn't know Bob's last name, but now we don't know an entry existed
68 assertEquals("Bob", metadata.getValues(FIRST_NAME)[2]);
69
70 // We don't know Kate's last name because it was a duplicate
71 assertEquals("Kate", metadata.getValues(FIRST_NAME)[3]);
72 } finally {
73 input.close();
74 }
75 }
76
77 /**
78 * With both switches enabled, every element is recorded, so the first and
79 * last names stay aligned index by index (including "" and the repeat).
80 */
81 @Test
82 public void testEmptiesAndRepeats() throws Exception {
83 InputStream input = EmptyAndDuplicateElementsXMLParserTest.class.getResourceAsStream(
84 "/test-documents/testXML3.xml");
85 try {
86 Metadata metadata = new Metadata();
87 ContentHandler handler = new BodyContentHandler();
88 new AllowEmptiesAndDuplicatesCustomXMLTestParser().parse(input, handler, metadata, new ParseContext());
89
90 assertEquals(4, metadata.getValues(FIRST_NAME).length);
91 assertEquals(4, metadata.getValues(LAST_NAME).length);
92
93 assertEquals("John", metadata.getValues(FIRST_NAME)[0]);
94 assertEquals("Smith", metadata.getValues(LAST_NAME)[0]);
95
96 assertEquals("Jane", metadata.getValues(FIRST_NAME)[1]);
97 assertEquals("Doe", metadata.getValues(LAST_NAME)[1]);
98
99 assertEquals("Bob", metadata.getValues(FIRST_NAME)[2]);
100 assertEquals("", metadata.getValues(LAST_NAME)[2]);
101
102 assertEquals("Kate", metadata.getValues(FIRST_NAME)[3]);
103 assertEquals("Smith", metadata.getValues(LAST_NAME)[3]);
104 } finally {
105 input.close();
106 }
107 }
108
109 /**
110 * XMLParser that additionally routes the FirstName and LastName elements of
111 * the "http://custom" namespace into metadata, using the four-argument
112 * ElementMetadataHandler constructor.
113 */
114 private static class DefaultCustomXMLTestParser extends XMLParser {
115
116 private static final long serialVersionUID = 2458579047014545931L;
117
118 /** Builds the handler for one custom element; subclasses vary its settings. */
119 protected ElementMetadataHandler getCustomElementHandler(Metadata metadata, Property tikaMetadata, String localPart) {
120 return new ElementMetadataHandler(
121 "http://custom",
122 localPart,
123 metadata,
124 tikaMetadata);
125 }
126
127 /** Tees SAX events to the stock handler and to the two custom element handlers. */
128 @Override
129 protected ContentHandler getContentHandler(
130 ContentHandler handler, Metadata metadata, ParseContext context) {
131 return new TeeContentHandler(
132 super.getContentHandler(handler, metadata, context),
133 getCustomElementHandler(metadata, FIRST_NAME, "FirstName"),
134 getCustomElementHandler(metadata, LAST_NAME, "LastName"));
135 }
136 }
137
138 /**
139 * Same parser, but built with the six-argument constructor and both boolean
140 * arguments set to true (presumably "allow duplicates" and "allow empties";
141 * confirm the order against ElementMetadataHandler).
142 */
143 private static class AllowEmptiesAndDuplicatesCustomXMLTestParser extends DefaultCustomXMLTestParser {
144
145 private static final long serialVersionUID = 3735646809954466229L;
146
147 @Override
148 protected ElementMetadataHandler getCustomElementHandler(Metadata metadata, Property tikaMetadata, String localPart) {
149 return new ElementMetadataHandler(
150 "http://custom",
151 localPart,
152 metadata,
153 tikaMetadata,
154 true,
155 true);
156 }
157 }
158
159 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.xml;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertTrue;
20
21 import org.apache.tika.extractor.ContainerExtractor;
22 import org.apache.tika.extractor.ParserContainerExtractor;
23 import org.apache.tika.io.TikaInputStream;
24 import org.apache.tika.metadata.Metadata;
25 import org.apache.tika.parser.microsoft.AbstractPOIContainerExtractionTest;
26 import org.apache.tika.sax.BodyContentHandler;
27 import org.junit.Test;
28 import org.xml.sax.ContentHandler;
29
30 import java.io.InputStream;
31
32 public class FictionBookParserTest {
33
34 @Test
35 public void testFB2() throws Exception {
36 InputStream input = FictionBookParserTest.class.getResourceAsStream("/test-documents/test.fb2");
37 try {
38 Metadata metadata = new Metadata();
39 ContentHandler handler = new BodyContentHandler();
40 new FictionBookParser().parse(input, handler, metadata);
41 String content = handler.toString();
42
43 assertTrue(content.contains("1812"));
44 } finally {
45 input.close();
46 }
47 }
48
49 @Test
50 public void testEmbedded() throws Exception {
51 InputStream input = FictionBookParserTest.class.getResourceAsStream("/test-documents/test.fb2");
52 try {
53 ContainerExtractor extractor = new ParserContainerExtractor();
54 TikaInputStream stream = TikaInputStream.get(input);
55
56 assertEquals(true, extractor.isSupported(stream));
57
58 // Process it
59 AbstractPOIContainerExtractionTest.TrackingHandler handler = new AbstractPOIContainerExtractionTest.TrackingHandler();
60 extractor.extract(stream, null, handler);
61
62 assertEquals(2, handler.filenames.size());
63 } finally {
64 input.close();
65 }
66 }
67 }
0 This is the JAX-RS server for Apache Tika
1 (https://issues.apache.org/jira/browse/TIKA-593)
2
3 Running
4 -------
5 java -jar target/tika-server-1.5.jar
6
7 Usage
8 -----
9 Usage examples from command line with curl utility:
10
11 1) Extract plain text:
12
13 curl -T price.xls http://localhost:9998/tika
14
15 2) Extract text with mime-type hint:
16
17 curl -v -H "Content-type: application/vnd.openxmlformats-officedocument.wordprocessingml.document" -T document.docx http://localhost:9998/tika
18
19 3) Get all document attachments as a ZIP file:
20
21 curl -v -T Doc1_ole.doc http://localhost:9998/unpacker > /var/tmp/x.zip
22
23 4) Extract metadata to CSV format:
24
25 curl -T price.xls http://localhost:9998/meta
26
27 HTTP Codes
28 ----------
29 200 - OK
30 204 - No content (for example, when unpacking a file that has no attachments)
31 415 - Unknown file type
32 422 - Unparsable document of a known type (password-protected documents and unsupported versions such as Biff5 Excel)
33 500 - Internal error
34
0 <!--
1 Licensed to the Apache Software Foundation (ASF) under one or more
2 contributor license agreements. See the NOTICE file distributed with
3 this work for additional information regarding copyright ownership.
4 The ASF licenses this file to You under the Apache License, Version 2.0
5 (the "License"); you may not use this file except in compliance with
6 the License. You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10 Unless required by applicable law or agreed to in writing, software
11 distributed under the License is distributed on an "AS IS" BASIS,
12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 See the License for the specific language governing permissions and
14 limitations under the License.
15 -->
16 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
17 <modelVersion>4.0.0</modelVersion>
18
19 <parent>
20 <groupId>org.apache.tika</groupId>
21 <artifactId>tika-parent</artifactId>
22 <version>1.5</version>
23 <relativePath>../tika-parent/pom.xml</relativePath>
24 </parent>
25
26 <artifactId>tika-server</artifactId>
27 <name>Apache Tika server</name>
28
29 <dependencies>
30 <dependency>
31 <groupId>${project.groupId}</groupId>
32 <artifactId>tika-parsers</artifactId>
33 <version>${project.version}</version>
34 </dependency>
35 <dependency>
36 <groupId>net.sf.opencsv</groupId>
37 <artifactId>opencsv</artifactId>
38 <version>2.0</version>
39 </dependency>
40 <dependency>
41 <groupId>org.apache.cxf</groupId>
42 <artifactId>cxf-rt-frontend-jaxrs</artifactId>
43 <version>2.7.8</version>
44 </dependency>
45 <dependency>
46 <groupId>org.apache.cxf</groupId>
47 <artifactId>cxf-rt-transports-http-jetty</artifactId>
48 <version>2.7.8</version>
49 </dependency>
50 <dependency>
51 <groupId>commons-cli</groupId>
52 <artifactId>commons-cli</artifactId>
53 <version>1.2</version>
54 </dependency>
55 <dependency>
56 <groupId>commons-lang</groupId>
57 <artifactId>commons-lang</artifactId>
58 <version>2.5</version>
59 </dependency>
60 <dependency>
61 <groupId>junit</groupId>
62 <artifactId>junit</artifactId>
63 <scope>test</scope>
64 <version>4.11</version>
65 </dependency>
66 </dependencies>
67
68 <build>
69 <plugins>
70 <plugin>
71 <artifactId>maven-surefire-plugin</artifactId>
72 <configuration>
73 <redirectTestOutputToFile>true</redirectTestOutputToFile>
74 <argLine>-da -XX:+HeapDumpOnOutOfMemoryError -Xmx512m</argLine>
75 <systemPropertyVariables>
76 <java.util.logging.config.file>
77 ${basedir}/src/main/resources/commons-logging.properties
78 </java.util.logging.config.file>
79 </systemPropertyVariables>
80 </configuration>
81 </plugin>
82 <plugin>
83 <artifactId>maven-shade-plugin</artifactId>
84 <executions>
85 <execution>
86 <phase>package</phase>
87 <goals>
88 <goal>shade</goal>
89 </goals>
90 <configuration>
91 <createDependencyReducedPom>
92 false
93 </createDependencyReducedPom>
94 <filters>
95 <filter>
96 <artifact>*:*</artifact>
97 <excludes>
98 <exclude>META-INF/*.SF</exclude>
99 <exclude>META-INF/*.DSA</exclude>
100 <exclude>META-INF/*.RSA</exclude>
101 <exclude>META-INF/*.txt</exclude>
102 <exclude>META-INF/ASL2.0</exclude>
103 <exclude>META-INF/DEPENDENCIES</exclude>
104 <exclude>META-INF/LICENSE</exclude>
105 <exclude>META-INF/NOTICE</exclude>
106 <exclude>META-INF/README</exclude>
107 <exclude>LICENSE.txt</exclude>
108 <exclude>NOTICE.txt</exclude>
109 <exclude>CHANGES</exclude>
110 <exclude>README</exclude>
111 <exclude>builddef.lst</exclude>
112 <!-- TIKA-763: Workaround to avoid including LGPL classes -->
113 <exclude>ucar/nc2/iosp/fysat/Fysat*.class</exclude>
114 <exclude>ucar/nc2/dataset/transform/VOceanSG1*class</exclude>
115 <exclude>ucar/unidata/geoloc/vertical/OceanSG*.class</exclude>
116 </excludes>
117 </filter>
118 </filters>
119 <transformers>
120 <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
121 <mainClass>org.apache.tika.server.TikaServerCli</mainClass>
122 </transformer>
123 <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
124 <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
125 <resource>META-INF/LICENSE</resource>
126 <file>target/classes/META-INF/LICENSE</file>
127 </transformer>
128 <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
129 <resource>META-INF/NOTICE</resource>
130 <file>target/classes/META-INF/NOTICE</file>
131 </transformer>
132 <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
133 <resource>META-INF/DEPENDENCIES</resource>
134 <file>target/classes/META-INF/DEPENDENCIES</file>
135 </transformer>
136 <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
137 <resource>META-INF/spring.handlers</resource>
138 </transformer>
139 <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
140 <resource>META-INF/spring.schemas</resource>
141 </transformer>
142 <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
143 <resource>META-INF/cxf/cxf.extension</resource>
144 </transformer>
145 <transformer implementation="org.apache.maven.plugins.shade.resource.XmlAppendingTransformer">
146 <resource>META-INF/extensions.xml</resource>
147 </transformer>
148 <transformer implementation="org.apache.maven.plugins.shade.resource.XmlAppendingTransformer">
149 <resource>META-INF/cxf/extensions.xml</resource>
150 </transformer>
151 <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
152 <resource>META-INF/cxf/bus-extensions.txt</resource>
153 </transformer>
154 <transformer implementation="org.apache.maven.plugins.shade.resource.XmlAppendingTransformer">
155 <resource>META-INF/cxf/bus-extensions.xml</resource>
156 </transformer>
157 <transformer implementation="org.apache.maven.plugins.shade.resource.XmlAppendingTransformer">
158 <resource>META-INF/wsdl.plugin.xml</resource>
159 </transformer>
160 <transformer implementation="org.apache.maven.plugins.shade.resource.XmlAppendingTransformer">
161 <resource>META-INF/tools.service.validator.xml</resource>
162 </transformer>
163 <transformer implementation="org.apache.maven.plugins.shade.resource.XmlAppendingTransformer">
164 <resource>META-INF/cxf/java2wsbeans.xml</resource>
165 </transformer>
166 </transformers>
167 </configuration>
168 </execution>
169 </executions>
170 </plugin>
171 </plugins>
172 </build>
173 <profiles> <!-- Optional build profiles. No activation is declared here, so a profile has to be enabled explicitly (for example with -Pserver). -->
174 <profile>
175 <id>server</id> <!-- Convenience profile that starts the Tika server straight from the build. -->
176 <build>
177 <defaultGoal>test</defaultGoal> <!-- Used when no goal or phase is given on the command line. -->
178 <plugins>
179 <plugin>
180 <groupId>org.codehaus.mojo</groupId>
181 <artifactId>exec-maven-plugin</artifactId>
182 <executions>
183 <execution>
184 <phase>test</phase> <!-- The exec goal below is bound to the test phase. -->
185 <goals>
186 <goal>java</goal> <!-- exec:java runs the main class configured below. -->
187 </goals>
188 <configuration>
189 <mainClass>org.apache.tika.server.TikaServerCli</mainClass> <!-- Same entry point as the shaded jar's manifest mainClass. -->
190 </configuration>
191 </execution>
192 </executions>
193 </plugin>
194 </plugins>
195 </build>
196 </profile>
197 </profiles>
198 <url>http://tika.apache.org/</url>
199 <organization>
200 <name>The Apache Software Foundation</name>
201 <url>http://www.apache.org</url>
202 </organization>
203 <scm>
204 <url>http://svn.apache.org/viewvc/tika/tags/1.5/tika-server</url>
205 <connection>scm:svn:http://svn.apache.org/repos/asf/tika/tags/1.5/tika-server</connection>
206 <developerConnection>scm:svn:https://svn.apache.org/repos/asf/tika/tags/1.5/tika-server</developerConnection>
207 </scm>
208 <issueManagement>
209 <system>JIRA</system>
210 <url>https://issues.apache.org/jira/browse/TIKA</url>
211 </issueManagement>
212 <ciManagement>
213 <system>Jenkins</system>
214 <url>https://builds.apache.org/job/Tika-trunk/</url>
215 </ciManagement>
216 </project>
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.server;
18
19 import org.apache.tika.metadata.Metadata;
20
21 import javax.ws.rs.Produces;
22 import javax.ws.rs.WebApplicationException;
23 import javax.ws.rs.core.MediaType;
24 import javax.ws.rs.core.MultivaluedMap;
25 import javax.ws.rs.ext.MessageBodyWriter;
26 import javax.ws.rs.ext.Provider;
27
28 import java.io.IOException;
29 import java.io.OutputStream;
30 import java.io.OutputStreamWriter;
31 import java.lang.annotation.Annotation;
32 import java.lang.reflect.Type;
33 import java.util.ArrayList;
34 import java.util.Arrays;
35
36 import au.com.bytecode.opencsv.CSVWriter;
37
38 @Provider
39 @Produces("text/csv")
40 public class CSVMessageBodyWriter implements MessageBodyWriter<Metadata> {
41
42 public boolean isWriteable(Class<?> type, Type genericType, Annotation[] annotations, MediaType mediaType) {
43 return Metadata.class.isAssignableFrom(type);
44 }
45
46 public long getSize(Metadata data, Class<?> type, Type genericType, Annotation[] annotations, MediaType mediaType) {
47 return -1;
48 }
49
50 @Override
51 public void writeTo(Metadata metadata, Class<?> type, Type genericType, Annotation[] annotations,
52 MediaType mediaType, MultivaluedMap<String, Object> httpHeaders, OutputStream entityStream) throws IOException,
53 WebApplicationException {
54
55 CSVWriter writer = new CSVWriter(new OutputStreamWriter(entityStream, "UTF-8"));
56
57 for (String name : metadata.names()) {
58 String[] values = metadata.getValues(name);
59 ArrayList<String> list = new ArrayList<String>(values.length + 1);
60 list.add(name);
61 list.addAll(Arrays.asList(values));
62 writer.writeNext(list.toArray(values));
63 }
64 // don't close, just flush the stream
65 writer.flush();
66 }
67 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.server;
18
19 import org.apache.tika.io.IOUtils;
20 import org.apache.tika.metadata.Metadata;
21 import org.eclipse.jetty.util.ajax.JSON;
22
23 import javax.ws.rs.Produces;
24 import javax.ws.rs.WebApplicationException;
25 import javax.ws.rs.core.MediaType;
26 import javax.ws.rs.core.MultivaluedMap;
27 import javax.ws.rs.ext.MessageBodyWriter;
28 import javax.ws.rs.ext.Provider;
29
30 import java.io.IOException;
31 import java.io.OutputStream;
32 import java.io.StringReader;
33 import java.lang.annotation.Annotation;
34 import java.lang.reflect.Type;
35 import java.util.Map;
36 import java.util.TreeMap;
37
38 @Provider
39 @Produces(MediaType.APPLICATION_JSON)
40 public class JSONMessageBodyWriter implements MessageBodyWriter<Metadata> {
41
42 public boolean isWriteable(Class<?> type, Type genericType, Annotation[] annotations, MediaType mediaType) {
43 return Metadata.class.isAssignableFrom(type);
44 }
45
46 public long getSize(Metadata data, Class<?> type, Type genericType, Annotation[] annotations, MediaType mediaType) {
47 return -1;
48 }
49
50 @Override
51 public void writeTo(Metadata metadata, Class<?> type, Type genericType, Annotation[] annotations,
52 MediaType mediaType, MultivaluedMap<String, Object> httpHeaders, OutputStream entityStream) throws IOException,
53 WebApplicationException {
54
55 Map<String, Object> res = new TreeMap<String, Object>();
56
57 for (String name : metadata.names()) {
58 String[] values = metadata.getValues(name);
59 if (metadata.isMultiValued(name)) {
60 res.put(name, values);
61 } else {
62 res.put(name, values[0]);
63 }
64 }
65
66 String json = JSON.toString(res);
67 System.err.println("JSON : "+json);
68 StringReader r = new StringReader(json);
69 IOUtils.copy(r, entityStream);
70 entityStream.flush();
71 }
72 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.server;
18
19 import java.io.InputStream;
20
21 import javax.ws.rs.POST;
22 import javax.ws.rs.Path;
23 import javax.ws.rs.PathParam;
24 import javax.ws.rs.Produces;
25 import javax.ws.rs.core.Context;
26 import javax.ws.rs.core.HttpHeaders;
27 import javax.ws.rs.core.MediaType;
28 import javax.ws.rs.core.Response;
29 import javax.ws.rs.core.Response.Status;
30 import javax.ws.rs.core.UriInfo;
31
32 import org.apache.commons.logging.Log;
33 import org.apache.commons.logging.LogFactory;
34 import org.apache.tika.metadata.Metadata;
35 import org.apache.tika.parser.AutoDetectParser;
36 import org.xml.sax.helpers.DefaultHandler;
37
38 /**
39 * This JAX-RS endpoint provides access to the metadata contained within a
40 * document. It is possible to submit a relatively small prefix (a few KB) of a
41 * document's content to retrieve individual metadata fields.
42 * <p>
43 */
44 @Path("/metadata")
45 public class MetadataEP {
46 private static final Log logger = LogFactory.getLog(MetadataEP.class);
47
48 /** The parser to use */
49 private final AutoDetectParser parser;
50
51 /** The metdata for the request */
52 private final Metadata metadata = new Metadata();
53
54 public MetadataEP(@Context HttpHeaders httpHeaders, @Context UriInfo info) {
55 parser = TikaResource.createParser();
56 TikaResource.fillMetadata(parser, metadata, httpHeaders.getRequestHeaders());
57 TikaResource.logRequest(logger, info, metadata);
58 }
59
60 /**
61 * Get all metadata that can be parsed from the specified input stream. An
62 * error is produced if the input stream cannot be parsed.
63 *
64 * @param is
65 * an input stream
66 * @return the metadata
67 * @throws Exception
68 */
69 @POST
70 public Response getMetadata(InputStream is) throws Exception {
71 parser.parse(is, new DefaultHandler(), metadata);
72 return Response.ok(metadata).build();
73 }
74
75 /**
76 * Get a specific TIKA metadata field as a simple text string. If the field is
77 * multivalued, then only the first value is returned. If the input stream
78 * cannot be parsed, but a value was found for the given metadata field, then
79 * the value of the field is returned as part of a 200 OK response; otherwise
80 * a {@link Status#BAD_REQUEST} is generated. If the stream was successfully
81 * parsed but the specific metadata field was not found, then a
82 * {@link Status#NOT_FOUND} is returned.
83 * <p>
84 *
85 * @param field
86 * the tika metadata field name
87 * @param is
88 * the document stream
89 * @return one of {@link Status#OK}, {@link Status#NOT_FOUND}, or
90 * {@link Status#BAD_REQUEST}
91 * @throws Exception
92 */
93 @POST
94 @Path("{field}")
95 @Produces(MediaType.TEXT_PLAIN)
96 public Response getSimpleMetadataField(@PathParam("field") String field, InputStream is) throws Exception {
97
98 // use BAD request to indicate that we may not have had enough data to
99 // process the request
100 Status defaultErrorResponse = Status.BAD_REQUEST;
101 try {
102 parser.parse(is, new DefaultHandler(), metadata);
103 // once we've parsed the document successfully, we should use NOT_FOUND
104 // if we did not see the field
105 defaultErrorResponse = Status.NOT_FOUND;
106 } catch (Exception e) {
107 logger.info("Failed to process field " + field, e);
108 }
109 String value = metadata.get(field);
110 if (value == null) {
111 return Response.status(defaultErrorResponse).entity("Failed to get metadata field " + field).build();
112 }
113 return Response.ok(value, MediaType.TEXT_PLAIN_TYPE).build();
114 }
115
116 /**
117 * Get a specific metadata field. If the input stream cannot be parsed, but a
118 * value was found for the given metadata field, then the value of the field
119 * is returned as part of a 200 OK response; otherwise a
120 * {@link Status#BAD_REQUEST} is generated. If the stream was successfully
121 * parsed but the specific metadata field was not found, then a
122 * {@link Status#NOT_FOUND} is returned.
123 * <p>
124 * Note that this method handles multivalue fields and returns possibly more
125 * metadata than requested.
126 *
127 * @param field
128 * the tika metadata field name
129 * @param is
130 * the document stream
131 * @return one of {@link Status#OK}, {@link Status#NOT_FOUND}, or
132 * {@link Status#BAD_REQUEST}
133 * @throws Exception
134 */
135 @POST
136 @Path("{field}")
137 public Response getMetadataField(@PathParam("field") String field, InputStream is) throws Exception {
138
139 // use BAD request to indicate that we may not have had enough data to
140 // process the request
141 Status defaultErrorResponse = Status.BAD_REQUEST;
142 try {
143 parser.parse(is, new DefaultHandler(), metadata);
144 // once we've parsed the document successfully, we should use NOT_FOUND
145 // if we did not see the field
146 defaultErrorResponse = Status.NOT_FOUND;
147 } catch (Exception e) {
148 logger.info("Failed to process field " + field, e);
149 }
150 String[] values = metadata.getValues(field);
151 if (values.length == 0) {
152 return Response.status(defaultErrorResponse).entity("Failed to get metadata field " + field).build();
153 }
154 // remove fields we don't care about for the response
155 for (String name : metadata.names()) {
156 if (!field.equals(name)) {
157 metadata.remove(name);
158 }
159 }
160 return Response.ok(metadata).build();
161 }
162
163 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.server;
18
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.io.OutputStream;
22 import java.io.OutputStreamWriter;
23 import java.util.ArrayList;
24 import java.util.Arrays;
25
26 import javax.ws.rs.Consumes;
27 import javax.ws.rs.PUT;
28 import javax.ws.rs.Path;
29 import javax.ws.rs.Produces;
30 import javax.ws.rs.WebApplicationException;
31 import javax.ws.rs.core.Context;
32 import javax.ws.rs.core.HttpHeaders;
33 import javax.ws.rs.core.MultivaluedMap;
34 import javax.ws.rs.core.StreamingOutput;
35 import javax.ws.rs.core.UriInfo;
36
37 import org.apache.commons.logging.Log;
38 import org.apache.commons.logging.LogFactory;
39 import org.apache.cxf.jaxrs.ext.multipart.Attachment;
40 import org.apache.tika.metadata.Metadata;
41 import org.apache.tika.parser.AutoDetectParser;
42 import org.xml.sax.helpers.DefaultHandler;
43
44 import au.com.bytecode.opencsv.CSVWriter;
45
46 @Path("/meta")
47 public class MetadataResource {
48 private static final Log logger = LogFactory.getLog(MetadataResource.class);
49
50 @PUT
51 @Consumes("multipart/form-data")
52 @Produces("text/csv")
53 @Path("form")
54 public StreamingOutput getMetadataFromMultipart(Attachment att, @Context UriInfo info) throws Exception {
55 return produceMetadata(att.getObject(InputStream.class), att.getHeaders(), info);
56 }
57
58 @PUT
59 @Produces("text/csv")
60 public StreamingOutput getMetadata(InputStream is, @Context HttpHeaders httpHeaders, @Context UriInfo info) throws Exception {
61 return produceMetadata(is, httpHeaders.getRequestHeaders(), info);
62 }
63
64 private StreamingOutput produceMetadata(InputStream is, MultivaluedMap<String, String> httpHeaders, UriInfo info) throws Exception {
65 final Metadata metadata = new Metadata();
66 AutoDetectParser parser = TikaResource.createParser();
67 TikaResource.fillMetadata(parser, metadata, httpHeaders);
68 TikaResource.logRequest(logger, info, metadata);
69
70 parser.parse(is, new DefaultHandler(), metadata);
71
72 return new StreamingOutput() {
73 public void write(OutputStream outputStream) throws IOException, WebApplicationException {
74 metadataToCsv(metadata, outputStream);
75 }
76 };
77 }
78
79 public static void metadataToCsv(Metadata metadata, OutputStream outputStream) throws IOException {
80 CSVWriter writer = new CSVWriter(new OutputStreamWriter(outputStream, "UTF-8"));
81
82 for (String name : metadata.names()) {
83 String[] values = metadata.getValues(name);
84 ArrayList<String> list = new ArrayList<String>(values.length+1);
85 list.add(name);
86 list.addAll(Arrays.asList(values));
87 writer.writeNext(list.toArray(values));
88 }
89
90 writer.close();
91 }
92 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.server;
18
19 import org.apache.tika.sax.WriteOutContentHandler;
20 import org.xml.sax.Attributes;
21 import org.xml.sax.SAXException;
22
23 import java.io.Writer;
24
25 class RichTextContentHandler extends WriteOutContentHandler {
26 public RichTextContentHandler(Writer writer) {
27 super(writer);
28 }
29
30 @Override
31 public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
32 super.startElement(uri, localName, qName, attributes);
33
34 if ("img".equals(localName) && attributes.getValue("alt")!=null) {
35 String nfo = "[image: "+attributes.getValue("alt")+ ']';
36
37 characters(nfo.toCharArray(), 0, nfo.length());
38 }
39
40 if ("a".equals(localName) && attributes.getValue("name")!=null) {
41 String nfo = "[bookmark: "+attributes.getValue("name")+ ']';
42
43 characters(nfo.toCharArray(), 0, nfo.length());
44 }
45 }
46 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.server;
18
19 import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
20 import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
21
22 import javax.ws.rs.Produces;
23 import javax.ws.rs.WebApplicationException;
24 import javax.ws.rs.core.MediaType;
25 import javax.ws.rs.core.MultivaluedMap;
26 import javax.ws.rs.ext.MessageBodyWriter;
27 import javax.ws.rs.ext.Provider;
28 import java.io.IOException;
29 import java.io.OutputStream;
30 import java.lang.annotation.Annotation;
31 import java.lang.reflect.Type;
32 import java.util.Map;
33
34 @Provider
35 @Produces("application/x-tar")
36 public class TarWriter implements MessageBodyWriter<Map<String, byte[]>> {
37 private static void tarStoreBuffer(TarArchiveOutputStream zip, String name, byte[] dataBuffer) throws IOException {
38 TarArchiveEntry entry = new TarArchiveEntry(name);
39
40 entry.setSize(dataBuffer.length);
41
42 zip.putArchiveEntry(entry);
43
44 zip.write(dataBuffer);
45
46 zip.closeArchiveEntry();
47 }
48
49 public boolean isWriteable(Class<?> type, Type genericType, Annotation[] annotations, MediaType mediaType) {
50 return Map.class.isAssignableFrom(type);
51 }
52
53 public long getSize(Map<String, byte[]> stringMap, Class<?> type, Type genericType, Annotation[] annotations, MediaType mediaType) {
54 return -1;
55 }
56
57 public void writeTo(Map<String, byte[]> parts, Class<?> type, Type genericType, Annotation[] annotations, MediaType mediaType, MultivaluedMap<String, Object> httpHeaders, OutputStream entityStream) throws IOException, WebApplicationException {
58 TarArchiveOutputStream zip = new TarArchiveOutputStream(entityStream);
59
60 for (Map.Entry<String, byte[]> entry : parts.entrySet()) {
61 tarStoreBuffer(zip, entry.getKey(), entry.getValue());
62 }
63
64 zip.close();
65 }
66 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.server;
18
19 import org.apache.tika.exception.TikaException;
20
21 import javax.ws.rs.WebApplicationException;
22 import javax.ws.rs.core.Response;
23 import javax.ws.rs.ext.ExceptionMapper;
24 import javax.ws.rs.ext.Provider;
25
26 @Provider
27 public class TikaExceptionMapper implements ExceptionMapper<TikaException> {
28 public Response toResponse(TikaException e) {
29 if (e.getCause() !=null && e.getCause() instanceof WebApplicationException) {
30 return ((WebApplicationException) e.getCause()).getResponse();
31 } else {
32 return Response.serverError().build();
33 }
34 }
35 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.server;
18
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.io.OutputStream;
22 import java.io.OutputStreamWriter;
23 import java.io.Writer;
24 import java.util.Map;
25 import java.util.Set;
26
27 import javax.mail.internet.ContentDisposition;
28 import javax.mail.internet.ParseException;
29 import javax.ws.rs.Consumes;
30 import javax.ws.rs.GET;
31 import javax.ws.rs.PUT;
32 import javax.ws.rs.Path;
33 import javax.ws.rs.Produces;
34 import javax.ws.rs.WebApplicationException;
35 import javax.ws.rs.core.Context;
36 import javax.ws.rs.core.HttpHeaders;
37 import javax.ws.rs.core.MultivaluedMap;
38 import javax.ws.rs.core.Response;
39 import javax.ws.rs.core.StreamingOutput;
40 import javax.ws.rs.core.UriInfo;
41 import javax.xml.transform.OutputKeys;
42 import javax.xml.transform.TransformerConfigurationException;
43 import javax.xml.transform.sax.SAXTransformerFactory;
44 import javax.xml.transform.sax.TransformerHandler;
45 import javax.xml.transform.stream.StreamResult;
46
47 import org.apache.commons.logging.Log;
48 import org.apache.commons.logging.LogFactory;
49 import org.apache.cxf.jaxrs.ext.multipart.Attachment;
50 import org.apache.poi.extractor.ExtractorFactory;
51 import org.apache.poi.hwpf.OldWordFileFormatException;
52 import org.apache.tika.detect.Detector;
53 import org.apache.tika.exception.EncryptedDocumentException;
54 import org.apache.tika.exception.TikaException;
55 import org.apache.tika.io.TikaInputStream;
56 import org.apache.tika.metadata.Metadata;
57 import org.apache.tika.metadata.TikaMetadataKeys;
58 import org.apache.tika.mime.MediaType;
59 import org.apache.tika.parser.AutoDetectParser;
60 import org.apache.tika.parser.ParseContext;
61 import org.apache.tika.parser.Parser;
62 import org.apache.tika.parser.html.HtmlParser;
63 import org.apache.tika.sax.BodyContentHandler;
64 import org.apache.tika.sax.ExpandedTitleContentHandler;
65 import org.xml.sax.ContentHandler;
66 import org.xml.sax.SAXException;
67
68 @Path("/tika")
69 public class TikaResource {
70 public static final String GREETING = "This is Tika Server. Please PUT\n";
71 private final Log logger = LogFactory.getLog(TikaResource.class);
72
73 static {
74 ExtractorFactory.setAllThreadsPreferEventExtractors(true);
75 }
76
77 @GET
78 @Produces("text/plain")
79 public String getMessage() {
80 return GREETING;
81 }
82
83 @SuppressWarnings("serial")
84 public static AutoDetectParser createParser() {
85 final AutoDetectParser parser = new AutoDetectParser();
86
87 Map<MediaType,Parser> parsers = parser.getParsers();
88 parsers.put(MediaType.APPLICATION_XML, new HtmlParser());
89 parser.setParsers(parsers);
90
91 parser.setFallback(new Parser() {
92 public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
93 return parser.getSupportedTypes(parseContext);
94 }
95
96 public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) {
97 throw new WebApplicationException(Response.Status.UNSUPPORTED_MEDIA_TYPE);
98 }
99 });
100
101 return parser;
102 }
103
104 public static String detectFilename(MultivaluedMap<String, String> httpHeaders) {
105
106 String disposition = httpHeaders.getFirst("Content-Disposition");
107 if (disposition != null) {
108 try {
109 ContentDisposition c = new ContentDisposition(disposition);
110
111 // only support "attachment" dispositions
112 if ("attachment".equals(c.getDisposition())) {
113 String fn = c.getParameter("filename");
114 if (fn != null) {
115 return fn;
116 }
117 }
118 } catch (ParseException e) {
119 // not a valid content-disposition field
120 }
121 }
122
123 // this really should not be used, since it's not an official field
124 return httpHeaders.getFirst("File-Name");
125 }
126
127 @SuppressWarnings("serial")
128 public static void fillMetadata(AutoDetectParser parser, Metadata metadata, MultivaluedMap<String, String> httpHeaders) {
129 String fileName = detectFilename(httpHeaders);
130 if (fileName != null) {
131 metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, fileName);
132 }
133
134 String contentTypeHeader = httpHeaders.getFirst(HttpHeaders.CONTENT_TYPE);
135 javax.ws.rs.core.MediaType mediaType = contentTypeHeader == null ? null
136 : javax.ws.rs.core.MediaType.valueOf(contentTypeHeader);
137 if (mediaType!=null && "xml".equals(mediaType.getSubtype()) ) {
138 mediaType = null;
139 }
140
141 if (mediaType !=null && mediaType.equals(javax.ws.rs.core.MediaType.APPLICATION_OCTET_STREAM_TYPE)) {
142 mediaType = null;
143 }
144
145 if (mediaType !=null) {
146 metadata.add(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE, mediaType.toString());
147
148 final Detector detector = parser.getDetector();
149
150 parser.setDetector(new Detector() {
151 public MediaType detect(InputStream inputStream, Metadata metadata) throws IOException {
152 String ct = metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE);
153
154 if (ct!=null) {
155 return MediaType.parse(ct);
156 } else {
157 return detector.detect(inputStream, metadata);
158 }
159 }
160 });
161 }
162 }
163
164 @PUT
165 @Consumes("multipart/form-data")
166 @Produces("text/plain")
167 @Path("form")
168 public StreamingOutput getTextFromMultipart(Attachment att, @Context final UriInfo info) {
169 return produceText(att.getObject(InputStream.class), att.getHeaders(), info);
170 }
171
172 @PUT
173 @Consumes("*/*")
174 @Produces("text/plain")
175 public StreamingOutput getText(final InputStream is, @Context HttpHeaders httpHeaders, @Context final UriInfo info) {
176 return produceText(is, httpHeaders.getRequestHeaders(), info);
177 }
178 public StreamingOutput produceText(final InputStream is, MultivaluedMap<String, String> httpHeaders, final UriInfo info) {
179 final AutoDetectParser parser = createParser();
180 final Metadata metadata = new Metadata();
181
182 fillMetadata(parser, metadata, httpHeaders);
183
184 logRequest(logger, info, metadata);
185
186 return new StreamingOutput() {
187 public void write(OutputStream outputStream) throws IOException, WebApplicationException {
188 Writer writer = new OutputStreamWriter(outputStream, "UTF-8");
189
190 BodyContentHandler body = new BodyContentHandler(new RichTextContentHandler(writer));
191
192 TikaInputStream tis = TikaInputStream.get(is);
193
194 try {
195 parser.parse(tis, body, metadata);
196 } catch (SAXException e) {
197 throw new WebApplicationException(e);
198 } catch (EncryptedDocumentException e) {
199 logger.warn(String.format(
200 "%s: Encrypted document",
201 info.getPath()
202 ), e);
203
204 throw new WebApplicationException(e, Response.status(422).build());
205 } catch (TikaException e) {
206 logger.warn(String.format(
207 "%s: Text extraction failed",
208 info.getPath()
209 ), e);
210
211 if (e.getCause()!=null && e.getCause() instanceof WebApplicationException) {
212 throw (WebApplicationException) e.getCause();
213 }
214
215 if (e.getCause()!=null && e.getCause() instanceof IllegalStateException) {
216 throw new WebApplicationException(Response.status(422).build());
217 }
218
219 if (e.getCause()!=null && e.getCause() instanceof OldWordFileFormatException) {
220 throw new WebApplicationException(Response.status(422).build());
221 }
222
223 throw new WebApplicationException(Response.Status.INTERNAL_SERVER_ERROR);
224 } finally {
225 tis.close();
226 }
227 }
228 };
229 }
230
231 @PUT
232 @Consumes("multipart/form-data")
233 @Produces("text/html")
234 @Path("form")
235 public StreamingOutput getHTMLFromMultipart(Attachment att, @Context final UriInfo info) {
236 return produceOutput(att.getObject(InputStream.class), att.getHeaders(), info, "html");
237 }
238
239 @PUT
240 @Consumes("*/*")
241 @Produces("text/html")
242 public StreamingOutput getHTML(final InputStream is, @Context HttpHeaders httpHeaders, @Context final UriInfo info) {
243 return produceOutput(is, httpHeaders.getRequestHeaders(), info, "html");
244 }
245
246 @PUT
247 @Consumes("multipart/form-data")
248 @Produces("text/xml")
249 @Path("form")
250 public StreamingOutput getXMLFromMultipart(Attachment att, @Context final UriInfo info) {
251 return produceOutput(att.getObject(InputStream.class), att.getHeaders(), info, "xml");
252 }
253
254 @PUT
255 @Consumes("*/*")
256 @Produces("text/xml")
257 public StreamingOutput getXML(final InputStream is, @Context HttpHeaders httpHeaders, @Context final UriInfo info) {
258 return produceOutput(is, httpHeaders.getRequestHeaders(), info, "xml");
259 }
260
261 private StreamingOutput produceOutput(final InputStream is, final MultivaluedMap<String, String> httpHeaders,
262 final UriInfo info, final String format) {
263 final AutoDetectParser parser = createParser();
264 final Metadata metadata = new Metadata();
265
266 fillMetadata(parser, metadata, httpHeaders);
267
268 logRequest(logger, info, metadata);
269
270 return new StreamingOutput() {
271 public void write(OutputStream outputStream)
272 throws IOException, WebApplicationException {
273 Writer writer = new OutputStreamWriter(outputStream, "UTF-8");
274 ContentHandler content;
275
276 try {
277 SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance( );
278 TransformerHandler handler = factory.newTransformerHandler( );
279 handler.getTransformer().setOutputProperty(OutputKeys.METHOD, format);
280 handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
281 handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "UTF-8");
282 handler.setResult(new StreamResult(writer));
283 content = new ExpandedTitleContentHandler( handler );
284 }
285 catch ( TransformerConfigurationException e ) {
286 throw new WebApplicationException( e );
287 }
288
289 TikaInputStream tis = TikaInputStream.get(is);
290
291 try {
292 parser.parse(tis, content, metadata);
293 }
294 catch (SAXException e) {
295 throw new WebApplicationException(e);
296 }
297 catch (EncryptedDocumentException e) {
298 logger.warn(String.format(
299 "%s: Encrypted document",
300 info.getPath()
301 ), e);
302 throw new WebApplicationException(e, Response.status(422).build());
303 }
304 catch (TikaException e) {
305 logger.warn(String.format(
306 "%s: Text extraction failed",
307 info.getPath()
308 ), e);
309
310 if (e.getCause()!=null && e.getCause() instanceof WebApplicationException)
311 throw (WebApplicationException) e.getCause();
312
313 if (e.getCause()!=null && e.getCause() instanceof IllegalStateException)
314 throw new WebApplicationException(Response.status(422).build());
315
316 if (e.getCause()!=null && e.getCause() instanceof OldWordFileFormatException)
317 throw new WebApplicationException(Response.status(422).build());
318
319 throw new WebApplicationException(Response.Status.INTERNAL_SERVER_ERROR);
320 }
321 finally {
322 tis.close();
323 }
324 }
325 };
326 }
327
328 public static void logRequest(Log logger, UriInfo info, Metadata metadata) {
329 if (metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE)==null) {
330 logger.info(String.format(
331 "%s (autodetecting type)",
332 info.getPath()
333 ));
334 } else {
335 logger.info(String.format(
336 "%s (%s)",
337 info.getPath(),
338 metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE)
339 ));
340 }
341 }
342 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.server;
18
19 import java.io.IOException;
20 import java.util.ArrayList;
21 import java.util.List;
22 import java.util.Properties;
23
24 import org.apache.commons.cli.CommandLine;
25 import org.apache.commons.cli.CommandLineParser;
26 import org.apache.commons.cli.GnuParser;
27 import org.apache.commons.cli.HelpFormatter;
28 import org.apache.commons.cli.Options;
29 import org.apache.commons.logging.Log;
30 import org.apache.commons.logging.LogFactory;
31 import org.apache.cxf.binding.BindingFactoryManager;
32 import org.apache.cxf.jaxrs.JAXRSBindingFactory;
33 import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
34 import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
35 import org.apache.tika.Tika;
36
37 public class TikaServerCli {
38 private static final Log logger = LogFactory.getLog(TikaServerCli.class);
39 public static final int DEFAULT_PORT = 9998;
40 public static final String DEFAULT_HOST = "localhost";
41
42 private static Options getOptions() {
43 Options options = new Options();
44 options.addOption("h", "host", true, "host name (default = " + DEFAULT_HOST + ')');
45 options.addOption("p", "port", true, "listen port (default = " + DEFAULT_PORT + ')');
46 options.addOption("h", "help", false, "this help message");
47
48 return options;
49 }
50
51 public static void main(String[] args) {
52 Properties properties = new Properties();
53 try {
54 properties.load(TikaServerCli.class.getClassLoader().getResourceAsStream("tikaserver-version.properties"));
55 } catch (IOException e) {
56 throw new RuntimeException(e);
57 }
58
59 logger.info("Starting Tikaserver "+properties.getProperty("tikaserver.version"));
60 logger.info("Starting Tika Server " + new Tika().toString());
61
62 try {
63 Options options = getOptions();
64
65 CommandLineParser cliParser = new GnuParser();
66 CommandLine line = cliParser.parse(options, args);
67
68 if (line.hasOption("help")) {
69 HelpFormatter helpFormatter = new HelpFormatter();
70 helpFormatter.printHelp("tikaserver", options);
71 System.exit(-1);
72 }
73
74 String host = DEFAULT_HOST;
75
76 if (line.hasOption("host")) {
77 host = line.getOptionValue("host");
78 }
79
80 int port = DEFAULT_PORT;
81
82 if (line.hasOption("port")) {
83 port = Integer.valueOf(line.getOptionValue("port"));
84 }
85
86
87 JAXRSServerFactoryBean sf = new JAXRSServerFactoryBean();
88 sf.setResourceClasses(MetadataEP.class,MetadataResource.class, TikaResource.class, UnpackerResource.class, TikaVersion.class);
89
90 List<Object> providers = new ArrayList<Object>();
91 providers.add(new TarWriter());
92 providers.add(new ZipWriter());
93 providers.add(new CSVMessageBodyWriter());
94 providers.add(new JSONMessageBodyWriter());
95 providers.add(new TikaExceptionMapper());
96 providers.add(new SingletonResourceProvider(new MetadataResource()));
97 providers.add(new SingletonResourceProvider(new TikaResource()));
98 providers.add(new SingletonResourceProvider(new UnpackerResource()));
99 providers.add(new SingletonResourceProvider(new TikaVersion()));
100 sf.setProviders(providers);
101 sf.setAddress("http://" + host + ":" + port + "/");
102 BindingFactoryManager manager = sf.getBus().getExtension(
103 BindingFactoryManager.class);
104 JAXRSBindingFactory factory = new JAXRSBindingFactory();
105 factory.setBus(sf.getBus());
106 manager.registerBindingFactory(JAXRSBindingFactory.JAXRS_BINDING_ID,
107 factory);
108 sf.create();
109 logger.info("Started");
110 } catch (Exception ex) {
111 logger.fatal("Can't start", ex);
112 System.exit(-1);
113 }
114 }
115 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.server;
17
18 import javax.ws.rs.GET;
19 import javax.ws.rs.Path;
20 import javax.ws.rs.Produces;
21
22 import org.apache.tika.Tika;
23
24 @Path("/version")
25 public class TikaVersion {
26
27 @GET
28 @Produces("text/plain")
29 public String getVersion() {
30 return new Tika().toString();
31 }
32
33 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.server;
18
19 import java.io.ByteArrayInputStream;
20 import java.io.ByteArrayOutputStream;
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.io.OutputStreamWriter;
24 import java.util.HashMap;
25 import java.util.Map;
26
27 import javax.ws.rs.PUT;
28 import javax.ws.rs.Path;
29 import javax.ws.rs.Produces;
30 import javax.ws.rs.WebApplicationException;
31 import javax.ws.rs.core.Context;
32 import javax.ws.rs.core.HttpHeaders;
33 import javax.ws.rs.core.Response;
34 import javax.ws.rs.core.UriInfo;
35
36 import org.apache.commons.lang.mutable.MutableInt;
37 import org.apache.commons.logging.Log;
38 import org.apache.commons.logging.LogFactory;
39 import org.apache.poi.poifs.filesystem.DirectoryEntry;
40 import org.apache.poi.poifs.filesystem.DocumentEntry;
41 import org.apache.poi.poifs.filesystem.DocumentInputStream;
42 import org.apache.poi.poifs.filesystem.Entry;
43 import org.apache.poi.poifs.filesystem.Ole10Native;
44 import org.apache.poi.poifs.filesystem.Ole10NativeException;
45 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
46 import org.apache.poi.util.IOUtils;
47 import org.apache.tika.config.TikaConfig;
48 import org.apache.tika.exception.TikaException;
49 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
50 import org.apache.tika.io.TikaInputStream;
51 import org.apache.tika.metadata.Metadata;
52 import org.apache.tika.metadata.TikaMetadataKeys;
53 import org.apache.tika.mime.MimeTypeException;
54 import org.apache.tika.parser.AutoDetectParser;
55 import org.apache.tika.parser.ParseContext;
56 import org.apache.tika.parser.microsoft.OfficeParser;
57 import org.apache.tika.sax.BodyContentHandler;
58 import org.xml.sax.ContentHandler;
59 import org.xml.sax.SAXException;
60 import org.xml.sax.helpers.DefaultHandler;
61
62 @Path("/")
63 public class UnpackerResource {
64 private static final Log logger = LogFactory.getLog(UnpackerResource.class);
65 public static final String TEXT_FILENAME = "__TEXT__";
66 private static final String META_FILENAME = "__METADATA__";
67
68 private final TikaConfig tikaConfig;
69
70 public UnpackerResource() {
71 tikaConfig = TikaConfig.getDefaultConfig();
72 }
73
74 @Path("unpacker{id:(/.*)?}")
75 @PUT
76 @Produces({"application/zip", "application/x-tar"})
77 public Map<String, byte[]> unpack(
78 InputStream is,
79 @Context HttpHeaders httpHeaders,
80 @Context UriInfo info
81 ) throws Exception {
82 return process(is, httpHeaders, info, false);
83 }
84
85 @Path("all{id:(/.*)?}")
86 @PUT
87 @Produces({"application/zip", "application/x-tar"})
88 public Map<String, byte[]> unpackAll(
89 InputStream is,
90 @Context HttpHeaders httpHeaders,
91 @Context UriInfo info
92 ) throws Exception {
93 return process(is, httpHeaders, info, true);
94 }
95
96 private Map<String, byte[]> process(
97 InputStream is,
98 @Context HttpHeaders httpHeaders,
99 @Context UriInfo info,
100 boolean saveAll
101 ) throws Exception {
102 Metadata metadata = new Metadata();
103
104 AutoDetectParser parser = TikaResource.createParser();
105
106 TikaResource.fillMetadata(parser, metadata, httpHeaders.getRequestHeaders());
107 TikaResource.logRequest(logger, info, metadata);
108
109 ContentHandler ch;
110 ByteArrayOutputStream text = new ByteArrayOutputStream();
111
112 if (saveAll) {
113 ch = new BodyContentHandler(new RichTextContentHandler(new OutputStreamWriter(text, "UTF-8")));
114 } else {
115 ch = new DefaultHandler();
116 }
117
118 ParseContext pc = new ParseContext();
119
120 Map<String, byte[]> files = new HashMap<String, byte[]>();
121 MutableInt count = new MutableInt();
122
123 pc.set(EmbeddedDocumentExtractor.class, new MyEmbeddedDocumentExtractor(count, files));
124
125 try {
126 parser.parse(is, ch, metadata, pc);
127 } catch (TikaException ex) {
128 logger.warn(String.format(
129 "%s: Unpacker failed",
130 info.getPath()
131 ), ex);
132
133 throw ex;
134 }
135
136 if (count.intValue() == 0 && !saveAll) {
137 throw new WebApplicationException(Response.Status.NO_CONTENT);
138 }
139
140 if (saveAll) {
141 files.put(TEXT_FILENAME, text.toByteArray());
142
143 ByteArrayOutputStream metaStream = new ByteArrayOutputStream();
144 MetadataResource.metadataToCsv(metadata, metaStream);
145
146 files.put(META_FILENAME, metaStream.toByteArray());
147 }
148
149 return files;
150 }
151
152 private class MyEmbeddedDocumentExtractor implements EmbeddedDocumentExtractor {
153 private final MutableInt count;
154 private final Map<String, byte[]> zout;
155
156 MyEmbeddedDocumentExtractor(MutableInt count, Map<String, byte[]> zout) {
157 this.count = count;
158 this.zout = zout;
159 }
160
161 public boolean shouldParseEmbedded(Metadata metadata) {
162 return true;
163 }
164
165 public void parseEmbedded(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, boolean b) throws SAXException, IOException {
166 ByteArrayOutputStream bos = new ByteArrayOutputStream();
167 IOUtils.copy(inputStream, bos);
168 byte[] data = bos.toByteArray();
169
170 String name = metadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY);
171 String contentType = metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE);
172
173 if (name == null) {
174 name = Integer.toString(count.intValue());
175 }
176
177 if (!name.contains(".") && contentType!=null) {
178 try {
179 String ext = tikaConfig.getMimeRepository().forName(contentType).getExtension();
180
181 if (ext!=null) {
182 name += ext;
183 }
184 } catch (MimeTypeException e) {
185 logger.warn("Unexpected MimeTypeException", e);
186 }
187 }
188
189 if ("application/vnd.openxmlformats-officedocument.oleObject".equals(contentType)) {
190 POIFSFileSystem poifs = new POIFSFileSystem(new ByteArrayInputStream(data));
191 OfficeParser.POIFSDocumentType type = OfficeParser.POIFSDocumentType.detectType(poifs);
192
193 if (type == OfficeParser.POIFSDocumentType.OLE10_NATIVE) {
194 try {
195 Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(poifs);
196 if (ole.getDataSize()>0) {
197 String label = ole.getLabel();
198
199 if (label.startsWith("ole-")) {
200 label = Integer.toString(count.intValue()) + '-' + label;
201 }
202
203 name = label;
204
205 data = ole.getDataBuffer();
206 }
207 } catch (Ole10NativeException ex) {
208 logger.warn("Skipping invalid part", ex);
209 }
210 } else {
211 name += '.' + type.getExtension();
212 }
213 }
214
215 final String finalName = name;
216
217 if (data.length > 0) {
218 zout.put(finalName, data);
219
220 count.increment();
221 } else {
222 if (inputStream instanceof TikaInputStream) {
223 TikaInputStream tin = (TikaInputStream) inputStream;
224
225 if (tin.getOpenContainer()!=null && tin.getOpenContainer() instanceof DirectoryEntry) {
226 POIFSFileSystem fs = new POIFSFileSystem();
227 copy((DirectoryEntry) tin.getOpenContainer(), fs.getRoot());
228 ByteArrayOutputStream bos2 = new ByteArrayOutputStream();
229 fs.writeFilesystem(bos2);
230 bos2.close();
231
232 zout.put(finalName, bos2.toByteArray());
233 }
234 }
235 }
236 }
237
238 protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir)
239 throws IOException {
240 for (Entry entry : sourceDir) {
241 if (entry instanceof DirectoryEntry) {
242 // Need to recurse
243 DirectoryEntry newDir = destDir.createDirectory(entry.getName());
244 copy((DirectoryEntry) entry, newDir);
245 } else {
246 // Copy entry
247 InputStream contents = new DocumentInputStream((DocumentEntry) entry);
248 try {
249 destDir.createDocument(entry.getName(), contents);
250 } finally {
251 contents.close();
252 }
253 }
254 }
255 }
256 }
257 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.server;
18
19 import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
20 import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream;
21
22 import javax.ws.rs.Produces;
23 import javax.ws.rs.WebApplicationException;
24 import javax.ws.rs.core.MediaType;
25 import javax.ws.rs.core.MultivaluedMap;
26 import javax.ws.rs.ext.MessageBodyWriter;
27 import javax.ws.rs.ext.Provider;
28 import java.io.IOException;
29 import java.io.OutputStream;
30 import java.lang.annotation.Annotation;
31 import java.lang.reflect.Type;
32 import java.util.Map;
33 import java.util.UUID;
34 import java.util.zip.CRC32;
35 import java.util.zip.ZipEntry;
36 import java.util.zip.ZipException;
37 import java.util.zip.ZipOutputStream;
38
39 @Provider
40 @Produces("application/zip")
41 public class ZipWriter implements MessageBodyWriter<Map<String, byte[]>> {
42 private static void zipStoreBuffer(ZipArchiveOutputStream zip, String name, byte[] dataBuffer) throws IOException {
43 ZipEntry zipEntry = new ZipEntry(name!=null?name: UUID.randomUUID().toString());
44 zipEntry.setMethod(ZipOutputStream.STORED);
45
46 zipEntry.setSize(dataBuffer.length);
47 CRC32 crc32 = new CRC32();
48 crc32.update(dataBuffer);
49 zipEntry.setCrc(crc32.getValue());
50
51 try {
52 zip.putArchiveEntry(new ZipArchiveEntry(zipEntry));
53 } catch (ZipException ex) {
54 if (name!=null) {
55 zipStoreBuffer(zip, "x-"+name, dataBuffer);
56 return;
57 }
58 }
59
60 zip.write(dataBuffer);
61
62 zip.closeArchiveEntry();
63 }
64
65 public boolean isWriteable(Class<?> type, Type genericType, Annotation[] annotations, MediaType mediaType) {
66 return Map.class.isAssignableFrom(type);
67 }
68
69 public long getSize(Map<String, byte[]> stringMap, Class<?> type, Type genericType, Annotation[] annotations, MediaType mediaType) {
70 return -1;
71 }
72
73 public void writeTo(Map<String, byte[]> parts, Class<?> type, Type genericType, Annotation[] annotations, MediaType mediaType, MultivaluedMap<String, Object> httpHeaders, OutputStream entityStream) throws IOException, WebApplicationException {
74 ZipArchiveOutputStream zip = new ZipArchiveOutputStream(entityStream);
75
76 zip.setMethod(ZipArchiveOutputStream.STORED);
77
78 for (Map.Entry<String, byte[]> entry : parts.entrySet()) {
79 zipStoreBuffer(zip, entry.getKey(), entry.getValue());
80 }
81
82 zip.close();
83 }
84 }
0 #
1 # Licensed to the Apache Software Foundation (ASF) under one or more
2 # contributor license agreements. See the NOTICE file distributed with
3 # this work for additional information regarding copyright ownership.
4 # The ASF licenses this file to You under the Apache License, Version 2.0
5 # (the "License"); you may not use this file except in compliance with
6 # the License. You may obtain a copy of the License at
7 #
8 # http://www.apache.org/licenses/LICENSE-2.0
9 #
10 # Unless required by applicable law or agreed to in writing, software
11 # distributed under the License is distributed on an "AS IS" BASIS,
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 # See the License for the specific language governing permissions and
14 # limitations under the License.
15 #
16
17 handlers = java.util.logging.ConsoleHandler
18
19 # Set the default logging level for the root logger
20 .level = ALL
21
22 # Set the default logging level for new ConsoleHandler instances
23 java.util.logging.ConsoleHandler.level = ALL
24
25 # Set the default formatter for new ConsoleHandler instances
26 java.util.logging.ConsoleHandler.formatter = java.util.logging.SimpleFormatter
27 #org.apache.commons.logging.Log=org.apache.commons.logging.impl.Jdk14Logger
28
29 org.apache.cxf.level = INFO
0 #
1 # Licensed to the Apache Software Foundation (ASF) under one or more
2 # contributor license agreements. See the NOTICE file distributed with
3 # this work for additional information regarding copyright ownership.
4 # The ASF licenses this file to You under the Apache License, Version 2.0
5 # (the "License"); you may not use this file except in compliance with
6 # the License. You may obtain a copy of the License at
7 #
8 # http://www.apache.org/licenses/LICENSE-2.0
9 #
10 # Unless required by applicable law or agreed to in writing, software
11 # distributed under the License is distributed on an "AS IS" BASIS,
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 # See the License for the specific language governing permissions and
14 # limitations under the License.
15 #
16
17 tikaserver.version=${project.version}
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.server;
18
19 import java.io.ByteArrayOutputStream;
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.util.HashMap;
23 import java.util.Map;
24
25 import org.apache.commons.codec.digest.DigestUtils;
26 import org.apache.commons.compress.archivers.ArchiveEntry;
27 import org.apache.commons.compress.archivers.ArchiveInputStream;
28 import org.apache.commons.compress.utils.IOUtils;
29 import org.apache.cxf.io.CachedOutputStream;
30
31 public class CXFTestBase {
32
33 protected String getStringFromInputStream(InputStream in) throws Exception {
34 CachedOutputStream bos = new CachedOutputStream();
35 IOUtils.copy(in, bos);
36 in.close();
37 bos.close();
38 return bos.getOut().toString();
39 }
40
41 protected Map<String, String> readArchive(ArchiveInputStream zip)
42 throws IOException {
43 Map<String, String> data = new HashMap<String, String>();
44
45 while (true) {
46 ArchiveEntry entry = zip.getNextEntry();
47 if (entry == null) {
48 break;
49 }
50
51 ByteArrayOutputStream bos = new ByteArrayOutputStream();
52 IOUtils.copy(zip, bos);
53 data.put(entry.getName(), DigestUtils.md5Hex(bos.toByteArray()));
54 }
55
56 return data;
57 }
58
59 protected String readArchiveText(ArchiveInputStream zip) throws IOException {
60 while (true) {
61 ArchiveEntry entry = zip.getNextEntry();
62 if (entry == null) {
63 break;
64 }
65
66 if (!entry.getName().equals(UnpackerResource.TEXT_FILENAME)) {
67 continue;
68 }
69
70 ByteArrayOutputStream bos = new ByteArrayOutputStream();
71 IOUtils.copy(zip, bos);
72 return bos.toString("UTF-8");
73 }
74
75 return null;
76 }
77 }
0 package org.apache.tika.server;
1
2 /*
3 * Licensed to the Apache Software Foundation (ASF) under one or more
4 * contributor license agreements. See the NOTICE file distributed with
5 * this work for additional information regarding copyright ownership.
6 * The ASF licenses this file to You under the Apache License, Version 2.0
7 * (the "License"); you may not use this file except in compliance with
8 * the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
19 import static org.junit.Assert.assertEquals;
20 import static org.junit.Assert.assertNotNull;
21
22 import java.io.ByteArrayInputStream;
23 import java.io.ByteArrayOutputStream;
24 import java.io.IOException;
25 import java.io.InputStream;
26 import java.io.InputStreamReader;
27 import java.io.Reader;
28 import java.io.StringWriter;
29 import java.util.ArrayList;
30 import java.util.HashMap;
31 import java.util.List;
32 import java.util.Map;
33
34 import javax.ws.rs.core.MediaType;
35 import javax.ws.rs.core.Response;
36 import javax.ws.rs.core.Response.Status;
37
38 import org.apache.cxf.binding.BindingFactoryManager;
39 import org.apache.cxf.endpoint.Server;
40 import org.apache.cxf.jaxrs.JAXRSBindingFactory;
41 import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
42 import org.apache.cxf.jaxrs.client.WebClient;
43 import org.apache.tika.io.IOUtils;
44 import org.eclipse.jetty.util.ajax.JSON;
45 import org.junit.After;
46 import org.junit.Assert;
47 import org.junit.Before;
48 import org.junit.Test;
49
50 import au.com.bytecode.opencsv.CSVReader;
51
52 public class MetadataEPTest extends CXFTestBase {
53 private static final String META_PATH = "/metadata";
54
55 private static final String endPoint = "http://localhost:" + TikaServerCli.DEFAULT_PORT;
56
57 private Server server;
58
59 private static InputStream copy(InputStream in, int remaining) throws IOException {
60 ByteArrayOutputStream out = new ByteArrayOutputStream();
61 while (remaining > 0) {
62 byte[] bytes = new byte[remaining];
63 int n = in.read(bytes);
64 if (n <= 0) {
65 break;
66 }
67 out.write(bytes, 0, n);
68 remaining -= n;
69 }
70 return new ByteArrayInputStream(out.toByteArray());
71 }
72
73 @Before
74 public void setUp() {
75 JAXRSServerFactoryBean sf = new JAXRSServerFactoryBean();
76 sf.setResourceClasses(MetadataEP.class);
77 List<Object> providers = new ArrayList<Object>();
78 providers.add(new CSVMessageBodyWriter());
79 providers.add(new JSONMessageBodyWriter());
80 sf.setProviders(providers);
81 sf.setAddress(endPoint + "/");
82 BindingFactoryManager manager = sf.getBus().getExtension(BindingFactoryManager.class);
83 JAXRSBindingFactory factory = new JAXRSBindingFactory();
84 factory.setBus(sf.getBus());
85 manager.registerBindingFactory(JAXRSBindingFactory.JAXRS_BINDING_ID, factory);
86 server = sf.create();
87 }
88
89 @After
90 public void tearDown() {
91 server.stop();
92 server.destroy();
93 }
94
95 @Test
96 public void testSimpleWord_CSV() throws Exception {
97 Response response = WebClient.create(endPoint + META_PATH).type("application/msword").accept("text/csv")
98 .post(ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC));
99 Assert.assertEquals(Status.OK.getStatusCode(), response.getStatus());
100
101 Reader reader = new InputStreamReader((InputStream) response.getEntity());
102
103 @SuppressWarnings("resource")
104 CSVReader csvReader = new CSVReader(reader);
105
106 Map<String, String> metadata = new HashMap<String, String>();
107
108 String[] nextLine;
109 while ((nextLine = csvReader.readNext()) != null) {
110 metadata.put(nextLine[0], nextLine[1]);
111 }
112
113 assertNotNull(metadata.get("Author"));
114 assertEquals("Maxim Valyanskiy", metadata.get("Author"));
115 }
116
117 @Test
118 public void testSimpleWord_JSON() throws Exception {
119 Response response = WebClient.create(endPoint + META_PATH).type("application/msword")
120 .accept(MediaType.APPLICATION_JSON).post(ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC));
121
122 Assert.assertEquals(Status.OK.getStatusCode(), response.getStatus());
123
124 Reader reader = new InputStreamReader((InputStream) response.getEntity());
125 Map<?, ?> metadata = (Map<?, ?>) JSON.parse(reader);
126
127 assertNotNull(metadata.get("Author"));
128 assertEquals("Maxim Valyanskiy", metadata.get("Author"));
129 }
130
131 @Test
132 public void testGetField_Author_TEXT() throws Exception {
133 Response response = WebClient.create(endPoint + META_PATH + "/Author").type("application/msword")
134 .accept(MediaType.TEXT_PLAIN).post(ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC));
135 Assert.assertEquals(Status.OK.getStatusCode(), response.getStatus());
136
137 StringWriter w = new StringWriter();
138 IOUtils.copy((InputStream) response.getEntity(), w);
139 assertEquals("Maxim Valyanskiy", w.toString());
140 }
141
142 @Test
143 public void testGetField_Author_JSON() throws Exception {
144 Response response = WebClient.create(endPoint + META_PATH + "/Author").type("application/msword")
145 .accept(MediaType.APPLICATION_JSON).post(ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC));
146 Assert.assertEquals(Status.OK.getStatusCode(), response.getStatus());
147
148 Reader reader = new InputStreamReader((InputStream) response.getEntity());
149 Map<?, ?> metadata = (Map<?, ?>) JSON.parse(reader);
150
151 assertNotNull(metadata.get("Author"));
152 assertEquals("Maxim Valyanskiy", metadata.get("Author"));
153 }
154
155 @Test
156 public void testGetField_XXX_NotFound() throws Exception {
157 Response response = WebClient.create(endPoint + META_PATH + "/xxx").type("application/msword")
158 .accept(MediaType.APPLICATION_JSON).post(ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC));
159 Assert.assertEquals(Status.NOT_FOUND.getStatusCode(), response.getStatus());
160 }
161
162 @Test
163 public void testGetField_Author_TEXT_Partial_BAD_REQUEST() throws Exception {
164
165 InputStream stream = ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC);
166
167 Response response = WebClient.create(endPoint + META_PATH + "/Author").type("application/msword")
168 .accept(MediaType.TEXT_PLAIN).post(copy(stream, 8000));
169 Assert.assertEquals(Status.BAD_REQUEST.getStatusCode(), response.getStatus());
170 }
171
172 @Test
173 public void testGetField_Author_TEXT_Partial_Found() throws Exception {
174
175 InputStream stream = ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC);
176
177 Response response = WebClient.create(endPoint + META_PATH + "/Author").type("application/msword")
178 .accept(MediaType.TEXT_PLAIN).post(copy(stream, 12000));
179 Assert.assertEquals(Status.OK.getStatusCode(), response.getStatus());
180
181 StringWriter w = new StringWriter();
182 IOUtils.copy((InputStream) response.getEntity(), w);
183 assertEquals("Maxim Valyanskiy", w.toString());
184 }
185
186 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.server;
18
19 import static org.junit.Assert.assertEquals;
20 import static org.junit.Assert.assertNotNull;
21
22 import java.io.InputStream;
23 import java.io.InputStreamReader;
24 import java.io.Reader;
25 import java.util.HashMap;
26 import java.util.Map;
27
28 import javax.ws.rs.core.Response;
29
30 import org.apache.cxf.binding.BindingFactoryManager;
31 import org.apache.cxf.endpoint.Server;
32 import org.apache.cxf.jaxrs.JAXRSBindingFactory;
33 import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
34 import org.apache.cxf.jaxrs.client.WebClient;
35 import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
36 import org.junit.After;
37 import org.junit.Before;
38 import org.junit.Test;
39
40 import au.com.bytecode.opencsv.CSVReader;
41
42 public class MetadataResourceTest extends CXFTestBase {
43 private static final String META_PATH = "/meta";
44
45 private static final String endPoint = "http://localhost:"
46 + TikaServerCli.DEFAULT_PORT;
47
48 private Server server;
49
50 @Before
51 public void setUp() {
52 JAXRSServerFactoryBean sf = new JAXRSServerFactoryBean();
53 sf.setResourceClasses(MetadataResource.class);
54 sf.setResourceProvider(MetadataResource.class,
55 new SingletonResourceProvider(new MetadataResource()));
56 sf.setAddress(endPoint + "/");
57 BindingFactoryManager manager = sf.getBus().getExtension(
58 BindingFactoryManager.class);
59 JAXRSBindingFactory factory = new JAXRSBindingFactory();
60 factory.setBus(sf.getBus());
61 manager.registerBindingFactory(JAXRSBindingFactory.JAXRS_BINDING_ID,
62 factory);
63 server = sf.create();
64 }
65
66 @After
67 public void tearDown() {
68 server.stop();
69 server.destroy();
70 }
71
72 @Test
73 public void testSimpleWord() throws Exception {
74 Response response = WebClient
75 .create(endPoint + META_PATH)
76 .type("application/msword")
77 .accept("text/csv")
78 .put(ClassLoader
79 .getSystemResourceAsStream(TikaResourceTest.TEST_DOC));
80
81 Reader reader = new InputStreamReader(
82 (InputStream) response.getEntity());
83
84 CSVReader csvReader = new CSVReader(reader);
85
86 Map<String, String> metadata = new HashMap<String, String>();
87
88 String[] nextLine;
89 while ((nextLine = csvReader.readNext()) != null) {
90 metadata.put(nextLine[0], nextLine[1]);
91 }
92
93 assertNotNull(metadata.get("Author"));
94 assertEquals("Maxim Valyanskiy", metadata.get("Author"));
95 }
96
97 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.server;
18
19 import static org.junit.Assert.assertEquals;
20 import static org.junit.Assert.assertTrue;
21
22 import java.io.InputStream;
23
24 import javax.ws.rs.core.Response;
25
26 import org.apache.cxf.binding.BindingFactoryManager;
27 import org.apache.cxf.endpoint.Server;
28 import org.apache.cxf.jaxrs.JAXRSBindingFactory;
29 import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
30 import org.apache.cxf.jaxrs.client.WebClient;
31 import org.apache.cxf.jaxrs.ext.multipart.Attachment;
32 import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
33 import org.junit.After;
34 import org.junit.Before;
35 import org.junit.Test;
36
37 public class TikaResourceTest extends CXFTestBase {
38 private static final String TIKA_PATH = "/tika";
39 public static final String TEST_DOC = "test.doc";
40 public static final String TEST_XLSX = "16637.xlsx";
41 private static final int UNPROCESSEABLE = 422;
42 private static final String endPoint = "http://localhost:"
43 + TikaServerCli.DEFAULT_PORT;
44
45 private Server server;
46
47 @Before
48 public void setUp() {
49 JAXRSServerFactoryBean sf = new JAXRSServerFactoryBean();
50 sf.setResourceClasses(TikaResource.class);
51 sf.setResourceProvider(TikaResource.class,
52 new SingletonResourceProvider(new TikaResource()));
53 sf.setAddress(endPoint + "/");
54 BindingFactoryManager manager = sf.getBus().getExtension(
55 BindingFactoryManager.class);
56 JAXRSBindingFactory factory = new JAXRSBindingFactory();
57 factory.setBus(sf.getBus());
58 manager.registerBindingFactory(JAXRSBindingFactory.JAXRS_BINDING_ID,
59 factory);
60 server = sf.create();
61 }
62
63 @After
64 public void tearDown() throws Exception {
65 server.stop();
66 server.destroy();
67 }
68
69 @Test
70 public void testHelloWorld() throws Exception {
71 Response response = WebClient.create(endPoint + TIKA_PATH)
72 .type("text/plain").accept("text/plain").get();
73 assertEquals(TikaResource.GREETING,
74 getStringFromInputStream((InputStream) response.getEntity()));
75 }
76
77 @Test
78 public void testSimpleWord() throws Exception {
79 Response response = WebClient.create(endPoint + TIKA_PATH)
80 .type("application/msword")
81 .accept("text/plain")
82 .put(ClassLoader.getSystemResourceAsStream(TEST_DOC));
83 String responseMsg = getStringFromInputStream((InputStream) response
84 .getEntity());
85 assertTrue(responseMsg.contains("test"));
86 }
87
88 @Test
89 public void testApplicationWadl() throws Exception {
90 Response response = WebClient
91 .create(endPoint + TIKA_PATH + "?_wadl")
92 .accept("text/plain").get();
93 String resp = getStringFromInputStream((InputStream) response
94 .getEntity());
95 assertTrue(resp.startsWith("<application"));
96 }
97
98 @Test
99 public void testPasswordXLS() throws Exception {
100 Response response = WebClient.create(endPoint + TIKA_PATH)
101 .type("application/vnd.ms-excel")
102 .accept("text/plain")
103 .put(ClassLoader.getSystemResourceAsStream("password.xls"));
104
105 assertEquals(UNPROCESSEABLE, response.getStatus());
106 }
107
108 @Test
109 public void testSimpleWordHTML() throws Exception {
110 Response response = WebClient.create(endPoint + TIKA_PATH)
111 .type("application/msword")
112 .accept("text/html")
113 .put(ClassLoader.getSystemResourceAsStream(TEST_DOC));
114 String responseMsg = getStringFromInputStream((InputStream) response
115 .getEntity());
116 assertTrue(responseMsg.contains("test"));
117 }
118
119 @Test
120 public void testPasswordXLSHTML() throws Exception {
121 Response response = WebClient.create(endPoint + TIKA_PATH)
122 .type("application/vnd.ms-excel")
123 .accept("text/html")
124 .put(ClassLoader.getSystemResourceAsStream("password.xls"));
125
126 assertEquals(UNPROCESSEABLE, response.getStatus());
127 }
128
129 @Test
130 public void testSimpleWordXML() throws Exception {
131 Response response = WebClient.create(endPoint + TIKA_PATH)
132 .type("application/msword")
133 .accept("text/xml")
134 .put(ClassLoader.getSystemResourceAsStream(TEST_DOC));
135 String responseMsg = getStringFromInputStream((InputStream) response
136 .getEntity());
137 assertTrue(responseMsg.contains("test"));
138 }
139
140 @Test
141 public void testPasswordXLSXML() throws Exception {
142 Response response = WebClient.create(endPoint + TIKA_PATH)
143 .type("application/vnd.ms-excel")
144 .accept("text/xml")
145 .put(ClassLoader.getSystemResourceAsStream("password.xls"));
146
147 assertEquals(UNPROCESSEABLE, response.getStatus());
148 }
149
150 @Test
151 public void testSimpleWordMultipartXML() throws Exception {
152 ClassLoader.getSystemResourceAsStream(TEST_DOC);
153 Attachment attachmentPart =
154 new Attachment("myworddoc", "application/msword", ClassLoader.getSystemResourceAsStream(TEST_DOC));
155 WebClient webClient = WebClient.create(endPoint + TIKA_PATH + "/form");
156 Response response = webClient.type("multipart/form-data")
157 .accept("text/xml")
158 .put(attachmentPart);
159 String responseMsg = getStringFromInputStream((InputStream) response
160 .getEntity());
161 assertTrue(responseMsg.contains("test"));
162 }
163
164 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.server;
18
19 import static org.junit.Assert.assertEquals;
20
21 import java.io.InputStream;
22
23 import javax.ws.rs.core.Response;
24
25 import org.apache.cxf.binding.BindingFactoryManager;
26 import org.apache.cxf.endpoint.Server;
27 import org.apache.cxf.jaxrs.JAXRSBindingFactory;
28 import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
29 import org.apache.cxf.jaxrs.client.WebClient;
30 import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
31 import org.apache.tika.Tika;
32 import org.junit.After;
33 import org.junit.Before;
34 import org.junit.Test;
35
36 public class TikaVersionTest extends CXFTestBase {
37
38 private static final String VERSION_PATH = "/version";
39 private static final String endPoint = "http://localhost:"
40 + TikaServerCli.DEFAULT_PORT;
41 private Server server;
42
43 @Before
44 public void setUp() {
45 JAXRSServerFactoryBean sf = new JAXRSServerFactoryBean();
46 sf.setResourceClasses(TikaVersion.class);
47 sf.setResourceProvider(
48 TikaVersion.class,
49 new SingletonResourceProvider(new TikaVersion())
50 );
51 sf.setAddress(endPoint + "/");
52
53 BindingFactoryManager manager = sf.getBus().getExtension(
54 BindingFactoryManager.class
55 );
56
57 JAXRSBindingFactory factory = new JAXRSBindingFactory();
58 factory.setBus(sf.getBus());
59
60 manager.registerBindingFactory(
61 JAXRSBindingFactory.JAXRS_BINDING_ID,
62 factory
63 );
64
65 server = sf.create();
66 }
67
68 @After
69 public void tearDown() throws Exception {
70 server.stop();
71 server.destroy();
72 }
73
74 @Test
75 public void testGetVersion() throws Exception {
76 Response response = WebClient
77 .create(endPoint + VERSION_PATH)
78 .type("text/plain")
79 .accept("text/plain")
80 .get();
81
82 assertEquals(new Tika().toString(),
83 getStringFromInputStream((InputStream) response.getEntity()));
84 }
85
86 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.server;
18
19 import static org.junit.Assert.assertEquals;
20 import static org.junit.Assert.assertFalse;
21 import static org.junit.Assert.assertNotNull;
22 import static org.junit.Assert.assertTrue;
23
24 import org.apache.commons.compress.archivers.ArchiveInputStream;
25 import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
26 import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
27 import org.apache.cxf.binding.BindingFactoryManager;
28 import org.apache.cxf.endpoint.Server;
29 import org.apache.cxf.jaxrs.JAXRSBindingFactory;
30 import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
31 import org.apache.cxf.jaxrs.client.WebClient;
32 import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
33 import org.junit.After;
34 import org.junit.Before;
35 import org.junit.Test;
36
37 import java.io.InputStream;
38 import java.util.ArrayList;
39 import java.util.List;
40 import java.util.Map;
41
42 import javax.ws.rs.core.Response;
43
44 public class UnpackerResourceTest extends CXFTestBase {
45 private static final String UNPACKER_PATH = "/unpacker";
46 private static final String ALL_PATH = "/all";
47
48 private static final String endPoint = "http://localhost:"
49 + TikaServerCli.DEFAULT_PORT;
50
51 private static final String TEST_DOC_WAV = "Doc1_ole.doc";
52 private static final String WAV1_MD5 = "bdd0a78a54968e362445364f95d8dc96";
53 private static final String WAV1_NAME = "_1310388059/MSj00974840000[1].wav";
54 private static final String WAV2_MD5 = "3bbd42fb1ac0e46a95350285f16d9596";
55 private static final String WAV2_NAME = "_1310388058/MSj00748450000[1].wav";
56 private static final String JPG_NAME = "image1.jpg";
57 private static final String XSL_IMAGE1_MD5 = "68ead8f4995a3555f48a2f738b2b0c3d";
58 private static final String JPG_MD5 = XSL_IMAGE1_MD5;
59 private static final String JPG2_NAME = "image2.jpg";
60 private static final String JPG2_MD5 = "b27a41d12c646d7fc4f3826cf8183c68";
61 private static final String TEST_DOCX_IMAGE = "2pic.docx";
62 private static final String DOCX_IMAGE1_MD5 = "5516590467b069fa59397432677bad4d";
63 private static final String DOCX_IMAGE2_MD5 = "a5dd81567427070ce0a2ff3e3ef13a4c";
64 private static final String DOCX_IMAGE1_NAME = "image1.jpeg";
65 private static final String DOCX_IMAGE2_NAME = "image2.jpeg";
66 private static final String DOCX_EXE1_MD5 = "d71ffa0623014df725f8fd2710de4411";
67 private static final String DOCX_EXE1_NAME = "GMapTool.exe";
68 private static final String DOCX_EXE2_MD5 = "2485435c7c22d35f2de9b4c98c0c2e1a";
69 private static final String DOCX_EXE2_NAME = "Setup.exe";
70 private static final String XSL_IMAGE2_MD5 = "8969288f4245120e7c3870287cce0ff3";
71 private static final String APPLICATION_MSWORD = "application/msword";
72 private static final String APPLICATION_XML = "application/xml";
73 private static final String CONTENT_TYPE = "Content-type";
74
75 private Server server;
76
77 @Before
78 public void setUp() {
79 JAXRSServerFactoryBean sf = new JAXRSServerFactoryBean();
80 List<Object> providers = new ArrayList<Object>();
81 providers.add(new TarWriter());
82 providers.add(new ZipWriter());
83 providers.add(new TikaExceptionMapper());
84 sf.setProviders(providers);
85 sf.setResourceClasses(UnpackerResource.class);
86 sf.setResourceProvider(UnpackerResource.class,
87 new SingletonResourceProvider(new UnpackerResource()));
88 sf.setAddress(endPoint + "/");
89 BindingFactoryManager manager = sf.getBus().getExtension(
90 BindingFactoryManager.class);
91 JAXRSBindingFactory factory = new JAXRSBindingFactory();
92 factory.setBus(sf.getBus());
93 manager.registerBindingFactory(JAXRSBindingFactory.JAXRS_BINDING_ID,
94 factory);
95 server = sf.create();
96 }
97
98 @After
99 public void tearDown() {
100 server.stop();
101 server.destroy();
102 }
103
104 @Test
105 public void testDocWAV() throws Exception {
106 Response response = WebClient.create(endPoint + UNPACKER_PATH)
107 .type(APPLICATION_MSWORD).accept("application/zip")
108 .put(ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV));
109
110 ArchiveInputStream zip = new ZipArchiveInputStream(
111 (InputStream) response.getEntity());
112
113 Map<String, String> data = readArchive(zip);
114 assertEquals(WAV1_MD5, data.get(WAV1_NAME));
115 assertEquals(WAV2_MD5, data.get(WAV2_NAME));
116 assertFalse(data.containsKey(UnpackerResource.TEXT_FILENAME));
117 }
118
119 @Test
120 public void testDocWAVText() throws Exception {
121 Response response = WebClient.create(endPoint + ALL_PATH)
122 .type(APPLICATION_MSWORD).accept("application/zip")
123 .put(ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV));
124
125 ArchiveInputStream zip = new ZipArchiveInputStream(
126 (InputStream) response.getEntity());
127
128 Map<String, String> data = readArchive(zip);
129 assertEquals(WAV1_MD5, data.get(WAV1_NAME));
130 assertEquals(WAV2_MD5, data.get(WAV2_NAME));
131 assertTrue(data.containsKey(UnpackerResource.TEXT_FILENAME));
132 }
133
134 @Test
135 public void testDocPicture() throws Exception {
136 Response response = WebClient.create(endPoint + UNPACKER_PATH)
137 .type(APPLICATION_MSWORD).accept("application/zip")
138 .put(ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV));
139
140 ZipArchiveInputStream zip = new ZipArchiveInputStream(
141 (InputStream) response.getEntity());
142 Map<String, String> data = readArchive(zip);
143
144 assertEquals(JPG_MD5, data.get(JPG_NAME));
145 }
146
147 @Test
148 public void testDocPictureNoOle() throws Exception {
149 Response response = WebClient.create(endPoint + UNPACKER_PATH)
150 .type(APPLICATION_MSWORD).accept("application/zip")
151 .put(ClassLoader.getSystemResourceAsStream("2pic.doc"));
152
153 ZipArchiveInputStream zip = new ZipArchiveInputStream(
154 (InputStream) response.getEntity());
155
156 Map<String, String> data = readArchive(zip);
157 assertEquals(JPG2_MD5, data.get(JPG2_NAME));
158 }
159
160 @Test
161 public void testImageDOCX() throws Exception {
162 Response response = WebClient.create(endPoint + UNPACKER_PATH)
163 .accept("application/zip").put(
164 ClassLoader.getSystemResourceAsStream(TEST_DOCX_IMAGE));
165
166 ZipArchiveInputStream zip = new ZipArchiveInputStream(
167 (InputStream) response.getEntity());
168
169 Map<String, String> data = readArchive(zip);
170 assertEquals(DOCX_IMAGE1_MD5, data.get(DOCX_IMAGE1_NAME));
171 assertEquals(DOCX_IMAGE2_MD5, data.get(DOCX_IMAGE2_NAME));
172 }
173
174 @Test
175 public void test415() throws Exception {
176 Response response = WebClient.create(endPoint + UNPACKER_PATH)
177 .type("xxx/xxx")
178 .accept("*/*")
179 .put(ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV));
180
181 assertEquals(415, response.getStatus());
182 }
183
184 @Test
185 public void testExeDOCX() throws Exception {
186 String TEST_DOCX_EXE = "2exe.docx";
187 Response response = WebClient.create(endPoint + UNPACKER_PATH)
188 .accept("application/zip")
189 .put(ClassLoader.getSystemResourceAsStream(TEST_DOCX_EXE));
190
191 ZipArchiveInputStream zip = new ZipArchiveInputStream(
192 (InputStream) response.getEntity());
193
194 Map<String, String> data = readArchive(zip);
195
196 assertEquals(DOCX_EXE1_MD5, data.get(DOCX_EXE1_NAME));
197 assertEquals(DOCX_EXE2_MD5, data.get(DOCX_EXE2_NAME));
198 }
199
200 @Test
201 public void testImageXSL() throws Exception {
202 Response response = WebClient.create(endPoint + UNPACKER_PATH)
203 .accept("application/zip")
204 .put(ClassLoader.getSystemResourceAsStream("pic.xls"));
205
206 ZipArchiveInputStream zip = new ZipArchiveInputStream(
207 (InputStream) response.getEntity());
208
209 Map<String, String> data = readArchive(zip);
210 assertEquals(XSL_IMAGE1_MD5, data.get("0.jpg"));
211 assertEquals(XSL_IMAGE2_MD5, data.get("1.jpg"));
212 }
213
214 @Test
215 public void testTarDocPicture() throws Exception {
216 Response response = WebClient.create(endPoint + UNPACKER_PATH)
217 .type(APPLICATION_MSWORD).accept("application/x-tar")
218 .put(ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV));
219
220 Map<String, String> data = readArchive(new TarArchiveInputStream(
221 (InputStream) response.getEntity()));
222
223 assertEquals(JPG_MD5, data.get(JPG_NAME));
224 }
225
226 @Test
227 public void testText() throws Exception {
228 Response response = WebClient.create(endPoint + ALL_PATH)
229 .header(CONTENT_TYPE, APPLICATION_XML)
230 .accept("application/zip")
231 .put(ClassLoader.getSystemResourceAsStream("test.doc"));
232
233 String responseMsg = readArchiveText(new ZipArchiveInputStream(
234 (InputStream) response.getEntity()));
235 assertNotNull(responseMsg);
236 assertTrue(responseMsg.contains("test"));
237 }
238
239 }
0 <?xml version="1.0" encoding="UTF-8"?>
1
2 <!--
3 Licensed to the Apache Software Foundation (ASF) under one
4 or more contributor license agreements. See the NOTICE file
5 distributed with this work for additional information
6 regarding copyright ownership. The ASF licenses this file
7 to you under the Apache License, Version 2.0 (the
8 "License"); you may not use this file except in compliance
9 with the License. You may obtain a copy of the License at
10
11 http://www.apache.org/licenses/LICENSE-2.0
12
13 Unless required by applicable law or agreed to in writing,
14 software distributed under the License is distributed on an
15 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 KIND, either express or implied. See the License for the
17 specific language governing permissions and limitations
18 under the License.
19 -->
20
21 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
22 <modelVersion>4.0.0</modelVersion>
23
24 <parent>
25 <groupId>org.apache.tika</groupId>
26 <artifactId>tika-parent</artifactId>
27 <version>1.5</version>
28 <relativePath>../tika-parent/pom.xml</relativePath>
29 </parent>
30
31 <artifactId>tika-xmp</artifactId>
32 <packaging>bundle</packaging>
33
34 <name>Apache Tika XMP</name>
35 <description>Converts Tika metadata to XMP</description>
36
37 <build>
38 <plugins>
39 <plugin>
40 <groupId>org.apache.felix</groupId>
41 <artifactId>maven-scr-plugin</artifactId>
42 <version>1.7.4</version>
43 </plugin>
44 <plugin>
45 <!-- builds the bundle -->
46 <groupId>org.apache.felix</groupId>
47 <artifactId>maven-bundle-plugin</artifactId>
48 <extensions>true</extensions>
49 <configuration>
50 <instructions>
51 <Export-Package>
52 org.apache.tika.xmp,
53 org.apache.tika.xmp.convert
54 </Export-Package>
55 <Private-Package />
56 </instructions>
57 </configuration>
58 </plugin>
59 </plugins>
60 </build>
61
62 <dependencies>
63 <dependency>
64 <groupId>${project.groupId}</groupId>
65 <artifactId>tika-core</artifactId>
66 <version>${project.version}</version>
67 </dependency>
68 <dependency>
69 <groupId>${project.groupId}</groupId>
70 <artifactId>tika-parsers</artifactId>
71 <version>${project.version}</version>
72 </dependency>
73 <dependency>
74 <groupId>com.adobe.xmp</groupId>
75 <artifactId>xmpcore</artifactId>
76 <version>5.1.2</version>
77 </dependency>
78 <dependency>
79 <groupId>junit</groupId>
80 <artifactId>junit</artifactId>
81 <scope>test</scope>
82 <version>4.11</version>
83 </dependency>
84 </dependencies>
85
86 <url>http://tika.apache.org/</url>
87 <organization>
88 <name>The Apache Software Foundation</name>
89 <url>http://www.apache.org</url>
90 </organization>
91 <scm>
92 <url>http://svn.apache.org/viewvc/tika/tags/1.5/tika-xmp</url>
93 <connection>scm:svn:http://svn.apache.org/repos/asf/tika/tags/1.5/tika-xmp</connection>
94 <developerConnection>scm:svn:https://svn.apache.org/repos/asf/tika/tags/1.5/tika-xmp</developerConnection>
95 </scm>
96 <issueManagement>
97 <system>JIRA</system>
98 <url>https://issues.apache.org/jira/browse/TIKA</url>
99 </issueManagement>
100 <ciManagement>
101 <system>Jenkins</system>
102 <url>https://builds.apache.org/job/Tika-trunk/</url>
103 </ciManagement>
104 </project>
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.xmp;
17
18 import java.io.IOException;
19 import java.io.NotSerializableException;
20 import java.io.ObjectInputStream;
21 import java.io.ObjectOutputStream;
22 import java.util.Calendar;
23 import java.util.Date;
24 import java.util.Enumeration;
25 import java.util.Map;
26 import java.util.Properties;
27
28 import org.apache.tika.exception.TikaException;
29 import org.apache.tika.metadata.Metadata;
30 import org.apache.tika.metadata.Property;
31 import org.apache.tika.metadata.Property.PropertyType;
32 import org.apache.tika.metadata.PropertyTypeException;
33 import org.apache.tika.xmp.convert.TikaToXMP;
34
35 import com.adobe.xmp.XMPDateTime;
36 import com.adobe.xmp.XMPException;
37 import com.adobe.xmp.XMPIterator;
38 import com.adobe.xmp.XMPMeta;
39 import com.adobe.xmp.XMPMetaFactory;
40 import com.adobe.xmp.XMPSchemaRegistry;
41 import com.adobe.xmp.XMPUtils;
42 import com.adobe.xmp.options.IteratorOptions;
43 import com.adobe.xmp.options.PropertyOptions;
44 import com.adobe.xmp.options.SerializeOptions;
45 import com.adobe.xmp.properties.XMPProperty;
46
47 /**
48 * Provides a conversion of the Metadata map from Tika to the XMP data model by also providing the
49 * Metadata API for clients to ease transition. But clients can also work directly on the XMP data
50 * model, by getting the XMPMeta reference from this class. Usually the instance would be
51 * initialized by providing the Metadata object that had been returned from Tika-core which
52 * populates the XMP data model with all properties that can be converted.
53 *
54 * This class is not serializable!
55 */
56 @SuppressWarnings("serial")
57 public class XMPMetadata extends Metadata {
58 /** The XMP data */
59 private XMPMeta xmpData;
60 /** Use the XMP namespace registry implementation */
61 private static final XMPSchemaRegistry registry = XMPMetaFactory.getSchemaRegistry();
62
/**
 * Creates an instance backed by a fresh, empty XMP packet obtained from
 * {@link XMPMetaFactory#create()}.
 */
public XMPMetadata() {
    this.xmpData = XMPMetaFactory.create();
}
69
/**
 * Same as {@link #XMPMetadata(Metadata, String)}, except that no mimetype is
 * passed explicitly; it is retrieved from the metadata map instead.
 *
 * @param meta
 *            the Metadata information from Tika-core
 * @throws TikaException
 *             if {@link TikaToXMP#convert(Metadata)} fails
 */
public XMPMetadata(Metadata meta) throws TikaException {
    XMPMeta converted = TikaToXMP.convert(meta);
    this.xmpData = converted;
}
77
/**
 * Initializes the data by converting the Metadata information to XMP. If a
 * mimetype is provided, a specific converter can be used that converts all
 * available metadata. If no mimetype is provided, or no specific converter is
 * available, a generic conversion is done which converts only those properties
 * that are in known namespaces and use the correct prefixes.
 *
 * @param meta
 *            the Metadata information from Tika-core
 * @param mimetype
 *            mimetype information
 * @throws TikaException
 *             in case an error occurred during conversion
 */
public XMPMetadata(Metadata meta, String mimetype) throws TikaException {
    XMPMeta converted = TikaToXMP.convert(meta, mimetype);
    this.xmpData = converted;
}
95
96 /**
97 * @see org.apache.tika.xmp.XMPMetadata#process(org.apache.tika.metadata.Metadata,
98 * java.lang.String)
99 * But the mimetype is retrieved from the metadata map.
100 */
101 public void process(Metadata meta) throws TikaException {
102 this.xmpData = TikaToXMP.convert( meta );
103 }
104
105 /**
106 * Converts the Metadata information to XMP. If a mimetype is provided, a specific converter can
107 * be used, that converts all available metadata. If there is no mimetype provided or no
108 * specific converter available a generic conversion is done which will convert only those
109 * properties that are in known namespaces and are using the correct prefixes
110 *
111 * @param meta
112 * the Metadata information from Tika-core
113 * @param mimetype
114 * mimetype information
115 * @throws In
116 * case an error occured during conversion
117 */
118 public void process(Metadata meta, String mimetype) throws TikaException {
119 this.xmpData = TikaToXMP.convert( meta, mimetype );
120 }
121
122 /**
123 * Provides direct access to the XMP data model, in case a client prefers to work directly on it
124 * instead of using the Metadata API
125 *
126 * @return the "internal" XMP data object
127 */
128 public XMPMeta getXMPData() {
129 return xmpData;
130 }
131
132 // === Namespace Registry API === //
133 /**
134 * Register a namespace URI with a suggested prefix. It is not an error if the URI is already
135 * registered, no matter what the prefix is. If the URI is not registered but the suggested
136 * prefix is in use, a unique prefix is created from the suggested one. The actual registeed
137 * prefix is always returned. The function result tells if the registered prefix is the
138 * suggested one.
139 * Note: No checking is presently done on either the URI or the prefix.
140 *
141 * @param namespaceURI
142 * The URI for the namespace. Must be a valid XML URI.
143 * @param suggestedPrefix
144 * The suggested prefix to be used if the URI is not yet registered. Must be a valid
145 * XML name.
146 * @return Returns the registered prefix for this URI, is equal to the suggestedPrefix if the
147 * namespace hasn't been registered before, otherwise the existing prefix.
148 * @throws XMPException
149 * If the parameters are not accordingly set
150 */
151 public static String registerNamespace(String namespaceURI, String suggestedPrefix)
152 throws XMPException {
153 return registry.registerNamespace( namespaceURI, suggestedPrefix );
154 }
155
156 /**
157 * Obtain the prefix for a registered namespace URI.
158 * It is not an error if the namespace URI is not registered.
159 *
160 * @param namespaceURI
161 * The URI for the namespace. Must not be null or the empty string.
162 * @return Returns the prefix registered for this namespace URI or null.
163 */
164 public static String getNamespacePrefix(String namespaceURI) {
165 return registry.getNamespacePrefix( namespaceURI );
166 }
167
168 /**
169 * Obtain the URI for a registered namespace prefix.
170 * It is not an error if the namespace prefix is not registered.
171 *
172 * @param namespacePrefix
173 * The prefix for the namespace. Must not be null or the empty string.
174 * @return Returns the URI registered for this prefix or null.
175 */
176 public static String getNamespaceURI(String namespacePrefix) {
177 return registry.getNamespaceURI( namespacePrefix );
178 }
179
180 /**
181 * @return Returns the registered prefix/namespace-pairs as map, where the keys are the
182 * namespaces and the values are the prefixes.
183 */
184 @SuppressWarnings("unchecked")
185 public static Map<String, String> getNamespaces() {
186 return registry.getNamespaces();
187 }
188
189 /**
190 * @return Returns the registered namespace/prefix-pairs as map, where the keys are the prefixes
191 * and the values are the namespaces.
192 */
193 @SuppressWarnings("unchecked")
194 public static Map<String, String> getPrefixes() {
195 return registry.getPrefixes();
196 }
197
198 /**
199 * Deletes a namespace from the registry.
200 * <p>
201 * Does nothing if the URI is not registered, or if the namespaceURI parameter is null or the
202 * empty string.
203 * <p>
204 * Note: Not yet implemented.
205 *
206 * @param namespaceURI
207 * The URI for the namespace.
208 */
209 public static void deleteNamespace(String namespaceURI) {
210 registry.deleteNamespace( namespaceURI );
211 }
212
213 // === Metadata API === //
214 /**
215 * @see org.apache.tika.xmp.XMPMetadata#isMultiValued(java.lang.String)
216 */
217 @Override
218 public boolean isMultiValued(Property property) {
219 return this.isMultiValued( property.getName() );
220 }
221
222 /**
223 * Checks if the named property is an array.
224 *
225 * @see org.apache.tika.metadata.Metadata#isMultiValued(java.lang.String)
226 */
227 @Override
228 public boolean isMultiValued(String name) {
229 checkKey( name );
230
231 String[] keyParts = splitKey( name );
232
233 String ns = registry.getNamespaceURI( keyParts[0] );
234 if (ns != null) {
235 try {
236 XMPProperty prop = xmpData.getProperty( ns, keyParts[1] );
237
238 return prop.getOptions().isArray();
239 }
240 catch (XMPException e) {
241 // Ignore
242 }
243 }
244
245 return false;
246 }
247
248 /**
249 * For XMP it is not clear what that API should return, therefor not implemented
250 */
251 @Override
252 public String[] names() {
253 throw new UnsupportedOperationException( "Not implemented" );
254 }
255
256 /**
257 * Returns the value of a simple property or the first one of an array. The given name must
258 * contain a namespace prefix of a registered namespace.
259 *
260 * @see org.apache.tika.metadata.Metadata#get(java.lang.String)
261 */
262 @Override
263 public String get(String name) {
264 checkKey( name );
265
266 String value = null;
267 String[] keyParts = splitKey( name );
268
269 String ns = registry.getNamespaceURI( keyParts[0] );
270 if (ns != null) {
271 try {
272 XMPProperty prop = xmpData.getProperty( ns, keyParts[1] );
273
274 if (prop != null && prop.getOptions().isSimple()) {
275 value = prop.getValue();
276 }
277 else if (prop != null && prop.getOptions().isArray()) {
278 prop = xmpData.getArrayItem( ns, keyParts[1], 1 );
279 value = prop.getValue();
280 }
281 // in all other cases, null is returned
282 }
283 catch (XMPException e) {
284 // Ignore
285 }
286 }
287
288 return value;
289 }
290
291 /**
292 * @see org.apache.tika.xmp.XMPMetadata#get(java.lang.String)
293 */
294 @Override
295 public String get(Property property) {
296 return this.get( property.getName() );
297 }
298
299 /**
300 * @see org.apache.tika.xmp.XMPMetadata#get(java.lang.String)
301 */
302 @Override
303 public Integer getInt(Property property) {
304 Integer result = null;
305
306 try {
307 result = new Integer( XMPUtils.convertToInteger( this.get( property.getName() ) ) );
308 }
309 catch (XMPException e) {
310 // Ignore
311 }
312
313 return result;
314 }
315
316 /**
317 * @see org.apache.tika.xmp.XMPMetadata#get(java.lang.String)
318 */
319 @Override
320 public Date getDate(Property property) {
321 Date result = null;
322
323 try {
324 XMPDateTime xmpDate = XMPUtils.convertToDate( this.get( property.getName() ) );
325 if (xmpDate != null) {
326 Calendar cal = xmpDate.getCalendar();
327 // TODO Timezone is currently lost
328 // need another solution that preserves the timezone
329 result = cal.getTime();
330 }
331 }
332 catch (XMPException e) {
333 // Ignore
334 }
335
336 return result;
337 }
338
339 /**
340 * @see org.apache.tika.xmp.XMPMetadata#getValues(java.lang.String)
341 */
342 @Override
343 public String[] getValues(Property property) {
344 return this.getValues( property.getName() );
345 }
346
347 /**
348 * Returns the value of a simple property or all if the property is an array and the elements
349 * are of simple type. The given name must contain a namespace prefix of a registered namespace.
350 *
351 * @see org.apache.tika.metadata.Metadata#getValues(java.lang.String)
352 */
353 @Override
354 public String[] getValues(String name) {
355 checkKey( name );
356
357 String[] value = null;
358 String[] keyParts = splitKey( name );
359
360 String ns = registry.getNamespaceURI( keyParts[0] );
361 if (ns != null) {
362 try {
363 XMPProperty prop = xmpData.getProperty( ns, keyParts[1] );
364
365 if (prop != null && prop.getOptions().isSimple()) {
366 value = new String[1];
367 value[0] = prop.getValue();
368 }
369 else if (prop != null && prop.getOptions().isArray()) {
370 int size = xmpData.countArrayItems( ns, keyParts[1] );
371 value = new String[size];
372 boolean onlySimpleChildren = true;
373
374 for (int i = 0; i < size && onlySimpleChildren; i++) {
375 prop = xmpData.getArrayItem( ns, keyParts[1], i + 1 );
376 if (prop.getOptions().isSimple()) {
377 value[i] = prop.getValue();
378 }
379 else {
380 onlySimpleChildren = false;
381 }
382 }
383
384 if (!onlySimpleChildren) {
385 value = null;
386 }
387 }
388 // in all other cases, null is returned
389 }
390 catch (XMPException e) {
391 // Ignore
392 }
393 }
394
395 return value;
396 }
397
398 /**
399 * As this API could only possibly work for simple properties in XMP, it just calls the set
400 * method, which replaces any existing value
401 *
402 * @see org.apache.tika.metadata.Metadata#add(java.lang.String, java.lang.String)
403 */
404 @Override
405 public void add(String name, String value) {
406 set( name, value );
407 }
408
409 /**
410 * Sets the given property. If the property already exists, it is overwritten. Only simple
411 * properties that use a registered prefix are stored in the XMP.
412 *
413 * @see org.apache.tika.metadata.Metadata#set(java.lang.String, java.lang.String)
414 */
415 @Override
416 public void set(String name, String value) {
417 checkKey( name );
418
419 String[] keyParts = splitKey( name );
420
421 String ns = registry.getNamespaceURI( keyParts[0] );
422 if (ns != null) {
423 try {
424 xmpData.setProperty( ns, keyParts[1], value );
425 }
426 catch (XMPException e) {
427 // Ignore
428 }
429 }
430 }
431
432 /**
433 * @see org.apache.tika.xmp.XMPMetadata#set(java.lang.String, java.lang.String)
434 */
435 @Override
436 public void set(Property property, String value) {
437 this.set( property.getName(), value );
438 }
439
440 /**
441 * @see org.apache.tika.xmp.XMPMetadata#set(java.lang.String, java.lang.String)
442 */
443 @Override
444 public void set(Property property, int value) {
445 // Can reuse the checks from the base class implementation which will call
446 // the set(String, String) method in the end
447 super.set( property, value );
448 }
449
450 /**
451 * @see org.apache.tika.xmp.XMPMetadata#set(java.lang.String, java.lang.String)
452 */
453 @Override
454 public void set(Property property, double value) {
455 super.set( property, value );
456 }
457
458 /**
459 * @see org.apache.tika.xmp.XMPMetadata#set(java.lang.String, java.lang.String)
460 */
461 @Override
462 public void set(Property property, Date date) {
463 super.set( property, date );
464 }
465
466 /**
467 * Sets array properties. If the property already exists, it is overwritten. Only array
468 * properties that use a registered prefix are stored in the XMP.
469 *
470 * @see org.apache.tika.metadata.Metadata#set(org.apache.tika.metadata.Property,
471 * java.lang.String[])
472 */
473 @Override
474 public void set(Property property, String[] values) {
475 checkKey( property.getName() );
476
477 if (!property.isMultiValuePermitted()) {
478 throw new PropertyTypeException( "Property is not of an array type" );
479 }
480
481 String[] keyParts = splitKey( property.getName() );
482
483 String ns = registry.getNamespaceURI( keyParts[0] );
484 if (ns != null) {
485 try {
486 int arrayType = tikaToXMPArrayType( property.getPrimaryProperty().getPropertyType() );
487 xmpData.setProperty( ns, keyParts[1], null, new PropertyOptions( arrayType ) );
488
489 for (String value : values) {
490 xmpData.appendArrayItem( ns, keyParts[1], value );
491 }
492 }
493 catch (XMPException e) {
494 // Ignore
495 }
496 }
497 }
498
499 /**
500 * It will set all simple and array properties that have QName keys in registered namespaces.
501 *
502 * @see org.apache.tika.metadata.Metadata#setAll(java.util.Properties)
503 */
504 @Override
505 public void setAll(Properties properties) {
506 @SuppressWarnings("unchecked")
507 Enumeration<String> names = (Enumeration<String>) properties.propertyNames();
508
509 while (names.hasMoreElements()) {
510 String name = names.nextElement();
511 Property property = Property.get( name );
512 if (property == null) {
513 throw new PropertyTypeException( "Unknown property: " + name );
514 }
515
516 String value = properties.getProperty( name );
517
518 if (property.isMultiValuePermitted()) {
519 this.set( property, new String[] { value } );
520 }
521 else {
522 this.set( property, value );
523 }
524 }
525 }
526
527 /**
528 * @see org.apache.tika.xmp.XMPMetadata#remove(java.lang.String)
529 */
530 public void remove(Property property) {
531 this.remove( property.getName() );
532 }
533
534 /**
535 * Removes the given property from the XMP data. If it is a complex property the whole subtree
536 * is removed
537 *
538 * @see org.apache.tika.metadata.Metadata#remove(java.lang.String)
539 */
540 @Override
541 public void remove(String name) {
542 checkKey( name );
543
544 String[] keyParts = splitKey( name );
545
546 String ns = registry.getNamespaceURI( keyParts[0] );
547 if (ns != null) {
548 xmpData.deleteProperty( ns, keyParts[1] );
549 }
550 }
551
552 /**
553 * Returns the number of top-level namespaces
554 */
555 @Override
556 public int size() {
557 int size = 0;
558
559 try {
560 // Get an iterator for the XMP packet, starting at the top level schema nodes
561 XMPIterator nsIter = xmpData.iterator( new IteratorOptions().setJustChildren( true )
562 .setOmitQualifiers( true ) );
563 // iterate all top level namespaces
564 while (nsIter.hasNext()) {
565 nsIter.next();
566 size++;
567 }
568 }
569 catch (XMPException e) {
570 // ignore
571 }
572
573 return size;
574 }
575
576 /**
577 * This method is not implemented, yet. It is very tedious to check for semantic equality of XMP
578 * packets
579 */
580 @Override
581 public boolean equals(Object o) {
582 throw new UnsupportedOperationException( "Not implemented" );
583 }
584
585 /**
586 * Serializes the XMP data in compact form without packet wrapper
587 *
588 * @see org.apache.tika.metadata.Metadata#toString()
589 */
590 @Override
591 public String toString() {
592 String result = null;
593 try {
594 result = XMPMetaFactory.serializeToString( xmpData, new SerializeOptions()
595 .setOmitPacketWrapper( true ).setUseCompactFormat( true ) );
596 }
597 catch (XMPException e) {
598 // ignore
599 }
600 return result;
601 }
602
603 // The XMP object is not serializable!
604 private void readObject(ObjectInputStream ois) throws ClassNotFoundException, IOException {
605 throw new NotSerializableException();
606 }
607
608 // The XMP object is not serializable!
609 private void writeObject(ObjectOutputStream ois) throws IOException {
610 throw new NotSerializableException();
611 }
612
613 /**
614 * Checks if the given key is a valid QName with a known standard namespace prefix
615 *
616 * @param key
617 * the key to check
618 * @return true if the key is valid otherwise false
619 */
620 private void checkKey(String key) throws PropertyTypeException {
621 if (key == null || key.length() == 0) {
622 throw new PropertyTypeException( "Key must not be null" );
623 }
624
625 String[] keyParts = splitKey( key );
626 if (keyParts == null) {
627 throw new PropertyTypeException( "Key must be a QName in the form prefix:localName" );
628 }
629
630 if (registry.getNamespaceURI( keyParts[0] ) == null) {
631 throw new PropertyTypeException( "Key does not use a registered Namespace prefix" );
632 }
633 }
634
635 /**
636 * Split the given key at the namespace prefix delimiter
637 *
638 * @param key
639 * the key to split
640 * @return prefix and local name of the property or null if the key did not contain a delimiter
641 * or too much of them
642 */
643 private String[] splitKey(String key) {
644 String[] keyParts = key.split( Metadata.NAMESPACE_PREFIX_DELIMITER );
645 if (keyParts.length > 0 && keyParts.length <= 2) {
646 return keyParts;
647 }
648
649 return null;
650 }// checkKeyPrefix
651
652 /**
653 * Convert Tika array types to XMP array types
654 *
655 * @param type
656 * @return
657 */
658 private int tikaToXMPArrayType(PropertyType type) {
659 int result = 0;
660 switch (type) {
661 case BAG:
662 result = PropertyOptions.ARRAY;
663 break;
664 case SEQ:
665 result = PropertyOptions.ARRAY_ORDERED;
666 break;
667 case ALT:
668 result = PropertyOptions.ARRAY_ALTERNATE;
669 break;
670 }
671 return result;
672 }
673 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.xmp.convert;
17
18 import java.util.Set;
19
20 import org.apache.tika.exception.TikaException;
21 import org.apache.tika.metadata.Metadata;
22 import org.apache.tika.metadata.Property;
23
24 import com.adobe.xmp.XMPConst;
25 import com.adobe.xmp.XMPException;
26 import com.adobe.xmp.XMPMeta;
27 import com.adobe.xmp.XMPMetaFactory;
28 import com.adobe.xmp.XMPSchemaRegistry;
29 import com.adobe.xmp.XMPUtils;
30 import com.adobe.xmp.options.PropertyOptions;
31
32 /**
33 * Base class for Tika Metadata to XMP converter which provides some needed common functionality.
34 */
35 public abstract class AbstractConverter implements ITikaToXMPConverter {
36 private Metadata metadata;
37 protected XMPMeta meta;
38
39 abstract public XMPMeta process(Metadata metadata) throws XMPException;
40
41 /**
42 * Every Converter has to provide information about namespaces that are used additionally to the
43 * core set of XMP namespaces.
44 *
45 * @return the additional namespace information
46 */
47 abstract protected Set<Namespace> getAdditionalNamespaces();
48
49 public AbstractConverter() throws TikaException {
50 meta = XMPMetaFactory.create();
51 metadata = new Metadata();
52 registerNamespaces( getAdditionalNamespaces() );
53 }
54
55 public void setMetadata(Metadata metadata) {
56 this.metadata = metadata;
57 }
58
59 public XMPMeta getXMPMeta() {
60 return meta;
61 }
62
63 // --- utility methods used by sub-classes ---
64
65 /**
66 * Registers a number <code>Namespace</code> information with XMPCore. Any already registered
67 * namespace is not registered again.
68 *
69 * @param namespaces
70 * the list of namespaces to be registered
71 * @throws TikaException
72 * in case a namespace oculd not be registered
73 */
74 protected void registerNamespaces(Set<Namespace> namespaces) throws TikaException {
75 XMPSchemaRegistry registry = XMPMetaFactory.getSchemaRegistry();
76
77 for (Namespace namespace : namespaces) {
78 // Any already registered namespace is not registered again
79 try {
80 registry.registerNamespace( namespace.uri, namespace.prefix );
81 }
82 catch (XMPException e) {
83 throw new TikaException(
84 "Namespace needed by converter could not be registiered with XMPCore", e );
85 }
86 }
87 }
88
89 /**
90 * @see AbstractConverter#createProperty(String, String, String)
91 */
92 protected void createProperty(Property metadataProperty, String ns, String propertyName)
93 throws XMPException {
94 createProperty( metadataProperty.getName(), ns, propertyName );
95 }
96
97 /**
98 * Creates a simple property.
99 *
100 * @param tikaKey
101 * Key in the Tika metadata map
102 * @param ns
103 * namespace the property should be created in
104 * @param propertyName
105 * name of the property
106 * @throws XMPException
107 * if the property could not be created
108 */
109 protected void createProperty(String tikaKey, String ns, String propertyName)
110 throws XMPException {
111 String value = metadata.get( tikaKey );
112 if (value != null && value.length() > 0) {
113 meta.setProperty( ns, propertyName, value );
114 }
115 }
116
117 /**
118 * @see AbstractConverter#createLangAltProperty(String, String, String)
119 */
120 protected void createLangAltProperty(Property metadataProperty, String ns, String propertyName)
121 throws XMPException {
122 createLangAltProperty( metadataProperty.getName(), ns, propertyName );
123 }
124
125 /**
126 * Creates a language alternative property in the x-default language
127 *
128 * @param tikaKey
129 * Key in the Tika metadata map
130 * @param ns
131 * namespace the property should be created in
132 * @param propertyName
133 * name of the property
134 * @throws XMPException
135 * if the property could not be created
136 */
137 protected void createLangAltProperty(String tikaKey, String ns, String propertyName)
138 throws XMPException {
139 String value = metadata.get( tikaKey );
140 if (value != null && value.length() > 0) {
141 meta.setLocalizedText( ns, propertyName, null, XMPConst.X_DEFAULT, value );
142 }
143 }
144
145 protected void createArrayProperty(Property metadataProperty, String nsDc,
146 String arrayProperty, int arrayType) throws XMPException {
147 createArrayProperty( metadataProperty.getName(), nsDc, arrayProperty, arrayType );
148 }
149
150 /**
151 * Creates an array property from a list of values.
152 *
153 * @param tikaKey
154 * Key in the Tika metadata map
155 * @param ns
156 * namespace the property should be created in
157 * @param propertyName
158 * name of the property
159 * @param arrayType
160 * depicts which kind of array shall be created
161 * @throws XMPException
162 * if the property could not be created
163 */
164 protected void createArrayProperty(String tikaKey, String ns, String propertyName, int arrayType)
165 throws XMPException {
166 String[] values = metadata.getValues( tikaKey );
167 if (values != null) {
168 meta.setProperty( ns, propertyName, null, new PropertyOptions( arrayType ) );
169 for (String value : values) {
170 meta.appendArrayItem( ns, propertyName, value );
171 }
172 }
173 }
174
175 protected void createCommaSeparatedArray(Property metadataProperty, String nsDc,
176 String arrayProperty, int arrayType) throws XMPException {
177 createCommaSeparatedArray( metadataProperty.getName(), nsDc, arrayProperty, arrayType );
178 }
179
180 /**
181 * Creates an array property from a comma separated list.
182 *
183 * @param tikaKey
184 * Key in the Tika metadata map
185 * @param ns
186 * namespace the property should be created in
187 * @param propertyName
188 * name of the property
189 * @param arrayType
190 * depicts which kind of array shall be created
191 * @throws XMPException
192 * if the property could not be created
193 */
194 protected void createCommaSeparatedArray(String tikaKey, String ns, String propertyName,
195 int arrayType) throws XMPException {
196 String value = metadata.get( tikaKey );
197 if (value != null && value.length() > 0) {
198 XMPUtils.separateArrayItems( meta, ns, propertyName, value, new PropertyOptions(
199 arrayType ), false );
200 }
201 }
202
203 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.xmp.convert;
17
18 import java.util.Collections;
19 import java.util.HashSet;
20 import java.util.Set;
21
22 import org.apache.tika.exception.TikaException;
23 import org.apache.tika.metadata.DublinCore;
24 import org.apache.tika.metadata.Metadata;
25 import org.apache.tika.metadata.Property;
26 import org.apache.tika.metadata.XMPRights;
27 import org.apache.tika.metadata.Property.PropertyType;
28
29 import com.adobe.xmp.XMPException;
30 import com.adobe.xmp.XMPMeta;
31 import com.adobe.xmp.XMPMetaFactory;
32 import com.adobe.xmp.XMPSchemaRegistry;
33 import com.adobe.xmp.options.PropertyOptions;
34
35 /**
36 * Trys to convert as much of the properties in the <code>Metadata</code> map to XMP namespaces.
37 * only those properties will be cnverted where the name contains a prefix and this prefix
38 * correlates with a "known" prefix for a standard namespace. For example "dc:title" would be mapped
39 * to the "title" property in the DublinCore namespace.
40 */
41 public class GenericConverter extends AbstractConverter {
42 public GenericConverter() throws TikaException {
43 super();
44 }
45
46 @Override
47 public XMPMeta process(Metadata metadata) throws XMPException {
48 setMetadata( metadata );
49 XMPSchemaRegistry registry = XMPMetaFactory.getSchemaRegistry();
50
51 String[] keys = metadata.names();
52 for (String key : keys) {
53 String[] keyParts = key.split( Metadata.NAMESPACE_PREFIX_DELIMITER );
54 if (keyParts.length > 0 && keyParts.length <= 2) {
55 String uri = registry.getNamespaceURI( keyParts[0] );
56
57 if (uri != null) {
58 // Tika properties where the type differs from the XMP specification
59 if (key.equals( DublinCore.TITLE.getName() )
60 || key.equals( DublinCore.DESCRIPTION.getName() )
61 || key.equals( XMPRights.USAGE_TERMS.getName() )) {
62 createLangAltProperty( key, uri, keyParts[1] );
63 }
64 else if (key.equals( DublinCore.CREATOR.getName() )) {
65 createArrayProperty( key, uri, keyParts[1], PropertyOptions.ARRAY_ORDERED );
66 }
67 else {
68 PropertyType type = Property.getPropertyType( key );
69 if (type != null) {
70 switch (type) {
71 case SIMPLE:
72 createProperty( key, uri, keyParts[1] );
73 break;
74 case BAG:
75 createArrayProperty( key, uri, keyParts[1],
76 PropertyOptions.ARRAY );
77 break;
78 case SEQ:
79 createArrayProperty( key, uri, keyParts[1],
80 PropertyOptions.ARRAY_ORDERED );
81 break;
82 case ALT:
83 createArrayProperty( key, uri, keyParts[1],
84 PropertyOptions.ARRAY_ALTERNATE );
85 break;
86 // TODO Add support for structs and lang-alts, but those types are
87 // currently not used in Tika
88 }
89 }
90 }
91 }
92 } // ignore keys that are not qualified
93 }
94
95 return getXMPMeta();
96 }
97
98 @Override
99 public Set<Namespace> getAdditionalNamespaces() {
100 // no additional namespaces needed
101 return Collections.unmodifiableSet( new HashSet<Namespace>() );
102 }
103 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.xmp.convert;
17
18 import org.apache.tika.metadata.Metadata;
19
20 import com.adobe.xmp.XMPException;
21 import com.adobe.xmp.XMPMeta;
22
23 /**
24 * Interface for the specific <code>Metadata</code> to XMP converters
25 */
26 public interface ITikaToXMPConverter {
27 /**
28 * Converts a Tika {@link Metadata}-object into an {@link XMPMeta} containing the useful
29 * properties.
30 *
31 * @param metadata
32 * a Tika Metadata object
33 * @return Returns an XMPMeta object.
34 * @throws XMPException
35 * If an error occurs during the creation of the XMP object.
36 */
37 XMPMeta process(Metadata metadata) throws XMPException;
38 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.xmp.convert;
17
18 import java.util.Arrays;
19 import java.util.Collections;
20 import java.util.HashSet;
21 import java.util.Set;
22
23 import org.apache.tika.exception.TikaException;
24 import org.apache.tika.metadata.HttpHeaders;
25 import org.apache.tika.metadata.Metadata;
26 import org.apache.tika.metadata.Office;
27 import org.apache.tika.metadata.OfficeOpenXMLCore;
28 import org.apache.tika.metadata.OfficeOpenXMLExtended;
29 import org.apache.tika.metadata.TikaCoreProperties;
30
31 import com.adobe.xmp.XMPConst;
32 import com.adobe.xmp.XMPException;
33 import com.adobe.xmp.XMPMeta;
34 import com.adobe.xmp.options.PropertyOptions;
35
36 /**
37 * Tika to XMP mapping for the binary MS formats Word (.doc), Excel (.xls) and PowerPoint (.ppt).
38 */
39 public class MSOfficeBinaryConverter extends AbstractConverter {
40 public MSOfficeBinaryConverter() throws TikaException {
41 super();
42 }
43
44 protected static final Set<Namespace> ADDITIONAL_NAMESPACES = Collections
45 .unmodifiableSet( new HashSet<Namespace>( Arrays.asList( new Namespace(
46 OfficeOpenXMLCore.NAMESPACE_URI, OfficeOpenXMLCore.PREFIX ), new Namespace(
47 OfficeOpenXMLExtended.NAMESPACE_URI, OfficeOpenXMLExtended.PREFIX ) ) ) );
48
49 /**
50 * @throws XMPException
51 * Forwards XMP errors
52 * @see ITikaToXMPConverter#process(Metadata)
53 */
54 public XMPMeta process(Metadata metadata) throws XMPException {
55 super.setMetadata( metadata );
56
57 // For all formats, Tika uses the same keys
58 createProperty( HttpHeaders.CONTENT_TYPE, XMPConst.NS_DC, "format" );
59 createProperty( OfficeOpenXMLExtended.APPLICATION, XMPConst.NS_XMP, "CreatorTool" );
60 createCommaSeparatedArray( TikaCoreProperties.CREATOR, XMPConst.NS_DC, "creator",
61 PropertyOptions.ARRAY_ORDERED );
62 createProperty( OfficeOpenXMLCore.CATEGORY, XMPConst.NS_IPTCCORE, "intellectualGenre" );
63 createProperty( TikaCoreProperties.CREATED, XMPConst.NS_XMP, "CreateDate" );
64 createProperty( Office.CHARACTER_COUNT, OfficeOpenXMLExtended.NAMESPACE_URI, "Characters" );
65 createProperty( TikaCoreProperties.COMMENTS, XMPConst.NS_PDFX, "Comments" );
66 createProperty( OfficeOpenXMLExtended.COMPANY, OfficeOpenXMLExtended.NAMESPACE_URI,
67 "Company" );
68 createCommaSeparatedArray( TikaCoreProperties.KEYWORDS, XMPConst.NS_DC, "subject",
69 PropertyOptions.ARRAY );
70 createLangAltProperty( TikaCoreProperties.DESCRIPTION, XMPConst.NS_DC, "description" );
71 createProperty( TikaCoreProperties.LANGUAGE, OfficeOpenXMLCore.NAMESPACE_URI, "language" );
72 createProperty( TikaCoreProperties.PRINT_DATE, OfficeOpenXMLCore.NAMESPACE_URI,
73 "lastPrinted" );
74 createProperty( TikaCoreProperties.MODIFIED, XMPConst.NS_XMP, "ModifyDate" );
75 createProperty( Office.PAGE_COUNT, XMPConst.TYPE_PAGEDFILE, "NPages" );
76 createProperty( OfficeOpenXMLCore.REVISION, OfficeOpenXMLCore.NAMESPACE_URI, "revision" );
77 createProperty( Office.SLIDE_COUNT, OfficeOpenXMLExtended.NAMESPACE_URI, "Pages" );
78 createProperty( OfficeOpenXMLExtended.TEMPLATE, OfficeOpenXMLExtended.NAMESPACE_URI,
79 "Template" );
80 createLangAltProperty( TikaCoreProperties.TITLE, XMPConst.NS_DC, "title" );
81 createProperty( Office.WORD_COUNT, OfficeOpenXMLExtended.NAMESPACE_URI, "Words" );
82 // Not mapped: (MSOffice) Edit-Time ???
83 // Not mapped: (MSOffice) Last-Author ???
84 // not mapped: (MSOffice) Security ???
85
86 return super.getXMPMeta();
87 }
88
89 protected Set<Namespace> getAdditionalNamespaces() {
90 return ADDITIONAL_NAMESPACES;
91 }
92 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.xmp.convert;
17
18 import java.util.Arrays;
19 import java.util.Collections;
20 import java.util.HashSet;
21 import java.util.Set;
22
23 import org.apache.tika.exception.TikaException;
24 import org.apache.tika.metadata.HttpHeaders;
25 import org.apache.tika.metadata.Metadata;
26 import org.apache.tika.metadata.Office;
27 import org.apache.tika.metadata.OfficeOpenXMLCore;
28 import org.apache.tika.metadata.OfficeOpenXMLExtended;
29 import org.apache.tika.metadata.TikaCoreProperties;
30
31 import com.adobe.xmp.XMPConst;
32 import com.adobe.xmp.XMPException;
33 import com.adobe.xmp.XMPMeta;
34 import com.adobe.xmp.options.PropertyOptions;
35
36 /**
37 * Tika to XMP mapping for the Office Open XML formats Word (.docx), Excel (.xlsx) and PowerPoint
38 * (.pptx).
39 */
40 public class MSOfficeXMLConverter extends AbstractConverter {
41 protected static final Set<Namespace> ADDITIONAL_NAMESPACES = Collections
42 .unmodifiableSet( new HashSet<Namespace>( Arrays.asList( new Namespace(
43 OfficeOpenXMLCore.NAMESPACE_URI, OfficeOpenXMLCore.PREFIX ), new Namespace(
44 OfficeOpenXMLExtended.NAMESPACE_URI, OfficeOpenXMLExtended.PREFIX ) ) ) );
45
46 public MSOfficeXMLConverter() throws TikaException {
47 super();
48 }
49
50 @Override
51 public XMPMeta process(Metadata metadata) throws XMPException {
52 super.setMetadata( metadata );
53
54 createProperty( HttpHeaders.CONTENT_TYPE, XMPConst.NS_DC, "format" );
55
56 // Core Properties
57 createProperty( OfficeOpenXMLCore.CATEGORY, XMPConst.NS_IPTCCORE, "intellectualGenre" );
58 createProperty( OfficeOpenXMLCore.CONTENT_STATUS, OfficeOpenXMLCore.NAMESPACE_URI,
59 "contentStatus" );
60 createProperty( TikaCoreProperties.CREATED, XMPConst.NS_XMP, "CreateDate" );
61 createCommaSeparatedArray( TikaCoreProperties.CREATOR, XMPConst.NS_DC, "creator",
62 PropertyOptions.ARRAY_ORDERED );
63 createProperty( TikaCoreProperties.COMMENTS, XMPConst.NS_PDFX, "Comments" );
64 createProperty( TikaCoreProperties.IDENTIFIER, XMPConst.NS_DC, "identifier" );
65 createCommaSeparatedArray( TikaCoreProperties.KEYWORDS, XMPConst.NS_DC, "subject",
66 PropertyOptions.ARRAY );
67 createLangAltProperty( TikaCoreProperties.DESCRIPTION, XMPConst.NS_DC, "description" );
68 createProperty( TikaCoreProperties.LANGUAGE, XMPConst.NS_DC, "language" );
69 createProperty( TikaCoreProperties.MODIFIER, OfficeOpenXMLCore.NAMESPACE_URI,
70 "lastModifiedBy" );
71 createProperty( TikaCoreProperties.PRINT_DATE, OfficeOpenXMLCore.NAMESPACE_URI,
72 "lastPrinted" );
73 createProperty( TikaCoreProperties.MODIFIED, XMPConst.NS_XMP, "ModifyDate" );
74 createProperty( OfficeOpenXMLCore.REVISION, OfficeOpenXMLCore.NAMESPACE_URI, "revision" );
75 createLangAltProperty( TikaCoreProperties.TITLE, XMPConst.NS_DC, "title" );
76 createProperty( OfficeOpenXMLCore.VERSION, OfficeOpenXMLCore.NAMESPACE_URI, "version" );
77
78 // Extended Properties
79
80 // Put both App name and version in xmp:CreatorTool
81 String creatorTool = "";
82 String value = metadata.get( OfficeOpenXMLExtended.APPLICATION );
83 if (value != null && value.length() > 0) {
84 creatorTool = value;
85
86 value = metadata.get( OfficeOpenXMLExtended.APP_VERSION );
87 if (value != null && value.length() > 0) {
88 creatorTool += " " + value;
89 }
90 }
91
92 if (creatorTool.length() > 0) {
93 meta.setProperty( XMPConst.NS_XMP, "CreatorTool", creatorTool );
94 }
95
96 createProperty( Office.CHARACTER_COUNT, OfficeOpenXMLExtended.NAMESPACE_URI, "Characters" );
97 createProperty( Office.CHARACTER_COUNT_WITH_SPACES, OfficeOpenXMLExtended.NAMESPACE_URI,
98 "CharactersWithSpaces" );
99 createProperty( TikaCoreProperties.PUBLISHER, OfficeOpenXMLExtended.NAMESPACE_URI,
100 "Company" );
101 createProperty( Office.LINE_COUNT, OfficeOpenXMLExtended.NAMESPACE_URI, "Lines" );
102 createProperty( OfficeOpenXMLExtended.MANAGER, OfficeOpenXMLExtended.NAMESPACE_URI,
103 "Manager" );
104 createProperty( OfficeOpenXMLExtended.NOTES, OfficeOpenXMLExtended.NAMESPACE_URI, "Notes" );
105 createProperty( Office.PAGE_COUNT, XMPConst.TYPE_PAGEDFILE, "NPages" );
106 createProperty( Office.PARAGRAPH_COUNT, OfficeOpenXMLExtended.NAMESPACE_URI, "Paragraphs" );
107 createProperty( OfficeOpenXMLExtended.PRESENTATION_FORMAT,
108 OfficeOpenXMLExtended.NAMESPACE_URI, "PresentationFormat" );
109 createProperty( Office.SLIDE_COUNT, OfficeOpenXMLExtended.NAMESPACE_URI, "Slides" );
110 createProperty( OfficeOpenXMLExtended.TEMPLATE, OfficeOpenXMLExtended.NAMESPACE_URI,
111 "Template" );
112 createProperty( OfficeOpenXMLExtended.TOTAL_TIME, OfficeOpenXMLExtended.NAMESPACE_URI,
113 "TotalTime" );
114 createProperty( Office.WORD_COUNT, OfficeOpenXMLExtended.NAMESPACE_URI, "Words" );
115
116 return super.getXMPMeta();
117 }
118
119 @Override
120 protected Set<Namespace> getAdditionalNamespaces() {
121 return ADDITIONAL_NAMESPACES;
122 }
123
124 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.xmp.convert;
17
18 /**
19 * Utility class to hold namespace information.
20 */
21 public class Namespace {
22 public String uri;
23 public String prefix;
24
25 public Namespace(String uri, String prefix) {
26 this.uri = uri;
27 this.prefix = prefix;
28 }
29 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.xmp.convert;
17
18 import java.util.Arrays;
19 import java.util.Collections;
20 import java.util.HashSet;
21 import java.util.Set;
22
23 import org.apache.tika.exception.TikaException;
24 import org.apache.tika.metadata.HttpHeaders;
25 import org.apache.tika.metadata.MSOffice;
26 import org.apache.tika.metadata.Metadata;
27 import org.apache.tika.metadata.Office;
28 import org.apache.tika.metadata.PagedText;
29 import org.apache.tika.metadata.TikaCoreProperties;
30
31 import com.adobe.xmp.XMPConst;
32 import com.adobe.xmp.XMPException;
33 import com.adobe.xmp.XMPMeta;
34 import com.adobe.xmp.options.PropertyOptions;
35
36 /**
37 * Tika to XMP mapping for the Open Document formats: Text (.odt), Spreatsheet (.ods), Graphics
38 * (.odg) and Presentation (.odp).
39 */
40 public class OpenDocumentConverter extends AbstractConverter {
41 protected static final Set<Namespace> ADDITIONAL_NAMESPACES = Collections
42 .unmodifiableSet( new HashSet<Namespace>( Arrays.asList( new Namespace(
43 Office.NAMESPACE_URI_DOC_META, Office.PREFIX_DOC_META ) ) ) );
44
45 public OpenDocumentConverter() throws TikaException {
46 super();
47 }
48
49 /**
50 * @throws XMPException
51 * Forwards XMP errors
52 * @see ITikaToXMPConverter#process(Metadata)
53 */
54 @Override
55 public XMPMeta process(Metadata metadata) throws XMPException {
56 super.setMetadata( metadata );
57
58 createProperty( HttpHeaders.CONTENT_TYPE, XMPConst.NS_DC, "format" );
59
60 createProperty( Office.CHARACTER_COUNT, Office.NAMESPACE_URI_DOC_META, "character-count" );
61 createProperty( TikaCoreProperties.CREATED, XMPConst.NS_XMP, "CreateDate" );
62 createCommaSeparatedArray( TikaCoreProperties.CREATOR, XMPConst.NS_DC, "creator",
63 PropertyOptions.ARRAY_ORDERED );
64 createProperty( TikaCoreProperties.MODIFIED, XMPConst.NS_XMP, "ModifyDate" );
65 createProperty( TikaCoreProperties.COMMENTS, XMPConst.NS_PDFX, "Comments" );
66 createCommaSeparatedArray( TikaCoreProperties.KEYWORDS, XMPConst.NS_DC, "subject",
67 PropertyOptions.ARRAY );
68 createLangAltProperty( TikaCoreProperties.DESCRIPTION, XMPConst.NS_DC, "description" );
69 createProperty( MSOffice.EDIT_TIME, Office.NAMESPACE_URI_DOC_META, "editing-duration" );
70 createProperty( "editing-cycles", Office.NAMESPACE_URI_DOC_META, "editing-cycles" );
71 createProperty( "generator", XMPConst.NS_XMP, "CreatorTool" );
72 createProperty( Office.IMAGE_COUNT, Office.NAMESPACE_URI_DOC_META, "image-count" );
73 createProperty( "initial-creator", Office.NAMESPACE_URI_DOC_META, "initial-creator" );
74 createProperty( Office.OBJECT_COUNT, Office.NAMESPACE_URI_DOC_META, "object-count" );
75 createProperty( PagedText.N_PAGES, XMPConst.TYPE_PAGEDFILE, "NPages" );
76 createProperty( Office.PARAGRAPH_COUNT, Office.NAMESPACE_URI_DOC_META, "paragraph-count" );
77 createProperty( Office.TABLE_COUNT, Office.NAMESPACE_URI_DOC_META, "table-count" );
78 createLangAltProperty( TikaCoreProperties.TITLE, XMPConst.NS_DC, "title" );
79 createProperty( Office.WORD_COUNT, Office.NAMESPACE_URI_DOC_META, "word-count" );
80
81 // duplicate properties not mapped:
82 // nbImg | 0
83 // nbObject | 0
84 // nbPage | 1
85 // nbPara | 3
86 // nbTab | 0
87 // nbWord | 5
88
89 return super.getXMPMeta();
90 }
91
92 @Override
93 protected Set<Namespace> getAdditionalNamespaces() {
94 return ADDITIONAL_NAMESPACES;
95 }
96 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.xmp.convert;
17
18 import java.util.Arrays;
19 import java.util.Collections;
20 import java.util.HashSet;
21 import java.util.Set;
22
23 import org.apache.tika.exception.TikaException;
24 import org.apache.tika.metadata.HttpHeaders;
25 import org.apache.tika.metadata.Metadata;
26 import org.apache.tika.metadata.OfficeOpenXMLCore;
27 import org.apache.tika.metadata.OfficeOpenXMLExtended;
28 import org.apache.tika.metadata.TikaCoreProperties;
29
30 import com.adobe.xmp.XMPConst;
31 import com.adobe.xmp.XMPException;
32 import com.adobe.xmp.XMPMeta;
33 import com.adobe.xmp.options.PropertyOptions;
34
35 /**
36 * Tika to XMP mapping for the RTF format.
37 */
38 public class RTFConverter extends AbstractConverter {
39 protected static final Set<Namespace> ADDITIONAL_NAMESPACES = Collections
40 .unmodifiableSet( new HashSet<Namespace>( Arrays.asList( new Namespace(
41 OfficeOpenXMLExtended.NAMESPACE_URI, OfficeOpenXMLExtended.PREFIX ) ) ) );
42
43 public RTFConverter() throws TikaException {
44 super();
45 }
46
47 @Override
48 public XMPMeta process(Metadata metadata) throws XMPException {
49 setMetadata( metadata );
50
51 createProperty( HttpHeaders.CONTENT_TYPE, XMPConst.NS_DC, "format" );
52
53 createCommaSeparatedArray( TikaCoreProperties.CREATOR, XMPConst.NS_DC, "creator",
54 PropertyOptions.ARRAY_ORDERED );
55 createLangAltProperty( TikaCoreProperties.TITLE, XMPConst.NS_DC, "title" );
56 createLangAltProperty( TikaCoreProperties.DESCRIPTION, XMPConst.NS_DC, "description" );
57 createCommaSeparatedArray( TikaCoreProperties.KEYWORDS, XMPConst.NS_DC, "subject",
58 PropertyOptions.ARRAY );
59 createProperty( OfficeOpenXMLCore.CATEGORY, XMPConst.NS_IPTCCORE, "intellectualGenre" );
60 createProperty( OfficeOpenXMLExtended.TEMPLATE, OfficeOpenXMLExtended.NAMESPACE_URI,
61 "Template" );
62 createProperty( TikaCoreProperties.COMMENTS, XMPConst.NS_PDFX, "Comments" );
63 createProperty( OfficeOpenXMLExtended.COMPANY, OfficeOpenXMLExtended.NAMESPACE_URI,
64 "Company" );
65 createProperty( OfficeOpenXMLExtended.MANAGER, OfficeOpenXMLExtended.NAMESPACE_URI,
66 "Manager" );
67
68 return getXMPMeta();
69 }
70
71 @Override
72 protected Set<Namespace> getAdditionalNamespaces() {
73 return ADDITIONAL_NAMESPACES;
74 }
75 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.xmp.convert;
17
18 import java.util.HashMap;
19 import java.util.Map;
20 import java.util.Set;
21
22 import org.apache.tika.exception.TikaException;
23 import org.apache.tika.metadata.Metadata;
24 import org.apache.tika.metadata.TikaCoreProperties;
25 import org.apache.tika.mime.MediaType;
26 import org.apache.tika.parser.ParseContext;
27 import org.apache.tika.parser.microsoft.OfficeParser;
28 import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
29 import org.apache.tika.parser.odf.OpenDocumentParser;
30 import org.apache.tika.parser.rtf.RTFParser;
31
32 import com.adobe.xmp.XMPException;
33 import com.adobe.xmp.XMPMeta;
34 import com.adobe.xmp.XMPMetaFactory;
35
36 public class TikaToXMP {
37 /**
38 * Map from mimetype to converter class Must only be accessed through
39 * <code>getConverterMap</code>
40 */
41 private static Map<MediaType, Class<? extends ITikaToXMPConverter>> converterMap;
42
43 // --- public API implementation---
44
45 public TikaToXMP() {
46 // Nothing to do
47 }
48
49 /**
50 * @see TikaToXMP#convert(Metadata, String) But the mimetype is retrieved from the metadata
51 * map.
52 */
53 public static XMPMeta convert(Metadata tikaMetadata) throws TikaException {
54 if (tikaMetadata == null) {
55 throw new IllegalArgumentException( "Metadata parameter must not be null" );
56 }
57
58 String mimetype = tikaMetadata.get( Metadata.CONTENT_TYPE );
59 if (mimetype == null) {
60 mimetype = tikaMetadata.get( TikaCoreProperties.FORMAT );
61 }
62
63 return convert( tikaMetadata, mimetype );
64 }
65
66 /**
67 * Convert the given Tika metadata map to XMP object. If a mimetype is provided in the Metadata
68 * map, a specific converter can be used, that converts all available metadata. If there is no
69 * mimetype provided or no specific converter available a generic conversion is done which will
70 * convert only those properties that are in known namespaces and are using the correct
71 * prefixes.
72 *
73 * @param tikaMetadata
74 * the Metadata map from Tika
75 * @param mimetype
76 * depicts the format's converter to use
77 * @return XMP object
78 * @throws TikaException
79 */
80 public static XMPMeta convert(Metadata tikaMetadata, String mimetype) throws TikaException {
81 if (tikaMetadata == null) {
82 throw new IllegalArgumentException( "Metadata parameter must not be null" );
83 }
84
85 ITikaToXMPConverter converter = null;
86
87 if (isConverterAvailable( mimetype )) {
88 converter = getConverter( mimetype );
89 }
90 else {
91 converter = new GenericConverter();
92 }
93
94 XMPMeta xmp = null;
95
96 if (converter != null) {
97 try {
98 xmp = converter.process( tikaMetadata );
99 }
100 catch (XMPException e) {
101 throw new TikaException( "Tika metadata could not be converted to XMP", e );
102 }
103 }
104 else {
105 xmp = XMPMetaFactory.create(); // empty packet
106 }
107
108 return xmp;
109 }
110
111 /**
112 * Check if there is a converter available which allows to convert the Tika metadata to XMP
113 *
114 * @param mimetype
115 * the Mimetype
116 * @return true if the Metadata object can be converted or false if not
117 */
118 public static boolean isConverterAvailable(String mimetype) {
119 MediaType type = MediaType.parse( mimetype );
120
121 if (type != null) {
122 return (getConverterMap().get( type ) != null);
123 }
124
125 return false;
126 }
127
128 /**
129 * Retrieve a specific converter according to the mimetype
130 *
131 * @param mimetype
132 * the Mimetype
133 * @return the converter or null, if none exists
134 * @throws TikaException
135 */
136 public static ITikaToXMPConverter getConverter(String mimetype) throws TikaException {
137 if (mimetype == null) {
138 throw new IllegalArgumentException( "mimetype must not be null" );
139 }
140
141 ITikaToXMPConverter converter = null;
142
143 MediaType type = MediaType.parse( mimetype );
144
145 if (type != null) {
146 Class<? extends ITikaToXMPConverter> clazz = getConverterMap().get( type );
147 if (clazz != null) {
148 try {
149 converter = clazz.newInstance();
150 }
151 catch (Exception e) {
152 throw new TikaException(
153 "TikaToXMP converter class cannot be instantiated for mimetype: "
154 + type.toString(), e );
155 }
156 }
157 }
158
159 return converter;
160 }
161
162 // --- Private methods ---
163
164 private static Map<MediaType, Class<? extends ITikaToXMPConverter>> getConverterMap() {
165 if (converterMap == null) {
166 converterMap = new HashMap<MediaType, Class<? extends ITikaToXMPConverter>>();
167 initialize();
168 }
169 return converterMap;
170 }
171
172 /**
173 * Initializes the map with supported converters.
174 */
175 private static void initialize() {
176 // No particular parsing context is needed
177 ParseContext parseContext = new ParseContext();
178
179 // MS Office Binary File Format
180 addConverter( new OfficeParser().getSupportedTypes( parseContext ),
181 MSOfficeBinaryConverter.class );
182
183 // Rich Text Format
184 addConverter( new RTFParser().getSupportedTypes( parseContext ), RTFConverter.class );
185
186 // MS Open XML Format
187 addConverter( new OOXMLParser().getSupportedTypes( parseContext ),
188 MSOfficeXMLConverter.class );
189
190 // Open document format
191 addConverter( new OpenDocumentParser().getSupportedTypes( parseContext ),
192 OpenDocumentConverter.class );
193 }
194
195 private static void addConverter(Set<MediaType> supportedTypes,
196 Class<? extends ITikaToXMPConverter> converter) {
197 for (MediaType type : supportedTypes) {
198 getConverterMap().put( type, converter );
199 }
200 }
201 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.xmp;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertFalse;
20 import static org.junit.Assert.assertNotNull;
21 import static org.junit.Assert.assertNull;
22 import static org.junit.Assert.assertTrue;
23
24 import org.apache.tika.exception.TikaException;
25 import org.apache.tika.metadata.Metadata;
26 import org.apache.tika.metadata.OfficeOpenXMLCore;
27 import org.apache.tika.metadata.TikaCoreProperties;
28 import org.apache.tika.xmp.convert.ITikaToXMPConverter;
29 import org.apache.tika.xmp.convert.MSOfficeXMLConverter;
30 import org.apache.tika.xmp.convert.TikaToXMP;
31 import org.junit.Before;
32 import org.junit.Test;
33
34 import com.adobe.xmp.XMPConst;
35 import com.adobe.xmp.XMPException;
36 import com.adobe.xmp.XMPIterator;
37 import com.adobe.xmp.XMPMeta;
38 import com.adobe.xmp.XMPMetaFactory;
39 import com.adobe.xmp.properties.XMPProperty;
40
41 /**
42 * Tests the Tika <code>Metadata</code> to XMP conversion functionatlity
43 */
44 public class TikaToXMPTest {
45 private Metadata tikaMetadata;
46
47 private static final String OOXML_MIMETYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
48 private static final String GENERIC_MIMETYPE = "generic/mimetype";
49
50 // --- Set up ---
51 @Before
52 public void setup() {
53 tikaMetadata = new Metadata();
54 }
55
56 private void setupOOXMLMetadata(Metadata metadata) {
57 // simple property
58 metadata.set( TikaCoreProperties.LANGUAGE, "language" );
59 // language alternative
60 metadata.set( TikaCoreProperties.TITLE, "title" );
61 // comma separated array
62 metadata.set( TikaCoreProperties.KEYWORDS, "keyword1,keyword2" );
63 // OOXML specific simple prop
64 metadata.set( TikaCoreProperties.MODIFIER, "lastModifiedBy" );
65 }
66
67 private void checkOOXMLMetadata(XMPMeta xmp) throws XMPException {
68 // check simple property
69 XMPProperty prop = xmp.getProperty( XMPConst.NS_DC, "language" );
70 assertNotNull( prop );
71 assertEquals( "language", prop.getValue() );
72
73 // check lang alt
74 prop = xmp.getLocalizedText( XMPConst.NS_DC, "title", null, XMPConst.X_DEFAULT );
75 assertNotNull( prop );
76 assertEquals( "title", prop.getValue() );
77
78 // check array
79 prop = xmp.getArrayItem( XMPConst.NS_DC, "subject", 1 );
80 assertNotNull( prop );
81 assertEquals( "keyword1", prop.getValue() );
82 prop = xmp.getArrayItem( XMPConst.NS_DC, "subject", 2 );
83 assertNotNull( prop );
84 assertEquals( "keyword2", prop.getValue() );
85
86 // check OOXML specific simple property
87 prop = xmp.getProperty( OfficeOpenXMLCore.NAMESPACE_URI, "lastModifiedBy" );
88 assertNotNull( prop );
89 assertEquals( "lastModifiedBy", prop.getValue() );
90 }
91
92 // --- TESTS ---
93 @Test
94 public void convert_OOXMLMetadataWithMimetype_everythingConverted() throws XMPException,
95 TikaException {
96 setupOOXMLMetadata( tikaMetadata );
97 tikaMetadata.set( Metadata.CONTENT_TYPE, OOXML_MIMETYPE );
98
99 XMPMeta xmp = TikaToXMP.convert( tikaMetadata );
100
101 checkOOXMLMetadata( xmp );
102 }
103
104 @Test
105 public void convert_OOXMLMetadataWithExtraMimetype_everythingConverted() throws XMPException,
106 TikaException {
107 setupOOXMLMetadata( tikaMetadata );
108
109 XMPMeta xmp = TikaToXMP.convert( tikaMetadata, OOXML_MIMETYPE );
110
111 checkOOXMLMetadata( xmp );
112 }
113
114 @Test
115 public void convert_OOXMLMetadataWithoutMimetype_onlyGeneralMetadataconverted()
116 throws XMPException, TikaException {
117 setupOOXMLMetadata( tikaMetadata );
118
119 XMPMeta xmp = TikaToXMP.convert( tikaMetadata, null );
120
121 // general metadata is converted
122 // check simple property
123 XMPProperty prop = xmp.getProperty( XMPConst.NS_DC, "language" );
124 assertNotNull( prop );
125 assertEquals( "language", prop.getValue() );
126
127 // check lang alt
128 prop = xmp.getLocalizedText( XMPConst.NS_DC, "title", null, XMPConst.X_DEFAULT );
129 assertNotNull( prop );
130 assertEquals( "title", prop.getValue() );
131
132 // OOXML one is not, the namespace has also not been registiered as the converter has not
133 // been used
134 XMPMetaFactory.getSchemaRegistry().registerNamespace( OfficeOpenXMLCore.NAMESPACE_URI,
135 OfficeOpenXMLCore.PREFIX );
136 prop = xmp.getProperty( OfficeOpenXMLCore.NAMESPACE_URI, "lastModifiedBy" );
137 assertNull( prop );
138 }
139
140 @Test
141 public void convert_genericMetadataAllQualified_allConverted() throws XMPException,
142 TikaException {
143 // simple property
144 tikaMetadata.set( TikaCoreProperties.FORMAT, GENERIC_MIMETYPE );
145 // language alternative
146 tikaMetadata.set( TikaCoreProperties.TITLE, "title" );
147 // array
148 tikaMetadata.set( TikaCoreProperties.KEYWORDS, new String[] { "keyword1", "keyword2" } );
149
150 XMPMeta xmp = TikaToXMP.convert( tikaMetadata, null );
151
152 // check simple property
153 XMPProperty prop = xmp.getProperty( XMPConst.NS_DC, "format" );
154 assertNotNull( prop );
155 assertEquals( GENERIC_MIMETYPE, prop.getValue() );
156
157 // check lang alt
158 prop = xmp.getLocalizedText( XMPConst.NS_DC, "title", null, XMPConst.X_DEFAULT );
159 assertNotNull( prop );
160 assertEquals( "title", prop.getValue() );
161
162 // check array
163 prop = xmp.getArrayItem( XMPConst.NS_DC, "subject", 1 );
164 assertNotNull( prop );
165 assertEquals( "keyword1", prop.getValue() );
166 prop = xmp.getArrayItem( XMPConst.NS_DC, "subject", 2 );
167 assertNotNull( prop );
168 assertEquals( "keyword2", prop.getValue() );
169 }
170
171 @Test
172 public void convert_wrongGenericMetadata_notConverted() throws XMPException, TikaException {
173 // unknown prefix
174 tikaMetadata.set( "unknown:key", "unknownPrefixValue" );
175 // not qualified key
176 tikaMetadata.set( "wrongKey", "wrongKeyValue" );
177
178 XMPMeta xmp = TikaToXMP.convert( tikaMetadata, null );
179
180 // XMP is empty
181 XMPIterator iter = xmp.iterator();
182 assertFalse( iter.hasNext() );
183 }
184
185 @Test(expected = IllegalArgumentException.class)
186 public void convert_nullInput_throw() throws TikaException {
187 TikaToXMP.convert( null );
188 }
189
190 @Test
191 public void isConverterAvailable_availableMime_true() {
192 assertTrue( TikaToXMP.isConverterAvailable( OOXML_MIMETYPE ) );
193 }
194
195 @Test
196 public void isConverterAvailable_noAvailableMime_false() {
197 assertFalse( TikaToXMP.isConverterAvailable( GENERIC_MIMETYPE ) );
198 }
199
200 @Test
201 public void isConverterAvailable_nullInput_false() {
202 assertFalse( TikaToXMP.isConverterAvailable( null ) );
203 }
204
205 @Test
206 public void getConverter_ConverterAvailable_class() throws TikaException {
207 ITikaToXMPConverter converter = TikaToXMP.getConverter( OOXML_MIMETYPE );
208 assertNotNull( converter );
209 assertTrue( converter instanceof MSOfficeXMLConverter );
210 }
211
212 @Test
213 public void getConverter_noConverterAvailable_null() throws TikaException {
214 ITikaToXMPConverter converter = TikaToXMP.getConverter( GENERIC_MIMETYPE );
215 assertNull( converter );
216 }
217
218 @Test(expected = IllegalArgumentException.class)
219 public void getConverter_nullInput_throw() throws TikaException {
220 TikaToXMP.getConverter( null );
221 }
222 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.xmp;
17
18 import static org.junit.Assert.*;
19
20 import java.util.Date;
21 import java.util.Properties;
22
23 import org.apache.tika.exception.TikaException;
24 import org.apache.tika.metadata.DublinCore;
25 import org.apache.tika.metadata.Metadata;
26 import org.apache.tika.metadata.Property;
27 import org.apache.tika.metadata.PropertyTypeException;
28 import org.apache.tika.metadata.TikaCoreProperties;
29 import org.apache.tika.metadata.XMPRights;
30 import org.junit.Before;
31 import org.junit.Test;
32
33 import com.adobe.xmp.XMPConst;
34 import com.adobe.xmp.XMPException;
35 import com.adobe.xmp.XMPMeta;
36 import com.adobe.xmp.XMPUtils;
37 import com.adobe.xmp.properties.XMPProperty;
38
39 public class XMPMetadataTest {
40 private Metadata tikaMetadata;
41 private XMPMetadata xmpMeta;
42
43 private static final String GENERIC_MIMETYPE = "generic/mimetype";
44
45 // --- SETUP ---
46 @Before
47 public void setUp() throws Exception {
48 XMPMetadata.registerNamespace( DublinCore.NAMESPACE_URI_DC_TERMS,
49 DublinCore.PREFIX_DC_TERMS );
50 xmpMeta = new XMPMetadata();
51 tikaMetadata = new Metadata();
52 setupMetadata( tikaMetadata );
53 }
54
55 private void setupMetadata(Metadata metadata) {
56 // simple property
57 metadata.set( TikaCoreProperties.FORMAT, GENERIC_MIMETYPE );
58 // language alternative
59 metadata.set( TikaCoreProperties.TITLE, "title" );
60 // array
61 metadata.set( TikaCoreProperties.KEYWORDS, new String[] { "keyword1", "keyword2" } );
62 // date
63 metadata.set( TikaCoreProperties.MODIFIED, "2001-01-01T01:01" );
64 // int simple property
65 metadata.set( Property.internalInteger( "xmp:Integer" ), "2" );
66 }
67
68 // --- HELPER ---
69 private void checkArrayValues(String[] values, String baseValue) {
70 int i = 1;
71 for (String value : values) {
72 assertEquals( baseValue + i, value );
73 i++;
74 }
75 }
76
77 // --- TESTS ---
78 @Test
79 public void process_genericConversion_ok() throws TikaException, XMPException {
80 xmpMeta.process( tikaMetadata, GENERIC_MIMETYPE );
81
82 XMPMeta xmp = xmpMeta.getXMPData();
83
84 // check simple property
85 XMPProperty prop = xmp.getProperty( XMPConst.NS_DC, "format" );
86 assertNotNull( prop );
87 assertEquals( GENERIC_MIMETYPE, prop.getValue() );
88
89 // check lang alt
90 prop = xmp.getLocalizedText( XMPConst.NS_DC, "title", null, XMPConst.X_DEFAULT );
91 assertNotNull( prop );
92 assertEquals( "title", prop.getValue() );
93
94 // check array
95 prop = xmp.getArrayItem( XMPConst.NS_DC, "subject", 1 );
96 assertNotNull( prop );
97 assertEquals( "keyword1", prop.getValue() );
98 prop = xmp.getArrayItem( XMPConst.NS_DC, "subject", 2 );
99 assertNotNull( prop );
100 assertEquals( "keyword2", prop.getValue() );
101 }
102
103 @Test
104 public void isMultiValued_multiProp_true() throws TikaException {
105 xmpMeta.process( tikaMetadata );
106
107 assertTrue( xmpMeta.isMultiValued( TikaCoreProperties.KEYWORDS ) );
108 }
109
110 @Test
111 public void isMultiValued_simpleProp_false() throws TikaException {
112 xmpMeta.process( tikaMetadata );
113
114 assertFalse( xmpMeta.isMultiValued( TikaCoreProperties.FORMAT ) );
115 }
116
117 @Test
118 public void get_simpleProp_valueReturned() throws TikaException {
119 xmpMeta.process( tikaMetadata );
120
121 assertEquals( GENERIC_MIMETYPE, xmpMeta.get( TikaCoreProperties.FORMAT ) );
122 }
123
124 @Test
125 public void get_arrayProp_firstValueReturned() throws TikaException {
126 xmpMeta.process( tikaMetadata );
127
128 assertEquals( "keyword1", xmpMeta.get( TikaCoreProperties.KEYWORDS ) );
129 }
130
131 @Test
132 public void get_notExistingProp_null() throws TikaException {
133 assertNull( xmpMeta.get( TikaCoreProperties.FORMAT ) );
134 }
135
136 @Test(expected = PropertyTypeException.class)
137 public void get_nullInput_throw() {
138 String notInitialized = null;
139 xmpMeta.get( notInitialized );
140 }
141
142 @Test(expected = PropertyTypeException.class)
143 public void get_notQualifiedKey_throw() {
144 xmpMeta.get( "wrongKey" );
145 }
146
147 @Test(expected = PropertyTypeException.class)
148 public void get_unknownPrefixKey_throw() {
149 xmpMeta.get( "unknown:key" );
150 }
151
152 @Test
153 public void getInt_IntegerProperty_valueReturned() throws TikaException {
154 xmpMeta.process( tikaMetadata );
155
156 assertEquals( new Integer( 2 ), xmpMeta.getInt( Property.get( "xmp:Integer" ) ) );
157 }
158
159 @Test
160 public void getDate_DateProperty_valueReturned() throws TikaException, XMPException {
161 xmpMeta.process( tikaMetadata );
162
163 Date date = XMPUtils.convertToDate( "2001-01-01T01:01" ).getCalendar().getTime();
164 assertTrue( date.equals( xmpMeta.getDate( TikaCoreProperties.MODIFIED ) ) );
165 }
166
167 @Test
168 public void getValues_arrayProperty_allElementsReturned() throws TikaException {
169 xmpMeta.process( tikaMetadata );
170
171 String[] values = xmpMeta.getValues( TikaCoreProperties.KEYWORDS );
172 assertEquals( 2, values.length );
173
174 checkArrayValues( values, "keyword" );
175 }
176
177 @Test
178 public void testSetAll() {
179 Properties props = new Properties();
180 props.put( TikaCoreProperties.FORMAT.getName(), "format" );
181 props.put( TikaCoreProperties.KEYWORDS.getName(), "keyword" );
182
183 xmpMeta.setAll( props );
184
185 assertEquals( "format", xmpMeta.get( TikaCoreProperties.FORMAT ) );
186
187 String[] values = xmpMeta.getValues( TikaCoreProperties.KEYWORDS );
188 assertEquals( 1, values.length );
189
190 assertEquals( "keyword", values[0] );
191 }
192
193 @Test
194 public void set_simpleProp_ok() {
195 xmpMeta.set( TikaCoreProperties.FORMAT, GENERIC_MIMETYPE );
196
197 assertEquals( GENERIC_MIMETYPE, xmpMeta.get( TikaCoreProperties.FORMAT ) );
198 }
199
200 @Test(expected = PropertyTypeException.class)
201 public void set_nullInput_throw() {
202 String notInitialized = null;
203 xmpMeta.set( notInitialized, "value" );
204 }
205
206 @Test(expected = PropertyTypeException.class)
207 public void set_notQualifiedKey_throw() {
208 xmpMeta.set( "wrongKey", "value" );
209 }
210
211 @Test(expected = PropertyTypeException.class)
212 public void set_unknownPrefixKey_throw() {
213 xmpMeta.set( "unknown:key", "value" );
214 }
215
216 @Test
217 public void set_arrayProperty_ok() {
218 xmpMeta.set( TikaCoreProperties.KEYWORDS, new String[] { "keyword1", "keyword2" } );
219
220 String[] values = xmpMeta.getValues( TikaCoreProperties.KEYWORDS );
221 assertEquals( 2, values.length );
222
223 checkArrayValues( values, "keyword" );
224 }
225
226 @Test(expected = PropertyTypeException.class)
227 public void set_simplePropWithMultipleValues_throw() {
228 xmpMeta.set( TikaCoreProperties.FORMAT, new String[] { "value1", "value2" } );
229 }
230
231 @Test
232 public void remove_existingProperty_propertyRemoved() throws TikaException {
233 xmpMeta.process( tikaMetadata );
234
235 assertNotNull( xmpMeta.get( TikaCoreProperties.FORMAT ) );
236
237 xmpMeta.remove( TikaCoreProperties.FORMAT );
238
239 assertNull( xmpMeta.get( TikaCoreProperties.FORMAT ) );
240 }
241
242 @Test
243 public void size_numberOfNamespacesReturned() throws TikaException {
244 xmpMeta.process( tikaMetadata );
245
246 assertEquals( 3, xmpMeta.size() );
247
248 xmpMeta.set( XMPRights.OWNER, "owner" );
249
250 assertEquals( 4, xmpMeta.size() );
251 }
252
253 }