Codebase list tika / upstream/1.18
New upstream version 1.18 Emmanuel Bourg 5 years ago
80 changed file(s) with 3469 addition(s) and 296 deletion(s). Raw diff Collapse all Expand all
0 Release 1.17 - December 8, 2017
0 Release 1.18 - 4/20/2018
1
2 * Upgrade Jackson to 2.9.5 (TIKA-2634).
3
4 * Add support for brotli (TIKA-2621).
5
6 * Upgrade PDFBox to 2.0.9 and include new jbig2-imageio
7 from org.apache.pdfbox (TIKA-2579 and TIKA-2607).
8
9 * Support for TIFF images in PDF files (TIKA-2338)
10
11 * Detection of fully encrypted 7z files (TIKA-2568)
12
13 * Various new mimes and typo fixes in tika-mimetypes.xml
14 via Andreas Meier (TIKA-2527).
15
16 * Revert to listenForAllRecords=false in ExcelExtractor
17 via Grigoriy Alekseev (TIKA-2590)
18
19 * Add workaround to identify TIFFs that might confuse
20 commons-compress's tar detection via Daniel Schmidt
21 (TIKA-2591)
22
23 * Ignore non-IANA supported charsets in HTML meta-headers
24 during charset detection in HTMLEncodingDetector
25 via Andreas Meier (TIKA-2592)
26
27 * Add detection and parsing of zstd (if user provides
28 com.github.luben:zstd-jni) via Andreas Meier (TIKA-2576)
29
30 * Allow for RFC822 detection for files starting with "dkim-"
31 and/or "x-" via Andreas Meier (TIKA-2578 and TIKA-2587)
32
33 * Extract xlsx files embedded in OLE objects within PPT and PPTX
34 via Brian McColgan (TIKA-2588).
35
36 * Extract files embedded in HTML and javascript inside HTML
37 that are stored in the Data URI scheme (TIKA-2563).
38
39 * Extract text from grouped text boxes in PPT (TIKA-2569).
40
41 * Extract language metadata item from PDF files via Matt Sheppard (TIKA-2559)
42
43 * RFC822 with multipart/mixed, first text element should be treated
44 as the main body of the email, not an attachment (TIKA-2547).
45
46 * Swap out com.tdunning:json for com.github.openjson:openjson to avoid
47 jar conflicts (TIKA-2556).
48
49 * No longer hardcode HtmlParser for XML files in tika-server (TIKA-2551).
50
51 * Require Java 8 (TIKA-2553).
52
53 * Add a parser for XPS (TIKA-2524).
54
55 * Mime magic for Dolby Digital AC3 and EAC3 files
56
57 * Fixed bug where TesseractOCRParser ignores configured ImageMagickPath,
58 and set rotation script to ignore Python warnings (TIKA-2509)
59
60 * Upgrade geo-apis to 3.0.1 (TIKA-2535).
61
62 * Added local Docker image build using dockerfile-maven-plugin to allow
63 images to be built from source (TIKA-1518).
64
65 Release 1.17 - 12/8/2017
166
267 ***NOTE: THIS IS THE LAST VERSION OF TIKA THAT WILL RUN
368 ON Java 7. The next versions will require Java 8***
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.17</version>
27 <version>1.18</version>
2828 <relativePath>tika-parent/pom.xml</relativePath>
2929 </parent>
3030
103103 <include name="tika-eval/target/tika-eval-${project.version}.jar*" />
104104 </fileset>
105105 </copy>
106 <checksum algorithm="MD5" fileext=".md5">
106 <checksum algorithm="SHA-512" fileext=".sha512">
107107 <fileset dir="${basedir}/target/${project.version}">
108108 <include name="*.zip" />
109109 <include name="*.?ar" />
110110 </fileset>
111111 </checksum>
112 <checksum algorithm="SHA1" fileext=".sha">
113 <fileset dir="${basedir}/target/${project.version}">
114 <include name="*.zip" />
115 <include name="*.?ar" />
116 </fileset>
117 </checksum>
118 <checksum file="${basedir}/target/${project.version}/tika-${project.version}-src.zip" algorithm="SHA1" property="checksum" />
112 <checksum file="${basedir}/target/${project.version}/tika-${project.version}-src.zip" algorithm="SHA-512" property="checksum" />
119113 <echo file="${basedir}/target/vote.txt">
120114 From: ${username}@apache.org
121115 To: dev@tika.apache.org
128122 The release candidate is a zip archive of the sources in:
129123 https://github.com/apache/tika/tree/{project.version}-rcN/
130124
131 The SHA1 checksum of the archive is
125 The SHA-512 checksum of the archive is
132126 ${checksum}.
133127
134128 In addition, a staged maven repository is available here:
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.17</version>
27 <version>1.18</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030
1616
1717 package org.apache.tika.cli;
1818
19
20 import org.apache.commons.lang.SystemUtils;
21
1922 import java.io.IOException;
2023 import java.nio.file.Files;
2124 import java.nio.file.Path;
4043 static Pattern JVM_OPTS_PATTERN = Pattern.compile("^(--?)J(.+)");
4144
4245 protected static String[] build(String[] args) throws IOException {
46
4347 Map<String, String> processArgs = new LinkedHashMap<String, String>();
4448 Map<String, String> jvmOpts = new LinkedHashMap<String,String>();
4549 //take the args, and divide them into process args and options for
5256 //maybe the user specified a different classpath?!
5357 if (! jvmOpts.containsKey("-cp") && ! jvmOpts.containsKey("--classpath")) {
5458 String cp = System.getProperty("java.class.path");
55 //need to test for " " on *nix, can't just add double quotes
56 //across platforms.
57 if (cp.contains(" ")){
58 cp = "\""+cp+"\"";
59 }
6059 jvmOpts.put("-cp", cp);
6160 }
6261
6968 }
7069 //use the log4j config file inside the app /resources/log4j_batch_process.properties
7170 if (! hasLog4j) {
72 jvmOpts.put("-Dlog4j.configuration=\"log4j_batch_process.properties\"", "");
71 jvmOpts.put("-Dlog4j.configuration=log4j_batch_process.properties", "");
7372 }
7473 //now build the full command line
7574 List<String> fullCommand = new ArrayList<String>();
7877 for (Map.Entry<String, String> e : jvmOpts.entrySet()) {
7978 fullCommand.add(e.getKey());
8079 if (e.getValue().length() > 0) {
81 fullCommand.add(e.getValue());
80 fullCommand.add(commandLineSafe(e.getValue()));
8281 }
8382 if (e.getKey().contains("java.awt.headless")) {
8483 foundHeadlessOption = true;
9392 for (Map.Entry<String, String> e : processArgs.entrySet()) {
9493 fullCommand.add(e.getKey());
9594 if (e.getValue().length() > 0) {
96 fullCommand.add(e.getValue());
95 fullCommand.add(commandLineSafe(e.getValue()));
9796 }
9897 }
9998 return fullCommand.toArray(new String[fullCommand.size()]);
99 }
100
101 protected static String commandLineSafe(String arg) {
102 if (arg == null) {
103 return arg;
104 }
105 //need to test for " " on windows, can't just add double quotes
106 //across platforms.
107 if (arg.contains(" ") && SystemUtils.IS_OS_WINDOWS) {
108 arg = "\"" + arg + "\"";
109 }
110 return arg;
100111 }
101112
102113
10401040 if (name == null) {
10411041 name = "file" + count++;
10421042 }
1043
1043 if (! inputStream.markSupported()) {
1044 inputStream = TikaInputStream.get(inputStream);
1045 }
10441046 MediaType contentType = detector.detect(inputStream, metadata);
10451047
10461048 if (name.indexOf('.')==-1 && contentType!=null) {
4040 Path testFile = null;
4141
4242 String testInputPathForCommandLine;
43 String escapedInputPathForCommandLine;
4344
4445 @Before
4546 public void init() {
5657 throw new RuntimeException("Couldn't open testFile");
5758 }
5859 testInputPathForCommandLine = testInput.toAbsolutePath().toString();
60 escapedInputPathForCommandLine = BatchCommandLineBuilder.commandLineSafe(testInputPathForCommandLine);
5961 }
6062
6163 @After
113115 assertEquals("true", attrs.get("-recursiveParserWrapper"));
114116 assertEquals("html", attrs.get("-basicHandlerType"));
115117 assertEquals("batch-config.xml", attrs.get("-bc"));
116 assertEquals(testInputPathForCommandLine, attrs.get("-inputDir"));
118 assertEquals(escapedInputPathForCommandLine, attrs.get("-inputDir"));
117119 }
118120
119121 @Test
124126
125127 String[] commandLine = BatchCommandLineBuilder.build(params);
126128 Map<String, String> attrs = mapify(commandLine);
127 assertEquals(testInputPathForCommandLine, attrs.get("-inputDir"));
129 assertEquals(escapedInputPathForCommandLine, attrs.get("-inputDir"));
128130 assertEquals(outputRoot, attrs.get("-outputDir"));
129131 }
130132
135137
136138 String[] commandLine = BatchCommandLineBuilder.build(params);
137139 Map<String, String> attrs = mapify(commandLine);
138 assertEquals(testInputPathForCommandLine, attrs.get("-inputDir"));
140 assertEquals(escapedInputPathForCommandLine, attrs.get("-inputDir"));
139141 assertEquals(outputRoot, attrs.get("-outputDir"));
140142
141143 params = new String[]{"--inputDir", testInputPathForCommandLine, "--outputDir", outputRoot};
142144
143145 commandLine = BatchCommandLineBuilder.build(params);
144146 attrs = mapify(commandLine);
145 assertEquals(testInputPathForCommandLine, attrs.get("-inputDir"));
147 assertEquals(escapedInputPathForCommandLine, attrs.get("-inputDir"));
146148 assertEquals(outputRoot, attrs.get("-outputDir"));
147149
148150 params = new String[]{"-inputDir", testInputPathForCommandLine, "-outputDir", outputRoot};
149151
150152 commandLine = BatchCommandLineBuilder.build(params);
151153 attrs = mapify(commandLine);
152 assertEquals(testInputPathForCommandLine, attrs.get("-inputDir"));
154 assertEquals(escapedInputPathForCommandLine, attrs.get("-inputDir"));
153155 assertEquals(outputRoot, attrs.get("-outputDir"));
154156 }
155157
162164 "--config="+configPath};
163165 String[] commandLine = BatchCommandLineBuilder.build(params);
164166 Map<String, String> attrs = mapify(commandLine);
165 assertEquals(testInputPathForCommandLine, attrs.get("-inputDir"));
167 assertEquals(escapedInputPathForCommandLine, attrs.get("-inputDir"));
166168 assertEquals(outputRoot, attrs.get("-outputDir"));
167169 assertEquals(configPath, attrs.get("-c"));
168170
281281 FileUtils.deleteDirectory(tempFile);
282282 }
283283 }
284
285 @Test
286 public void testExtractTgz() throws Exception {
287 //TIKA-2564
288 File tempFile = File.createTempFile("tika-test-", "");
289 tempFile.delete();
290 tempFile.mkdir();
291
292 try {
293 String[] params = {"--extract-dir="+tempFile.getAbsolutePath(),"-z", resourcePrefix + "/test-documents.tgz"};
294
295 TikaCLI.main(params);
296
297 StringBuffer allFiles = new StringBuffer();
298 for (String f : tempFile.list()) {
299 if (allFiles.length() > 0) allFiles.append(" : ");
300 allFiles.append(f);
301 }
302
303 File expectedTAR = new File(tempFile, "test-documents.tar");
304
305 assertExtracted(expectedTAR, allFiles.toString());
306 } finally {
307 FileUtils.deleteDirectory(tempFile);
308 }
309 }
310
311
284312 protected static void assertExtracted(File f, String allFiles) {
285313
286314 assertTrue(
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.extractor;
18
19 import org.apache.tika.batch.DigestingAutoDetectParserFactory;
20 import org.apache.tika.config.TikaConfig;
21 import org.apache.tika.extractor.EmbeddedDocumentUtil;
22 import org.apache.tika.parser.AutoDetectParser;
23 import org.apache.tika.parser.ParseContext;
24 import org.apache.tika.parser.Parser;
25 import org.apache.tika.parser.RecursiveParserWrapper;
26 import org.apache.tika.sax.BasicContentHandlerFactory;
27 import org.junit.Test;
28
29 import static org.junit.Assert.assertEquals;
30 import static org.junit.Assert.assertNotNull;
31
32 public class TestEmbeddedDocumentUtil {
33 //TODO -- figure out how to mock this into tika-core
34
35 @Test
36 public void testSimple() {
37 Parser p = new AutoDetectParser();
38 ParseContext parseContext = new ParseContext();
39 parseContext.set(Parser.class, p);
40 Parser txtParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(org.apache.tika.parser.txt.TXTParser.class, parseContext);
41 assertNotNull(txtParser);
42 assertEquals(org.apache.tika.parser.txt.TXTParser.class, txtParser.getClass());
43
44 }
45
46 @Test
47 public void testDoublyDecorated() {
48 Parser d = new DigestingAutoDetectParserFactory().getParser(TikaConfig.getDefaultConfig());
49 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(d,
50 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
51 ParseContext parseContext = new ParseContext();
52 parseContext.set(Parser.class, wrapper);
53 Parser txtParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(org.apache.tika.parser.txt.TXTParser.class, parseContext);
54 assertNotNull(txtParser);
55 assertEquals(org.apache.tika.parser.txt.TXTParser.class, txtParser.getClass());
56 }
57 }
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.17</version>
27 <version>1.18</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030
3434 <url>http://tika.apache.org/</url>
3535
3636 <properties>
37 <cli.version>1.3.1</cli.version>
37 <cli.version>1.4</cli.version>
3838 </properties>
3939
4040 <dependencies>
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.17</version>
27 <version>1.18</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030
7171 <groupId>org.ops4j.pax.exam</groupId>
7272 <artifactId>pax-exam-container-native</artifactId>
7373 <version>${pax.exam.version}</version>
74 <exclusions>
75 <exclusion>
76 <groupId>org.ops4j.base</groupId>
77 <artifactId>ops4j-base-util-property</artifactId>
78 </exclusion>
79 <exclusion>
80 <groupId>org.ops4j.base</groupId>
81 <artifactId>ops4j-base-lang</artifactId>
82 </exclusion>
83 </exclusions>
84 <scope>test</scope>
85 </dependency>
86 <dependency>
87 <groupId>org.ops4j.base</groupId>
88 <artifactId>ops4j-base-util-property</artifactId>
89 <version>1.5.0</version>
90 <scope>test</scope>
91 </dependency>
92 <dependency>
93 <groupId>org.ops4j.base</groupId>
94 <artifactId>ops4j-base-lang</artifactId>
95 <version>1.5.0</version>
7496 <scope>test</scope>
7597 </dependency>
7698 <dependency>
167189 sis-netcdf|
168190 sis-utility|
169191 sis-storage|
192 unit-api|
170193 apache-mime4j-core|
171194 apache-mime4j-dom|
172 jsr-275|
173195 jhighlight|
174196 java-libpst|
175197 netcdf4|
205227 android.util;resolution:=optional,
206228 com.adobe.xmp;resolution:=optional,
207229 com.adobe.xmp.properties;resolution:=optional,
230 com.github.luben.zstd;resolution:=optional,
231 com.github.openjson;resolution:=optional,
208232 com.google.protobuf;resolution:=optional,
209233 com.ibm.icu.text;resolution:=optional,
210234 com.sleepycat.je;resolution:=optional,
253277 org.apache.pdfbox.debugger;resolution:=optional,
254278 org.apache.sis;resolution:=optional,
255279 org.apache.sis.distance;resolution:=optional,
280 org.apache.sis.feature;resolution:=optional,
256281 org.apache.sis.geometry;resolution:=optional,
282 org.apache.sis.internal.feature;resolution:=optional,
283 org.apache.sis.internal.referencing;resolution:=optional,
284 org.apache.sis.parameter;resolution:=optional,
285 org.apache.sis.referencing;resolution:=optional,
257286 org.apache.tools.ant;resolution:=optional,
258287 org.apache.tools.ant.taskdefs;resolution:=optional,
259288 org.apache.tools.ant.types;resolution:=optional,
294323 org.jdom2.output;resolution:=optional,
295324 org.jdom2.filter;resolution:=optional,
296325 org.json.simple;resolution:=optional,
297 org.json;resolution:=optional,
298326 org.openxmlformats.schemas.officeDocument.x2006.math;resolution:=optional,
299327 org.openxmlformats.schemas.schemaLibrary.x2006.main;resolution:=optional,
300328 org.osgi.framework;resolution:=optional,
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.17</version>
27 <version>1.18</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030
240240 Parser returnParser = null;
241241 if (p != null) {
242242 if (p instanceof ParserDecorator) {
243 p = ((ParserDecorator)p).getWrappedParser();
243 p = findInDecorated((ParserDecorator)p, clazz);
244244 }
245245 if (equals(p, clazz)) {
246246 return p;
254254 }
255255
256256 return null;
257 }
258
259 private static Parser findInDecorated(ParserDecorator p, Class clazz) {
260 Parser candidate = p.getWrappedParser();
261 if (equals(candidate, clazz)) {
262 return candidate;
263 }
264 if (candidate instanceof ParserDecorator) {
265 candidate = findInDecorated((ParserDecorator)candidate, clazz);
266 }
267 return candidate;
257268 }
258269
259270 private static Parser findInComposite(CompositeParser p, Class clazz, ParseContext context) {
264275 return candidate;
265276 }
266277 if (candidate instanceof ParserDecorator) {
267 candidate = ((ParserDecorator)candidate).getWrappedParser();
278 candidate = findInDecorated((ParserDecorator)candidate, clazz);
268279 }
269280 if (equals(candidate, clazz)) {
270281 return candidate;
229229 break;
230230 }
231231 }
232 if (i < 0) {
233 throw new IOException("Buffer underun; expected one more byte");
234 }
232235 return v;
233236 }
234237
6969 * The unit tests for this class are in the tika-parsers module.
7070 * </p>
7171 */
72 public class RecursiveParserWrapper implements Parser {
72 public class RecursiveParserWrapper extends ParserDecorator {
7373
7474 /**
7575 * Generated serial version
125125 */
126126 public RecursiveParserWrapper(Parser wrappedParser,
127127 ContentHandlerFactory contentHandlerFactory, boolean catchEmbeddedExceptions) {
128 super(wrappedParser);
128129 this.wrappedParser = wrappedParser;
129130 this.contentHandlerFactory = contentHandlerFactory;
130131 this.catchEmbeddedExceptions = catchEmbeddedExceptions;
3030 * ({@link #characters(char[], int, int)} or
3131 * {@link #ignorableWhitespace(char[], int, int)}) passed to the decorated
3232 * content handler contain only valid XML characters. All invalid characters
33 * are replaced with spaces.
33 * are replaced with the Unicode replacement character U+FFFD (though a
34 * subclass may change this by overriding the {@link #writeReplacement(Output)} method).
3435 * <p>
3536 * The XML standard defines the following Unicode character ranges as
3637 * valid XML characters:
158158 * @return the regular expression containing the most important technical standard organizations.
159159 */
160160 public static String getOrganzationsRegex() {
161 String regex = "(" + String.join("|", organizations.keySet()) + ")";
162
163 return regex;
161 StringBuilder sb = new StringBuilder();
162 sb.append("(");
163 int i = 0;
164 for (String org : organizations.keySet()) {
165 if (i > 0) {
166 sb.append("|");
167 }
168 sb.append(org);
169 i++;
170 }
171 sb.append(")");
172 return sb.toString();
164173 }
165174 }
118118 <mime-type type="application/cnrp+xml"/>
119119 <mime-type type="application/commonground"/>
120120 <mime-type type="application/conference-info+xml"/>
121
122 <mime-type type="application/coreldraw">
123 <alias type="application/x-coreldraw"/>
124 <alias type="application/x-cdr"/>
125 <alias type="application/cdr"/>
126 <alias type="image/x-cdr"/>
127 <alias type="image/cdr"/>
128 <_comment>CorelDraw</_comment>
129 <_comment>cdr: CorelDraw</_comment>
130 <_comment>des: CorelDraw X4 and newer</_comment>
131 <magic priority="60">
132 <match value="RIFF" type="string" offset="0">
133 <match value="CDR" type="string" offset="8" />
134 <match value="cdr" type="string" offset="8" />
135 <match value="DES" type="string" offset="8" />
136 <match value="des" type="string" offset="8" />
137 </match>
138 </magic>
139 <glob pattern="*.cdr"/>
140 </mime-type>
141
121142 <mime-type type="application/cpl+xml"/>
122143 <mime-type type="application/csta+xml"/>
123144 <mime-type type="application/cstadata+xml"/>
347368 <alias type="application/mac-binhex"/>
348369 <alias type="application/binhex"/>
349370 <magic priority="50">
350 <match value="must\ be\ converted\ with\ BinHex" type="string" offset="11"/>
371 <match value="must be converted with BinHex" type="string" offset="11"/>
351372 </magic>
352373 <glob pattern="*.hqx"/>
353374 </mime-type>
839860 <mime-type type="application/smil+xml">
840861 <alias type="application/smil"/>
841862 <_comment>SMIL Multimedia</_comment>
863 <root-XML localName="smil"/>
864 <sub-class-of type="application/xml"/>
842865 <glob pattern="*.smi"/>
843866 <glob pattern="*.smil"/>
844867 <glob pattern="*.sml"/>
13901413 <mime-type type="application/vnd.intu.qfx">
13911414 <glob pattern="*.qfx"/>
13921415 </mime-type>
1416 <mime-type type="application/vnd.iptc.g2.catalogitem+xml"/>
13931417 <mime-type type="application/vnd.iptc.g2.conceptitem+xml"/>
13941418 <mime-type type="application/vnd.iptc.g2.knowledgeitem+xml"/>
13951419 <mime-type type="application/vnd.iptc.g2.newsitem+xml"/>
1420
1421 <mime-type type="application/vnd.iptc.g2.newsmessage+xml">
1422 <root-XML localName="newsMessage"/>
1423 <root-XML localName="newsMessage" namespaceURI="http://iptc.org/std/nar/2006-10-01/"/>
1424 <sub-class-of type="application/xml"/>
1425 <_comment>XML syntax for IPTC NewsMessages</_comment>
1426 <glob pattern="*.nar"/>
1427 </mime-type>
1428
13961429 <mime-type type="application/vnd.iptc.g2.packageitem+xml"/>
1430 <mime-type type="application/vnd.iptc.g2.planningitem+xml"/>
1431
13971432 <mime-type type="application/vnd.ipunplugged.rcprofile">
13981433 <glob pattern="*.rcprofile"/>
13991434 </mime-type>
27742809 <mime-type type="application/wspolicy+xml">
27752810 <glob pattern="*.wspolicy"/>
27762811 </mime-type>
2812
2813 <mime-type type="image/x-tga">
2814 <alias type="image/x-targa"/>
2815 <!-- trailer bytes: 54 52 55 45 56 49 53 49 4F 4E 2D 58 46 49 4C 45 2E 00
2816 trailer as string: TRUEVISION-XFILE\\x2E\\x00
2817 Some .tga files may be conflicting with application/x-123 recognition,
2818 therefore this mime-type must be set in front of application/x-123 -->
2819 <_comment>Targa image data</_comment>
2820 <magic priority="90">
2821 <match value="0x01010000" type="big32" offset="1" >
2822 <match value=".*[\\x54\\x52\\x55\\x45\\x56\\x49\\x53\\x49\\x4F\\x4E\\x2D\\x58\\x46\\x49\\x4C\\x45\\x2E\\x00]" type="regex" offset="8" />
2823 </match>
2824 <match value="0x00020000" type="big32" offset="1" >
2825 <match value=".*[\\x54\\x52\\x55\\x45\\x56\\x49\\x53\\x49\\x4F\\x4E\\x2D\\x58\\x46\\x49\\x4C\\x45\\x2E\\x00]" type="regex" offset="8" />
2826 </match>
2827 <match value="0x00030000" type="big32" offset="1" >
2828 <match value=".*[\\x54\\x52\\x55\\x45\\x56\\x49\\x53\\x49\\x4F\\x4E\\x2D\\x58\\x46\\x49\\x4C\\x45\\x2E\\x00]" type="regex" offset="8" />
2829 </match>
2830 </magic>
2831 <glob pattern="*.tga"/>
2832 <glob pattern="*.icb"/>
2833 <glob pattern="*.vda"/>
2834 <!-- <glob pattern="*.vst"/> --> <!-- conflicting with application/vnd.visio-->
2835 </mime-type>
27772836
27782837 <mime-type type="application/x-123">
27792838 <magic priority="50">
30753134 <match value="bplist" type="string" offset="0"/>
30763135 </magic>
30773136 </mime-type>
3137 <mime-type type="application/x-gtar">
3138 <_comment>GNU tar Compressed File Archive (GNU Tape Archive)</_comment>
3139 <magic priority="50">
3140 <!-- GNU tar archive -->
3141 <match value="ustar \0" type="string" offset="257" />
3142 </magic>
3143 <glob pattern="*.gtar"/>
3144 <sub-class-of type="application/x-tar"/>
3145 </mime-type>
3146
3147 <mime-type type="application/x-brotli">
3148 <glob pattern="*.br" />
3149 <glob pattern="*.brotli" />
3150 </mime-type>
30783151
30793152 <mime-type type="application/x-bzip">
30803153 <magic priority="40">
34523525 <glob pattern="*.tgz" />
34533526 <glob pattern="*-gz" />
34543527 </mime-type>
3455
3528 <mime-type type="application/zstd">
3529 <_comment>https://en.wikipedia.org/wiki/Zstandard</_comment>
3530 <_comment>https://tools.ietf.org/id/draft-kucherawy-dispatch-zstd-01.html</_comment>
3531 <magic priority="50">
3532 <match value="0xFD2FB528" type="little32" offset="0"/>
3533 </magic>
3534 <glob pattern="*.zstd"/>
3535 </mime-type>
34563536 <mime-type type="application/x-hdf">
34573537 <_comment>Hierarchical Data Format File</_comment>
34583538 <magic priority="50">
35913671 <match value="-lz5-" type="string" offset="2"/>
35923672 </magic>
35933673 </mime-type>
3674
3675 <mime-type type="application/x-lz4">
3676 <_comment>First match LZ4 Frame</_comment>
3677 <_comment>Second match Legacy Frame</_comment>
3678 <magic priority="60">
3679 <match value="0x184d2204" type="little32" offset="0" />
3680 <match value="0x184c2102" type="little32" offset="0" />
3681 </magic>
3682 <glob pattern="*.lz4"/>
3683 </mime-type>
3684
3685 <mime-type type="application/x-lzip">
3686 <_comment>Lzip (LZMA) compressed archive</_comment>
3687 <magic priority="50">
3688 <match value="\x4c\x5a\x49\x50" type="string" offset="0"/>
3689 </magic>
3690 <glob pattern="*.lz"/>
3691 </mime-type>
3692
3693 <mime-type type="application/x-lzma">
3694 <_comment>LZMA compressed archive</_comment>
3695 <glob pattern="*.lzma"/>
3696 </mime-type>
35943697
35953698 <mime-type type="application/x-mobipocket-ebook">
35963699 <acronym>MOBI</acronym>
40024105 <acronym>ESRI Shapefiles</acronym>
40034106 <_comment>ESRI Shapefiles</_comment>
40044107 <magic priority="60">
4005 <match value="0x0000270a" type="big32" offset="2" />
4108 <match value="0x0000270a" type="big32" offset="0" />
40064109 </magic>
40074110 <glob pattern="*.shp"/>
40084111 </mime-type>
47404843 <glob pattern="*.aac"/>
47414844 </mime-type>
47424845
4743 <mime-type type="audio/x-adbcm">
4846 <mime-type type="audio/x-adpcm">
47444847 <magic priority="20">
47454848 <match value=".snd" type="string" offset="0">
47464849 <match value="23" type="big32" offset="12"/>
47664869 <glob pattern="*.aiff"/>
47674870 <glob pattern="*.aifc"/>
47684871 </mime-type>
4872
4873 <mime-type type="audio/x-caf">
4874 <_comment>Core Audio Format</_comment>
4875 <_comment>com.apple.coreaudio-format</_comment>
4876 <magic priority="60">
4877 <match value="caff" type="string" offset="0" />
4878 </magic>
4879 <glob pattern="*.caf"/>
4880 </mime-type>
47694881
47704882 <mime-type type="audio/x-dec-basic">
47714883 <magic priority="20">
47814893 </magic>
47824894 </mime-type>
47834895
4784 <mime-type type="audio/x-dec-adbcm">
4896 <mime-type type="audio/x-dec-adpcm">
47854897 <magic priority="20">
47864898 <match value="0x0064732E" type="big32" offset="0">
47874899 <match value="23" type="big32" offset="12"/>
56125724 <magic priority="50">
56135725 <match value="Delivered-To:" type="string" offset="0"/>
56145726 <match value="Status:" type="string" offset="0"/>
5615 <match value="X-Mozilla-Keys:" type="string" offset="0"/>
5616 <match value="X-Mozilla-Status:" type="string" offset="0"/>
5617 <match value="X-Mozilla-Status2:" type="string" offset="0"/>
56185727 <match value="Relay-Version:" type="stringignorecase" offset="0"/>
56195728 <match value="#!\ rnews" type="string" offset="0"/>
56205729 <match value="N#!\ rnews" type="string" offset="0"/>
56245733 <match value="From:" type="stringignorecase" offset="0"/>
56255734 <match value="Received:" type="stringignorecase" offset="0"/>
56265735 <match value="Message-ID:" type="stringignorecase" offset="0"/>
5736 <match value="\nReturn-Path:" type="stringignorecase" offset="0:1000"/>
5737 <match value="\nX-Originating-IP:" type="stringignorecase" offset="0:1000"/>
5738 <match value="\nReceived:" type="stringignorecase" offset="0:1000"/>
56275739 <match value="Date:" type="string" offset="0"/>
56285740 <match value="User-Agent:" type="string" offset="0"/>
56295741 <match value="MIME-Version:" type="stringignorecase" offset="0"/>
56315743 <match value="X-Notes-Item:" type="string" offset="0">
56325744 <match value="Message-ID:" type="string" offset="0:8192"/>
56335745 </match>
5746 <match value="X-" type="stringignorecase" offset="0">
5747 <match value="\nMessage-ID:" type="string" offset="0:8192"/>
5748 <match value="\nFrom:" type="stringignorecase" offset="0:8192"/>
5749 <match value="\nTo:" type="stringignorecase" offset="0:8192"/>
5750 <match value="\nSubject:" type="string" offset="0:8192"/>
5751 <match value="\nMIME-Version:" type="stringignorecase" offset="0:8192"/>
5752 </match>
5753 <match value="DKIM-" type="string" offset="0">
5754 <match value="\nMessage-ID:" type="string" offset="0:8192"/>
5755 <match value="\nFrom:" type="stringignorecase" offset="0:8192"/>
5756 <match value="\nTo:" type="stringignorecase" offset="0:8192"/>
5757 <match value="\nSubject:" type="string" offset="0:8192"/>
5758 <match value="\nMIME-Version:" type="stringignorecase" offset="0:8192"/>
5759 </match>
5760 </magic>
5761 <magic priority="40">
5762 <!-- lower priority than message/news -->
5763 <match value="\nMessage-ID:" type="stringignorecase" offset="0:1000"/>
56345764 </magic>
56355765 <glob pattern="*.eml"/>
56365766 <glob pattern="*.mime"/>
212212 return getRecursiveMetadata(filePath, new ParseContext());
213213 }
214214
215 protected List<Metadata> getRecursiveMetadata(String filePath, Metadata metadata) throws Exception {
216 return getRecursiveMetadata(filePath, new ParseContext(), metadata);
217 }
218
219 protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context, Metadata metadata) throws Exception {
220 Parser p = new AutoDetectParser();
221 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p,
222 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
223 try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) {
224 wrapper.parse(is, new DefaultHandler(), metadata, context);
225 }
226 return wrapper.getMetadata();
227 }
228
215229 protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception {
216230 Parser p = new AutoDetectParser();
217231 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p,
2323 <parent>
2424 <groupId>org.apache.tika</groupId>
2525 <artifactId>tika-parent</artifactId>
26 <version>1.17</version>
26 <version>1.18</version>
2727 <relativePath>../tika-parent/pom.xml</relativePath>
2828 </parent>
2929
4545 <artifactId>tika-parsers</artifactId>
4646 <version>${project.version}</version>
4747 <scope>provided</scope>
48 <exclusions>
49 <exclusion>
50 <groupId>joda-time</groupId>
51 <artifactId>joda-time</artifactId>
52 </exclusion>
53 </exclusions>
4854 </dependency>
4955 <dependency>
5056 <groupId>junit</groupId>
6369 <groupId>org.json</groupId>
6470 <artifactId>json</artifactId>
6571 </exclusion>
66 </exclusions>
72 <exclusion>
73 <groupId>com.google.guava</groupId>
74 <artifactId>guava</artifactId>
75 </exclusion>
76 <exclusion>
77 <groupId>org.deeplearning4j</groupId>
78 <artifactId>deeplearning4j-modelimport</artifactId>
79 </exclusion>
80 <exclusion>
81 <groupId>org.apache.commons</groupId>
82 <artifactId>commons-compress</artifactId>
83 </exclusion>
84 <exclusion>
85 <groupId>org.apache.commons</groupId>
86 <artifactId>commons-math3</artifactId>
87 </exclusion>
88 <exclusion>
89 <groupId>commons-io</groupId>
90 <artifactId>commons-io</artifactId>
91 </exclusion>
92 </exclusions>
93 </dependency>
94 <dependency>
95 <groupId>org.apache.commons</groupId>
96 <artifactId>commons-math3</artifactId>
97 <version>3.4.1</version>
6798 </dependency>
6899 <dependency>
69100 <groupId>org.deeplearning4j</groupId>
74105 <groupId>org.deeplearning4j</groupId>
75106 <artifactId>deeplearning4j-keras</artifactId>
76107 </exclusion>
108 <exclusion>
109 <groupId>org.bytedeco</groupId>
110 <artifactId>javacpp</artifactId>
111 </exclusion>
112 <exclusion>
113 <groupId>joda-time</groupId>
114 <artifactId>joda-time</artifactId>
115 </exclusion>
77116 </exclusions>
78117 </dependency>
79118 <dependency>
80119 <groupId>org.datavec</groupId>
81120 <artifactId>datavec-data-image</artifactId>
82121 <version>${dl4j.version}</version>
122 <exclusions>
123 <exclusion>
124 <groupId>com.google.guava</groupId>
125 <artifactId>guava</artifactId>
126 </exclusion>
127 <exclusion>
128 <groupId>org.bytedeco</groupId>
129 <artifactId>javacpp</artifactId>
130 </exclusion>
131 <exclusion>
132 <groupId>org.apache.commons</groupId>
133 <artifactId>commons-math3</artifactId>
134 </exclusion>
135 <exclusion>
136 <groupId>commons-io</groupId>
137 <artifactId>commons-io</artifactId>
138 </exclusion>
139 <exclusion>
140 <groupId>com.github.jai-imageio</groupId>
141 <artifactId>jai-imageio-core</artifactId>
142 </exclusion>
143 </exclusions>
83144 </dependency>
84145 <dependency>
85146 <groupId>org.nd4j</groupId>
86147 <artifactId>nd4j-native-platform</artifactId>
87148 <version>${dl4j.version}</version>
149 <exclusions>
150 <exclusion>
151 <groupId>org.bytedeco</groupId>
152 <artifactId>javacpp</artifactId>
153 </exclusion>
154 </exclusions>
155 </dependency>
156 <dependency>
157 <groupId>org.bytedeco</groupId>
158 <artifactId>javacpp</artifactId>
159 <version>1.3.2</version>
88160 </dependency>
89161 <dependency>
90162 <groupId>org.apache.commons</groupId>
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.17</version>
27 <version>1.18</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.17</version>
27 <version>1.18</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030
8989 <groupId>org.apache.jackrabbit</groupId>
9090 <artifactId>jackrabbit-jcr-server</artifactId>
9191 <version>2.3.6</version>
92 <exclusions>
93 <exclusion>
94 <groupId>org.apache.tika</groupId>
95 <artifactId>tika-core</artifactId>
96 </exclusion>
97 <exclusion>
98 <groupId>commons-codec</groupId>
99 <artifactId>commons-codec</artifactId>
100 </exclusion>
101 <exclusion>
102 <groupId>commons-io</groupId>
103 <artifactId>commons-io</artifactId>
104 </exclusion>
105 </exclusions>
92106 </dependency>
93107 <dependency>
94108 <groupId>org.apache.jackrabbit</groupId>
95109 <artifactId>jackrabbit-core</artifactId>
96110 <version>2.3.6</version>
111 <exclusions>
112 <exclusion>
113 <groupId>org.apache.tika</groupId>
114 <artifactId>tika-core</artifactId>
115 </exclusion>
116 <exclusion>
117 <groupId>commons-io</groupId>
118 <artifactId>commons-io</artifactId>
119 </exclusion>
120 <exclusion>
121 <groupId>org.apache.lucene</groupId>
122 <artifactId>lucene-core</artifactId>
123 </exclusion>
124 </exclusions>
97125 </dependency>
98126 <dependency>
99127 <groupId>org.apache.lucene</groupId>
108136 <dependency>
109137 <groupId>org.springframework</groupId>
110138 <artifactId>spring-context</artifactId>
111 <version>3.0.2.RELEASE</version>
139 <version>3.2.16.RELEASE</version>
112140 <exclusions>
113141 <exclusion>
114142 <groupId>commons-logging</groupId>
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.17</version>
27 <version>1.18</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030
8181
8282 Iterator<FileTypeDetector> iterator = serviceLoader.iterator();
8383 assertTrue(iterator.hasNext());
84
84
85 boolean foundTika = false;
8586 while(iterator.hasNext()) {
8687 FileTypeDetector fileTypeDetector = iterator.next();
8788 assertNotNull(fileTypeDetector);
88 assertTrue(fileTypeDetector instanceof TikaFileTypeDetector);
89 if (fileTypeDetector instanceof TikaFileTypeDetector) {
90 foundTika = true;
91 }
8992 }
93 //o.a.sis.internal.storage.StoreTypeDetector appears with latest upgrade
94 //check that TikaFileTypeDetector appears at all
95 assertTrue(foundTika);
9096 }
9197 }
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.17</version>
27 <version>1.18</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030
4242 <dependency>
4343 <groupId>com.optimaize.languagedetector</groupId>
4444 <artifactId>language-detector</artifactId>
45 <version>0.5</version>
45 <version>0.6</version>
46 <exclusions>
47 <exclusion>
48 <groupId>com.google.guava</groupId>
49 <artifactId>guava</artifactId>
50 </exclusion>
51 </exclusions>
52 </dependency>
53 <!-- exclude and then add back in to avoid
54 conflicts with edu.ucar:cdm in tika-parsers -->
55 <dependency>
56 <groupId>com.google.guava</groupId>
57 <artifactId>guava</artifactId>
58 <version>17.0</version>
4659 </dependency>
4760 <dependency>
4861 <groupId>org.apache.cxf</groupId>
5265 <dependency>
5366 <groupId>com.google.code.gson</groupId>
5467 <artifactId>gson</artifactId>
55 <version>2.6.1</version>
68 <version>${gson.version}</version>
5669 </dependency>
5770
5871 <!-- Test dependencies -->
2323 <parent>
2424 <groupId>org.apache.tika</groupId>
2525 <artifactId>tika-parent</artifactId>
26 <version>1.17</version>
26 <version>1.18</version>
2727 <relativePath>../tika-parent/pom.xml</relativePath>
2828 </parent>
2929
6363 <groupId>edu.usc.ir</groupId>
6464 <artifactId>age-predictor-api</artifactId>
6565 <version>1.0</version>
66 </dependency>
67
66 <exclusions>
67 <exclusion>
68 <groupId>com.google.guava</groupId>
69 <artifactId>guava</artifactId>
70 </exclusion>
71 <exclusion>
72 <groupId>commons-lang</groupId>
73 <artifactId>commons-lang</artifactId>
74 </exclusion>
75 <exclusion>
76 <groupId>commons-compress</groupId>
77 <artifactId>commons-compress</artifactId>
78 </exclusion>
79 <exclusion>
80 <groupId>org.xerial.snappy</groupId>
81 <artifactId>snappy-java</artifactId>
82 </exclusion>
83 <exclusion>
84 <groupId>com.fasterxml.jackson.core</groupId>
85 <artifactId>jackson-core</artifactId>
86 </exclusion>
87 <exclusion>
88 <groupId>com.fasterxml.jackson.core</groupId>
89 <artifactId>jackson-databind</artifactId>
90 </exclusion>
91 <exclusion>
92 <groupId>com.fasterxml.jackson.core</groupId>
93 <artifactId>jackson-annotations</artifactId>
94 </exclusion>
95 <exclusion>
96 <groupId>org.codehaus.jackson</groupId>
97 <artifactId>jackson-mapper-asl</artifactId>
98 </exclusion>
99 <exclusion>
100 <groupId>log4j</groupId>
101 <artifactId>log4j</artifactId>
102 </exclusion>
103 <exclusion>
104 <groupId>commons-codec</groupId>
105 <artifactId>commons-codec</artifactId>
106 </exclusion>
107 <exclusion>
108 <groupId>commons-io</groupId>
109 <artifactId>commons-io</artifactId>
110 </exclusion>
111 <exclusion>
112 <groupId>com.thoughtworks.paranamer</groupId>
113 <artifactId>paranamer</artifactId>
114 </exclusion>
115 <exclusion>
116 <groupId>commons-net</groupId>
117 <artifactId>commons-net</artifactId>
118 </exclusion>
119 <exclusion>
120 <groupId>org.scala-lang</groupId>
121 <artifactId>scala-library</artifactId>
122 </exclusion>
123 <exclusion>
124 <groupId>org.scala-lang</groupId>
125 <artifactId>scala-reflect</artifactId>
126 </exclusion>
127 <exclusion>
128 <groupId>org.scalamacros</groupId>
129 <artifactId>quasiquotes_2.10</artifactId>
130 </exclusion>
131 <exclusion>
132 <groupId>org.codehaus.jackson</groupId>
133 <artifactId>jackson-core-asl</artifactId>
134 </exclusion>
135 <exclusion>
136 <groupId>org.apache.avro</groupId>
137 <artifactId>avro</artifactId>
138 </exclusion>
139 </exclusions>
140 </dependency>
141 <dependency>
142 <groupId>org.scalamacros</groupId>
143 <artifactId>quasiquotes_2.10</artifactId>
144 <version>2.0.0-M8</version>
145 <exclusions>
146 <exclusion>
147 <groupId>org.scala-lang</groupId>
148 <artifactId>scala-reflect</artifactId>
149 </exclusion>
150 <exclusion>
151 <groupId>org.scala-lang</groupId>
152 <artifactId>scala-library</artifactId>
153 </exclusion>
154 </exclusions>
155 </dependency>
156 <dependency>
157 <groupId>org.scala-lang</groupId>
158 <artifactId>scala-library</artifactId>
159 <version>2.10.6</version>
160 </dependency>
161 <dependency>
162 <groupId>org.scala-lang</groupId>
163 <artifactId>scala-reflect</artifactId>
164 <version>2.10.6</version>
165 </dependency>
166 <dependency>
167 <groupId>commons-net</groupId>
168 <artifactId>commons-net</artifactId>
169 <version>3.1</version>
170 </dependency>
171 <dependency>
172 <groupId>com.thoughtworks.paranamer</groupId>
173 <artifactId>paranamer</artifactId>
174 <version>2.6</version>
175 </dependency>
176 <dependency>
177 <groupId>org.xerial.snappy</groupId>
178 <artifactId>snappy-java</artifactId>
179 <version>1.1.2.4</version>
180 </dependency>
181 <dependency>
182 <groupId>org.codehaus.jackson</groupId>
183 <artifactId>jackson-mapper-asl</artifactId>
184 <version>1.9.13</version>
185 </dependency>
186 <dependency>
187 <groupId>com.fasterxml.jackson.core</groupId>
188 <artifactId>jackson-databind</artifactId>
189 <version>${jackson.version}</version>
190 <exclusions>
191 <exclusion>
192 <groupId>com.fasterxml.jackson.core</groupId>
193 <artifactId>jackson-annotations</artifactId>
194 </exclusion>
195 </exclusions>
196 </dependency>
197 <dependency>
198 <groupId>com.fasterxml.jackson.core</groupId>
199 <artifactId>jackson-annotations</artifactId>
200 <version>${jackson.version}</version>
201 </dependency>
68202 <!-- Test dependencies -->
69203 <dependency>
70204 <groupId>junit</groupId>
73207 <dependency>
74208 <groupId>org.mockito</groupId>
75209 <artifactId>mockito-core</artifactId>
76 <version>1.7</version>
210 <version>2.15.0</version>
77211 <scope>test</scope>
78212 </dependency>
79213 <dependency>
3030
3131 <groupId>org.apache.tika</groupId>
3232 <artifactId>tika-parent</artifactId>
33 <version>1.17</version>
33 <version>1.18</version>
3434 <packaging>pom</packaging>
3535
3636 <name>Apache Tika parent</name>
305305 <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
306306 <project.reporting.outputEncoding>${project.build.sourceEncoding}</project.reporting.outputEncoding>
307307 <!-- NOTE: sync tukaani version with commons-compress in tika-parsers -->
308 <commons.compress.version>1.14</commons.compress.version>
309 <commons.io.version>2.5</commons.io.version>
308 <commons.compress.version>1.16.1</commons.compress.version>
309 <commons.io.version>2.6</commons.io.version>
310 <gson.version>2.8.1</gson.version>
310311 <cxf.version>3.0.16</cxf.version>
311312 <slf4j.version>1.7.24</slf4j.version>
313 <jackson.version>2.9.5</jackson.version>
312314 </properties>
313315
314316 <build>
324326 <plugin>
325327 <groupId>de.thetaphi</groupId>
326328 <artifactId>forbiddenapis</artifactId>
327 <version>2.3</version>
329 <!-- if this version contains commons-io 2.6, remove hard-coded commons-io version below -->
330 <version>2.5</version>
328331 <configuration>
329332 <targetVersion>${maven.compiler.target}</targetVersion>
330333 <failOnUnresolvableSignatures>false</failOnUnresolvableSignatures>
375378 <version>1.9.5</version>
376379 </dependency>
377380 </dependencies>
381 </plugin>
382 <plugin>
383 <groupId>org.apache.maven.plugins</groupId>
384 <artifactId>maven-enforcer-plugin</artifactId>
385 <version>3.0.0-M1</version>
386 <executions>
387 <execution>
388 <id>enforce</id>
389 <configuration>
390 <rules>
391 <dependencyConvergence />
392 </rules>
393 </configuration>
394 <goals>
395 <goal>enforce</goal>
396 </goals>
397 </execution>
398 </executions>
378399 </plugin>
379400 </plugins>
380401 </build>
438459 <connection>scm:git:https://github.com/apache/</connection>
439460 <developerConnection>scm:git:https://github.com/apache/</developerConnection>
440461 <url>https://github.com/apache/tika</url>
441 <tag>1.17-rc2</tag>
462 <tag>1.18-rc3</tag>
442463 </scm>
443464 </project>
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.17</version>
27 <version>1.18</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030
3838 <!-- NOTE: sync codec version with POI -->
3939 <codec.version>1.10</codec.version>
4040 <!-- NOTE: sync tukaani version with commons-compress in tika-parent-->
41 <tukaani.version>1.6</tukaani.version>
41 <tukaani.version>1.8</tukaani.version>
42 <!-- NOTE: sync brotli version with commons-compress in tika-parent-->
43 <brotli.version>0.1.2</brotli.version>
4244 <mime4j.version>0.8.1</mime4j.version>
4345 <vorbis.version>0.8</vorbis.version>
44 <pdfbox.version>2.0.8</pdfbox.version>
46 <pdfbox.version>2.0.9</pdfbox.version>
4547 <jempbox.version>1.8.13</jempbox.version>
4648 <netcdf-java.version>4.5.5</netcdf-java.version>
47 <sis.version>0.6</sis.version>
49 <sis.version>0.8</sis.version>
4850 <!-- used by POI, PDFBox and Jackcess ...try to sync -->
4951 <bouncycastle.version>1.54</bouncycastle.version>
5052 <commonsexec.version>1.3</commonsexec.version>
8082 <groupId>org.gagravarr</groupId>
8183 <artifactId>vorbis-java-tika</artifactId>
8284 <version>${vorbis.version}</version>
85 <exclusions>
86 <exclusion>
87 <groupId>org.apache.tika</groupId>
88 <artifactId>tika-core</artifactId>
89 </exclusion>
90 </exclusions>
8391 </dependency>
8492 <dependency>
8593 <groupId>com.healthmarketscience.jackcess</groupId>
8694 <artifactId>jackcess</artifactId>
87 <version>2.1.8</version>
95 <version>2.1.10</version>
8896 <exclusions>
8997 <exclusion>
9098 <groupId>commons-logging</groupId>
95103 <dependency>
96104 <groupId>com.healthmarketscience.jackcess</groupId>
97105 <artifactId>jackcess-encrypt</artifactId>
98 <version>2.1.2</version>
106 <version>2.1.4</version>
99107 <exclusions>
100108 <exclusion>
101109 <groupId>org.bouncycastle</groupId>
102110 <artifactId>bcprov-jdk15on</artifactId>
111 </exclusion>
112 <!-- to avoid maven-enforcer convergence error,
113 let's make this explicit -->
114 <exclusion>
115 <groupId>com.healthmarketscience.jackcess</groupId>
116 <artifactId>jackcess</artifactId>
103117 </exclusion>
104118 </exclusions>
105119 </dependency>
136150 <groupId>org.tukaani</groupId>
137151 <artifactId>xz</artifactId>
138152 <version>${tukaani.version}</version>
153 </dependency>
154 <dependency>
155 <groupId>org.brotli</groupId>
156 <artifactId>dec</artifactId>
157 <version>${brotli.version}</version>
158 </dependency>
159 <dependency>
160 <groupId>com.github.luben</groupId>
161 <artifactId>zstd-jni</artifactId>
162 <version>1.3.3-3</version>
163 <scope>provided</scope>
139164 </dependency>
140165
141166 <dependency>
315340 <dependency>
316341 <groupId>org.apache.opennlp</groupId>
317342 <artifactId>opennlp-tools</artifactId>
318 <version>1.8.3</version>
343 <version>1.8.4</version>
319344 </dependency>
320345
321346 <dependency>
336361 </exclusions>
337362 </dependency>
338363
339 <dependency>
364 <!-- <dependency>
340365 <groupId>com.tdunning</groupId>
341366 <artifactId>json</artifactId>
342367 <version>1.8</version>
368 </dependency> -->
369 <dependency>
370 <groupId>com.github.openjson</groupId>
371 <artifactId>openjson</artifactId>
372 <version>1.0.10</version>
343373 </dependency>
344374 <dependency>
345375 <groupId>com.google.code.gson</groupId>
346376 <artifactId>gson</artifactId>
347 <version>2.8.1</version>
377 <version>${gson.version}</version>
348378 </dependency>
349379
350380 <!-- logging dependencies -->
369399 <dependency>
370400 <groupId>org.mockito</groupId>
371401 <artifactId>mockito-core</artifactId>
372 <version>1.7</version>
402 <version>2.15.0</version>
373403 <scope>test</scope>
374404 </dependency>
375405 <dependency>
389419 <groupId>commons-logging</groupId>
390420 <artifactId>commons-logging</artifactId>
391421 </exclusion>
422 <exclusion>
423 <groupId>org.jdom</groupId>
424 <artifactId>jdom2</artifactId>
425 </exclusion>
392426 </exclusions>
393427 </dependency>
394428 <dependency>
400434 <groupId>edu.ucar</groupId>
401435 <artifactId>jj2000</artifactId>
402436 </exclusion>
403 </exclusions>
404 </dependency>
405 <dependency>
437 <exclusion>
438 <groupId>org.jsoup</groupId>
439 <artifactId>jsoup</artifactId>
440 </exclusion>
441 <exclusion>
442 <groupId>org.jdom</groupId>
443 <artifactId>jdom2</artifactId>
444 </exclusion>
445 </exclusions>
446 </dependency>
447 <!-- grib's current jsoup is vulnerable to xss
448 exclude and import a more modern version TIKA-2561-->
449 <dependency>
450 <groupId>org.jsoup</groupId>
451 <artifactId>jsoup</artifactId>
452 <version>1.11.2</version>
453 </dependency> <dependency>
406454 <groupId>edu.ucar</groupId>
407455 <artifactId>cdm</artifactId>
408456 <version>${netcdf-java.version}</version>
415463 <groupId>org.slf4j</groupId>
416464 <artifactId>jcl-over-slf4j</artifactId>
417465 </exclusion>
466 <exclusion>
467 <groupId>org.apache.httpcomponents</groupId>
468 <artifactId>httpcore</artifactId>
469 </exclusion>
470 <exclusion>
471 <groupId>org.jdom</groupId>
472 <artifactId>jdom2</artifactId>
473 </exclusion>
418474 </exclusions>
419475 </dependency>
420476 <dependency>
433489 </exclusion>
434490 <exclusion>
435491 <groupId>org.apache.httpcomponents</groupId>
492 <artifactId>httpcore</artifactId>
493 </exclusion> <exclusion>
494 <groupId>org.apache.httpcomponents</groupId>
436495 <artifactId>httpmime</artifactId>
437496 </exclusion>
438497 </exclusions>
480539 <dependency>
481540 <groupId>org.opengis</groupId>
482541 <artifactId>geoapi</artifactId>
483 <version>3.0.0</version>
542 <version>3.0.1</version>
484543 </dependency>
485544
486545 <dependency>
536595 <dependency>
537596 <groupId>org.apache.ctakes</groupId>
538597 <artifactId>ctakes-core</artifactId>
539 <version>3.2.2</version>
598 <version>4.0.0</version>
540599 <scope>provided</scope>
541600 <exclusions>
542601 <exclusion>
563622 <groupId>org.springframework</groupId>
564623 <artifactId>spring-core</artifactId>
565624 </exclusion>
566 </exclusions>
567 </dependency>
568
625 <exclusion>
626 <groupId>org.apache.opennlp</groupId>
627 <artifactId>opennlp-tools</artifactId>
628 </exclusion>
629 <exclusion>
630 <groupId>com.google.guava</groupId>
631 <artifactId>guava</artifactId>
632 </exclusion>
633 <exclusion>
634 <groupId>commons-io</groupId>
635 <artifactId>commons-io</artifactId>
636 </exclusion>
637 <exclusion>
638 <groupId>org.apache.uima</groupId>
639 <artifactId>uimafit-core</artifactId>
640 </exclusion>
641 <exclusion>
642 <groupId>org.apache.uima</groupId>
643 <artifactId>uimaj-core</artifactId>
644 </exclusion>
645 <exclusion>
646 <groupId>org.jdom</groupId>
647 <artifactId>jdom2</artifactId>
648 </exclusion>
649 </exclusions>
650 </dependency>
651 <!-- need to specify this to avoid
652 version clash within ctakes-core 4.0.0 -->
653 <dependency>
654 <groupId>org.apache.uima</groupId>
655 <artifactId>uimafit-core</artifactId>
656 <version>2.2.0</version>
657 <exclusions>
658 <exclusion>
659 <groupId>org.apache.uima</groupId>
660 <artifactId>uimaj-core</artifactId>
661 </exclusion>
662 <exclusion>
663 <groupId>commons-io</groupId>
664 <artifactId>commons-io</artifactId>
665 </exclusion>
666 </exclusions>
667 </dependency>
668 <!-- need to specify this to avoid
669 version clash within ctakes-core 4.0.0 -->
670 <dependency>
671 <groupId>org.apache.uima</groupId>
672 <artifactId>uimaj-core</artifactId>
673 <version>2.9.0</version>
674 </dependency>
675
676 <dependency>
677 <groupId>org.jdom</groupId>
678 <artifactId>jdom2</artifactId>
679 <version>2.0.6</version>
680 </dependency>
569681 <!--Jackson parse String to JSON-->
570682 <dependency>
571683 <groupId>com.fasterxml.jackson.core</groupId>
572684 <artifactId>jackson-core</artifactId>
573 <version>2.9.2</version>
574 </dependency>
575
576 <!-- Java ImageIO plugin for JBIG2 support (often used in PDF)
577 This jbig2 dep is not distributed with Tika due to licensing
578 issue (GPLV3). That's why it is included here as "test".
579 https://github.com/levigo/jbig2-imageio
580 -->
581 <dependency>
582 <groupId>com.levigo.jbig2</groupId>
583 <artifactId>levigo-jbig2-imageio</artifactId>
584 <version>1.6.5</version>
585 <scope>test</scope>
586 </dependency>
587 <!-- Copied from PDFBox:
588 For legal reasons (incompatible license), jai-imageio-core is to be used
589 only in the tests and may not be distributed. See also LEGAL-195-->
685 <version>${jackson.version}</version>
686 </dependency>
687 <!-- as of 2.9.5, jackson-databind is pulling in jackson-annotations 2.9.0
688 For now, we need to specify databind here with exclusion statement
689 -->
690 <dependency>
691 <groupId>com.fasterxml.jackson.core</groupId>
692 <artifactId>jackson-databind</artifactId>
693 <version>${jackson.version}</version>
694 <exclusions>
695 <exclusion>
696 <groupId>com.fasterxml.jackson.core</groupId>
697 <artifactId>jackson-annotations</artifactId>
698 </exclusion>
699 </exclusions>
700 </dependency>
701 <dependency>
702 <groupId>com.fasterxml.jackson.core</groupId>
703 <artifactId>jackson-annotations</artifactId>
704 <version>${jackson.version}</version>
705 </dependency>
706
707
708 <dependency>
709 <groupId>org.apache.pdfbox</groupId>
710 <artifactId>jbig2-imageio</artifactId>
711 <version>3.0.0</version>
712 </dependency>
713
714 <!-- jai-imageio-core is allowed since LEGAL-304 -->
590715 <dependency>
591716 <groupId>com.github.jai-imageio</groupId>
592717 <artifactId>jai-imageio-core</artifactId>
593718 <version>1.3.1</version>
594 <scope>test</scope>
595 </dependency>
719 </dependency>
720 <!-- For legal reasons (incompatible license), jai-imageio-jpeg2000 is to be used
721 only in the tests and may not be distributed. See also LEGAL-195 -->
596722 <dependency>
597723 <groupId>com.github.jai-imageio</groupId>
598724 <artifactId>jai-imageio-jpeg2000</artifactId>
599725 <version>1.3.0</version>
600726 <scope>test</scope>
727 <exclusions>
728 <exclusion>
729 <groupId>com.github.jai-imageio</groupId>
730 <artifactId>jai-imageio-core</artifactId>
731 </exclusion>
732 </exclusions>
601733 </dependency>
602734
603735 </dependencies>
1919
2020 import java.math.BigInteger;
2121 import java.util.ArrayList;
22 import java.util.HashSet;
2223 import java.util.List;
24 import java.util.Set;
2325
2426 import org.apache.tika.exception.TikaException;
2527 import org.apache.tika.parser.chm.core.ChmCommons;
136138
137139 /* loops over all pmgls */
138140 byte[] dir_chunk = null;
141 Set<Integer> processed = new HashSet<>();
139142 for (int i = startPmgl; i>=0; ) {
140143 dir_chunk = new byte[(int) chmItspHeader.getBlock_len()];
141144 int start = i * (int) chmItspHeader.getBlock_len() + dir_offset;
146149 PMGLheader = new ChmPmglHeader();
147150 PMGLheader.parse(dir_chunk, PMGLheader);
148151 enumerateOneSegment(dir_chunk);
149
150 i=PMGLheader.getBlockNext();
152 int nextBlock = PMGLheader.getBlockNext();
153 processed.add(i);
154 if (processed.contains(nextBlock)) {
155 throw new ChmParsingException("already processed block; avoiding cycle");
156 }
157 i=nextBlock;
151158 dir_chunk = null;
152159 }
160
153161 } catch (ChmParsingException e) {
154162 LOG.warn("Chm parse exception", e);
155163 } finally {
1515 */
1616 package org.apache.tika.parser.html;
1717
18 import java.io.BufferedReader;
1819 import java.io.IOException;
1920 import java.io.InputStream;
21 import java.io.InputStreamReader;
2022 import java.nio.ByteBuffer;
2123 import java.nio.charset.Charset;
24 import java.nio.charset.StandardCharsets;
25 import java.util.Collections;
26 import java.util.HashSet;
27 import java.util.Locale;
28 import java.util.Set;
2229 import java.util.regex.Matcher;
2330 import java.util.regex.Pattern;
2431
3845 */
3946 public class HtmlEncodingDetector implements EncodingDetector {
4047
48 /**
49 * HTML can include non-iana supported charsets that Java
50 * recognizes, e.g. "unicode". This can lead to incorrect detection/mojibake.
51 * Ignore charsets in html meta-headers that are not supported by IANA.
52 * See: TIKA-2592
53 */
54 private static Set<String> CHARSETS_UNSUPPORTED_BY_IANA;
55 static {
56 Set<String> unsupported = new HashSet<>();
57 try (BufferedReader reader =
58 new BufferedReader(
59 new InputStreamReader(
60 HtmlEncodingDetector.class
61 .getResourceAsStream("StandardCharsets_unsupported_by_IANA.txt"),
62 StandardCharsets.UTF_8))) {
63 String line = reader.readLine();
64 while (line != null) {
65 if (line.startsWith("#")) {
66 line = reader.readLine();
67 continue;
68 }
69 line = line.trim();
70 if (line.length() > 0) {
71 unsupported.add(line.toLowerCase(Locale.US));
72 }
73 line = reader.readLine();
74 }
75 } catch (IOException e) {
76 throw new IllegalArgumentException("couldn't find StandardCharsets_unsupported_by_IANA.txt on the class path");
77 }
78 CHARSETS_UNSUPPORTED_BY_IANA = Collections.unmodifiableSet(unsupported);
79 }
4180 // TIKA-357 - use bigger buffer for meta tag sniffing (was 4K)
4281 private static final int DEFAULT_MARK_LIMIT = 8192;
4382
111150 //that is valid
112151 while (charsetMatcher.find()) {
113152 String candCharset = charsetMatcher.group(1);
153 if (CHARSETS_UNSUPPORTED_BY_IANA.contains(candCharset.toLowerCase(Locale.US))) {
154 continue;
155 }
114156 if (CharsetUtils.isSupported(candCharset)) {
115157 try {
116158 return CharsetUtils.forName(candCharset);
2323 import java.nio.charset.StandardCharsets;
2424 import java.util.Arrays;
2525 import java.util.HashSet;
26 import java.util.List;
2627 import java.util.Locale;
2728 import java.util.Set;
2829 import java.util.regex.Matcher;
3536 import org.apache.tika.metadata.TikaCoreProperties;
3637 import org.apache.tika.mime.MediaType;
3738 import org.apache.tika.parser.ParseContext;
39 import org.apache.tika.parser.utils.DataURIScheme;
40 import org.apache.tika.parser.utils.DataURISchemeParseException;
41 import org.apache.tika.parser.utils.DataURISchemeUtil;
3842 import org.apache.tika.sax.TextContentHandler;
3943 import org.apache.tika.sax.XHTMLContentHandler;
4044 import org.xml.sax.Attributes;
5660 private final ParseContext context;
5761 private final boolean extractScripts;
5862 private final StringBuilder title = new StringBuilder();
63 private final DataURISchemeUtil dataURISchemeUtil = new DataURISchemeUtil();
5964 private int bodyLevel = 0;
6065 private int discardLevel = 0;
6166 private int titleLevel = 0;
168173 }
169174
170175 title.setLength(0);
176 String value = atts.getValue("src");
177 if (value != null && value.startsWith("data:")) {
178 handleDataURIScheme(value);
179 }
171180 }
172181
173182 /**
230239 // And resolve relative links. Eventually this should be pushed
231240 // into the HtmlMapper code.
232241 if (URI_ATTRIBUTES.contains(normAttrName)) {
242 //if this is a src="data: " element,
243 //we've handled that as an embedded file, don't include the full thing
244 //here
245 if (normAttrName.equals("src")) {
246 String v = newAttributes.getValue(att);
247 if (v.startsWith("data:")) {
248 newAttributes.setValue(att, "data:");
249 }
250 }
233251 newAttributes.setValue(att, resolve(newAttributes.getValue(att)));
234252 } else if (isObject && "codebase".equals(normAttrName)) {
235253 newAttributes.setValue(att, codebase);
295313 }
296314 }
297315
316 private void handleDataURIScheme(String string) throws SAXException {
317 DataURIScheme dataURIScheme = null;
318 try {
319 dataURIScheme = dataURISchemeUtil.parse(string);
320 } catch (DataURISchemeParseException e) {
321 //swallow
322 return;
323 }
324
325 //do anything with attrs?
326 Metadata m = new Metadata();
327 m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
328 TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
329 if (dataURIScheme.getMediaType() != null) {
330 m.set(Metadata.CONTENT_TYPE, dataURIScheme.getMediaType().toString());
331 }
332 EmbeddedDocumentExtractor embeddedDocumentExtractor =
333 EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
334 if (embeddedDocumentExtractor.shouldParseEmbedded(m)) {
335 try (InputStream stream = dataURIScheme.getInputStream()) {
336 embeddedDocumentExtractor.parseEmbedded(
337 stream, xhtml, m, false
338 );
339 } catch (IOException e) {
340 EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
341 }
342 }
343 }
344
298345 private void writeScript() throws SAXException {
299346 //don't write an attached macro if there is no content
300347 //we may want to revisit this behavior
312359
313360 EmbeddedDocumentExtractor embeddedDocumentExtractor =
314361 EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
362 //try to scrape dataURISchemes from javascript
363 List<DataURIScheme> dataURISchemes = dataURISchemeUtil.extract(script.toString());
364 for (DataURIScheme dataURIScheme : dataURISchemes) {
365 Metadata dataUriMetadata = new Metadata();
366 dataUriMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
367 TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
368 dataUriMetadata.set(Metadata.CONTENT_TYPE,
369 dataURIScheme.getMediaType().toString());
370 if (embeddedDocumentExtractor.shouldParseEmbedded(dataUriMetadata)) {
371 try (InputStream dataURISchemeInputStream = dataURIScheme.getInputStream()) {
372 embeddedDocumentExtractor.parseEmbedded(dataURISchemeInputStream,
373 xhtml, dataUriMetadata, false);
374 } catch (IOException e) {
375 //swallow
376 }
377 }
378 }
379
315380 try (InputStream stream = new ByteArrayInputStream(
316381 script.toString().getBytes(StandardCharsets.UTF_8))) {
317382 embeddedDocumentExtractor.parseEmbedded(
6565 MediaType.image("png"),
6666 MediaType.image("vnd.wap.wbmp"),
6767 MediaType.image("x-icon"),
68 MediaType.image("x-xcf")));
69 try {
70 Class.forName("com.levigo.jbig2.JBIG2ImageReader");
71 TMP_SUPPORTED.add(MediaType.image("x-jbig2"));
72 } catch (ClassNotFoundException e) {
73 }
68 MediaType.image("x-xcf"),
69 MediaType.image("x-jbig2")));
70 //add try/catch class.forName() for image types relying on
71 //provided dependencies
7472 }
7573
7674 private static final Set<MediaType> SUPPORTED_TYPES =
3232 import org.apache.james.mime4j.parser.ContentHandler;
3333 import org.apache.james.mime4j.stream.BodyDescriptor;
3434 import org.apache.james.mime4j.stream.Field;
35 import org.apache.tika.detect.Detector;
3536 import org.apache.tika.exception.TikaException;
3637 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
3738 import org.apache.tika.extractor.EmbeddedDocumentUtil;
146147 private boolean strictParsing = false;
147148 private final boolean extractAllAlternatives;
148149 private final EmbeddedDocumentExtractor extractor;
149
150 private final Detector detector;
150151 //this is used to buffer a multipart body that
151152 //keeps track of multipart/alternative and its children
152153 private Stack<Part> alternativePartBuffer = new Stack<>();
153154
154155 private Stack<BodyDescriptor> parts = new Stack<>();
155156
156 MailContentHandler(XHTMLContentHandler xhtml, Metadata metadata,
157 MailContentHandler(XHTMLContentHandler xhtml, Detector detector, Metadata metadata,
157158 ParseContext context, boolean strictParsing, boolean extractAllAlternatives) {
158159 this.handler = xhtml;
159160 this.metadata = metadata;
166167
167168 // Was an EmbeddedDocumentExtractor explicitly supplied?
168169 this.extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
170 this.detector = detector;
169171 }
170172
171173 @Override
183185 if (parts.size() > 0) {
184186 submd.set(Message.MULTIPART_SUBTYPE, parts.peek().getSubType());
185187 submd.set(Message.MULTIPART_BOUNDARY, parts.peek().getBoundary());
186 }
188 }
187189 if (body instanceof MaximalBodyDescriptor) {
188190 MaximalBodyDescriptor maximalBody = (MaximalBodyDescriptor) body;
189191 String contentDispositionType = maximalBody.getContentDispositionType();
190192 if (contentDispositionType != null && !contentDispositionType.isEmpty()) {
191 StringBuilder contentDisposition = new StringBuilder( contentDispositionType );
193 StringBuilder contentDisposition = new StringBuilder(contentDispositionType);
192194 Map<String, String> contentDispositionParameters = maximalBody.getContentDispositionParameters();
193 for ( Entry<String, String> param : contentDispositionParameters.entrySet() ) {
195 for (Entry<String, String> param : contentDispositionParameters.entrySet()) {
194196 contentDisposition.append("; ")
195 .append(param.getKey()).append("=\"").append(param.getValue()).append('"');
197 .append(param.getKey()).append("=\"").append(param.getValue()).append('"');
196198 }
197199
198200 String contentDispositionFileName = maximalBody.getContentDispositionFilename();
200202 submd.set( Metadata.RESOURCE_NAME_KEY, contentDispositionFileName );
201203 }
202204
203 submd.set( Metadata.CONTENT_DISPOSITION, contentDisposition.toString() );
205 submd.set(Metadata.CONTENT_DISPOSITION, contentDisposition.toString());
204206 }
205207 }
206208 //if we're in a multipart/alternative or any one of its children
207209 //add the bodypart to the latest that was added
208 if (! extractAllAlternatives && alternativePartBuffer.size() > 0) {
210 if (!extractAllAlternatives && alternativePartBuffer.size() > 0) {
209211 ByteArrayOutputStream bos = new ByteArrayOutputStream();
210212 IOUtils.copy(is, bos);
211213 alternativePartBuffer.peek().children.add(new BodyContents(submd, bos.toByteArray()));
214 } else if (!extractAllAlternatives && parts.size() < 2) {
215 //if you're at the first level of embedding
216 //and you're not in an alternative part block
217 //and you're text/html, put that in the body of the email
218 //otherwise treat as a regular attachment
219 ByteArrayOutputStream bos = new ByteArrayOutputStream();
220 IOUtils.copy(is, bos);
221 byte[] bytes = bos.toByteArray();
222 if (detectTextOrHtml(submd, bytes)) {
223 handleInlineBodyPart(new BodyContents(submd, bos.toByteArray()));
224 } else {
225 //else handle as you would any other embedded content
226 try (TikaInputStream tis = TikaInputStream.get(bytes)) {
227 handleEmbedded(tis, submd);
228 }
229 }
212230 } else {
213231 //else handle as you would any other embedded content
214232 try (TikaInputStream tis = TikaInputStream.get(is)) {
215233 handleEmbedded(tis, submd);
216234 }
217235 }
236 }
237
238 private boolean detectTextOrHtml(Metadata submd, byte[] bytes) {
239 String mediaTypeString = submd.get(Metadata.CONTENT_TYPE);
240 if (mediaTypeString != null) {
241 if (mediaTypeString.startsWith("text")) {
242 return true;
243 } else {
244 return false;
245 }
246 }
247 try (TikaInputStream tis = TikaInputStream.get(bytes)) {
248 MediaType mediaType = detector.detect(tis, submd);
249 if (mediaType != null) {
250 //detect only once
251 submd.set(TikaCoreProperties.CONTENT_TYPE_OVERRIDE, mediaType.toString());
252 if (mediaType.toString().startsWith("text")) {
253 return true;
254 }
255 }
256 } catch (IOException e) {
257
258 }
259 return false;
218260 }
219261
220262 private void handleEmbedded(TikaInputStream tis, Metadata metadata) throws MimeException, IOException {
515557 }
516558
517559 if (part instanceof BodyContents) {
518 handlePart((BodyContents)part);
560 handleInlineBodyPart((BodyContents)part);
519561 return;
520562 }
521563
538580 }
539581 }
540582
541 private void handlePart(BodyContents part) throws MimeException, IOException {
583 private void handleInlineBodyPart(BodyContents part) throws MimeException, IOException {
542584 String contentType = part.metadata.get(Metadata.CONTENT_TYPE);
543585 Parser parser = null;
544586 if (MediaType.TEXT_HTML.toString().equalsIgnoreCase(contentType)) {
554596
555597
556598 if (parser == null) {
599 //back off and treat it as an embedded chunk
557600 try (TikaInputStream tis = TikaInputStream.get(part.bytes)) {
558601 handleEmbedded(tis, part.metadata);
559602 }
2525 import org.apache.james.mime4j.parser.MimeStreamParser;
2626 import org.apache.james.mime4j.stream.MimeConfig;
2727 import org.apache.tika.config.Field;
28 import org.apache.tika.detect.Detector;
2829 import org.apache.tika.exception.TikaException;
30 import org.apache.tika.extractor.EmbeddedDocumentUtil;
2931 import org.apache.tika.io.TikaInputStream;
3032 import org.apache.tika.metadata.Metadata;
3133 import org.apache.tika.mime.MediaType;
5355 private static final Set<MediaType> SUPPORTED_TYPES = Collections
5456 .singleton(MediaType.parse("message/rfc822"));
5557
58 //rely on the detector to be thread-safe
59 //built lazily and then reused
60 private Detector detector;
61
5662 @Field
5763 private boolean extractAllAlternatives = false;
5864
7076 .build();
7177
7278 config = context.get(MimeConfig.class, config);
73
79 Detector localDetector = context.get(Detector.class);
80 if (localDetector == null) {
81 //lazily load this if necessary
82 if (detector == null) {
83 EmbeddedDocumentUtil embeddedDocumentUtil = new EmbeddedDocumentUtil(context);
84 detector = embeddedDocumentUtil.getDetector();
85 }
86 localDetector = detector;
87 }
7488 MimeStreamParser parser = new MimeStreamParser(config, null, new DefaultBodyDescriptorBuilder());
7589 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
7690
7791 MailContentHandler mch = new MailContentHandler(
78 xhtml, metadata, context, config.isStrictParsing(),
92 xhtml, localDetector, metadata, context, config.isStrictParsing(),
7993 extractAllAlternatives);
8094 parser.setContentHandler(mch);
8195 parser.setContentDecoding(true);
283283
284284 // Set up listener and register the records we want to process
285285 HSSFRequest hssfRequest = new HSSFRequest();
286 listenForAllRecords = true;
287286 if (listenForAllRecords) {
288287 hssfRequest.addListenerForAllRecords(formatListener);
289288 } else {
541540 CellValueRecordInterface value =
542541 (CellValueRecordInterface) record;
543542 Point point = new Point(value.getColumn(), value.getRow());
544 currentSheet.put(point, cell);
543 if (currentSheet.containsKey(point)) {
544 //avoid overwriting content
545 //for now, add to extraTextCells
546 //TODO: consider allowing multiple text pieces
547 //per x,y to keep the text together
548 extraTextCells.add(cell);
549 } else {
550 currentSheet.put(point, cell);
551 }
552
545553 } else {
546554 // Cell outside the worksheets
547555 extraTextCells.add(cell);
650658 }
651659
652660 @Override
661 public void processRecord(Record record) {
662 // System.out.println(record.getClass() + " : "+record.toString());
663 super.processRecord(record);
664 }
665
666 @Override
653667 public String formatNumberDateCell(CellValueRecordInterface cell) {
654668 String formatString = this.getFormatString(cell);
655669 if (formatString != null && ! formatString.equals("General")) {
1717
1818 import java.io.IOException;
1919 import java.io.InputStream;
20 import java.util.ArrayList;
2021 import java.util.HashSet;
2122 import java.util.List;
2223
2930 import org.apache.poi.hslf.record.RecordTypes;
3031 import org.apache.poi.hslf.record.VBAInfoAtom;
3132 import org.apache.poi.hslf.record.VBAInfoContainer;
33 import org.apache.poi.hslf.usermodel.HSLFGroupShape;
3234 import org.apache.poi.hslf.usermodel.HSLFMasterSheet;
3335 import org.apache.poi.hslf.usermodel.HSLFNotes;
3436 import org.apache.poi.hslf.usermodel.HSLFObjectData;
3840 import org.apache.poi.hslf.usermodel.HSLFSlideShow;
3941 import org.apache.poi.hslf.usermodel.HSLFTable;
4042 import org.apache.poi.hslf.usermodel.HSLFTableCell;
43 import org.apache.poi.hslf.usermodel.HSLFTextBox;
4144 import org.apache.poi.hslf.usermodel.HSLFTextParagraph;
4245 import org.apache.poi.hslf.usermodel.HSLFTextRun;
4346 import org.apache.poi.hslf.usermodel.HSLFTextShape;
4750 import org.apache.tika.exception.EncryptedDocumentException;
4851 import org.apache.tika.extractor.EmbeddedDocumentUtil;
4952 import org.apache.tika.io.CloseShieldInputStream;
53 import org.apache.tika.io.IOExceptionWithCause;
5054 import org.apache.tika.io.TikaInputStream;
5155 import org.apache.tika.metadata.Metadata;
5256 import org.apache.tika.mime.MediaType;
116120 }
117121 }
118122
123 extractGroupText(xhtml, slide.getShapes(), 0);
124
119125 // Slide footer, if present
120126 if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) {
121127 xhtml.startElement("p", "class", "slide-footer");
215221 extractMacros(ss, xhtml);
216222 }
217223 xhtml.endElement("div");
224 }
225
226 //Extract any text that's within an HSLFTextShape that's a descendant of
227 //an HSLFGroupShape.
228 private void extractGroupText(XHTMLContentHandler xhtml, List<HSLFShape> shapes, int depth) throws SAXException {
229
230 if (shapes == null) {
231 return;
232 }
233
234 //Only process items with depth > 0 because they should have been included
235 //already in slide.getTextParagraphs above.
236
237 //However, cells are considered grouped within the table, so ignore them.
238 //I don't believe that cells can be inside a text box or other
239 //grouped text containing object, so always ignore them.
240 List<List<HSLFTextParagraph>> paragraphList = new ArrayList<>();
241 for (HSLFShape shape : shapes) {
242 if (shape instanceof HSLFGroupShape) {
243 //work recursively, HSLFGroupShape can contain HSLFGroupShape
244 extractGroupText(xhtml, ((HSLFGroupShape)shape).getShapes(), depth+1);
245 } else if (shape instanceof HSLFTextShape
246 && ! (shape instanceof HSLFTableCell) && depth > 0) {
247 paragraphList.add(((HSLFTextShape)shape).getTextParagraphs());
248 }
249 }
250 textRunsToText(xhtml, paragraphList);
218251 }
219252
220253 private void extractMacros(HSLFSlideShow ppt, XHTMLContentHandler xhtml) {
453486 MediaType mt = getTikaConfig().getDetector().detect(stream, new Metadata());
454487 mediaType = mt.toString();
455488 }
456 if (mediaType.equals("application/x-tika-msoffice-embedded; format=comp_obj")) {
457 try(NPOIFSFileSystem npoifs = new NPOIFSFileSystem(new CloseShieldInputStream(stream))) {
489 if (mediaType.equals("application/x-tika-msoffice-embedded; format=comp_obj")
490 || mediaType.equals("application/x-tika-msoffice")) {
491 NPOIFSFileSystem npoifs = null;
492
493 try {
494 npoifs = new NPOIFSFileSystem(new CloseShieldInputStream(stream));
495 } catch (RuntimeException e) {
496 throw new IOExceptionWithCause(e);
497 }
498 try {
458499 handleEmbeddedOfficeDoc(npoifs.getRoot(), objID, xhtml);
500 } finally {
501 if (npoifs != null) {
502 npoifs.close();
503 }
459504 }
460505 } else {
461506 handleEmbeddedResource(
7373 import org.apache.tika.sax.BodyContentHandler;
7474 import org.apache.tika.sax.EmbeddedContentHandler;
7575 import org.apache.tika.sax.XHTMLContentHandler;
76 import org.bouncycastle.cms.Recipient;
7677 import org.xml.sax.SAXException;
7778
7879 /**
320321 }
321322 if (rtfChunk != null && (extractAllAlternatives || !doneBody)) {
322323 ByteChunk chunk = (ByteChunk) rtfChunk;
323 MAPIRtfAttribute rtf = new MAPIRtfAttribute(
324 MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue()
325 );
326 Parser rtfParser =
327 EmbeddedDocumentUtil.tryToFindExistingLeafParser(RTFParser.class, parseContext);
328 if (rtfParser == null) {
329 rtfParser = new RTFParser();
330 }
331 rtfParser.parse(
332 new ByteArrayInputStream(rtf.getData()),
333 new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
334 new Metadata(), parseContext);
335 doneBody = true;
324 //avoid buffer underflow TIKA-2530
325 //TODO -- would be good to find an example triggering file and
326 //figure out if this is a bug in POI or a genuine 0 length chunk
327 if (chunk.getValue() != null && chunk.getValue().length > 0) {
328 MAPIRtfAttribute rtf = new MAPIRtfAttribute(
329 MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue()
330 );
331 Parser rtfParser =
332 EmbeddedDocumentUtil.tryToFindExistingLeafParser(RTFParser.class, parseContext);
333 if (rtfParser == null) {
334 rtfParser = new RTFParser();
335 }
336 rtfParser.parse(
337 new ByteArrayInputStream(rtf.getData()),
338 new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
339 new Metadata(), parseContext);
340 doneBody = true;
341 }
336342 }
337343 if (textChunk != null && (extractAllAlternatives || !doneBody)) {
338344 xhtml.element("p", ((StringChunk) textChunk).getValue());
2424 import java.net.URI;
2525 import java.util.HashMap;
2626 import java.util.HashSet;
27 import java.util.Iterator;
2728 import java.util.List;
2829 import java.util.Map;
2930 import java.util.Set;
3940 import org.apache.poi.openxml4j.opc.TargetMode;
4041 import org.apache.poi.openxml4j.opc.internal.FileHelper;
4142 import org.apache.poi.poifs.filesystem.DirectoryNode;
43 import org.apache.poi.poifs.filesystem.DocumentEntry;
44 import org.apache.poi.poifs.filesystem.Entry;
4245 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
4346 import org.apache.poi.poifs.filesystem.Ole10Native;
4447 import org.apache.poi.poifs.filesystem.Ole10NativeException;
298301 DirectoryNode root = fs.getRoot();
299302 POIFSDocumentType type = POIFSDocumentType.detectType(root);
300303
301 if (root.hasEntry("CONTENTS")
302 && root.hasEntry("\u0001Ole")
303 && root.hasEntry("\u0001CompObj")) {
304 if (root.hasEntry("\u0001Ole")
305 && root.hasEntry("\u0001CompObj")
306 && (
307 root.hasEntry("CONTENTS") || root.hasEntry("Package")
308 )) {
304309 // TIKA-704: OLE 2.0 embedded non-Office document?
305310 //TODO: figure out if the equivalent of OLE 1.0's
306311 //getCommand() and getFileName() exist for OLE 2.0 to populate
307312 //TikaCoreProperties.ORIGINAL_RESOURCE_NAME
308 stream = TikaInputStream.get(
309 fs.createDocumentInputStream("CONTENTS"));
313 if (root.hasEntry("CONTENTS")) {
314 stream = TikaInputStream.get(
315 fs.createDocumentInputStream("CONTENTS"));
316 } else if (root.hasEntry("Package")) {
317 //TIKA-2588
318 stream = TikaInputStream.get(
319 fs.createDocumentInputStream("Package"));
320 } else {
321 throw new IllegalStateException("Shouldn't ever arrive here; please open a ticket on our jira");
322 }
310323 if (embeddedExtractor.shouldParseEmbedded(metadata)) {
311324 embeddedExtractor.parseEmbedded(
312325 stream, new EmbeddedContentHandler(handler),
3535 import org.apache.tika.metadata.Property;
3636 import org.apache.tika.metadata.TikaCoreProperties;
3737 import org.apache.tika.parser.microsoft.SummaryExtractor;
38 import org.apache.tika.parser.microsoft.ooxml.xps.XPSTextExtractor;
3839 import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor;
3940 import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
4041 import org.apache.xmlbeans.impl.values.XmlValueOutOfRangeException;
6061 if (extractor.getDocument() != null ||
6162 ((extractor instanceof XSSFEventBasedExcelExtractor ||
6263 extractor instanceof XWPFEventBasedWordExtractor ||
63 extractor instanceof XSLFEventBasedPowerPointExtractor) &&
64 extractor instanceof XSLFEventBasedPowerPointExtractor ||
65 extractor instanceof XPSTextExtractor) &&
6466 extractor.getPackage() != null)) {
6567 extractMetadata(extractor.getCoreProperties(), metadata);
6668 extractMetadata(extractor.getExtendedProperties(), metadata);
4444 import org.apache.tika.parser.EmptyParser;
4545 import org.apache.tika.parser.ParseContext;
4646 import org.apache.tika.parser.microsoft.OfficeParserConfig;
47 import org.apache.tika.parser.microsoft.ooxml.xps.XPSExtractorDecorator;
48 import org.apache.tika.parser.microsoft.ooxml.xps.XPSTextExtractor;
4749 import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor;
4850 import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
4951 import org.apache.tika.parser.pkg.ZipContainerDetector;
6567 ExtractorFactory.setThreadPrefersEventExtractors(true);
6668
6769 try {
68 OOXMLExtractor extractor;
70 OOXMLExtractor extractor = null;
6971 OPCPackage pkg;
7072
7173 // Locate or Open the OPCPackage for the file
8284
8385 // Get the type, and ensure it's one we handle
8486 MediaType type = ZipContainerDetector.detectOfficeOpenXML(pkg);
87 if (type == null) {
88 type = ZipContainerDetector.detectXPSOPC(pkg);
89 }
90
8591 if (type == null || OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(type)) {
8692 // Not a supported type, delegate to Empty Parser
8793 EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context);
8894 return;
8995 }
9096 metadata.set(Metadata.CONTENT_TYPE, type.toString());
91
9297 // Have the appropriate OOXML text extractor picked
9398 POIXMLTextExtractor poiExtractor = null;
9499 // This has already been set by OOXMLParser's call to configure()
100105 if (poiExtractor == null && config.getUseSAXPptxExtractor()) {
101106 poiExtractor = trySXSLF(pkg);
102107 }
108 if (type.equals(OOXMLParser.XPS)) {
109 poiExtractor = new XPSTextExtractor(pkg);
110 }
111
103112 if (poiExtractor == null) {
104113 poiExtractor = ExtractorFactory.createExtractor(pkg);
105114 }
118127 extractor = new SXSLFPowerPointExtractorDecorator(metadata, context,
119128 (XSLFEventBasedPowerPointExtractor) poiExtractor);
120129 metadata.add("X-Parsed-By", XSLFEventBasedPowerPointExtractor.class.getCanonicalName());
130 } else if (poiExtractor instanceof XPSTextExtractor) {
131 extractor = new XPSExtractorDecorator(context, poiExtractor);
121132 } else if (document == null) {
122133 throw new TikaException(
123134 "Expecting UserModel based POI OOXML extractor with a document, but none found. " +
3939 //turn off POI's zip bomb detection because we have our own
4040 ZipSecureFile.setMinInflateRatio(-1.0d);
4141 }
42
43 protected static final MediaType XPS = MediaType.application("vnd.ms-xpsdocument");
4244
4345 protected static final Set<MediaType> SUPPORTED_TYPES =
4446 Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
8385 * by Tika and/or POI.
8486 */
8587 protected static final Set<MediaType> UNSUPPORTED_OOXML_TYPES =
86 Collections.singleton(
88 Collections.EMPTY_SET;
89 //TODO: should we do a singleton for dwfx+xps?
90 /*Collections.singleton(
8791 MediaType.application("vnd.ms-xpsdocument")
88 );
92 );*/
8993 /**
9094 * Serial version UID
9195 */
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.parser.microsoft.ooxml.xps;
18
19 import org.apache.commons.io.IOUtils;
20 import org.apache.commons.io.input.CloseShieldInputStream;
21 import org.apache.poi.POIXMLDocument;
22 import org.apache.poi.POIXMLTextExtractor;
23 import org.apache.poi.openxml4j.opc.PackagePart;
24 import org.apache.poi.openxml4j.opc.PackageRelationship;
25 import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
26 import org.apache.poi.openxml4j.opc.ZipPackage;
27 import org.apache.poi.openxml4j.util.ZipEntrySource;
28 import org.apache.tika.exception.TikaException;
29 import org.apache.tika.extractor.EmbeddedDocumentUtil;
30 import org.apache.tika.metadata.Metadata;
31 import org.apache.tika.metadata.TikaCoreProperties;
32 import org.apache.tika.parser.ParseContext;
33 import org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor;
34 import org.apache.tika.sax.EmbeddedContentHandler;
35 import org.apache.tika.sax.OfflineContentHandler;
36 import org.apache.tika.sax.XHTMLContentHandler;
37 import org.apache.tika.utils.ExceptionUtils;
38 import org.xml.sax.Attributes;
39 import org.xml.sax.SAXException;
40 import org.xml.sax.helpers.DefaultHandler;
41
42 import java.io.IOException;
43 import java.io.InputStream;
44 import java.util.Collections;
45 import java.util.Enumeration;
46 import java.util.HashMap;
47 import java.util.List;
48 import java.util.Map;
49 import java.util.zip.ZipEntry;
50
51 public class XPSExtractorDecorator extends AbstractOOXMLExtractor {
52
53 private static String XPS_DOCUMENT = "http://schemas.microsoft.com/xps/2005/06/fixedrepresentation";
54
55 private final ParseContext context;
56 private final ZipPackage pkg;
57 Map<String, Metadata> embeddedImages = new HashMap<>();
58
59 public XPSExtractorDecorator(ParseContext context, POIXMLTextExtractor extractor) throws TikaException {
60 super(context, extractor);
61 this.context = context;
62 if (extractor.getPackage() instanceof ZipPackage) {
63 this.pkg = (ZipPackage) extractor.getPackage();
64 } else {
65 throw new TikaException("OPCPackage must be a ZipPackage");
66 }
67 }
68
69 @Override
70 public POIXMLDocument getDocument() {
71 return null;
72 }
73
74
75 @Override
76 protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException {
77
78 PackageRelationshipCollection prc = pkg.getRelationshipsByType(XPS_DOCUMENT);
79 for (int i = 0; i < prc.size(); i++) {
80 PackageRelationship pr = prc.getRelationship(i);
81
82 //there should only be one.
83 //in the test file, this points to FixedDocSeq.fdseq
84 try {
85 handleDocuments(pr, xhtml);
86 } catch (TikaException e) {
87 throw new SAXException(e);
88 }
89 }
90
91 //now handle embedded images
92 if (embeddedImages.size() > 0) {
93 EmbeddedDocumentUtil embeddedDocumentUtil = new EmbeddedDocumentUtil(context);
94 for (Map.Entry<String, Metadata> embeddedImage : embeddedImages.entrySet()) {
95 String zipPath = embeddedImage.getKey();
96 Metadata metadata = embeddedImage.getValue();
97 if (embeddedDocumentUtil.shouldParseEmbedded(metadata)) {
98 handleEmbeddedImage(
99 zipPath,
100 metadata,
101 embeddedDocumentUtil,
102 xhtml);
103 }
104 }
105 }
106
107 }
108
109 private void handleEmbeddedImage(String zipPath, Metadata metadata,
110 EmbeddedDocumentUtil embeddedDocumentUtil,
111 XHTMLContentHandler xhtml) throws SAXException, IOException {
112 InputStream stream = null;
113 try {
114 stream = getZipStream(zipPath, pkg);
115 } catch (IOException|TikaException e) {
116 //store this exception in the parent's metadata
117 EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
118 return;
119 }
120
121 try {
122 embeddedDocumentUtil.parseEmbedded(stream, xhtml, metadata, true);
123 } finally {
124 IOUtils.closeQuietly(stream);
125 }
126 }
127
128 private void handleDocuments(PackageRelationship packageRelationship,
129 XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
130
131 try (InputStream stream = pkg.getPart(packageRelationship).getInputStream()) {
132 context.getSAXParser().parse(
133 new CloseShieldInputStream(stream),
134 new OfflineContentHandler(new EmbeddedContentHandler(
135 new FixedDocSeqHandler(xhtml))));
136 }
137 }
138
139 @Override
140 protected List<PackagePart> getMainDocumentParts() throws TikaException {
141 return Collections.EMPTY_LIST;
142 }
143
144 private class FixedDocSeqHandler extends DefaultHandler {
145 private final static String DOCUMENT_REFERENCE = "DocumentReference";
146 private final static String SOURCE = "Source";
147
148 private final XHTMLContentHandler xhtml;
149
150 private FixedDocSeqHandler(XHTMLContentHandler xhtml) {
151 this.xhtml = xhtml;
152 }
153
154 @Override
155 public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
156 if (!DOCUMENT_REFERENCE.equals(localName)) {
157 return;
158 }
159 for (int i = 0; i < atts.getLength(); i++) {
160 String lName = atts.getLocalName(i);
161 if (SOURCE.equals(lName)) {
162 handleDocumentRef(atts.getValue(i));
163 }
164 }
165 }
166
167 private void handleDocumentRef(String docRef) throws SAXException {
168 //docRef is a path to a FixedDocumentSequence document,
169 // e.g. /Documents/1/FixedDoc.fdoc
170
171 //relative root is /Documents/1 ..need this Pages...
172 String relativeRoot = null;
173 int i = docRef.lastIndexOf("/");
174 if (i > 0) {
175 relativeRoot = docRef.substring(0, i);
176 } else {
177 relativeRoot = "";
178 }
179 String zipPath = (docRef.startsWith("/") ? docRef.substring(1) : docRef);
180 if (pkg instanceof ZipPackage) {
181 try (InputStream stream = getZipStream(zipPath, pkg)) {
182 context.getSAXParser().parse(
183 new CloseShieldInputStream(stream),
184 new OfflineContentHandler(new EmbeddedContentHandler(
185 new PageContentPartHandler(relativeRoot, xhtml))));
186
187 } catch (IOException | TikaException e) {
188 throw new SAXException(new TikaException("IOException trying to read: " + docRef));
189 }
190 } else {
191 throw new SAXException(new TikaException("Package must be ZipPackage"));
192 }
193 }
194
195 private class PageContentPartHandler extends DefaultHandler {
196 private static final String PAGE_CONTENT = "PageContent";
197 private static final String SOURCE = "Source";
198
199 private final String relativeRoot;
200 private final XHTMLContentHandler xhtml;
201
202 private PageContentPartHandler(String relativeRoot, XHTMLContentHandler xhtml) {
203 this.relativeRoot = relativeRoot;
204 this.xhtml = xhtml;
205 }
206
207 @Override
208 public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
209 if (!PAGE_CONTENT.equals(localName)) {
210 return;
211 }
212 String pagePath = null;
213 for (int i = 0; i < atts.getLength(); i++) {
214 if (SOURCE.equals(atts.getLocalName(i))) {
215 pagePath = atts.getValue(i);
216 break;
217 }
218 }
219
220 if (pagePath != null) {
221 if (!pagePath.startsWith("/")) {
222 pagePath = relativeRoot + "/" + pagePath;
223 }
224 //trim initial /
225 if (pagePath.startsWith("/")) {
226 pagePath = pagePath.substring(1);
227 }
228 try (InputStream stream = getZipStream(pagePath, pkg)) {
229 context.getSAXParser().parse(
230 new CloseShieldInputStream(stream),
231 new OfflineContentHandler(
232 new XPSPageContentHandler(xhtml, embeddedImages)
233 )
234 );
235 } catch (TikaException | IOException e) {
236 throw new SAXException(e);
237 }
238 }
239
240 }
241 }
242 }
243
244 private static InputStream getZipStream(String zipPath, ZipPackage zipPackage) throws IOException, TikaException {
245 String targPath = (zipPath.length() > 1 && zipPath.startsWith("/") ? zipPath.substring(1) : zipPath);
246 ZipEntrySource zipEntrySource = zipPackage.getZipArchive();
247 Enumeration<? extends ZipEntry> zipEntryEnumeration = zipEntrySource.getEntries();
248 ZipEntry zipEntry = null;
249 while (zipEntryEnumeration.hasMoreElements()) {
250 ZipEntry ze = zipEntryEnumeration.nextElement();
251 if (ze.getName().equals(targPath)) {
252 zipEntry = ze;
253 break;
254 }
255 }
256 if (zipEntry == null) {
257 throw new TikaException("Couldn't find required zip entry: " + zipPath);
258 }
259 return zipEntrySource.getInputStream(zipEntry);
260 }
261 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft.ooxml.xps;
17
18 import org.apache.tika.metadata.Metadata;
19 import org.apache.tika.metadata.TikaCoreProperties;
20 import org.apache.tika.sax.XHTMLContentHandler;
21 import org.xml.sax.Attributes;
22 import org.xml.sax.SAXException;
23 import org.xml.sax.helpers.DefaultHandler;
24
25 import java.util.ArrayList;
26 import java.util.Collections;
27 import java.util.Comparator;
28 import java.util.LinkedHashMap;
29 import java.util.LinkedHashSet;
30 import java.util.List;
31 import java.util.Map;
32 import java.util.Set;
33 import java.util.Stack;
34
35
36 /**
37 * Handles an individual page. For now, this marks up
38 * canvas entities in a &lt;div&gt; tag. Based on the spec,
39 * it currently relies on order within the xml for order of output
40 * of text to xhtml. We could do more complex processing of coordinates
41 * for bidi-languages, but the spec implies that we should be able
42 * to rely on storage order.
43 * <p/>
44 * As with our PDFParser, this currently dumps urls at the bottom of the page
45 * and does not attempt to calculate the correct anchor text.
46 * <p/>
47 * TODO: integrate table markup
48 */
49 class XPSPageContentHandler extends DefaultHandler {
50
51 private static final String GLYPHS = "Glyphs";
52 private static final String CANVAS = "Canvas";
53 private static final String CLIP = "Clip";
54 private static final String NULL_CLIP = "NULL_CLIP";
55 private static final String UNICODE_STRING = "UnicodeString";
56 private static final String ORIGIN_X = "OriginX";
57 private static final String ORIGIN_Y = "OriginY";
58 private static final String BIDI_LEVEL = "BidiLevel";
59 private static final String INDICES = "Indices";
60 private static final String NAME = "Name";
61 private static final String PATH = "Path";
62 private static final String NAVIGATE_URI = "FixedPage.NavigateUri";
63 private static final String IMAGE_SOURCE = "ImageSource";
64 private static final String IMAGE_BRUSH = "ImageBrush";
65 private static final String AUTOMATION_PROPERITES_HELP_TEXT = "AutomationProperties.HelpText";
66
67 private static final String URL_DIV = "urls";
68 private static final String DIV = "div";
69 private static final String CLASS = "class";
70 private static final String PAGE = "page";
71 private static final String CANVAS_SAX = "canvas";
72 private static final String P = "p";
73 private static final String HREF = "href";
74 private static final String A = "a";
75
76
77 private final XHTMLContentHandler xhml;
78
79 //path in zip file for an image rendered on this page
80 private String imageSourcePathInZip = null;
81 //embedded images sometimes include full path info of original image
82 private String originalLocationOnDrive = null;
83
84 //buffer for the glyph runs within a given canvas
85 //in insertion order
86 private Map<String, List<GlyphRun>> canvases = new LinkedHashMap<>();
87
88 private Set<String> urls = new LinkedHashSet();
89 private Stack<String> canvasStack = new Stack<>();
90 private final Map<String, Metadata> embeddedInfos;
91 //sort based on y coordinate of first element in each row
92 //this requires every row to have at least one element
93 private static Comparator<? super List<GlyphRun>> ROW_SORTER = new Comparator<List<GlyphRun>>() {
94 @Override
95 public int compare(List<GlyphRun> o1, List<GlyphRun> o2) {
96 if (o1.get(0).originY < o2.get(0).originY) {
97 return -1;
98 } else if (o1.get(0).originY > o2.get(0).originY) {
99 return 1;
100 }
101 return 0;
102 }
103 };
104
105 public XPSPageContentHandler(XHTMLContentHandler xhtml, Map<String, Metadata> embeddedInfos) {
106 this.xhml = xhtml;
107 this.embeddedInfos = embeddedInfos;
108 }
109
110 @Override
111 public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
112 if (CANVAS.equals(localName)) {
113 String clip = getVal(CLIP, atts);
114 if (clip == null) {
115 canvasStack.push(NULL_CLIP);
116 } else {
117 canvasStack.push(clip);
118 }
119 return;
120 } else if (PATH.equals(localName)) {
121 //for now just grab them and dump them at the end of the page.
122 String url = getVal(NAVIGATE_URI, atts);
123 if (url != null) {
124 urls.add(url);
125 }
126 originalLocationOnDrive = getVal(AUTOMATION_PROPERITES_HELP_TEXT, atts);
127 } else if (IMAGE_BRUSH.equals(localName)) {
128 imageSourcePathInZip = getVal(IMAGE_SOURCE, atts);
129 }
130
131 if (!GLYPHS.equals(localName)) {
132 return;
133 }
134 String name = null;
135 Float originX = null;
136 Float originY = null;
137 String unicodeString = null;
138 Integer bidilevel = 1;
139 String indicesString = null;
140
141 for (int i = 0; i < atts.getLength(); i++) {
142 String lName = atts.getLocalName(i);
143 String value = atts.getValue(i);
144 value = (value == null) ? "" : value.trim();
145
146 if (ORIGIN_X.equals(lName) && value.length() > 0) {
147 try {
148 originX = Float.parseFloat(atts.getValue(i));
149 } catch (NumberFormatException e) {
150 throw new SAXException(e);
151 }
152 } else if (ORIGIN_Y.equals(lName) && value.length() > 0) {
153 try {
154 originY = Float.parseFloat(atts.getValue(i));
155 } catch (NumberFormatException e) {
156 throw new SAXException(e);
157 }
158 } else if (UNICODE_STRING.equals(lName)) {
159 unicodeString = atts.getValue(i);
160 } else if (BIDI_LEVEL.equals(lName) && value.length() > 0) {
161 try {
162 bidilevel = Integer.parseInt(atts.getValue(i));
163 } catch (NumberFormatException e) {
164 throw new SAXException(e);
165 }
166 } else if (INDICES.equals(lName)) {
167 indicesString = atts.getValue(i);
168 } else if (NAME.equals(lName)) {
169 name = value;
170 }
171 }
172 if (unicodeString != null) {
173 originX = (originX == null) ? Integer.MIN_VALUE : originX;
174 originY = (originY == null) ? Integer.MAX_VALUE : originY;
175 String currentCanvasClip = (canvasStack.size() > 0) ? canvasStack.peek() : NULL_CLIP;
176 List<GlyphRun> runs = canvases.get(currentCanvasClip);
177 if (runs == null) {
178 runs = new ArrayList<>();
179 }
180 runs.add(new GlyphRun(name, originY, originX, unicodeString, bidilevel, indicesString));
181 canvases.put(currentCanvasClip, runs);
182 }
183
184 }
185
186 @Override
187 public void endElement(String uri, String localName, String qName) throws SAXException {
188 if (CANVAS.equals(localName)) {
189 if (! canvasStack.isEmpty()) {
190 canvasStack.pop();
191 }
192 } else if (PATH.equals(localName)) {
193 //this assumes that there cannot be a path within a path
194 //not sure if this is true or if we need to track path depth
195 if (imageSourcePathInZip != null) {
196 Metadata m = embeddedInfos.get(imageSourcePathInZip);
197 if (m == null) {
198 m = new Metadata();
199 }
200 if (originalLocationOnDrive != null) {
201 String val = m.get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME);
202 if (val == null) {
203 m.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, originalLocationOnDrive);
204 }
205 }
206 m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
207 TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
208 embeddedInfos.put(imageSourcePathInZip, m);
209 }
210 //reset
211 imageSourcePathInZip = null;
212 originalLocationOnDrive = null;
213 }
214 }
    //opens the <div class="page"> wrapper for this page's content
    @Override
    public void startDocument() throws SAXException {
        xhml.startElement(DIV, CLASS, PAGE);
    }
219
    //flushes the buffered canvases and urls, then closes the page div
    @Override
    public void endDocument() throws SAXException {
        writePage();
        xhml.endElement(DIV);
    }
225
226
227 private final void writePage() throws SAXException {
228 if (canvases.size() == 0) {
229 return;
230 }
231
232 for (Map.Entry<String, List<GlyphRun>> e : canvases.entrySet()) {
233 String clip = e.getKey();
234 List<GlyphRun> runs = e.getValue();
235 if (runs.size() == 0) {
236 continue;
237 }
238 xhml.startElement(DIV, CLASS, CANVAS_SAX);
239 //a list of rows sorted by the y of the first element in each row
240 List<List<GlyphRun>> rows = buildRows(runs);
241 for (List<GlyphRun> row : rows) {
242 writeRow(row);
243 }
244 xhml.endElement(DIV);
245 }
246 //for now just dump the urls at the end of the page
247 //At some point, we could link them back up to their
248 //true anchor text.
249 if (urls.size() > 0) {
250 xhml.startElement(DIV, CLASS, URL_DIV);
251 for (String u : urls) {
252 xhml.startElement(A, HREF, u);
253 xhml.characters(u);
254 xhml.endElement(A);
255 }
256 xhml.endElement(DIV);
257 }
258 canvases.clear();
259 }
260
261 private void writeRow(List<GlyphRun> row) throws SAXException {
262 /*
263 int rtl = 0;
264 int ltr = 0;
265 //if the row is entirely rtl, sort all as rtl
266 //otherwise sort ltr
267 for (GlyphRun r : row) {
268 //ignore directionality of pure spaces
269 if (r.unicodeString == null || r.unicodeString.trim().length() == 0) {
270 continue;
271 }
272 if (r.direction == GlyphRun.DIRECTION.RTL) {
273 rtl++;
274 } else {
275 ltr++;
276 }
277 }
278 if (rtl > 0 && ltr == 0) {
279 Collections.sort(row, GlyphRun.RTL_COMPARATOR);
280 } else {
281 Collections.sort(row, GlyphRun.LTR_COMPARATOR);
282 }*/
283
284 xhml.startElement(P);
285 for (GlyphRun run : row) {
286 //figure out if you need to add a space
287 xhml.characters(run.unicodeString);
288 }
289 xhml.endElement(P);
290 }
291
292 //returns a List of rows (where a row is a list of glyphruns)
293 //the List is sorted in increasing order of the first y of each row
294 private List<List<GlyphRun>> buildRows(List<GlyphRun> glyphRuns) {
295 List<List<GlyphRun>> rows = new ArrayList<>();
296 float maxY = -1.0f;
297 for (GlyphRun glyphRun : glyphRuns) {
298 if (rows.size() == 0) {
299 List<GlyphRun> row = new ArrayList<>();
300 row.add(glyphRun);
301 rows.add(row);
302 continue;
303 } else {
304 boolean addedNewRow = false;
305 //can rely on the last row having the highest y
306 List<GlyphRun> row = rows.get(rows.size()-1);
307 //0.5 is a purely heuristic/magical number that should be derived
308 //from the data, not made up. TODO: fix this
309 if (Math.abs(glyphRun.originY -row.get(0).originY) < 0.5) {
310 row.add(glyphRun);
311 } else {
312 row = new ArrayList<>();
313 row.add(glyphRun);
314 rows.add(row);
315 addedNewRow = true;
316 }
317 //sort rows so that they are in ascending order of y
318 //in most xps files in our test corpus, this is never triggered
319 //because the runs are already ordered correctly
320 if (maxY > -1.0f && addedNewRow && glyphRun.originY < maxY) {
321 Collections.sort(rows, ROW_SORTER);
322 }
323 if (glyphRun.originY > maxY) {
324 maxY = glyphRun.originY;
325 }
326 }
327 }
328 return rows;
329 }
330
331 private static String getVal(String localName, Attributes atts) {
332 for (int i = 0; i < atts.getLength(); i++) {
333 if (localName.equals(atts.getLocalName(i))) {
334 return atts.getValue(i);
335 }
336 }
337 return null;
338 }
339
340 final static class GlyphRun {
341
342 private enum DIRECTION {
343 LTR,
344 RTL
345 }
346
347 //TODO: use name in conjunction with Frag information
348 //to do a better job of extracting paragraph and table structure
349 private final String name;
350 private final float originY;
351 private final float originX;//not currently used, but could be used for bidi text calculations
352 private final String unicodeString;
353 private final String indicesString;//not currently used, but could be used for width calculations
354
355 //not used yet
356 private final DIRECTION direction;
357
358 private GlyphRun(String name, float originY, float originX, String unicodeString, Integer bidiLevel, String indicesString) {
359 this.name = name;
360 this.unicodeString = unicodeString;
361 this.originY = originY;
362 this.originX = originX;
363 if (bidiLevel == null) {
364 direction = DIRECTION.LTR;
365 } else {
366 if (bidiLevel % 2 == 0) {
367 direction = DIRECTION.LTR;
368 } else {
369 direction = DIRECTION.RTL;
370 }
371 }
372 this.indicesString = indicesString;
373 }
374 }
375
376 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.parser.microsoft.ooxml.xps;
18
19 import org.apache.poi.POIXMLDocument;
20 import org.apache.poi.POIXMLProperties;
21 import org.apache.poi.POIXMLTextExtractor;
22 import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
23 import org.apache.poi.openxml4j.opc.OPCPackage;
24 import org.apache.xmlbeans.XmlException;
25
26 import java.io.IOException;
27
28 /**
29 * Currently, mostly a pass-through class to hold pkg and properties
30 * and keep the general framework similar to our other POI-integrated
31 * extractors.
32 */
33 public class XPSTextExtractor extends POIXMLTextExtractor {
34
35 private final OPCPackage pkg;
36 private final POIXMLProperties properties;
37
38 public XPSTextExtractor(OPCPackage pkg) throws OpenXML4JException, XmlException, IOException {
39 super((POIXMLDocument)null);
40 this.pkg = pkg;
41 this.properties = new POIXMLProperties(pkg);
42
43 }
44
45 @Override
46 public OPCPackage getPackage() {
47 return pkg;
48 }
49
50 @Override
51 public String getText() {
52 return null;
53 }
54 public POIXMLProperties.CoreProperties getCoreProperties() {
55 return this.properties.getCoreProperties();
56 }
57
58 public POIXMLProperties.ExtendedProperties getExtendedProperties() {
59 return this.properties.getExtendedProperties();
60 }
61
62 public POIXMLProperties.CustomProperties getCustomProperties() {
63 return this.properties.getCustomProperties();
64 }
65 }
2828
2929 import org.apache.tika.io.IOUtils;
3030 import org.apache.tika.parser.ner.NERecogniser;
31 import org.json.JSONException;
32 import org.json.JSONObject;
31 import com.github.openjson.JSONException;
32 import com.github.openjson.JSONObject;
3333 import org.slf4j.Logger;
3434 import org.slf4j.LoggerFactory;
3535
1515 */
1616 package org.apache.tika.parser.ocr;
1717
18 import org.apache.commons.io.FilenameUtils;
19
1820 import java.io.File;
1921 import java.io.IOException;
2022 import java.io.InputStream;
2123 import java.io.Serializable;
24 import java.util.HashMap;
2225 import java.util.Locale;
26 import java.util.Map;
2327 import java.util.Properties;
28 import java.util.regex.Matcher;
29 import java.util.regex.Pattern;
2430
2531 /**
2632 * Configuration for TesseractOCRParser.
4046
4147 private static final long serialVersionUID = -4861942486845757891L;
4248
49 private static Pattern ALLOWABLE_PAGE_SEPARATORS_PATTERN =
50 Pattern.compile("(?i)^[-_/\\.A-Z0-9]+$");
51
52 private static Pattern ALLOWABLE_OTHER_PARAMS_PATTERN =
53 Pattern.compile("(?i)^[-_/\\.A-Z0-9]+$");
54
4355 public enum OUTPUT_TYPE {
4456 TXT,
4557 HOCR
7385 private int enableImageProcessing = 0;
7486
7587 // Path to ImageMagick program, if not on system path.
76 private String ImageMagickPath = "";
88 private String imageMagickPath = "";
7789
7890 // resolution of processed image (in dpi).
7991 private int density = 300;
90102 // factor by which image is to be scaled.
91103 private int resize = 900;
92104
105 // See setPageSeparator.
106 private String pageSeparator = "";
107
93108 // whether or not to preserve interword spacing
94109 private boolean preserveInterwordSpacing = false;
95110
96111 // whether or not to apply rotation calculated by the rotation.py script
97112 private boolean applyRotation = false;
113
114 // See addOtherTesseractConfig.
115 private Map<String, String> otherTesseractConfig = new HashMap<>();
98116
99117
100118 /**
148166 getProp(props, "maxFileSizeToOcr", getMaxFileSizeToOcr()));
149167 setTimeout(
150168 getProp(props, "timeout", getTimeout()));
151 String outputTypeString = props.getProperty("outputType");
152 if ("txt".equals(outputTypeString)) {
153 setOutputType(OUTPUT_TYPE.TXT);
154 } else if ("hocr".equals(outputTypeString)) {
155 setOutputType(OUTPUT_TYPE.HOCR);
156 }
169 setOutputType(getProp(props, "outputType", getOutputType().toString()));
157170 setPreserveInterwordSpacing(getProp(props, "preserveInterwordSpacing", false));
158171
159172 // set parameters for ImageMagick
174187 setApplyRotation(
175188 getProp(props, "applyRotation", getApplyRotation()));
176189
190 loadOtherTesseractConfig(props);
177191 }
178192
179193 /**
184198 }
185199
186200 /**
187 * Set the path to the Tesseract executable, needed if it is not on system path.
201 * Set the path to the Tesseract executable's directory, needed if it is not on system path.
188202 * <p>
189203 * Note that if you set this value, it is highly recommended that you also
190204 * set the path to the 'tessdata' folder using {@link #setTessdataPath}.
191205 * </p>
192206 */
193207 public void setTesseractPath(String tesseractPath) {
208
209 tesseractPath = FilenameUtils.normalize(tesseractPath);
194210 if (!tesseractPath.isEmpty() && !tesseractPath.endsWith(File.separator))
195211 tesseractPath += File.separator;
196212
210226 * (such as when Tesseract is built from source), it may be located elsewhere.
211227 */
212228 public void setTessdataPath(String tessdataPath) {
229 tessdataPath = FilenameUtils.normalize(tessdataPath);
213230 if (!tessdataPath.isEmpty() && !tessdataPath.endsWith(File.separator))
214231 tessdataPath += File.separator;
215232
255272 }
256273
257274 /**
275 * @see #setPageSeparator(String pageSeparator)
276 */
277 public String getPageSeparator() {
278 return pageSeparator;
279 }
280
281 /**
282 * The page separator to use in plain text output. This corresponds to Tesseract's page_separator config option.
283 * The default here is the empty string (i.e. no page separators). Note that this is also the default in
284 * Tesseract 3.x, but in Tesseract 4.0 the default is to use the form feed control character. We are overriding
285 * Tesseract 4.0's default here.
286 *
287 * @param pageSeparator
288 */
289 public void setPageSeparator(String pageSeparator) {
290 Matcher m = ALLOWABLE_PAGE_SEPARATORS_PATTERN.matcher(pageSeparator);
291 if (! m.find()) {
292 throw new IllegalArgumentException(pageSeparator + " contains illegal characters.\n"+
293 "If you trust this value, set it with setTrustedPageSeparator");
294 }
295 setTrustedPageSeparator(pageSeparator);
296 }
297
298 /**
299 * Same as {@link #setPageSeparator(String)} but does not perform
300 * any checks on the string.
301 * @param pageSeparator
302 */
303 public void setTrustedPageSeparator(String pageSeparator) {
304 this.pageSeparator = pageSeparator;
305 }
306
307 /**
258308 * Whether or not to maintain interword spacing. Default is <code>false</code>.
259309 *
260310 * @param preserveInterwordSpacing
318368
319369 /**
320370 * Set output type from ocr process. Default is "txt", but can be "hocr".
321 * Default value is 120s.
371 * Default value is {@link OUTPUT_TYPE#TXT}.
322372 */
323373 public void setOutputType(OUTPUT_TYPE outputType) {
324374 this.outputType = outputType;
375 }
376
377 public void setOutputType(String outputType) {
378 if (outputType == null) {
379 throw new IllegalArgumentException("outputType must not be null");
380 }
381 String lc = outputType.toLowerCase(Locale.US);
382 if ("txt".equals(lc)) {
383 setOutputType(OUTPUT_TYPE.TXT);
384 } else if ("hocr".equals(lc)) {
385 setOutputType(OUTPUT_TYPE.HOCR);
386 } else {
387 throw new IllegalArgumentException("outputType must be either 'txt' or 'hocr'");
388 }
389
390
325391 }
326392
327393 /**
399465 * Deafult value is gray.
400466 */
401467 public void setColorspace(String colorspace) {
402 if (!colorspace.equals(null)) {
403 this.colorspace = colorspace;
404 } else {
468 if (colorspace == null) {
405469 throw new IllegalArgumentException("Colorspace value cannot be null.");
406470 }
471 if (! colorspace.matches("(?i)^[-_A-Z0-9]+$")) {
472 throw new IllegalArgumentException("colorspace must match this pattern: (?i)^[-_A-Z0-9]+$");
473 }
474 this.colorspace = colorspace;
407475 }
408476
409477 /**
456524 }
457525
458526 /**
459 * @return path to ImageMagick file.
460 * @see #setImageMagickPath(String ImageMagickPath)
527 * @return path to ImageMagick executable directory.
528 * @see #setImageMagickPath(String imageMagickPath)
461529 */
462530 public String getImageMagickPath() {
463531
464 return ImageMagickPath;
465 }
466
467 /**
468 * Set the path to the ImageMagick executable, needed if it is not on system path.
469 *
470 * @param ImageMagickPath to ImageMagick file.
471 */
472 public void setImageMagickPath(String ImageMagickPath) {
473 if (!ImageMagickPath.isEmpty() && !ImageMagickPath.endsWith(File.separator))
474 ImageMagickPath += File.separator;
475
476 this.ImageMagickPath = ImageMagickPath;
532 return imageMagickPath;
533 }
534
535 /**
536 * Set the path to the ImageMagick executable directory, needed if it is not on system path.
537 *
538 * @param imageMagickPath to ImageMagick executable directory.
539 */
540 public void setImageMagickPath(String imageMagickPath) {
541 imageMagickPath = FilenameUtils.normalize(imageMagickPath);
542 if (!imageMagickPath.isEmpty() && !imageMagickPath.endsWith(File.separator))
543 imageMagickPath += File.separator;
544
545 this.imageMagickPath = imageMagickPath;
477546 }
478547
479548 /**
487556 /**
488557 * Sets whether or not a rotation value should be calculated and passed to ImageMagick.
489558 *
490 * @param true to calculate and apply rotation, false to skip. Default is false, true required Python installed.
559 * @param applyRotation to calculate and apply rotation, false to skip. Default is false, true required Python installed.
491560 */
492561 public void setApplyRotation(boolean applyRotation) {
493562 this.applyRotation = applyRotation;
563 }
564
565 /**
566 * @see #addOtherTesseractConfig(String, String)
567 */
568 public Map<String, String> getOtherTesseractConfig() {
569 return otherTesseractConfig;
570 }
571
572 /**
573 * Add a key-value pair to pass to Tesseract using its -c command line option.
574 * To see the possible options, run tesseract --print-parameters.
575 *
576 * You may also add these parameters in TesseractOCRConfig.properties; any
577 * key-value pair in the properties file where the key contains an underscore
578 * is passed directly to Tesseract.
579 *
580 * @param key
581 * @param value
582 */
583 public void addOtherTesseractConfig(String key, String value) {
584 if (key == null) {
585 throw new IllegalArgumentException("key must not be null");
586 }
587 if (value == null) {
588 throw new IllegalArgumentException("value must not be null");
589 }
590
591 Matcher m = ALLOWABLE_OTHER_PARAMS_PATTERN.matcher(key);
592 if (! m.find()) {
593 throw new IllegalArgumentException("Key contains illegal characters: "+key);
594 }
595 m.reset(value);
596 if (! m.find()) {
597 throw new IllegalArgumentException("Value contains illegal characters: "+value);
598 }
599
600 otherTesseractConfig.put(key.trim(), value.trim());
494601 }
495602
496603 /**
542649 property, propVal));
543650 }
544651
652 /**
653 * Populate otherTesseractConfig from the given properties.
654 * This assumes that any key-value pair where the key contains
655 * an underscore is an option to be passed opaquely to Tesseract.
656 *
657 * @param properties properties file to read from.
658 */
659 private void loadOtherTesseractConfig(Properties properties) {
660 for (String k : properties.stringPropertyNames()) {
661 if (k.contains("_")) {
662 addOtherTesseractConfig(k, properties.getProperty(k));
663 }
664 }
665 }
545666 }
3232 import java.io.Reader;
3333 import java.nio.charset.Charset;
3434 import java.nio.file.Files;
35 import java.nio.file.Paths;
3536 import java.nio.file.StandardCopyOption;
37 import java.util.ArrayList;
3638 import java.util.Arrays;
3739 import java.util.Collections;
3840 import java.util.HashMap;
5254 import org.apache.commons.exec.PumpStreamHandler;
5355 import org.apache.commons.io.FileUtils;
5456 import org.apache.commons.io.IOUtils;
57 import org.apache.commons.lang.SystemUtils;
5558 import org.apache.tika.config.Initializable;
5659 import org.apache.tika.config.InitializableProblemHandler;
5760 import org.apache.tika.config.Param;
109112 MediaType.image("jpx"), MediaType.image("x-portable-pixmap")
110113 })));
111114 private static Map<String,Boolean> TESSERACT_PRESENT = new HashMap<>();
115 private static Map<String,Boolean> IMAGE_MAGICK_PRESENT = new HashMap<>();
112116
113117
114118 @Override
143147 if (TESSERACT_PRESENT.containsKey(tesseract)) {
144148 return TESSERACT_PRESENT.get(tesseract);
145149 }
150 //prevent memory bloat
151 if (TESSERACT_PRESENT.size() > 100) {
152 TESSERACT_PRESENT.clear();
153 }
154 //check that the parent directory exists
155 if (! config.getTesseractPath().isEmpty() &&
156 ! Files.isDirectory(Paths.get(config.getTesseractPath()))) {
157 TESSERACT_PRESENT.put(tesseract, false);
158 return false;
159 }
160
146161 // Try running Tesseract from there, and see if it exists + works
147162 String[] checkCmd = { tesseract };
148163 boolean hasTesseract = ExternalParser.check(checkCmd);
153168
154169 private boolean hasImageMagick(TesseractOCRConfig config) {
155170 // Fetch where the config says to find ImageMagick Program
156 String ImageMagick = config.getImageMagickPath() + getImageMagickProg();
171 String ImageMagick = getImageMagickPath(config);
157172
158173 // Have we already checked for a copy of ImageMagick Program there?
159 if (TESSERACT_PRESENT.containsKey(ImageMagick)) {
160 return TESSERACT_PRESENT.get(ImageMagick);
174 if (IMAGE_MAGICK_PRESENT.containsKey(ImageMagick)) {
175 return IMAGE_MAGICK_PRESENT.get(ImageMagick);
176 }
177 //prevent memory bloat
178 if (IMAGE_MAGICK_PRESENT.size() > 100) {
179 IMAGE_MAGICK_PRESENT.clear();
180 }
181 //check that directory exists
182 if (!config.getImageMagickPath().isEmpty() &&
183 ! Files.isDirectory(Paths.get(config.getImageMagickPath()))) {
184 IMAGE_MAGICK_PRESENT.put(ImageMagick, false);
185 return false;
186 }
187 if (SystemUtils.IS_OS_WINDOWS && config.getImageMagickPath().isEmpty()) {
188 LOG.warn("Must specify path for imagemagick on Windows OS to avoid accidental confusion with convert.exe");
189 IMAGE_MAGICK_PRESENT.put(ImageMagick, false);
190 return false;
161191 }
162192
163193 // Try running ImageMagick program from there, and see if it exists + works
164194 String[] checkCmd = { ImageMagick };
165195 boolean hasImageMagick = ExternalParser.check(checkCmd);
166 TESSERACT_PRESENT.put(ImageMagick, hasImageMagick);
196 IMAGE_MAGICK_PRESENT.put(ImageMagick, hasImageMagick);
167197
168198 return hasImageMagick;
169199
170200 }
171201
202 private String getImageMagickPath(TesseractOCRConfig config) {
203 return config.getImageMagickPath() + getImageMagickProg();
204 }
205
172206 static boolean hasPython() {
173207 // check if python is installed and it has the required dependencies for the rotation program to run
174208 boolean hasPython = false;
175
209 TemporaryResources tmp = null;
176210 try {
177 TemporaryResources tmp = new TemporaryResources();
211 tmp = new TemporaryResources();
178212 File importCheck = tmp.createTemporaryFile();
179213 String prg = "import numpy, matplotlib, skimage";
180214 OutputStreamWriter out = new OutputStreamWriter(new FileOutputStream(importCheck), Charset.forName("UTF-8"));
186220 hasPython = true;
187221 }
188222
189 tmp.close();
190223
191224 } catch (Exception e) {
192225
226 } finally {
227 IOUtils.closeQuietly(tmp);
193228 }
194229
195230 return hasPython;
305340
306341 /**
307342 * This method is used to process the image to an OCR-friendly format.
308 * @param streamingObject input image to be processed
343 * @param scratchFile input image to be processed
309344 * @param config TesseractOCRconfig class to get ImageMagick properties
310345 * @throws IOException if an input error occurred
311346 * @throws TikaException if an exception timed out
312347 */
313 private void processImage(File streamingObject, TesseractOCRConfig config) throws IOException, TikaException {
348 private void processImage(File scratchFile, TesseractOCRConfig config) throws IOException, TikaException {
314349
315350 // fetch rotation script from resources
316351 InputStream in = getClass().getResourceAsStream("rotation.py");
317352 TemporaryResources tmp = new TemporaryResources();
318353 File rotationScript = tmp.createTemporaryFile();
319354 Files.copy(in, rotationScript.toPath(), StandardCopyOption.REPLACE_EXISTING);
320
321 String cmd = "python " + rotationScript.getAbsolutePath() + " -f " + streamingObject.getAbsolutePath();
355
356 CommandLine commandLine = new CommandLine("python");
357 String[] args = {"-W",
358 "ignore",
359 rotationScript.getAbsolutePath(),
360 "-f",
361 scratchFile.getAbsolutePath()};
362 commandLine.addArguments(args, true);
322363 String angle = "0";
323364
324365 DefaultExecutor executor = new DefaultExecutor();
327368 executor.setStreamHandler(streamHandler);
328369
329370 // determine the angle of rotation required to make the text horizontal
330 CommandLine cmdLine = CommandLine.parse(cmd);
331371 if(config.getApplyRotation() && hasPython()) {
332372 try {
333 executor.execute(cmdLine);
334 angle = outputStream.toString("UTF-8").trim();
373 executor.execute(commandLine);
374 String tmpAngle = outputStream.toString("UTF-8").trim();
375 //verify that you've gotten a numeric value out
376 Double.parseDouble(tmpAngle);
377 angle = tmpAngle;
335378 } catch(Exception e) {
336379
337380 }
338381 }
339382
340383 // process the image - parameter values can be set in TesseractOCRConfig.properties
341 String line = "convert -density " + config.getDensity() + " -depth " + config.getDepth() +
342 " -colorspace " + config.getColorspace() + " -filter " + config.getFilter() +
343 " -resize " + config.getResize() + "% -rotate "+ angle + " " + streamingObject.getAbsolutePath() +
344 " " + streamingObject.getAbsolutePath();
345 cmdLine = CommandLine.parse(line);
384 commandLine = new CommandLine(getImageMagickPath(config));
385 args = new String[]{
386 "-density", Integer.toString(config.getDensity()),
387 "-depth ", Integer.toString(config.getDepth()),
388 "-colorspace", config.getColorspace(),
389 "-filter", config.getFilter(),
390 "-resize", config.getResize() + "%",
391 "-rotate", angle,
392 scratchFile.getAbsolutePath(),
393 scratchFile.getAbsolutePath()
394 };
395 commandLine.addArguments(args, true);
346396 try {
347 executor.execute(cmdLine);
397 executor.execute(commandLine);
348398 } catch(Exception e) {
349399
350400 }
460510 * if an input error occurred
461511 */
462512 private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException {
463 String[] cmd = { config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l",
464 config.getLanguage(), "-psm", config.getPageSegMode(),
465 config.getOutputType().name().toLowerCase(Locale.US),
513 ArrayList<String> cmd = new ArrayList<>(Arrays.asList(
514 config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l",
515 config.getLanguage(), "--psm", config.getPageSegMode()
516 ));
517 for (Map.Entry<String, String> entry : config.getOtherTesseractConfig().entrySet()) {
518 cmd.add("-c");
519 cmd.add(entry.getKey() + "=" + entry.getValue());
520 }
521 cmd.addAll(Arrays.asList(
522 "-c", "page_separator=" + config.getPageSeparator(),
466523 "-c",
467 (config.getPreserveInterwordSpacing())? "preserve_interword_spaces=1" : "preserve_interword_spaces=0"};
524 (config.getPreserveInterwordSpacing())? "preserve_interword_spaces=1" : "preserve_interword_spaces=0",
525 config.getOutputType().name().toLowerCase(Locale.US)
526 ));
468527 ProcessBuilder pb = new ProcessBuilder(cmd);
469528 setEnv(config, pb);
470529 final Process process = pb.start();
131131 throws IOException, SAXException, TikaException {
132132
133133 PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig);
134 if (localConfig.getSetKCMS()) {
135 System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider");
136 }
134137
135138 PDDocument pdfDocument = null;
136139
221224 metadata.set(AccessPermissions.CAN_PRINT_DEGRADED,
222225 Boolean.toString(ap.canPrintDegraded()));
223226
227 if (document.getDocumentCatalog().getLanguage() != null) {
228 metadata.set(TikaCoreProperties.LANGUAGE, document.getDocumentCatalog().getLanguage());
229 }
224230
225231 //now go for the XMP
226232 Document dom = loadDOM(document.getDocumentCatalog().getMetadata(), metadata, context);
699705 }
700706
701707 @Field
708 void setSetKCMS(boolean setKCMS) {
709 defaultConfig.setSetKCMS(setKCMS);
710 }
711
712 @Field
702713 void setInitializableProblemHander(String name) {
703714 if ("ignore".equals(name)) {
704715 setInitializableProblemHandler(InitializableProblemHandler.IGNORE);
763774 }
764775 StringBuilder sb = new StringBuilder();
765776 try {
766 Class.forName("com.levigo.jbig2.JBIG2ImageReader");
767 } catch (ClassNotFoundException e) {
768 sb.append("JBIG2ImageReader not loaded. jbig2 files will be ignored\n");
769 sb.append("See https://pdfbox.apache.org/2.0/dependencies.html#jai-image-io\n");
770 sb.append("for optional dependencies.\n");
771 }
772 try {
773777 Class.forName("com.github.jaiimageio.impl.plugins.tiff.TIFFImageWriter");
774778 } catch (ClassNotFoundException e) {
775779 sb.append("TIFFImageWriter not loaded. tiff files will not be processed\n");
135135 private boolean extractActions = false;
136136
137137 private long maxMainMemoryBytes = -1;
138
139 private boolean setKCMS = false;
138140
139141 public PDFParserConfig() {
140142 init(this.getClass().getResourceAsStream("PDFParser.properties"));
214216
215217 setExtractActions(getBooleanProp(props.getProperty("extractActions"), false));
216218
219 setSetKCMS(getBooleanProp(props.getProperty("setKCMS"), false));
217220
218221 boolean checkExtractAccessPermission = getBooleanProp(props.getProperty("checkExtractAccessPermission"), false);
219222 boolean allowExtractionForAccessibility = getBooleanProp(props.getProperty("allowExtractionForAccessibility"), true);
685688
686689 public void setMaxMainMemoryBytes(int maxMainMemoryBytes) {
687690 this.maxMainMemoryBytes = maxMainMemoryBytes;
691 }
692
693 /**
694 * <p>
695 * Whether to call <code>System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider")</code>.
696 * KCMS is the unmaintained, legacy provider and is far faster than the newer replacement.
697 * However, there are stability and security risks with using the unmaintained legacy provider.
698 * </p>
699 * <p>
700 * Note, of course, that this is <b>not</b> thread safe. If the value is <code>false</code>
701 * in your first thread, and the second thread changes this to <code>true</code>,
702 * the system property in the first thread will now be <code>true</code>.
703 * </p>
704 * <p>
705 * Default is <code>false</code>.
706 * </p>
707 * @param setKCMS whether or not to set KCMS
708 */
709 public void setSetKCMS(boolean setKCMS) {
710 this.setKCMS = setKCMS;
711 }
712
713 public boolean getSetKCMS() {
714 return setKCMS;
688715 }
689716
690717 private ImageType parseImageType(String ocrImageType) {
2020 import java.io.BufferedInputStream;
2121 import java.io.IOException;
2222 import java.io.InputStream;
23 import java.util.Collections;
24 import java.util.HashMap;
25 import java.util.HashSet;
26 import java.util.Map;
2327 import java.util.Set;
2428
2529 import org.apache.commons.compress.MemoryLimitException;
7478 private static final MediaType ZLIB = MediaType.application("zlib");
7579 private static final MediaType LZMA = MediaType.application("x-lzma");
7680 private static final MediaType LZ4_FRAMED = MediaType.application("x-lz4");
77
78 private static final Set<MediaType> SUPPORTED_TYPES =
79 MediaType.set(BZIP, BZIP2, GZIP, GZIP_ALT, LZ4_FRAMED, COMPRESS,
80 XZ, PACK, SNAPPY_FRAMED, ZLIB, LZMA);
81 private static final MediaType ZSTD = MediaType.application("zstd");
82 private static final MediaType DEFLATE64= MediaType.application("deflate64");
83
84 private static Set<MediaType> SUPPORTED_TYPES;
85 private static Map<String, String> MIMES_TO_NAME;
86
    //Build SUPPORTED_TYPES once at class-load time.  Brotli and zstd
    //support depend on optional third-party jars, so those two types are
    //only advertised when the decoder class is found on the classpath.
    static {
        Set<MediaType> TMP_SET = new HashSet<>();
        TMP_SET.addAll(
                MediaType.set(BZIP, BZIP2, DEFLATE64, GZIP, GZIP_ALT, LZ4_FRAMED, COMPRESS,
                        XZ, PACK, SNAPPY_FRAMED, ZLIB, LZMA));
        try {
            //probe for the optional brotli decoder (org.brotli:dec)
            Class.forName("org.brotli.dec.BrotliInputStream");
            TMP_SET.add(BROTLI);
        } catch (NoClassDefFoundError|ClassNotFoundException e) {
            //swallow -- brotli is not available, so do not advertise it
        } 
        try {
            //probe for the optional zstd decoder (com.github.luben:zstd-jni)
            Class.forName("com.github.luben.zstd.ZstdInputStream");
            TMP_SET.add(ZSTD);
        } catch (NoClassDefFoundError|ClassNotFoundException e) {
            //swallow -- zstd is not available, so do not advertise it
        }
        SUPPORTED_TYPES = Collections.unmodifiableSet(TMP_SET);
    }
106
    static {
        //map the mime type strings to the compressor stream names
        //used by CompressorStreamFactory, so that a content type that was
        //already detected can be handed straight to the factory instead of
        //running detection a second time (see getStreamName(Metadata))
        Map<String, String> tmpMimesToName = new HashMap<>();
        tmpMimesToName.put(BZIP2.toString(), CompressorStreamFactory.BZIP2);
        tmpMimesToName.put(GZIP.toString(), CompressorStreamFactory.GZIP);
        tmpMimesToName.put(LZ4_FRAMED.toString(), CompressorStreamFactory.LZ4_FRAMED);
        tmpMimesToName.put(LZ4_BLOCK.toString(), CompressorStreamFactory.LZ4_BLOCK);
        tmpMimesToName.put(XZ.toString(), CompressorStreamFactory.XZ);
        tmpMimesToName.put(PACK.toString(), CompressorStreamFactory.PACK200);
        tmpMimesToName.put(SNAPPY_FRAMED.toString(), CompressorStreamFactory.SNAPPY_FRAMED);
        tmpMimesToName.put(ZLIB.toString(), CompressorStreamFactory.DEFLATE);
        tmpMimesToName.put(COMPRESS.toString(), CompressorStreamFactory.Z);
        tmpMimesToName.put(LZMA.toString(), CompressorStreamFactory.LZMA);
        tmpMimesToName.put(BROTLI.toString(), CompressorStreamFactory.BROTLI);
        tmpMimesToName.put(ZSTD.toString(), CompressorStreamFactory.ZSTANDARD);
        MIMES_TO_NAME = Collections.unmodifiableMap(tmpMimesToName);
    }
124
81125
82126 private int memoryLimitInKb = 100000;//100MB
83127
140184 return SNAPPY_RAW;
141185 } else if (CompressorStreamFactory.LZMA.equals(name)) {
142186 return LZMA;
187 } else if (CompressorStreamFactory.ZSTANDARD.equals(name)) {
188 return ZSTD;
189 } else if (CompressorStreamFactory.DEFLATE64.equals(name)) {
190 return DEFLATE64;
143191 } else {
144192 return MediaType.OCTET_STREAM;
145193 }
174222 });
175223 CompressorStreamFactory factory =
176224 new CompressorStreamFactory(options.decompressConcatenated(metadata), memoryLimitInKb);
177 cis = factory.createCompressorInputStream(stream);
225 //if we've already identified it via autodetect
226 //trust that and go with the appropriate name
227 //to avoid calling CompressorStreamFactory.detect() twice
228 String name = getStreamName(metadata);
229 if (name != null) {
230 cis = factory.createCompressorInputStream(name, stream);
231 } else {
232 cis = factory.createCompressorInputStream(stream);
233 MediaType type = getMediaType(cis);
234 if (!type.equals(MediaType.OCTET_STREAM)) {
235 metadata.set(CONTENT_TYPE, type.toString());
236 }
237 }
178238 } catch (CompressorException e) {
179239 if (e.getCause() != null && e.getCause() instanceof MemoryLimitException) {
180240 throw new TikaMemoryLimitException(e.getMessage());
182242 throw new TikaException("Unable to uncompress document stream", e);
183243 }
184244
185 MediaType type = getMediaType(cis);
186 if (!type.equals(MediaType.OCTET_STREAM)) {
187 metadata.set(CONTENT_TYPE, type.toString());
188 }
189245
190246 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
191247 xhtml.startDocument();
208264 name = name.substring(0, name.length() - 5);
209265 } else if (name.endsWith(".pack")) {
210266 name = name.substring(0, name.length() - 5);
267 } else if (name.endsWith(".br")) {
268 name = name.substring(0, name.length() - 3);
211269 } else if (name.length() > 0) {
212270 name = GzipUtils.getUncompressedFilename(name);
213271 }
227285 xhtml.endDocument();
228286 }
229287
    /**
     * Looks up the CompressorStreamFactory stream name that corresponds to
     * the content type already recorded in the metadata, so that detection
     * does not have to run twice.
     *
     * @param metadata metadata that may already contain a Content-Type value
     * @return CompressorStream name based on the content-type value
     *         in metadata or <code>null</code> if not found
     */
    private String getStreamName(Metadata metadata) {
        String mimeString = metadata.get(Metadata.CONTENT_TYPE);
        if (mimeString == null) {
            return null;
        }
        return MIMES_TO_NAME.get(mimeString);
    }
301
230302 @Field
231303 public void setMemoryLimitInKb(int memoryLimitInKb) {
232304 this.memoryLimitInKb = memoryLimitInKb;
251251 }
252252
253253 SevenZFile sevenz;
254 if (password == null) {
255 sevenz = new SevenZFile(tstream.getFile());
256 } else {
257 sevenz = new SevenZFile(tstream.getFile(), password.getBytes("UnicodeLittleUnmarked"));
254 try{
255 if (password == null) {
256 sevenz = new SevenZFile(tstream.getFile());
257 } else {
258 sevenz = new SevenZFile(tstream.getFile(), password.getBytes("UnicodeLittleUnmarked"));
259 }
260 }catch(PasswordRequiredException e){
261 throw new EncryptedDocumentException(e);
258262 }
259263
260264 // Pending a fix for COMPRESS-269 / TIKA-1525, this bit is a little nasty
2020 import java.io.ByteArrayInputStream;
2121 import java.io.IOException;
2222 import java.io.InputStream;
23 import java.nio.charset.StandardCharsets;
2324 import java.util.Enumeration;
2425 import java.util.HashSet;
2526 import java.util.Iterator;
5556 * formats to figure out exactly what the file is.
5657 */
5758 public class ZipContainerDetector implements Detector {
59
60 //Regrettably, some tiff files can be incorrectly identified
61 //as tar files. We need this ugly workaround to rule out TIFF.
62 //If commons-compress ever chooses to take over TIFF detection
63 //we can remove all of this. See TIKA-2591.
64 private final static MediaType TIFF = MediaType.image("tiff");
65 private final static byte[][] TIFF_SIGNATURES = new byte[3][];
66 static {
67 TIFF_SIGNATURES[0] = new byte[]{'M','M',0x00,0x2a};
68 TIFF_SIGNATURES[1] = new byte[]{'I','I',0x2a, 0x00};
69 TIFF_SIGNATURES[2] = new byte[]{'M','M', 0x00, 0x2b};
70 }
71
5872 private static final Pattern MACRO_TEMPLATE_PATTERN = Pattern.compile("macroenabledtemplate$", Pattern.CASE_INSENSITIVE);
5973
6074 // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes
6377 // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes
6478 private static final String STRICT_CORE_DOCUMENT =
6579 "http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument";
80
81 private static final String XPS_DOCUMENT =
82 "http://schemas.microsoft.com/xps/2005/06/fixedrepresentation";
6683
6784 /** Serial version UID */
6885 private static final long serialVersionUID = 2891763938430295453L;
8299 int length = tis.peek(prefix);
83100
84101 MediaType type = detectArchiveFormat(prefix, length);
85 if (PackageParser.isZipArchive(type)
86 && TikaInputStream.isTikaInputStream(input)) {
102
103 if (type == TIFF) {
104 return TIFF;
105 } else if (PackageParser.isZipArchive(type)
106 && TikaInputStream.isTikaInputStream(input)) {
87107 return detectZipFormat(tis);
88108 } else if (!type.equals(MediaType.OCTET_STREAM)) {
89109 return type;
108128 }
109129 }
110130
131 private static boolean isTiff(byte[] prefix) {
132 for (byte[] sig : TIFF_SIGNATURES) {
133 if(arrayStartWith(sig, prefix)) {
134 return true;
135 }
136 }
137 return false;
138 }
139
140 private static boolean arrayStartWith(byte[] needle, byte[] haystack) {
141 if (haystack.length < needle.length) {
142 return false;
143 }
144 for (int i = 0; i < needle.length; i++) {
145 if (haystack[i] != needle[i]) {
146 return false;
147 }
148 }
149 return true;
150 }
151
111152 private static MediaType detectArchiveFormat(byte[] prefix, int length) {
153 if (isTiff(prefix)) {
154 return TIFF;
155 }
112156 try {
113157 String name = ArchiveStreamFactory.detect(new ByteArrayInputStream(prefix, 0, length));
114158 return PackageParser.getMediaType(name);
212256 return null;
213257 } catch (IOException e) {
214258 return null;
259 } catch (SecurityException e) {
260 //TIKA-2571
261 throw e;
215262 } catch (RuntimeException e) {
216263 return null;
217264 } catch (InvalidFormatException e) {
244291 PackagePart corePart = pkg.getPart(core.getRelationship(0));
245292 String coreType = corePart.getContentType();
246293
294 if (coreType.contains(".xps")) {
295 return MediaType.application("vnd.ms-package.xps");
296 }
247297 // Turn that into the type of the overall document
248298 String docType = coreType.substring(0, coreType.lastIndexOf('.'));
249299
262312 /**
263313 * Detects Open XML Paper Specification (XPS)
264314 */
265 private static MediaType detectXPSOPC(OPCPackage pkg) {
315 public static MediaType detectXPSOPC(OPCPackage pkg) {
266316 PackageRelationshipCollection xps =
267317 pkg.getRelationshipsByType("http://schemas.microsoft.com/xps/2005/06/fixedrepresentation");
268318 if (xps.size() == 1) {
4545 import org.apache.tika.parser.ParseContext;
4646 import org.apache.tika.parser.recognition.ObjectRecogniser;
4747 import org.apache.tika.parser.recognition.RecognisedObject;
48 import org.json.JSONArray;
49 import org.json.JSONObject;
48 import com.github.openjson.JSONArray;
49 import com.github.openjson.JSONObject;
5050 import org.slf4j.Logger;
5151 import org.slf4j.LoggerFactory;
5252 import org.xml.sax.ContentHandler;
4646 import org.apache.tika.mime.MimeTypeException;
4747 import org.apache.tika.parser.ParseContext;
4848 import org.apache.tika.parser.recognition.RecognisedObject;
49 import org.json.JSONArray;
50 import org.json.JSONObject;
49 import com.github.openjson.JSONArray;
50 import com.github.openjson.JSONObject;
5151 import org.slf4j.Logger;
5252 import org.slf4j.LoggerFactory;
5353 import org.xml.sax.ContentHandler;
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.parser.utils;
18
19 import org.apache.tika.mime.MediaType;
20
21 import java.io.ByteArrayInputStream;
22 import java.io.InputStream;
23 import java.util.Arrays;
24 import java.util.Objects;
25
26 public class DataURIScheme {
27
28
29 private final String rawMediaTypeString;
30 private final boolean isBase64;
31 private final byte[] data;
32
33 DataURIScheme(String mediaTypeString, boolean isBase64, byte[] data) {
34 this.rawMediaTypeString = mediaTypeString;
35 this.isBase64 = isBase64;
36 this.data = data;
37 }
38
39 public InputStream getInputStream() {
40 return new ByteArrayInputStream(data);
41 }
42
43 /**
44 *
45 * @return parsed media type or <code>null</code> if parse fails or if media type string was
46 * not specified
47 */
48 public MediaType getMediaType() {
49 if (rawMediaTypeString != null) {
50 return MediaType.parse(rawMediaTypeString);
51 }
52 return null;
53 }
54
55 public boolean isBase64() {
56 return isBase64;
57 }
58
59 @Override
60 public boolean equals(Object o) {
61 if (this == o) return true;
62 if (!(o instanceof DataURIScheme)) return false;
63 DataURIScheme that = (DataURIScheme) o;
64 return isBase64() == that.isBase64() &&
65 Objects.equals(rawMediaTypeString, that.rawMediaTypeString) &&
66 Arrays.equals(data, that.data);
67 }
68
69 @Override
70 public int hashCode() {
71
72 int result = Objects.hash(rawMediaTypeString, isBase64());
73 result = 31 * result + Arrays.hashCode(data);
74 return result;
75 }
76 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.parser.utils;
18
19 import org.apache.tika.exception.TikaException;
20
21 public class DataURISchemeParseException extends TikaException {
22
23 public DataURISchemeParseException(String msg) {
24 super(msg);
25 }
26
27 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.parser.utils;
18
19 import org.apache.commons.codec.binary.Base64;
20 import org.apache.tika.mime.MediaType;
21
22 import java.nio.charset.Charset;
23 import java.nio.charset.IllegalCharsetNameException;
24 import java.nio.charset.StandardCharsets;
25 import java.util.ArrayList;
26 import java.util.Collections;
27 import java.util.List;
28 import java.util.regex.Matcher;
29 import java.util.regex.Pattern;
30
31 /**
32 * Not thread safe. Create a separate util for each thread.
33 */
34 public class DataURISchemeUtil {
35
36 public static String UNSPECIFIED_MEDIA_TYPE = "text/plain;charset=US-ASCII";
37
38 private static Pattern PARSE_PATTERN = Pattern.compile("(?s)data:([^,]*?)(base64)?,(.*)$");
39 private static Pattern EXTRACT_PATTERN =
40 Pattern.compile("(?s)data:([^,]*?)(base64)?,([^\"\']*)[\"\']");
41 private final Matcher parseMatcher = PARSE_PATTERN.matcher("");
42 private final Matcher extractMatcher = EXTRACT_PATTERN.matcher("");
43 Base64 base64 = new Base64();
44
45 public DataURIScheme parse(String string) throws DataURISchemeParseException {
46 parseMatcher.reset(string);
47 if (parseMatcher.find()) {
48 return build(parseMatcher.group(1), parseMatcher.group(2), parseMatcher.group(3));
49 }
50 throw new DataURISchemeParseException("Couldn't find expected pattern");
51 }
52
53 private DataURIScheme build(String mediaTypeString, String isBase64, String dataString) {
54 byte[] data = null;
55 //strip out back slashes as you might have in css
56 dataString = (dataString != null) ?
57 dataString.replaceAll("\\\\", " ") : dataString;
58
59 if (dataString == null || dataString.length() == 0) {
60 data = new byte[0];
61 } else if (isBase64 != null) {
62 data = base64.decode(dataString);
63 } else {
64 //TODO: handle encodings
65 MediaType mediaType = MediaType.parse(mediaTypeString);
66 Charset charset = StandardCharsets.UTF_8;
67 if (mediaType.hasParameters()) {
68 String charsetName = mediaType.getParameters().get("charset");
69 if (charsetName != null && Charset.isSupported(charsetName)) {
70 try {
71 charset = Charset.forName(charsetName);
72 } catch (IllegalCharsetNameException e) {
73 //swallow and default to UTF-8
74 }
75 }
76 }
77 data = dataString.getBytes(charset);
78 }
79 return new DataURIScheme(mediaTypeString, (isBase64 != null), data);
80 }
81
82 /**
83 * Extracts DataURISchemes from free text, as in javascript.
84 *
85 * @param string
86 * @return list of extracted DataURISchemes
87 */
88 public List<DataURIScheme> extract(String string) {
89 extractMatcher.reset(string);
90 List<DataURIScheme> list = null;
91 while (extractMatcher.find()) {
92 DataURIScheme dataURIScheme = build(extractMatcher.group(1),
93 extractMatcher.group(2), extractMatcher.group(3));
94 if (list == null) {
95 list = new ArrayList<>();
96 }
97 list.add(dataURIScheme);
98 }
99 return (list == null) ? Collections.EMPTY_LIST : list;
100 }
101
102 }
0 # Licensed to the Apache Software Foundation (ASF) under one or more
1 # contributor license agreements. See the NOTICE file distributed with
2 # this work for additional information regarding copyright ownership.
3 # The ASF licenses this file to You under the Apache License, Version 2.0
4 # (the "License"); you may not use this file except in compliance with
5 # the License. You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 646
15 737
16 775
17 813
18 819
19 858
20 874
21 8859_1
22 8859_13
23 8859_15
24 8859_2
25 8859_4
26 8859_5
27 8859_7
28 8859_9
29 912
30 914
31 915
32 920
33 923
34 ansi-1251
35 ascii
36 ascii7
37 cesu8
38 cp1250
39 cp1251
40 cp1252
41 cp1253
42 cp1254
43 cp1257
44 cp5346
45 cp5347
46 cp5348
47 cp5349
48 cp5350
49 cp5353
50 cp737
51 cp813
52 cp858
53 cp874
54 cp912
55 cp914
56 cp915
57 cp920
58 cp923
59 csibm862
60 csisolatin0
61 csisolatin9
62 cspcp855
63 default
64 ibm-437
65 ibm-737
66 ibm-775
67 ibm-813
68 ibm-819
69 ibm-850
70 ibm-852
71 ibm-855
72 ibm-857
73 ibm-862
74 ibm-866
75 ibm-874
76 ibm-912
77 ibm-914
78 ibm-915
79 ibm-920
80 ibm-923
81 ibm737
82 ibm813
83 ibm874
84 ibm912
85 ibm914
86 ibm915
87 ibm920
88 ibm923
89 iso8859-1
90 iso8859-13
91 iso8859-15
92 iso8859-2
93 iso8859-4
94 iso8859-5
95 iso8859-7
96 iso8859-9
97 iso8859_1
98 iso8859_13
99 iso8859_15
100 iso8859_15_fdis
101 iso8859_2
102 iso8859_4
103 iso8859_5
104 iso8859_7
105 iso8859_9
106 iso_8859-13
107 iso_8859_1
108 koi8
109 koi8_r
110 koi8_u
111 l9
112 latin0
113 latin9
114 sun_eu_greek
115 unicode
116 unicode-1-1-utf-8
117 unicodebig
118 unicodebigunmarked
119 unicodelittle
120 unicodelittleunmarked
121 utf-32be-bom
122 utf-32le-bom
123 utf16
124 utf32
125 utf8
126 utf_16
127 utf_16be
128 utf_16le
129 utf_32
130 utf_32be
131 utf_32be_bom
132 utf_32le
133 utf_32le_bom
134 windows-437
135 x-utf-16be
136 x-utf-16le
137 x-utf-32be
138 x-utf-32le
3636 ocrImageScale 2.0
3737 # Use up to 500MB when loading a pdf into a PDDocument
3838 maxMainMemoryBytes 524288000
39 #whether or not to set KCMS for faster (but legacy/unsupported) image rendering
40 setKCMS false
378378 // For spanned zip files, the .zip file doesn't have the header, it's the other parts
379379 assertTypeByData("application/octet-stream", "test-documents-spanned.zip");
380380 assertTypeByData("application/zip", "test-documents-spanned.z01");
381
382 assertTypeDetection("testZSTD.zstd", "application/zstd");
381383 }
382384
383385 @Test
896898 // MBOX
897899 assertTypeDetection("headers.mbox", "application/mbox");
898900
899 // Thunderbird - doesn't currently work by name
900 assertTypeByNameAndData("message/rfc822", "testThunderbirdEml.eml");
901 }
902
901 // Thunderbird
902 assertTypeDetection("testThunderbirdEml.eml", "message/rfc822");
903
904 //dkim header
905 assertTypeDetection("testThunderbirdEml.eml", "message/rfc822");
906
907 //x- custom header
908 assertTypeDetection("testRFC822_x-.eml", "message/rfc822");
909
910 //embedded xhtml and img
911 assertTypeDetection("testEML_embedded_xhtml_and_img.eml", "message/rfc822");
912
913 }
914
915 @Test
916 public void testMessageNews() throws Exception {
917 assertTypeByData("message/news", "testMessageNews.txt");
918 }
903919 @Test
904920 public void testAxCrypt() throws Exception {
905921 // test-TXT.txt encrypted with a key of "tika"
2828 import javax.xml.transform.sax.TransformerHandler;
2929 import javax.xml.transform.stream.StreamResult;
3030 import java.io.ByteArrayInputStream;
31 import java.io.ByteArrayOutputStream;
3132 import java.io.File;
3233 import java.io.IOException;
3334 import java.io.InputStream;
3839 import java.nio.file.Path;
3940 import java.nio.file.Paths;
4041 import java.util.ArrayList;
42 import java.util.Arrays;
4143 import java.util.HashMap;
4244 import java.util.List;
4345 import java.util.Map;
5153 import java.util.concurrent.Future;
5254 import java.util.regex.Pattern;
5355
56 import org.apache.commons.codec.binary.Base64;
5457 import org.apache.tika.Tika;
5558 import org.apache.tika.TikaTest;
5659 import org.apache.tika.config.ServiceLoader;
5861 import org.apache.tika.detect.AutoDetectReader;
5962 import org.apache.tika.detect.EncodingDetector;
6063 import org.apache.tika.exception.TikaException;
64 import org.apache.tika.io.IOUtils;
6165 import org.apache.tika.io.TikaInputStream;
6266 import org.apache.tika.metadata.Geographic;
6367 import org.apache.tika.metadata.Metadata;
12641268 }
12651269
    @Test
    public void testDataURI() throws Exception {
        //TIKA-2563: an image embedded in HTML via the data: URI scheme
        //should be extracted as its own attachment
        List<Metadata> metadataList = getRecursiveMetadata("testHTML_embedded_img.html");
        assertEquals(2, metadataList.size());
        String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
        assertContains("some content", content);
        //make sure that you've truncated the data: value
        assertContains("src=\"data:\"", content);
        Metadata imgMetadata = metadataList.get(1);
        assertEquals("image/jpeg", imgMetadata.get(Metadata.CONTENT_TYPE));
        assertContains("moscow-birds",
                Arrays.asList(imgMetadata.getValues(Metadata.SUBJECT)));
    }
1283
1284 @Test
1285 public void testDataURIInJS() throws Exception {
1286 InputStream is = getClass().getResourceAsStream("/org/apache/tika/parser/html/tika-config.xml");
1287 assertNotNull(is);
1288 TikaConfig tikaConfig = new TikaConfig(is);
1289 Parser p = new AutoDetectParser(tikaConfig);
1290 List<Metadata> metadataList = getRecursiveMetadata("testHTML_embedded_img_in_js.html", p);
1291 assertEquals(3, metadataList.size());
1292 String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
1293 assertContains("some content", content);
1294 Metadata imgMetadata = metadataList.get(1);
1295 assertEquals("image/jpeg", imgMetadata.get(Metadata.CONTENT_TYPE));
1296 assertContains("moscow-birds",
1297 Arrays.asList(imgMetadata.getValues(Metadata.SUBJECT)));
1298 }
1299
1300 @Test
12671301 public void testMultiThreadingEncodingDetection() throws Exception {
12681302 List<EncodingDetector> detectors = new ArrayList<>();
12691303 ServiceLoader loader =
13501384 }
13511385 }
13521386 }
1387
1388 @Test
1389 public void testCharsetsNotSupportedByIANA() throws Exception {
1390 assertContains("This is a sample text",
1391 getXML("testHTML_charset_utf8.html").xml);
1392
1393 assertContains("This is a sample text",
1394 getXML("testHTML_charset_utf16le.html").xml);
1395
1396 }
13531397 }
259259 metadata.get(Metadata.SUBJECT));
260260 }
261261
262 @Test
263 public void testMainBody() throws Exception {
264 //test that the first text or html chunk is processed in the main body
265 //not treated as an attachment. TIKA-2547
266 List<Metadata> metadataList = getRecursiveMetadata("testRFC822_oddfrom");
267 assertEquals(7, metadataList.size());
268 assertContains("Air Quality Planning", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
269
270 //Make sure text alternative doesn't get treated as an attachment
271 metadataList = getRecursiveMetadata("testRFC822_normal_zip");
272 assertEquals(3, metadataList.size());
273 assertContains("This is the HTML part", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
274 assertEquals("application/zip", metadataList.get(2).get(Metadata.CONTENT_TYPE));
275
276 metadataList = getRecursiveMetadata("testRFC822-txt-body");
277 assertEquals(2, metadataList.size());
278 assertContains("body 1", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
279 }
280
262281 /**
263282 * Test for TIKA-640, increase header max beyond 10k bytes
264283 */
670689 assertEquals("application/pdf", metadataList.get(1).get(Metadata.CONTENT_TYPE));
671690 assertEquals("/tzora-titan-4-hummer-xl-manual.pdf", metadataList.get(1).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
672691 }
692
693 @Test
694 public void testSimpleBodyInlined() throws Exception {
695 List<Metadata> metadataList = getRecursiveMetadata("testRFC822_simple_inline_body.txt");
696 assertEquals(1, metadataList.size());
697 assertContains("asked", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
698 }
673699 }
1919 import static org.junit.Assert.assertTrue;
2020 import static org.junit.Assert.fail;
2121
22 import java.io.File;
2223 import java.io.InputStream;
2324 import java.text.DecimalFormatSymbols;
2425 import java.util.List;
543544 getXML("testEXCEL_phonetic.xls", parser).xml);
544545
545546 }
547
548 @Test
549 public void testLabelsAreExtracted() throws Exception {
550 String xml = getXML("testEXCEL_labels-govdocs-515858.xls").xml;
551 assertContains("Morocco", xml);
552 }
546553 }
3030 import org.apache.tika.parser.ParseContext;
3131 import org.apache.tika.parser.RecursiveParserWrapper;
3232 import org.apache.tika.sax.BodyContentHandler;
33 import org.junit.Ignore;
3334 import org.junit.Test;
3435 import org.xml.sax.ContentHandler;
3536
6364 assertContains("<p>[1] This is a footnote.", xml);
6465 assertContains("<p>This is the header text.</p>", xml);
6566 assertContains("<p>This is the footer text.</p>", xml);
66 assertContains("<p>Here is a text box</p>", xml);
67 assertContainsCount("<p>Here is a text box</p>", xml, 1);
6768 assertContains("<p>Bold ", xml);
6869 assertContains("italic underline superscript subscript", xml);
6970 assertContains("underline", xml);
292293 public void testEncrypted() throws Exception {
293294 getXML("testPPT_protected_passtika.ppt");
294295 }
296
297 @Test
298 public void testGroups() throws Exception {
299 List<Metadata> metadataList = getRecursiveMetadata("testPPT_groups.ppt");
300 assertEquals(3, metadataList.size());
301 String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
302 //this tests that we're ignoring text shapes at depth=0
303 //i.e. POI has already included them in the slide's getTextParagraphs()
304 assertContainsCount("Text box1", content, 1);
305
306
307 //the WordArt and text box count tests will fail
308 //if this content is available via getTextParagraphs() of the slide in POI
309 //i.e. when POI is fixed, these tests will fail, and
310 //we'll have to remove the workaround in HSLFExtractor's extractGroupText(...)
311 assertContainsCount("WordArt1", content, 1);
312 assertContainsCount("WordArt2", content, 1);
313 assertContainsCount("Ungrouped text box", content, 1);//should only be 1
314 assertContains("Text box2", content);
315 assertContains("Text box3", content);
316 assertContains("Text box4", content);
317 assertContains("Text box5", content);
318
319 //see below -- need to extract hyperlinks
320 assertContains("tika", content);
321 assertContains("MyTitle", content);
322
323 assertEquals("/embedded-1",
324 metadataList.get(1).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
325
326 assertEquals("/embedded-2",
327 metadataList.get(2).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
328
329 }
330
331 @Ignore("until we add smart text extraction")
332 @Test
333 public void testSmartArtText() throws Exception {
334 String content = getXML("testPPT_groups.ppt").xml;
335 assertContains("smart1", content);
336 }
337
338 @Ignore("until we fix hyperlink extraction from text boxes")
339 @Test
340 public void testHyperlinksInTextBoxes() throws Exception {
341 String content = getXML("testPPT_groups.ppt").xml;
342 assertContains("href=\"http://tika.apache.org", content);
343 }
344
345 @Test
346 public void testEmbeddedXLSInOLEObject() throws Exception {
347 List<Metadata> metadataList = getRecursiveMetadata("testPPT_oleWorkbook.ppt");
348 assertEquals(3, metadataList.size());
349 Metadata xlsx = metadataList.get(1);
350 assertContains("<h1>Sheet1</h1>", xlsx.get(RecursiveParserWrapper.TIKA_CONTENT));
351 assertContains("<td>1</td>", xlsx.get(RecursiveParserWrapper.TIKA_CONTENT));
352 assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
353 xlsx.get(Metadata.CONTENT_TYPE));
354
355 }
295356 }
17401740 }
17411741
17421742 @Test
1743 public void testPPTXGroups() throws Exception {
1744 List<Metadata> metadataList = getRecursiveMetadata("testPPT_groups.pptx");
1745 assertEquals(3, metadataList.size());
1746 String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
1747 assertContains("WordArt1", content);
1748 assertContains("WordArt2", content);
1749 assertContainsCount("Ungrouped text box", content, 1);//should only be 1
1750 assertContains("Text box1", content);
1751 assertContains("Text box2", content);
1752 assertContains("Text box3", content);
1753 assertContains("Text box4", content);
1754 assertContains("Text box5", content);
1755
1756
1757 assertContains("href=\"http://tika.apache.org", content);
1758 assertContains("smart1", content);
1759 assertContains("MyTitle", content);
1760
1761 assertEquals("/image1.jpg",
1762 metadataList.get(1).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
1763
1764 assertEquals("/thumbnail.jpeg",
1765 metadataList.get(2).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
1766 }
1767
1768 @Test
17431769 public void testXLSXPhoneticStrings() throws Exception {
17441770 //This unit test and test file come from Apache POI 51519.xlsx
17451771
17891815 assertEquals("image/png", metadataList.get(2).get(Metadata.CONTENT_TYPE));
17901816 assertEquals("image/jpeg", metadataList.get(3).get(Metadata.CONTENT_TYPE));
17911817 }
1818
1819 @Test
1820 public void testEmbeddedXLSInOLEObject() throws Exception {
1821 List<Metadata> metadataList = getRecursiveMetadata("testPPT_oleWorkbook.pptx");
1822 assertEquals(4, metadataList.size());
1823 Metadata xlsx = metadataList.get(2);
1824 assertContains("<h1>Sheet1</h1>", xlsx.get(RecursiveParserWrapper.TIKA_CONTENT));
1825 assertContains("<td>1</td>", xlsx.get(RecursiveParserWrapper.TIKA_CONTENT));
1826 assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
1827 xlsx.get(Metadata.CONTENT_TYPE));
1828 }
17921829 }
17931830
17941831
590590 assertEquals("image/jpeg", metadataList.get(3).get(Metadata.CONTENT_TYPE));
591591
592592 }
593
594 @Test
595 public void testPPTXGroups() throws Exception {
596 List<Metadata> metadataList = getRecursiveMetadata("testPPT_groups.pptx", parseContext);
597 assertEquals(3, metadataList.size());
598 String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
599 assertContains("WordArt1", content);
600 assertContains("WordArt2", content);
601 assertContainsCount("Ungrouped text box", content, 1);//should only be 1
602 assertContains("Text box1", content);
603 assertContains("Text box2", content);
604 assertContains("Text box3", content);
605 assertContains("Text box4", content);
606 assertContains("Text box5", content);
607
608
609 assertContains("href=\"http://tika.apache.org", content);
610 assertContains("smart1", content);
611 assertContains("MyTitle", content);
612
613 assertEquals("/image1.jpg",
614 metadataList.get(1).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
615
616 assertEquals("/thumbnail.jpeg",
617 metadataList.get(2).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
618 }
619
593620 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft.ooxml.xps;
17
18 import org.apache.tika.TikaTest;
19 import org.apache.tika.metadata.Metadata;
20 import org.apache.tika.metadata.TikaCoreProperties;
21 import org.apache.tika.parser.RecursiveParserWrapper;
22 import org.junit.Test;
23
24 import java.util.List;
25
26 import static org.junit.Assert.assertEquals;
27
28 public class XPSParserTest extends TikaTest {
29
30 @Test
31 public void testBasic() throws Exception {
32 List<Metadata> metadataList = getRecursiveMetadata("testPPT.xps");
33 assertEquals(2, metadataList.size());
34
35 //metadata
36 assertEquals("Rajiv", metadataList.get(0).get(TikaCoreProperties.CREATOR));
37 assertEquals("2010-06-29T12:06:31Z", metadataList.get(0).get(TikaCoreProperties.CREATED));
38 assertEquals("2010-06-29T12:06:31Z", metadataList.get(0).get(TikaCoreProperties.MODIFIED));
39 assertEquals("Attachment Test", metadataList.get(0).get(TikaCoreProperties.TITLE));
40
41 String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
42 assertContains("<p>Attachment Test</p>", content);
43 assertContains("<div class=\"canvas\"><p>Different", content);
44
45 //I'd want this to be "tika content", but copy+paste in Windows yields tikacontent
46 assertContains("tikacontent", content);
47
48
49 assertEquals("image/jpeg", metadataList.get(1).get(Metadata.CONTENT_TYPE));
50 }
51
52 @Test
53 public void testVarious() throws Exception {
54 List<Metadata> metadataList = getRecursiveMetadata("testXPS_various.xps");
55 //confirm embedded images and thumbnails were extracted
56 assertEquals(4, metadataList.size());
57
58 //now check for content in the right order
59 String quickBrownFox = "\u0644\u062B\u0639\u0644\u0628\u0020" +
60 "\u0627\u0644\u0628\u0646\u064A\u0020" +
61 "\u0627\u0644\u0633\u0631\u064A\u0639";
62
63 String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
64 assertContains(quickBrownFox, content);
65
66 assertContains("The \u0627\u0644\u0628\u0646\u064A fox", content);
67
68 assertContains("\u0644\u062B\u0639\u0644\u0628 brown \u0627\u0644\u0633\u0631\u064A\u0639",
69 content);
70
71 //make sure the urls come through
72 assertContains("<a href=\"http://tika.apache.org/\">http://tika.apache.org/</a>",
73 content);
74
75 Metadata metadata = metadataList.get(0);
76 assertEquals("Allison, Timothy B.", metadata.get(TikaCoreProperties.CREATOR));
77 assertEquals("2017-12-12T11:15:38Z", metadata.get(TikaCoreProperties.CREATED));
78 assertEquals("2017-12-12T11:15:38Z", metadata.get(TikaCoreProperties.MODIFIED));
79
80
81 assertEquals("image/png", metadataList.get(1).get(Metadata.CONTENT_TYPE));
82
83 Metadata inlineJpeg = metadataList.get(2);
84 assertEquals("image/jpeg", inlineJpeg.get(Metadata.CONTENT_TYPE));
85 assertContains("INetCache", inlineJpeg.get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
86 assertEquals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString(),
87 inlineJpeg.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
88
89 assertEquals("image/jpeg", metadataList.get(3).get(Metadata.CONTENT_TYPE));
90 // assertEquals(TikaCoreProperties.EmbeddedResourceType.THUMBNAIL.toString(),
91 // inlineJpeg.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
92
93
94 }
95
96 }
170170 config.setResize(1000);
171171 }
172172
173 @Test(expected=IllegalArgumentException.class)
174 public void testDataPathCheck() {
175 TesseractOCRConfig config = new TesseractOCRConfig();
176 config.setTessdataPath("blah\u0000deblah");
177 }
178
179 @Test(expected=IllegalArgumentException.class)
180 public void testPathCheck() {
181 TesseractOCRConfig config = new TesseractOCRConfig();
182 config.setTesseractPath("blah\u0000deblah");
183 }
184
185 @Test(expected=IllegalArgumentException.class)
186 public void testBadOtherKey() {
187 TesseractOCRConfig config = new TesseractOCRConfig();
188 config.addOtherTesseractConfig("bad bad", "bad");
189
190 }
191
192 @Test(expected=IllegalArgumentException.class)
193 public void testBadOtherValue() {
194 TesseractOCRConfig config = new TesseractOCRConfig();
195 config.addOtherTesseractConfig("bad", "bad bad");
196 }
197
198 @Test(expected=IllegalArgumentException.class)
199 public void testBadOtherValueSlash() {
200 TesseractOCRConfig config = new TesseractOCRConfig();
201 config.addOtherTesseractConfig("bad", "bad\\bad");
202 }
203
204 @Test(expected=IllegalArgumentException.class)
205 public void testBadOtherValueControl() {
206 TesseractOCRConfig config = new TesseractOCRConfig();
207 config.addOtherTesseractConfig("bad", "bad\u0001bad");
208 }
209
210 @Test
211 public void testGoodOtherParameters() {
212 TesseractOCRConfig config = new TesseractOCRConfig();
213 config.addOtherTesseractConfig("good", "good");
214 }
215
216 @Test
217 public void testBogusPathCheck() {
218 //allow path that doesn't actually exist
219 TesseractOCRConfig config = new TesseractOCRConfig();
220 config.setTesseractPath("blahdeblahblah");
221 assertEquals("blahdeblahblah"+File.separator, config.getTesseractPath());
222 }
223
224 @Test
225 public void testTrailingSlashInPathBehavior() {
226
227 TesseractOCRConfig config = new TesseractOCRConfig();
228 config.setTesseractPath("blah");
229 assertEquals("blah"+File.separator, config.getTesseractPath());
230 config.setTesseractPath("blah"+File.separator);
231 assertEquals("blah"+File.separator, config.getTesseractPath());
232 config.setTesseractPath("");
233 assertEquals("", config.getTesseractPath());
234
235 config.setTessdataPath("blahdata");
236 assertEquals("blahdata"+File.separator, config.getTessdataPath());
237 config.setTessdataPath("blahdata"+File.separator);
238 assertEquals("blahdata"+File.separator, config.getTessdataPath());
239 config.setTessdataPath("");
240 assertEquals("", config.getTessdataPath());
241
242 config.setImageMagickPath("imagemagickpath");
243 assertEquals("imagemagickpath"+File.separator, config.getImageMagickPath());
244 config.setImageMagickPath("imagemagickpath"+File.separator);
245 assertEquals("imagemagickpath"+File.separator, config.getImageMagickPath());
246 config.setImageMagickPath("");
247 assertEquals("", config.getImageMagickPath());
248 }
249
250 @Test(expected=IllegalArgumentException.class)
251 public void testBadColorSpace() {
252 TesseractOCRConfig config = new TesseractOCRConfig();
253 config.setColorspace("someth!ng");
254 }
173255 }
13691369 assertFalse(path + " should have thrown exception", noEx);
13701370 }
13711371
1372 @Test
1373 public void testLanguageMetadata() throws Exception {
1374 assertEquals("de-CH", getXML("testPDF-custommetadata.pdf")
1375 .metadata.get(TikaCoreProperties.LANGUAGE));
1376 assertEquals("zh-CN", getXML("testPDFFileEmbInAnnotation.pdf")
1377 .metadata.get(TikaCoreProperties.LANGUAGE));
1378 }
1379
13721380 /**
13731381 * Simple class to count end of document events. If functionality is useful,
13741382 * move to org.apache.tika in src/test
2020 import static org.junit.Assert.assertEquals;
2121 import static org.junit.Assert.fail;
2222
23 import java.io.BufferedWriter;
24 import java.io.OutputStreamWriter;
25 import java.io.Writer;
26 import java.nio.charset.StandardCharsets;
27 import java.nio.file.Files;
28 import java.nio.file.Path;
29 import java.nio.file.Paths;
30 import java.nio.file.StandardOpenOption;
2331 import java.util.HashSet;
32 import java.util.List;
2433 import java.util.Set;
2534
2635 import org.apache.commons.compress.compressors.CompressorStreamFactory;
2736 import org.apache.tika.TikaTest;
2837 import org.apache.tika.metadata.Metadata;
38 import org.apache.tika.metadata.TikaCoreProperties;
2939 import org.apache.tika.mime.MediaType;
3040 import org.apache.tika.parser.ParseContext;
41 import org.apache.tika.parser.RecursiveParserWrapper;
3142 import org.junit.BeforeClass;
3243 import org.junit.Test;
3344
3849
3950 @BeforeClass
4051 public static void setUp() {
41 NOT_COVERED.add(MediaType.application("x-brotli"));
4252 NOT_COVERED.add(MediaType.application("x-lz4-block"));
4353 NOT_COVERED.add(MediaType.application("x-snappy-raw"));
54 NOT_COVERED.add(MediaType.application("deflate64"));
4455 }
4556
4657 @Test
5768 //xml parser throws an exception for test1.xml
5869 //for now, be content that the container file is correctly identified
5970 assertContains("test1.xml", r.xml);
71 }
72
73 @Test
74 public void testZstd() throws Exception {
75 XMLResult r = getXML("testZSTD.zstd");
76 assertContains("0123456789", r.xml);
77 }
78
79 @Test
80 public void testBrotli() throws Exception {
81 Metadata metadata = new Metadata();
82 metadata.set(Metadata.RESOURCE_NAME_KEY, "testBROTLI_compressed.br");
83 List<Metadata> metadataList = getRecursiveMetadata("testBROTLI_compressed.br", metadata);
84
85 assertContains("XXXXXXXXXXYYYYYYYYYY", metadataList.get(1).get(RecursiveParserWrapper.TIKA_CONTENT));
86 assertEquals("testBROTLI_compressed", metadataList.get(1).get(Metadata.RESOURCE_NAME_KEY));
6087 }
6188
6289 @Test
140140 }
141141
142142 assertTrue("test no password", ex);
143
144 // No password, will fail with EncryptedDocumentException
145 ex = false;
146 try (InputStream stream = Seven7ParserTest.class.getResourceAsStream(
147 "/test-documents/full_encrypted.7z")) {
148 parser.parse(stream, handler, metadata, recursingContext);
149 fail("Shouldn't be able to read a full password protected 7z without the password");
150 } catch (EncryptedDocumentException e) {
151 // Good
152 ex = true;
153 } catch (Exception e){
154 ex = false;
155 }
156
157 assertTrue("test no password for full encrypted 7z", ex);
143158
144159 ex = false;
145160
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.parser.pkg;
18
19
20 import org.apache.commons.compress.compressors.CompressorStreamFactory;
21 import org.apache.tika.TikaTest;
22 import org.apache.tika.io.TikaInputStream;
23 import org.apache.tika.metadata.Metadata;
24 import org.apache.tika.mime.MediaType;
25 import org.apache.tika.parser.ParseContext;
26 import org.junit.BeforeClass;
27 import org.junit.Test;
28
29 import java.io.InputStream;
30 import java.util.HashSet;
31 import java.util.Set;
32
33 import static org.junit.Assert.assertEquals;
34 import static org.junit.Assert.fail;
35
36 public class ZipContainerDetectorTest extends TikaTest {
37
38 @Test
39 public void testTiffWorkaround() throws Exception {
40 //TIKA-2591
41 ZipContainerDetector zipContainerDetector = new ZipContainerDetector();
42 Metadata metadata = new Metadata();
43 try (InputStream is = TikaInputStream.get(getResourceAsStream("/test-documents/testTIFF.tif"))) {
44 MediaType mt = zipContainerDetector.detect(is, metadata);
45 assertEquals(MediaType.image("tiff"), mt);
46 }
47 metadata = new Metadata();
48 try (InputStream is = TikaInputStream.get(getResourceAsStream("/test-documents/testTIFF_multipage.tif"))) {
49 MediaType mt = zipContainerDetector.detect(is, metadata);
50 assertEquals(MediaType.image("tiff"), mt);
51 }
52
53 }
54 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.parser.utils;
18
19 import org.apache.tika.TikaTest;
20 import org.apache.tika.io.IOUtils;
21 import org.apache.tika.mime.MediaType;
22 import org.junit.Test;
23
24 import java.io.ByteArrayOutputStream;
25 import java.io.InputStream;
26 import java.nio.charset.Charset;
27 import java.nio.charset.StandardCharsets;
28
29 import static org.junit.Assert.assertEquals;
30 import static org.junit.Assert.assertFalse;
31 import static org.junit.Assert.assertNull;
32 import static org.junit.Assert.assertTrue;
33
34 public class DataURISchemeParserTest extends TikaTest {
35 DataURISchemeUtil dataURISchemeUtil = new DataURISchemeUtil();
36
37 @Test
38 public void testEmpty() throws Exception {
39 DataURIScheme dataURIScheme = dataURISchemeUtil.parse("data:,");
40 assertFalse(dataURIScheme.isBase64());
41 assertNull(dataURIScheme.getMediaType());
42 assertEquals(-1, dataURIScheme.getInputStream().read());
43 }
44
45 @Test
46 public void testNewlines() throws Exception {
47 String data = "data:image/png;base64,R0lG\nODdh";
48 DataURIScheme dataURIScheme = dataURISchemeUtil.parse(data);
49 assertTrue(dataURIScheme.isBase64());
50 assertEquals(MediaType.image("png"), dataURIScheme.getMediaType());
51
52 String expected = "data:image/png;base64,R0lGODdh";
53 assertEquals(dataURISchemeUtil.parse(expected), dataURISchemeUtil.parse(data));
54
55 }
56
57 @Test
58 public void testBackslashNewlines() throws Exception {
59 //like you'd have in a css fragment
60 String data = "data:image/png;base64,R0lG\\\nODdh";
61 DataURIScheme dataURIScheme = dataURISchemeUtil.parse(data);
62 assertTrue(dataURIScheme.isBase64());
63 assertEquals(MediaType.image("png"), dataURIScheme.getMediaType());
64
65 String expected = "data:image/png;base64,R0lGODdh";
66 assertEquals(dataURISchemeUtil.parse(expected), dataURISchemeUtil.parse(data));
67 }
68
69 @Test
70 public void testUTF8() throws Exception {
71 String utf8 = "\u0628\u0631\u0646\u0633\u062A\u0648\u0646";
72 String data = "data:text/plain;charset=UTF-8;page=21,the%20data:"+utf8;
73 DataURIScheme dataURIScheme = dataURISchemeUtil.parse(data);
74 ByteArrayOutputStream bos = new ByteArrayOutputStream();
75 IOUtils.copy(dataURIScheme.getInputStream(), bos);
76 assertContains(utf8, new String(bos.toByteArray(), StandardCharsets.UTF_8));
77 }
78 }
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.17</version>
27 <version>1.18</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030
5252 <dependency>
5353 <groupId>com.google.code.gson</groupId>
5454 <artifactId>gson</artifactId>
55 <version>2.8.1</version>
55 <version>${gson.version}</version>
5656 </dependency>
5757
5858 <!-- Test dependencies -->
1515 FROM ubuntu:latest
1616 MAINTAINER Apache Tika Team
1717
18 ENV TIKA_VERSION 1.7
19 ENV TIKA_SERVER_URL https://www.apache.org/dist/tika/tika-server-$TIKA_VERSION.jar
20
2118 RUN apt-get update \
22 && apt-get install openjdk-7-jre-headless curl gdal-bin tesseract-ocr \
23 tesseract-ocr-eng tesseract-ocr-ita tesseract-ocr-fra tesseract-ocr-spa tesseract-ocr-deu -y \
24 && curl -sSL https://people.apache.org/keys/group/tika.asc -o /tmp/tika.asc \
25 && gpg --import /tmp/tika.asc \
26 && curl -sSL "$TIKA_SERVER_URL.asc" -o /tmp/tika-server-${TIKA_VERSION}.jar.asc \
27 && NEAREST_TIKA_SERVER_URL=$(curl -sSL http://www.apache.org/dyn/closer.cgi/${TIKA_SERVER_URL#https://www.apache.org/dist/}\?asjson\=1 \
28 | awk '/"path_info": / { pi=$2; }; /"preferred":/ { pref=$2; }; END { print pref " " pi; };' \
29 | sed -r -e 's/^"//; s/",$//; s/" "//') \
30 && echo "Nearest mirror: $NEAREST_TIKA_SERVER_URL" \
31 && curl -sSL "$NEAREST_TIKA_SERVER_URL" -o /tika-server-${TIKA_VERSION}.jar \
32 && gpg --verify /tmp/tika-server-${TIKA_VERSION}.jar.asc /tika-server-${TIKA_VERSION}.jar \
19 && apt-get install openjdk-8-jre-headless curl gdal-bin tesseract-ocr \
20 tesseract-ocr-eng tesseract-ocr-ita tesseract-ocr-fra tesseract-ocr-spa tesseract-ocr-deu -y \
3321 && apt-get clean -y && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
3422
23 ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64
24 RUN export JAVA_HOME
25
26 ARG JAR_FILE
27 ADD target/${JAR_FILE} /tika-server.jar
28
3529 EXPOSE 9998
36 ENTRYPOINT java -jar /tika-server-${TIKA_VERSION}.jar -h 0.0.0.0
30 ENTRYPOINT java -jar /tika-server.jar -h 0.0.0.0
31
1313 -s,--includeStack whether or not to return a stack trace
1414 if there is an exception during 'parse'
1515 ```
16 Running via Docker
17 ------------------
18 Assuming you have Docker installed, you can build your own local image using:
19
20 `mvn dockerfile:build`
21
22 The image will be named apache/tika-server with the tag being the version being built.
23 For example, building Apache Tika Server 1.17 will result in an image of `apache/tika-server:1.17`
24
25 You can then run this image by executing the following, replacing `1.17` with your build version:
26
27 `docker run -d -p 9998:9998 apache/tika-server:1.17`
28
29 This will load Apache Tika Server and expose its interface on:
30
31 `http://localhost:9998`
1632
1733 Usage
1834 -----
1919 <parent>
2020 <groupId>org.apache.tika</groupId>
2121 <artifactId>tika-parent</artifactId>
22 <version>1.17</version>
22 <version>1.18</version>
2323 <relativePath>../tika-parent/pom.xml</relativePath>
2424 </parent>
2525
258258 </configuration>
259259 </execution>
260260 </executions>
261 </plugin>
262 <plugin>
263 <groupId>com.spotify</groupId>
264 <artifactId>dockerfile-maven-plugin</artifactId>
265 <version>1.3.7</version>
266 <configuration>
267 <repository>apache/tika-server</repository>
268 <tag>${project.version}</tag>
269 <buildArgs>
270 <JAR_FILE>tika-server-${project.version}.jar</JAR_FILE>
271 </buildArgs>
272 </configuration>
261273 </plugin>
262274 <plugin>
263275 <groupId>org.apache.maven.plugins</groupId>
4848 import java.util.Locale;
4949 import java.util.Map;
5050 import java.util.Set;
51 import java.util.regex.Matcher;
52 import java.util.regex.Pattern;
5153
5254 import org.apache.commons.lang.StringUtils;
5355 import org.apache.cxf.jaxrs.ext.multipart.Attachment;
8183
8284 @Path("/tika")
8385 public class TikaResource {
86
87 private static Pattern ALLOWABLE_HEADER_CHARS = Pattern.compile("(?i)^[-/_\\.A-Z0-9 ]+$");
88
8489 public static final String GREETING = "This is Tika Server (" + new Tika().toString() + "). Please PUT\n";
8590 public static final String X_TIKA_OCR_HEADER_PREFIX = "X-Tika-OCR";
8691 public static final String X_TIKA_PDF_HEADER_PREFIX = "X-Tika-PDF";
189194 * @throws WebApplicationException thrown when field cannot be found.
190195 */
191196 private static void processHeaderConfig(MultivaluedMap<String, String> httpHeaders, Object object, String key, String prefix) {
192 try {
193 String property = StringUtils.removeStart(key, prefix);
194 Field field = object.getClass().getDeclaredField(StringUtils.uncapitalize(property));
195
196 field.setAccessible(true);
197 if (field.getType() == String.class) {
198 field.set(object, httpHeaders.getFirst(key));
199 } else if (field.getType() == int.class) {
200 field.setInt(object, Integer.parseInt(httpHeaders.getFirst(key)));
201 } else if (field.getType() == double.class) {
202 field.setDouble(object, Double.parseDouble(httpHeaders.getFirst(key)));
203 } else if (field.getType() == boolean.class) {
204 field.setBoolean(object, Boolean.parseBoolean(httpHeaders.getFirst(key)));
197
198 try {String property = StringUtils.removeStart(key, prefix);
199 Field field = null;
200 try {
201 field = object.getClass().getDeclaredField(StringUtils.uncapitalize(property));
202 } catch (NoSuchFieldException e) {
203 //swallow
204 }
205 String setter = property;
206 setter = "set"+setter.substring(0,1).toUpperCase(Locale.US)+setter.substring(1);
207 //default assume string class
208 //if there's a more specific type, e.g. double, int, boolean
209 //try that.
210 Class clazz = String.class;
211 if (field != null) {
212 if (field.getType() == int.class || field.getType() == Integer.class) {
213 clazz = int.class;
214 } else if (field.getType() == double.class) {
215 clazz = double.class;
216 } else if (field.getType() == Double.class) {
217 clazz = Double.class;
218 } else if (field.getType() == float.class) {
219 clazz = float.class;
220 } else if (field.getType() == Float.class) {
221 clazz = Float.class;
222 } else if (field.getType() == boolean.class) {
223 clazz = boolean.class;
224 } else if (field.getType() == Boolean.class) {
225 clazz = Boolean.class;
226 }
227 }
228
229 Method m = tryToGetMethod(object, setter, clazz);
230 //if you couldn't find more specific setter, back off
231 //to string setter and try that.
232 if (m == null && clazz != String.class) {
233 m = tryToGetMethod(object, setter, String.class);
234 }
235
236 if (m != null) {
237 String val = httpHeaders.getFirst(key);
238 val = val.trim();
239 if (clazz == String.class) {
240 checkTrustWorthy(setter, val);
241 m.invoke(object, val);
242 } else if (clazz == int.class || clazz == Integer.class) {
243 m.invoke(object, Integer.parseInt(val));
244 } else if (clazz == double.class || clazz == Double.class) {
245 m.invoke(object, Double.parseDouble(val));
246 } else if (clazz == boolean.class || clazz == Boolean.class) {
247 m.invoke(object, Boolean.parseBoolean(val));
248 } else if (clazz == float.class || clazz == Float.class) {
249 m.invoke(object, Float.parseFloat(val));
250 } else {
251 throw new IllegalArgumentException("setter must be String, int, float, double or boolean...for now");
252 }
205253 } else {
206 //couldn't find a directly accessible field
207 //try for setX(String s)
208 String setter = StringUtils.uncapitalize(property);
209 setter = "set"+setter.substring(0,1).toUpperCase(Locale.US)+setter.substring(1);
210 Method m = null;
211 try {
212 m = object.getClass().getMethod(setter, String.class);
213 } catch (NoSuchMethodException e) {
214 //swallow
215 }
216 if (m != null) {
217 m.invoke(object, httpHeaders.getFirst(key));
218 }
219 }
254 throw new NoSuchMethodException("Couldn't find: "+setter);
255 }
256
220257 } catch (Throwable ex) {
221258 throw new WebApplicationException(String.format(Locale.ROOT,
222259 "%s is an invalid %s header", key, X_TIKA_OCR_HEADER_PREFIX));
223260 }
261 }
262
263 private static void checkTrustWorthy(String setter, String val) {
264 if (setter == null || val == null) {
265 throw new IllegalArgumentException("setter and val must not be null");
266 }
267 if (setter.toLowerCase(Locale.US).contains("trusted")) {
268 throw new IllegalArgumentException("Can't call a trusted method via tika-server headers");
269 }
270 Matcher m = ALLOWABLE_HEADER_CHARS.matcher(val);
271 if (! m.find()) {
272 throw new IllegalArgumentException("Header val: "+val +" contains illegal characters. " +
273 "Must contain: TikaResource.ALLOWABLE_HEADER_CHARS");
274 }
275 }
276
277 /**
278 * Tries to get method. Silently swallows NoMethodException and returns
279 * <code>null</code> if not found.
280 * @param object
281 * @param method
282 * @param clazz
283 * @return
284 */
285 private static Method tryToGetMethod(Object object, String method, Class clazz) {
286 try {
287 return object.getClass().getMethod(method, clazz);
288 } catch (NoSuchMethodException e) {
289 //swallow
290 }
291 return null;
224292 }
225293
226294 @SuppressWarnings("serial")
1616
1717 package org.apache.tika.server;
1818
19 import static org.junit.Assert.assertEquals;
20 import static org.junit.Assert.assertFalse;
21 import static org.junit.Assert.assertTrue;
22
23 import javax.ws.rs.core.Response;
24 import java.io.InputStream;
25 import java.util.ArrayList;
26 import java.util.List;
27
2819 import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
2920 import org.apache.cxf.jaxrs.client.WebClient;
3021 import org.apache.cxf.jaxrs.ext.multipart.Attachment;
3425 import org.apache.tika.server.resource.TikaResource;
3526 import org.junit.Test;
3627
28 import javax.ws.rs.core.Response;
29 import java.io.InputStream;
30 import java.util.ArrayList;
31 import java.util.List;
32
33 import static org.junit.Assert.assertEquals;
34 import static org.junit.Assert.assertFalse;
35 import static org.junit.Assert.assertTrue;
36
3737 public class TikaResourceTest extends CXFTestBase {
3838 public static final String TEST_DOC = "test.doc";
3939 public static final String TEST_PASSWORD_PROTECTED = "password.xls";
278278 responseMsg
279279 );
280280 }
281
282 @Test
283 public void testDataIntegrityCheck() throws Exception {
284 Response response = WebClient.create(endPoint + TIKA_PATH)
285 .type("application/pdf")
286 .accept("text/plain")
287 .header(TikaResource.X_TIKA_OCR_HEADER_PREFIX +
288 "tesseractPath",
289
290 "C://tmp//hello.bat\u0000")
291 .put(ClassLoader.getSystemResourceAsStream("testOCR.pdf"));
292 assertEquals(500, response.getStatus());
293
294 response = WebClient.create(endPoint + TIKA_PATH)
295 .type("application/pdf")
296 .accept("text/plain")
297 .header(TikaResource.X_TIKA_OCR_HEADER_PREFIX +
298 "tesseractPath",
299 "bogus path")
300 .put(ClassLoader.getSystemResourceAsStream("testOCR.pdf"));
301 assertEquals(200, response.getStatus());
302 }
303
304 @Test
305 public void testTrustedMethodPrevention() {
306 Response response = WebClient.create(endPoint + TIKA_PATH)
307 .type("application/pdf")
308 .accept("text/plain")
309 .header(TikaResource.X_TIKA_OCR_HEADER_PREFIX +
310 "trustedPageSeparator",
311 "\u0010")
312 .put(ClassLoader.getSystemResourceAsStream("testOCR.pdf"));
313 assertEquals(500, response.getStatus());
314
315 }
316
317 @Test
318 public void testFloatInHeader() {
319 Response response = WebClient.create(endPoint + TIKA_PATH)
320 .type("application/pdf")
321 .accept("text/plain")
322 .header(TikaResource.X_TIKA_PDF_HEADER_PREFIX +
323 "averageCharTolerance",
324 "2.0")
325 .put(ClassLoader.getSystemResourceAsStream("testOCR.pdf"));
326 assertEquals(200, response.getStatus());
327
328 }
281329 }
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.17</version>
27 <version>1.18</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030
4949 <artifactId>microsoft-translator-java-api</artifactId>
5050 <version>0.6.2</version>
5151 <type>jar</type>
52 <exclusions>
53 <exclusion>
54 <groupId>com.googlecode.json-simple</groupId>
55 <artifactId>json-simple</artifactId>
56 </exclusion>
57 </exclusions>
58 </dependency>
59 <dependency>
60 <groupId>com.googlecode.json-simple</groupId>
61 <artifactId>json-simple</artifactId>
62 <version>1.1.1</version>
5263 </dependency>
5364 <dependency>
5465 <groupId>org.apache.cxf</groupId>
5566 <artifactId>cxf-rt-frontend-jaxrs</artifactId>
5667 <version>${cxf.version}</version>
68 <exclusions>
69 <!-- exclude because, as of 2.9.5, jaxb-annotations
70 is bringing in 2.9.0 of core's annotations
71 -->
72 <exclusion>
73 <groupId>com.fasterxml.jackson.core</groupId>
74 <artifactId>jackson-annotations</artifactId>
75 </exclusion>
76 </exclusions>
5777 </dependency>
5878 <dependency>
5979 <groupId>com.fasterxml.jackson.jaxrs</groupId>
6080 <artifactId>jackson-jaxrs-json-provider</artifactId>
61 <version>2.9.2</version>
81 <version>${jackson.version}</version>
82 <exclusions>
83 <!-- exclude because, as of 2.9.5, jaxrs-json-provider
84 is bringing in 2.9.0 of core's annotations
85 -->
86 <exclusion>
87 <groupId>com.fasterxml.jackson.core</groupId>
88 <artifactId>jackson-annotations</artifactId>
89 </exclusion>
90 </exclusions>
91 </dependency>
92 <dependency>
93 <groupId>com.fasterxml.jackson.core</groupId>
94 <artifactId>jackson-annotations</artifactId>
95 <version>${jackson.version}</version>
6296 </dependency>
6397
6498 <!-- Test dependencies -->
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.17</version>
27 <version>1.18</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030