New upstream version 1.18
Emmanuel Bourg
5 years ago
0 | Release 1.17 - December 8, 2017 | |
0 | Release 1.18 - 4/20/2018 | |
1 | ||
2 | * Upgrade Jackson to 2.9.5 (TIKA-2634). | |
3 | ||
4 | * Add support for brotli (TIKA-2621). | |
5 | ||
6 | * Upgrade PDFBox to 2.0.9 and include new jbig2-imageio | |
7 | from org.apache.pdfbox (TIKA-2579 and TIKA-2607). | |
8 | ||
9 | * Support for TIFF images in PDF files (TIKA-2338) | |
10 | ||
11 | * Detection of full encrypted 7z files (TIKA-2568) | |
12 | ||
13 | * Various new mimes and typo fixes in tika-mimetypes.xml | |
14 | via Andreas Meier (TIKA-2527). | |
15 | ||
16 | * Revert to listenForAllRecords=false in ExcelExtractor | |
17 | via Grigoriy Alekseev (TIKA-2590) | |
18 | ||
19 | * Add workaround to identify TIFFs that might confuse | |
20 | commons-compress's tar detection via Daniel Schmidt | |
21 | (TIKA-2591) | |
22 | ||
23 | * Ignore non-IANA supported charsets in HTML meta-headers | |
24 | during charset detection in HTMLEncodingDetector | |
25 | via Andreas Meier (TIKA-2592) | |
26 | ||
27 | * Add detection and parsing of zstd (if user provides | |
28 | com.github.luben:zstd-jni) via Andreas Meier (TIKA-2576) | |
29 | ||
30 | * Allow for RFC822 detection for files starting with "dkim-" | |
31 | and/or "x-" via Andreas Meier (TIKA-2578 and TIKA-2587) | |
32 | ||
33 | * Extract xlsx files embedded in OLE objects within PPT and PPTX | |
34 | via Brian McColgan (TIKA-2588). | |
35 | ||
36 | * Extract files embedded in HTML and javascript inside HTML | |
37 | that are stored in the Data URI scheme (TIKA-2563). | |
38 | ||
39 | * Extract text from grouped text boxes in PPT (TIKA-2569). | |
40 | ||
41 | * Extract language metadata item from PDF files via Matt Sheppard (TIKA-2559) | |
42 | ||
43 | * RFC822 with multipart/mixed, first text element should be treated | |
44 | as the main body of the email, not an attachment (TIKA-2547). | |
45 | ||
46 | * Swap out com.tdunning:json for com.github.openjson:openjson to avoid | |
47 | jar conflicts (TIKA-2556). | |
48 | ||
49 | * No longer hardcode HtmlParser for XML files in tika-server (TIKA-2551). | |
50 | ||
51 | * Require Java 8 (TIKA-2553). | |
52 | ||
53 | * Add a parser for XPS (TIKA-2524). | |
54 | ||
55 | * Mime magic for Dolby Digital AC3 and EAC3 files | |
56 | ||
57 | * Fixed bug where TesseractOCRParser ignores configured ImageMagickPath, | |
58 | and set rotation script to ignore Python warnings (TIKA-2509) | |
59 | ||
60 | * Upgrade geo-apis to 3.0.1 (TIKA-2535). | |
61 | ||
62 | * Added local Docker image build using dockerfile-maven-plugin to allow | |
63 | images to be built from source (TIKA-1518). | |
64 | ||
65 | Release 1.17 - 12/8/2017 | |
1 | 66 | |
2 | 67 | ***NOTE: THIS IS THE LAST VERSION OF TIKA THAT WILL RUN |
3 | 68 | ON Java 7. The next versions will require Java 8*** |
24 | 24 | <parent> |
25 | 25 | <groupId>org.apache.tika</groupId> |
26 | 26 | <artifactId>tika-parent</artifactId> |
27 | <version>1.17</version> | |
27 | <version>1.18</version> | |
28 | 28 | <relativePath>tika-parent/pom.xml</relativePath> |
29 | 29 | </parent> |
30 | 30 | |
103 | 103 | <include name="tika-eval/target/tika-eval-${project.version}.jar*" /> |
104 | 104 | </fileset> |
105 | 105 | </copy> |
106 | <checksum algorithm="MD5" fileext=".md5"> | |
106 | <checksum algorithm="SHA-512" fileext=".sha512"> | |
107 | 107 | <fileset dir="${basedir}/target/${project.version}"> |
108 | 108 | <include name="*.zip" /> |
109 | 109 | <include name="*.?ar" /> |
110 | 110 | </fileset> |
111 | 111 | </checksum> |
112 | <checksum algorithm="SHA1" fileext=".sha"> | |
113 | <fileset dir="${basedir}/target/${project.version}"> | |
114 | <include name="*.zip" /> | |
115 | <include name="*.?ar" /> | |
116 | </fileset> | |
117 | </checksum> | |
118 | <checksum file="${basedir}/target/${project.version}/tika-${project.version}-src.zip" algorithm="SHA1" property="checksum" /> | |
112 | <checksum file="${basedir}/target/${project.version}/tika-${project.version}-src.zip" algorithm="SHA-512" property="checksum" /> | |
119 | 113 | <echo file="${basedir}/target/vote.txt"> |
120 | 114 | From: ${username}@apache.org |
121 | 115 | To: dev@tika.apache.org |
128 | 122 | The release candidate is a zip archive of the sources in: |
129 | 123 | https://github.com/apache/tika/tree/{project.version}-rcN/ |
130 | 124 | |
131 | The SHA1 checksum of the archive is | |
125 | The SHA-512 checksum of the archive is | |
132 | 126 | ${checksum}. |
133 | 127 | |
134 | 128 | In addition, a staged maven repository is available here: |
24 | 24 | <parent> |
25 | 25 | <groupId>org.apache.tika</groupId> |
26 | 26 | <artifactId>tika-parent</artifactId> |
27 | <version>1.17</version> | |
27 | <version>1.18</version> | |
28 | 28 | <relativePath>../tika-parent/pom.xml</relativePath> |
29 | 29 | </parent> |
30 | 30 |
16 | 16 | |
17 | 17 | package org.apache.tika.cli; |
18 | 18 | |
19 | ||
20 | import org.apache.commons.lang.SystemUtils; | |
21 | ||
19 | 22 | import java.io.IOException; |
20 | 23 | import java.nio.file.Files; |
21 | 24 | import java.nio.file.Path; |
40 | 43 | static Pattern JVM_OPTS_PATTERN = Pattern.compile("^(--?)J(.+)"); |
41 | 44 | |
42 | 45 | protected static String[] build(String[] args) throws IOException { |
46 | ||
43 | 47 | Map<String, String> processArgs = new LinkedHashMap<String, String>(); |
44 | 48 | Map<String, String> jvmOpts = new LinkedHashMap<String,String>(); |
45 | 49 | //take the args, and divide them into process args and options for |
52 | 56 | //maybe the user specified a different classpath?! |
53 | 57 | if (! jvmOpts.containsKey("-cp") && ! jvmOpts.containsKey("--classpath")) { |
54 | 58 | String cp = System.getProperty("java.class.path"); |
55 | //need to test for " " on *nix, can't just add double quotes | |
56 | //across platforms. | |
57 | if (cp.contains(" ")){ | |
58 | cp = "\""+cp+"\""; | |
59 | } | |
60 | 59 | jvmOpts.put("-cp", cp); |
61 | 60 | } |
62 | 61 | |
69 | 68 | } |
70 | 69 | //use the log4j config file inside the app /resources/log4j_batch_process.properties |
71 | 70 | if (! hasLog4j) { |
72 | jvmOpts.put("-Dlog4j.configuration=\"log4j_batch_process.properties\"", ""); | |
71 | jvmOpts.put("-Dlog4j.configuration=log4j_batch_process.properties", ""); | |
73 | 72 | } |
74 | 73 | //now build the full command line |
75 | 74 | List<String> fullCommand = new ArrayList<String>(); |
78 | 77 | for (Map.Entry<String, String> e : jvmOpts.entrySet()) { |
79 | 78 | fullCommand.add(e.getKey()); |
80 | 79 | if (e.getValue().length() > 0) { |
81 | fullCommand.add(e.getValue()); | |
80 | fullCommand.add(commandLineSafe(e.getValue())); | |
82 | 81 | } |
83 | 82 | if (e.getKey().contains("java.awt.headless")) { |
84 | 83 | foundHeadlessOption = true; |
93 | 92 | for (Map.Entry<String, String> e : processArgs.entrySet()) { |
94 | 93 | fullCommand.add(e.getKey()); |
95 | 94 | if (e.getValue().length() > 0) { |
96 | fullCommand.add(e.getValue()); | |
95 | fullCommand.add(commandLineSafe(e.getValue())); | |
97 | 96 | } |
98 | 97 | } |
99 | 98 | return fullCommand.toArray(new String[fullCommand.size()]); |
99 | } | |
100 | ||
101 | protected static String commandLineSafe(String arg) { | |
102 | if (arg == null) { | |
103 | return arg; | |
104 | } | |
105 | //need to test for " " on windows, can't just add double quotes | |
106 | //across platforms. | |
107 | if (arg.contains(" ") && SystemUtils.IS_OS_WINDOWS) { | |
108 | arg = "\"" + arg + "\""; | |
109 | } | |
110 | return arg; | |
100 | 111 | } |
101 | 112 | |
102 | 113 |
1040 | 1040 | if (name == null) { |
1041 | 1041 | name = "file" + count++; |
1042 | 1042 | } |
1043 | ||
1043 | if (! inputStream.markSupported()) { | |
1044 | inputStream = TikaInputStream.get(inputStream); | |
1045 | } | |
1044 | 1046 | MediaType contentType = detector.detect(inputStream, metadata); |
1045 | 1047 | |
1046 | 1048 | if (name.indexOf('.')==-1 && contentType!=null) { |
40 | 40 | Path testFile = null; |
41 | 41 | |
42 | 42 | String testInputPathForCommandLine; |
43 | String escapedInputPathForCommandLine; | |
43 | 44 | |
44 | 45 | @Before |
45 | 46 | public void init() { |
56 | 57 | throw new RuntimeException("Couldn't open testFile"); |
57 | 58 | } |
58 | 59 | testInputPathForCommandLine = testInput.toAbsolutePath().toString(); |
60 | escapedInputPathForCommandLine = BatchCommandLineBuilder.commandLineSafe(testInputPathForCommandLine); | |
59 | 61 | } |
60 | 62 | |
61 | 63 | @After |
113 | 115 | assertEquals("true", attrs.get("-recursiveParserWrapper")); |
114 | 116 | assertEquals("html", attrs.get("-basicHandlerType")); |
115 | 117 | assertEquals("batch-config.xml", attrs.get("-bc")); |
116 | assertEquals(testInputPathForCommandLine, attrs.get("-inputDir")); | |
118 | assertEquals(escapedInputPathForCommandLine, attrs.get("-inputDir")); | |
117 | 119 | } |
118 | 120 | |
119 | 121 | @Test |
124 | 126 | |
125 | 127 | String[] commandLine = BatchCommandLineBuilder.build(params); |
126 | 128 | Map<String, String> attrs = mapify(commandLine); |
127 | assertEquals(testInputPathForCommandLine, attrs.get("-inputDir")); | |
129 | assertEquals(escapedInputPathForCommandLine, attrs.get("-inputDir")); | |
128 | 130 | assertEquals(outputRoot, attrs.get("-outputDir")); |
129 | 131 | } |
130 | 132 | |
135 | 137 | |
136 | 138 | String[] commandLine = BatchCommandLineBuilder.build(params); |
137 | 139 | Map<String, String> attrs = mapify(commandLine); |
138 | assertEquals(testInputPathForCommandLine, attrs.get("-inputDir")); | |
140 | assertEquals(escapedInputPathForCommandLine, attrs.get("-inputDir")); | |
139 | 141 | assertEquals(outputRoot, attrs.get("-outputDir")); |
140 | 142 | |
141 | 143 | params = new String[]{"--inputDir", testInputPathForCommandLine, "--outputDir", outputRoot}; |
142 | 144 | |
143 | 145 | commandLine = BatchCommandLineBuilder.build(params); |
144 | 146 | attrs = mapify(commandLine); |
145 | assertEquals(testInputPathForCommandLine, attrs.get("-inputDir")); | |
147 | assertEquals(escapedInputPathForCommandLine, attrs.get("-inputDir")); | |
146 | 148 | assertEquals(outputRoot, attrs.get("-outputDir")); |
147 | 149 | |
148 | 150 | params = new String[]{"-inputDir", testInputPathForCommandLine, "-outputDir", outputRoot}; |
149 | 151 | |
150 | 152 | commandLine = BatchCommandLineBuilder.build(params); |
151 | 153 | attrs = mapify(commandLine); |
152 | assertEquals(testInputPathForCommandLine, attrs.get("-inputDir")); | |
154 | assertEquals(escapedInputPathForCommandLine, attrs.get("-inputDir")); | |
153 | 155 | assertEquals(outputRoot, attrs.get("-outputDir")); |
154 | 156 | } |
155 | 157 | |
162 | 164 | "--config="+configPath}; |
163 | 165 | String[] commandLine = BatchCommandLineBuilder.build(params); |
164 | 166 | Map<String, String> attrs = mapify(commandLine); |
165 | assertEquals(testInputPathForCommandLine, attrs.get("-inputDir")); | |
167 | assertEquals(escapedInputPathForCommandLine, attrs.get("-inputDir")); | |
166 | 168 | assertEquals(outputRoot, attrs.get("-outputDir")); |
167 | 169 | assertEquals(configPath, attrs.get("-c")); |
168 | 170 |
281 | 281 | FileUtils.deleteDirectory(tempFile); |
282 | 282 | } |
283 | 283 | } |
284 | ||
285 | @Test | |
286 | public void testExtractTgz() throws Exception { | |
287 | //TIKA-2564 | |
288 | File tempFile = File.createTempFile("tika-test-", ""); | |
289 | tempFile.delete(); | |
290 | tempFile.mkdir(); | |
291 | ||
292 | try { | |
293 | String[] params = {"--extract-dir="+tempFile.getAbsolutePath(),"-z", resourcePrefix + "/test-documents.tgz"}; | |
294 | ||
295 | TikaCLI.main(params); | |
296 | ||
297 | StringBuffer allFiles = new StringBuffer(); | |
298 | for (String f : tempFile.list()) { | |
299 | if (allFiles.length() > 0) allFiles.append(" : "); | |
300 | allFiles.append(f); | |
301 | } | |
302 | ||
303 | File expectedTAR = new File(tempFile, "test-documents.tar"); | |
304 | ||
305 | assertExtracted(expectedTAR, allFiles.toString()); | |
306 | } finally { | |
307 | FileUtils.deleteDirectory(tempFile); | |
308 | } | |
309 | } | |
310 | ||
311 | ||
284 | 312 | protected static void assertExtracted(File f, String allFiles) { |
285 | 313 | |
286 | 314 | assertTrue( |
0 | /* | |
1 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
2 | * contributor license agreements. See the NOTICE file distributed with | |
3 | * this work for additional information regarding copyright ownership. | |
4 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
5 | * (the "License"); you may not use this file except in compliance with | |
6 | * the License. You may obtain a copy of the License at | |
7 | * | |
8 | * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | * | |
10 | * Unless required by applicable law or agreed to in writing, software | |
11 | * distributed under the License is distributed on an "AS IS" BASIS, | |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
13 | * See the License for the specific language governing permissions and | |
14 | * limitations under the License. | |
15 | */ | |
16 | ||
17 | package org.apache.tika.extractor; | |
18 | ||
19 | import org.apache.tika.batch.DigestingAutoDetectParserFactory; | |
20 | import org.apache.tika.config.TikaConfig; | |
21 | import org.apache.tika.extractor.EmbeddedDocumentUtil; | |
22 | import org.apache.tika.parser.AutoDetectParser; | |
23 | import org.apache.tika.parser.ParseContext; | |
24 | import org.apache.tika.parser.Parser; | |
25 | import org.apache.tika.parser.RecursiveParserWrapper; | |
26 | import org.apache.tika.sax.BasicContentHandlerFactory; | |
27 | import org.junit.Test; | |
28 | ||
29 | import static org.junit.Assert.assertEquals; | |
30 | import static org.junit.Assert.assertNotNull; | |
31 | ||
32 | public class TestEmbeddedDocumentUtil { | |
33 | //TODO -- figure out how to mock this into tika-core | |
34 | ||
35 | @Test | |
36 | public void testSimple() { | |
37 | Parser p = new AutoDetectParser(); | |
38 | ParseContext parseContext = new ParseContext(); | |
39 | parseContext.set(Parser.class, p); | |
40 | Parser txtParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(org.apache.tika.parser.txt.TXTParser.class, parseContext); | |
41 | assertNotNull(txtParser); | |
42 | assertEquals(org.apache.tika.parser.txt.TXTParser.class, txtParser.getClass()); | |
43 | ||
44 | } | |
45 | ||
46 | @Test | |
47 | public void testDoublyDecorated() { | |
48 | Parser d = new DigestingAutoDetectParserFactory().getParser(TikaConfig.getDefaultConfig()); | |
49 | RecursiveParserWrapper wrapper = new RecursiveParserWrapper(d, | |
50 | new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1)); | |
51 | ParseContext parseContext = new ParseContext(); | |
52 | parseContext.set(Parser.class, wrapper); | |
53 | Parser txtParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(org.apache.tika.parser.txt.TXTParser.class, parseContext); | |
54 | assertNotNull(txtParser); | |
55 | assertEquals(org.apache.tika.parser.txt.TXTParser.class, txtParser.getClass()); | |
56 | } | |
57 | } |
24 | 24 | <parent> |
25 | 25 | <groupId>org.apache.tika</groupId> |
26 | 26 | <artifactId>tika-parent</artifactId> |
27 | <version>1.17</version> | |
27 | <version>1.18</version> | |
28 | 28 | <relativePath>../tika-parent/pom.xml</relativePath> |
29 | 29 | </parent> |
30 | 30 | |
34 | 34 | <url>http://tika.apache.org/</url> |
35 | 35 | |
36 | 36 | <properties> |
37 | <cli.version>1.3.1</cli.version> | |
37 | <cli.version>1.4</cli.version> | |
38 | 38 | </properties> |
39 | 39 | |
40 | 40 | <dependencies> |
24 | 24 | <parent> |
25 | 25 | <groupId>org.apache.tika</groupId> |
26 | 26 | <artifactId>tika-parent</artifactId> |
27 | <version>1.17</version> | |
27 | <version>1.18</version> | |
28 | 28 | <relativePath>../tika-parent/pom.xml</relativePath> |
29 | 29 | </parent> |
30 | 30 | |
71 | 71 | <groupId>org.ops4j.pax.exam</groupId> |
72 | 72 | <artifactId>pax-exam-container-native</artifactId> |
73 | 73 | <version>${pax.exam.version}</version> |
74 | <exclusions> | |
75 | <exclusion> | |
76 | <groupId>org.ops4j.base</groupId> | |
77 | <artifactId>ops4j-base-util-property</artifactId> | |
78 | </exclusion> | |
79 | <exclusion> | |
80 | <groupId>org.ops4j.base</groupId> | |
81 | <artifactId>ops4j-base-lang</artifactId> | |
82 | </exclusion> | |
83 | </exclusions> | |
84 | <scope>test</scope> | |
85 | </dependency> | |
86 | <dependency> | |
87 | <groupId>org.ops4j.base</groupId> | |
88 | <artifactId>ops4j-base-util-property</artifactId> | |
89 | <version>1.5.0</version> | |
90 | <scope>test</scope> | |
91 | </dependency> | |
92 | <dependency> | |
93 | <groupId>org.ops4j.base</groupId> | |
94 | <artifactId>ops4j-base-lang</artifactId> | |
95 | <version>1.5.0</version> | |
74 | 96 | <scope>test</scope> |
75 | 97 | </dependency> |
76 | 98 | <dependency> |
167 | 189 | sis-netcdf| |
168 | 190 | sis-utility| |
169 | 191 | sis-storage| |
192 | unit-api| | |
170 | 193 | apache-mime4j-core| |
171 | 194 | apache-mime4j-dom| |
172 | jsr-275| | |
173 | 195 | jhighlight| |
174 | 196 | java-libpst| |
175 | 197 | netcdf4| |
205 | 227 | android.util;resolution:=optional, |
206 | 228 | com.adobe.xmp;resolution:=optional, |
207 | 229 | com.adobe.xmp.properties;resolution:=optional, |
230 | com.github.luben.zstd;resolution:=optional, | |
231 | com.github.openjson;resolution:=optional, | |
208 | 232 | com.google.protobuf;resolution:=optional, |
209 | 233 | com.ibm.icu.text;resolution:=optional, |
210 | 234 | com.sleepycat.je;resolution:=optional, |
253 | 277 | org.apache.pdfbox.debugger;resolution:=optional, |
254 | 278 | org.apache.sis;resolution:=optional, |
255 | 279 | org.apache.sis.distance;resolution:=optional, |
280 | org.apache.sis.feature;resolution:=optional, | |
256 | 281 | org.apache.sis.geometry;resolution:=optional, |
282 | org.apache.sis.internal.feature;resolution:=optional, | |
283 | org.apache.sis.internal.referencing;resolution:=optional, | |
284 | org.apache.sis.parameter;resolution:=optional, | |
285 | org.apache.sis.referencing;resolution:=optional, | |
257 | 286 | org.apache.tools.ant;resolution:=optional, |
258 | 287 | org.apache.tools.ant.taskdefs;resolution:=optional, |
259 | 288 | org.apache.tools.ant.types;resolution:=optional, |
294 | 323 | org.jdom2.output;resolution:=optional, |
295 | 324 | org.jdom2.filter;resolution:=optional, |
296 | 325 | org.json.simple;resolution:=optional, |
297 | org.json;resolution:=optional, | |
298 | 326 | org.openxmlformats.schemas.officeDocument.x2006.math;resolution:=optional, |
299 | 327 | org.openxmlformats.schemas.schemaLibrary.x2006.main;resolution:=optional, |
300 | 328 | org.osgi.framework;resolution:=optional, |
24 | 24 | <parent> |
25 | 25 | <groupId>org.apache.tika</groupId> |
26 | 26 | <artifactId>tika-parent</artifactId> |
27 | <version>1.17</version> | |
27 | <version>1.18</version> | |
28 | 28 | <relativePath>../tika-parent/pom.xml</relativePath> |
29 | 29 | </parent> |
30 | 30 |
240 | 240 | Parser returnParser = null; |
241 | 241 | if (p != null) { |
242 | 242 | if (p instanceof ParserDecorator) { |
243 | p = ((ParserDecorator)p).getWrappedParser(); | |
243 | p = findInDecorated((ParserDecorator)p, clazz); | |
244 | 244 | } |
245 | 245 | if (equals(p, clazz)) { |
246 | 246 | return p; |
254 | 254 | } |
255 | 255 | |
256 | 256 | return null; |
257 | } | |
258 | ||
259 | private static Parser findInDecorated(ParserDecorator p, Class clazz) { | |
260 | Parser candidate = p.getWrappedParser(); | |
261 | if (equals(candidate, clazz)) { | |
262 | return candidate; | |
263 | } | |
264 | if (candidate instanceof ParserDecorator) { | |
265 | candidate = findInDecorated((ParserDecorator)candidate, clazz); | |
266 | } | |
267 | return candidate; | |
257 | 268 | } |
258 | 269 | |
259 | 270 | private static Parser findInComposite(CompositeParser p, Class clazz, ParseContext context) { |
264 | 275 | return candidate; |
265 | 276 | } |
266 | 277 | if (candidate instanceof ParserDecorator) { |
267 | candidate = ((ParserDecorator)candidate).getWrappedParser(); | |
278 | candidate = findInDecorated((ParserDecorator)candidate, clazz); | |
268 | 279 | } |
269 | 280 | if (equals(candidate, clazz)) { |
270 | 281 | return candidate; |
229 | 229 | break; |
230 | 230 | } |
231 | 231 | } |
232 | if (i < 0) { | |
233 | throw new IOException("Buffer underun; expected one more byte"); | |
234 | } | |
232 | 235 | return v; |
233 | 236 | } |
234 | 237 |
69 | 69 | * The unit tests for this class are in the tika-parsers module. |
70 | 70 | * </p> |
71 | 71 | */ |
72 | public class RecursiveParserWrapper implements Parser { | |
72 | public class RecursiveParserWrapper extends ParserDecorator { | |
73 | 73 | |
74 | 74 | /** |
75 | 75 | * Generated serial version |
125 | 125 | */ |
126 | 126 | public RecursiveParserWrapper(Parser wrappedParser, |
127 | 127 | ContentHandlerFactory contentHandlerFactory, boolean catchEmbeddedExceptions) { |
128 | super(wrappedParser); | |
128 | 129 | this.wrappedParser = wrappedParser; |
129 | 130 | this.contentHandlerFactory = contentHandlerFactory; |
130 | 131 | this.catchEmbeddedExceptions = catchEmbeddedExceptions; |
30 | 30 | * ({@link #characters(char[], int, int)} or |
31 | 31 | * {@link #ignorableWhitespace(char[], int, int)}) passed to the decorated |
32 | 32 | * content handler contain only valid XML characters. All invalid characters |
33 | * are replaced with spaces. | |
33 | * are replaced with the Unicode replacement character U+FFFD (though a | |
34 | * subclass may change this by overriding the {@link #writeReplacement(Output)} method). | |
34 | 35 | * <p> |
35 | 36 | * The XML standard defines the following Unicode character ranges as |
36 | 37 | * valid XML characters: |
158 | 158 | * @return the regular expression containing the most important technical standard organizations. |
159 | 159 | */ |
160 | 160 | public static String getOrganzationsRegex() { |
161 | String regex = "(" + String.join("|", organizations.keySet()) + ")"; | |
162 | ||
163 | return regex; | |
161 | StringBuilder sb = new StringBuilder(); | |
162 | sb.append("("); | |
163 | int i = 0; | |
164 | for (String org : organizations.keySet()) { | |
165 | if (i > 0) { | |
166 | sb.append("|"); | |
167 | } | |
168 | sb.append(org); | |
169 | i++; | |
170 | } | |
171 | sb.append(")"); | |
172 | return sb.toString(); | |
164 | 173 | } |
165 | 174 | }⏎ |
118 | 118 | <mime-type type="application/cnrp+xml"/> |
119 | 119 | <mime-type type="application/commonground"/> |
120 | 120 | <mime-type type="application/conference-info+xml"/> |
121 | ||
122 | <mime-type type="application/coreldraw"> | |
123 | <alias type="application/x-coreldraw"/> | |
124 | <alias type="application/x-cdr"/> | |
125 | <alias type="application/cdr"/> | |
126 | <alias type="image/x-cdr"/> | |
127 | <alias type="image/cdr"/> | |
128 | <_comment>CorelDraw</_comment> | |
129 | <_comment>cdr: CorelDraw</_comment> | |
130 | <_comment>des: CorelDraw X4 and newer</_comment> | |
131 | <magic priority="60"> | |
132 | <match value="RIFF" type="string" offset="0"> | |
133 | <match value="CDR" type="string" offset="8" /> | |
134 | <match value="cdr" type="string" offset="8" /> | |
135 | <match value="DES" type="string" offset="8" /> | |
136 | <match value="des" type="string" offset="8" /> | |
137 | </match> | |
138 | </magic> | |
139 | <glob pattern="*.cdr"/> | |
140 | </mime-type> | |
141 | ||
121 | 142 | <mime-type type="application/cpl+xml"/> |
122 | 143 | <mime-type type="application/csta+xml"/> |
123 | 144 | <mime-type type="application/cstadata+xml"/> |
347 | 368 | <alias type="application/mac-binhex"/> |
348 | 369 | <alias type="application/binhex"/> |
349 | 370 | <magic priority="50"> |
350 | <match value="must\ be\ converted\ with\ BinHex" type="string" offset="11"/> | |
371 | <match value="must be converted with BinHex" type="string" offset="11"/> | |
351 | 372 | </magic> |
352 | 373 | <glob pattern="*.hqx"/> |
353 | 374 | </mime-type> |
839 | 860 | <mime-type type="application/smil+xml"> |
840 | 861 | <alias type="application/smil"/> |
841 | 862 | <_comment>SMIL Multimedia</_comment> |
863 | <root-XML localName="smil"/> | |
864 | <sub-class-of type="application/xml"/> | |
842 | 865 | <glob pattern="*.smi"/> |
843 | 866 | <glob pattern="*.smil"/> |
844 | 867 | <glob pattern="*.sml"/> |
1390 | 1413 | <mime-type type="application/vnd.intu.qfx"> |
1391 | 1414 | <glob pattern="*.qfx"/> |
1392 | 1415 | </mime-type> |
1416 | <mime-type type="application/vnd.iptc.g2.catalogitem+xml"/> | |
1393 | 1417 | <mime-type type="application/vnd.iptc.g2.conceptitem+xml"/> |
1394 | 1418 | <mime-type type="application/vnd.iptc.g2.knowledgeitem+xml"/> |
1395 | 1419 | <mime-type type="application/vnd.iptc.g2.newsitem+xml"/> |
1420 | ||
1421 | <mime-type type="application/vnd.iptc.g2.newsmessage+xml"> | |
1422 | <root-XML localName="newsMessage"/> | |
1423 | <root-XML localName="newsMessage" namespaceURI="http://iptc.org/std/nar/2006-10-01/"/> | |
1424 | <sub-class-of type="application/xml"/> | |
1425 | <_comment>XML syntax for IPTC NewsMessages</_comment> | |
1426 | <glob pattern="*.nar"/> | |
1427 | </mime-type> | |
1428 | ||
1396 | 1429 | <mime-type type="application/vnd.iptc.g2.packageitem+xml"/> |
1430 | <mime-type type="application/vnd.iptc.g2.planningitem+xml"/> | |
1431 | ||
1397 | 1432 | <mime-type type="application/vnd.ipunplugged.rcprofile"> |
1398 | 1433 | <glob pattern="*.rcprofile"/> |
1399 | 1434 | </mime-type> |
2774 | 2809 | <mime-type type="application/wspolicy+xml"> |
2775 | 2810 | <glob pattern="*.wspolicy"/> |
2776 | 2811 | </mime-type> |
2812 | ||
2813 | <mime-type type="image/x-tga"> | |
2814 | <alias type="image/x-targa"/> | |
2815 | <!-- trailer bytes: 54 52 55 45 56 49 53 49 4F 4E 2D 58 46 49 4C 45 2E 00 | |
2816 | trailer as string: TRUEVISION-XFILE\\x2E\\x00 | |
2817 | Some .tga files may be conflicting with application/x-123 recognition, | |
2818 | therefore this mime-type must be set in front of application/x-123 --> | |
2819 | <_comment>Targa image data</_comment> | |
2820 | <magic priority="90"> | |
2821 | <match value="0x01010000" type="big32" offset="1" > | |
2822 | <match value=".*[\\x54\\x52\\x55\\x45\\x56\\x49\\x53\\x49\\x4F\\x4E\\x2D\\x58\\x46\\x49\\x4C\\x45\\x2E\\x00]" type="regex" offset="8" /> | |
2823 | </match> | |
2824 | <match value="0x00020000" type="big32" offset="1" > | |
2825 | <match value=".*[\\x54\\x52\\x55\\x45\\x56\\x49\\x53\\x49\\x4F\\x4E\\x2D\\x58\\x46\\x49\\x4C\\x45\\x2E\\x00]" type="regex" offset="8" /> | |
2826 | </match> | |
2827 | <match value="0x00030000" type="big32" offset="1" > | |
2828 | <match value=".*[\\x54\\x52\\x55\\x45\\x56\\x49\\x53\\x49\\x4F\\x4E\\x2D\\x58\\x46\\x49\\x4C\\x45\\x2E\\x00]" type="regex" offset="8" /> | |
2829 | </match> | |
2830 | </magic> | |
2831 | <glob pattern="*.tga"/> | |
2832 | <glob pattern="*.icb"/> | |
2833 | <glob pattern="*.vda"/> | |
2834 | <!-- <glob pattern="*.vst"/> --> <!-- conflicting with application/vnd.visio--> | |
2835 | </mime-type> | |
2777 | 2836 | |
2778 | 2837 | <mime-type type="application/x-123"> |
2779 | 2838 | <magic priority="50"> |
3075 | 3134 | <match value="bplist" type="string" offset="0"/> |
3076 | 3135 | </magic> |
3077 | 3136 | </mime-type> |
3137 | <mime-type type="application/x-gtar"> | |
3138 | <_comment>GNU tar Compressed File Archive (GNU Tape Archive)</_comment> | |
3139 | <magic priority="50"> | |
3140 | <!-- GNU tar archive --> | |
3141 | <match value="ustar \0" type="string" offset="257" /> | |
3142 | </magic> | |
3143 | <glob pattern="*.gtar"/> | |
3144 | <sub-class-of type="application/x-tar"/> | |
3145 | </mime-type> | |
3146 | ||
3147 | <mime-type type="application/x-brotli"> | |
3148 | <glob pattern="*.br" /> | |
3149 | <glob pattern="*.brotli" /> | |
3150 | </mime-type> | |
3078 | 3151 | |
3079 | 3152 | <mime-type type="application/x-bzip"> |
3080 | 3153 | <magic priority="40"> |
3452 | 3525 | <glob pattern="*.tgz" /> |
3453 | 3526 | <glob pattern="*-gz" /> |
3454 | 3527 | </mime-type> |
3455 | ||
3528 | <mime-type type="application/zstd"> | |
3529 | <_comment>https://en.wikipedia.org/wiki/Zstandard</_comment> | |
3530 | <_comment>https://tools.ietf.org/id/draft-kucherawy-dispatch-zstd-01.html</_comment> | |
3531 | <magic priority="50"> | |
3532 | <match value="0xFD2FB528" type="little32" offset="0"/> | |
3533 | </magic> | |
3534 | <glob pattern="*.zstd"/> | |
3535 | </mime-type> | |
3456 | 3536 | <mime-type type="application/x-hdf"> |
3457 | 3537 | <_comment>Hierarchical Data Format File</_comment> |
3458 | 3538 | <magic priority="50"> |
3591 | 3671 | <match value="-lz5-" type="string" offset="2"/> |
3592 | 3672 | </magic> |
3593 | 3673 | </mime-type> |
3674 | ||
3675 | <mime-type type="application/x-lz4"> | |
3676 | <_comment>First match LZ4 Frame</_comment> | |
3677 | <_comment>Second match Legacy Frame</_comment> | |
3678 | <magic priority="60"> | |
3679 | <match value="0x184d2204" type="little32" offset="0" /> | |
3680 | <match value="0x184c2102" type="little32" offset="0" /> | |
3681 | </magic> | |
3682 | <glob pattern="*.lz4"/> | |
3683 | </mime-type> | |
3684 | ||
3685 | <mime-type type="application/x-lzip"> | |
3686 | <_comment>Lzip (LZMA) compressed archive</_comment> | |
3687 | <magic priority="50"> | |
3688 | <match value="\x4c\x5a\x49\x50" type="string" offset="0"/> | |
3689 | </magic> | |
3690 | <glob pattern="*.lz"/> | |
3691 | </mime-type> | |
3692 | ||
3693 | <mime-type type="application/x-lzma"> | |
3694 | <_comment>LZMA compressed archive</_comment> | |
3695 | <glob pattern="*.lzma"/> | |
3696 | </mime-type> | |
3594 | 3697 | |
3595 | 3698 | <mime-type type="application/x-mobipocket-ebook"> |
3596 | 3699 | <acronym>MOBI</acronym> |
4002 | 4105 | <acronym>ESRI Shapefiles</acronym> |
4003 | 4106 | <_comment>ESRI Shapefiles</_comment> |
4004 | 4107 | <magic priority="60"> |
4005 | <match value="0x0000270a" type="big32" offset="2" /> | |
4108 | <match value="0x0000270a" type="big32" offset="0" /> | |
4006 | 4109 | </magic> |
4007 | 4110 | <glob pattern="*.shp"/> |
4008 | 4111 | </mime-type> |
4740 | 4843 | <glob pattern="*.aac"/> |
4741 | 4844 | </mime-type> |
4742 | 4845 | |
4743 | <mime-type type="audio/x-adbcm"> | |
4846 | <mime-type type="audio/x-adpcm"> | |
4744 | 4847 | <magic priority="20"> |
4745 | 4848 | <match value=".snd" type="string" offset="0"> |
4746 | 4849 | <match value="23" type="big32" offset="12"/> |
4766 | 4869 | <glob pattern="*.aiff"/> |
4767 | 4870 | <glob pattern="*.aifc"/> |
4768 | 4871 | </mime-type> |
4872 | ||
4873 | <mime-type type="audio/x-caf"> | |
4874 | <_comment>Core Audio Format</_comment> | |
4875 | <_comment>com.apple.coreaudio-format</_comment> | |
4876 | <magic priority="60"> | |
4877 | <match value="caff" type="string" offset="0" /> | |
4878 | </magic> | |
4879 | <glob pattern="*.caf"/> | |
4880 | </mime-type> | |
4769 | 4881 | |
4770 | 4882 | <mime-type type="audio/x-dec-basic"> |
4771 | 4883 | <magic priority="20"> |
4781 | 4893 | </magic> |
4782 | 4894 | </mime-type> |
4783 | 4895 | |
4784 | <mime-type type="audio/x-dec-adbcm"> | |
4896 | <mime-type type="audio/x-dec-adpcm"> | |
4785 | 4897 | <magic priority="20"> |
4786 | 4898 | <match value="0x0064732E" type="big32" offset="0"> |
4787 | 4899 | <match value="23" type="big32" offset="12"/> |
5612 | 5724 | <magic priority="50"> |
5613 | 5725 | <match value="Delivered-To:" type="string" offset="0"/> |
5614 | 5726 | <match value="Status:" type="string" offset="0"/> |
5615 | <match value="X-Mozilla-Keys:" type="string" offset="0"/> | |
5616 | <match value="X-Mozilla-Status:" type="string" offset="0"/> | |
5617 | <match value="X-Mozilla-Status2:" type="string" offset="0"/> | |
5618 | 5727 | <match value="Relay-Version:" type="stringignorecase" offset="0"/> |
5619 | 5728 | <match value="#!\ rnews" type="string" offset="0"/> |
5620 | 5729 | <match value="N#!\ rnews" type="string" offset="0"/> |
5624 | 5733 | <match value="From:" type="stringignorecase" offset="0"/> |
5625 | 5734 | <match value="Received:" type="stringignorecase" offset="0"/> |
5626 | 5735 | <match value="Message-ID:" type="stringignorecase" offset="0"/> |
5736 | <match value="\nReturn-Path:" type="stringignorecase" offset="0:1000"/> | |
5737 | <match value="\nX-Originating-IP:" type="stringignorecase" offset="0:1000"/> | |
5738 | <match value="\nReceived:" type="stringignorecase" offset="0:1000"/> | |
5627 | 5739 | <match value="Date:" type="string" offset="0"/> |
5628 | 5740 | <match value="User-Agent:" type="string" offset="0"/> |
5629 | 5741 | <match value="MIME-Version:" type="stringignorecase" offset="0"/> |
5631 | 5743 | <match value="X-Notes-Item:" type="string" offset="0"> |
5632 | 5744 | <match value="Message-ID:" type="string" offset="0:8192"/> |
5633 | 5745 | </match> |
5746 | <match value="X-" type="stringignorecase" offset="0"> | |
5747 | <match value="\nMessage-ID:" type="string" offset="0:8192"/> | |
5748 | <match value="\nFrom:" type="stringignorecase" offset="0:8192"/> | |
5749 | <match value="\nTo:" type="stringignorecase" offset="0:8192"/> | |
5750 | <match value="\nSubject:" type="string" offset="0:8192"/> | |
5751 | <match value="\nMIME-Version:" type="stringignorecase" offset="0:8192"/> | |
5752 | </match> | |
5753 | <match value="DKIM-" type="string" offset="0"> | |
5754 | <match value="\nMessage-ID:" type="string" offset="0:8192"/> | |
5755 | <match value="\nFrom:" type="stringignorecase" offset="0:8192"/> | |
5756 | <match value="\nTo:" type="stringignorecase" offset="0:8192"/> | |
5757 | <match value="\nSubject:" type="string" offset="0:8192"/> | |
5758 | <match value="\nMIME-Version:" type="stringignorecase" offset="0:8192"/> | |
5759 | </match> | |
5760 | </magic> | |
5761 | <magic priority="40"> | |
5762 | <!-- lower priority than message/news --> | |
5763 | <match value="\nMessage-ID:" type="stringignorecase" offset="0:1000"/> | |
5634 | 5764 | </magic> |
5635 | 5765 | <glob pattern="*.eml"/> |
5636 | 5766 | <glob pattern="*.mime"/> |
212 | 212 | return getRecursiveMetadata(filePath, new ParseContext()); |
213 | 213 | } |
214 | 214 | |
215 | protected List<Metadata> getRecursiveMetadata(String filePath, Metadata metadata) throws Exception { | |
216 | return getRecursiveMetadata(filePath, new ParseContext(), metadata); | |
217 | } | |
218 | ||
219 | protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context, Metadata metadata) throws Exception { | |
220 | Parser p = new AutoDetectParser(); | |
221 | RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p, | |
222 | new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); | |
223 | try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) { | |
224 | wrapper.parse(is, new DefaultHandler(), metadata, context); | |
225 | } | |
226 | return wrapper.getMetadata(); | |
227 | } | |
228 | ||
215 | 229 | protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception { |
216 | 230 | Parser p = new AutoDetectParser(); |
217 | 231 | RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p, |
23 | 23 | <parent> |
24 | 24 | <groupId>org.apache.tika</groupId> |
25 | 25 | <artifactId>tika-parent</artifactId> |
26 | <version>1.17</version> | |
26 | <version>1.18</version> | |
27 | 27 | <relativePath>../tika-parent/pom.xml</relativePath> |
28 | 28 | </parent> |
29 | 29 | |
45 | 45 | <artifactId>tika-parsers</artifactId> |
46 | 46 | <version>${project.version}</version> |
47 | 47 | <scope>provided</scope> |
48 | <exclusions> | |
49 | <exclusion> | |
50 | <groupId>joda-time</groupId> | |
51 | <artifactId>joda-time</artifactId> | |
52 | </exclusion> | |
53 | </exclusions> | |
48 | 54 | </dependency> |
49 | 55 | <dependency> |
50 | 56 | <groupId>junit</groupId> |
63 | 69 | <groupId>org.json</groupId> |
64 | 70 | <artifactId>json</artifactId> |
65 | 71 | </exclusion> |
66 | </exclusions> | |
72 | <exclusion> | |
73 | <groupId>com.google.guava</groupId> | |
74 | <artifactId>guava</artifactId> | |
75 | </exclusion> | |
76 | <exclusion> | |
77 | <groupId>org.deeplearning4j</groupId> | |
78 | <artifactId>deeplearning4j-modelimport</artifactId> | |
79 | </exclusion> | |
80 | <exclusion> | |
81 | <groupId>org.apache.commons</groupId> | |
82 | <artifactId>commons-compress</artifactId> | |
83 | </exclusion> | |
84 | <exclusion> | |
85 | <groupId>org.apache.commons</groupId> | |
86 | <artifactId>commons-math3</artifactId> | |
87 | </exclusion> | |
88 | <exclusion> | |
89 | <groupId>commons-io</groupId> | |
90 | <artifactId>commons-io</artifactId> | |
91 | </exclusion> | |
92 | </exclusions> | |
93 | </dependency> | |
94 | <dependency> | |
95 | <groupId>org.apache.commons</groupId> | |
96 | <artifactId>commons-math3</artifactId> | |
97 | <version>3.4.1</version> | |
67 | 98 | </dependency> |
68 | 99 | <dependency> |
69 | 100 | <groupId>org.deeplearning4j</groupId> |
74 | 105 | <groupId>org.deeplearning4j</groupId> |
75 | 106 | <artifactId>deeplearning4j-keras</artifactId> |
76 | 107 | </exclusion> |
108 | <exclusion> | |
109 | <groupId>org.bytedeco</groupId> | |
110 | <artifactId>javacpp</artifactId> | |
111 | </exclusion> | |
112 | <exclusion> | |
113 | <groupId>joda-time</groupId> | |
114 | <artifactId>joda-time</artifactId> | |
115 | </exclusion> | |
77 | 116 | </exclusions> |
78 | 117 | </dependency> |
79 | 118 | <dependency> |
80 | 119 | <groupId>org.datavec</groupId> |
81 | 120 | <artifactId>datavec-data-image</artifactId> |
82 | 121 | <version>${dl4j.version}</version> |
122 | <exclusions> | |
123 | <exclusion> | |
124 | <groupId>com.google.guava</groupId> | |
125 | <artifactId>guava</artifactId> | |
126 | </exclusion> | |
127 | <exclusion> | |
128 | <groupId>org.bytedeco</groupId> | |
129 | <artifactId>javacpp</artifactId> | |
130 | </exclusion> | |
131 | <exclusion> | |
132 | <groupId>org.apache.commons</groupId> | |
133 | <artifactId>commons-math3</artifactId> | |
134 | </exclusion> | |
135 | <exclusion> | |
136 | <groupId>commons-io</groupId> | |
137 | <artifactId>commons-io</artifactId> | |
138 | </exclusion> | |
139 | <exclusion> | |
140 | <groupId>com.github.jai-imageio</groupId> | |
141 | <artifactId>jai-imageio-core</artifactId> | |
142 | </exclusion> | |
143 | </exclusions> | |
83 | 144 | </dependency> |
84 | 145 | <dependency> |
85 | 146 | <groupId>org.nd4j</groupId> |
86 | 147 | <artifactId>nd4j-native-platform</artifactId> |
87 | 148 | <version>${dl4j.version}</version> |
149 | <exclusions> | |
150 | <exclusion> | |
151 | <groupId>org.bytedeco</groupId> | |
152 | <artifactId>javacpp</artifactId> | |
153 | </exclusion> | |
154 | </exclusions> | |
155 | </dependency> | |
156 | <dependency> | |
157 | <groupId>org.bytedeco</groupId> | |
158 | <artifactId>javacpp</artifactId> | |
159 | <version>1.3.2</version> | |
88 | 160 | </dependency> |
89 | 161 | <dependency> |
90 | 162 | <groupId>org.apache.commons</groupId> |
24 | 24 | <parent> |
25 | 25 | <groupId>org.apache.tika</groupId> |
26 | 26 | <artifactId>tika-parent</artifactId> |
27 | <version>1.17</version> | |
27 | <version>1.18</version> | |
28 | 28 | <relativePath>../tika-parent/pom.xml</relativePath> |
29 | 29 | </parent> |
30 | 30 |
24 | 24 | <parent> |
25 | 25 | <groupId>org.apache.tika</groupId> |
26 | 26 | <artifactId>tika-parent</artifactId> |
27 | <version>1.17</version> | |
27 | <version>1.18</version> | |
28 | 28 | <relativePath>../tika-parent/pom.xml</relativePath> |
29 | 29 | </parent> |
30 | 30 | |
89 | 89 | <groupId>org.apache.jackrabbit</groupId> |
90 | 90 | <artifactId>jackrabbit-jcr-server</artifactId> |
91 | 91 | <version>2.3.6</version> |
92 | <exclusions> | |
93 | <exclusion> | |
94 | <groupId>org.apache.tika</groupId> | |
95 | <artifactId>tika-core</artifactId> | |
96 | </exclusion> | |
97 | <exclusion> | |
98 | <groupId>commons-codec</groupId> | |
99 | <artifactId>commons-codec</artifactId> | |
100 | </exclusion> | |
101 | <exclusion> | |
102 | <groupId>commons-io</groupId> | |
103 | <artifactId>commons-io</artifactId> | |
104 | </exclusion> | |
105 | </exclusions> | |
92 | 106 | </dependency> |
93 | 107 | <dependency> |
94 | 108 | <groupId>org.apache.jackrabbit</groupId> |
95 | 109 | <artifactId>jackrabbit-core</artifactId> |
96 | 110 | <version>2.3.6</version> |
111 | <exclusions> | |
112 | <exclusion> | |
113 | <groupId>org.apache.tika</groupId> | |
114 | <artifactId>tika-core</artifactId> | |
115 | </exclusion> | |
116 | <exclusion> | |
117 | <groupId>commons-io</groupId> | |
118 | <artifactId>commons-io</artifactId> | |
119 | </exclusion> | |
120 | <exclusion> | |
121 | <groupId>org.apache.lucene</groupId> | |
122 | <artifactId>lucene-core</artifactId> | |
123 | </exclusion> | |
124 | </exclusions> | |
97 | 125 | </dependency> |
98 | 126 | <dependency> |
99 | 127 | <groupId>org.apache.lucene</groupId> |
108 | 136 | <dependency> |
109 | 137 | <groupId>org.springframework</groupId> |
110 | 138 | <artifactId>spring-context</artifactId> |
111 | <version>3.0.2.RELEASE</version> | |
139 | <version>3.2.16.RELEASE</version> | |
112 | 140 | <exclusions> |
113 | 141 | <exclusion> |
114 | 142 | <groupId>commons-logging</groupId> |
24 | 24 | <parent> |
25 | 25 | <groupId>org.apache.tika</groupId> |
26 | 26 | <artifactId>tika-parent</artifactId> |
27 | <version>1.17</version> | |
27 | <version>1.18</version> | |
28 | 28 | <relativePath>../tika-parent/pom.xml</relativePath> |
29 | 29 | </parent> |
30 | 30 |
+8
-2
81 | 81 | |
82 | 82 | Iterator<FileTypeDetector> iterator = serviceLoader.iterator(); |
83 | 83 | assertTrue(iterator.hasNext()); |
84 | ||
84 | ||
85 | boolean foundTika = false; | |
85 | 86 | while(iterator.hasNext()) { |
86 | 87 | FileTypeDetector fileTypeDetector = iterator.next(); |
87 | 88 | assertNotNull(fileTypeDetector); |
88 | assertTrue(fileTypeDetector instanceof TikaFileTypeDetector); | |
89 | if (fileTypeDetector instanceof TikaFileTypeDetector) { | |
90 | foundTika = true; | |
91 | } | |
89 | 92 | } |
93 | //o.a.sis.internal.storage.StoreTypeDetector appears with latest upgrade | |
94 | //check that TikaFileTypeDetector appears at all | |
95 | assertTrue(foundTika); | |
90 | 96 | } |
91 | 97 | } |
24 | 24 | <parent> |
25 | 25 | <groupId>org.apache.tika</groupId> |
26 | 26 | <artifactId>tika-parent</artifactId> |
27 | <version>1.17</version> | |
27 | <version>1.18</version> | |
28 | 28 | <relativePath>../tika-parent/pom.xml</relativePath> |
29 | 29 | </parent> |
30 | 30 | |
42 | 42 | <dependency> |
43 | 43 | <groupId>com.optimaize.languagedetector</groupId> |
44 | 44 | <artifactId>language-detector</artifactId> |
45 | <version>0.5</version> | |
45 | <version>0.6</version> | |
46 | <exclusions> | |
47 | <exclusion> | |
48 | <groupId>com.google.guava</groupId> | |
49 | <artifactId>guava</artifactId> | |
50 | </exclusion> | |
51 | </exclusions> | |
52 | </dependency> | |
53 | <!-- exclude and then add back in to avoid | |
54 | conflicts with edu.ucar:cdm in tika-parsers --> | |
55 | <dependency> | |
56 | <groupId>com.google.guava</groupId> | |
57 | <artifactId>guava</artifactId> | |
58 | <version>17.0</version> | |
46 | 59 | </dependency> |
47 | 60 | <dependency> |
48 | 61 | <groupId>org.apache.cxf</groupId> |
52 | 65 | <dependency> |
53 | 66 | <groupId>com.google.code.gson</groupId> |
54 | 67 | <artifactId>gson</artifactId> |
55 | <version>2.6.1</version> | |
68 | <version>${gson.version}</version> | |
56 | 69 | </dependency> |
57 | 70 | |
58 | 71 | <!-- Test dependencies --> |
23 | 23 | <parent> |
24 | 24 | <groupId>org.apache.tika</groupId> |
25 | 25 | <artifactId>tika-parent</artifactId> |
26 | <version>1.17</version> | |
26 | <version>1.18</version> | |
27 | 27 | <relativePath>../tika-parent/pom.xml</relativePath> |
28 | 28 | </parent> |
29 | 29 | |
63 | 63 | <groupId>edu.usc.ir</groupId> |
64 | 64 | <artifactId>age-predictor-api</artifactId> |
65 | 65 | <version>1.0</version> |
66 | </dependency> | |
67 | ||
66 | <exclusions> | |
67 | <exclusion> | |
68 | <groupId>com.google.guava</groupId> | |
69 | <artifactId>guava</artifactId> | |
70 | </exclusion> | |
71 | <exclusion> | |
72 | <groupId>commons-lang</groupId> | |
73 | <artifactId>commons-lang</artifactId> | |
74 | </exclusion> | |
75 | <exclusion> | |
76 | <groupId>commons-compress</groupId> | |
77 | <artifactId>commons-compress</artifactId> | |
78 | </exclusion> | |
79 | <exclusion> | |
80 | <groupId>org.xerial.snappy</groupId> | |
81 | <artifactId>snappy-java</artifactId> | |
82 | </exclusion> | |
83 | <exclusion> | |
84 | <groupId>com.fasterxml.jackson.core</groupId> | |
85 | <artifactId>jackson-core</artifactId> | |
86 | </exclusion> | |
87 | <exclusion> | |
88 | <groupId>com.fasterxml.jackson.core</groupId> | |
89 | <artifactId>jackson-databind</artifactId> | |
90 | </exclusion> | |
91 | <exclusion> | |
92 | <groupId>com.fasterxml.jackson.core</groupId> | |
93 | <artifactId>jackson-annotations</artifactId> | |
94 | </exclusion> | |
95 | <exclusion> | |
96 | <groupId>org.codehaus.jackson</groupId> | |
97 | <artifactId>jackson-mapper-asl</artifactId> | |
98 | </exclusion> | |
99 | <exclusion> | |
100 | <groupId>log4j</groupId> | |
101 | <artifactId>log4j</artifactId> | |
102 | </exclusion> | |
103 | <exclusion> | |
104 | <groupId>commons-codec</groupId> | |
105 | <artifactId>commons-codec</artifactId> | |
106 | </exclusion> | |
107 | <exclusion> | |
108 | <groupId>commons-io</groupId> | |
109 | <artifactId>commons-io</artifactId> | |
110 | </exclusion> | |
111 | <exclusion> | |
112 | <groupId>com.thoughtworks.paranamer</groupId> | |
113 | <artifactId>paranamer</artifactId> | |
114 | </exclusion> | |
115 | <exclusion> | |
116 | <groupId>commons-net</groupId> | |
117 | <artifactId>commons-net</artifactId> | |
118 | </exclusion> | |
119 | <exclusion> | |
120 | <groupId>org.scala-lang</groupId> | |
121 | <artifactId>scala-library</artifactId> | |
122 | </exclusion> | |
123 | <exclusion> | |
124 | <groupId>org.scala-lang</groupId> | |
125 | <artifactId>scala-reflect</artifactId> | |
126 | </exclusion> | |
127 | <exclusion> | |
128 | <groupId>org.scalamacros</groupId> | |
129 | <artifactId>quasiquotes_2.10</artifactId> | |
130 | </exclusion> | |
131 | <exclusion> | |
132 | <groupId>org.codehaus.jackson</groupId> | |
133 | <artifactId>jackson-core-asl</artifactId> | |
134 | </exclusion> | |
135 | <exclusion> | |
136 | <groupId>org.apache.avro</groupId> | |
137 | <artifactId>avro</artifactId> | |
138 | </exclusion> | |
139 | </exclusions> | |
140 | </dependency> | |
141 | <dependency> | |
142 | <groupId>org.scalamacros</groupId> | |
143 | <artifactId>quasiquotes_2.10</artifactId> | |
144 | <version>2.0.0-M8</version> | |
145 | <exclusions> | |
146 | <exclusion> | |
147 | <groupId>org.scala-lang</groupId> | |
148 | <artifactId>scala-reflect</artifactId> | |
149 | </exclusion> | |
150 | <exclusion> | |
151 | <groupId>org.scala-lang</groupId> | |
152 | <artifactId>scala-library</artifactId> | |
153 | </exclusion> | |
154 | </exclusions> | |
155 | </dependency> | |
156 | <dependency> | |
157 | <groupId>org.scala-lang</groupId> | |
158 | <artifactId>scala-library</artifactId> | |
159 | <version>2.10.6</version> | |
160 | </dependency> | |
161 | <dependency> | |
162 | <groupId>org.scala-lang</groupId> | |
163 | <artifactId>scala-reflect</artifactId> | |
164 | <version>2.10.6</version> | |
165 | </dependency> | |
166 | <dependency> | |
167 | <groupId>commons-net</groupId> | |
168 | <artifactId>commons-net</artifactId> | |
169 | <version>3.1</version> | |
170 | </dependency> | |
171 | <dependency> | |
172 | <groupId>com.thoughtworks.paranamer</groupId> | |
173 | <artifactId>paranamer</artifactId> | |
174 | <version>2.6</version> | |
175 | </dependency> | |
176 | <dependency> | |
177 | <groupId>org.xerial.snappy</groupId> | |
178 | <artifactId>snappy-java</artifactId> | |
179 | <version>1.1.2.4</version> | |
180 | </dependency> | |
181 | <dependency> | |
182 | <groupId>org.codehaus.jackson</groupId> | |
183 | <artifactId>jackson-mapper-asl</artifactId> | |
184 | <version>1.9.13</version> | |
185 | </dependency> | |
186 | <dependency> | |
187 | <groupId>com.fasterxml.jackson.core</groupId> | |
188 | <artifactId>jackson-databind</artifactId> | |
189 | <version>${jackson.version}</version> | |
190 | <exclusions> | |
191 | <exclusion> | |
192 | <groupId>com.fasterxml.jackson.core</groupId> | |
193 | <artifactId>jackson-annotations</artifactId> | |
194 | </exclusion> | |
195 | </exclusions> | |
196 | </dependency> | |
197 | <dependency> | |
198 | <groupId>com.fasterxml.jackson.core</groupId> | |
199 | <artifactId>jackson-annotations</artifactId> | |
200 | <version>${jackson.version}</version> | |
201 | </dependency> | |
68 | 202 | <!-- Test dependencies --> |
69 | 203 | <dependency> |
70 | 204 | <groupId>junit</groupId> |
73 | 207 | <dependency> |
74 | 208 | <groupId>org.mockito</groupId> |
75 | 209 | <artifactId>mockito-core</artifactId> |
76 | <version>1.7</version> | |
210 | <version>2.15.0</version> | |
77 | 211 | <scope>test</scope> |
78 | 212 | </dependency> |
79 | 213 | <dependency> |
30 | 30 | |
31 | 31 | <groupId>org.apache.tika</groupId> |
32 | 32 | <artifactId>tika-parent</artifactId> |
33 | <version>1.17</version> | |
33 | <version>1.18</version> | |
34 | 34 | <packaging>pom</packaging> |
35 | 35 | |
36 | 36 | <name>Apache Tika parent</name> |
305 | 305 | <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> |
306 | 306 | <project.reporting.outputEncoding>${project.build.sourceEncoding}</project.reporting.outputEncoding> |
307 | 307 | <!-- NOTE: sync tukaani version with commons-compress in tika-parsers --> |
308 | <commons.compress.version>1.14</commons.compress.version> | |
309 | <commons.io.version>2.5</commons.io.version> | |
308 | <commons.compress.version>1.16.1</commons.compress.version> | |
309 | <commons.io.version>2.6</commons.io.version> | |
310 | <gson.version>2.8.1</gson.version> | |
310 | 311 | <cxf.version>3.0.16</cxf.version> |
311 | 312 | <slf4j.version>1.7.24</slf4j.version> |
313 | <jackson.version>2.9.5</jackson.version> | |
312 | 314 | </properties> |
313 | 315 | |
314 | 316 | <build> |
324 | 326 | <plugin> |
325 | 327 | <groupId>de.thetaphi</groupId> |
326 | 328 | <artifactId>forbiddenapis</artifactId> |
327 | <version>2.3</version> | |
329 | <!-- if this version contains commons-io 2.6, remove hard-coded commons-io version below --> | |
330 | <version>2.5</version> | |
328 | 331 | <configuration> |
329 | 332 | <targetVersion>${maven.compiler.target}</targetVersion> |
330 | 333 | <failOnUnresolvableSignatures>false</failOnUnresolvableSignatures> |
375 | 378 | <version>1.9.5</version> |
376 | 379 | </dependency> |
377 | 380 | </dependencies> |
381 | </plugin> | |
382 | <plugin> | |
383 | <groupId>org.apache.maven.plugins</groupId> | |
384 | <artifactId>maven-enforcer-plugin</artifactId> | |
385 | <version>3.0.0-M1</version> | |
386 | <executions> | |
387 | <execution> | |
388 | <id>enforce</id> | |
389 | <configuration> | |
390 | <rules> | |
391 | <dependencyConvergence /> | |
392 | </rules> | |
393 | </configuration> | |
394 | <goals> | |
395 | <goal>enforce</goal> | |
396 | </goals> | |
397 | </execution> | |
398 | </executions> | |
378 | 399 | </plugin> |
379 | 400 | </plugins> |
380 | 401 | </build> |
438 | 459 | <connection>scm:git:https://github.com/apache/</connection> |
439 | 460 | <developerConnection>scm:git:https://github.com/apache/</developerConnection> |
440 | 461 | <url>https://github.com/apache/tika</url> |
441 | <tag>1.17-rc2</tag> | |
462 | <tag>1.18-rc3</tag> | |
442 | 463 | </scm> |
443 | 464 | </project> |
24 | 24 | <parent> |
25 | 25 | <groupId>org.apache.tika</groupId> |
26 | 26 | <artifactId>tika-parent</artifactId> |
27 | <version>1.17</version> | |
27 | <version>1.18</version> | |
28 | 28 | <relativePath>../tika-parent/pom.xml</relativePath> |
29 | 29 | </parent> |
30 | 30 | |
38 | 38 | <!-- NOTE: sync codec version with POI --> |
39 | 39 | <codec.version>1.10</codec.version> |
40 | 40 | <!-- NOTE: sync tukaani version with commons-compress in tika-parent--> |
41 | <tukaani.version>1.6</tukaani.version> | |
41 | <tukaani.version>1.8</tukaani.version> | |
42 | <!-- NOTE: sync brotli version with commons-compress in tika-parent--> | |
43 | <brotli.version>0.1.2</brotli.version> | |
42 | 44 | <mime4j.version>0.8.1</mime4j.version> |
43 | 45 | <vorbis.version>0.8</vorbis.version> |
44 | <pdfbox.version>2.0.8</pdfbox.version> | |
46 | <pdfbox.version>2.0.9</pdfbox.version> | |
45 | 47 | <jempbox.version>1.8.13</jempbox.version> |
46 | 48 | <netcdf-java.version>4.5.5</netcdf-java.version> |
47 | <sis.version>0.6</sis.version> | |
49 | <sis.version>0.8</sis.version> | |
48 | 50 | <!-- used by POI, PDFBox and Jackcess ...try to sync --> |
49 | 51 | <bouncycastle.version>1.54</bouncycastle.version> |
50 | 52 | <commonsexec.version>1.3</commonsexec.version> |
80 | 82 | <groupId>org.gagravarr</groupId> |
81 | 83 | <artifactId>vorbis-java-tika</artifactId> |
82 | 84 | <version>${vorbis.version}</version> |
85 | <exclusions> | |
86 | <exclusion> | |
87 | <groupId>org.apache.tika</groupId> | |
88 | <artifactId>tika-core</artifactId> | |
89 | </exclusion> | |
90 | </exclusions> | |
83 | 91 | </dependency> |
84 | 92 | <dependency> |
85 | 93 | <groupId>com.healthmarketscience.jackcess</groupId> |
86 | 94 | <artifactId>jackcess</artifactId> |
87 | <version>2.1.8</version> | |
95 | <version>2.1.10</version> | |
88 | 96 | <exclusions> |
89 | 97 | <exclusion> |
90 | 98 | <groupId>commons-logging</groupId> |
95 | 103 | <dependency> |
96 | 104 | <groupId>com.healthmarketscience.jackcess</groupId> |
97 | 105 | <artifactId>jackcess-encrypt</artifactId> |
98 | <version>2.1.2</version> | |
106 | <version>2.1.4</version> | |
99 | 107 | <exclusions> |
100 | 108 | <exclusion> |
101 | 109 | <groupId>org.bouncycastle</groupId> |
102 | 110 | <artifactId>bcprov-jdk15on</artifactId> |
111 | </exclusion> | |
112 | <!-- to avoid maven-enforcer convergence error, | |
113 | let's make this explicit --> | |
114 | <exclusion> | |
115 | <groupId>com.healthmarketscience.jackcess</groupId> | |
116 | <artifactId>jackcess</artifactId> | |
103 | 117 | </exclusion> |
104 | 118 | </exclusions> |
105 | 119 | </dependency> |
136 | 150 | <groupId>org.tukaani</groupId> |
137 | 151 | <artifactId>xz</artifactId> |
138 | 152 | <version>${tukaani.version}</version> |
153 | </dependency> | |
154 | <dependency> | |
155 | <groupId>org.brotli</groupId> | |
156 | <artifactId>dec</artifactId> | |
157 | <version>${brotli.version}</version> | |
158 | </dependency> | |
159 | <dependency> | |
160 | <groupId>com.github.luben</groupId> | |
161 | <artifactId>zstd-jni</artifactId> | |
162 | <version>1.3.3-3</version> | |
163 | <scope>provided</scope> | |
139 | 164 | </dependency> |
140 | 165 | |
141 | 166 | <dependency> |
315 | 340 | <dependency> |
316 | 341 | <groupId>org.apache.opennlp</groupId> |
317 | 342 | <artifactId>opennlp-tools</artifactId> |
318 | <version>1.8.3</version> | |
343 | <version>1.8.4</version> | |
319 | 344 | </dependency> |
320 | 345 | |
321 | 346 | <dependency> |
336 | 361 | </exclusions> |
337 | 362 | </dependency> |
338 | 363 | |
339 | <dependency> | |
364 | <!-- <dependency> | |
340 | 365 | <groupId>com.tdunning</groupId> |
341 | 366 | <artifactId>json</artifactId> |
342 | 367 | <version>1.8</version> |
368 | </dependency> --> | |
369 | <dependency> | |
370 | <groupId>com.github.openjson</groupId> | |
371 | <artifactId>openjson</artifactId> | |
372 | <version>1.0.10</version> | |
343 | 373 | </dependency> |
344 | 374 | <dependency> |
345 | 375 | <groupId>com.google.code.gson</groupId> |
346 | 376 | <artifactId>gson</artifactId> |
347 | <version>2.8.1</version> | |
377 | <version>${gson.version}</version> | |
348 | 378 | </dependency> |
349 | 379 | |
350 | 380 | <!-- logging dependencies --> |
369 | 399 | <dependency> |
370 | 400 | <groupId>org.mockito</groupId> |
371 | 401 | <artifactId>mockito-core</artifactId> |
372 | <version>1.7</version> | |
402 | <version>2.15.0</version> | |
373 | 403 | <scope>test</scope> |
374 | 404 | </dependency> |
375 | 405 | <dependency> |
389 | 419 | <groupId>commons-logging</groupId> |
390 | 420 | <artifactId>commons-logging</artifactId> |
391 | 421 | </exclusion> |
422 | <exclusion> | |
423 | <groupId>org.jdom</groupId> | |
424 | <artifactId>jdom2</artifactId> | |
425 | </exclusion> | |
392 | 426 | </exclusions> |
393 | 427 | </dependency> |
394 | 428 | <dependency> |
400 | 434 | <groupId>edu.ucar</groupId> |
401 | 435 | <artifactId>jj2000</artifactId> |
402 | 436 | </exclusion> |
403 | </exclusions> | |
404 | </dependency> | |
405 | <dependency> | |
437 | <exclusion> | |
438 | <groupId>org.jsoup</groupId> | |
439 | <artifactId>jsoup</artifactId> | |
440 | </exclusion> | |
441 | <exclusion> | |
442 | <groupId>org.jdom</groupId> | |
443 | <artifactId>jdom2</artifactId> | |
444 | </exclusion> | |
445 | </exclusions> | |
446 | </dependency> | |
447 | <!-- grib's current jsoup is vulnerable to xss | |
448 | exclude and import a more modern version TIKA-2561--> | |
449 | <dependency> | |
450 | <groupId>org.jsoup</groupId> | |
451 | <artifactId>jsoup</artifactId> | |
452 | <version>1.11.2</version> | |
453 | </dependency> <dependency> | |
406 | 454 | <groupId>edu.ucar</groupId> |
407 | 455 | <artifactId>cdm</artifactId> |
408 | 456 | <version>${netcdf-java.version}</version> |
415 | 463 | <groupId>org.slf4j</groupId> |
416 | 464 | <artifactId>jcl-over-slf4j</artifactId> |
417 | 465 | </exclusion> |
466 | <exclusion> | |
467 | <groupId>org.apache.httpcomponents</groupId> | |
468 | <artifactId>httpcore</artifactId> | |
469 | </exclusion> | |
470 | <exclusion> | |
471 | <groupId>org.jdom</groupId> | |
472 | <artifactId>jdom2</artifactId> | |
473 | </exclusion> | |
418 | 474 | </exclusions> |
419 | 475 | </dependency> |
420 | 476 | <dependency> |
433 | 489 | </exclusion> |
434 | 490 | <exclusion> |
435 | 491 | <groupId>org.apache.httpcomponents</groupId> |
492 | <artifactId>httpcore</artifactId> | |
493 | </exclusion> <exclusion> | |
494 | <groupId>org.apache.httpcomponents</groupId> | |
436 | 495 | <artifactId>httpmime</artifactId> |
437 | 496 | </exclusion> |
438 | 497 | </exclusions> |
480 | 539 | <dependency> |
481 | 540 | <groupId>org.opengis</groupId> |
482 | 541 | <artifactId>geoapi</artifactId> |
483 | <version>3.0.0</version> | |
542 | <version>3.0.1</version> | |
484 | 543 | </dependency> |
485 | 544 | |
486 | 545 | <dependency> |
536 | 595 | <dependency> |
537 | 596 | <groupId>org.apache.ctakes</groupId> |
538 | 597 | <artifactId>ctakes-core</artifactId> |
539 | <version>3.2.2</version> | |
598 | <version>4.0.0</version> | |
540 | 599 | <scope>provided</scope> |
541 | 600 | <exclusions> |
542 | 601 | <exclusion> |
563 | 622 | <groupId>org.springframework</groupId> |
564 | 623 | <artifactId>spring-core</artifactId> |
565 | 624 | </exclusion> |
566 | </exclusions> | |
567 | </dependency> | |
568 | ||
625 | <exclusion> | |
626 | <groupId>org.apache.opennlp</groupId> | |
627 | <artifactId>opennlp-tools</artifactId> | |
628 | </exclusion> | |
629 | <exclusion> | |
630 | <groupId>com.google.guava</groupId> | |
631 | <artifactId>guava</artifactId> | |
632 | </exclusion> | |
633 | <exclusion> | |
634 | <groupId>commons-io</groupId> | |
635 | <artifactId>commons-io</artifactId> | |
636 | </exclusion> | |
637 | <exclusion> | |
638 | <groupId>org.apache.uima</groupId> | |
639 | <artifactId>uimafit-core</artifactId> | |
640 | </exclusion> | |
641 | <exclusion> | |
642 | <groupId>org.apache.uima</groupId> | |
643 | <artifactId>uimaj-core</artifactId> | |
644 | </exclusion> | |
645 | <exclusion> | |
646 | <groupId>org.jdom</groupId> | |
647 | <artifactId>jdom2</artifactId> | |
648 | </exclusion> | |
649 | </exclusions> | |
650 | </dependency> | |
651 | <!-- need to specify this to avoid | |
652 | version clash within ctakes-core 4.0.0 --> | |
653 | <dependency> | |
654 | <groupId>org.apache.uima</groupId> | |
655 | <artifactId>uimafit-core</artifactId> | |
656 | <version>2.2.0</version> | |
657 | <exclusions> | |
658 | <exclusion> | |
659 | <groupId>org.apache.uima</groupId> | |
660 | <artifactId>uimaj-core</artifactId> | |
661 | </exclusion> | |
662 | <exclusion> | |
663 | <groupId>commons-io</groupId> | |
664 | <artifactId>commons-io</artifactId> | |
665 | </exclusion> | |
666 | </exclusions> | |
667 | </dependency> | |
668 | <!-- need to specify this to avoid | |
669 | version clash within ctakes-core 4.0.0 --> | |
670 | <dependency> | |
671 | <groupId>org.apache.uima</groupId> | |
672 | <artifactId>uimaj-core</artifactId> | |
673 | <version>2.9.0</version> | |
674 | </dependency> | |
675 | ||
676 | <dependency> | |
677 | <groupId>org.jdom</groupId> | |
678 | <artifactId>jdom2</artifactId> | |
679 | <version>2.0.6</version> | |
680 | </dependency> | |
569 | 681 | <!--Jackson parse String to JSON--> |
570 | 682 | <dependency> |
571 | 683 | <groupId>com.fasterxml.jackson.core</groupId> |
572 | 684 | <artifactId>jackson-core</artifactId> |
573 | <version>2.9.2</version> | |
574 | </dependency> | |
575 | ||
576 | <!-- Java ImageIO plugin for JBIG2 support (often used in PDF) | |
577 | This jbig2 dep is not distributed with Tika due to licensing | |
578 | issue (GPLV3). That's why it is included here as "test". | |
579 | https://github.com/levigo/jbig2-imageio | |
580 | --> | |
581 | <dependency> | |
582 | <groupId>com.levigo.jbig2</groupId> | |
583 | <artifactId>levigo-jbig2-imageio</artifactId> | |
584 | <version>1.6.5</version> | |
585 | <scope>test</scope> | |
586 | </dependency> | |
587 | <!-- Copied from PDFBox: | |
588 | For legal reasons (incompatible license), jai-imageio-core is to be used | |
589 | only in the tests and may not be distributed. See also LEGAL-195--> | |
685 | <version>${jackson.version}</version> | |
686 | </dependency> | |
687 | <!-- as of 2.9.5, jackson-databind is pulling in jackson-annotations 2.9.0 | |
688 | For now, we need to specify databind here with exclusion statement | |
689 | --> | |
690 | <dependency> | |
691 | <groupId>com.fasterxml.jackson.core</groupId> | |
692 | <artifactId>jackson-databind</artifactId> | |
693 | <version>${jackson.version}</version> | |
694 | <exclusions> | |
695 | <exclusion> | |
696 | <groupId>com.fasterxml.jackson.core</groupId> | |
697 | <artifactId>jackson-annotations</artifactId> | |
698 | </exclusion> | |
699 | </exclusions> | |
700 | </dependency> | |
701 | <dependency> | |
702 | <groupId>com.fasterxml.jackson.core</groupId> | |
703 | <artifactId>jackson-annotations</artifactId> | |
704 | <version>${jackson.version}</version> | |
705 | </dependency> | |
706 | ||
707 | ||
708 | <dependency> | |
709 | <groupId>org.apache.pdfbox</groupId> | |
710 | <artifactId>jbig2-imageio</artifactId> | |
711 | <version>3.0.0</version> | |
712 | </dependency> | |
713 | ||
714 | <!-- jai-imageio-core is allowed since LEGAL-304 --> | |
590 | 715 | <dependency> |
591 | 716 | <groupId>com.github.jai-imageio</groupId> |
592 | 717 | <artifactId>jai-imageio-core</artifactId> |
593 | 718 | <version>1.3.1</version> |
594 | <scope>test</scope> | |
595 | </dependency> | |
719 | </dependency> | |
720 | <!-- For legal reasons (incompatible license), jai-imageio-jpeg2000 is to be used | |
721 | only in the tests and may not be distributed. See also LEGAL-195 --> | |
596 | 722 | <dependency> |
597 | 723 | <groupId>com.github.jai-imageio</groupId> |
598 | 724 | <artifactId>jai-imageio-jpeg2000</artifactId> |
599 | 725 | <version>1.3.0</version> |
600 | 726 | <scope>test</scope> |
727 | <exclusions> | |
728 | <exclusion> | |
729 | <groupId>com.github.jai-imageio</groupId> | |
730 | <artifactId>jai-imageio-core</artifactId> | |
731 | </exclusion> | |
732 | </exclusions> | |
601 | 733 | </dependency> |
602 | 734 | |
603 | 735 | </dependencies> |
+10
-2
19 | 19 | |
20 | 20 | import java.math.BigInteger; |
21 | 21 | import java.util.ArrayList; |
22 | import java.util.HashSet; | |
22 | 23 | import java.util.List; |
24 | import java.util.Set; | |
23 | 25 | |
24 | 26 | import org.apache.tika.exception.TikaException; |
25 | 27 | import org.apache.tika.parser.chm.core.ChmCommons; |
136 | 138 | |
137 | 139 | /* loops over all pmgls */ |
138 | 140 | byte[] dir_chunk = null; |
141 | Set<Integer> processed = new HashSet<>(); | |
139 | 142 | for (int i = startPmgl; i>=0; ) { |
140 | 143 | dir_chunk = new byte[(int) chmItspHeader.getBlock_len()]; |
141 | 144 | int start = i * (int) chmItspHeader.getBlock_len() + dir_offset; |
146 | 149 | PMGLheader = new ChmPmglHeader(); |
147 | 150 | PMGLheader.parse(dir_chunk, PMGLheader); |
148 | 151 | enumerateOneSegment(dir_chunk); |
149 | ||
150 | i=PMGLheader.getBlockNext(); | |
152 | int nextBlock = PMGLheader.getBlockNext(); | |
153 | processed.add(i); | |
154 | if (processed.contains(nextBlock)) { | |
155 | throw new ChmParsingException("already processed block; avoiding cycle"); | |
156 | } | |
157 | i=nextBlock; | |
151 | 158 | dir_chunk = null; |
152 | 159 | } |
160 | ||
153 | 161 | } catch (ChmParsingException e) { |
154 | 162 | LOG.warn("Chm parse exception", e); |
155 | 163 | } finally { |
15 | 15 | */ |
16 | 16 | package org.apache.tika.parser.html; |
17 | 17 | |
18 | import java.io.BufferedReader; | |
18 | 19 | import java.io.IOException; |
19 | 20 | import java.io.InputStream; |
21 | import java.io.InputStreamReader; | |
20 | 22 | import java.nio.ByteBuffer; |
21 | 23 | import java.nio.charset.Charset; |
24 | import java.nio.charset.StandardCharsets; | |
25 | import java.util.Collections; | |
26 | import java.util.HashSet; | |
27 | import java.util.Locale; | |
28 | import java.util.Set; | |
22 | 29 | import java.util.regex.Matcher; |
23 | 30 | import java.util.regex.Pattern; |
24 | 31 | |
38 | 45 | */ |
39 | 46 | public class HtmlEncodingDetector implements EncodingDetector { |
40 | 47 | |
48 | /** | |
49 | * HTML can include non-iana supported charsets that Java | |
50 | * recognizes, e.g. "unicode". This can lead to incorrect detection/mojibake. | |
51 | * Ignore charsets in html meta-headers that are not supported by IANA. | |
52 | * See: TIKA-2592 | |
53 | */ | |
54 | private static Set<String> CHARSETS_UNSUPPORTED_BY_IANA; | |
55 | static { | |
56 | Set<String> unsupported = new HashSet<>(); | |
57 | try (BufferedReader reader = | |
58 | new BufferedReader( | |
59 | new InputStreamReader( | |
60 | HtmlEncodingDetector.class | |
61 | .getResourceAsStream("StandardCharsets_unsupported_by_IANA.txt"), | |
62 | StandardCharsets.UTF_8))) { | |
63 | String line = reader.readLine(); | |
64 | while (line != null) { | |
65 | if (line.startsWith("#")) { | |
66 | line = reader.readLine(); | |
67 | continue; | |
68 | } | |
69 | line = line.trim(); | |
70 | if (line.length() > 0) { | |
71 | unsupported.add(line.toLowerCase(Locale.US)); | |
72 | } | |
73 | line = reader.readLine(); | |
74 | } | |
75 | } catch (IOException e) { | |
76 | throw new IllegalArgumentException("couldn't find StandardCharsets_unsupported_by_IANA.txt on the class path"); | |
77 | } | |
78 | CHARSETS_UNSUPPORTED_BY_IANA = Collections.unmodifiableSet(unsupported); | |
79 | } | |
41 | 80 | // TIKA-357 - use bigger buffer for meta tag sniffing (was 4K) |
42 | 81 | private static final int DEFAULT_MARK_LIMIT = 8192; |
43 | 82 | |
111 | 150 | //that is valid |
112 | 151 | while (charsetMatcher.find()) { |
113 | 152 | String candCharset = charsetMatcher.group(1); |
153 | if (CHARSETS_UNSUPPORTED_BY_IANA.contains(candCharset.toLowerCase(Locale.US))) { | |
154 | continue; | |
155 | } | |
114 | 156 | if (CharsetUtils.isSupported(candCharset)) { |
115 | 157 | try { |
116 | 158 | return CharsetUtils.forName(candCharset); |
23 | 23 | import java.nio.charset.StandardCharsets; |
24 | 24 | import java.util.Arrays; |
25 | 25 | import java.util.HashSet; |
26 | import java.util.List; | |
26 | 27 | import java.util.Locale; |
27 | 28 | import java.util.Set; |
28 | 29 | import java.util.regex.Matcher; |
35 | 36 | import org.apache.tika.metadata.TikaCoreProperties; |
36 | 37 | import org.apache.tika.mime.MediaType; |
37 | 38 | import org.apache.tika.parser.ParseContext; |
39 | import org.apache.tika.parser.utils.DataURIScheme; | |
40 | import org.apache.tika.parser.utils.DataURISchemeParseException; | |
41 | import org.apache.tika.parser.utils.DataURISchemeUtil; | |
38 | 42 | import org.apache.tika.sax.TextContentHandler; |
39 | 43 | import org.apache.tika.sax.XHTMLContentHandler; |
40 | 44 | import org.xml.sax.Attributes; |
56 | 60 | private final ParseContext context; |
57 | 61 | private final boolean extractScripts; |
58 | 62 | private final StringBuilder title = new StringBuilder(); |
63 | private final DataURISchemeUtil dataURISchemeUtil = new DataURISchemeUtil(); | |
59 | 64 | private int bodyLevel = 0; |
60 | 65 | private int discardLevel = 0; |
61 | 66 | private int titleLevel = 0; |
168 | 173 | } |
169 | 174 | |
170 | 175 | title.setLength(0); |
176 | String value = atts.getValue("src"); | |
177 | if (value != null && value.startsWith("data:")) { | |
178 | handleDataURIScheme(value); | |
179 | } | |
171 | 180 | } |
172 | 181 | |
173 | 182 | /** |
230 | 239 | // And resolve relative links. Eventually this should be pushed |
231 | 240 | // into the HtmlMapper code. |
232 | 241 | if (URI_ATTRIBUTES.contains(normAttrName)) { |
242 | //if this is a src="data: " element, | |
243 | //we've handled that as an embedded file, don't include the full thing | |
244 | //here | |
245 | if (normAttrName.equals("src")) { | |
246 | String v = newAttributes.getValue(att); | |
247 | if (v.startsWith("data:")) { | |
248 | newAttributes.setValue(att, "data:"); | |
249 | } | |
250 | } | |
233 | 251 | newAttributes.setValue(att, resolve(newAttributes.getValue(att))); |
234 | 252 | } else if (isObject && "codebase".equals(normAttrName)) { |
235 | 253 | newAttributes.setValue(att, codebase); |
295 | 313 | } |
296 | 314 | } |
297 | 315 | |
316 | private void handleDataURIScheme(String string) throws SAXException { | |
317 | DataURIScheme dataURIScheme = null; | |
318 | try { | |
319 | dataURIScheme = dataURISchemeUtil.parse(string); | |
320 | } catch (DataURISchemeParseException e) { | |
321 | //swallow | |
322 | return; | |
323 | } | |
324 | ||
325 | //do anything with attrs? | |
326 | Metadata m = new Metadata(); | |
327 | m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, | |
328 | TikaCoreProperties.EmbeddedResourceType.INLINE.toString()); | |
329 | if (dataURIScheme.getMediaType() != null) { | |
330 | m.set(Metadata.CONTENT_TYPE, dataURIScheme.getMediaType().toString()); | |
331 | } | |
332 | EmbeddedDocumentExtractor embeddedDocumentExtractor = | |
333 | EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); | |
334 | if (embeddedDocumentExtractor.shouldParseEmbedded(m)) { | |
335 | try (InputStream stream = dataURIScheme.getInputStream()) { | |
336 | embeddedDocumentExtractor.parseEmbedded( | |
337 | stream, xhtml, m, false | |
338 | ); | |
339 | } catch (IOException e) { | |
340 | EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata); | |
341 | } | |
342 | } | |
343 | } | |
344 | ||
298 | 345 | private void writeScript() throws SAXException { |
299 | 346 | //don't write an attached macro if there is no content |
300 | 347 | //we may want to revisit this behavior |
312 | 359 | |
313 | 360 | EmbeddedDocumentExtractor embeddedDocumentExtractor = |
314 | 361 | EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); |
362 | //try to scrape dataURISchemes from javascript | |
363 | List<DataURIScheme> dataURISchemes = dataURISchemeUtil.extract(script.toString()); | |
364 | for (DataURIScheme dataURIScheme : dataURISchemes) { | |
365 | Metadata dataUriMetadata = new Metadata(); | |
366 | dataUriMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, | |
367 | TikaCoreProperties.EmbeddedResourceType.INLINE.toString()); | |
368 | dataUriMetadata.set(Metadata.CONTENT_TYPE, | |
369 | dataURIScheme.getMediaType().toString()); | |
370 | if (embeddedDocumentExtractor.shouldParseEmbedded(dataUriMetadata)) { | |
371 | try (InputStream dataURISchemeInputStream = dataURIScheme.getInputStream()) { | |
372 | embeddedDocumentExtractor.parseEmbedded(dataURISchemeInputStream, | |
373 | xhtml, dataUriMetadata, false); | |
374 | } catch (IOException e) { | |
375 | //swallow | |
376 | } | |
377 | } | |
378 | } | |
379 | ||
315 | 380 | try (InputStream stream = new ByteArrayInputStream( |
316 | 381 | script.toString().getBytes(StandardCharsets.UTF_8))) { |
317 | 382 | embeddedDocumentExtractor.parseEmbedded( |
65 | 65 | MediaType.image("png"), |
66 | 66 | MediaType.image("vnd.wap.wbmp"), |
67 | 67 | MediaType.image("x-icon"), |
68 | MediaType.image("x-xcf"))); | |
69 | try { | |
70 | Class.forName("com.levigo.jbig2.JBIG2ImageReader"); | |
71 | TMP_SUPPORTED.add(MediaType.image("x-jbig2")); | |
72 | } catch (ClassNotFoundException e) { | |
73 | } | |
68 | MediaType.image("x-xcf"), | |
69 | MediaType.image("x-jbig2"))); | |
70 | //add try/catch class.forName() for image types relying on | |
71 | //provided dependencies | |
74 | 72 | } |
75 | 73 | |
76 | 74 | private static final Set<MediaType> SUPPORTED_TYPES = |
32 | 32 | import org.apache.james.mime4j.parser.ContentHandler; |
33 | 33 | import org.apache.james.mime4j.stream.BodyDescriptor; |
34 | 34 | import org.apache.james.mime4j.stream.Field; |
35 | import org.apache.tika.detect.Detector; | |
35 | 36 | import org.apache.tika.exception.TikaException; |
36 | 37 | import org.apache.tika.extractor.EmbeddedDocumentExtractor; |
37 | 38 | import org.apache.tika.extractor.EmbeddedDocumentUtil; |
146 | 147 | private boolean strictParsing = false; |
147 | 148 | private final boolean extractAllAlternatives; |
148 | 149 | private final EmbeddedDocumentExtractor extractor; |
149 | ||
150 | private final Detector detector; | |
150 | 151 | //this is used to buffer a multipart body that |
151 | 152 | //keeps track of multipart/alternative and its children |
152 | 153 | private Stack<Part> alternativePartBuffer = new Stack<>(); |
153 | 154 | |
154 | 155 | private Stack<BodyDescriptor> parts = new Stack<>(); |
155 | 156 | |
156 | MailContentHandler(XHTMLContentHandler xhtml, Metadata metadata, | |
157 | MailContentHandler(XHTMLContentHandler xhtml, Detector detector, Metadata metadata, | |
157 | 158 | ParseContext context, boolean strictParsing, boolean extractAllAlternatives) { |
158 | 159 | this.handler = xhtml; |
159 | 160 | this.metadata = metadata; |
166 | 167 | |
167 | 168 | // Was an EmbeddedDocumentExtractor explicitly supplied? |
168 | 169 | this.extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); |
170 | this.detector = detector; | |
169 | 171 | } |
170 | 172 | |
171 | 173 | @Override |
183 | 185 | if (parts.size() > 0) { |
184 | 186 | submd.set(Message.MULTIPART_SUBTYPE, parts.peek().getSubType()); |
185 | 187 | submd.set(Message.MULTIPART_BOUNDARY, parts.peek().getBoundary()); |
186 | } | |
188 | } | |
187 | 189 | if (body instanceof MaximalBodyDescriptor) { |
188 | 190 | MaximalBodyDescriptor maximalBody = (MaximalBodyDescriptor) body; |
189 | 191 | String contentDispositionType = maximalBody.getContentDispositionType(); |
190 | 192 | if (contentDispositionType != null && !contentDispositionType.isEmpty()) { |
191 | StringBuilder contentDisposition = new StringBuilder( contentDispositionType ); | |
193 | StringBuilder contentDisposition = new StringBuilder(contentDispositionType); | |
192 | 194 | Map<String, String> contentDispositionParameters = maximalBody.getContentDispositionParameters(); |
193 | for ( Entry<String, String> param : contentDispositionParameters.entrySet() ) { | |
195 | for (Entry<String, String> param : contentDispositionParameters.entrySet()) { | |
194 | 196 | contentDisposition.append("; ") |
195 | .append(param.getKey()).append("=\"").append(param.getValue()).append('"'); | |
197 | .append(param.getKey()).append("=\"").append(param.getValue()).append('"'); | |
196 | 198 | } |
197 | 199 | |
198 | 200 | String contentDispositionFileName = maximalBody.getContentDispositionFilename(); |
200 | 202 | submd.set( Metadata.RESOURCE_NAME_KEY, contentDispositionFileName ); |
201 | 203 | } |
202 | 204 | |
203 | submd.set( Metadata.CONTENT_DISPOSITION, contentDisposition.toString() ); | |
205 | submd.set(Metadata.CONTENT_DISPOSITION, contentDisposition.toString()); | |
204 | 206 | } |
205 | 207 | } |
206 | 208 | //if we're in a multipart/alternative or any one of its children |
207 | 209 | //add the bodypart to the latest that was added |
208 | if (! extractAllAlternatives && alternativePartBuffer.size() > 0) { | |
210 | if (!extractAllAlternatives && alternativePartBuffer.size() > 0) { | |
209 | 211 | ByteArrayOutputStream bos = new ByteArrayOutputStream(); |
210 | 212 | IOUtils.copy(is, bos); |
211 | 213 | alternativePartBuffer.peek().children.add(new BodyContents(submd, bos.toByteArray())); |
214 | } else if (!extractAllAlternatives && parts.size() < 2) { | |
215 | //if you're at the first level of embedding | |
216 | //and you're not in an alternative part block | |
217 | //and you're text/html, put that in the body of the email | |
218 | //otherwise treat as a regular attachment | |
219 | ByteArrayOutputStream bos = new ByteArrayOutputStream(); | |
220 | IOUtils.copy(is, bos); | |
221 | byte[] bytes = bos.toByteArray(); | |
222 | if (detectTextOrHtml(submd, bytes)) { | |
223 | handleInlineBodyPart(new BodyContents(submd, bos.toByteArray())); | |
224 | } else { | |
225 | //else handle as you would any other embedded content | |
226 | try (TikaInputStream tis = TikaInputStream.get(bytes)) { | |
227 | handleEmbedded(tis, submd); | |
228 | } | |
229 | } | |
212 | 230 | } else { |
213 | 231 | //else handle as you would any other embedded content |
214 | 232 | try (TikaInputStream tis = TikaInputStream.get(is)) { |
215 | 233 | handleEmbedded(tis, submd); |
216 | 234 | } |
217 | 235 | } |
236 | } | |
237 | ||
238 | private boolean detectTextOrHtml(Metadata submd, byte[] bytes) { | |
239 | String mediaTypeString = submd.get(Metadata.CONTENT_TYPE); | |
240 | if (mediaTypeString != null) { | |
241 | if (mediaTypeString.startsWith("text")) { | |
242 | return true; | |
243 | } else { | |
244 | return false; | |
245 | } | |
246 | } | |
247 | try (TikaInputStream tis = TikaInputStream.get(bytes)) { | |
248 | MediaType mediaType = detector.detect(tis, submd); | |
249 | if (mediaType != null) { | |
250 | //detect only once | |
251 | submd.set(TikaCoreProperties.CONTENT_TYPE_OVERRIDE, mediaType.toString()); | |
252 | if (mediaType.toString().startsWith("text")) { | |
253 | return true; | |
254 | } | |
255 | } | |
256 | } catch (IOException e) { | |
257 | ||
258 | } | |
259 | return false; | |
218 | 260 | } |
219 | 261 | |
220 | 262 | private void handleEmbedded(TikaInputStream tis, Metadata metadata) throws MimeException, IOException { |
515 | 557 | } |
516 | 558 | |
517 | 559 | if (part instanceof BodyContents) { |
518 | handlePart((BodyContents)part); | |
560 | handleInlineBodyPart((BodyContents)part); | |
519 | 561 | return; |
520 | 562 | } |
521 | 563 | |
538 | 580 | } |
539 | 581 | } |
540 | 582 | |
541 | private void handlePart(BodyContents part) throws MimeException, IOException { | |
583 | private void handleInlineBodyPart(BodyContents part) throws MimeException, IOException { | |
542 | 584 | String contentType = part.metadata.get(Metadata.CONTENT_TYPE); |
543 | 585 | Parser parser = null; |
544 | 586 | if (MediaType.TEXT_HTML.toString().equalsIgnoreCase(contentType)) { |
554 | 596 | |
555 | 597 | |
556 | 598 | if (parser == null) { |
599 | //back off and treat it as an embedded chunk | |
557 | 600 | try (TikaInputStream tis = TikaInputStream.get(part.bytes)) { |
558 | 601 | handleEmbedded(tis, part.metadata); |
559 | 602 | } |
25 | 25 | import org.apache.james.mime4j.parser.MimeStreamParser; |
26 | 26 | import org.apache.james.mime4j.stream.MimeConfig; |
27 | 27 | import org.apache.tika.config.Field; |
28 | import org.apache.tika.detect.Detector; | |
28 | 29 | import org.apache.tika.exception.TikaException; |
30 | import org.apache.tika.extractor.EmbeddedDocumentUtil; | |
29 | 31 | import org.apache.tika.io.TikaInputStream; |
30 | 32 | import org.apache.tika.metadata.Metadata; |
31 | 33 | import org.apache.tika.mime.MediaType; |
53 | 55 | private static final Set<MediaType> SUPPORTED_TYPES = Collections |
54 | 56 | .singleton(MediaType.parse("message/rfc822")); |
55 | 57 | |
58 | //rely on the detector to be thread-safe | |
59 | //built lazily and then reused | |
60 | private Detector detector; | |
61 | ||
56 | 62 | @Field |
57 | 63 | private boolean extractAllAlternatives = false; |
58 | 64 | |
70 | 76 | .build(); |
71 | 77 | |
72 | 78 | config = context.get(MimeConfig.class, config); |
73 | ||
79 | Detector localDetector = context.get(Detector.class); | |
80 | if (localDetector == null) { | |
81 | //lazily load this if necessary | |
82 | if (detector == null) { | |
83 | EmbeddedDocumentUtil embeddedDocumentUtil = new EmbeddedDocumentUtil(context); | |
84 | detector = embeddedDocumentUtil.getDetector(); | |
85 | } | |
86 | localDetector = detector; | |
87 | } | |
74 | 88 | MimeStreamParser parser = new MimeStreamParser(config, null, new DefaultBodyDescriptorBuilder()); |
75 | 89 | XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); |
76 | 90 | |
77 | 91 | MailContentHandler mch = new MailContentHandler( |
78 | xhtml, metadata, context, config.isStrictParsing(), | |
92 | xhtml, localDetector, metadata, context, config.isStrictParsing(), | |
79 | 93 | extractAllAlternatives); |
80 | 94 | parser.setContentHandler(mch); |
81 | 95 | parser.setContentDecoding(true); |
283 | 283 | |
284 | 284 | // Set up listener and register the records we want to process |
285 | 285 | HSSFRequest hssfRequest = new HSSFRequest(); |
286 | listenForAllRecords = true; | |
287 | 286 | if (listenForAllRecords) { |
288 | 287 | hssfRequest.addListenerForAllRecords(formatListener); |
289 | 288 | } else { |
541 | 540 | CellValueRecordInterface value = |
542 | 541 | (CellValueRecordInterface) record; |
543 | 542 | Point point = new Point(value.getColumn(), value.getRow()); |
544 | currentSheet.put(point, cell); | |
543 | if (currentSheet.containsKey(point)) { | |
544 | //avoid overwriting content | |
545 | //for now, add to extraTextCells | |
546 | //TODO: consider allowing multiple text pieces | |
547 | //per x,y to keep the text together | |
548 | extraTextCells.add(cell); | |
549 | } else { | |
550 | currentSheet.put(point, cell); | |
551 | } | |
552 | ||
545 | 553 | } else { |
546 | 554 | // Cell outside the worksheets |
547 | 555 | extraTextCells.add(cell); |
650 | 658 | } |
651 | 659 | |
652 | 660 | @Override |
661 | public void processRecord(Record record) { | |
662 | // System.out.println(record.getClass() + " : "+record.toString()); | |
663 | super.processRecord(record); | |
664 | } | |
665 | ||
666 | @Override | |
653 | 667 | public String formatNumberDateCell(CellValueRecordInterface cell) { |
654 | 668 | String formatString = this.getFormatString(cell); |
655 | 669 | if (formatString != null && ! formatString.equals("General")) { |
17 | 17 | |
18 | 18 | import java.io.IOException; |
19 | 19 | import java.io.InputStream; |
20 | import java.util.ArrayList; | |
20 | 21 | import java.util.HashSet; |
21 | 22 | import java.util.List; |
22 | 23 | |
29 | 30 | import org.apache.poi.hslf.record.RecordTypes; |
30 | 31 | import org.apache.poi.hslf.record.VBAInfoAtom; |
31 | 32 | import org.apache.poi.hslf.record.VBAInfoContainer; |
33 | import org.apache.poi.hslf.usermodel.HSLFGroupShape; | |
32 | 34 | import org.apache.poi.hslf.usermodel.HSLFMasterSheet; |
33 | 35 | import org.apache.poi.hslf.usermodel.HSLFNotes; |
34 | 36 | import org.apache.poi.hslf.usermodel.HSLFObjectData; |
38 | 40 | import org.apache.poi.hslf.usermodel.HSLFSlideShow; |
39 | 41 | import org.apache.poi.hslf.usermodel.HSLFTable; |
40 | 42 | import org.apache.poi.hslf.usermodel.HSLFTableCell; |
43 | import org.apache.poi.hslf.usermodel.HSLFTextBox; | |
41 | 44 | import org.apache.poi.hslf.usermodel.HSLFTextParagraph; |
42 | 45 | import org.apache.poi.hslf.usermodel.HSLFTextRun; |
43 | 46 | import org.apache.poi.hslf.usermodel.HSLFTextShape; |
47 | 50 | import org.apache.tika.exception.EncryptedDocumentException; |
48 | 51 | import org.apache.tika.extractor.EmbeddedDocumentUtil; |
49 | 52 | import org.apache.tika.io.CloseShieldInputStream; |
53 | import org.apache.tika.io.IOExceptionWithCause; | |
50 | 54 | import org.apache.tika.io.TikaInputStream; |
51 | 55 | import org.apache.tika.metadata.Metadata; |
52 | 56 | import org.apache.tika.mime.MediaType; |
116 | 120 | } |
117 | 121 | } |
118 | 122 | |
123 | extractGroupText(xhtml, slide.getShapes(), 0); | |
124 | ||
119 | 125 | // Slide footer, if present |
120 | 126 | if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) { |
121 | 127 | xhtml.startElement("p", "class", "slide-footer"); |
215 | 221 | extractMacros(ss, xhtml); |
216 | 222 | } |
217 | 223 | xhtml.endElement("div"); |
224 | } | |
225 | ||
226 | //Extract any text that's within an HSLFTextShape that's a descendant of | |
227 | //an HSLFGroupShape. | |
228 | private void extractGroupText(XHTMLContentHandler xhtml, List<HSLFShape> shapes, int depth) throws SAXException { | |
229 | ||
230 | if (shapes == null) { | |
231 | return; | |
232 | } | |
233 | ||
234 | //Only process items with depth > 0 because they should have been included | |
235 | //already in slide.getTextParagraphs above. | |
236 | ||
237 | //However, cells are considered grouped within the table, so ignore them. | |
238 | //I don't believe that cells can be inside a text box or other | |
239 | //grouped text containing object, so always ignore them. | |
240 | List<List<HSLFTextParagraph>> paragraphList = new ArrayList<>(); | |
241 | for (HSLFShape shape : shapes) { | |
242 | if (shape instanceof HSLFGroupShape) { | |
243 | //work recursively, HSLFGroupShape can contain HSLFGroupShape | |
244 | extractGroupText(xhtml, ((HSLFGroupShape)shape).getShapes(), depth+1); | |
245 | } else if (shape instanceof HSLFTextShape | |
246 | && ! (shape instanceof HSLFTableCell) && depth > 0) { | |
247 | paragraphList.add(((HSLFTextShape)shape).getTextParagraphs()); | |
248 | } | |
249 | } | |
250 | textRunsToText(xhtml, paragraphList); | |
218 | 251 | } |
219 | 252 | |
220 | 253 | private void extractMacros(HSLFSlideShow ppt, XHTMLContentHandler xhtml) { |
453 | 486 | MediaType mt = getTikaConfig().getDetector().detect(stream, new Metadata()); |
454 | 487 | mediaType = mt.toString(); |
455 | 488 | } |
456 | if (mediaType.equals("application/x-tika-msoffice-embedded; format=comp_obj")) { | |
457 | try(NPOIFSFileSystem npoifs = new NPOIFSFileSystem(new CloseShieldInputStream(stream))) { | |
489 | if (mediaType.equals("application/x-tika-msoffice-embedded; format=comp_obj") | |
490 | || mediaType.equals("application/x-tika-msoffice")) { | |
491 | NPOIFSFileSystem npoifs = null; | |
492 | ||
493 | try { | |
494 | npoifs = new NPOIFSFileSystem(new CloseShieldInputStream(stream)); | |
495 | } catch (RuntimeException e) { | |
496 | throw new IOExceptionWithCause(e); | |
497 | } | |
498 | try { | |
458 | 499 | handleEmbeddedOfficeDoc(npoifs.getRoot(), objID, xhtml); |
500 | } finally { | |
501 | if (npoifs != null) { | |
502 | npoifs.close(); | |
503 | } | |
459 | 504 | } |
460 | 505 | } else { |
461 | 506 | handleEmbeddedResource( |
73 | 73 | import org.apache.tika.sax.BodyContentHandler; |
74 | 74 | import org.apache.tika.sax.EmbeddedContentHandler; |
75 | 75 | import org.apache.tika.sax.XHTMLContentHandler; |
76 | import org.bouncycastle.cms.Recipient; | |
76 | 77 | import org.xml.sax.SAXException; |
77 | 78 | |
78 | 79 | /** |
320 | 321 | } |
321 | 322 | if (rtfChunk != null && (extractAllAlternatives || !doneBody)) { |
322 | 323 | ByteChunk chunk = (ByteChunk) rtfChunk; |
323 | MAPIRtfAttribute rtf = new MAPIRtfAttribute( | |
324 | MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue() | |
325 | ); | |
326 | Parser rtfParser = | |
327 | EmbeddedDocumentUtil.tryToFindExistingLeafParser(RTFParser.class, parseContext); | |
328 | if (rtfParser == null) { | |
329 | rtfParser = new RTFParser(); | |
330 | } | |
331 | rtfParser.parse( | |
332 | new ByteArrayInputStream(rtf.getData()), | |
333 | new EmbeddedContentHandler(new BodyContentHandler(xhtml)), | |
334 | new Metadata(), parseContext); | |
335 | doneBody = true; | |
324 | //avoid buffer underflow TIKA-2530 | |
325 | //TODO -- would be good to find an example triggering file and | |
326 | //figure out if this is a bug in POI or a genuine 0 length chunk | |
327 | if (chunk.getValue() != null && chunk.getValue().length > 0) { | |
328 | MAPIRtfAttribute rtf = new MAPIRtfAttribute( | |
329 | MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue() | |
330 | ); | |
331 | Parser rtfParser = | |
332 | EmbeddedDocumentUtil.tryToFindExistingLeafParser(RTFParser.class, parseContext); | |
333 | if (rtfParser == null) { | |
334 | rtfParser = new RTFParser(); | |
335 | } | |
336 | rtfParser.parse( | |
337 | new ByteArrayInputStream(rtf.getData()), | |
338 | new EmbeddedContentHandler(new BodyContentHandler(xhtml)), | |
339 | new Metadata(), parseContext); | |
340 | doneBody = true; | |
341 | } | |
336 | 342 | } |
337 | 343 | if (textChunk != null && (extractAllAlternatives || !doneBody)) { |
338 | 344 | xhtml.element("p", ((StringChunk) textChunk).getValue()); |
+18
-5
24 | 24 | import java.net.URI; |
25 | 25 | import java.util.HashMap; |
26 | 26 | import java.util.HashSet; |
27 | import java.util.Iterator; | |
27 | 28 | import java.util.List; |
28 | 29 | import java.util.Map; |
29 | 30 | import java.util.Set; |
39 | 40 | import org.apache.poi.openxml4j.opc.TargetMode; |
40 | 41 | import org.apache.poi.openxml4j.opc.internal.FileHelper; |
41 | 42 | import org.apache.poi.poifs.filesystem.DirectoryNode; |
43 | import org.apache.poi.poifs.filesystem.DocumentEntry; | |
44 | import org.apache.poi.poifs.filesystem.Entry; | |
42 | 45 | import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; |
43 | 46 | import org.apache.poi.poifs.filesystem.Ole10Native; |
44 | 47 | import org.apache.poi.poifs.filesystem.Ole10NativeException; |
298 | 301 | DirectoryNode root = fs.getRoot(); |
299 | 302 | POIFSDocumentType type = POIFSDocumentType.detectType(root); |
300 | 303 | |
301 | if (root.hasEntry("CONTENTS") | |
302 | && root.hasEntry("\u0001Ole") | |
303 | && root.hasEntry("\u0001CompObj")) { | |
304 | if (root.hasEntry("\u0001Ole") | |
305 | && root.hasEntry("\u0001CompObj") | |
306 | && ( | |
307 | root.hasEntry("CONTENTS") || root.hasEntry("Package") | |
308 | )) { | |
304 | 309 | // TIKA-704: OLE 2.0 embedded non-Office document? |
305 | 310 | //TODO: figure out if the equivalent of OLE 1.0's |
306 | 311 | //getCommand() and getFileName() exist for OLE 2.0 to populate |
307 | 312 | //TikaCoreProperties.ORIGINAL_RESOURCE_NAME |
308 | stream = TikaInputStream.get( | |
309 | fs.createDocumentInputStream("CONTENTS")); | |
313 | if (root.hasEntry("CONTENTS")) { | |
314 | stream = TikaInputStream.get( | |
315 | fs.createDocumentInputStream("CONTENTS")); | |
316 | } else if (root.hasEntry("Package")) { | |
317 | //TIKA-2588 | |
318 | stream = TikaInputStream.get( | |
319 | fs.createDocumentInputStream("Package")); | |
320 | } else { | |
321 | throw new IllegalStateException("Shouldn't ever arrive here; please open a ticket on our jira"); | |
322 | } | |
310 | 323 | if (embeddedExtractor.shouldParseEmbedded(metadata)) { |
311 | 324 | embeddedExtractor.parseEmbedded( |
312 | 325 | stream, new EmbeddedContentHandler(handler), |
+3
-1
35 | 35 | import org.apache.tika.metadata.Property; |
36 | 36 | import org.apache.tika.metadata.TikaCoreProperties; |
37 | 37 | import org.apache.tika.parser.microsoft.SummaryExtractor; |
38 | import org.apache.tika.parser.microsoft.ooxml.xps.XPSTextExtractor; | |
38 | 39 | import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor; |
39 | 40 | import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor; |
40 | 41 | import org.apache.xmlbeans.impl.values.XmlValueOutOfRangeException; |
60 | 61 | if (extractor.getDocument() != null || |
61 | 62 | ((extractor instanceof XSSFEventBasedExcelExtractor || |
62 | 63 | extractor instanceof XWPFEventBasedWordExtractor || |
63 | extractor instanceof XSLFEventBasedPowerPointExtractor) && | |
64 | extractor instanceof XSLFEventBasedPowerPointExtractor || | |
65 | extractor instanceof XPSTextExtractor) && | |
64 | 66 | extractor.getPackage() != null)) { |
65 | 67 | extractMetadata(extractor.getCoreProperties(), metadata); |
66 | 68 | extractMetadata(extractor.getExtendedProperties(), metadata); |
+13
-2
44 | 44 | import org.apache.tika.parser.EmptyParser; |
45 | 45 | import org.apache.tika.parser.ParseContext; |
46 | 46 | import org.apache.tika.parser.microsoft.OfficeParserConfig; |
47 | import org.apache.tika.parser.microsoft.ooxml.xps.XPSExtractorDecorator; | |
48 | import org.apache.tika.parser.microsoft.ooxml.xps.XPSTextExtractor; | |
47 | 49 | import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor; |
48 | 50 | import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor; |
49 | 51 | import org.apache.tika.parser.pkg.ZipContainerDetector; |
65 | 67 | ExtractorFactory.setThreadPrefersEventExtractors(true); |
66 | 68 | |
67 | 69 | try { |
68 | OOXMLExtractor extractor; | |
70 | OOXMLExtractor extractor = null; | |
69 | 71 | OPCPackage pkg; |
70 | 72 | |
71 | 73 | // Locate or Open the OPCPackage for the file |
82 | 84 | |
83 | 85 | // Get the type, and ensure it's one we handle |
84 | 86 | MediaType type = ZipContainerDetector.detectOfficeOpenXML(pkg); |
87 | if (type == null) { | |
88 | type = ZipContainerDetector.detectXPSOPC(pkg); | |
89 | } | |
90 | ||
85 | 91 | if (type == null || OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(type)) { |
86 | 92 | // Not a supported type, delegate to Empty Parser |
87 | 93 | EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context); |
88 | 94 | return; |
89 | 95 | } |
90 | 96 | metadata.set(Metadata.CONTENT_TYPE, type.toString()); |
91 | ||
92 | 97 | // Have the appropriate OOXML text extractor picked |
93 | 98 | POIXMLTextExtractor poiExtractor = null; |
94 | 99 | // This has already been set by OOXMLParser's call to configure() |
100 | 105 | if (poiExtractor == null && config.getUseSAXPptxExtractor()) { |
101 | 106 | poiExtractor = trySXSLF(pkg); |
102 | 107 | } |
108 | if (type.equals(OOXMLParser.XPS)) { | |
109 | poiExtractor = new XPSTextExtractor(pkg); | |
110 | } | |
111 | ||
103 | 112 | if (poiExtractor == null) { |
104 | 113 | poiExtractor = ExtractorFactory.createExtractor(pkg); |
105 | 114 | } |
118 | 127 | extractor = new SXSLFPowerPointExtractorDecorator(metadata, context, |
119 | 128 | (XSLFEventBasedPowerPointExtractor) poiExtractor); |
120 | 129 | metadata.add("X-Parsed-By", XSLFEventBasedPowerPointExtractor.class.getCanonicalName()); |
130 | } else if (poiExtractor instanceof XPSTextExtractor) { | |
131 | extractor = new XPSExtractorDecorator(context, poiExtractor); | |
121 | 132 | } else if (document == null) { |
122 | 133 | throw new TikaException( |
123 | 134 | "Expecting UserModel based POI OOXML extractor with a document, but none found. " + |
39 | 39 | //turn off POI's zip bomb detection because we have our own |
40 | 40 | ZipSecureFile.setMinInflateRatio(-1.0d); |
41 | 41 | } |
42 | ||
43 | protected static final MediaType XPS = MediaType.application("vnd.ms-xpsdocument"); | |
42 | 44 | |
43 | 45 | protected static final Set<MediaType> SUPPORTED_TYPES = |
44 | 46 | Collections.unmodifiableSet(new HashSet<>(Arrays.asList( |
83 | 85 | * by Tika and/or POI. |
84 | 86 | */ |
85 | 87 | protected static final Set<MediaType> UNSUPPORTED_OOXML_TYPES = |
86 | Collections.singleton( | |
88 | Collections.EMPTY_SET; | |
89 | //TODO: should we do a singleton for dwfx+xps? | |
90 | /*Collections.singleton( | |
87 | 91 | MediaType.application("vnd.ms-xpsdocument") |
88 | ); | |
92 | );*/ | |
89 | 93 | /** |
90 | 94 | * Serial version UID |
91 | 95 | */ |
+262
-0
0 | /* | |
1 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
2 | * contributor license agreements. See the NOTICE file distributed with | |
3 | * this work for additional information regarding copyright ownership. | |
4 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
5 | * (the "License"); you may not use this file except in compliance with | |
6 | * the License. You may obtain a copy of the License at | |
7 | * | |
8 | * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | * | |
10 | * Unless required by applicable law or agreed to in writing, software | |
11 | * distributed under the License is distributed on an "AS IS" BASIS, | |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
13 | * See the License for the specific language governing permissions and | |
14 | * limitations under the License. | |
15 | */ | |
16 | ||
17 | package org.apache.tika.parser.microsoft.ooxml.xps; | |
18 | ||
19 | import org.apache.commons.io.IOUtils; | |
20 | import org.apache.commons.io.input.CloseShieldInputStream; | |
21 | import org.apache.poi.POIXMLDocument; | |
22 | import org.apache.poi.POIXMLTextExtractor; | |
23 | import org.apache.poi.openxml4j.opc.PackagePart; | |
24 | import org.apache.poi.openxml4j.opc.PackageRelationship; | |
25 | import org.apache.poi.openxml4j.opc.PackageRelationshipCollection; | |
26 | import org.apache.poi.openxml4j.opc.ZipPackage; | |
27 | import org.apache.poi.openxml4j.util.ZipEntrySource; | |
28 | import org.apache.tika.exception.TikaException; | |
29 | import org.apache.tika.extractor.EmbeddedDocumentUtil; | |
30 | import org.apache.tika.metadata.Metadata; | |
31 | import org.apache.tika.metadata.TikaCoreProperties; | |
32 | import org.apache.tika.parser.ParseContext; | |
33 | import org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor; | |
34 | import org.apache.tika.sax.EmbeddedContentHandler; | |
35 | import org.apache.tika.sax.OfflineContentHandler; | |
36 | import org.apache.tika.sax.XHTMLContentHandler; | |
37 | import org.apache.tika.utils.ExceptionUtils; | |
38 | import org.xml.sax.Attributes; | |
39 | import org.xml.sax.SAXException; | |
40 | import org.xml.sax.helpers.DefaultHandler; | |
41 | ||
42 | import java.io.IOException; | |
43 | import java.io.InputStream; | |
44 | import java.util.Collections; | |
45 | import java.util.Enumeration; | |
46 | import java.util.HashMap; | |
47 | import java.util.List; | |
48 | import java.util.Map; | |
49 | import java.util.zip.ZipEntry; | |
50 | ||
51 | public class XPSExtractorDecorator extends AbstractOOXMLExtractor { | |
52 | ||
53 | private static String XPS_DOCUMENT = "http://schemas.microsoft.com/xps/2005/06/fixedrepresentation"; | |
54 | ||
55 | private final ParseContext context; | |
56 | private final ZipPackage pkg; | |
57 | Map<String, Metadata> embeddedImages = new HashMap<>(); | |
58 | ||
59 | public XPSExtractorDecorator(ParseContext context, POIXMLTextExtractor extractor) throws TikaException { | |
60 | super(context, extractor); | |
61 | this.context = context; | |
62 | if (extractor.getPackage() instanceof ZipPackage) { | |
63 | this.pkg = (ZipPackage) extractor.getPackage(); | |
64 | } else { | |
65 | throw new TikaException("OPCPackage must be a ZipPackage"); | |
66 | } | |
67 | } | |
68 | ||
69 | @Override | |
70 | public POIXMLDocument getDocument() { | |
71 | return null; | |
72 | } | |
73 | ||
74 | ||
75 | @Override | |
76 | protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException { | |
77 | ||
78 | PackageRelationshipCollection prc = pkg.getRelationshipsByType(XPS_DOCUMENT); | |
79 | for (int i = 0; i < prc.size(); i++) { | |
80 | PackageRelationship pr = prc.getRelationship(i); | |
81 | ||
82 | //there should only be one. | |
83 | //in the test file, this points to FixedDocSeq.fdseq | |
84 | try { | |
85 | handleDocuments(pr, xhtml); | |
86 | } catch (TikaException e) { | |
87 | throw new SAXException(e); | |
88 | } | |
89 | } | |
90 | ||
91 | //now handle embedded images | |
92 | if (embeddedImages.size() > 0) { | |
93 | EmbeddedDocumentUtil embeddedDocumentUtil = new EmbeddedDocumentUtil(context); | |
94 | for (Map.Entry<String, Metadata> embeddedImage : embeddedImages.entrySet()) { | |
95 | String zipPath = embeddedImage.getKey(); | |
96 | Metadata metadata = embeddedImage.getValue(); | |
97 | if (embeddedDocumentUtil.shouldParseEmbedded(metadata)) { | |
98 | handleEmbeddedImage( | |
99 | zipPath, | |
100 | metadata, | |
101 | embeddedDocumentUtil, | |
102 | xhtml); | |
103 | } | |
104 | } | |
105 | } | |
106 | ||
107 | } | |
108 | ||
109 | private void handleEmbeddedImage(String zipPath, Metadata metadata, | |
110 | EmbeddedDocumentUtil embeddedDocumentUtil, | |
111 | XHTMLContentHandler xhtml) throws SAXException, IOException { | |
112 | InputStream stream = null; | |
113 | try { | |
114 | stream = getZipStream(zipPath, pkg); | |
115 | } catch (IOException|TikaException e) { | |
116 | //store this exception in the parent's metadata | |
117 | EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata); | |
118 | return; | |
119 | } | |
120 | ||
121 | try { | |
122 | embeddedDocumentUtil.parseEmbedded(stream, xhtml, metadata, true); | |
123 | } finally { | |
124 | IOUtils.closeQuietly(stream); | |
125 | } | |
126 | } | |
127 | ||
128 | private void handleDocuments(PackageRelationship packageRelationship, | |
129 | XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { | |
130 | ||
131 | try (InputStream stream = pkg.getPart(packageRelationship).getInputStream()) { | |
132 | context.getSAXParser().parse( | |
133 | new CloseShieldInputStream(stream), | |
134 | new OfflineContentHandler(new EmbeddedContentHandler( | |
135 | new FixedDocSeqHandler(xhtml)))); | |
136 | } | |
137 | } | |
138 | ||
139 | @Override | |
140 | protected List<PackagePart> getMainDocumentParts() throws TikaException { | |
141 | return Collections.EMPTY_LIST; | |
142 | } | |
143 | ||
144 | private class FixedDocSeqHandler extends DefaultHandler { | |
145 | private final static String DOCUMENT_REFERENCE = "DocumentReference"; | |
146 | private final static String SOURCE = "Source"; | |
147 | ||
148 | private final XHTMLContentHandler xhtml; | |
149 | ||
150 | private FixedDocSeqHandler(XHTMLContentHandler xhtml) { | |
151 | this.xhtml = xhtml; | |
152 | } | |
153 | ||
154 | @Override | |
155 | public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { | |
156 | if (!DOCUMENT_REFERENCE.equals(localName)) { | |
157 | return; | |
158 | } | |
159 | for (int i = 0; i < atts.getLength(); i++) { | |
160 | String lName = atts.getLocalName(i); | |
161 | if (SOURCE.equals(lName)) { | |
162 | handleDocumentRef(atts.getValue(i)); | |
163 | } | |
164 | } | |
165 | } | |
166 | ||
167 | private void handleDocumentRef(String docRef) throws SAXException { | |
168 | //docRef is a path to a FixedDocumentSequence document, | |
169 | // e.g. /Documents/1/FixedDoc.fdoc | |
170 | ||
171 | //relative root is /Documents/1 ..need this Pages... | |
172 | String relativeRoot = null; | |
173 | int i = docRef.lastIndexOf("/"); | |
174 | if (i > 0) { | |
175 | relativeRoot = docRef.substring(0, i); | |
176 | } else { | |
177 | relativeRoot = ""; | |
178 | } | |
179 | String zipPath = (docRef.startsWith("/") ? docRef.substring(1) : docRef); | |
180 | if (pkg instanceof ZipPackage) { | |
181 | try (InputStream stream = getZipStream(zipPath, pkg)) { | |
182 | context.getSAXParser().parse( | |
183 | new CloseShieldInputStream(stream), | |
184 | new OfflineContentHandler(new EmbeddedContentHandler( | |
185 | new PageContentPartHandler(relativeRoot, xhtml)))); | |
186 | ||
187 | } catch (IOException | TikaException e) { | |
188 | throw new SAXException(new TikaException("IOException trying to read: " + docRef)); | |
189 | } | |
190 | } else { | |
191 | throw new SAXException(new TikaException("Package must be ZipPackage")); | |
192 | } | |
193 | } | |
194 | ||
195 | private class PageContentPartHandler extends DefaultHandler { | |
196 | private static final String PAGE_CONTENT = "PageContent"; | |
197 | private static final String SOURCE = "Source"; | |
198 | ||
199 | private final String relativeRoot; | |
200 | private final XHTMLContentHandler xhtml; | |
201 | ||
202 | private PageContentPartHandler(String relativeRoot, XHTMLContentHandler xhtml) { | |
203 | this.relativeRoot = relativeRoot; | |
204 | this.xhtml = xhtml; | |
205 | } | |
206 | ||
207 | @Override | |
208 | public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { | |
209 | if (!PAGE_CONTENT.equals(localName)) { | |
210 | return; | |
211 | } | |
212 | String pagePath = null; | |
213 | for (int i = 0; i < atts.getLength(); i++) { | |
214 | if (SOURCE.equals(atts.getLocalName(i))) { | |
215 | pagePath = atts.getValue(i); | |
216 | break; | |
217 | } | |
218 | } | |
219 | ||
220 | if (pagePath != null) { | |
221 | if (!pagePath.startsWith("/")) { | |
222 | pagePath = relativeRoot + "/" + pagePath; | |
223 | } | |
224 | //trim initial / | |
225 | if (pagePath.startsWith("/")) { | |
226 | pagePath = pagePath.substring(1); | |
227 | } | |
228 | try (InputStream stream = getZipStream(pagePath, pkg)) { | |
229 | context.getSAXParser().parse( | |
230 | new CloseShieldInputStream(stream), | |
231 | new OfflineContentHandler( | |
232 | new XPSPageContentHandler(xhtml, embeddedImages) | |
233 | ) | |
234 | ); | |
235 | } catch (TikaException | IOException e) { | |
236 | throw new SAXException(e); | |
237 | } | |
238 | } | |
239 | ||
240 | } | |
241 | } | |
242 | } | |
243 | ||
244 | private static InputStream getZipStream(String zipPath, ZipPackage zipPackage) throws IOException, TikaException { | |
245 | String targPath = (zipPath.length() > 1 && zipPath.startsWith("/") ? zipPath.substring(1) : zipPath); | |
246 | ZipEntrySource zipEntrySource = zipPackage.getZipArchive(); | |
247 | Enumeration<? extends ZipEntry> zipEntryEnumeration = zipEntrySource.getEntries(); | |
248 | ZipEntry zipEntry = null; | |
249 | while (zipEntryEnumeration.hasMoreElements()) { | |
250 | ZipEntry ze = zipEntryEnumeration.nextElement(); | |
251 | if (ze.getName().equals(targPath)) { | |
252 | zipEntry = ze; | |
253 | break; | |
254 | } | |
255 | } | |
256 | if (zipEntry == null) { | |
257 | throw new TikaException("Couldn't find required zip entry: " + zipPath); | |
258 | } | |
259 | return zipEntrySource.getInputStream(zipEntry); | |
260 | } | |
261 | } |
+377
-0
0 | /* | |
1 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
2 | * contributor license agreements. See the NOTICE file distributed with | |
3 | * this work for additional information regarding copyright ownership. | |
4 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
5 | * (the "License"); you may not use this file except in compliance with | |
6 | * the License. You may obtain a copy of the License at | |
7 | * | |
8 | * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | * | |
10 | * Unless required by applicable law or agreed to in writing, software | |
11 | * distributed under the License is distributed on an "AS IS" BASIS, | |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
13 | * See the License for the specific language governing permissions and | |
14 | * limitations under the License. | |
15 | */ | |
16 | package org.apache.tika.parser.microsoft.ooxml.xps; | |
17 | ||
18 | import org.apache.tika.metadata.Metadata; | |
19 | import org.apache.tika.metadata.TikaCoreProperties; | |
20 | import org.apache.tika.sax.XHTMLContentHandler; | |
21 | import org.xml.sax.Attributes; | |
22 | import org.xml.sax.SAXException; | |
23 | import org.xml.sax.helpers.DefaultHandler; | |
24 | ||
25 | import java.util.ArrayList; | |
26 | import java.util.Collections; | |
27 | import java.util.Comparator; | |
28 | import java.util.LinkedHashMap; | |
29 | import java.util.LinkedHashSet; | |
30 | import java.util.List; | |
31 | import java.util.Map; | |
32 | import java.util.Set; | |
33 | import java.util.Stack; | |
34 | ||
35 | ||
36 | /** | |
37 | * Handles an individual page. For now, this marks up | |
38 | * canvas entities in a <div> tag. Based on the spec, | |
39 | * it currently relies on order within the xml for order of output | |
40 | * of text to xhtml. We could do more complex processing of coordinates | |
41 | * for bidi-languages, but the spec implies that we should be able | |
42 | * to rely on storage order. | |
43 | * <p/> | |
44 | * As with our PDFParser, this currently dumps urls at the bottom of the page | |
45 | * and does not attempt to calculate the correct anchor text. | |
46 | * <p/> | |
47 | * TODO: integrate table markup | |
48 | */ | |
49 | class XPSPageContentHandler extends DefaultHandler { | |
50 | ||
51 | private static final String GLYPHS = "Glyphs"; | |
52 | private static final String CANVAS = "Canvas"; | |
53 | private static final String CLIP = "Clip"; | |
54 | private static final String NULL_CLIP = "NULL_CLIP"; | |
55 | private static final String UNICODE_STRING = "UnicodeString"; | |
56 | private static final String ORIGIN_X = "OriginX"; | |
57 | private static final String ORIGIN_Y = "OriginY"; | |
58 | private static final String BIDI_LEVEL = "BidiLevel"; | |
59 | private static final String INDICES = "Indices"; | |
60 | private static final String NAME = "Name"; | |
61 | private static final String PATH = "Path"; | |
62 | private static final String NAVIGATE_URI = "FixedPage.NavigateUri"; | |
63 | private static final String IMAGE_SOURCE = "ImageSource"; | |
64 | private static final String IMAGE_BRUSH = "ImageBrush"; | |
65 | private static final String AUTOMATION_PROPERITES_HELP_TEXT = "AutomationProperties.HelpText"; | |
66 | ||
67 | private static final String URL_DIV = "urls"; | |
68 | private static final String DIV = "div"; | |
69 | private static final String CLASS = "class"; | |
70 | private static final String PAGE = "page"; | |
71 | private static final String CANVAS_SAX = "canvas"; | |
72 | private static final String P = "p"; | |
73 | private static final String HREF = "href"; | |
74 | private static final String A = "a"; | |
75 | ||
76 | ||
77 | private final XHTMLContentHandler xhml; | |
78 | ||
79 | //path in zip file for an image rendered on this page | |
80 | private String imageSourcePathInZip = null; | |
81 | //embedded images sometimes include full path info of original image | |
82 | private String originalLocationOnDrive = null; | |
83 | ||
84 | //buffer for the glyph runs within a given canvas | |
85 | //in insertion order | |
86 | private Map<String, List<GlyphRun>> canvases = new LinkedHashMap<>(); | |
87 | ||
88 | private Set<String> urls = new LinkedHashSet(); | |
89 | private Stack<String> canvasStack = new Stack<>(); | |
90 | private final Map<String, Metadata> embeddedInfos; | |
91 | //sort based on y coordinate of first element in each row | |
92 | //this requires every row to have at least one element | |
93 | private static Comparator<? super List<GlyphRun>> ROW_SORTER = new Comparator<List<GlyphRun>>() { | |
94 | @Override | |
95 | public int compare(List<GlyphRun> o1, List<GlyphRun> o2) { | |
96 | if (o1.get(0).originY < o2.get(0).originY) { | |
97 | return -1; | |
98 | } else if (o1.get(0).originY > o2.get(0).originY) { | |
99 | return 1; | |
100 | } | |
101 | return 0; | |
102 | } | |
103 | }; | |
104 | ||
105 | public XPSPageContentHandler(XHTMLContentHandler xhtml, Map<String, Metadata> embeddedInfos) { | |
106 | this.xhml = xhtml; | |
107 | this.embeddedInfos = embeddedInfos; | |
108 | } | |
109 | ||
110 | @Override | |
111 | public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { | |
112 | if (CANVAS.equals(localName)) { | |
113 | String clip = getVal(CLIP, atts); | |
114 | if (clip == null) { | |
115 | canvasStack.push(NULL_CLIP); | |
116 | } else { | |
117 | canvasStack.push(clip); | |
118 | } | |
119 | return; | |
120 | } else if (PATH.equals(localName)) { | |
121 | //for now just grab them and dump them at the end of the page. | |
122 | String url = getVal(NAVIGATE_URI, atts); | |
123 | if (url != null) { | |
124 | urls.add(url); | |
125 | } | |
126 | originalLocationOnDrive = getVal(AUTOMATION_PROPERITES_HELP_TEXT, atts); | |
127 | } else if (IMAGE_BRUSH.equals(localName)) { | |
128 | imageSourcePathInZip = getVal(IMAGE_SOURCE, atts); | |
129 | } | |
130 | ||
131 | if (!GLYPHS.equals(localName)) { | |
132 | return; | |
133 | } | |
134 | String name = null; | |
135 | Float originX = null; | |
136 | Float originY = null; | |
137 | String unicodeString = null; | |
138 | Integer bidilevel = 1; | |
139 | String indicesString = null; | |
140 | ||
141 | for (int i = 0; i < atts.getLength(); i++) { | |
142 | String lName = atts.getLocalName(i); | |
143 | String value = atts.getValue(i); | |
144 | value = (value == null) ? "" : value.trim(); | |
145 | ||
146 | if (ORIGIN_X.equals(lName) && value.length() > 0) { | |
147 | try { | |
148 | originX = Float.parseFloat(atts.getValue(i)); | |
149 | } catch (NumberFormatException e) { | |
150 | throw new SAXException(e); | |
151 | } | |
152 | } else if (ORIGIN_Y.equals(lName) && value.length() > 0) { | |
153 | try { | |
154 | originY = Float.parseFloat(atts.getValue(i)); | |
155 | } catch (NumberFormatException e) { | |
156 | throw new SAXException(e); | |
157 | } | |
158 | } else if (UNICODE_STRING.equals(lName)) { | |
159 | unicodeString = atts.getValue(i); | |
160 | } else if (BIDI_LEVEL.equals(lName) && value.length() > 0) { | |
161 | try { | |
162 | bidilevel = Integer.parseInt(atts.getValue(i)); | |
163 | } catch (NumberFormatException e) { | |
164 | throw new SAXException(e); | |
165 | } | |
166 | } else if (INDICES.equals(lName)) { | |
167 | indicesString = atts.getValue(i); | |
168 | } else if (NAME.equals(lName)) { | |
169 | name = value; | |
170 | } | |
171 | } | |
172 | if (unicodeString != null) { | |
173 | originX = (originX == null) ? Integer.MIN_VALUE : originX; | |
174 | originY = (originY == null) ? Integer.MAX_VALUE : originY; | |
175 | String currentCanvasClip = (canvasStack.size() > 0) ? canvasStack.peek() : NULL_CLIP; | |
176 | List<GlyphRun> runs = canvases.get(currentCanvasClip); | |
177 | if (runs == null) { | |
178 | runs = new ArrayList<>(); | |
179 | } | |
180 | runs.add(new GlyphRun(name, originY, originX, unicodeString, bidilevel, indicesString)); | |
181 | canvases.put(currentCanvasClip, runs); | |
182 | } | |
183 | ||
184 | } | |
185 | ||
186 | @Override | |
187 | public void endElement(String uri, String localName, String qName) throws SAXException { | |
188 | if (CANVAS.equals(localName)) { | |
189 | if (! canvasStack.isEmpty()) { | |
190 | canvasStack.pop(); | |
191 | } | |
192 | } else if (PATH.equals(localName)) { | |
193 | //this assumes that there cannot be a path within a path | |
194 | //not sure if this is true or if we need to track path depth | |
195 | if (imageSourcePathInZip != null) { | |
196 | Metadata m = embeddedInfos.get(imageSourcePathInZip); | |
197 | if (m == null) { | |
198 | m = new Metadata(); | |
199 | } | |
200 | if (originalLocationOnDrive != null) { | |
201 | String val = m.get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME); | |
202 | if (val == null) { | |
203 | m.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, originalLocationOnDrive); | |
204 | } | |
205 | } | |
206 | m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, | |
207 | TikaCoreProperties.EmbeddedResourceType.INLINE.toString()); | |
208 | embeddedInfos.put(imageSourcePathInZip, m); | |
209 | } | |
210 | //reset | |
211 | imageSourcePathInZip = null; | |
212 | originalLocationOnDrive = null; | |
213 | } | |
214 | } | |
215 | @Override | |
216 | public void startDocument() throws SAXException { | |
217 | xhml.startElement(DIV, CLASS, PAGE); | |
218 | } | |
219 | ||
220 | @Override | |
221 | public void endDocument() throws SAXException { | |
222 | writePage(); | |
223 | xhml.endElement(DIV); | |
224 | } | |
225 | ||
226 | ||
227 | private final void writePage() throws SAXException { | |
228 | if (canvases.size() == 0) { | |
229 | return; | |
230 | } | |
231 | ||
232 | for (Map.Entry<String, List<GlyphRun>> e : canvases.entrySet()) { | |
233 | String clip = e.getKey(); | |
234 | List<GlyphRun> runs = e.getValue(); | |
235 | if (runs.size() == 0) { | |
236 | continue; | |
237 | } | |
238 | xhml.startElement(DIV, CLASS, CANVAS_SAX); | |
239 | //a list of rows sorted by the y of the first element in each row | |
240 | List<List<GlyphRun>> rows = buildRows(runs); | |
241 | for (List<GlyphRun> row : rows) { | |
242 | writeRow(row); | |
243 | } | |
244 | xhml.endElement(DIV); | |
245 | } | |
246 | //for now just dump the urls at the end of the page | |
247 | //At some point, we could link them back up to their | |
248 | //true anchor text. | |
249 | if (urls.size() > 0) { | |
250 | xhml.startElement(DIV, CLASS, URL_DIV); | |
251 | for (String u : urls) { | |
252 | xhml.startElement(A, HREF, u); | |
253 | xhml.characters(u); | |
254 | xhml.endElement(A); | |
255 | } | |
256 | xhml.endElement(DIV); | |
257 | } | |
258 | canvases.clear(); | |
259 | } | |
260 | ||
261 | private void writeRow(List<GlyphRun> row) throws SAXException { | |
262 | /* | |
263 | int rtl = 0; | |
264 | int ltr = 0; | |
265 | //if the row is entirely rtl, sort all as rtl | |
266 | //otherwise sort ltr | |
267 | for (GlyphRun r : row) { | |
268 | //ignore directionality of pure spaces | |
269 | if (r.unicodeString == null || r.unicodeString.trim().length() == 0) { | |
270 | continue; | |
271 | } | |
272 | if (r.direction == GlyphRun.DIRECTION.RTL) { | |
273 | rtl++; | |
274 | } else { | |
275 | ltr++; | |
276 | } | |
277 | } | |
278 | if (rtl > 0 && ltr == 0) { | |
279 | Collections.sort(row, GlyphRun.RTL_COMPARATOR); | |
280 | } else { | |
281 | Collections.sort(row, GlyphRun.LTR_COMPARATOR); | |
282 | }*/ | |
283 | ||
284 | xhml.startElement(P); | |
285 | for (GlyphRun run : row) { | |
286 | //figure out if you need to add a space | |
287 | xhml.characters(run.unicodeString); | |
288 | } | |
289 | xhml.endElement(P); | |
290 | } | |
291 | ||
292 | //returns a List of rows (where a row is a list of glyphruns) | |
293 | //the List is sorted in increasing order of the first y of each row | |
294 | private List<List<GlyphRun>> buildRows(List<GlyphRun> glyphRuns) { | |
295 | List<List<GlyphRun>> rows = new ArrayList<>(); | |
296 | float maxY = -1.0f; | |
297 | for (GlyphRun glyphRun : glyphRuns) { | |
298 | if (rows.size() == 0) { | |
299 | List<GlyphRun> row = new ArrayList<>(); | |
300 | row.add(glyphRun); | |
301 | rows.add(row); | |
302 | continue; | |
303 | } else { | |
304 | boolean addedNewRow = false; | |
305 | //can rely on the last row having the highest y | |
306 | List<GlyphRun> row = rows.get(rows.size()-1); | |
307 | //0.5 is a purely heuristic/magical number that should be derived | |
308 | //from the data, not made up. TODO: fix this | |
309 | if (Math.abs(glyphRun.originY -row.get(0).originY) < 0.5) { | |
310 | row.add(glyphRun); | |
311 | } else { | |
312 | row = new ArrayList<>(); | |
313 | row.add(glyphRun); | |
314 | rows.add(row); | |
315 | addedNewRow = true; | |
316 | } | |
317 | //sort rows so that they are in ascending order of y | |
318 | //in most xps files in our test corpus, this is never triggered | |
319 | //because the runs are already ordered correctly | |
320 | if (maxY > -1.0f && addedNewRow && glyphRun.originY < maxY) { | |
321 | Collections.sort(rows, ROW_SORTER); | |
322 | } | |
323 | if (glyphRun.originY > maxY) { | |
324 | maxY = glyphRun.originY; | |
325 | } | |
326 | } | |
327 | } | |
328 | return rows; | |
329 | } | |
330 | ||
331 | private static String getVal(String localName, Attributes atts) { | |
332 | for (int i = 0; i < atts.getLength(); i++) { | |
333 | if (localName.equals(atts.getLocalName(i))) { | |
334 | return atts.getValue(i); | |
335 | } | |
336 | } | |
337 | return null; | |
338 | } | |
339 | ||
340 | final static class GlyphRun { | |
341 | ||
342 | private enum DIRECTION { | |
343 | LTR, | |
344 | RTL | |
345 | } | |
346 | ||
347 | //TODO: use name in conjunction with Frag information | |
348 | //to do a better job of extracting paragraph and table structure | |
349 | private final String name; | |
350 | private final float originY; | |
351 | private final float originX;//not currently used, but could be used for bidi text calculations | |
352 | private final String unicodeString; | |
353 | private final String indicesString;//not currently used, but could be used for width calculations | |
354 | ||
355 | //not used yet | |
356 | private final DIRECTION direction; | |
357 | ||
358 | private GlyphRun(String name, float originY, float originX, String unicodeString, Integer bidiLevel, String indicesString) { | |
359 | this.name = name; | |
360 | this.unicodeString = unicodeString; | |
361 | this.originY = originY; | |
362 | this.originX = originX; | |
363 | if (bidiLevel == null) { | |
364 | direction = DIRECTION.LTR; | |
365 | } else { | |
366 | if (bidiLevel % 2 == 0) { | |
367 | direction = DIRECTION.LTR; | |
368 | } else { | |
369 | direction = DIRECTION.RTL; | |
370 | } | |
371 | } | |
372 | this.indicesString = indicesString; | |
373 | } | |
374 | } | |
375 | ||
376 | } |
+66
-0
0 | /* | |
1 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
2 | * contributor license agreements. See the NOTICE file distributed with | |
3 | * this work for additional information regarding copyright ownership. | |
4 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
5 | * (the "License"); you may not use this file except in compliance with | |
6 | * the License. You may obtain a copy of the License at | |
7 | * | |
8 | * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | * | |
10 | * Unless required by applicable law or agreed to in writing, software | |
11 | * distributed under the License is distributed on an "AS IS" BASIS, | |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
13 | * See the License for the specific language governing permissions and | |
14 | * limitations under the License. | |
15 | */ | |
16 | ||
17 | package org.apache.tika.parser.microsoft.ooxml.xps; | |
18 | ||
19 | import org.apache.poi.POIXMLDocument; | |
20 | import org.apache.poi.POIXMLProperties; | |
21 | import org.apache.poi.POIXMLTextExtractor; | |
22 | import org.apache.poi.openxml4j.exceptions.OpenXML4JException; | |
23 | import org.apache.poi.openxml4j.opc.OPCPackage; | |
24 | import org.apache.xmlbeans.XmlException; | |
25 | ||
26 | import java.io.IOException; | |
27 | ||
/**
 * Currently, mostly a pass-through class to hold pkg and properties
 * and keep the general framework similar to our other POI-integrated
 * extractors.
 * <p/>
 * The actual text extraction for XPS is done by XPSExtractorDecorator,
 * which pulls the {@link OPCPackage} back out of this holder; that is
 * why {@link #getText()} returns null here.
 */
public class XPSTextExtractor extends POIXMLTextExtractor {

    private final OPCPackage pkg;
    private final POIXMLProperties properties;

    public XPSTextExtractor(OPCPackage pkg) throws OpenXML4JException, XmlException, IOException {
        //no POI user-model document backs an XPS file
        super((POIXMLDocument)null);
        this.pkg = pkg;
        this.properties = new POIXMLProperties(pkg);

    }

    @Override
    public OPCPackage getPackage() {
        return pkg;
    }

    @Override
    public String getText() {
        //text extraction happens in XPSExtractorDecorator, not here
        return null;
    }

    //core (title, creator, dates, ...) properties of the package
    public POIXMLProperties.CoreProperties getCoreProperties() {
        return this.properties.getCoreProperties();
    }

    //extended (application-defined) properties of the package
    public POIXMLProperties.ExtendedProperties getExtendedProperties() {
        return this.properties.getExtendedProperties();
    }

    //custom (user-defined) properties of the package
    public POIXMLProperties.CustomProperties getCustomProperties() {
        return this.properties.getCustomProperties();
    }
}
+2
-2
28 | 28 | |
29 | 29 | import org.apache.tika.io.IOUtils; |
30 | 30 | import org.apache.tika.parser.ner.NERecogniser; |
31 | import org.json.JSONException; | |
32 | import org.json.JSONObject; | |
31 | import com.github.openjson.JSONException; | |
32 | import com.github.openjson.JSONObject; | |
33 | 33 | import org.slf4j.Logger; |
34 | 34 | import org.slf4j.LoggerFactory; |
35 | 35 |
15 | 15 | */ |
16 | 16 | package org.apache.tika.parser.ocr; |
17 | 17 | |
18 | import org.apache.commons.io.FilenameUtils; | |
19 | ||
18 | 20 | import java.io.File; |
19 | 21 | import java.io.IOException; |
20 | 22 | import java.io.InputStream; |
21 | 23 | import java.io.Serializable; |
24 | import java.util.HashMap; | |
22 | 25 | import java.util.Locale; |
26 | import java.util.Map; | |
23 | 27 | import java.util.Properties; |
28 | import java.util.regex.Matcher; | |
29 | import java.util.regex.Pattern; | |
24 | 30 | |
25 | 31 | /** |
26 | 32 | * Configuration for TesseractOCRParser. |
40 | 46 | |
41 | 47 | private static final long serialVersionUID = -4861942486845757891L; |
42 | 48 | |
49 | private static Pattern ALLOWABLE_PAGE_SEPARATORS_PATTERN = | |
50 | Pattern.compile("(?i)^[-_/\\.A-Z0-9]+$"); | |
51 | ||
52 | private static Pattern ALLOWABLE_OTHER_PARAMS_PATTERN = | |
53 | Pattern.compile("(?i)^[-_/\\.A-Z0-9]+$"); | |
54 | ||
43 | 55 | public enum OUTPUT_TYPE { |
44 | 56 | TXT, |
45 | 57 | HOCR |
73 | 85 | private int enableImageProcessing = 0; |
74 | 86 | |
75 | 87 | // Path to ImageMagick program, if not on system path. |
76 | private String ImageMagickPath = ""; | |
88 | private String imageMagickPath = ""; | |
77 | 89 | |
78 | 90 | // resolution of processed image (in dpi). |
79 | 91 | private int density = 300; |
90 | 102 | // factor by which image is to be scaled. |
91 | 103 | private int resize = 900; |
92 | 104 | |
105 | // See setPageSeparator. | |
106 | private String pageSeparator = ""; | |
107 | ||
93 | 108 | // whether or not to preserve interword spacing |
94 | 109 | private boolean preserveInterwordSpacing = false; |
95 | 110 | |
96 | 111 | // whether or not to apply rotation calculated by the rotation.py script |
97 | 112 | private boolean applyRotation = false; |
113 | ||
114 | // See addOtherTesseractConfig. | |
115 | private Map<String, String> otherTesseractConfig = new HashMap<>(); | |
98 | 116 | |
99 | 117 | |
100 | 118 | /** |
148 | 166 | getProp(props, "maxFileSizeToOcr", getMaxFileSizeToOcr())); |
149 | 167 | setTimeout( |
150 | 168 | getProp(props, "timeout", getTimeout())); |
151 | String outputTypeString = props.getProperty("outputType"); | |
152 | if ("txt".equals(outputTypeString)) { | |
153 | setOutputType(OUTPUT_TYPE.TXT); | |
154 | } else if ("hocr".equals(outputTypeString)) { | |
155 | setOutputType(OUTPUT_TYPE.HOCR); | |
156 | } | |
169 | setOutputType(getProp(props, "outputType", getOutputType().toString())); | |
157 | 170 | setPreserveInterwordSpacing(getProp(props, "preserveInterwordSpacing", false)); |
158 | 171 | |
159 | 172 | // set parameters for ImageMagick |
174 | 187 | setApplyRotation( |
175 | 188 | getProp(props, "applyRotation", getApplyRotation())); |
176 | 189 | |
190 | loadOtherTesseractConfig(props); | |
177 | 191 | } |
178 | 192 | |
179 | 193 | /** |
184 | 198 | } |
185 | 199 | |
186 | 200 | /** |
187 | * Set the path to the Tesseract executable, needed if it is not on system path. | |
201 | * Set the path to the Tesseract executable's directory, needed if it is not on system path. | |
188 | 202 | * <p> |
189 | 203 | * Note that if you set this value, it is highly recommended that you also |
190 | 204 | * set the path to the 'tessdata' folder using {@link #setTessdataPath}. |
191 | 205 | * </p> |
192 | 206 | */ |
193 | 207 | public void setTesseractPath(String tesseractPath) { |
208 | ||
209 | tesseractPath = FilenameUtils.normalize(tesseractPath); | |
194 | 210 | if (!tesseractPath.isEmpty() && !tesseractPath.endsWith(File.separator)) |
195 | 211 | tesseractPath += File.separator; |
196 | 212 | |
210 | 226 | * (such as when Tesseract is built from source), it may be located elsewhere. |
211 | 227 | */ |
212 | 228 | public void setTessdataPath(String tessdataPath) { |
229 | tessdataPath = FilenameUtils.normalize(tessdataPath); | |
213 | 230 | if (!tessdataPath.isEmpty() && !tessdataPath.endsWith(File.separator)) |
214 | 231 | tessdataPath += File.separator; |
215 | 232 | |
255 | 272 | } |
256 | 273 | |
257 | 274 | /** |
275 | * @see #setPageSeparator(String pageSeparator) | |
276 | */ | |
277 | public String getPageSeparator() { | |
278 | return pageSeparator; | |
279 | } | |
280 | ||
281 | /** | |
282 | * The page separator to use in plain text output. This corresponds to Tesseract's page_separator config option. | |
283 | * The default here is the empty string (i.e. no page separators). Note that this is also the default in | |
284 | * Tesseract 3.x, but in Tesseract 4.0 the default is to use the form feed control character. We are overriding | |
285 | * Tesseract 4.0's default here. | |
286 | * | |
287 | * @param pageSeparator | |
288 | */ | |
289 | public void setPageSeparator(String pageSeparator) { | |
290 | Matcher m = ALLOWABLE_PAGE_SEPARATORS_PATTERN.matcher(pageSeparator); | |
291 | if (! m.find()) { | |
292 | throw new IllegalArgumentException(pageSeparator + " contains illegal characters.\n"+ | |
293 | "If you trust this value, set it with setTrustedPageSeparator"); | |
294 | } | |
295 | setTrustedPageSeparator(pageSeparator); | |
296 | } | |
297 | ||
298 | /** | |
299 | * Same as {@link #setPageSeparator(String)} but does not perform | |
300 | * any checks on the string. | |
301 | * @param pageSeparator | |
302 | */ | |
303 | public void setTrustedPageSeparator(String pageSeparator) { | |
304 | this.pageSeparator = pageSeparator; | |
305 | } | |
306 | ||
307 | /** | |
258 | 308 | * Whether or not to maintain interword spacing. Default is <code>false</code>. |
259 | 309 | * |
260 | 310 | * @param preserveInterwordSpacing |
318 | 368 | |
319 | 369 | /** |
320 | 370 | * Set output type from ocr process. Default is "txt", but can be "hocr". |
321 | * Default value is 120s. | |
371 | * Default value is {@link OUTPUT_TYPE#TXT}. | |
322 | 372 | */ |
323 | 373 | public void setOutputType(OUTPUT_TYPE outputType) { |
324 | 374 | this.outputType = outputType; |
375 | } | |
376 | ||
377 | public void setOutputType(String outputType) { | |
378 | if (outputType == null) { | |
379 | throw new IllegalArgumentException("outputType must not be null"); | |
380 | } | |
381 | String lc = outputType.toLowerCase(Locale.US); | |
382 | if ("txt".equals(lc)) { | |
383 | setOutputType(OUTPUT_TYPE.TXT); | |
384 | } else if ("hocr".equals(lc)) { | |
385 | setOutputType(OUTPUT_TYPE.HOCR); | |
386 | } else { | |
387 | throw new IllegalArgumentException("outputType must be either 'txt' or 'hocr'"); | |
388 | } | |
389 | ||
390 | ||
325 | 391 | } |
326 | 392 | |
327 | 393 | /** |
399 | 465 | * Default value is gray. |
400 | 466 | */ |
401 | 467 | public void setColorspace(String colorspace) { |
402 | if (!colorspace.equals(null)) { | |
403 | this.colorspace = colorspace; | |
404 | } else { | |
468 | if (colorspace == null) { | |
405 | 469 | throw new IllegalArgumentException("Colorspace value cannot be null."); |
406 | 470 | } |
471 | if (! colorspace.matches("(?i)^[-_A-Z0-9]+$")) { | |
472 | throw new IllegalArgumentException("colorspace must match this pattern: (?i)^[-_A-Z0-9]+$"); | |
473 | } | |
474 | this.colorspace = colorspace; | |
407 | 475 | } |
408 | 476 | |
409 | 477 | /** |
456 | 524 | } |
457 | 525 | |
458 | 526 | /** |
459 | * @return path to ImageMagick file. | |
460 | * @see #setImageMagickPath(String ImageMagickPath) | |
527 | * @return path to ImageMagick executable directory. | |
528 | * @see #setImageMagickPath(String imageMagickPath) | |
461 | 529 | */ |
462 | 530 | public String getImageMagickPath() { |
463 | 531 | |
464 | return ImageMagickPath; | |
465 | } | |
466 | ||
467 | /** | |
468 | * Set the path to the ImageMagick executable, needed if it is not on system path. | |
469 | * | |
470 | * @param ImageMagickPath to ImageMagick file. | |
471 | */ | |
472 | public void setImageMagickPath(String ImageMagickPath) { | |
473 | if (!ImageMagickPath.isEmpty() && !ImageMagickPath.endsWith(File.separator)) | |
474 | ImageMagickPath += File.separator; | |
475 | ||
476 | this.ImageMagickPath = ImageMagickPath; | |
532 | return imageMagickPath; | |
533 | } | |
534 | ||
535 | /** | |
536 | * Set the path to the ImageMagick executable directory, needed if it is not on system path. | |
537 | * | |
538 | * @param imageMagickPath to ImageMagick executable directory. | |
539 | */ | |
540 | public void setImageMagickPath(String imageMagickPath) { | |
541 | imageMagickPath = FilenameUtils.normalize(imageMagickPath); | |
542 | if (!imageMagickPath.isEmpty() && !imageMagickPath.endsWith(File.separator)) | |
543 | imageMagickPath += File.separator; | |
544 | ||
545 | this.imageMagickPath = imageMagickPath; | |
477 | 546 | } |
478 | 547 | |
479 | 548 | /** |
487 | 556 | /** |
488 | 557 | * Sets whether or not a rotation value should be calculated and passed to ImageMagick. |
489 | 558 | * |
490 | * @param true to calculate and apply rotation, false to skip. Default is false, true required Python installed. | |
559 | * @param applyRotation true to calculate and apply rotation, false to skip. Default is false; true requires Python to be installed. | 
491 | 560 | */ |
492 | 561 | public void setApplyRotation(boolean applyRotation) { |
493 | 562 | this.applyRotation = applyRotation; |
563 | } | |
564 | ||
565 | /** | |
566 | * @see #addOtherTesseractConfig(String, String) | |
567 | */ | |
568 | public Map<String, String> getOtherTesseractConfig() { | |
569 | return otherTesseractConfig; | |
570 | } | |
571 | ||
572 | /** | |
573 | * Add a key-value pair to pass to Tesseract using its -c command line option. | |
574 | * To see the possible options, run tesseract --print-parameters. | |
575 | * | |
576 | * You may also add these parameters in TesseractOCRConfig.properties; any | |
577 | * key-value pair in the properties file where the key contains an underscore | |
578 | * is passed directly to Tesseract. | |
579 | * | |
580 | * @param key | |
581 | * @param value | |
582 | */ | |
583 | public void addOtherTesseractConfig(String key, String value) { | |
584 | if (key == null) { | |
585 | throw new IllegalArgumentException("key must not be null"); | |
586 | } | |
587 | if (value == null) { | |
588 | throw new IllegalArgumentException("value must not be null"); | |
589 | } | |
590 | ||
591 | Matcher m = ALLOWABLE_OTHER_PARAMS_PATTERN.matcher(key); | |
592 | if (! m.find()) { | |
593 | throw new IllegalArgumentException("Key contains illegal characters: "+key); | |
594 | } | |
595 | m.reset(value); | |
596 | if (! m.find()) { | |
597 | throw new IllegalArgumentException("Value contains illegal characters: "+value); | |
598 | } | |
599 | ||
600 | otherTesseractConfig.put(key.trim(), value.trim()); | |
494 | 601 | } |
495 | 602 | |
496 | 603 | /** |
542 | 649 | property, propVal)); |
543 | 650 | } |
544 | 651 | |
652 | /** | |
653 | * Populate otherTesseractConfig from the given properties. | |
654 | * This assumes that any key-value pair where the key contains | |
655 | * an underscore is an option to be passed opaquely to Tesseract. | |
656 | * | |
657 | * @param properties properties file to read from. | |
658 | */ | |
659 | private void loadOtherTesseractConfig(Properties properties) { | |
660 | for (String k : properties.stringPropertyNames()) { | |
661 | if (k.contains("_")) { | |
662 | addOtherTesseractConfig(k, properties.getProperty(k)); | |
663 | } | |
664 | } | |
665 | } | |
545 | 666 | } |
32 | 32 | import java.io.Reader; |
33 | 33 | import java.nio.charset.Charset; |
34 | 34 | import java.nio.file.Files; |
35 | import java.nio.file.Paths; | |
35 | 36 | import java.nio.file.StandardCopyOption; |
37 | import java.util.ArrayList; | |
36 | 38 | import java.util.Arrays; |
37 | 39 | import java.util.Collections; |
38 | 40 | import java.util.HashMap; |
52 | 54 | import org.apache.commons.exec.PumpStreamHandler; |
53 | 55 | import org.apache.commons.io.FileUtils; |
54 | 56 | import org.apache.commons.io.IOUtils; |
57 | import org.apache.commons.lang.SystemUtils; | |
55 | 58 | import org.apache.tika.config.Initializable; |
56 | 59 | import org.apache.tika.config.InitializableProblemHandler; |
57 | 60 | import org.apache.tika.config.Param; |
109 | 112 | MediaType.image("jpx"), MediaType.image("x-portable-pixmap") |
110 | 113 | }))); |
111 | 114 | private static Map<String,Boolean> TESSERACT_PRESENT = new HashMap<>(); |
115 | private static Map<String,Boolean> IMAGE_MAGICK_PRESENT = new HashMap<>(); | |
112 | 116 | |
113 | 117 | |
114 | 118 | @Override |
143 | 147 | if (TESSERACT_PRESENT.containsKey(tesseract)) { |
144 | 148 | return TESSERACT_PRESENT.get(tesseract); |
145 | 149 | } |
150 | //prevent memory bloat | |
151 | if (TESSERACT_PRESENT.size() > 100) { | |
152 | TESSERACT_PRESENT.clear(); | |
153 | } | |
154 | //check that the parent directory exists | |
155 | if (! config.getTesseractPath().isEmpty() && | |
156 | ! Files.isDirectory(Paths.get(config.getTesseractPath()))) { | |
157 | TESSERACT_PRESENT.put(tesseract, false); | |
158 | return false; | |
159 | } | |
160 | ||
146 | 161 | // Try running Tesseract from there, and see if it exists + works |
147 | 162 | String[] checkCmd = { tesseract }; |
148 | 163 | boolean hasTesseract = ExternalParser.check(checkCmd); |
153 | 168 | |
154 | 169 | private boolean hasImageMagick(TesseractOCRConfig config) { |
155 | 170 | // Fetch where the config says to find ImageMagick Program |
156 | String ImageMagick = config.getImageMagickPath() + getImageMagickProg(); | |
171 | String ImageMagick = getImageMagickPath(config); | |
157 | 172 | |
158 | 173 | // Have we already checked for a copy of ImageMagick Program there? |
159 | if (TESSERACT_PRESENT.containsKey(ImageMagick)) { | |
160 | return TESSERACT_PRESENT.get(ImageMagick); | |
174 | if (IMAGE_MAGICK_PRESENT.containsKey(ImageMagick)) { | |
175 | return IMAGE_MAGICK_PRESENT.get(ImageMagick); | |
176 | } | |
177 | //prevent memory bloat | |
178 | if (IMAGE_MAGICK_PRESENT.size() > 100) { | |
179 | IMAGE_MAGICK_PRESENT.clear(); | |
180 | } | |
181 | //check that directory exists | |
182 | if (!config.getImageMagickPath().isEmpty() && | |
183 | ! Files.isDirectory(Paths.get(config.getImageMagickPath()))) { | |
184 | IMAGE_MAGICK_PRESENT.put(ImageMagick, false); | |
185 | return false; | |
186 | } | |
187 | if (SystemUtils.IS_OS_WINDOWS && config.getImageMagickPath().isEmpty()) { | |
188 | LOG.warn("Must specify path for imagemagick on Windows OS to avoid accidental confusion with convert.exe"); | |
189 | IMAGE_MAGICK_PRESENT.put(ImageMagick, false); | |
190 | return false; | |
161 | 191 | } |
162 | 192 | |
163 | 193 | // Try running ImageMagick program from there, and see if it exists + works |
164 | 194 | String[] checkCmd = { ImageMagick }; |
165 | 195 | boolean hasImageMagick = ExternalParser.check(checkCmd); |
166 | TESSERACT_PRESENT.put(ImageMagick, hasImageMagick); | |
196 | IMAGE_MAGICK_PRESENT.put(ImageMagick, hasImageMagick); | |
167 | 197 | |
168 | 198 | return hasImageMagick; |
169 | 199 | |
170 | 200 | } |
171 | 201 | |
202 | private String getImageMagickPath(TesseractOCRConfig config) { | |
203 | return config.getImageMagickPath() + getImageMagickProg(); | |
204 | } | |
205 | ||
172 | 206 | static boolean hasPython() { |
173 | 207 | // check if python is installed and it has the required dependencies for the rotation program to run |
174 | 208 | boolean hasPython = false; |
175 | ||
209 | TemporaryResources tmp = null; | |
176 | 210 | try { |
177 | TemporaryResources tmp = new TemporaryResources(); | |
211 | tmp = new TemporaryResources(); | |
178 | 212 | File importCheck = tmp.createTemporaryFile(); |
179 | 213 | String prg = "import numpy, matplotlib, skimage"; |
180 | 214 | OutputStreamWriter out = new OutputStreamWriter(new FileOutputStream(importCheck), Charset.forName("UTF-8")); |
186 | 220 | hasPython = true; |
187 | 221 | } |
188 | 222 | |
189 | tmp.close(); | |
190 | 223 | |
191 | 224 | } catch (Exception e) { |
192 | 225 | |
226 | } finally { | |
227 | IOUtils.closeQuietly(tmp); | |
193 | 228 | } |
194 | 229 | |
195 | 230 | return hasPython; |
305 | 340 | |
306 | 341 | /** |
307 | 342 | * This method is used to process the image to an OCR-friendly format. |
308 | * @param streamingObject input image to be processed | |
343 | * @param scratchFile input image to be processed | |
309 | 344 | * @param config TesseractOCRconfig class to get ImageMagick properties |
310 | 345 | * @throws IOException if an input error occurred |
311 | 346 | * @throws TikaException if an exception timed out |
312 | 347 | */ |
313 | private void processImage(File streamingObject, TesseractOCRConfig config) throws IOException, TikaException { | |
348 | private void processImage(File scratchFile, TesseractOCRConfig config) throws IOException, TikaException { | |
314 | 349 | |
315 | 350 | // fetch rotation script from resources |
316 | 351 | InputStream in = getClass().getResourceAsStream("rotation.py"); |
317 | 352 | TemporaryResources tmp = new TemporaryResources(); |
318 | 353 | File rotationScript = tmp.createTemporaryFile(); |
319 | 354 | Files.copy(in, rotationScript.toPath(), StandardCopyOption.REPLACE_EXISTING); |
320 | ||
321 | String cmd = "python " + rotationScript.getAbsolutePath() + " -f " + streamingObject.getAbsolutePath(); | |
355 | ||
356 | CommandLine commandLine = new CommandLine("python"); | |
357 | String[] args = {"-W", | |
358 | "ignore", | |
359 | rotationScript.getAbsolutePath(), | |
360 | "-f", | |
361 | scratchFile.getAbsolutePath()}; | |
362 | commandLine.addArguments(args, true); | |
322 | 363 | String angle = "0"; |
323 | 364 | |
324 | 365 | DefaultExecutor executor = new DefaultExecutor(); |
327 | 368 | executor.setStreamHandler(streamHandler); |
328 | 369 | |
329 | 370 | // determine the angle of rotation required to make the text horizontal |
330 | CommandLine cmdLine = CommandLine.parse(cmd); | |
331 | 371 | if(config.getApplyRotation() && hasPython()) { |
332 | 372 | try { |
333 | executor.execute(cmdLine); | |
334 | angle = outputStream.toString("UTF-8").trim(); | |
373 | executor.execute(commandLine); | |
374 | String tmpAngle = outputStream.toString("UTF-8").trim(); | |
375 | //verify that you've gotten a numeric value out | |
376 | Double.parseDouble(tmpAngle); | |
377 | angle = tmpAngle; | |
335 | 378 | } catch(Exception e) { |
336 | 379 | |
337 | 380 | } |
338 | 381 | } |
339 | 382 | |
340 | 383 | // process the image - parameter values can be set in TesseractOCRConfig.properties |
341 | String line = "convert -density " + config.getDensity() + " -depth " + config.getDepth() + | |
342 | " -colorspace " + config.getColorspace() + " -filter " + config.getFilter() + | |
343 | " -resize " + config.getResize() + "% -rotate "+ angle + " " + streamingObject.getAbsolutePath() + | |
344 | " " + streamingObject.getAbsolutePath(); | |
345 | cmdLine = CommandLine.parse(line); | |
384 | commandLine = new CommandLine(getImageMagickPath(config)); | |
385 | args = new String[]{ | |
386 | "-density", Integer.toString(config.getDensity()), | |
387 | "-depth ", Integer.toString(config.getDepth()), | |
388 | "-colorspace", config.getColorspace(), | |
389 | "-filter", config.getFilter(), | |
390 | "-resize", config.getResize() + "%", | |
391 | "-rotate", angle, | |
392 | scratchFile.getAbsolutePath(), | |
393 | scratchFile.getAbsolutePath() | |
394 | }; | |
395 | commandLine.addArguments(args, true); | |
346 | 396 | try { |
347 | executor.execute(cmdLine); | |
397 | executor.execute(commandLine); | |
348 | 398 | } catch(Exception e) { |
349 | 399 | |
350 | 400 | } |
460 | 510 | * if an input error occurred |
461 | 511 | */ |
462 | 512 | private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException { |
463 | String[] cmd = { config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l", | |
464 | config.getLanguage(), "-psm", config.getPageSegMode(), | |
465 | config.getOutputType().name().toLowerCase(Locale.US), | |
513 | ArrayList<String> cmd = new ArrayList<>(Arrays.asList( | |
514 | config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l", | |
515 | config.getLanguage(), "--psm", config.getPageSegMode() | |
516 | )); | |
517 | for (Map.Entry<String, String> entry : config.getOtherTesseractConfig().entrySet()) { | |
518 | cmd.add("-c"); | |
519 | cmd.add(entry.getKey() + "=" + entry.getValue()); | |
520 | } | |
521 | cmd.addAll(Arrays.asList( | |
522 | "-c", "page_separator=" + config.getPageSeparator(), | |
466 | 523 | "-c", |
467 | (config.getPreserveInterwordSpacing())? "preserve_interword_spaces=1" : "preserve_interword_spaces=0"}; | |
524 | (config.getPreserveInterwordSpacing())? "preserve_interword_spaces=1" : "preserve_interword_spaces=0", | |
525 | config.getOutputType().name().toLowerCase(Locale.US) | |
526 | )); | |
468 | 527 | ProcessBuilder pb = new ProcessBuilder(cmd); |
469 | 528 | setEnv(config, pb); |
470 | 529 | final Process process = pb.start(); |
131 | 131 | throws IOException, SAXException, TikaException { |
132 | 132 | |
133 | 133 | PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig); |
134 | if (localConfig.getSetKCMS()) { | |
135 | System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider"); | |
136 | } | |
134 | 137 | |
135 | 138 | PDDocument pdfDocument = null; |
136 | 139 | |
221 | 224 | metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, |
222 | 225 | Boolean.toString(ap.canPrintDegraded())); |
223 | 226 | |
227 | if (document.getDocumentCatalog().getLanguage() != null) { | |
228 | metadata.set(TikaCoreProperties.LANGUAGE, document.getDocumentCatalog().getLanguage()); | |
229 | } | |
224 | 230 | |
225 | 231 | //now go for the XMP |
226 | 232 | Document dom = loadDOM(document.getDocumentCatalog().getMetadata(), metadata, context); |
699 | 705 | } |
700 | 706 | |
701 | 707 | @Field |
708 | void setSetKCMS(boolean setKCMS) { | |
709 | defaultConfig.setSetKCMS(setKCMS); | |
710 | } | |
711 | ||
712 | @Field | |
702 | 713 | void setInitializableProblemHander(String name) { |
703 | 714 | if ("ignore".equals(name)) { |
704 | 715 | setInitializableProblemHandler(InitializableProblemHandler.IGNORE); |
763 | 774 | } |
764 | 775 | StringBuilder sb = new StringBuilder(); |
765 | 776 | try { |
766 | Class.forName("com.levigo.jbig2.JBIG2ImageReader"); | |
767 | } catch (ClassNotFoundException e) { | |
768 | sb.append("JBIG2ImageReader not loaded. jbig2 files will be ignored\n"); | |
769 | sb.append("See https://pdfbox.apache.org/2.0/dependencies.html#jai-image-io\n"); | |
770 | sb.append("for optional dependencies.\n"); | |
771 | } | |
772 | try { | |
773 | 777 | Class.forName("com.github.jaiimageio.impl.plugins.tiff.TIFFImageWriter"); |
774 | 778 | } catch (ClassNotFoundException e) { |
775 | 779 | sb.append("TIFFImageWriter not loaded. tiff files will not be processed\n"); |
135 | 135 | private boolean extractActions = false; |
136 | 136 | |
137 | 137 | private long maxMainMemoryBytes = -1; |
138 | ||
139 | private boolean setKCMS = false; | |
138 | 140 | |
139 | 141 | public PDFParserConfig() { |
140 | 142 | init(this.getClass().getResourceAsStream("PDFParser.properties")); |
214 | 216 | |
215 | 217 | setExtractActions(getBooleanProp(props.getProperty("extractActions"), false)); |
216 | 218 | |
219 | setSetKCMS(getBooleanProp(props.getProperty("setKCMS"), false)); | |
217 | 220 | |
218 | 221 | boolean checkExtractAccessPermission = getBooleanProp(props.getProperty("checkExtractAccessPermission"), false); |
219 | 222 | boolean allowExtractionForAccessibility = getBooleanProp(props.getProperty("allowExtractionForAccessibility"), true); |
685 | 688 | |
686 | 689 | public void setMaxMainMemoryBytes(int maxMainMemoryBytes) { |
687 | 690 | this.maxMainMemoryBytes = maxMainMemoryBytes; |
691 | } | |
692 | ||
693 | /** | |
694 | * <p> | |
695 | * Whether to call <code>System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider")</code>. | |
696 | * KCMS is the unmaintained, legacy provider and is far faster than the newer replacement. | |
697 | * However, there are stability and security risks with using the unmaintained legacy provider. | |
698 | * </p> | |
699 | * <p> | |
700 | * Note, of course, that this is <b>not</b> thread safe. If the value is <code>false</code> | |
701 | * in your first thread, and the second thread changes this to <code>true</code>, | |
702 | * the system property in the first thread will now be <code>true</code>. | |
703 | * </p> | |
704 | * <p> | |
705 | * Default is <code>false</code>. | |
706 | * </p> | |
707 | * @param setKCMS whether or not to set KCMS | |
708 | */ | |
709 | public void setSetKCMS(boolean setKCMS) { | |
710 | this.setKCMS = setKCMS; | |
711 | } | |
712 | ||
713 | public boolean getSetKCMS() { | |
714 | return setKCMS; | |
688 | 715 | } |
689 | 716 | |
690 | 717 | private ImageType parseImageType(String ocrImageType) { |
20 | 20 | import java.io.BufferedInputStream; |
21 | 21 | import java.io.IOException; |
22 | 22 | import java.io.InputStream; |
23 | import java.util.Collections; | |
24 | import java.util.HashMap; | |
25 | import java.util.HashSet; | |
26 | import java.util.Map; | |
23 | 27 | import java.util.Set; |
24 | 28 | |
25 | 29 | import org.apache.commons.compress.MemoryLimitException; |
74 | 78 | private static final MediaType ZLIB = MediaType.application("zlib"); |
75 | 79 | private static final MediaType LZMA = MediaType.application("x-lzma"); |
76 | 80 | private static final MediaType LZ4_FRAMED = MediaType.application("x-lz4"); |
77 | ||
78 | private static final Set<MediaType> SUPPORTED_TYPES = | |
79 | MediaType.set(BZIP, BZIP2, GZIP, GZIP_ALT, LZ4_FRAMED, COMPRESS, | |
80 | XZ, PACK, SNAPPY_FRAMED, ZLIB, LZMA); | |
81 | private static final MediaType ZSTD = MediaType.application("zstd"); | |
82 | private static final MediaType DEFLATE64= MediaType.application("deflate64"); | |
83 | ||
84 | private static Set<MediaType> SUPPORTED_TYPES; | |
85 | private static Map<String, String> MIMES_TO_NAME; | |
86 | ||
87 | static { | |
88 | Set<MediaType> TMP_SET = new HashSet<>(); | |
89 | TMP_SET.addAll( | |
90 | MediaType.set(BZIP, BZIP2, DEFLATE64, GZIP, GZIP_ALT, LZ4_FRAMED, COMPRESS, | |
91 | XZ, PACK, SNAPPY_FRAMED, ZLIB, LZMA)); | |
92 | try { | |
93 | Class.forName("org.brotli.dec.BrotliInputStream"); | |
94 | TMP_SET.add(BROTLI); | |
95 | } catch (NoClassDefFoundError|ClassNotFoundException e) { | |
96 | //swallow | |
97 | } | |
98 | try { | |
99 | Class.forName("com.github.luben.zstd.ZstdInputStream"); | |
100 | TMP_SET.add(ZSTD); | |
101 | } catch (NoClassDefFoundError|ClassNotFoundException e) { | |
102 | //swallow | |
103 | } | |
104 | SUPPORTED_TYPES = Collections.unmodifiableSet(TMP_SET); | |
105 | } | |
106 | ||
107 | static { | |
108 | //map the mime type strings to the compressor stream names | |
109 | Map<String, String> tmpMimesToName = new HashMap<>(); | |
110 | tmpMimesToName.put(BZIP2.toString(), CompressorStreamFactory.BZIP2); | |
111 | tmpMimesToName.put(GZIP.toString(), CompressorStreamFactory.GZIP); | |
112 | tmpMimesToName.put(LZ4_FRAMED.toString(), CompressorStreamFactory.LZ4_FRAMED); | |
113 | tmpMimesToName.put(LZ4_BLOCK.toString(), CompressorStreamFactory.LZ4_BLOCK); | |
114 | tmpMimesToName.put(XZ.toString(), CompressorStreamFactory.XZ); | |
115 | tmpMimesToName.put(PACK.toString(), CompressorStreamFactory.PACK200); | |
116 | tmpMimesToName.put(SNAPPY_FRAMED.toString(), CompressorStreamFactory.SNAPPY_FRAMED); | |
117 | tmpMimesToName.put(ZLIB.toString(), CompressorStreamFactory.DEFLATE); | |
118 | tmpMimesToName.put(COMPRESS.toString(), CompressorStreamFactory.Z); | |
119 | tmpMimesToName.put(LZMA.toString(), CompressorStreamFactory.LZMA); | |
120 | tmpMimesToName.put(BROTLI.toString(), CompressorStreamFactory.BROTLI); | |
121 | tmpMimesToName.put(ZSTD.toString(), CompressorStreamFactory.ZSTANDARD); | |
122 | MIMES_TO_NAME = Collections.unmodifiableMap(tmpMimesToName); | |
123 | } | |
124 | ||
81 | 125 | |
82 | 126 | private int memoryLimitInKb = 100000;//100MB |
83 | 127 | |
140 | 184 | return SNAPPY_RAW; |
141 | 185 | } else if (CompressorStreamFactory.LZMA.equals(name)) { |
142 | 186 | return LZMA; |
187 | } else if (CompressorStreamFactory.ZSTANDARD.equals(name)) { | |
188 | return ZSTD; | |
189 | } else if (CompressorStreamFactory.DEFLATE64.equals(name)) { | |
190 | return DEFLATE64; | |
143 | 191 | } else { |
144 | 192 | return MediaType.OCTET_STREAM; |
145 | 193 | } |
174 | 222 | }); |
175 | 223 | CompressorStreamFactory factory = |
176 | 224 | new CompressorStreamFactory(options.decompressConcatenated(metadata), memoryLimitInKb); |
177 | cis = factory.createCompressorInputStream(stream); | |
225 | //if we've already identified it via autodetect | |
226 | //trust that and go with the appropriate name | |
227 | //to avoid calling CompressorStreamFactory.detect() twice | |
228 | String name = getStreamName(metadata); | |
229 | if (name != null) { | |
230 | cis = factory.createCompressorInputStream(name, stream); | |
231 | } else { | |
232 | cis = factory.createCompressorInputStream(stream); | |
233 | MediaType type = getMediaType(cis); | |
234 | if (!type.equals(MediaType.OCTET_STREAM)) { | |
235 | metadata.set(CONTENT_TYPE, type.toString()); | |
236 | } | |
237 | } | |
178 | 238 | } catch (CompressorException e) { |
179 | 239 | if (e.getCause() != null && e.getCause() instanceof MemoryLimitException) { |
180 | 240 | throw new TikaMemoryLimitException(e.getMessage()); |
182 | 242 | throw new TikaException("Unable to uncompress document stream", e); |
183 | 243 | } |
184 | 244 | |
185 | MediaType type = getMediaType(cis); | |
186 | if (!type.equals(MediaType.OCTET_STREAM)) { | |
187 | metadata.set(CONTENT_TYPE, type.toString()); | |
188 | } | |
189 | 245 | |
190 | 246 | XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); |
191 | 247 | xhtml.startDocument(); |
208 | 264 | name = name.substring(0, name.length() - 5); |
209 | 265 | } else if (name.endsWith(".pack")) { |
210 | 266 | name = name.substring(0, name.length() - 5); |
267 | } else if (name.endsWith(".br")) { | |
268 | name = name.substring(0, name.length() - 3); | |
211 | 269 | } else if (name.length() > 0) { |
212 | 270 | name = GzipUtils.getUncompressedFilename(name); |
213 | 271 | } |
227 | 285 | xhtml.endDocument(); |
228 | 286 | } |
229 | 287 | |
288 | /** | |
289 | * @param metadata | |
290 | * @return CompressorStream name based on the content-type value | |
291 | * in metadata or <code>null</code> if not found | |
292 | * in the mapping. | 
293 | */ | |
294 | private String getStreamName(Metadata metadata) { | |
295 | String mimeString = metadata.get(Metadata.CONTENT_TYPE); | |
296 | if (mimeString == null) { | |
297 | return null; | |
298 | } | |
299 | return MIMES_TO_NAME.get(mimeString); | |
300 | } | |
301 | ||
230 | 302 | @Field |
231 | 303 | public void setMemoryLimitInKb(int memoryLimitInKb) { |
232 | 304 | this.memoryLimitInKb = memoryLimitInKb; |
251 | 251 | } |
252 | 252 | |
253 | 253 | SevenZFile sevenz; |
254 | if (password == null) { | |
255 | sevenz = new SevenZFile(tstream.getFile()); | |
256 | } else { | |
257 | sevenz = new SevenZFile(tstream.getFile(), password.getBytes("UnicodeLittleUnmarked")); | |
254 | try{ | |
255 | if (password == null) { | |
256 | sevenz = new SevenZFile(tstream.getFile()); | |
257 | } else { | |
258 | sevenz = new SevenZFile(tstream.getFile(), password.getBytes("UnicodeLittleUnmarked")); | |
259 | } | |
260 | }catch(PasswordRequiredException e){ | |
261 | throw new EncryptedDocumentException(e); | |
258 | 262 | } |
259 | 263 | |
260 | 264 | // Pending a fix for COMPRESS-269 / TIKA-1525, this bit is a little nasty |
20 | 20 | import java.io.ByteArrayInputStream; |
21 | 21 | import java.io.IOException; |
22 | 22 | import java.io.InputStream; |
23 | import java.nio.charset.StandardCharsets; | |
23 | 24 | import java.util.Enumeration; |
24 | 25 | import java.util.HashSet; |
25 | 26 | import java.util.Iterator; |
55 | 56 | * formats to figure out exactly what the file is. |
56 | 57 | */ |
57 | 58 | public class ZipContainerDetector implements Detector { |
59 | ||
60 | //Regrettably, some tiff files can be incorrectly identified | |
61 | //as tar files. We need this ugly workaround to rule out TIFF. | |
62 | //If commons-compress ever chooses to take over TIFF detection | |
63 | //we can remove all of this. See TIKA-2591. | |
64 | private final static MediaType TIFF = MediaType.image("tiff"); | |
65 | private final static byte[][] TIFF_SIGNATURES = new byte[3][]; | |
66 | static { | |
67 | TIFF_SIGNATURES[0] = new byte[]{'M','M',0x00,0x2a}; | |
68 | TIFF_SIGNATURES[1] = new byte[]{'I','I',0x2a, 0x00}; | |
69 | TIFF_SIGNATURES[2] = new byte[]{'M','M', 0x00, 0x2b}; | |
70 | } | |
71 | ||
58 | 72 | private static final Pattern MACRO_TEMPLATE_PATTERN = Pattern.compile("macroenabledtemplate$", Pattern.CASE_INSENSITIVE); |
59 | 73 | |
60 | 74 | // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes |
63 | 77 | // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes |
64 | 78 | private static final String STRICT_CORE_DOCUMENT = |
65 | 79 | "http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument"; |
80 | ||
81 | private static final String XPS_DOCUMENT = | |
82 | "http://schemas.microsoft.com/xps/2005/06/fixedrepresentation"; | |
66 | 83 | |
67 | 84 | /** Serial version UID */ |
68 | 85 | private static final long serialVersionUID = 2891763938430295453L; |
82 | 99 | int length = tis.peek(prefix); |
83 | 100 | |
84 | 101 | MediaType type = detectArchiveFormat(prefix, length); |
85 | if (PackageParser.isZipArchive(type) | |
86 | && TikaInputStream.isTikaInputStream(input)) { | |
102 | ||
103 | if (type == TIFF) { | |
104 | return TIFF; | |
105 | } else if (PackageParser.isZipArchive(type) | |
106 | && TikaInputStream.isTikaInputStream(input)) { | |
87 | 107 | return detectZipFormat(tis); |
88 | 108 | } else if (!type.equals(MediaType.OCTET_STREAM)) { |
89 | 109 | return type; |
108 | 128 | } |
109 | 129 | } |
110 | 130 | |
131 | private static boolean isTiff(byte[] prefix) { | |
132 | for (byte[] sig : TIFF_SIGNATURES) { | |
133 | if(arrayStartWith(sig, prefix)) { | |
134 | return true; | |
135 | } | |
136 | } | |
137 | return false; | |
138 | } | |
139 | ||
140 | private static boolean arrayStartWith(byte[] needle, byte[] haystack) { | |
141 | if (haystack.length < needle.length) { | |
142 | return false; | |
143 | } | |
144 | for (int i = 0; i < needle.length; i++) { | |
145 | if (haystack[i] != needle[i]) { | |
146 | return false; | |
147 | } | |
148 | } | |
149 | return true; | |
150 | } | |
151 | ||
111 | 152 | private static MediaType detectArchiveFormat(byte[] prefix, int length) { |
153 | if (isTiff(prefix)) { | |
154 | return TIFF; | |
155 | } | |
112 | 156 | try { |
113 | 157 | String name = ArchiveStreamFactory.detect(new ByteArrayInputStream(prefix, 0, length)); |
114 | 158 | return PackageParser.getMediaType(name); |
212 | 256 | return null; |
213 | 257 | } catch (IOException e) { |
214 | 258 | return null; |
259 | } catch (SecurityException e) { | |
260 | //TIKA-2571 | |
261 | throw e; | |
215 | 262 | } catch (RuntimeException e) { |
216 | 263 | return null; |
217 | 264 | } catch (InvalidFormatException e) { |
244 | 291 | PackagePart corePart = pkg.getPart(core.getRelationship(0)); |
245 | 292 | String coreType = corePart.getContentType(); |
246 | 293 | |
294 | if (coreType.contains(".xps")) { | |
295 | return MediaType.application("vnd.ms-package.xps"); | |
296 | } | |
247 | 297 | // Turn that into the type of the overall document |
248 | 298 | String docType = coreType.substring(0, coreType.lastIndexOf('.')); |
249 | 299 | |
262 | 312 | /** |
263 | 313 | * Detects Open XML Paper Specification (XPS) |
264 | 314 | */ |
265 | private static MediaType detectXPSOPC(OPCPackage pkg) { | |
315 | public static MediaType detectXPSOPC(OPCPackage pkg) { | |
266 | 316 | PackageRelationshipCollection xps = |
267 | 317 | pkg.getRelationshipsByType("http://schemas.microsoft.com/xps/2005/06/fixedrepresentation"); |
268 | 318 | if (xps.size() == 1) { |
+2
-2
45 | 45 | import org.apache.tika.parser.ParseContext; |
46 | 46 | import org.apache.tika.parser.recognition.ObjectRecogniser; |
47 | 47 | import org.apache.tika.parser.recognition.RecognisedObject; |
48 | import org.json.JSONArray; | |
49 | import org.json.JSONObject; | |
48 | import com.github.openjson.JSONArray; | |
49 | import com.github.openjson.JSONObject; | |
50 | 50 | import org.slf4j.Logger; |
51 | 51 | import org.slf4j.LoggerFactory; |
52 | 52 | import org.xml.sax.ContentHandler; |
+2
-2
46 | 46 | import org.apache.tika.mime.MimeTypeException; |
47 | 47 | import org.apache.tika.parser.ParseContext; |
48 | 48 | import org.apache.tika.parser.recognition.RecognisedObject; |
49 | import org.json.JSONArray; | |
50 | import org.json.JSONObject; | |
49 | import com.github.openjson.JSONArray; | |
50 | import com.github.openjson.JSONObject; | |
51 | 51 | import org.slf4j.Logger; |
52 | 52 | import org.slf4j.LoggerFactory; |
53 | 53 | import org.xml.sax.ContentHandler; |
0 | /* | |
1 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
2 | * contributor license agreements. See the NOTICE file distributed with | |
3 | * this work for additional information regarding copyright ownership. | |
4 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
5 | * (the "License"); you may not use this file except in compliance with | |
6 | * the License. You may obtain a copy of the License at | |
7 | * | |
8 | * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | * | |
10 | * Unless required by applicable law or agreed to in writing, software | |
11 | * distributed under the License is distributed on an "AS IS" BASIS, | |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
13 | * See the License for the specific language governing permissions and | |
14 | * limitations under the License. | |
15 | */ | |
16 | ||
17 | package org.apache.tika.parser.utils; | |
18 | ||
19 | import org.apache.tika.mime.MediaType; | |
20 | ||
21 | import java.io.ByteArrayInputStream; | |
22 | import java.io.InputStream; | |
23 | import java.util.Arrays; | |
24 | import java.util.Objects; | |
25 | ||
26 | public class DataURIScheme { | |
27 | ||
28 | ||
29 | private final String rawMediaTypeString; | |
30 | private final boolean isBase64; | |
31 | private final byte[] data; | |
32 | ||
33 | DataURIScheme(String mediaTypeString, boolean isBase64, byte[] data) { | |
34 | this.rawMediaTypeString = mediaTypeString; | |
35 | this.isBase64 = isBase64; | |
36 | this.data = data; | |
37 | } | |
38 | ||
39 | public InputStream getInputStream() { | |
40 | return new ByteArrayInputStream(data); | |
41 | } | |
42 | ||
43 | /** | |
44 | * | |
45 | * @return parsed media type or <code>null</code> if parse fails or if media type string was | |
46 | * not specified | |
47 | */ | |
48 | public MediaType getMediaType() { | |
49 | if (rawMediaTypeString != null) { | |
50 | return MediaType.parse(rawMediaTypeString); | |
51 | } | |
52 | return null; | |
53 | } | |
54 | ||
55 | public boolean isBase64() { | |
56 | return isBase64; | |
57 | } | |
58 | ||
59 | @Override | |
60 | public boolean equals(Object o) { | |
61 | if (this == o) return true; | |
62 | if (!(o instanceof DataURIScheme)) return false; | |
63 | DataURIScheme that = (DataURIScheme) o; | |
64 | return isBase64() == that.isBase64() && | |
65 | Objects.equals(rawMediaTypeString, that.rawMediaTypeString) && | |
66 | Arrays.equals(data, that.data); | |
67 | } | |
68 | ||
69 | @Override | |
70 | public int hashCode() { | |
71 | ||
72 | int result = Objects.hash(rawMediaTypeString, isBase64()); | |
73 | result = 31 * result + Arrays.hashCode(data); | |
74 | return result; | |
75 | } | |
76 | } |
+28
-0
0 | /* | |
1 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
2 | * contributor license agreements. See the NOTICE file distributed with | |
3 | * this work for additional information regarding copyright ownership. | |
4 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
5 | * (the "License"); you may not use this file except in compliance with | |
6 | * the License. You may obtain a copy of the License at | |
7 | * | |
8 | * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | * | |
10 | * Unless required by applicable law or agreed to in writing, software | |
11 | * distributed under the License is distributed on an "AS IS" BASIS, | |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
13 | * See the License for the specific language governing permissions and | |
14 | * limitations under the License. | |
15 | */ | |
16 | ||
17 | package org.apache.tika.parser.utils; | |
18 | ||
19 | import org.apache.tika.exception.TikaException; | |
20 | ||
/**
 * Thrown when a string cannot be parsed as an RFC 2397 data: URI
 * (i.e. it does not match the expected data:[mediatype][;base64],data pattern).
 */
public class DataURISchemeParseException extends TikaException {

    public DataURISchemeParseException(String msg) {
        super(msg);
    }

}
0 | /* | |
1 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
2 | * contributor license agreements. See the NOTICE file distributed with | |
3 | * this work for additional information regarding copyright ownership. | |
4 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
5 | * (the "License"); you may not use this file except in compliance with | |
6 | * the License. You may obtain a copy of the License at | |
7 | * | |
8 | * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | * | |
10 | * Unless required by applicable law or agreed to in writing, software | |
11 | * distributed under the License is distributed on an "AS IS" BASIS, | |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
13 | * See the License for the specific language governing permissions and | |
14 | * limitations under the License. | |
15 | */ | |
16 | ||
17 | package org.apache.tika.parser.utils; | |
18 | ||
19 | import org.apache.commons.codec.binary.Base64; | |
20 | import org.apache.tika.mime.MediaType; | |
21 | ||
22 | import java.nio.charset.Charset; | |
23 | import java.nio.charset.IllegalCharsetNameException; | |
24 | import java.nio.charset.StandardCharsets; | |
25 | import java.util.ArrayList; | |
26 | import java.util.Collections; | |
27 | import java.util.List; | |
28 | import java.util.regex.Matcher; | |
29 | import java.util.regex.Pattern; | |
30 | ||
31 | /** | |
32 | * Not thread safe. Create a separate util for each thread. | |
33 | */ | |
34 | public class DataURISchemeUtil { | |
35 | ||
36 | public static String UNSPECIFIED_MEDIA_TYPE = "text/plain;charset=US-ASCII"; | |
37 | ||
38 | private static Pattern PARSE_PATTERN = Pattern.compile("(?s)data:([^,]*?)(base64)?,(.*)$"); | |
39 | private static Pattern EXTRACT_PATTERN = | |
40 | Pattern.compile("(?s)data:([^,]*?)(base64)?,([^\"\']*)[\"\']"); | |
41 | private final Matcher parseMatcher = PARSE_PATTERN.matcher(""); | |
42 | private final Matcher extractMatcher = EXTRACT_PATTERN.matcher(""); | |
43 | Base64 base64 = new Base64(); | |
44 | ||
45 | public DataURIScheme parse(String string) throws DataURISchemeParseException { | |
46 | parseMatcher.reset(string); | |
47 | if (parseMatcher.find()) { | |
48 | return build(parseMatcher.group(1), parseMatcher.group(2), parseMatcher.group(3)); | |
49 | } | |
50 | throw new DataURISchemeParseException("Couldn't find expected pattern"); | |
51 | } | |
52 | ||
53 | private DataURIScheme build(String mediaTypeString, String isBase64, String dataString) { | |
54 | byte[] data = null; | |
55 | //strip out back slashes as you might have in css | |
56 | dataString = (dataString != null) ? | |
57 | dataString.replaceAll("\\\\", " ") : dataString; | |
58 | ||
59 | if (dataString == null || dataString.length() == 0) { | |
60 | data = new byte[0]; | |
61 | } else if (isBase64 != null) { | |
62 | data = base64.decode(dataString); | |
63 | } else { | |
64 | //TODO: handle encodings | |
65 | MediaType mediaType = MediaType.parse(mediaTypeString); | |
66 | Charset charset = StandardCharsets.UTF_8; | |
67 | if (mediaType.hasParameters()) { | |
68 | String charsetName = mediaType.getParameters().get("charset"); | |
69 | if (charsetName != null && Charset.isSupported(charsetName)) { | |
70 | try { | |
71 | charset = Charset.forName(charsetName); | |
72 | } catch (IllegalCharsetNameException e) { | |
73 | //swallow and default to UTF-8 | |
74 | } | |
75 | } | |
76 | } | |
77 | data = dataString.getBytes(charset); | |
78 | } | |
79 | return new DataURIScheme(mediaTypeString, (isBase64 != null), data); | |
80 | } | |
81 | ||
82 | /** | |
83 | * Extracts DataURISchemes from free text, as in javascript. | |
84 | * | |
85 | * @param string | |
86 | * @return list of extracted DataURISchemes | |
87 | */ | |
88 | public List<DataURIScheme> extract(String string) { | |
89 | extractMatcher.reset(string); | |
90 | List<DataURIScheme> list = null; | |
91 | while (extractMatcher.find()) { | |
92 | DataURIScheme dataURIScheme = build(extractMatcher.group(1), | |
93 | extractMatcher.group(2), extractMatcher.group(3)); | |
94 | if (list == null) { | |
95 | list = new ArrayList<>(); | |
96 | } | |
97 | list.add(dataURIScheme); | |
98 | } | |
99 | return (list == null) ? Collections.EMPTY_LIST : list; | |
100 | } | |
101 | ||
102 | } |
+139
-0
0 | # Licensed to the Apache Software Foundation (ASF) under one or more | |
1 | # contributor license agreements. See the NOTICE file distributed with | |
2 | # this work for additional information regarding copyright ownership. | |
3 | # The ASF licenses this file to You under the Apache License, Version 2.0 | |
4 | # (the "License"); you may not use this file except in compliance with | |
5 | # the License. You may obtain a copy of the License at | |
6 | # | |
7 | # http://www.apache.org/licenses/LICENSE-2.0 | |
8 | # | |
9 | # Unless required by applicable law or agreed to in writing, software | |
10 | # distributed under the License is distributed on an "AS IS" BASIS, | |
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
12 | # See the License for the specific language governing permissions and | |
13 | # limitations under the License. | |
14 | 646 | |
15 | 737 | |
16 | 775 | |
17 | 813 | |
18 | 819 | |
19 | 858 | |
20 | 874 | |
21 | 8859_1 | |
22 | 8859_13 | |
23 | 8859_15 | |
24 | 8859_2 | |
25 | 8859_4 | |
26 | 8859_5 | |
27 | 8859_7 | |
28 | 8859_9 | |
29 | 912 | |
30 | 914 | |
31 | 915 | |
32 | 920 | |
33 | 923 | |
34 | ansi-1251 | |
35 | ascii | |
36 | ascii7 | |
37 | cesu8 | |
38 | cp1250 | |
39 | cp1251 | |
40 | cp1252 | |
41 | cp1253 | |
42 | cp1254 | |
43 | cp1257 | |
44 | cp5346 | |
45 | cp5347 | |
46 | cp5348 | |
47 | cp5349 | |
48 | cp5350 | |
49 | cp5353 | |
50 | cp737 | |
51 | cp813 | |
52 | cp858 | |
53 | cp874 | |
54 | cp912 | |
55 | cp914 | |
56 | cp915 | |
57 | cp920 | |
58 | cp923 | |
59 | csibm862 | |
60 | csisolatin0 | |
61 | csisolatin9 | |
62 | cspcp855 | |
63 | default | |
64 | ibm-437 | |
65 | ibm-737 | |
66 | ibm-775 | |
67 | ibm-813 | |
68 | ibm-819 | |
69 | ibm-850 | |
70 | ibm-852 | |
71 | ibm-855 | |
72 | ibm-857 | |
73 | ibm-862 | |
74 | ibm-866 | |
75 | ibm-874 | |
76 | ibm-912 | |
77 | ibm-914 | |
78 | ibm-915 | |
79 | ibm-920 | |
80 | ibm-923 | |
81 | ibm737 | |
82 | ibm813 | |
83 | ibm874 | |
84 | ibm912 | |
85 | ibm914 | |
86 | ibm915 | |
87 | ibm920 | |
88 | ibm923 | |
89 | iso8859-1 | |
90 | iso8859-13 | |
91 | iso8859-15 | |
92 | iso8859-2 | |
93 | iso8859-4 | |
94 | iso8859-5 | |
95 | iso8859-7 | |
96 | iso8859-9 | |
97 | iso8859_1 | |
98 | iso8859_13 | |
99 | iso8859_15 | |
100 | iso8859_15_fdis | |
101 | iso8859_2 | |
102 | iso8859_4 | |
103 | iso8859_5 | |
104 | iso8859_7 | |
105 | iso8859_9 | |
106 | iso_8859-13 | |
107 | iso_8859_1 | |
108 | koi8 | |
109 | koi8_r | |
110 | koi8_u | |
111 | l9 | |
112 | latin0 | |
113 | latin9 | |
114 | sun_eu_greek | |
115 | unicode | |
116 | unicode-1-1-utf-8 | |
117 | unicodebig | |
118 | unicodebigunmarked | |
119 | unicodelittle | |
120 | unicodelittleunmarked | |
121 | utf-32be-bom | |
122 | utf-32le-bom | |
123 | utf16 | |
124 | utf32 | |
125 | utf8 | |
126 | utf_16 | |
127 | utf_16be | |
128 | utf_16le | |
129 | utf_32 | |
130 | utf_32be | |
131 | utf_32be_bom | |
132 | utf_32le | |
133 | utf_32le_bom | |
134 | windows-437 | |
135 | x-utf-16be | |
136 | x-utf-16le | |
137 | x-utf-32be | |
138 | x-utf-32le⏎ |
36 | 36 | ocrImageScale 2.0 |
37 | 37 | # Use up to 500MB when loading a pdf into a PDDocument |
38 | 38 | maxMainMemoryBytes 524288000 |
39 | #whether or not to set KCMS for faster (but legacy/unsupported) image rendering | |
40 | setKCMS false |
378 | 378 | // For spanned zip files, the .zip file doesn't have the header, it's the other parts |
379 | 379 | assertTypeByData("application/octet-stream", "test-documents-spanned.zip"); |
380 | 380 | assertTypeByData("application/zip", "test-documents-spanned.z01"); |
381 | ||
382 | assertTypeDetection("testZSTD.zstd", "application/zstd"); | |
381 | 383 | } |
382 | 384 | |
383 | 385 | @Test |
896 | 898 | // MBOX |
897 | 899 | assertTypeDetection("headers.mbox", "application/mbox"); |
898 | 900 | |
899 | // Thunderbird - doesn't currently work by name | |
900 | assertTypeByNameAndData("message/rfc822", "testThunderbirdEml.eml"); | |
901 | } | |
902 | ||
901 | // Thunderbird | |
902 | assertTypeDetection("testThunderbirdEml.eml", "message/rfc822"); | |
903 | ||
904 | //dkim header | |
905 | assertTypeDetection("testThunderbirdEml.eml", "message/rfc822"); | |
906 | ||
907 | //x- custom header | |
908 | assertTypeDetection("testRFC822_x-.eml", "message/rfc822"); | |
909 | ||
910 | //embedded xhtml and img | |
911 | assertTypeDetection("testEML_embedded_xhtml_and_img.eml", "message/rfc822"); | |
912 | ||
913 | } | |
914 | ||
    @Test
    public void testMessageNews() throws Exception {
        //content-based detection of a usenet-style news message
        assertTypeByData("message/news", "testMessageNews.txt");
    }
903 | 919 | @Test |
904 | 920 | public void testAxCrypt() throws Exception { |
905 | 921 | // test-TXT.txt encrypted with a key of "tika" |
28 | 28 | import javax.xml.transform.sax.TransformerHandler; |
29 | 29 | import javax.xml.transform.stream.StreamResult; |
30 | 30 | import java.io.ByteArrayInputStream; |
31 | import java.io.ByteArrayOutputStream; | |
31 | 32 | import java.io.File; |
32 | 33 | import java.io.IOException; |
33 | 34 | import java.io.InputStream; |
38 | 39 | import java.nio.file.Path; |
39 | 40 | import java.nio.file.Paths; |
40 | 41 | import java.util.ArrayList; |
42 | import java.util.Arrays; | |
41 | 43 | import java.util.HashMap; |
42 | 44 | import java.util.List; |
43 | 45 | import java.util.Map; |
51 | 53 | import java.util.concurrent.Future; |
52 | 54 | import java.util.regex.Pattern; |
53 | 55 | |
56 | import org.apache.commons.codec.binary.Base64; | |
54 | 57 | import org.apache.tika.Tika; |
55 | 58 | import org.apache.tika.TikaTest; |
56 | 59 | import org.apache.tika.config.ServiceLoader; |
58 | 61 | import org.apache.tika.detect.AutoDetectReader; |
59 | 62 | import org.apache.tika.detect.EncodingDetector; |
60 | 63 | import org.apache.tika.exception.TikaException; |
64 | import org.apache.tika.io.IOUtils; | |
61 | 65 | import org.apache.tika.io.TikaInputStream; |
62 | 66 | import org.apache.tika.metadata.Geographic; |
63 | 67 | import org.apache.tika.metadata.Metadata; |
1264 | 1268 | } |
1265 | 1269 | |
    @Test
    public void testDataURI() throws Exception {
        //TIKA-2563: HTML with an embedded data: URI image should yield two
        //documents: the html itself plus the decoded embedded image
        List<Metadata> metadataList = getRecursiveMetadata("testHTML_embedded_img.html");
        assertEquals(2, metadataList.size());
        String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
        assertContains("some content", content);
        //make sure that you've truncated the data: value
        assertContains("src=\"data:\"", content);
        Metadata imgMetadata = metadataList.get(1);
        assertEquals("image/jpeg", imgMetadata.get(Metadata.CONTENT_TYPE));
        assertContains("moscow-birds",
                Arrays.asList(imgMetadata.getValues(Metadata.SUBJECT)));
    }
1283 | ||
    @Test
    public void testDataURIInJS() throws Exception {
        //NOTE(review): the custom tika-config presumably enables data: URI
        //extraction from javascript -- confirm against the config file
        InputStream is = getClass().getResourceAsStream("/org/apache/tika/parser/html/tika-config.xml");
        assertNotNull(is);
        TikaConfig tikaConfig = new TikaConfig(is);
        Parser p = new AutoDetectParser(tikaConfig);
        List<Metadata> metadataList = getRecursiveMetadata("testHTML_embedded_img_in_js.html", p);
        assertEquals(3, metadataList.size());
        String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
        assertContains("some content", content);
        Metadata imgMetadata = metadataList.get(1);
        assertEquals("image/jpeg", imgMetadata.get(Metadata.CONTENT_TYPE));
        assertContains("moscow-birds",
                Arrays.asList(imgMetadata.getValues(Metadata.SUBJECT)));
    }
1299 | ||
1300 | @Test | |
1267 | 1301 | public void testMultiThreadingEncodingDetection() throws Exception { |
1268 | 1302 | List<EncodingDetector> detectors = new ArrayList<>(); |
1269 | 1303 | ServiceLoader loader = |
1350 | 1384 | } |
1351 | 1385 | } |
1352 | 1386 | } |
1387 | ||
    @Test
    public void testCharsetsNotSupportedByIANA() throws Exception {
        //TIKA-2592: charsets declared in meta headers that IANA doesn't
        //support should not derail charset detection
        assertContains("This is a sample text",
                getXML("testHTML_charset_utf8.html").xml);

        assertContains("This is a sample text",
                getXML("testHTML_charset_utf16le.html").xml);

    }
1353 | 1397 | } |
259 | 259 | metadata.get(Metadata.SUBJECT)); |
260 | 260 | } |
261 | 261 | |
    @Test
    public void testMainBody() throws Exception {
        //test that the first text or html chunk is processed in the main body
        //not treated as an attachment. TIKA-2547
        List<Metadata> metadataList = getRecursiveMetadata("testRFC822_oddfrom");
        assertEquals(7, metadataList.size());
        assertContains("Air Quality Planning", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));

        //Make sure text alternative doesn't get treated as an attachment
        metadataList = getRecursiveMetadata("testRFC822_normal_zip");
        assertEquals(3, metadataList.size());
        assertContains("This is the HTML part", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
        assertEquals("application/zip", metadataList.get(2).get(Metadata.CONTENT_TYPE));

        //a text-only body should still land in the main document
        metadataList = getRecursiveMetadata("testRFC822-txt-body");
        assertEquals(2, metadataList.size());
        assertContains("body 1", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
    }
280 | ||
262 | 281 | /** |
263 | 282 | * Test for TIKA-640, increase header max beyond 10k bytes |
264 | 283 | */ |
670 | 689 | assertEquals("application/pdf", metadataList.get(1).get(Metadata.CONTENT_TYPE)); |
671 | 690 | assertEquals("/tzora-titan-4-hummer-xl-manual.pdf", metadataList.get(1).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH)); |
672 | 691 | } |
692 | ||
693 | @Test | |
694 | public void testSimpleBodyInlined() throws Exception { | |
695 | List<Metadata> metadataList = getRecursiveMetadata("testRFC822_simple_inline_body.txt"); | |
696 | assertEquals(1, metadataList.size()); | |
697 | assertContains("asked", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT)); | |
698 | } | |
673 | 699 | } |
19 | 19 | import static org.junit.Assert.assertTrue; |
20 | 20 | import static org.junit.Assert.fail; |
21 | 21 | |
22 | import java.io.File; | |
22 | 23 | import java.io.InputStream; |
23 | 24 | import java.text.DecimalFormatSymbols; |
24 | 25 | import java.util.List; |
543 | 544 | getXML("testEXCEL_phonetic.xls", parser).xml); |
544 | 545 | |
545 | 546 | } |
547 | ||
    @Test
    public void testLabelsAreExtracted() throws Exception {
        //label cells in this govdocs file must appear in the extracted XML
        String xml = getXML("testEXCEL_labels-govdocs-515858.xls").xml;
        assertContains("Morocco", xml);
    }
546 | 553 | } |
+62
-1
30 | 30 | import org.apache.tika.parser.ParseContext; |
31 | 31 | import org.apache.tika.parser.RecursiveParserWrapper; |
32 | 32 | import org.apache.tika.sax.BodyContentHandler; |
33 | import org.junit.Ignore; | |
33 | 34 | import org.junit.Test; |
34 | 35 | import org.xml.sax.ContentHandler; |
35 | 36 | |
63 | 64 | assertContains("<p>[1] This is a footnote.", xml); |
64 | 65 | assertContains("<p>This is the header text.</p>", xml); |
65 | 66 | assertContains("<p>This is the footer text.</p>", xml); |
66 | assertContains("<p>Here is a text box</p>", xml); | |
67 | assertContainsCount("<p>Here is a text box</p>", xml, 1); | |
67 | 68 | assertContains("<p>Bold ", xml); |
68 | 69 | assertContains("italic underline superscript subscript", xml); |
69 | 70 | assertContains("underline", xml); |
292 | 293 | public void testEncrypted() throws Exception { |
293 | 294 | getXML("testPPT_protected_passtika.ppt"); |
294 | 295 | } |
296 | ||
    @Test
    public void testGroups() throws Exception {
        //TIKA-2569: text inside grouped shapes must be extracted, exactly once
        List<Metadata> metadataList = getRecursiveMetadata("testPPT_groups.ppt");
        assertEquals(3, metadataList.size());
        String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
        //this tests that we're ignoring text shapes at depth=0
        //i.e. POI has already included them in the slide's getTextParagraphs()
        assertContainsCount("Text box1", content, 1);


        //the WordArt and text box count tests will fail
        //if this content is available via getTextParagraphs() of the slide in POI
        //i.e. when POI is fixed, these tests will fail, and
        //we'll have to remove the workaround in HSLFExtractor's extractGroupText(...)
        assertContainsCount("WordArt1", content, 1);
        assertContainsCount("WordArt2", content, 1);
        assertContainsCount("Ungrouped text box", content, 1);//should only be 1
        assertContains("Text box2", content);
        assertContains("Text box3", content);
        assertContains("Text box4", content);
        assertContains("Text box5", content);

        //see below -- need to extract hyperlinks
        assertContains("tika", content);
        assertContains("MyTitle", content);

        assertEquals("/embedded-1",
                metadataList.get(1).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));

        assertEquals("/embedded-2",
                metadataList.get(2).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));

    }
330 | ||
    @Ignore("until we add smart text extraction")
    @Test
    public void testSmartArtText() throws Exception {
        //SmartArt text is not yet extracted from ppt; enable when supported
        String content = getXML("testPPT_groups.ppt").xml;
        assertContains("smart1", content);
    }
337 | ||
    @Ignore("until we fix hyperlink extraction from text boxes")
    @Test
    public void testHyperlinksInTextBoxes() throws Exception {
        //hyperlinks inside ppt text boxes are not yet emitted as anchors
        String content = getXML("testPPT_groups.ppt").xml;
        assertContains("href=\"http://tika.apache.org", content);
    }
344 | ||
    @Test
    public void testEmbeddedXLSInOLEObject() throws Exception {
        //TIKA-2588: xlsx embedded in an OLE object inside a ppt must be extracted
        List<Metadata> metadataList = getRecursiveMetadata("testPPT_oleWorkbook.ppt");
        assertEquals(3, metadataList.size());
        Metadata xlsx = metadataList.get(1);
        assertContains("<h1>Sheet1</h1>", xlsx.get(RecursiveParserWrapper.TIKA_CONTENT));
        assertContains("<td>1</td>", xlsx.get(RecursiveParserWrapper.TIKA_CONTENT));
        assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                xlsx.get(Metadata.CONTENT_TYPE));

    }
295 | 356 | } |
+37
-0
1740 | 1740 | } |
1741 | 1741 | |
    @Test
    public void testPPTXGroups() throws Exception {
        //TIKA-2569: text inside grouped shapes in pptx must be extracted, once
        List<Metadata> metadataList = getRecursiveMetadata("testPPT_groups.pptx");
        assertEquals(3, metadataList.size());
        String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
        assertContains("WordArt1", content);
        assertContains("WordArt2", content);
        assertContainsCount("Ungrouped text box", content, 1);//should only be 1
        assertContains("Text box1", content);
        assertContains("Text box2", content);
        assertContains("Text box3", content);
        assertContains("Text box4", content);
        assertContains("Text box5", content);


        assertContains("href=\"http://tika.apache.org", content);
        assertContains("smart1", content);
        assertContains("MyTitle", content);

        assertEquals("/image1.jpg",
                metadataList.get(1).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));

        assertEquals("/thumbnail.jpeg",
                metadataList.get(2).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
    }
1767 | ||
1768 | @Test | |
1743 | 1769 | public void testXLSXPhoneticStrings() throws Exception { |
1744 | 1770 | //This unit test and test file come from Apache POI 51519.xlsx |
1745 | 1771 | |
1789 | 1815 | assertEquals("image/png", metadataList.get(2).get(Metadata.CONTENT_TYPE)); |
1790 | 1816 | assertEquals("image/jpeg", metadataList.get(3).get(Metadata.CONTENT_TYPE)); |
1791 | 1817 | } |
1818 | ||
    @Test
    public void testEmbeddedXLSInOLEObject() throws Exception {
        //TIKA-2588: xlsx embedded in an OLE object inside a pptx must be extracted
        List<Metadata> metadataList = getRecursiveMetadata("testPPT_oleWorkbook.pptx");
        assertEquals(4, metadataList.size());
        Metadata xlsx = metadataList.get(2);
        assertContains("<h1>Sheet1</h1>", xlsx.get(RecursiveParserWrapper.TIKA_CONTENT));
        assertContains("<td>1</td>", xlsx.get(RecursiveParserWrapper.TIKA_CONTENT));
        assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                xlsx.get(Metadata.CONTENT_TYPE));
    }
1792 | 1829 | } |
1793 | 1830 | |
1794 | 1831 |
+27
-0
590 | 590 | assertEquals("image/jpeg", metadataList.get(3).get(Metadata.CONTENT_TYPE)); |
591 | 591 | |
592 | 592 | } |
593 | ||
    @Test
    public void testPPTXGroups() throws Exception {
        //TIKA-2569: same grouped-shape expectations as the default parser,
        //run with this test class's parseContext
        List<Metadata> metadataList = getRecursiveMetadata("testPPT_groups.pptx", parseContext);
        assertEquals(3, metadataList.size());
        String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
        assertContains("WordArt1", content);
        assertContains("WordArt2", content);
        assertContainsCount("Ungrouped text box", content, 1);//should only be 1
        assertContains("Text box1", content);
        assertContains("Text box2", content);
        assertContains("Text box3", content);
        assertContains("Text box4", content);
        assertContains("Text box5", content);


        assertContains("href=\"http://tika.apache.org", content);
        assertContains("smart1", content);
        assertContains("MyTitle", content);

        assertEquals("/image1.jpg",
                metadataList.get(1).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));

        assertEquals("/thumbnail.jpeg",
                metadataList.get(2).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
    }
619 | ||
593 | 620 | } |
+97
-0
0 | /* | |
1 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
2 | * contributor license agreements. See the NOTICE file distributed with | |
3 | * this work for additional information regarding copyright ownership. | |
4 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
5 | * (the "License"); you may not use this file except in compliance with | |
6 | * the License. You may obtain a copy of the License at | |
7 | * | |
8 | * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | * | |
10 | * Unless required by applicable law or agreed to in writing, software | |
11 | * distributed under the License is distributed on an "AS IS" BASIS, | |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
13 | * See the License for the specific language governing permissions and | |
14 | * limitations under the License. | |
15 | */ | |
16 | package org.apache.tika.parser.microsoft.ooxml.xps; | |
17 | ||
18 | import org.apache.tika.TikaTest; | |
19 | import org.apache.tika.metadata.Metadata; | |
20 | import org.apache.tika.metadata.TikaCoreProperties; | |
21 | import org.apache.tika.parser.RecursiveParserWrapper; | |
22 | import org.junit.Test; | |
23 | ||
24 | import java.util.List; | |
25 | ||
26 | import static org.junit.Assert.assertEquals; | |
27 | ||
/**
 * Tests for the XPS parser: document metadata, in-order text extraction,
 * and extraction of embedded/inline images and thumbnails.
 */
public class XPSParserTest extends TikaTest {

    @Test
    public void testBasic() throws Exception {
        List<Metadata> metadataList = getRecursiveMetadata("testPPT.xps");
        //main document plus one embedded jpeg
        assertEquals(2, metadataList.size());

        //metadata
        assertEquals("Rajiv", metadataList.get(0).get(TikaCoreProperties.CREATOR));
        assertEquals("2010-06-29T12:06:31Z", metadataList.get(0).get(TikaCoreProperties.CREATED));
        assertEquals("2010-06-29T12:06:31Z", metadataList.get(0).get(TikaCoreProperties.MODIFIED));
        assertEquals("Attachment Test", metadataList.get(0).get(TikaCoreProperties.TITLE));

        String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
        assertContains("<p>Attachment Test</p>", content);
        assertContains("<div class=\"canvas\"><p>Different", content);

        //I'd want this to be "tika content", but copy+paste in Windows yields tikacontent
        assertContains("tikacontent", content);


        assertEquals("image/jpeg", metadataList.get(1).get(Metadata.CONTENT_TYPE));
    }

    @Test
    public void testVarious() throws Exception {
        List<Metadata> metadataList = getRecursiveMetadata("testXPS_various.xps");
        //confirm embedded images and thumbnails were extracted
        assertEquals(4, metadataList.size());

        //now check for content in the right order
        String quickBrownFox = "\u0644\u062B\u0639\u0644\u0628\u0020" +
                "\u0627\u0644\u0628\u0646\u064A\u0020" +
                "\u0627\u0644\u0633\u0631\u064A\u0639";

        String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
        assertContains(quickBrownFox, content);

        assertContains("The \u0627\u0644\u0628\u0646\u064A fox", content);

        assertContains("\u0644\u062B\u0639\u0644\u0628 brown \u0627\u0644\u0633\u0631\u064A\u0639",
                content);

        //make sure the urls come through
        assertContains("<a href=\"http://tika.apache.org/\">http://tika.apache.org/</a>",
                content);

        Metadata metadata = metadataList.get(0);
        assertEquals("Allison, Timothy B.", metadata.get(TikaCoreProperties.CREATOR));
        assertEquals("2017-12-12T11:15:38Z", metadata.get(TikaCoreProperties.CREATED));
        assertEquals("2017-12-12T11:15:38Z", metadata.get(TikaCoreProperties.MODIFIED));


        assertEquals("image/png", metadataList.get(1).get(Metadata.CONTENT_TYPE));

        Metadata inlineJpeg = metadataList.get(2);
        assertEquals("image/jpeg", inlineJpeg.get(Metadata.CONTENT_TYPE));
        assertContains("INetCache", inlineJpeg.get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
        assertEquals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString(),
                inlineJpeg.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));

        assertEquals("image/jpeg", metadataList.get(3).get(Metadata.CONTENT_TYPE));
        //NOTE(review): commented-out check below -- presumably the thumbnail's
        //embedded resource type is not yet set; confirm before enabling
//        assertEquals(TikaCoreProperties.EmbeddedResourceType.THUMBNAIL.toString(),
//                inlineJpeg.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));


    }

}
170 | 170 | config.setResize(1000); |
171 | 171 | } |
172 | 172 | |
173 | @Test(expected=IllegalArgumentException.class) | |
174 | public void testDataPathCheck() { | |
175 | TesseractOCRConfig config = new TesseractOCRConfig(); | |
176 | config.setTessdataPath("blah\u0000deblah"); | |
177 | } | |
178 | ||
179 | @Test(expected=IllegalArgumentException.class) | |
180 | public void testPathCheck() { | |
181 | TesseractOCRConfig config = new TesseractOCRConfig(); | |
182 | config.setTesseractPath("blah\u0000deblah"); | |
183 | } | |
184 | ||
185 | @Test(expected=IllegalArgumentException.class) | |
186 | public void testBadOtherKey() { | |
187 | TesseractOCRConfig config = new TesseractOCRConfig(); | |
188 | config.addOtherTesseractConfig("bad bad", "bad"); | |
189 | ||
190 | } | |
191 | ||
192 | @Test(expected=IllegalArgumentException.class) | |
193 | public void testBadOtherValue() { | |
194 | TesseractOCRConfig config = new TesseractOCRConfig(); | |
195 | config.addOtherTesseractConfig("bad", "bad bad"); | |
196 | } | |
197 | ||
198 | @Test(expected=IllegalArgumentException.class) | |
199 | public void testBadOtherValueSlash() { | |
200 | TesseractOCRConfig config = new TesseractOCRConfig(); | |
201 | config.addOtherTesseractConfig("bad", "bad\\bad"); | |
202 | } | |
203 | ||
204 | @Test(expected=IllegalArgumentException.class) | |
205 | public void testBadOtherValueControl() { | |
206 | TesseractOCRConfig config = new TesseractOCRConfig(); | |
207 | config.addOtherTesseractConfig("bad", "bad\u0001bad"); | |
208 | } | |
209 | ||
210 | @Test | |
211 | public void testGoodOtherParameters() { | |
212 | TesseractOCRConfig config = new TesseractOCRConfig(); | |
213 | config.addOtherTesseractConfig("good", "good"); | |
214 | } | |
215 | ||
216 | @Test | |
217 | public void testBogusPathCheck() { | |
218 | //allow path that doesn't actually exist | |
219 | TesseractOCRConfig config = new TesseractOCRConfig(); | |
220 | config.setTesseractPath("blahdeblahblah"); | |
221 | assertEquals("blahdeblahblah"+File.separator, config.getTesseractPath()); | |
222 | } | |
223 | ||
224 | @Test | |
225 | public void testTrailingSlashInPathBehavior() { | |
226 | ||
227 | TesseractOCRConfig config = new TesseractOCRConfig(); | |
228 | config.setTesseractPath("blah"); | |
229 | assertEquals("blah"+File.separator, config.getTesseractPath()); | |
230 | config.setTesseractPath("blah"+File.separator); | |
231 | assertEquals("blah"+File.separator, config.getTesseractPath()); | |
232 | config.setTesseractPath(""); | |
233 | assertEquals("", config.getTesseractPath()); | |
234 | ||
235 | config.setTessdataPath("blahdata"); | |
236 | assertEquals("blahdata"+File.separator, config.getTessdataPath()); | |
237 | config.setTessdataPath("blahdata"+File.separator); | |
238 | assertEquals("blahdata"+File.separator, config.getTessdataPath()); | |
239 | config.setTessdataPath(""); | |
240 | assertEquals("", config.getTessdataPath()); | |
241 | ||
242 | config.setImageMagickPath("imagemagickpath"); | |
243 | assertEquals("imagemagickpath"+File.separator, config.getImageMagickPath()); | |
244 | config.setImageMagickPath("imagemagickpath"+File.separator); | |
245 | assertEquals("imagemagickpath"+File.separator, config.getImageMagickPath()); | |
246 | config.setImageMagickPath(""); | |
247 | assertEquals("", config.getImageMagickPath()); | |
248 | } | |
249 | ||
250 | @Test(expected=IllegalArgumentException.class) | |
251 | public void testBadColorSpace() { | |
252 | TesseractOCRConfig config = new TesseractOCRConfig(); | |
253 | config.setColorspace("someth!ng"); | |
254 | } | |
173 | 255 | } |
1369 | 1369 | assertFalse(path + " should have thrown exception", noEx); |
1370 | 1370 | } |
1371 | 1371 | |
1372 | @Test | |
1373 | public void testLanguageMetadata() throws Exception { | |
1374 | assertEquals("de-CH", getXML("testPDF-custommetadata.pdf") | |
1375 | .metadata.get(TikaCoreProperties.LANGUAGE)); | |
1376 | assertEquals("zh-CN", getXML("testPDFFileEmbInAnnotation.pdf") | |
1377 | .metadata.get(TikaCoreProperties.LANGUAGE)); | |
1378 | } | |
1379 | ||
1372 | 1380 | /** |
1373 | 1381 | * Simple class to count end of document events. If functionality is useful, |
1374 | 1382 | * move to org.apache.tika in src/test |
20 | 20 | import static org.junit.Assert.assertEquals; |
21 | 21 | import static org.junit.Assert.fail; |
22 | 22 | |
23 | import java.io.BufferedWriter; | |
24 | import java.io.OutputStreamWriter; | |
25 | import java.io.Writer; | |
26 | import java.nio.charset.StandardCharsets; | |
27 | import java.nio.file.Files; | |
28 | import java.nio.file.Path; | |
29 | import java.nio.file.Paths; | |
30 | import java.nio.file.StandardOpenOption; | |
23 | 31 | import java.util.HashSet; |
32 | import java.util.List; | |
24 | 33 | import java.util.Set; |
25 | 34 | |
26 | 35 | import org.apache.commons.compress.compressors.CompressorStreamFactory; |
27 | 36 | import org.apache.tika.TikaTest; |
28 | 37 | import org.apache.tika.metadata.Metadata; |
38 | import org.apache.tika.metadata.TikaCoreProperties; | |
29 | 39 | import org.apache.tika.mime.MediaType; |
30 | 40 | import org.apache.tika.parser.ParseContext; |
41 | import org.apache.tika.parser.RecursiveParserWrapper; | |
31 | 42 | import org.junit.BeforeClass; |
32 | 43 | import org.junit.Test; |
33 | 44 | |
38 | 49 | |
39 | 50 | @BeforeClass |
40 | 51 | public static void setUp() { |
41 | NOT_COVERED.add(MediaType.application("x-brotli")); | |
42 | 52 | NOT_COVERED.add(MediaType.application("x-lz4-block")); |
43 | 53 | NOT_COVERED.add(MediaType.application("x-snappy-raw")); |
54 | NOT_COVERED.add(MediaType.application("deflate64")); | |
44 | 55 | } |
45 | 56 | |
46 | 57 | @Test |
57 | 68 | //xml parser throws an exception for test1.xml |
58 | 69 | //for now, be content that the container file is correctly identified |
59 | 70 | assertContains("test1.xml", r.xml); |
71 | } | |
72 | ||
73 | @Test | |
74 | public void testZstd() throws Exception { | |
75 | XMLResult r = getXML("testZSTD.zstd"); | |
76 | assertContains("0123456789", r.xml); | |
77 | } | |
78 | ||
79 | @Test | |
80 | public void testBrotli() throws Exception { | |
81 | Metadata metadata = new Metadata(); | |
82 | metadata.set(Metadata.RESOURCE_NAME_KEY, "testBROTLI_compressed.br"); | |
83 | List<Metadata> metadataList = getRecursiveMetadata("testBROTLI_compressed.br", metadata); | |
84 | ||
85 | assertContains("XXXXXXXXXXYYYYYYYYYY", metadataList.get(1).get(RecursiveParserWrapper.TIKA_CONTENT)); | |
86 | assertEquals("testBROTLI_compressed", metadataList.get(1).get(Metadata.RESOURCE_NAME_KEY)); | |
60 | 87 | } |
61 | 88 | |
62 | 89 | @Test |
140 | 140 | } |
141 | 141 | |
142 | 142 | assertTrue("test no password", ex); |
143 | ||
144 | // No password, will fail with EncryptedDocumentException | |
145 | ex = false; | |
146 | try (InputStream stream = Seven7ParserTest.class.getResourceAsStream( | |
147 | "/test-documents/full_encrypted.7z")) { | |
148 | parser.parse(stream, handler, metadata, recursingContext); | |
149 | fail("Shouldn't be able to read a full password protected 7z without the password"); | |
150 | } catch (EncryptedDocumentException e) { | |
151 | // Good | |
152 | ex = true; | |
153 | } catch (Exception e){ | |
154 | ex = false; | |
155 | } | |
156 | ||
157 | assertTrue("test no password for full encrypted 7z", ex); | |
143 | 158 | |
144 | 159 | ex = false; |
145 | 160 |
+55
-0
0 | /* | |
1 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
2 | * contributor license agreements. See the NOTICE file distributed with | |
3 | * this work for additional information regarding copyright ownership. | |
4 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
5 | * (the "License"); you may not use this file except in compliance with | |
6 | * the License. You may obtain a copy of the License at | |
7 | * | |
8 | * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | * | |
10 | * Unless required by applicable law or agreed to in writing, software | |
11 | * distributed under the License is distributed on an "AS IS" BASIS, | |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
13 | * See the License for the specific language governing permissions and | |
14 | * limitations under the License. | |
15 | */ | |
16 | ||
17 | package org.apache.tika.parser.pkg; | |
18 | ||
19 | ||
20 | import org.apache.commons.compress.compressors.CompressorStreamFactory; | |
21 | import org.apache.tika.TikaTest; | |
22 | import org.apache.tika.io.TikaInputStream; | |
23 | import org.apache.tika.metadata.Metadata; | |
24 | import org.apache.tika.mime.MediaType; | |
25 | import org.apache.tika.parser.ParseContext; | |
26 | import org.junit.BeforeClass; | |
27 | import org.junit.Test; | |
28 | ||
29 | import java.io.InputStream; | |
30 | import java.util.HashSet; | |
31 | import java.util.Set; | |
32 | ||
33 | import static org.junit.Assert.assertEquals; | |
34 | import static org.junit.Assert.fail; | |
35 | ||
36 | public class ZipContainerDetectorTest extends TikaTest { | |
37 | ||
38 | @Test | |
39 | public void testTiffWorkaround() throws Exception { | |
40 | //TIKA-2591 | |
41 | ZipContainerDetector zipContainerDetector = new ZipContainerDetector(); | |
42 | Metadata metadata = new Metadata(); | |
43 | try (InputStream is = TikaInputStream.get(getResourceAsStream("/test-documents/testTIFF.tif"))) { | |
44 | MediaType mt = zipContainerDetector.detect(is, metadata); | |
45 | assertEquals(MediaType.image("tiff"), mt); | |
46 | } | |
47 | metadata = new Metadata(); | |
48 | try (InputStream is = TikaInputStream.get(getResourceAsStream("/test-documents/testTIFF_multipage.tif"))) { | |
49 | MediaType mt = zipContainerDetector.detect(is, metadata); | |
50 | assertEquals(MediaType.image("tiff"), mt); | |
51 | } | |
52 | ||
53 | } | |
54 | }⏎ |
+79
-0
0 | /* | |
1 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
2 | * contributor license agreements. See the NOTICE file distributed with | |
3 | * this work for additional information regarding copyright ownership. | |
4 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
5 | * (the "License"); you may not use this file except in compliance with | |
6 | * the License. You may obtain a copy of the License at | |
7 | * | |
8 | * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | * | |
10 | * Unless required by applicable law or agreed to in writing, software | |
11 | * distributed under the License is distributed on an "AS IS" BASIS, | |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
13 | * See the License for the specific language governing permissions and | |
14 | * limitations under the License. | |
15 | */ | |
16 | ||
17 | package org.apache.tika.parser.utils; | |
18 | ||
19 | import org.apache.tika.TikaTest; | |
20 | import org.apache.tika.io.IOUtils; | |
21 | import org.apache.tika.mime.MediaType; | |
22 | import org.junit.Test; | |
23 | ||
24 | import java.io.ByteArrayOutputStream; | |
25 | import java.io.InputStream; | |
26 | import java.nio.charset.Charset; | |
27 | import java.nio.charset.StandardCharsets; | |
28 | ||
29 | import static org.junit.Assert.assertEquals; | |
30 | import static org.junit.Assert.assertFalse; | |
31 | import static org.junit.Assert.assertNull; | |
32 | import static org.junit.Assert.assertTrue; | |
33 | ||
34 | public class DataURISchemeParserTest extends TikaTest { | |
35 | DataURISchemeUtil dataURISchemeUtil = new DataURISchemeUtil(); | |
36 | ||
37 | @Test | |
38 | public void testEmpty() throws Exception { | |
39 | DataURIScheme dataURIScheme = dataURISchemeUtil.parse("data:,"); | |
40 | assertFalse(dataURIScheme.isBase64()); | |
41 | assertNull(dataURIScheme.getMediaType()); | |
42 | assertEquals(-1, dataURIScheme.getInputStream().read()); | |
43 | } | |
44 | ||
45 | @Test | |
46 | public void testNewlines() throws Exception { | |
47 | String data = "data:image/png;base64,R0lG\nODdh"; | |
48 | DataURIScheme dataURIScheme = dataURISchemeUtil.parse(data); | |
49 | assertTrue(dataURIScheme.isBase64()); | |
50 | assertEquals(MediaType.image("png"), dataURIScheme.getMediaType()); | |
51 | ||
52 | String expected = "data:image/png;base64,R0lGODdh"; | |
53 | assertEquals(dataURISchemeUtil.parse(expected), dataURISchemeUtil.parse(data)); | |
54 | ||
55 | } | |
56 | ||
57 | @Test | |
58 | public void testBackslashNewlines() throws Exception { | |
59 | //like you'd have in a css fragment | |
60 | String data = "data:image/png;base64,R0lG\\\nODdh"; | |
61 | DataURIScheme dataURIScheme = dataURISchemeUtil.parse(data); | |
62 | assertTrue(dataURIScheme.isBase64()); | |
63 | assertEquals(MediaType.image("png"), dataURIScheme.getMediaType()); | |
64 | ||
65 | String expected = "data:image/png;base64,R0lGODdh"; | |
66 | assertEquals(dataURISchemeUtil.parse(expected), dataURISchemeUtil.parse(data)); | |
67 | } | |
68 | ||
69 | @Test | |
70 | public void testUTF8() throws Exception { | |
71 | String utf8 = "\u0628\u0631\u0646\u0633\u062A\u0648\u0646"; | |
72 | String data = "data:text/plain;charset=UTF-8;page=21,the%20data:"+utf8; | |
73 | DataURIScheme dataURIScheme = dataURISchemeUtil.parse(data); | |
74 | ByteArrayOutputStream bos = new ByteArrayOutputStream(); | |
75 | IOUtils.copy(dataURIScheme.getInputStream(), bos); | |
76 | assertContains(utf8, new String(bos.toByteArray(), StandardCharsets.UTF_8)); | |
77 | } | |
78 | } |
24 | 24 | <parent> |
25 | 25 | <groupId>org.apache.tika</groupId> |
26 | 26 | <artifactId>tika-parent</artifactId> |
27 | <version>1.17</version> | |
27 | <version>1.18</version> | |
28 | 28 | <relativePath>../tika-parent/pom.xml</relativePath> |
29 | 29 | </parent> |
30 | 30 | |
52 | 52 | <dependency> |
53 | 53 | <groupId>com.google.code.gson</groupId> |
54 | 54 | <artifactId>gson</artifactId> |
55 | <version>2.8.1</version> | |
55 | <version>${gson.version}</version> | |
56 | 56 | </dependency> |
57 | 57 | |
58 | 58 | <!-- Test dependencies --> |
15 | 15 | FROM ubuntu:latest |
16 | 16 | MAINTAINER Apache Tika Team |
17 | 17 | |
18 | ENV TIKA_VERSION 1.7 | |
19 | ENV TIKA_SERVER_URL https://www.apache.org/dist/tika/tika-server-$TIKA_VERSION.jar | |
20 | ||
21 | 18 | RUN apt-get update \ |
22 | && apt-get install openjdk-7-jre-headless curl gdal-bin tesseract-ocr \ | |
23 | tesseract-ocr-eng tesseract-ocr-ita tesseract-ocr-fra tesseract-ocr-spa tesseract-ocr-deu -y \ | |
24 | && curl -sSL https://people.apache.org/keys/group/tika.asc -o /tmp/tika.asc \ | |
25 | && gpg --import /tmp/tika.asc \ | |
26 | && curl -sSL "$TIKA_SERVER_URL.asc" -o /tmp/tika-server-${TIKA_VERSION}.jar.asc \ | |
27 | && NEAREST_TIKA_SERVER_URL=$(curl -sSL http://www.apache.org/dyn/closer.cgi/${TIKA_SERVER_URL#https://www.apache.org/dist/}\?asjson\=1 \ | |
28 | | awk '/"path_info": / { pi=$2; }; /"preferred":/ { pref=$2; }; END { print pref " " pi; };' \ | |
29 | | sed -r -e 's/^"//; s/",$//; s/" "//') \ | |
30 | && echo "Nearest mirror: $NEAREST_TIKA_SERVER_URL" \ | |
31 | && curl -sSL "$NEAREST_TIKA_SERVER_URL" -o /tika-server-${TIKA_VERSION}.jar \ | |
32 | && gpg --verify /tmp/tika-server-${TIKA_VERSION}.jar.asc /tika-server-${TIKA_VERSION}.jar \ | |
19 | && apt-get install openjdk-8-jre-headless curl gdal-bin tesseract-ocr \ | |
20 | tesseract-ocr-eng tesseract-ocr-ita tesseract-ocr-fra tesseract-ocr-spa tesseract-ocr-deu -y \ | |
33 | 21 | && apt-get clean -y && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* |
34 | 22 | |
23 | ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64 | |
24 | RUN export JAVA_HOME | |
25 | ||
26 | ARG JAR_FILE | |
27 | ADD target/${JAR_FILE} /tika-server.jar | |
28 | ||
35 | 29 | EXPOSE 9998 |
36 | ENTRYPOINT java -jar /tika-server-${TIKA_VERSION}.jar -h 0.0.0.0 | |
30 | ENTRYPOINT java -jar /tika-server.jar -h 0.0.0.0 | |
31 |
13 | 13 | -s,--includeStack whether or not to return a stack trace |
14 | 14 | if there is an exception during 'parse' |
15 | 15 | ``` |
16 | Running via Docker | |
17 | ------------------ | |
18 | Assuming you have Docker installed, you can build you own local image using the: | |
19 | ||
20 | `mvn dockerfile:build` | |
21 | ||
22 | The image will be named apache/tika with the tag being the version being built. | |
23 | For example, building Apache Tika Server 1.17 will result in an image of `apache/tika-server:1.17` | |
24 | ||
25 | You can then run this image by executing the following, replacing `1.17` with your build version: | |
26 | ||
27 | `docker run -d -p 9998:9998 apache/tika-server:1.17` | |
28 | ||
29 | This will load Apache Tika Server and expose its interface on: | |
30 | ||
31 | `http://localhost:9998` | |
16 | 32 | |
17 | 33 | Usage |
18 | 34 | ----- |
19 | 19 | <parent> |
20 | 20 | <groupId>org.apache.tika</groupId> |
21 | 21 | <artifactId>tika-parent</artifactId> |
22 | <version>1.17</version> | |
22 | <version>1.18</version> | |
23 | 23 | <relativePath>../tika-parent/pom.xml</relativePath> |
24 | 24 | </parent> |
25 | 25 | |
258 | 258 | </configuration> |
259 | 259 | </execution> |
260 | 260 | </executions> |
261 | </plugin> | |
262 | <plugin> | |
263 | <groupId>com.spotify</groupId> | |
264 | <artifactId>dockerfile-maven-plugin</artifactId> | |
265 | <version>1.3.7</version> | |
266 | <configuration> | |
267 | <repository>apache/tika-server</repository> | |
268 | <tag>${project.version}</tag> | |
269 | <buildArgs> | |
270 | <JAR_FILE>tika-server-${project.version}.jar</JAR_FILE> | |
271 | </buildArgs> | |
272 | </configuration> | |
261 | 273 | </plugin> |
262 | 274 | <plugin> |
263 | 275 | <groupId>org.apache.maven.plugins</groupId> |
48 | 48 | import java.util.Locale; |
49 | 49 | import java.util.Map; |
50 | 50 | import java.util.Set; |
51 | import java.util.regex.Matcher; | |
52 | import java.util.regex.Pattern; | |
51 | 53 | |
52 | 54 | import org.apache.commons.lang.StringUtils; |
53 | 55 | import org.apache.cxf.jaxrs.ext.multipart.Attachment; |
81 | 83 | |
82 | 84 | @Path("/tika") |
83 | 85 | public class TikaResource { |
86 | ||
87 | private static Pattern ALLOWABLE_HEADER_CHARS = Pattern.compile("(?i)^[-/_\\.A-Z0-9 ]+$"); | |
88 | ||
84 | 89 | public static final String GREETING = "This is Tika Server (" + new Tika().toString() + "). Please PUT\n"; |
85 | 90 | public static final String X_TIKA_OCR_HEADER_PREFIX = "X-Tika-OCR"; |
86 | 91 | public static final String X_TIKA_PDF_HEADER_PREFIX = "X-Tika-PDF"; |
189 | 194 | * @throws WebApplicationException thrown when field cannot be found. |
190 | 195 | */ |
191 | 196 | private static void processHeaderConfig(MultivaluedMap<String, String> httpHeaders, Object object, String key, String prefix) { |
192 | try { | |
193 | String property = StringUtils.removeStart(key, prefix); | |
194 | Field field = object.getClass().getDeclaredField(StringUtils.uncapitalize(property)); | |
195 | ||
196 | field.setAccessible(true); | |
197 | if (field.getType() == String.class) { | |
198 | field.set(object, httpHeaders.getFirst(key)); | |
199 | } else if (field.getType() == int.class) { | |
200 | field.setInt(object, Integer.parseInt(httpHeaders.getFirst(key))); | |
201 | } else if (field.getType() == double.class) { | |
202 | field.setDouble(object, Double.parseDouble(httpHeaders.getFirst(key))); | |
203 | } else if (field.getType() == boolean.class) { | |
204 | field.setBoolean(object, Boolean.parseBoolean(httpHeaders.getFirst(key))); | |
197 | ||
198 | try {String property = StringUtils.removeStart(key, prefix); | |
199 | Field field = null; | |
200 | try { | |
201 | field = object.getClass().getDeclaredField(StringUtils.uncapitalize(property)); | |
202 | } catch (NoSuchFieldException e) { | |
203 | //swallow | |
204 | } | |
205 | String setter = property; | |
206 | setter = "set"+setter.substring(0,1).toUpperCase(Locale.US)+setter.substring(1); | |
207 | //default assume string class | |
208 | //if there's a more specific type, e.g. double, int, boolean | |
209 | //try that. | |
210 | Class clazz = String.class; | |
211 | if (field != null) { | |
212 | if (field.getType() == int.class || field.getType() == Integer.class) { | |
213 | clazz = int.class; | |
214 | } else if (field.getType() == double.class) { | |
215 | clazz = double.class; | |
216 | } else if (field.getType() == Double.class) { | |
217 | clazz = Double.class; | |
218 | } else if (field.getType() == float.class) { | |
219 | clazz = float.class; | |
220 | } else if (field.getType() == Float.class) { | |
221 | clazz = Float.class; | |
222 | } else if (field.getType() == boolean.class) { | |
223 | clazz = boolean.class; | |
224 | } else if (field.getType() == Boolean.class) { | |
225 | clazz = Boolean.class; | |
226 | } | |
227 | } | |
228 | ||
229 | Method m = tryToGetMethod(object, setter, clazz); | |
230 | //if you couldn't find more specific setter, back off | |
231 | //to string setter and try that. | |
232 | if (m == null && clazz != String.class) { | |
233 | m = tryToGetMethod(object, setter, String.class); | |
234 | } | |
235 | ||
236 | if (m != null) { | |
237 | String val = httpHeaders.getFirst(key); | |
238 | val = val.trim(); | |
239 | if (clazz == String.class) { | |
240 | checkTrustWorthy(setter, val); | |
241 | m.invoke(object, val); | |
242 | } else if (clazz == int.class || clazz == Integer.class) { | |
243 | m.invoke(object, Integer.parseInt(val)); | |
244 | } else if (clazz == double.class || clazz == Double.class) { | |
245 | m.invoke(object, Double.parseDouble(val)); | |
246 | } else if (clazz == boolean.class || clazz == Boolean.class) { | |
247 | m.invoke(object, Boolean.parseBoolean(val)); | |
248 | } else if (clazz == float.class || clazz == Float.class) { | |
249 | m.invoke(object, Float.parseFloat(val)); | |
250 | } else { | |
251 | throw new IllegalArgumentException("setter must be String, int, float, double or boolean...for now"); | |
252 | } | |
205 | 253 | } else { |
206 | //couldn't find a directly accessible field | |
207 | //try for setX(String s) | |
208 | String setter = StringUtils.uncapitalize(property); | |
209 | setter = "set"+setter.substring(0,1).toUpperCase(Locale.US)+setter.substring(1); | |
210 | Method m = null; | |
211 | try { | |
212 | m = object.getClass().getMethod(setter, String.class); | |
213 | } catch (NoSuchMethodException e) { | |
214 | //swallow | |
215 | } | |
216 | if (m != null) { | |
217 | m.invoke(object, httpHeaders.getFirst(key)); | |
218 | } | |
219 | } | |
254 | throw new NoSuchMethodException("Couldn't find: "+setter); | |
255 | } | |
256 | ||
220 | 257 | } catch (Throwable ex) { |
221 | 258 | throw new WebApplicationException(String.format(Locale.ROOT, |
222 | 259 | "%s is an invalid %s header", key, X_TIKA_OCR_HEADER_PREFIX)); |
223 | 260 | } |
261 | } | |
262 | ||
263 | private static void checkTrustWorthy(String setter, String val) { | |
264 | if (setter == null || val == null) { | |
265 | throw new IllegalArgumentException("setter and val must not be null"); | |
266 | } | |
267 | if (setter.toLowerCase(Locale.US).contains("trusted")) { | |
268 | throw new IllegalArgumentException("Can't call a trusted method via tika-server headers"); | |
269 | } | |
270 | Matcher m = ALLOWABLE_HEADER_CHARS.matcher(val); | |
271 | if (! m.find()) { | |
272 | throw new IllegalArgumentException("Header val: "+val +" contains illegal characters. " + | |
273 | "Must contain: TikaResource.ALLOWABLE_HEADER_CHARS"); | |
274 | } | |
275 | } | |
276 | ||
277 | /** | |
278 | * Tries to get method. Silently swallows NoMethodException and returns | |
279 | * <code>null</code> if not found. | |
280 | * @param object | |
281 | * @param method | |
282 | * @param clazz | |
283 | * @return | |
284 | */ | |
285 | private static Method tryToGetMethod(Object object, String method, Class clazz) { | |
286 | try { | |
287 | return object.getClass().getMethod(method, clazz); | |
288 | } catch (NoSuchMethodException e) { | |
289 | //swallow | |
290 | } | |
291 | return null; | |
224 | 292 | } |
225 | 293 | |
226 | 294 | @SuppressWarnings("serial") |
16 | 16 | |
17 | 17 | package org.apache.tika.server; |
18 | 18 | |
19 | import static org.junit.Assert.assertEquals; | |
20 | import static org.junit.Assert.assertFalse; | |
21 | import static org.junit.Assert.assertTrue; | |
22 | ||
23 | import javax.ws.rs.core.Response; | |
24 | import java.io.InputStream; | |
25 | import java.util.ArrayList; | |
26 | import java.util.List; | |
27 | ||
28 | 19 | import org.apache.cxf.jaxrs.JAXRSServerFactoryBean; |
29 | 20 | import org.apache.cxf.jaxrs.client.WebClient; |
30 | 21 | import org.apache.cxf.jaxrs.ext.multipart.Attachment; |
34 | 25 | import org.apache.tika.server.resource.TikaResource; |
35 | 26 | import org.junit.Test; |
36 | 27 | |
28 | import javax.ws.rs.core.Response; | |
29 | import java.io.InputStream; | |
30 | import java.util.ArrayList; | |
31 | import java.util.List; | |
32 | ||
33 | import static org.junit.Assert.assertEquals; | |
34 | import static org.junit.Assert.assertFalse; | |
35 | import static org.junit.Assert.assertTrue; | |
36 | ||
37 | 37 | public class TikaResourceTest extends CXFTestBase { |
38 | 38 | public static final String TEST_DOC = "test.doc"; |
39 | 39 | public static final String TEST_PASSWORD_PROTECTED = "password.xls"; |
278 | 278 | responseMsg |
279 | 279 | ); |
280 | 280 | } |
281 | ||
282 | @Test | |
283 | public void testDataIntegrityCheck() throws Exception { | |
284 | Response response = WebClient.create(endPoint + TIKA_PATH) | |
285 | .type("application/pdf") | |
286 | .accept("text/plain") | |
287 | .header(TikaResource.X_TIKA_OCR_HEADER_PREFIX + | |
288 | "tesseractPath", | |
289 | ||
290 | "C://tmp//hello.bat\u0000") | |
291 | .put(ClassLoader.getSystemResourceAsStream("testOCR.pdf")); | |
292 | assertEquals(500, response.getStatus()); | |
293 | ||
294 | response = WebClient.create(endPoint + TIKA_PATH) | |
295 | .type("application/pdf") | |
296 | .accept("text/plain") | |
297 | .header(TikaResource.X_TIKA_OCR_HEADER_PREFIX + | |
298 | "tesseractPath", | |
299 | "bogus path") | |
300 | .put(ClassLoader.getSystemResourceAsStream("testOCR.pdf")); | |
301 | assertEquals(200, response.getStatus()); | |
302 | } | |
303 | ||
304 | @Test | |
305 | public void testTrustedMethodPrevention() { | |
306 | Response response = WebClient.create(endPoint + TIKA_PATH) | |
307 | .type("application/pdf") | |
308 | .accept("text/plain") | |
309 | .header(TikaResource.X_TIKA_OCR_HEADER_PREFIX + | |
310 | "trustedPageSeparator", | |
311 | "\u0010") | |
312 | .put(ClassLoader.getSystemResourceAsStream("testOCR.pdf")); | |
313 | assertEquals(500, response.getStatus()); | |
314 | ||
315 | } | |
316 | ||
317 | @Test | |
318 | public void testFloatInHeader() { | |
319 | Response response = WebClient.create(endPoint + TIKA_PATH) | |
320 | .type("application/pdf") | |
321 | .accept("text/plain") | |
322 | .header(TikaResource.X_TIKA_PDF_HEADER_PREFIX + | |
323 | "averageCharTolerance", | |
324 | "2.0") | |
325 | .put(ClassLoader.getSystemResourceAsStream("testOCR.pdf")); | |
326 | assertEquals(200, response.getStatus()); | |
327 | ||
328 | } | |
281 | 329 | } |
24 | 24 | <parent> |
25 | 25 | <groupId>org.apache.tika</groupId> |
26 | 26 | <artifactId>tika-parent</artifactId> |
27 | <version>1.17</version> | |
27 | <version>1.18</version> | |
28 | 28 | <relativePath>../tika-parent/pom.xml</relativePath> |
29 | 29 | </parent> |
30 | 30 | |
49 | 49 | <artifactId>microsoft-translator-java-api</artifactId> |
50 | 50 | <version>0.6.2</version> |
51 | 51 | <type>jar</type> |
52 | <exclusions> | |
53 | <exclusion> | |
54 | <groupId>com.googlecode.json-simple</groupId> | |
55 | <artifactId>json-simple</artifactId> | |
56 | </exclusion> | |
57 | </exclusions> | |
58 | </dependency> | |
59 | <dependency> | |
60 | <groupId>com.googlecode.json-simple</groupId> | |
61 | <artifactId>json-simple</artifactId> | |
62 | <version>1.1.1</version> | |
52 | 63 | </dependency> |
53 | 64 | <dependency> |
54 | 65 | <groupId>org.apache.cxf</groupId> |
55 | 66 | <artifactId>cxf-rt-frontend-jaxrs</artifactId> |
56 | 67 | <version>${cxf.version}</version> |
68 | <exclusions> | |
69 | <!-- exclude because, as of 2.9.5, jaxb-annotations | |
70 | is bringing in 2.9.0 of core's annotations | |
71 | --> | |
72 | <exclusion> | |
73 | <groupId>com.fasterxml.jackson.core</groupId> | |
74 | <artifactId>jackson-annotations</artifactId> | |
75 | </exclusion> | |
76 | </exclusions> | |
57 | 77 | </dependency> |
58 | 78 | <dependency> |
59 | 79 | <groupId>com.fasterxml.jackson.jaxrs</groupId> |
60 | 80 | <artifactId>jackson-jaxrs-json-provider</artifactId> |
61 | <version>2.9.2</version> | |
81 | <version>${jackson.version}</version> | |
82 | <exclusions> | |
83 | <!-- exclude because, as of 2.9.5, jaxrs-json-provider | |
84 | is bringing in 2.9.0 of core's annotations | |
85 | --> | |
86 | <exclusion> | |
87 | <groupId>com.fasterxml.jackson.core</groupId> | |
88 | <artifactId>jackson-annotations</artifactId> | |
89 | </exclusion> | |
90 | </exclusions> | |
91 | </dependency> | |
92 | <dependency> | |
93 | <groupId>com.fasterxml.jackson.core</groupId> | |
94 | <artifactId>jackson-annotations</artifactId> | |
95 | <version>${jackson.version}</version> | |
62 | 96 | </dependency> |
63 | 97 | |
64 | 98 | <!-- Test dependencies --> |