Codebase list tika / upstream/1.18
New upstream version 1.18 Emmanuel Bourg 5 years ago
80 changed file(s) with 3469 addition(s) and 296 deletion(s). Raw diff Collapse all Expand all
0 Release 1.17 - December 8, 2017
0 Release 1.18 - 4/20/2018
1
2 * Upgrade Jackson to 2.9.5 (TIKA-2634).
3
4 * Add support for brotli (TIKA-2621).
5
6 * Upgrade PDFBox to 2.0.9 and include new jbig2-imageio
7 from org.apache.pdfbox (TIKA-2579 and TIKA-2607).
8
9 * Support for TIFF images in PDF files (TIKA-2338)
10
11 * Detection of fully encrypted 7z files (TIKA-2568)
12
13 * Various new mimes and typo fixes in tika-mimetypes.xml
14 via Andreas Meier (TIKA-2527).
15
16 * Revert to listenForAllRecords=false in ExcelExtractor
17 via Grigoriy Alekseev (TIKA-2590)
18
19 * Add workaround to identify TIFFs that might confuse
20 commons-compress's tar detection via Daniel Schmidt
21 (TIKA-2591)
22
23 * Ignore non-IANA supported charsets in HTML meta-headers
24 during charset detection in HTMLEncodingDetector
25 via Andreas Meier (TIKA-2592)
26
27 * Add detection and parsing of zstd (if user provides
28 com.github.luben:zstd-jni) via Andreas Meier (TIKA-2576)
29
30 * Allow for RFC822 detection for files starting with "dkim-"
31 and/or "x-" via Andreas Meier (TIKA-2578 and TIKA-2587)
32
33 * Extract xlsx files embedded in OLE objects within PPT and PPTX
34 via Brian McColgan (TIKA-2588).
35
36 * Extract files embedded in HTML and javascript inside HTML
37 that are stored in the Data URI scheme (TIKA-2563).
38
39 * Extract text from grouped text boxes in PPT (TIKA-2569).
40
41 * Extract language metadata item from PDF files via Matt Sheppard (TIKA-2559)
42
43 * RFC822 with multipart/mixed, first text element should be treated
44 as the main body of the email, not an attachment (TIKA-2547).
45
46 * Swap out com.tdunning:json for com.github.openjson:openjson to avoid
47 jar conflicts (TIKA-2556).
48
49 * No longer hardcode HtmlParser for XML files in tika-server (TIKA-2551).
50
51 * Require Java 8 (TIKA-2553).
52
53 * Add a parser for XPS (TIKA-2524).
54
55 * Mime magic for Dolby Digital AC3 and EAC3 files
56
57 * Fixed bug where TesseractOCRParser ignores configured ImageMagickPath,
58 and set rotation script to ignore Python warnings (TIKA-2509)
59
60 * Upgrade geo-apis to 3.0.1 (TIKA-2535).
61
62 * Added local Docker image build using dockerfile-maven-plugin to allow
63 images to be built from source (TIKA-1518).
64
65 Release 1.17 - 12/8/2017
166
267 ***NOTE: THIS IS THE LAST VERSION OF TIKA THAT WILL RUN
368 ON Java 7. The next versions will require Java 8***
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.17</version>
27 <version>1.18</version>
2828 <relativePath>tika-parent/pom.xml</relativePath>
2929 </parent>
3030
103103 <include name="tika-eval/target/tika-eval-${project.version}.jar*" />
104104 </fileset>
105105 </copy>
106 <checksum algorithm="MD5" fileext=".md5">
106 <checksum algorithm="SHA-512" fileext=".sha512">
107107 <fileset dir="${basedir}/target/${project.version}">
108108 <include name="*.zip" />
109109 <include name="*.?ar" />
110110 </fileset>
111111 </checksum>
112 <checksum algorithm="SHA1" fileext=".sha">
113 <fileset dir="${basedir}/target/${project.version}">
114 <include name="*.zip" />
115 <include name="*.?ar" />
116 </fileset>
117 </checksum>
118 <checksum file="${basedir}/target/${project.version}/tika-${project.version}-src.zip" algorithm="SHA1" property="checksum" />
112 <checksum file="${basedir}/target/${project.version}/tika-${project.version}-src.zip" algorithm="SHA-512" property="checksum" />
119113 <echo file="${basedir}/target/vote.txt">
120114 From: ${username}@apache.org
121115 To: dev@tika.apache.org
128122 The release candidate is a zip archive of the sources in:
129123 https://github.com/apache/tika/tree/{project.version}-rcN/
130124
131 The SHA1 checksum of the archive is
125 The SHA-512 checksum of the archive is
132126 ${checksum}.
133127
134128 In addition, a staged maven repository is available here:
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.17</version>
27 <version>1.18</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030
1616
1717 package org.apache.tika.cli;
1818
19
20 import org.apache.commons.lang.SystemUtils;
21
1922 import java.io.IOException;
2023 import java.nio.file.Files;
2124 import java.nio.file.Path;
4043 static Pattern JVM_OPTS_PATTERN = Pattern.compile("^(--?)J(.+)");
4144
4245 protected static String[] build(String[] args) throws IOException {
46
4347 Map<String, String> processArgs = new LinkedHashMap<String, String>();
4448 Map<String, String> jvmOpts = new LinkedHashMap<String,String>();
4549 //take the args, and divide them into process args and options for
5256 //maybe the user specified a different classpath?!
5357 if (! jvmOpts.containsKey("-cp") && ! jvmOpts.containsKey("--classpath")) {
5458 String cp = System.getProperty("java.class.path");
55 //need to test for " " on *nix, can't just add double quotes
56 //across platforms.
57 if (cp.contains(" ")){
58 cp = "\""+cp+"\"";
59 }
6059 jvmOpts.put("-cp", cp);
6160 }
6261
6968 }
7069 //use the log4j config file inside the app /resources/log4j_batch_process.properties
7170 if (! hasLog4j) {
72 jvmOpts.put("-Dlog4j.configuration=\"log4j_batch_process.properties\"", "");
71 jvmOpts.put("-Dlog4j.configuration=log4j_batch_process.properties", "");
7372 }
7473 //now build the full command line
7574 List<String> fullCommand = new ArrayList<String>();
7877 for (Map.Entry<String, String> e : jvmOpts.entrySet()) {
7978 fullCommand.add(e.getKey());
8079 if (e.getValue().length() > 0) {
81 fullCommand.add(e.getValue());
80 fullCommand.add(commandLineSafe(e.getValue()));
8281 }
8382 if (e.getKey().contains("java.awt.headless")) {
8483 foundHeadlessOption = true;
9392 for (Map.Entry<String, String> e : processArgs.entrySet()) {
9493 fullCommand.add(e.getKey());
9594 if (e.getValue().length() > 0) {
96 fullCommand.add(e.getValue());
95 fullCommand.add(commandLineSafe(e.getValue()));
9796 }
9897 }
9998 return fullCommand.toArray(new String[fullCommand.size()]);
99 }
100
101 protected static String commandLineSafe(String arg) {
102 if (arg == null) {
103 return arg;
104 }
105 //need to test for " " on windows, can't just add double quotes
106 //across platforms.
107 if (arg.contains(" ") && SystemUtils.IS_OS_WINDOWS) {
108 arg = "\"" + arg + "\"";
109 }
110 return arg;
100111 }
101112
102113
10401040 if (name == null) {
10411041 name = "file" + count++;
10421042 }
1043
1043 if (! inputStream.markSupported()) {
1044 inputStream = TikaInputStream.get(inputStream);
1045 }
10441046 MediaType contentType = detector.detect(inputStream, metadata);
10451047
10461048 if (name.indexOf('.')==-1 && contentType!=null) {
4040 Path testFile = null;
4141
4242 String testInputPathForCommandLine;
43 String escapedInputPathForCommandLine;
4344
4445 @Before
4546 public void init() {
5657 throw new RuntimeException("Couldn't open testFile");
5758 }
5859 testInputPathForCommandLine = testInput.toAbsolutePath().toString();
60 escapedInputPathForCommandLine = BatchCommandLineBuilder.commandLineSafe(testInputPathForCommandLine);
5961 }
6062
6163 @After
113115 assertEquals("true", attrs.get("-recursiveParserWrapper"));
114116 assertEquals("html", attrs.get("-basicHandlerType"));
115117 assertEquals("batch-config.xml", attrs.get("-bc"));
116 assertEquals(testInputPathForCommandLine, attrs.get("-inputDir"));
118 assertEquals(escapedInputPathForCommandLine, attrs.get("-inputDir"));
117119 }
118120
119121 @Test
124126
125127 String[] commandLine = BatchCommandLineBuilder.build(params);
126128 Map<String, String> attrs = mapify(commandLine);
127 assertEquals(testInputPathForCommandLine, attrs.get("-inputDir"));
129 assertEquals(escapedInputPathForCommandLine, attrs.get("-inputDir"));
128130 assertEquals(outputRoot, attrs.get("-outputDir"));
129131 }
130132
135137
136138 String[] commandLine = BatchCommandLineBuilder.build(params);
137139 Map<String, String> attrs = mapify(commandLine);
138 assertEquals(testInputPathForCommandLine, attrs.get("-inputDir"));
140 assertEquals(escapedInputPathForCommandLine, attrs.get("-inputDir"));
139141 assertEquals(outputRoot, attrs.get("-outputDir"));
140142
141143 params = new String[]{"--inputDir", testInputPathForCommandLine, "--outputDir", outputRoot};
142144
143145 commandLine = BatchCommandLineBuilder.build(params);
144146 attrs = mapify(commandLine);
145 assertEquals(testInputPathForCommandLine, attrs.get("-inputDir"));
147 assertEquals(escapedInputPathForCommandLine, attrs.get("-inputDir"));
146148 assertEquals(outputRoot, attrs.get("-outputDir"));
147149
148150 params = new String[]{"-inputDir", testInputPathForCommandLine, "-outputDir", outputRoot};
149151
150152 commandLine = BatchCommandLineBuilder.build(params);
151153 attrs = mapify(commandLine);
152 assertEquals(testInputPathForCommandLine, attrs.get("-inputDir"));
154 assertEquals(escapedInputPathForCommandLine, attrs.get("-inputDir"));
153155 assertEquals(outputRoot, attrs.get("-outputDir"));
154156 }
155157
162164 "--config="+configPath};
163165 String[] commandLine = BatchCommandLineBuilder.build(params);
164166 Map<String, String> attrs = mapify(commandLine);
165 assertEquals(testInputPathForCommandLine, attrs.get("-inputDir"));
167 assertEquals(escapedInputPathForCommandLine, attrs.get("-inputDir"));
166168 assertEquals(outputRoot, attrs.get("-outputDir"));
167169 assertEquals(configPath, attrs.get("-c"));
168170
281281 FileUtils.deleteDirectory(tempFile);
282282 }
283283 }
284
285 @Test
286 public void testExtractTgz() throws Exception {
287 //TIKA-2564
288 File tempFile = File.createTempFile("tika-test-", "");
289 tempFile.delete();
290 tempFile.mkdir();
291
292 try {
293 String[] params = {"--extract-dir="+tempFile.getAbsolutePath(),"-z", resourcePrefix + "/test-documents.tgz"};
294
295 TikaCLI.main(params);
296
297 StringBuffer allFiles = new StringBuffer();
298 for (String f : tempFile.list()) {
299 if (allFiles.length() > 0) allFiles.append(" : ");
300 allFiles.append(f);
301 }
302
303 File expectedTAR = new File(tempFile, "test-documents.tar");
304
305 assertExtracted(expectedTAR, allFiles.toString());
306 } finally {
307 FileUtils.deleteDirectory(tempFile);
308 }
309 }
310
311
284312 protected static void assertExtracted(File f, String allFiles) {
285313
286314 assertTrue(
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.extractor;
18
19 import org.apache.tika.batch.DigestingAutoDetectParserFactory;
20 import org.apache.tika.config.TikaConfig;
21 import org.apache.tika.extractor.EmbeddedDocumentUtil;
22 import org.apache.tika.parser.AutoDetectParser;
23 import org.apache.tika.parser.ParseContext;
24 import org.apache.tika.parser.Parser;
25 import org.apache.tika.parser.RecursiveParserWrapper;
26 import org.apache.tika.sax.BasicContentHandlerFactory;
27 import org.junit.Test;
28
29 import static org.junit.Assert.assertEquals;
30 import static org.junit.Assert.assertNotNull;
31
32 public class TestEmbeddedDocumentUtil {
33 //TODO -- figure out how to mock this into tika-core
34
35 @Test
36 public void testSimple() {
37 Parser p = new AutoDetectParser();
38 ParseContext parseContext = new ParseContext();
39 parseContext.set(Parser.class, p);
40 Parser txtParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(org.apache.tika.parser.txt.TXTParser.class, parseContext);
41 assertNotNull(txtParser);
42 assertEquals(org.apache.tika.parser.txt.TXTParser.class, txtParser.getClass());
43
44 }
45
46 @Test
47 public void testDoublyDecorated() {
48 Parser d = new DigestingAutoDetectParserFactory().getParser(TikaConfig.getDefaultConfig());
49 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(d,
50 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
51 ParseContext parseContext = new ParseContext();
52 parseContext.set(Parser.class, wrapper);
53 Parser txtParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(org.apache.tika.parser.txt.TXTParser.class, parseContext);
54 assertNotNull(txtParser);
55 assertEquals(org.apache.tika.parser.txt.TXTParser.class, txtParser.getClass());
56 }
57 }
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.17</version>
27 <version>1.18</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030
3434 <url>http://tika.apache.org/</url>
3535
3636 <properties>
37 <cli.version>1.3.1</cli.version>
37 <cli.version>1.4</cli.version>
3838 </properties>
3939
4040 <dependencies>
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.17</version>
27 <version>1.18</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030
7171 <groupId>org.ops4j.pax.exam</groupId>
7272 <artifactId>pax-exam-container-native</artifactId>
7373 <version>${pax.exam.version}</version>
74 <exclusions>
75 <exclusion>
76 <groupId>org.ops4j.base</groupId>
77 <artifactId>ops4j-base-util-property</artifactId>
78 </exclusion>
79 <exclusion>
80 <groupId>org.ops4j.base</groupId>
81 <artifactId>ops4j-base-lang</artifactId>
82 </exclusion>
83 </exclusions>
84 <scope>test</scope>
85 </dependency>
86 <dependency>
87 <groupId>org.ops4j.base</groupId>
88 <artifactId>ops4j-base-util-property</artifactId>
89 <version>1.5.0</version>
90 <scope>test</scope>
91 </dependency>
92 <dependency>
93 <groupId>org.ops4j.base</groupId>
94 <artifactId>ops4j-base-lang</artifactId>
95 <version>1.5.0</version>
7496 <scope>test</scope>
7597 </dependency>
7698 <dependency>
167189 sis-netcdf|
168190 sis-utility|
169191 sis-storage|
192 unit-api|
170193 apache-mime4j-core|
171194 apache-mime4j-dom|
172 jsr-275|
173195 jhighlight|
174196 java-libpst|
175197 netcdf4|
205227 android.util;resolution:=optional,
206228 com.adobe.xmp;resolution:=optional,
207229 com.adobe.xmp.properties;resolution:=optional,
230 com.github.luben.zstd;resolution:=optional,
231 com.github.openjson;resolution:=optional,
208232 com.google.protobuf;resolution:=optional,
209233 com.ibm.icu.text;resolution:=optional,
210234 com.sleepycat.je;resolution:=optional,
253277 org.apache.pdfbox.debugger;resolution:=optional,
254278 org.apache.sis;resolution:=optional,
255279 org.apache.sis.distance;resolution:=optional,
280 org.apache.sis.feature;resolution:=optional,
256281 org.apache.sis.geometry;resolution:=optional,
282 org.apache.sis.internal.feature;resolution:=optional,
283 org.apache.sis.internal.referencing;resolution:=optional,
284 org.apache.sis.parameter;resolution:=optional,
285 org.apache.sis.referencing;resolution:=optional,
257286 org.apache.tools.ant;resolution:=optional,
258287 org.apache.tools.ant.taskdefs;resolution:=optional,
259288 org.apache.tools.ant.types;resolution:=optional,
294323 org.jdom2.output;resolution:=optional,
295324 org.jdom2.filter;resolution:=optional,
296325 org.json.simple;resolution:=optional,
297 org.json;resolution:=optional,
298326 org.openxmlformats.schemas.officeDocument.x2006.math;resolution:=optional,
299327 org.openxmlformats.schemas.schemaLibrary.x2006.main;resolution:=optional,
300328 org.osgi.framework;resolution:=optional,
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.17</version>
27 <version>1.18</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030
240240 Parser returnParser = null;
241241 if (p != null) {
242242 if (p instanceof ParserDecorator) {
243 p = ((ParserDecorator)p).getWrappedParser();
243 p = findInDecorated((ParserDecorator)p, clazz);
244244 }
245245 if (equals(p, clazz)) {
246246 return p;
254254 }
255255
256256 return null;
257 }
258
259 private static Parser findInDecorated(ParserDecorator p, Class clazz) {
260 Parser candidate = p.getWrappedParser();
261 if (equals(candidate, clazz)) {
262 return candidate;
263 }
264 if (candidate instanceof ParserDecorator) {
265 candidate = findInDecorated((ParserDecorator)candidate, clazz);
266 }
267 return candidate;
257268 }
258269
259270 private static Parser findInComposite(CompositeParser p, Class clazz, ParseContext context) {
264275 return candidate;
265276 }
266277 if (candidate instanceof ParserDecorator) {
267 candidate = ((ParserDecorator)candidate).getWrappedParser();
278 candidate = findInDecorated((ParserDecorator)candidate, clazz);
268279 }
269280 if (equals(candidate, clazz)) {
270281 return candidate;
229229 break;
230230 }
231231 }
232 if (i < 0) {
233 throw new IOException("Buffer underun; expected one more byte");
234 }
232235 return v;
233236 }
234237
6969 * The unit tests for this class are in the tika-parsers module.
7070 * </p>
7171 */
72 public class RecursiveParserWrapper implements Parser {
72 public class RecursiveParserWrapper extends ParserDecorator {
7373
7474 /**
7575 * Generated serial version
125125 */
126126 public RecursiveParserWrapper(Parser wrappedParser,
127127 ContentHandlerFactory contentHandlerFactory, boolean catchEmbeddedExceptions) {
128 super(wrappedParser);
128129 this.wrappedParser = wrappedParser;
129130 this.contentHandlerFactory = contentHandlerFactory;
130131 this.catchEmbeddedExceptions = catchEmbeddedExceptions;
3030 * ({@link #characters(char[], int, int)} or
3131 * {@link #ignorableWhitespace(char[], int, int)}) passed to the decorated
3232 * content handler contain only valid XML characters. All invalid characters
33 * are replaced with spaces.
33 * are replaced with the Unicode replacement character U+FFFD (though a
34 * subclass may change this by overriding the {@link #writeReplacement(Output)} method).
3435 * <p>
3536 * The XML standard defines the following Unicode character ranges as
3637 * valid XML characters:
158158 * @return the regular expression containing the most important technical standard organizations.
159159 */
160160 public static String getOrganzationsRegex() {
161 String regex = "(" + String.join("|", organizations.keySet()) + ")";
162
163 return regex;
161 StringBuilder sb = new StringBuilder();
162 sb.append("(");
163 int i = 0;
164 for (String org : organizations.keySet()) {
165 if (i > 0) {
166 sb.append("|");
167 }
168 sb.append(org);
169 i++;
170 }
171 sb.append(")");
172 return sb.toString();
164173 }
165174 }
118118 <mime-type type="application/cnrp+xml"/>
119119 <mime-type type="application/commonground"/>
120120 <mime-type type="application/conference-info+xml"/>
121
122 <mime-type type="application/coreldraw">
123 <alias type="application/x-coreldraw"/>
124 <alias type="application/x-cdr"/>
125 <alias type="application/cdr"/>
126 <alias type="image/x-cdr"/>
127 <alias type="image/cdr"/>
128 <_comment>CorelDraw</_comment>
129 <_comment>cdr: CorelDraw</_comment>
130 <_comment>des: CorelDraw X4 and newer</_comment>
131 <magic priority="60">
132 <match value="RIFF" type="string" offset="0">
133 <match value="CDR" type="string" offset="8" />
134 <match value="cdr" type="string" offset="8" />
135 <match value="DES" type="string" offset="8" />
136 <match value="des" type="string" offset="8" />
137 </match>
138 </magic>
139 <glob pattern="*.cdr"/>
140 </mime-type>
141
121142 <mime-type type="application/cpl+xml"/>
122143 <mime-type type="application/csta+xml"/>
123144 <mime-type type="application/cstadata+xml"/>
347368 <alias type="application/mac-binhex"/>
348369 <alias type="application/binhex"/>
349370 <magic priority="50">
350 <match value="must\ be\ converted\ with\ BinHex" type="string" offset="11"/>
371 <match value="must be converted with BinHex" type="string" offset="11"/>
351372 </magic>
352373 <glob pattern="*.hqx"/>
353374 </mime-type>
839860 <mime-type type="application/smil+xml">
840861 <alias type="application/smil"/>
841862 <_comment>SMIL Multimedia</_comment>
863 <root-XML localName="smil"/>
864 <sub-class-of type="application/xml"/>
842865 <glob pattern="*.smi"/>
843866 <glob pattern="*.smil"/>
844867 <glob pattern="*.sml"/>
13901413 <mime-type type="application/vnd.intu.qfx">
13911414 <glob pattern="*.qfx"/>
13921415 </mime-type>
1416 <mime-type type="application/vnd.iptc.g2.catalogitem+xml"/>
13931417 <mime-type type="application/vnd.iptc.g2.conceptitem+xml"/>
13941418 <mime-type type="application/vnd.iptc.g2.knowledgeitem+xml"/>
13951419 <mime-type type="application/vnd.iptc.g2.newsitem+xml"/>
1420
1421 <mime-type type="application/vnd.iptc.g2.newsmessage+xml">
1422 <root-XML localName="newsMessage"/>
1423 <root-XML localName="newsMessage" namespaceURI="http://iptc.org/std/nar/2006-10-01/"/>
1424 <sub-class-of type="application/xml"/>
1425 <_comment>XML syntax for IPTC NewsMessages</_comment>
1426 <glob pattern="*.nar"/>
1427 </mime-type>
1428
13961429 <mime-type type="application/vnd.iptc.g2.packageitem+xml"/>
1430 <mime-type type="application/vnd.iptc.g2.planningitem+xml"/>
1431
13971432 <mime-type type="application/vnd.ipunplugged.rcprofile">
13981433 <glob pattern="*.rcprofile"/>
13991434 </mime-type>
27742809 <mime-type type="application/wspolicy+xml">
27752810 <glob pattern="*.wspolicy"/>
27762811 </mime-type>
2812
2813 <mime-type type="image/x-tga">
2814 <alias type="image/x-targa"/>
2815 <!-- trailer bytes: 54 52 55 45 56 49 53 49 4F 4E 2D 58 46 49 4C 45 2E 00
2816 trailer as string: TRUEVISION-XFILE\\x2E\\x00
2817 Some .tga files may be conflicting with application/x-123 recognition,
2818 therefore this mime-type must be set in front of application/x-123 -->
2819 <_comment>Targa image data</_comment>
2820 <magic priority="90">
2821 <match value="0x01010000" type="big32" offset="1" >
2822 <match value=".*[\\x54\\x52\\x55\\x45\\x56\\x49\\x53\\x49\\x4F\\x4E\\x2D\\x58\\x46\\x49\\x4C\\x45\\x2E\\x00]" type="regex" offset="8" />
2823 </match>
2824 <match value="0x00020000" type="big32" offset="1" >
2825 <match value=".*[\\x54\\x52\\x55\\x45\\x56\\x49\\x53\\x49\\x4F\\x4E\\x2D\\x58\\x46\\x49\\x4C\\x45\\x2E\\x00]" type="regex" offset="8" />
2826 </match>
2827 <match value="0x00030000" type="big32" offset="1" >
2828 <match value=".*[\\x54\\x52\\x55\\x45\\x56\\x49\\x53\\x49\\x4F\\x4E\\x2D\\x58\\x46\\x49\\x4C\\x45\\x2E\\x00]" type="regex" offset="8" />
2829 </match>
2830 </magic>
2831 <glob pattern="*.tga"/>
2832 <glob pattern="*.icb"/>
2833 <glob pattern="*.vda"/>
2834 <!-- <glob pattern="*.vst"/> --> <!-- conflicting with application/vnd.visio-->
2835 </mime-type>
27772836
27782837 <mime-type type="application/x-123">
27792838 <magic priority="50">
30753134 <match value="bplist" type="string" offset="0"/>
30763135 </magic>
30773136 </mime-type>
3137 <mime-type type="application/x-gtar">
3138 <_comment>GNU tar Compressed File Archive (GNU Tape Archive)</_comment>
3139 <magic priority="50">
3140 <!-- GNU tar archive -->
3141 <match value="ustar \0" type="string" offset="257" />
3142 </magic>
3143 <glob pattern="*.gtar"/>
3144 <sub-class-of type="application/x-tar"/>
3145 </mime-type>
3146
3147 <mime-type type="application/x-brotli">
3148 <glob pattern="*.br" />
3149 <glob pattern="*.brotli" />
3150 </mime-type>
30783151
30793152 <mime-type type="application/x-bzip">
30803153 <magic priority="40">
34523525 <glob pattern="*.tgz" />
34533526 <glob pattern="*-gz" />
34543527 </mime-type>
3455
3528 <mime-type type="application/zstd">
3529 <_comment>https://en.wikipedia.org/wiki/Zstandard</_comment>
3530 <_comment>https://tools.ietf.org/id/draft-kucherawy-dispatch-zstd-01.html</_comment>
3531 <magic priority="50">
3532 <match value="0xFD2FB528" type="little32" offset="0"/>
3533 </magic>
3534 <glob pattern="*.zstd"/>
3535 </mime-type>
34563536 <mime-type type="application/x-hdf">
34573537 <_comment>Hierarchical Data Format File</_comment>
34583538 <magic priority="50">
35913671 <match value="-lz5-" type="string" offset="2"/>
35923672 </magic>
35933673 </mime-type>
3674
3675 <mime-type type="application/x-lz4">
3676 <_comment>First match LZ4 Frame</_comment>
3677 <_comment>Second match Legacy Frame</_comment>
3678 <magic priority="60">
3679 <match value="0x184d2204" type="little32" offset="0" />
3680 <match value="0x184c2102" type="little32" offset="0" />
3681 </magic>
3682 <glob pattern="*.lz4"/>
3683 </mime-type>
3684
3685 <mime-type type="application/x-lzip">
3686 <_comment>Lzip (LZMA) compressed archive</_comment>
3687 <magic priority="50">
3688 <match value="\x4c\x5a\x49\x50" type="string" offset="0"/>
3689 </magic>
3690 <glob pattern="*.lz"/>
3691 </mime-type>
3692
3693 <mime-type type="application/x-lzma">
3694 <_comment>LZMA compressed archive</_comment>
3695 <glob pattern="*.lzma"/>
3696 </mime-type>
35943697
35953698 <mime-type type="application/x-mobipocket-ebook">
35963699 <acronym>MOBI</acronym>
40024105 <acronym>ESRI Shapefiles</acronym>
40034106 <_comment>ESRI Shapefiles</_comment>
40044107 <magic priority="60">
4005 <match value="0x0000270a" type="big32" offset="2" />
4108 <match value="0x0000270a" type="big32" offset="0" />
40064109 </magic>
40074110 <glob pattern="*.shp"/>
40084111 </mime-type>
47404843 <glob pattern="*.aac"/>
47414844 </mime-type>
47424845
4743 <mime-type type="audio/x-adbcm">
4846 <mime-type type="audio/x-adpcm">
47444847 <magic priority="20">
47454848 <match value=".snd" type="string" offset="0">
47464849 <match value="23" type="big32" offset="12"/>
47664869 <glob pattern="*.aiff"/>
47674870 <glob pattern="*.aifc"/>
47684871 </mime-type>
4872
4873 <mime-type type="audio/x-caf">
4874 <_comment>Core Audio Format</_comment>
4875 <_comment>com.apple.coreaudio-format</_comment>
4876 <magic priority="60">
4877 <match value="caff" type="string" offset="0" />
4878 </magic>
4879 <glob pattern="*.caf"/>
4880 </mime-type>
47694881
47704882 <mime-type type="audio/x-dec-basic">
47714883 <magic priority="20">
47814893 </magic>
47824894 </mime-type>
47834895
4784 <mime-type type="audio/x-dec-adbcm">
4896 <mime-type type="audio/x-dec-adpcm">
47854897 <magic priority="20">
47864898 <match value="0x0064732E" type="big32" offset="0">
47874899 <match value="23" type="big32" offset="12"/>
56125724 <magic priority="50">
56135725 <match value="Delivered-To:" type="string" offset="0"/>
56145726 <match value="Status:" type="string" offset="0"/>
5615 <match value="X-Mozilla-Keys:" type="string" offset="0"/>
5616 <match value="X-Mozilla-Status:" type="string" offset="0"/>
5617 <match value="X-Mozilla-Status2:" type="string" offset="0"/>
56185727 <match value="Relay-Version:" type="stringignorecase" offset="0"/>
56195728 <match value="#!\ rnews" type="string" offset="0"/>
56205729 <match value="N#!\ rnews" type="string" offset="0"/>
56245733 <match value="From:" type="stringignorecase" offset="0"/>
56255734 <match value="Received:" type="stringignorecase" offset="0"/>
56265735 <match value="Message-ID:" type="stringignorecase" offset="0"/>
5736 <match value="\nReturn-Path:" type="stringignorecase" offset="0:1000"/>
5737 <match value="\nX-Originating-IP:" type="stringignorecase" offset="0:1000"/>
5738 <match value="\nReceived:" type="stringignorecase" offset="0:1000"/>
56275739 <match value="Date:" type="string" offset="0"/>
56285740 <match value="User-Agent:" type="string" offset="0"/>
56295741 <match value="MIME-Version:" type="stringignorecase" offset="0"/>
56315743 <match value="X-Notes-Item:" type="string" offset="0">
56325744 <match value="Message-ID:" type="string" offset="0:8192"/>
56335745 </match>
5746 <match value="X-" type="stringignorecase" offset="0">
5747 <match value="\nMessage-ID:" type="string" offset="0:8192"/>
5748 <match value="\nFrom:" type="stringignorecase" offset="0:8192"/>
5749 <match value="\nTo:" type="stringignorecase" offset="0:8192"/>
5750 <match value="\nSubject:" type="string" offset="0:8192"/>
5751 <match value="\nMIME-Version:" type="stringignorecase" offset="0:8192"/>
5752 </match>
5753 <match value="DKIM-" type="string" offset="0">
5754 <match value="\nMessage-ID:" type="string" offset="0:8192"/>
5755 <match value="\nFrom:" type="stringignorecase" offset="0:8192"/>
5756 <match value="\nTo:" type="stringignorecase" offset="0:8192"/>
5757 <match value="\nSubject:" type="string" offset="0:8192"/>
5758 <match value="\nMIME-Version:" type="stringignorecase" offset="0:8192"/>
5759 </match>
5760 </magic>
5761 <magic priority="40">
5762 <!-- lower priority than message/news -->
5763 <match value="\nMessage-ID:" type="stringignorecase" offset="0:1000"/>
56345764 </magic>
56355765 <glob pattern="*.eml"/>
56365766 <glob pattern="*.mime"/>
212212 return getRecursiveMetadata(filePath, new ParseContext());
213213 }
214214
215 protected List<Metadata> getRecursiveMetadata(String filePath, Metadata metadata) throws Exception {
216 return getRecursiveMetadata(filePath, new ParseContext(), metadata);
217 }
218
219 protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context, Metadata metadata) throws Exception {
220 Parser p = new AutoDetectParser();
221 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p,
222 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
223 try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) {
224 wrapper.parse(is, new DefaultHandler(), metadata, context);
225 }
226 return wrapper.getMetadata();
227 }
228
215229 protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception {
216230 Parser p = new AutoDetectParser();
217231 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p,
2323 <parent>
2424 <groupId>org.apache.tika</groupId>
2525 <artifactId>tika-parent</artifactId>
26 <version>1.17</version>
26 <version>1.18</version>
2727 <relativePath>../tika-parent/pom.xml</relativePath>
2828 </parent>
2929
4545 <artifactId>tika-parsers</artifactId>
4646 <version>${project.version}</version>
4747 <scope>provided</scope>
48 <exclusions>
49 <exclusion>
50 <groupId>joda-time</groupId>
51 <artifactId>joda-time</artifactId>
52 </exclusion>
53 </exclusions>
4854 </dependency>
4955 <dependency>
5056 <groupId>junit</groupId>
6369 <groupId>org.json</groupId>
6470 <artifactId>json</artifactId>
6571 </exclusion>
66 </exclusions>
72 <exclusion>
73 <groupId>com.google.guava</groupId>
74 <artifactId>guava</artifactId>
75 </exclusion>
76 <exclusion>
77 <groupId>org.deeplearning4j</groupId>
78 <artifactId>deeplearning4j-modelimport</artifactId>
79 </exclusion>
80 <exclusion>
81 <groupId>org.apache.commons</groupId>
82 <artifactId>commons-compress</artifactId>
83 </exclusion>
84 <exclusion>
85 <groupId>org.apache.commons</groupId>
86 <artifactId>commons-math3</artifactId>
87 </exclusion>
88 <exclusion>
89 <groupId>commons-io</groupId>
90 <artifactId>commons-io</artifactId>
91 </exclusion>
92 </exclusions>
93 </dependency>
94 <dependency>
95 <groupId>org.apache.commons</groupId>
96 <artifactId>commons-math3</artifactId>
97 <version>3.4.1</version>
6798 </dependency>
6899 <dependency>
69100 <groupId>org.deeplearning4j</groupId>
74105 <groupId>org.deeplearning4j</groupId>
75106 <artifactId>deeplearning4j-keras</artifactId>
76107 </exclusion>
108 <exclusion>
109 <groupId>org.bytedeco</groupId>
110 <artifactId>javacpp</artifactId>
111 </exclusion>
112 <exclusion>
113 <groupId>joda-time</groupId>
114 <artifactId>joda-time</artifactId>
115 </exclusion>
77116 </exclusions>
78117 </dependency>
79118 <dependency>
80119 <groupId>org.datavec</groupId>
81120 <artifactId>datavec-data-image</artifactId>
82121 <version>${dl4j.version}</version>
122 <exclusions>
123 <exclusion>
124 <groupId>com.google.guava</groupId>
125 <artifactId>guava</artifactId>
126 </exclusion>
127 <exclusion>
128 <groupId>org.bytedeco</groupId>
129 <artifactId>javacpp</artifactId>
130 </exclusion>
131 <exclusion>
132 <groupId>org.apache.commons</groupId>
133 <artifactId>commons-math3</artifactId>
134 </exclusion>
135 <exclusion>
136 <groupId>commons-io</groupId>
137 <artifactId>commons-io</artifactId>
138 </exclusion>
139 <exclusion>
140 <groupId>com.github.jai-imageio</groupId>
141 <artifactId>jai-imageio-core</artifactId>
142 </exclusion>
143 </exclusions>
83144 </dependency>
84145 <dependency>
85146 <groupId>org.nd4j</groupId>
86147 <artifactId>nd4j-native-platform</artifactId>
87148 <version>${dl4j.version}</version>
149 <exclusions>
150 <exclusion>
151 <groupId>org.bytedeco</groupId>
152 <artifactId>javacpp</artifactId>
153 </exclusion>
154 </exclusions>
155 </dependency>
156 <dependency>
157 <groupId>org.bytedeco</groupId>
158 <artifactId>javacpp</artifactId>
159 <version>1.3.2</version>
88160 </dependency>
89161 <dependency>
90162 <groupId>org.apache.commons</groupId>
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.17</version>
27 <version>1.18</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.17</version>
27 <version>1.18</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030
8989 <groupId>org.apache.jackrabbit</groupId>
9090 <artifactId>jackrabbit-jcr-server</artifactId>
9191 <version>2.3.6</version>
92 <exclusions>
93 <exclusion>
94 <groupId>org.apache.tika</groupId>
95 <artifactId>tika-core</artifactId>
96 </exclusion>
97 <exclusion>
98 <groupId>commons-codec</groupId>
99 <artifactId>commons-codec</artifactId>
100 </exclusion>
101 <exclusion>
102 <groupId>commons-io</groupId>
103 <artifactId>commons-io</artifactId>
104 </exclusion>
105 </exclusions>
92106 </dependency>
93107 <dependency>
94108 <groupId>org.apache.jackrabbit</groupId>
95109 <artifactId>jackrabbit-core</artifactId>
96110 <version>2.3.6</version>
111 <exclusions>
112 <exclusion>
113 <groupId>org.apache.tika</groupId>
114 <artifactId>tika-core</artifactId>
115 </exclusion>
116 <exclusion>
117 <groupId>commons-io</groupId>
118 <artifactId>commons-io</artifactId>
119 </exclusion>
120 <exclusion>
121 <groupId>org.apache.lucene</groupId>
122 <artifactId>lucene-core</artifactId>
123 </exclusion>
124 </exclusions>
97125 </dependency>
98126 <dependency>
99127 <groupId>org.apache.lucene</groupId>
108136 <dependency>
109137 <groupId>org.springframework</groupId>
110138 <artifactId>spring-context</artifactId>
111 <version>3.0.2.RELEASE</version>
139 <version>3.2.16.RELEASE</version>
112140 <exclusions>
113141 <exclusion>
114142 <groupId>commons-logging</groupId>
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.17</version>
27 <version>1.18</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030
8181
8282 Iterator<FileTypeDetector> iterator = serviceLoader.iterator();
8383 assertTrue(iterator.hasNext());
84
84
85 boolean foundTika = false;
8586 while(iterator.hasNext()) {
8687 FileTypeDetector fileTypeDetector = iterator.next();
8788 assertNotNull(fileTypeDetector);
88 assertTrue(fileTypeDetector instanceof TikaFileTypeDetector);
89 if (fileTypeDetector instanceof TikaFileTypeDetector) {
90 foundTika = true;
91 }
8992 }
93 //o.a.sis.internal.storage.StoreTypeDetector appears with latest upgrade
94 //check that TikaFileTypeDetector appears at all
95 assertTrue(foundTika);
9096 }
9197 }
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.17</version>
27 <version>1.18</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030
4242 <dependency>
4343 <groupId>com.optimaize.languagedetector</groupId>
4444 <artifactId>language-detector</artifactId>
45 <version>0.5</version>
45 <version>0.6</version>
46 <exclusions>
47 <exclusion>
48 <groupId>com.google.guava</groupId>
49 <artifactId>guava</artifactId>
50 </exclusion>
51 </exclusions>
52 </dependency>
53 <!-- exclude and then add back in to avoid
54 conflicts with edu.ucar:cdm in tika-parsers -->
55 <dependency>
56 <groupId>com.google.guava</groupId>
57 <artifactId>guava</artifactId>
58 <version>17.0</version>
4659 </dependency>
4760 <dependency>
4861 <groupId>org.apache.cxf</groupId>
5265 <dependency>
5366 <groupId>com.google.code.gson</groupId>
5467 <artifactId>gson</artifactId>
55 <version>2.6.1</version>
68 <version>${gson.version}</version>
5669 </dependency>
5770
5871 <!-- Test dependencies -->
2323 <parent>
2424 <groupId>org.apache.tika</groupId>
2525 <artifactId>tika-parent</artifactId>
26 <version>1.17</version>
26 <version>1.18</version>
2727 <relativePath>../tika-parent/pom.xml</relativePath>
2828 </parent>
2929
6363 <groupId>edu.usc.ir</groupId>
6464 <artifactId>age-predictor-api</artifactId>
6565 <version>1.0</version>
66 </dependency>
67
66 <exclusions>
67 <exclusion>
68 <groupId>com.google.guava</groupId>
69 <artifactId>guava</artifactId>
70 </exclusion>
71 <exclusion>
72 <groupId>commons-lang</groupId>
73 <artifactId>commons-lang</artifactId>
74 </exclusion>
75 <exclusion>
76 <groupId>commons-compress</groupId>
77 <artifactId>commons-compress</artifactId>
78 </exclusion>
79 <exclusion>
80 <groupId>org.xerial.snappy</groupId>
81 <artifactId>snappy-java</artifactId>
82 </exclusion>
83 <exclusion>
84 <groupId>com.fasterxml.jackson.core</groupId>
85 <artifactId>jackson-core</artifactId>
86 </exclusion>
87 <exclusion>
88 <groupId>com.fasterxml.jackson.core</groupId>
89 <artifactId>jackson-databind</artifactId>
90 </exclusion>
91 <exclusion>
92 <groupId>com.fasterxml.jackson.core</groupId>
93 <artifactId>jackson-annotations</artifactId>
94 </exclusion>
95 <exclusion>
96 <groupId>org.codehaus.jackson</groupId>
97 <artifactId>jackson-mapper-asl</artifactId>
98 </exclusion>
99 <exclusion>
100 <groupId>log4j</groupId>
101 <artifactId>log4j</artifactId>
102 </exclusion>
103 <exclusion>
104 <groupId>commons-codec</groupId>
105 <artifactId>commons-codec</artifactId>
106 </exclusion>
107 <exclusion>
108 <groupId>commons-io</groupId>
109 <artifactId>commons-io</artifactId>
110 </exclusion>
111 <exclusion>
112 <groupId>com.thoughtworks.paranamer</groupId>
113 <artifactId>paranamer</artifactId>
114 </exclusion>
115 <exclusion>
116 <groupId>commons-net</groupId>
117 <artifactId>commons-net</artifactId>
118 </exclusion>
119 <exclusion>
120 <groupId>org.scala-lang</groupId>
121 <artifactId>scala-library</artifactId>
122 </exclusion>
123 <exclusion>
124 <groupId>org.scala-lang</groupId>
125 <artifactId>scala-reflect</artifactId>
126 </exclusion>
127 <exclusion>
128 <groupId>org.scalamacros</groupId>
129 <artifactId>quasiquotes_2.10</artifactId>
130 </exclusion>
131 <exclusion>
132 <groupId>org.codehaus.jackson</groupId>
133 <artifactId>jackson-core-asl</artifactId>
134 </exclusion>
135 <exclusion>
136 <groupId>org.apache.avro</groupId>
137 <artifactId>avro</artifactId>
138 </exclusion>
139 </exclusions>
140 </dependency>
141 <dependency>
142 <groupId>org.scalamacros</groupId>
143 <artifactId>quasiquotes_2.10</artifactId>
144 <version>2.0.0-M8</version>
145 <exclusions>
146 <exclusion>
147 <groupId>org.scala-lang</groupId>
148 <artifactId>scala-reflect</artifactId>
149 </exclusion>
150 <exclusion>
151 <groupId>org.scala-lang</groupId>
152 <artifactId>scala-library</artifactId>
153 </exclusion>
154 </exclusions>
155 </dependency>
156 <dependency>
157 <groupId>org.scala-lang</groupId>
158 <artifactId>scala-library</artifactId>
159 <version>2.10.6</version>
160 </dependency>
161 <dependency>
162 <groupId>org.scala-lang</groupId>
163 <artifactId>scala-reflect</artifactId>
164 <version>2.10.6</version>
165 </dependency>
166 <dependency>
167 <groupId>commons-net</groupId>
168 <artifactId>commons-net</artifactId>
169 <version>3.1</version>
170 </dependency>
171 <dependency>
172 <groupId>com.thoughtworks.paranamer</groupId>
173 <artifactId>paranamer</artifactId>
174 <version>2.6</version>
175 </dependency>
176 <dependency>
177 <groupId>org.xerial.snappy</groupId>
178 <artifactId>snappy-java</artifactId>
179 <version>1.1.2.4</version>
180 </dependency>
181 <dependency>
182 <groupId>org.codehaus.jackson</groupId>
183 <artifactId>jackson-mapper-asl</artifactId>
184 <version>1.9.13</version>
185 </dependency>
186 <dependency>
187 <groupId>com.fasterxml.jackson.core</groupId>
188 <artifactId>jackson-databind</artifactId>
189 <version>${jackson.version}</version>
190 <exclusions>
191 <exclusion>
192 <groupId>com.fasterxml.jackson.core</groupId>
193 <artifactId>jackson-annotations</artifactId>
194 </exclusion>
195 </exclusions>
196 </dependency>
197 <dependency>
198 <groupId>com.fasterxml.jackson.core</groupId>
199 <artifactId>jackson-annotations</artifactId>
200 <version>${jackson.version}</version>
201 </dependency>
68202 <!-- Test dependencies -->
69203 <dependency>
70204 <groupId>junit</groupId>
73207 <dependency>
74208 <groupId>org.mockito</groupId>
75209 <artifactId>mockito-core</artifactId>
76 <version>1.7</version>
210 <version>2.15.0</version>
77211 <scope>test</scope>
78212 </dependency>
79213 <dependency>
3030
3131 <groupId>org.apache.tika</groupId>
3232 <artifactId>tika-parent</artifactId>
33 <version>1.17</version>
33 <version>1.18</version>
3434 <packaging>pom</packaging>
3535
3636 <name>Apache Tika parent</name>
305305 <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
306306 <project.reporting.outputEncoding>${project.build.sourceEncoding}</project.reporting.outputEncoding>
307307 <!-- NOTE: sync tukaani version with commons-compress in tika-parsers -->
308 <commons.compress.version>1.14</commons.compress.version>
309 <commons.io.version>2.5</commons.io.version>
308 <commons.compress.version>1.16.1</commons.compress.version>
309 <commons.io.version>2.6</commons.io.version>
310 <gson.version>2.8.1</gson.version>
310311 <cxf.version>3.0.16</cxf.version>
311312 <slf4j.version>1.7.24</slf4j.version>
313 <jackson.version>2.9.5</jackson.version>
312314 </properties>
313315
314316 <build>
324326 <plugin>
325327 <groupId>de.thetaphi</groupId>
326328 <artifactId>forbiddenapis</artifactId>
327 <version>2.3</version>
329 <!-- if this version contains commons-io 2.6, remove hard-coded commons-io version below -->
330 <version>2.5</version>
328331 <configuration>
329332 <targetVersion>${maven.compiler.target}</targetVersion>
330333 <failOnUnresolvableSignatures>false</failOnUnresolvableSignatures>
375378 <version>1.9.5</version>
376379 </dependency>
377380 </dependencies>
381 </plugin>
382 <plugin>
383 <groupId>org.apache.maven.plugins</groupId>
384 <artifactId>maven-enforcer-plugin</artifactId>
385 <version>3.0.0-M1</version>
386 <executions>
387 <execution>
388 <id>enforce</id>
389 <configuration>
390 <rules>
391 <dependencyConvergence />
392 </rules>
393 </configuration>
394 <goals>
395 <goal>enforce</goal>
396 </goals>
397 </execution>
398 </executions>
378399 </plugin>
379400 </plugins>
380401 </build>
438459 <connection>scm:git:https://github.com/apache/</connection>
439460 <developerConnection>scm:git:https://github.com/apache/</developerConnection>
440461 <url>https://github.com/apache/tika</url>
441 <tag>1.17-rc2</tag>
462 <tag>1.18-rc3</tag>
442463 </scm>
443464 </project>
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.17</version>
27 <version>1.18</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030
3838 <!-- NOTE: sync codec version with POI -->
3939 <codec.version>1.10</codec.version>
4040 <!-- NOTE: sync tukaani version with commons-compress in tika-parent-->
41 <tukaani.version>1.6</tukaani.version>
41 <tukaani.version>1.8</tukaani.version>
42 <!-- NOTE: sync brotli version with commons-compress in tika-parent-->
43 <brotli.version>0.1.2</brotli.version>
4244 <mime4j.version>0.8.1</mime4j.version>
4345 <vorbis.version>0.8</vorbis.version>
44 <pdfbox.version>2.0.8</pdfbox.version>
46 <pdfbox.version>2.0.9</pdfbox.version>
4547 <jempbox.version>1.8.13</jempbox.version>
4648 <netcdf-java.version>4.5.5</netcdf-java.version>
47 <sis.version>0.6</sis.version>
49 <sis.version>0.8</sis.version>
4850 <!-- used by POI, PDFBox and Jackcess ...try to sync -->
4951 <bouncycastle.version>1.54</bouncycastle.version>
5052 <commonsexec.version>1.3</commonsexec.version>
8082 <groupId>org.gagravarr</groupId>
8183 <artifactId>vorbis-java-tika</artifactId>
8284 <version>${vorbis.version}</version>
85 <exclusions>
86 <exclusion>
87 <groupId>org.apache.tika</groupId>
88 <artifactId>tika-core</artifactId>
89 </exclusion>
90 </exclusions>
8391 </dependency>
8492 <dependency>
8593 <groupId>com.healthmarketscience.jackcess</groupId>
8694 <artifactId>jackcess</artifactId>
87 <version>2.1.8</version>
95 <version>2.1.10</version>
8896 <exclusions>
8997 <exclusion>
9098 <groupId>commons-logging</groupId>
95103 <dependency>
96104 <groupId>com.healthmarketscience.jackcess</groupId>
97105 <artifactId>jackcess-encrypt</artifactId>
98 <version>2.1.2</version>
106 <version>2.1.4</version>
99107 <exclusions>
100108 <exclusion>
101109 <groupId>org.bouncycastle</groupId>
102110 <artifactId>bcprov-jdk15on</artifactId>
111 </exclusion>
112 <!-- to avoid maven-enforcer convergence error,
113 let's make this explicit -->
114 <exclusion>
115 <groupId>com.healthmarketscience.jackcess</groupId>
116 <artifactId>jackcess</artifactId>
103117 </exclusion>
104118 </exclusions>
105119 </dependency>
136150 <groupId>org.tukaani</groupId>
137151 <artifactId>xz</artifactId>
138152 <version>${tukaani.version}</version>
153 </dependency>
154 <dependency>
155 <groupId>org.brotli</groupId>
156 <artifactId>dec</artifactId>
157 <version>${brotli.version}</version>
158 </dependency>
159 <dependency>
160 <groupId>com.github.luben</groupId>
161 <artifactId>zstd-jni</artifactId>
162 <version>1.3.3-3</version>
163 <scope>provided</scope>
139164 </dependency>
140165
141166 <dependency>
315340 <dependency>
316341 <groupId>org.apache.opennlp</groupId>
317342 <artifactId>opennlp-tools</artifactId>
318 <version>1.8.3</version>
343 <version>1.8.4</version>
319344 </dependency>
320345
321346 <dependency>
336361 </exclusions>
337362 </dependency>
338363
339 <dependency>
364 <!-- <dependency>
340365 <groupId>com.tdunning</groupId>
341366 <artifactId>json</artifactId>
342367 <version>1.8</version>
368 </dependency> -->
369 <dependency>
370 <groupId>com.github.openjson</groupId>
371 <artifactId>openjson</artifactId>
372 <version>1.0.10</version>
343373 </dependency>
344374 <dependency>
345375 <groupId>com.google.code.gson</groupId>
346376 <artifactId>gson</artifactId>
347 <version>2.8.1</version>
377 <version>${gson.version}</version>
348378 </dependency>
349379
350380 <!-- logging dependencies -->
369399 <dependency>
370400 <groupId>org.mockito</groupId>
371401 <artifactId>mockito-core</artifactId>
372 <version>1.7</version>
402 <version>2.15.0</version>
373403 <scope>test</scope>
374404 </dependency>
375405 <dependency>
389419 <groupId>commons-logging</groupId>
390420 <artifactId>commons-logging</artifactId>
391421 </exclusion>
422 <exclusion>
423 <groupId>org.jdom</groupId>
424 <artifactId>jdom2</artifactId>
425 </exclusion>
392426 </exclusions>
393427 </dependency>
394428 <dependency>
400434 <groupId>edu.ucar</groupId>
401435 <artifactId>jj2000</artifactId>
402436 </exclusion>
403 </exclusions>
404 </dependency>
405 <dependency>
437 <exclusion>
438 <groupId>org.jsoup</groupId>
439 <artifactId>jsoup</artifactId>
440 </exclusion>
441 <exclusion>
442 <groupId>org.jdom</groupId>
443 <artifactId>jdom2</artifactId>
444 </exclusion>
445 </exclusions>
446 </dependency>
447 <!-- grib's current jsoup is vulnerable to xss
448 exclude and import a more modern version TIKA-2561-->
449 <dependency>
450 <groupId>org.jsoup</groupId>
451 <artifactId>jsoup</artifactId>
452 <version>1.11.2</version>
453 </dependency> <dependency>
406454 <groupId>edu.ucar</groupId>
407455 <artifactId>cdm</artifactId>
408456 <version>${netcdf-java.version}</version>
415463 <groupId>org.slf4j</groupId>
416464 <artifactId>jcl-over-slf4j</artifactId>
417465 </exclusion>
466 <exclusion>
467 <groupId>org.apache.httpcomponents</groupId>
468 <artifactId>httpcore</artifactId>
469 </exclusion>
470 <exclusion>
471 <groupId>org.jdom</groupId>
472 <artifactId>jdom2</artifactId>
473 </exclusion>
418474 </exclusions>
419475 </dependency>
420476 <dependency>
433489 </exclusion>
434490 <exclusion>
435491 <groupId>org.apache.httpcomponents</groupId>
492 <artifactId>httpcore</artifactId>
493 </exclusion> <exclusion>
494 <groupId>org.apache.httpcomponents</groupId>
436495 <artifactId>httpmime</artifactId>
437496 </exclusion>
438497 </exclusions>
480539 <dependency>
481540 <groupId>org.opengis</groupId>
482541 <artifactId>geoapi</artifactId>
483 <version>3.0.0</version>
542 <version>3.0.1</version>
484543 </dependency>
485544
486545 <dependency>
536595 <dependency>
537596 <groupId>org.apache.ctakes</groupId>
538597 <artifactId>ctakes-core</artifactId>
539 <version>3.2.2</version>
598 <version>4.0.0</version>
540599 <scope>provided</scope>
541600 <exclusions>
542601 <exclusion>
563622 <groupId>org.springframework</groupId>
564623 <artifactId>spring-core</artifactId>
565624 </exclusion>
566 </exclusions>
567 </dependency>
568
625 <exclusion>
626 <groupId>org.apache.opennlp</groupId>
627 <artifactId>opennlp-tools</artifactId>
628 </exclusion>
629 <exclusion>
630 <groupId>com.google.guava</groupId>
631 <artifactId>guava</artifactId>
632 </exclusion>
633 <exclusion>
634 <groupId>commons-io</groupId>
635 <artifactId>commons-io</artifactId>
636 </exclusion>
637 <exclusion>
638 <groupId>org.apache.uima</groupId>
639 <artifactId>uimafit-core</artifactId>
640 </exclusion>
641 <exclusion>
642 <groupId>org.apache.uima</groupId>
643 <artifactId>uimaj-core</artifactId>
644 </exclusion>
645 <exclusion>
646 <groupId>org.jdom</groupId>
647 <artifactId>jdom2</artifactId>
648 </exclusion>
649 </exclusions>
650 </dependency>
651 <!-- need to specify this to avoid
652 version clash within ctakes-core 4.0.0 -->
653 <dependency>
654 <groupId>org.apache.uima</groupId>
655 <artifactId>uimafit-core</artifactId>
656 <version>2.2.0</version>
657 <exclusions>
658 <exclusion>
659 <groupId>org.apache.uima</groupId>
660 <artifactId>uimaj-core</artifactId>
661 </exclusion>
662 <exclusion>
663 <groupId>commons-io</groupId>
664 <artifactId>commons-io</artifactId>
665 </exclusion>
666 </exclusions>
667 </dependency>
668 <!-- need to specify this to avoid
669 version clash within ctakes-core 4.0.0 -->
670 <dependency>
671 <groupId>org.apache.uima</groupId>
672 <artifactId>uimaj-core</artifactId>
673 <version>2.9.0</version>
674 </dependency>
675
676 <dependency>
677 <groupId>org.jdom</groupId>
678 <artifactId>jdom2</artifactId>
679 <version>2.0.6</version>
680 </dependency>
569681 <!--Jackson parse String to JSON-->
570682 <dependency>
571683 <groupId>com.fasterxml.jackson.core</groupId>
572684 <artifactId>jackson-core</artifactId>
573 <version>2.9.2</version>
574 </dependency>
575
576 <!-- Java ImageIO plugin for JBIG2 support (often used in PDF)
577 This jbig2 dep is not distributed with Tika due to licensing
578 issue (GPLV3). That's why it is included here as "test".
579 https://github.com/levigo/jbig2-imageio
580 -->
581 <dependency>
582 <groupId>com.levigo.jbig2</groupId>
583 <artifactId>levigo-jbig2-imageio</artifactId>
584 <version>1.6.5</version>
585 <scope>test</scope>
586 </dependency>
587 <!-- Copied from PDFBox:
588 For legal reasons (incompatible license), jai-imageio-core is to be used
589 only in the tests and may not be distributed. See also LEGAL-195-->
685 <version>${jackson.version}</version>
686 </dependency>
687 <!-- as of 2.9.5, jackson-databind is pulling in jackson-annotations 2.9.0
688 For now, we need to specify databind here with exclusion statement
689 -->
690 <dependency>
691 <groupId>com.fasterxml.jackson.core</groupId>
692 <artifactId>jackson-databind</artifactId>
693 <version>${jackson.version}</version>
694 <exclusions>
695 <exclusion>
696 <groupId>com.fasterxml.jackson.core</groupId>
697 <artifactId>jackson-annotations</artifactId>
698 </exclusion>
699 </exclusions>
700 </dependency>
701 <dependency>
702 <groupId>com.fasterxml.jackson.core</groupId>
703 <artifactId>jackson-annotations</artifactId>
704 <version>${jackson.version}</version>
705 </dependency>
706
707
708 <dependency>
709 <groupId>org.apache.pdfbox</groupId>
710 <artifactId>jbig2-imageio</artifactId>
711 <version>3.0.0</version>
712 </dependency>
713
714 <!-- jai-imageio-core is allowed since LEGAL-304 -->
590715 <dependency>
591716 <groupId>com.github.jai-imageio</groupId>
592717 <artifactId>jai-imageio-core</artifactId>
593718 <version>1.3.1</version>
594 <scope>test</scope>
595 </dependency>
719 </dependency>
720 <!-- For legal reasons (incompatible license), jai-imageio-jpeg2000 is to be used
721 only in the tests and may not be distributed. See also LEGAL-195 -->
596722 <dependency>
597723 <groupId>com.github.jai-imageio</groupId>
598724 <artifactId>jai-imageio-jpeg2000</artifactId>
599725 <version>1.3.0</version>
600726 <scope>test</scope>
727 <exclusions>
728 <exclusion>
729 <groupId>com.github.jai-imageio</groupId>
730 <artifactId>jai-imageio-core</artifactId>
731 </exclusion>
732 </exclusions>
601733 </dependency>
602734
603735 </dependencies>
1919
2020 import java.math.BigInteger;
2121 import java.util.ArrayList;
22 import java.util.HashSet;
2223 import java.util.List;
24 import java.util.Set;
2325
2426 import org.apache.tika.exception.TikaException;
2527 import org.apache.tika.parser.chm.core.ChmCommons;
136138
137139 /* loops over all pmgls */
138140 byte[] dir_chunk = null;
141 Set<Integer> processed = new HashSet<>();
139142 for (int i = startPmgl; i>=0; ) {
140143 dir_chunk = new byte[(int) chmItspHeader.getBlock_len()];
141144 int start = i * (int) chmItspHeader.getBlock_len() + dir_offset;
146149 PMGLheader = new ChmPmglHeader();
147150 PMGLheader.parse(dir_chunk, PMGLheader);
148151 enumerateOneSegment(dir_chunk);
149
150 i=PMGLheader.getBlockNext();
152 int nextBlock = PMGLheader.getBlockNext();
153 processed.add(i);
154 if (processed.contains(nextBlock)) {
155 throw new ChmParsingException("already processed block; avoiding cycle");
156 }
157 i=nextBlock;
151158 dir_chunk = null;
152159 }
160
153161 } catch (ChmParsingException e) {
154162 LOG.warn("Chm parse exception", e);
155163 } finally {
1515 */
1616 package org.apache.tika.parser.html;
1717
18 import java.io.BufferedReader;
1819 import java.io.IOException;
1920 import java.io.InputStream;
21 import java.io.InputStreamReader;
2022 import java.nio.ByteBuffer;
2123 import java.nio.charset.Charset;
24 import java.nio.charset.StandardCharsets;
25 import java.util.Collections;
26 import java.util.HashSet;
27 import java.util.Locale;
28 import java.util.Set;
2229 import java.util.regex.Matcher;
2330 import java.util.regex.Pattern;
2431
3845 */
3946 public class HtmlEncodingDetector implements EncodingDetector {
4047
48 /**
49 * HTML can include non-iana supported charsets that Java
50 * recognizes, e.g. "unicode". This can lead to incorrect detection/mojibake.
51 * Ignore charsets in html meta-headers that are not supported by IANA.
52 * See: TIKA-2592
53 */
54 private static Set<String> CHARSETS_UNSUPPORTED_BY_IANA;
55 static {
56 Set<String> unsupported = new HashSet<>();
57 try (BufferedReader reader =
58 new BufferedReader(
59 new InputStreamReader(
60 HtmlEncodingDetector.class
61 .getResourceAsStream("StandardCharsets_unsupported_by_IANA.txt"),
62 StandardCharsets.UTF_8))) {
63 String line = reader.readLine();
64 while (line != null) {
65 if (line.startsWith("#")) {
66 line = reader.readLine();
67 continue;
68 }
69 line = line.trim();
70 if (line.length() > 0) {
71 unsupported.add(line.toLowerCase(Locale.US));
72 }
73 line = reader.readLine();
74 }
75 } catch (IOException e) {
76 throw new IllegalArgumentException("couldn't find StandardCharsets_unsupported_by_IANA.txt on the class path");
77 }
78 CHARSETS_UNSUPPORTED_BY_IANA = Collections.unmodifiableSet(unsupported);
79 }
4180 // TIKA-357 - use bigger buffer for meta tag sniffing (was 4K)
4281 private static final int DEFAULT_MARK_LIMIT = 8192;
4382
111150 //that is valid
112151 while (charsetMatcher.find()) {
113152 String candCharset = charsetMatcher.group(1);
153 if (CHARSETS_UNSUPPORTED_BY_IANA.contains(candCharset.toLowerCase(Locale.US))) {
154 continue;
155 }
114156 if (CharsetUtils.isSupported(candCharset)) {
115157 try {
116158 return CharsetUtils.forName(candCharset);
2323 import java.nio.charset.StandardCharsets;
2424 import java.util.Arrays;
2525 import java.util.HashSet;
26 import java.util.List;
2627 import java.util.Locale;
2728 import java.util.Set;
2829 import java.util.regex.Matcher;
3536 import org.apache.tika.metadata.TikaCoreProperties;
3637 import org.apache.tika.mime.MediaType;
3738 import org.apache.tika.parser.ParseContext;
39 import org.apache.tika.parser.utils.DataURIScheme;
40 import org.apache.tika.parser.utils.DataURISchemeParseException;
41 import org.apache.tika.parser.utils.DataURISchemeUtil;
3842 import org.apache.tika.sax.TextContentHandler;
3943 import org.apache.tika.sax.XHTMLContentHandler;
4044 import org.xml.sax.Attributes;
5660 private final ParseContext context;
5761 private final boolean extractScripts;
5862 private final StringBuilder title = new StringBuilder();
63 private final DataURISchemeUtil dataURISchemeUtil = new DataURISchemeUtil();
5964 private int bodyLevel = 0;
6065 private int discardLevel = 0;
6166 private int titleLevel = 0;
168173 }
169174
170175 title.setLength(0);
176 String value = atts.getValue("src");
177 if (value != null && value.startsWith("data:")) {
178 handleDataURIScheme(value);
179 }
171180 }
172181
173182 /**
230239 // And resolve relative links. Eventually this should be pushed
231240 // into the HtmlMapper code.
232241 if (URI_ATTRIBUTES.contains(normAttrName)) {
242 //if this is a src="data: " element,
243 //we've handled that as an embedded file, don't include the full thing
244 //here
245 if (normAttrName.equals("src")) {
246 String v = newAttributes.getValue(att);
247 if (v.startsWith("data:")) {
248 newAttributes.setValue(att, "data:");
249 }
250 }
233251 newAttributes.setValue(att, resolve(newAttributes.getValue(att)));
234252 } else if (isObject && "codebase".equals(normAttrName)) {
235253 newAttributes.setValue(att, codebase);
295313 }
296314 }
297315
316 private void handleDataURIScheme(String string) throws SAXException {
317 DataURIScheme dataURIScheme = null;
318 try {
319 dataURIScheme = dataURISchemeUtil.parse(string);
320 } catch (DataURISchemeParseException e) {
321 //swallow
322 return;
323 }
324
325 //do anything with attrs?
326 Metadata m = new Metadata();
327 m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
328 TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
329 if (dataURIScheme.getMediaType() != null) {
330 m.set(Metadata.CONTENT_TYPE, dataURIScheme.getMediaType().toString());
331 }
332 EmbeddedDocumentExtractor embeddedDocumentExtractor =
333 EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
334 if (embeddedDocumentExtractor.shouldParseEmbedded(m)) {
335 try (InputStream stream = dataURIScheme.getInputStream()) {
336 embeddedDocumentExtractor.parseEmbedded(
337 stream, xhtml, m, false
338 );
339 } catch (IOException e) {
340 EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
341 }
342 }
343 }
344
298345 private void writeScript() throws SAXException {
299346 //don't write an attached macro if there is no content
300347 //we may want to revisit this behavior
312359
313360 EmbeddedDocumentExtractor embeddedDocumentExtractor =
314361 EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
362 //try to scrape dataURISchemes from javascript
363 List<DataURIScheme> dataURISchemes = dataURISchemeUtil.extract(script.toString());
364 for (DataURIScheme dataURIScheme : dataURISchemes) {
365 Metadata dataUriMetadata = new Metadata();
366 dataUriMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
367 TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
368 dataUriMetadata.set(Metadata.CONTENT_TYPE,
369 dataURIScheme.getMediaType().toString());
370 if (embeddedDocumentExtractor.shouldParseEmbedded(dataUriMetadata)) {
371 try (InputStream dataURISchemeInputStream = dataURIScheme.getInputStream()) {
372 embeddedDocumentExtractor.parseEmbedded(dataURISchemeInputStream,
373 xhtml, dataUriMetadata, false);
374 } catch (IOException e) {
375 //swallow
376 }
377 }
378 }
379
315380 try (InputStream stream = new ByteArrayInputStream(
316381 script.toString().getBytes(StandardCharsets.UTF_8))) {
317382 embeddedDocumentExtractor.parseEmbedded(
6565 MediaType.image("png"),
6666 MediaType.image("vnd.wap.wbmp"),
6767 MediaType.image("x-icon"),
68 MediaType.image("x-xcf")));
69 try {
70 Class.forName("com.levigo.jbig2.JBIG2ImageReader");
71 TMP_SUPPORTED.add(MediaType.image("x-jbig2"));
72 } catch (ClassNotFoundException e) {
73 }
68 MediaType.image("x-xcf"),
69 MediaType.image("x-jbig2")));
70 //add try/catch class.forName() for image types relying on
71 //provided dependencies
7472 }
7573
7674 private static final Set<MediaType> SUPPORTED_TYPES =
3232 import org.apache.james.mime4j.parser.ContentHandler;
3333 import org.apache.james.mime4j.stream.BodyDescriptor;
3434 import org.apache.james.mime4j.stream.Field;
35 import org.apache.tika.detect.Detector;
3536 import org.apache.tika.exception.TikaException;
3637 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
3738 import org.apache.tika.extractor.EmbeddedDocumentUtil;
146147 private boolean strictParsing = false;
147148 private final boolean extractAllAlternatives;
148149 private final EmbeddedDocumentExtractor extractor;
149
150 private final Detector detector;
150151 //this is used to buffer a multipart body that
151152 //keeps track of multipart/alternative and its children
152153 private Stack<Part> alternativePartBuffer = new Stack<>();
153154
154155 private Stack<BodyDescriptor> parts = new Stack<>();
155156
156 MailContentHandler(XHTMLContentHandler xhtml, Metadata metadata,
157 MailContentHandler(XHTMLContentHandler xhtml, Detector detector, Metadata metadata,
157158 ParseContext context, boolean strictParsing, boolean extractAllAlternatives) {
158159 this.handler = xhtml;
159160 this.metadata = metadata;
166167
167168 // Was an EmbeddedDocumentExtractor explicitly supplied?
168169 this.extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
170 this.detector = detector;
169171 }
170172
171173 @Override
183185 if (parts.size() > 0) {
184186 submd.set(Message.MULTIPART_SUBTYPE, parts.peek().getSubType());
185187 submd.set(Message.MULTIPART_BOUNDARY, parts.peek().getBoundary());
186 }
188 }
187189 if (body instanceof MaximalBodyDescriptor) {
188190 MaximalBodyDescriptor maximalBody = (MaximalBodyDescriptor) body;
189191 String contentDispositionType = maximalBody.getContentDispositionType();
190192 if (contentDispositionType != null && !contentDispositionType.isEmpty()) {
191 StringBuilder contentDisposition = new StringBuilder( contentDispositionType );
193 StringBuilder contentDisposition = new StringBuilder(contentDispositionType);
192194 Map<String, String> contentDispositionParameters = maximalBody.getContentDispositionParameters();
193 for ( Entry<String, String> param : contentDispositionParameters.entrySet() ) {
195 for (Entry<String, String> param : contentDispositionParameters.entrySet()) {
194196 contentDisposition.append("; ")
195 .append(param.getKey()).append("=\"").append(param.getValue()).append('"');
197 .append(param.getKey()).append("=\"").append(param.getValue()).append('"');
196198 }
197199
198200 String contentDispositionFileName = maximalBody.getContentDispositionFilename();
200202 submd.set( Metadata.RESOURCE_NAME_KEY, contentDispositionFileName );
201203 }
202204
203 submd.set( Metadata.CONTENT_DISPOSITION, contentDisposition.toString() );
205 submd.set(Metadata.CONTENT_DISPOSITION, contentDisposition.toString());
204206 }
205207 }
206208 //if we're in a multipart/alternative or any one of its children
207209 //add the bodypart to the latest that was added
208 if (! extractAllAlternatives && alternativePartBuffer.size() > 0) {
210 if (!extractAllAlternatives && alternativePartBuffer.size() > 0) {
209211 ByteArrayOutputStream bos = new ByteArrayOutputStream();
210212 IOUtils.copy(is, bos);
211213 alternativePartBuffer.peek().children.add(new BodyContents(submd, bos.toByteArray()));
214 } else if (!extractAllAlternatives && parts.size() < 2) {
215 //if you're at the first level of embedding
216 //and you're not in an alternative part block
217 //and you're text/html, put that in the body of the email
218 //otherwise treat as a regular attachment
219 ByteArrayOutputStream bos = new ByteArrayOutputStream();
220 IOUtils.copy(is, bos);
221 byte[] bytes = bos.toByteArray();
222 if (detectTextOrHtml(submd, bytes)) {
223 handleInlineBodyPart(new BodyContents(submd, bos.toByteArray()));
224 } else {
225 //else handle as you would any other embedded content
226 try (TikaInputStream tis = TikaInputStream.get(bytes)) {
227 handleEmbedded(tis, submd);
228 }
229 }
212230 } else {
213231 //else handle as you would any other embedded content
214232 try (TikaInputStream tis = TikaInputStream.get(is)) {
215233 handleEmbedded(tis, submd);
216234 }
217235 }
236 }
237
238 private boolean detectTextOrHtml(Metadata submd, byte[] bytes) {
239 String mediaTypeString = submd.get(Metadata.CONTENT_TYPE);
240 if (mediaTypeString != null) {
241 if (mediaTypeString.startsWith("text")) {
242 return true;
243 } else {
244 return false;
245 }
246 }
247 try (TikaInputStream tis = TikaInputStream.get(bytes)) {
248 MediaType mediaType = detector.detect(tis, submd);
249 if (mediaType != null) {
250 //detect only once
251 submd.set(TikaCoreProperties.CONTENT_TYPE_OVERRIDE, mediaType.toString());
252 if (mediaType.toString().startsWith("text")) {
253 return true;
254 }
255 }
256 } catch (IOException e) {
257
258 }
259 return false;
218260 }
219261
220262 private void handleEmbedded(TikaInputStream tis, Metadata metadata) throws MimeException, IOException {
515557 }
516558
517559 if (part instanceof BodyContents) {
518 handlePart((BodyContents)part);
560 handleInlineBodyPart((BodyContents)part);
519561 return;
520562 }
521563
538580 }
539581 }
540582
541 private void handlePart(BodyContents part) throws MimeException, IOException {
583 private void handleInlineBodyPart(BodyContents part) throws MimeException, IOException {
542584 String contentType = part.metadata.get(Metadata.CONTENT_TYPE);
543585 Parser parser = null;
544586 if (MediaType.TEXT_HTML.toString().equalsIgnoreCase(contentType)) {
554596
555597
556598 if (parser == null) {
599 //back off and treat it as an embedded chunk
557600 try (TikaInputStream tis = TikaInputStream.get(part.bytes)) {
558601 handleEmbedded(tis, part.metadata);
559602 }
2525 import org.apache.james.mime4j.parser.MimeStreamParser;
2626 import org.apache.james.mime4j.stream.MimeConfig;
2727 import org.apache.tika.config.Field;
28 import org.apache.tika.detect.Detector;
2829 import org.apache.tika.exception.TikaException;
30 import org.apache.tika.extractor.EmbeddedDocumentUtil;
2931 import org.apache.tika.io.TikaInputStream;
3032 import org.apache.tika.metadata.Metadata;
3133 import org.apache.tika.mime.MediaType;
5355 private static final Set<MediaType> SUPPORTED_TYPES = Collections
5456 .singleton(MediaType.parse("message/rfc822"));
5557
58 //rely on the detector to be thread-safe
59 //built lazily and then reused
60 private Detector detector;
61
5662 @Field
5763 private boolean extractAllAlternatives = false;
5864
7076 .build();
7177
7278 config = context.get(MimeConfig.class, config);
73
79 Detector localDetector = context.get(Detector.class);
80 if (localDetector == null) {
81 //lazily load this if necessary
82 if (detector == null) {
83 EmbeddedDocumentUtil embeddedDocumentUtil = new EmbeddedDocumentUtil(context);
84 detector = embeddedDocumentUtil.getDetector();
85 }
86 localDetector = detector;
87 }
7488 MimeStreamParser parser = new MimeStreamParser(config, null, new DefaultBodyDescriptorBuilder());
7589 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
7690
7791 MailContentHandler mch = new MailContentHandler(
78 xhtml, metadata, context, config.isStrictParsing(),
92 xhtml, localDetector, metadata, context, config.isStrictParsing(),
7993 extractAllAlternatives);
8094 parser.setContentHandler(mch);
8195 parser.setContentDecoding(true);
283283
284284 // Set up listener and register the records we want to process
285285 HSSFRequest hssfRequest = new HSSFRequest();
286 listenForAllRecords = true;
287286 if (listenForAllRecords) {
288287 hssfRequest.addListenerForAllRecords(formatListener);
289288 } else {
541540 CellValueRecordInterface value =
542541 (CellValueRecordInterface) record;
543542 Point point = new Point(value.getColumn(), value.getRow());
544 currentSheet.put(point, cell);
543 if (currentSheet.containsKey(point)) {
544 //avoid overwriting content
545 //for now, add to extraTextCells
546 //TODO: consider allowing multiple text pieces
547 //per x,y to keep the text together
548 extraTextCells.add(cell);
549 } else {
550 currentSheet.put(point, cell);
551 }
552
545553 } else {
546554 // Cell outside the worksheets
547555 extraTextCells.add(cell);
650658 }
651659
652660 @Override
661 public void processRecord(Record record) {
662 // System.out.println(record.getClass() + " : "+record.toString());
663 super.processRecord(record);
664 }
665
666 @Override
653667 public String formatNumberDateCell(CellValueRecordInterface cell) {
654668 String formatString = this.getFormatString(cell);
655669 if (formatString != null && ! formatString.equals("General")) {
1717
1818 import java.io.IOException;
1919 import java.io.InputStream;
20 import java.util.ArrayList;
2021 import java.util.HashSet;
2122 import java.util.List;
2223
2930 import org.apache.poi.hslf.record.RecordTypes;
3031 import org.apache.poi.hslf.record.VBAInfoAtom;
3132 import org.apache.poi.hslf.record.VBAInfoContainer;
33 import org.apache.poi.hslf.usermodel.HSLFGroupShape;
3234 import org.apache.poi.hslf.usermodel.HSLFMasterSheet;
3335 import org.apache.poi.hslf.usermodel.HSLFNotes;
3436 import org.apache.poi.hslf.usermodel.HSLFObjectData;
3840 import org.apache.poi.hslf.usermodel.HSLFSlideShow;
3941 import org.apache.poi.hslf.usermodel.HSLFTable;
4042 import org.apache.poi.hslf.usermodel.HSLFTableCell;
43 import org.apache.poi.hslf.usermodel.HSLFTextBox;
4144 import org.apache.poi.hslf.usermodel.HSLFTextParagraph;
4245 import org.apache.poi.hslf.usermodel.HSLFTextRun;
4346 import org.apache.poi.hslf.usermodel.HSLFTextShape;
4750 import org.apache.tika.exception.EncryptedDocumentException;
4851 import org.apache.tika.extractor.EmbeddedDocumentUtil;
4952 import org.apache.tika.io.CloseShieldInputStream;
53 import org.apache.tika.io.IOExceptionWithCause;
5054 import org.apache.tika.io.TikaInputStream;
5155 import org.apache.tika.metadata.Metadata;
5256 import org.apache.tika.mime.MediaType;
116120 }
117121 }
118122
123 extractGroupText(xhtml, slide.getShapes(), 0);
124
119125 // Slide footer, if present
120126 if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) {
121127 xhtml.startElement("p", "class", "slide-footer");
215221 extractMacros(ss, xhtml);
216222 }
217223 xhtml.endElement("div");
224 }
225
226 //Extract any text that's within an HSLFTextShape that's a descendant of
227 //an HSLFGroupShape.
228 private void extractGroupText(XHTMLContentHandler xhtml, List<HSLFShape> shapes, int depth) throws SAXException {
229
230 if (shapes == null) {
231 return;
232 }
233
234 //Only process items with depth > 0 because they should have been included
235 //already in slide.getTextParagraphs above.
236
237 //However, cells are considered grouped within the table, so ignore them.
238 //I don't believe that cells can be inside a text box or other
239 //grouped text containing object, so always ignore them.
240 List<List<HSLFTextParagraph>> paragraphList = new ArrayList<>();
241 for (HSLFShape shape : shapes) {
242 if (shape instanceof HSLFGroupShape) {
243 //work recursively, HSLFGroupShape can contain HSLFGroupShape
244 extractGroupText(xhtml, ((HSLFGroupShape)shape).getShapes(), depth+1);
245 } else if (shape instanceof HSLFTextShape
246 && ! (shape instanceof HSLFTableCell) && depth > 0) {
247 paragraphList.add(((HSLFTextShape)shape).getTextParagraphs());
248 }
249 }
250 textRunsToText(xhtml, paragraphList);
218251 }
219252
220253 private void extractMacros(HSLFSlideShow ppt, XHTMLContentHandler xhtml) {
453486 MediaType mt = getTikaConfig().getDetector().detect(stream, new Metadata());
454487 mediaType = mt.toString();
455488 }
456 if (mediaType.equals("application/x-tika-msoffice-embedded; format=comp_obj")) {
457 try(NPOIFSFileSystem npoifs = new NPOIFSFileSystem(new CloseShieldInputStream(stream))) {
489 if (mediaType.equals("application/x-tika-msoffice-embedded; format=comp_obj")
490 || mediaType.equals("application/x-tika-msoffice")) {
491 NPOIFSFileSystem npoifs = null;
492
493 try {
494 npoifs = new NPOIFSFileSystem(new CloseShieldInputStream(stream));
495 } catch (RuntimeException e) {
496 throw new IOExceptionWithCause(e);
497 }
498 try {
458499 handleEmbeddedOfficeDoc(npoifs.getRoot(), objID, xhtml);
500 } finally {
501 if (npoifs != null) {
502 npoifs.close();
503 }
459504 }
460505 } else {
461506 handleEmbeddedResource(
7373 import org.apache.tika.sax.BodyContentHandler;
7474 import org.apache.tika.sax.EmbeddedContentHandler;
7575 import org.apache.tika.sax.XHTMLContentHandler;
76 import org.bouncycastle.cms.Recipient;
7677 import org.xml.sax.SAXException;
7778
7879 /**
320321 }
321322 if (rtfChunk != null && (extractAllAlternatives || !doneBody)) {
322323 ByteChunk chunk = (ByteChunk) rtfChunk;
323 MAPIRtfAttribute rtf = new MAPIRtfAttribute(
324 MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue()
325 );
326 Parser rtfParser =
327 EmbeddedDocumentUtil.tryToFindExistingLeafParser(RTFParser.class, parseContext);
328 if (rtfParser == null) {
329 rtfParser = new RTFParser();
330 }
331 rtfParser.parse(
332 new ByteArrayInputStream(rtf.getData()),
333 new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
334 new Metadata(), parseContext);
335 doneBody = true;
324 //avoid buffer underflow TIKA-2530
325 //TODO -- would be good to find an example triggering file and
326 //figure out if this is a bug in POI or a genuine 0 length chunk
327 if (chunk.getValue() != null && chunk.getValue().length > 0) {
328 MAPIRtfAttribute rtf = new MAPIRtfAttribute(
329 MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue()
330 );
331 Parser rtfParser =
332 EmbeddedDocumentUtil.tryToFindExistingLeafParser(RTFParser.class, parseContext);
333 if (rtfParser == null) {
334 rtfParser = new RTFParser();
335 }
336 rtfParser.parse(
337 new ByteArrayInputStream(rtf.getData()),
338 new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
339 new Metadata(), parseContext);
340 doneBody = true;
341 }
336342 }
337343 if (textChunk != null && (extractAllAlternatives || !doneBody)) {
338344 xhtml.element("p", ((StringChunk) textChunk).getValue());
2424 import java.net.URI;
2525 import java.util.HashMap;
2626 import java.util.HashSet;
27 import java.util.Iterator;
2728 import java.util.List;
2829 import java.util.Map;
2930 import java.util.Set;
3940 import org.apache.poi.openxml4j.opc.TargetMode;
4041 import org.apache.poi.openxml4j.opc.internal.FileHelper;
4142 import org.apache.poi.poifs.filesystem.DirectoryNode;
43 import org.apache.poi.poifs.filesystem.DocumentEntry;
44 import org.apache.poi.poifs.filesystem.Entry;
4245 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
4346 import org.apache.poi.poifs.filesystem.Ole10Native;
4447 import org.apache.poi.poifs.filesystem.Ole10NativeException;
298301 DirectoryNode root = fs.getRoot();
299302 POIFSDocumentType type = POIFSDocumentType.detectType(root);
300303
301 if (root.hasEntry("CONTENTS")
302 && root.hasEntry("\u0001Ole")
303 && root.hasEntry("\u0001CompObj")) {
304 if (root.hasEntry("\u0001Ole")
305 && root.hasEntry("\u0001CompObj")
306 && (
307 root.hasEntry("CONTENTS") || root.hasEntry("Package")
308 )) {
304309 // TIKA-704: OLE 2.0 embedded non-Office document?
305310 //TODO: figure out if the equivalent of OLE 1.0's
306311 //getCommand() and getFileName() exist for OLE 2.0 to populate
307312 //TikaCoreProperties.ORIGINAL_RESOURCE_NAME
308 stream = TikaInputStream.get(
309 fs.createDocumentInputStream("CONTENTS"));
313 if (root.hasEntry("CONTENTS")) {
314 stream = TikaInputStream.get(
315 fs.createDocumentInputStream("CONTENTS"));
316 } else if (root.hasEntry("Package")) {
317 //TIKA-2588
318 stream = TikaInputStream.get(
319 fs.createDocumentInputStream("Package"));
320 } else {
321 throw new IllegalStateException("Shouldn't ever arrive here; please open a ticket on our jira");
322 }
310323 if (embeddedExtractor.shouldParseEmbedded(metadata)) {
311324 embeddedExtractor.parseEmbedded(
312325 stream, new EmbeddedContentHandler(handler),
3535 import org.apache.tika.metadata.Property;
3636 import org.apache.tika.metadata.TikaCoreProperties;
3737 import org.apache.tika.parser.microsoft.SummaryExtractor;
38 import org.apache.tika.parser.microsoft.ooxml.xps.XPSTextExtractor;
3839 import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor;
3940 import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
4041 import org.apache.xmlbeans.impl.values.XmlValueOutOfRangeException;
6061 if (extractor.getDocument() != null ||
6162 ((extractor instanceof XSSFEventBasedExcelExtractor ||
6263 extractor instanceof XWPFEventBasedWordExtractor ||
63 extractor instanceof XSLFEventBasedPowerPointExtractor) &&
64 extractor instanceof XSLFEventBasedPowerPointExtractor ||
65 extractor instanceof XPSTextExtractor) &&
6466 extractor.getPackage() != null)) {
6567 extractMetadata(extractor.getCoreProperties(), metadata);
6668 extractMetadata(extractor.getExtendedProperties(), metadata);
4444 import org.apache.tika.parser.EmptyParser;
4545 import org.apache.tika.parser.ParseContext;
4646 import org.apache.tika.parser.microsoft.OfficeParserConfig;
47 import org.apache.tika.parser.microsoft.ooxml.xps.XPSExtractorDecorator;
48 import org.apache.tika.parser.microsoft.ooxml.xps.XPSTextExtractor;
4749 import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor;
4850 import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
4951 import org.apache.tika.parser.pkg.ZipContainerDetector;
6567 ExtractorFactory.setThreadPrefersEventExtractors(true);
6668
6769 try {
68 OOXMLExtractor extractor;
70 OOXMLExtractor extractor = null;
6971 OPCPackage pkg;
7072
7173 // Locate or Open the OPCPackage for the file
8284
8385 // Get the type, and ensure it's one we handle
8486 MediaType type = ZipContainerDetector.detectOfficeOpenXML(pkg);
87 if (type == null) {
88 type = ZipContainerDetector.detectXPSOPC(pkg);
89 }
90
8591 if (type == null || OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(type)) {
8692 // Not a supported type, delegate to Empty Parser
8793 EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context);
8894 return;
8995 }
9096 metadata.set(Metadata.CONTENT_TYPE, type.toString());
91
9297 // Have the appropriate OOXML text extractor picked
9398 POIXMLTextExtractor poiExtractor = null;
9499 // This has already been set by OOXMLParser's call to configure()
100105 if (poiExtractor == null && config.getUseSAXPptxExtractor()) {
101106 poiExtractor = trySXSLF(pkg);
102107 }
108 if (type.equals(OOXMLParser.XPS)) {
109 poiExtractor = new XPSTextExtractor(pkg);
110 }
111
103112 if (poiExtractor == null) {
104113 poiExtractor = ExtractorFactory.createExtractor(pkg);
105114 }
118127 extractor = new SXSLFPowerPointExtractorDecorator(metadata, context,
119128 (XSLFEventBasedPowerPointExtractor) poiExtractor);
120129 metadata.add("X-Parsed-By", XSLFEventBasedPowerPointExtractor.class.getCanonicalName());
130 } else if (poiExtractor instanceof XPSTextExtractor) {
131 extractor = new XPSExtractorDecorator(context, poiExtractor);
121132 } else if (document == null) {
122133 throw new TikaException(
123134 "Expecting UserModel based POI OOXML extractor with a document, but none found. " +
3939 //turn off POI's zip bomb detection because we have our own
4040 ZipSecureFile.setMinInflateRatio(-1.0d);
4141 }
42
43 protected static final MediaType XPS = MediaType.application("vnd.ms-xpsdocument");
4244
4345 protected static final Set<MediaType> SUPPORTED_TYPES =
4446 Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
8385 * by Tika and/or POI.
8486 */
8587 protected static final Set<MediaType> UNSUPPORTED_OOXML_TYPES =
86 Collections.singleton(
88 Collections.EMPTY_SET;
89 //TODO: should we do a singleton for dwfx+xps?
90 /*Collections.singleton(
8791 MediaType.application("vnd.ms-xpsdocument")
88 );
92 );*/
8993 /**
9094 * Serial version UID
9195 */
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.parser.microsoft.ooxml.xps;
18
19 import org.apache.commons.io.IOUtils;
20 import org.apache.commons.io.input.CloseShieldInputStream;
21 import org.apache.poi.POIXMLDocument;
22 import org.apache.poi.POIXMLTextExtractor;
23 import org.apache.poi.openxml4j.opc.PackagePart;
24 import org.apache.poi.openxml4j.opc.PackageRelationship;
25 import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
26 import org.apache.poi.openxml4j.opc.ZipPackage;
27 import org.apache.poi.openxml4j.util.ZipEntrySource;
28 import org.apache.tika.exception.TikaException;
29 import org.apache.tika.extractor.EmbeddedDocumentUtil;
30 import org.apache.tika.metadata.Metadata;
31 import org.apache.tika.metadata.TikaCoreProperties;
32 import org.apache.tika.parser.ParseContext;
33 import org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor;
34 import org.apache.tika.sax.EmbeddedContentHandler;
35 import org.apache.tika.sax.OfflineContentHandler;
36 import org.apache.tika.sax.XHTMLContentHandler;
37 import org.apache.tika.utils.ExceptionUtils;
38 import org.xml.sax.Attributes;
39 import org.xml.sax.SAXException;
40 import org.xml.sax.helpers.DefaultHandler;
41
42 import java.io.IOException;
43 import java.io.InputStream;
44 import java.util.Collections;
45 import java.util.Enumeration;
46 import java.util.HashMap;
47 import java.util.List;
48 import java.util.Map;
49 import java.util.zip.ZipEntry;
50
51 public class XPSExtractorDecorator extends AbstractOOXMLExtractor {
52
53 private static String XPS_DOCUMENT = "http://schemas.microsoft.com/xps/2005/06/fixedrepresentation";
54
55 private final ParseContext context;
56 private final ZipPackage pkg;
57 Map<String, Metadata> embeddedImages = new HashMap<>();
58
59 public XPSExtractorDecorator(ParseContext context, POIXMLTextExtractor extractor) throws TikaException {
60 super(context, extractor);
61 this.context = context;
62 if (extractor.getPackage() instanceof ZipPackage) {
63 this.pkg = (ZipPackage) extractor.getPackage();
64 } else {
65 throw new TikaException("OPCPackage must be a ZipPackage");
66 }
67 }
68
69 @Override
70 public POIXMLDocument getDocument() {
71 return null;
72 }
73
74
75 @Override
76 protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException {
77
78 PackageRelationshipCollection prc = pkg.getRelationshipsByType(XPS_DOCUMENT);
79 for (int i = 0; i < prc.size(); i++) {
80 PackageRelationship pr = prc.getRelationship(i);
81
82 //there should only be one.
83 //in the test file, this points to FixedDocSeq.fdseq
84 try {
85 handleDocuments(pr, xhtml);
86 } catch (TikaException e) {
87 throw new SAXException(e);
88 }
89 }
90
91 //now handle embedded images
92 if (embeddedImages.size() > 0) {
93 EmbeddedDocumentUtil embeddedDocumentUtil = new EmbeddedDocumentUtil(context);
94 for (Map.Entry<String, Metadata> embeddedImage : embeddedImages.entrySet()) {
95 String zipPath = embeddedImage.getKey();
96 Metadata metadata = embeddedImage.getValue();
97 if (embeddedDocumentUtil.shouldParseEmbedded(metadata)) {
98 handleEmbeddedImage(
99 zipPath,
100 metadata,
101 embeddedDocumentUtil,
102 xhtml);
103 }
104 }
105 }
106
107 }
108
109 private void handleEmbeddedImage(String zipPath, Metadata metadata,
110 EmbeddedDocumentUtil embeddedDocumentUtil,
111 XHTMLContentHandler xhtml) throws SAXException, IOException {
112 InputStream stream = null;
113 try {
114 stream = getZipStream(zipPath, pkg);
115 } catch (IOException|TikaException e) {
116 //store this exception in the parent's metadata
117 EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
118 return;
119 }
120
121 try {
122 embeddedDocumentUtil.parseEmbedded(stream, xhtml, metadata, true);
123 } finally {
124 IOUtils.closeQuietly(stream);
125 }
126 }
127
128 private void handleDocuments(PackageRelationship packageRelationship,
129 XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
130
131 try (InputStream stream = pkg.getPart(packageRelationship).getInputStream()) {
132 context.getSAXParser().parse(
133 new CloseShieldInputStream(stream),
134 new OfflineContentHandler(new EmbeddedContentHandler(
135 new FixedDocSeqHandler(xhtml))));
136 }
137 }
138
139 @Override
140 protected List<PackagePart> getMainDocumentParts() throws TikaException {
141 return Collections.EMPTY_LIST;
142 }
143
144 private class FixedDocSeqHandler extends DefaultHandler {
145 private final static String DOCUMENT_REFERENCE = "DocumentReference";
146 private final static String SOURCE = "Source";
147
148 private final XHTMLContentHandler xhtml;
149
150 private FixedDocSeqHandler(XHTMLContentHandler xhtml) {
151 this.xhtml = xhtml;
152 }
153
154 @Override
155 public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
156 if (!DOCUMENT_REFERENCE.equals(localName)) {
157 return;
158 }
159 for (int i = 0; i < atts.getLength(); i++) {
160 String lName = atts.getLocalName(i);
161 if (SOURCE.equals(lName)) {
162 handleDocumentRef(atts.getValue(i));
163 }
164 }
165 }
166
167 private void handleDocumentRef(String docRef) throws SAXException {
168 //docRef is a path to a FixedDocumentSequence document,
169 // e.g. /Documents/1/FixedDoc.fdoc
170
171 //relative root is /Documents/1 ..need this Pages...
172 String relativeRoot = null;
173 int i = docRef.lastIndexOf("/");
174 if (i > 0) {
175 relativeRoot = docRef.substring(0, i);
176 } else {
177 relativeRoot = "";
178 }
179 String zipPath = (docRef.startsWith("/") ? docRef.substring(1) : docRef);
180 if (pkg instanceof ZipPackage) {
181 try (InputStream stream = getZipStream(zipPath, pkg)) {
182 context.getSAXParser().parse(
183 new CloseShieldInputStream(stream),
184 new OfflineContentHandler(new EmbeddedContentHandler(
185 new PageContentPartHandler(relativeRoot, xhtml))));
186
187 } catch (IOException | TikaException e) {
188 throw new SAXException(new TikaException("IOException trying to read: " + docRef));
189 }
190 } else {
191 throw new SAXException(new TikaException("Package must be ZipPackage"));
192 }
193 }
194
195 private class PageContentPartHandler extends DefaultHandler {
196 private static final String PAGE_CONTENT = "PageContent";
197 private static final String SOURCE = "Source";
198
199 private final String relativeRoot;
200 private final XHTMLContentHandler xhtml;
201
202 private PageContentPartHandler(String relativeRoot, XHTMLContentHandler xhtml) {
203 this.relativeRoot = relativeRoot;
204 this.xhtml = xhtml;
205 }
206
207 @Override
208 public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
209 if (!PAGE_CONTENT.equals(localName)) {
210 return;
211 }
212 String pagePath = null;
213 for (int i = 0; i < atts.getLength(); i++) {
214 if (SOURCE.equals(atts.getLocalName(i))) {
215 pagePath = atts.getValue(i);
216 break;
217 }
218 }
219
220 if (pagePath != null) {
221 if (!pagePath.startsWith("/")) {
222 pagePath = relativeRoot + "/" + pagePath;
223 }
224 //trim initial /
225 if (pagePath.startsWith("/")) {
226 pagePath = pagePath.substring(1);
227 }
228 try (InputStream stream = getZipStream(pagePath, pkg)) {
229 context.getSAXParser().parse(
230 new CloseShieldInputStream(stream),
231 new OfflineContentHandler(
232 new XPSPageContentHandler(xhtml, embeddedImages)
233 )
234 );
235 } catch (TikaException | IOException e) {
236 throw new SAXException(e);
237 }
238 }
239
240 }
241 }
242 }
243
244 private static InputStream getZipStream(String zipPath, ZipPackage zipPackage) throws IOException, TikaException {
245 String targPath = (zipPath.length() > 1 && zipPath.startsWith("/") ? zipPath.substring(1) : zipPath);
246 ZipEntrySource zipEntrySource = zipPackage.getZipArchive();
247 Enumeration<? extends ZipEntry> zipEntryEnumeration = zipEntrySource.getEntries();
248 ZipEntry zipEntry = null;
249 while (zipEntryEnumeration.hasMoreElements()) {
250 ZipEntry ze = zipEntryEnumeration.nextElement();
251 if (ze.getName().equals(targPath)) {
252 zipEntry = ze;
253 break;
254 }
255 }
256 if (zipEntry == null) {
257 throw new TikaException("Couldn't find required zip entry: " + zipPath);
258 }
259 return zipEntrySource.getInputStream(zipEntry);
260 }
261 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft.ooxml.xps;
17
18 import org.apache.tika.metadata.Metadata;
19 import org.apache.tika.metadata.TikaCoreProperties;
20 import org.apache.tika.sax.XHTMLContentHandler;
21 import org.xml.sax.Attributes;
22 import org.xml.sax.SAXException;
23 import org.xml.sax.helpers.DefaultHandler;
24
25 import java.util.ArrayList;
26 import java.util.Collections;
27 import java.util.Comparator;
28 import java.util.LinkedHashMap;
29 import java.util.LinkedHashSet;
30 import java.util.List;
31 import java.util.Map;
32 import java.util.Set;
33 import java.util.Stack;
34
35
36 /**
37 * Handles an individual page. For now, this marks up
38 * canvas entities in a &lt;div&gt; tag. Based on the spec,
39 * it currently relies on order within the xml for order of output
40 * of text to xhtml. We could do more complex processing of coordinates
41 * for bidi-languages, but the spec implies that we should be able
42 * to rely on storage order.
43 * <p/>
44 * As with our PDFParser, this currently dumps urls at the bottom of the page
45 * and does not attempt to calculate the correct anchor text.
46 * <p/>
47 * TODO: integrate table markup
48 */
49 class XPSPageContentHandler extends DefaultHandler {
50
51 private static final String GLYPHS = "Glyphs";
52 private static final String CANVAS = "Canvas";
53 private static final String CLIP = "Clip";
54 private static final String NULL_CLIP = "NULL_CLIP";
55 private static final String UNICODE_STRING = "UnicodeString";
56 private static final String ORIGIN_X = "OriginX";
57 private static final String ORIGIN_Y = "OriginY";
58 private static final String BIDI_LEVEL = "BidiLevel";
59 private static final String INDICES = "Indices";
60 private static final String NAME = "Name";
61 private static final String PATH = "Path";
62 private static final String NAVIGATE_URI = "FixedPage.NavigateUri";
63 private static final String IMAGE_SOURCE = "ImageSource";
64 private static final String IMAGE_BRUSH = "ImageBrush";
65 private static final String AUTOMATION_PROPERITES_HELP_TEXT = "AutomationProperties.HelpText";
66
67 private static final String URL_DIV = "urls";
68 private static final String DIV = "div";
69 private static final String CLASS = "class";
70 private static final String PAGE = "page";
71 private static final String CANVAS_SAX = "canvas";
72 private static final String P = "p";
73 private static final String HREF = "href";
74 private static final String A = "a";
75
76
77 private final XHTMLContentHandler xhml;
78
79 //path in zip file for an image rendered on this page
80 private String imageSourcePathInZip = null;
81 //embedded images sometimes include full path info of original image
82 private String originalLocationOnDrive = null;
83
84 //buffer for the glyph runs within a given canvas
85 //in insertion order
86 private Map<String, List<GlyphRun>> canvases = new LinkedHashMap<>();
87
88 private Set<String> urls = new LinkedHashSet();
89 private Stack<String> canvasStack = new Stack<>();
90 private final Map<String, Metadata> embeddedInfos;
91 //sort based on y coordinate of first element in each row
92 //this requires every row to have at least one element
93 private static Comparator<? super List<GlyphRun>> ROW_SORTER = new Comparator<List<GlyphRun>>() {
94 @Override
95 public int compare(List<GlyphRun> o1, List<GlyphRun> o2) {
96 if (o1.get(0).originY < o2.get(0).originY) {
97 return -1;
98 } else if (o1.get(0).originY > o2.get(0).originY) {
99 return 1;
100 }
101 return 0;
102 }
103 };
104
105 public XPSPageContentHandler(XHTMLContentHandler xhtml, Map<String, Metadata> embeddedInfos) {
106 this.xhml = xhtml;
107 this.embeddedInfos = embeddedInfos;
108 }
109
110 @Override
111 public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
112 if (CANVAS.equals(localName)) {
113 String clip = getVal(CLIP, atts);
114 if (clip == null) {
115 canvasStack.push(NULL_CLIP);
116 } else {
117 canvasStack.push(clip);
118 }
119 return;
120 } else if (PATH.equals(localName)) {
121 //for now just grab them and dump them at the end of the page.
122 String url = getVal(NAVIGATE_URI, atts);
123 if (url != null) {
124 urls.add(url);
125 }
126 originalLocationOnDrive = getVal(AUTOMATION_PROPERITES_HELP_TEXT, atts);
127 } else if (IMAGE_BRUSH.equals(localName)) {
128 imageSourcePathInZip = getVal(IMAGE_SOURCE, atts);
129 }
130
131 if (!GLYPHS.equals(localName)) {
132 return;
133 }
134 String name = null;
135 Float originX = null;
136 Float originY = null;
137 String unicodeString = null;
138 Integer bidilevel = 1;
139 String indicesString = null;
140
141 for (int i = 0; i < atts.getLength(); i++) {
142 String lName = atts.getLocalName(i);
143 String value = atts.getValue(i);
144 value = (value == null) ? "" : value.trim();
145
146 if (ORIGIN_X.equals(lName) && value.length() > 0) {
147 try {
148 originX = Float.parseFloat(atts.getValue(i));
149 } catch (NumberFormatException e) {
150 throw new SAXException(e);
151 }
152 } else if (ORIGIN_Y.equals(lName) && value.length() > 0) {
153 try {
154 originY = Float.parseFloat(atts.getValue(i));
155 } catch (NumberFormatException e) {
156 throw new SAXException(e);
157 }
158 } else if (UNICODE_STRING.equals(lName)) {
159 unicodeString = atts.getValue(i);
160 } else if (BIDI_LEVEL.equals(lName) && value.length() > 0) {
161 try {
162 bidilevel = Integer.parseInt(atts.getValue(i));
163 } catch (NumberFormatException e) {
164 throw new SAXException(e);
165 }
166 } else if (INDICES.equals(lName)) {
167 indicesString = atts.getValue(i);
168 } else if (NAME.equals(lName)) {
169 name = value;
170 }
171 }
172 if (unicodeString != null) {
173 originX = (originX == null) ? Integer.MIN_VALUE : originX;
174 originY = (originY == null) ? Integer.MAX_VALUE : originY;
175 String currentCanvasClip = (canvasStack.size() > 0) ? canvasStack.peek() : NULL_CLIP;
176 List<GlyphRun> runs = canvases.get(currentCanvasClip);
177 if (runs == null) {
178 runs = new ArrayList<>();
179 }
180 runs.add(new GlyphRun(name, originY, originX, unicodeString, bidilevel, indicesString));
181 canvases.put(currentCanvasClip, runs);
182 }
183
184 }
185
186 @Override
187 public void endElement(String uri, String localName, String qName) throws SAXException {
188 if (CANVAS.equals(localName)) {
189 if (! canvasStack.isEmpty()) {
190 canvasStack.pop();
191 }
192 } else if (PATH.equals(localName)) {
193 //this assumes that there cannot be a path within a path
194 //not sure if this is true or if we need to track path depth
195 if (imageSourcePathInZip != null) {
196 Metadata m = embeddedInfos.get(imageSourcePathInZip);
197 if (m == null) {
198 m = new Metadata();
199 }
200 if (originalLocationOnDrive != null) {
201 String val = m.get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME);
202 if (val == null) {
203 m.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, originalLocationOnDrive);
204 }
205 }
206 m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
207 TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
208 embeddedInfos.put(imageSourcePathInZip, m);
209 }
210 //reset
211 imageSourcePathInZip = null;
212 originalLocationOnDrive = null;
213 }
214 }
    //opens the <div class="page"> wrapper for this page's content
    @Override
    public void startDocument() throws SAXException {
        xhml.startElement(DIV, CLASS, PAGE);
    }
219
    //flushes the buffered canvases and urls, then closes the page div
    @Override
    public void endDocument() throws SAXException {
        writePage();
        xhml.endElement(DIV);
    }
225
226
227 private final void writePage() throws SAXException {
228 if (canvases.size() == 0) {
229 return;
230 }
231
232 for (Map.Entry<String, List<GlyphRun>> e : canvases.entrySet()) {
233 String clip = e.getKey();
234 List<GlyphRun> runs = e.getValue();
235 if (runs.size() == 0) {
236 continue;
237 }
238 xhml.startElement(DIV, CLASS, CANVAS_SAX);
239 //a list of rows sorted by the y of the first element in each row
240 List<List<GlyphRun>> rows = buildRows(runs);
241 for (List<GlyphRun> row : rows) {
242 writeRow(row);
243 }
244 xhml.endElement(DIV);
245 }
246 //for now just dump the urls at the end of the page
247 //At some point, we could link them back up to their
248 //true anchor text.
249 if (urls.size() > 0) {
250 xhml.startElement(DIV, CLASS, URL_DIV);
251 for (String u : urls) {
252 xhml.startElement(A, HREF, u);
253 xhml.characters(u);
254 xhml.endElement(A);
255 }
256 xhml.endElement(DIV);
257 }
258 canvases.clear();
259 }
260
261 private void writeRow(List<GlyphRun> row) throws SAXException {
262 /*
263 int rtl = 0;
264 int ltr = 0;
265 //if the row is entirely rtl, sort all as rtl
266 //otherwise sort ltr
267 for (GlyphRun r : row) {
268 //ignore directionality of pure spaces
269 if (r.unicodeString == null || r.unicodeString.trim().length() == 0) {
270 continue;
271 }
272 if (r.direction == GlyphRun.DIRECTION.RTL) {
273 rtl++;
274 } else {
275 ltr++;
276 }
277 }
278 if (rtl > 0 && ltr == 0) {
279 Collections.sort(row, GlyphRun.RTL_COMPARATOR);
280 } else {
281 Collections.sort(row, GlyphRun.LTR_COMPARATOR);
282 }*/
283
284 xhml.startElement(P);
285 for (GlyphRun run : row) {
286 //figure out if you need to add a space
287 xhml.characters(run.unicodeString);
288 }
289 xhml.endElement(P);
290 }
291
292 //returns a List of rows (where a row is a list of glyphruns)
293 //the List is sorted in increasing order of the first y of each row
294 private List<List<GlyphRun>> buildRows(List<GlyphRun> glyphRuns) {
295 List<List<GlyphRun>> rows = new ArrayList<>();
296 float maxY = -1.0f;
297 for (GlyphRun glyphRun : glyphRuns) {
298 if (rows.size() == 0) {
299 List<GlyphRun> row = new ArrayList<>();
300 row.add(glyphRun);
301 rows.add(row);
302 continue;
303 } else {
304 boolean addedNewRow = false;
305 //can rely on the last row having the highest y
306 List<GlyphRun> row = rows.get(rows.size()-1);
307 //0.5 is a purely heuristic/magical number that should be derived
308 //from the data, not made up. TODO: fix this
309 if (Math.abs(glyphRun.originY -row.get(0).originY) < 0.5) {
310 row.add(glyphRun);
311 } else {
312 row = new ArrayList<>();
313 row.add(glyphRun);
314 rows.add(row);
315 addedNewRow = true;
316 }
317 //sort rows so that they are in ascending order of y
318 //in most xps files in our test corpus, this is never triggered
319 //because the runs are already ordered correctly
320 if (maxY > -1.0f && addedNewRow && glyphRun.originY < maxY) {
321 Collections.sort(rows, ROW_SORTER);
322 }
323 if (glyphRun.originY > maxY) {
324 maxY = glyphRun.originY;
325 }
326 }
327 }
328 return rows;
329 }
330
331 private static String getVal(String localName, Attributes atts) {
332 for (int i = 0; i < atts.getLength(); i++) {
333 if (localName.equals(atts.getLocalName(i))) {
334 return atts.getValue(i);
335 }
336 }
337 return null;
338 }
339
340 final static class GlyphRun {
341
342 private enum DIRECTION {
343 LTR,
344 RTL
345 }
346
347 //TODO: use name in conjunction with Frag information
348 //to do a better job of extracting paragraph and table structure
349 private final String name;
350 private final float originY;
351 private final float originX;//not currently used, but could be used for bidi text calculations
352 private final String unicodeString;
353 private final String indicesString;//not currently used, but could be used for width calculations
354
355 //not used yet
356 private final DIRECTION direction;
357
358 private GlyphRun(String name, float originY, float originX, String unicodeString, Integer bidiLevel, String indicesString) {
359 this.name = name;
360 this.unicodeString = unicodeString;
361 this.originY = originY;
362 this.originX = originX;
363 if (bidiLevel == null) {
364 direction = DIRECTION.LTR;
365 } else {
366 if (bidiLevel % 2 == 0) {
367 direction = DIRECTION.LTR;
368 } else {
369 direction = DIRECTION.RTL;
370 }
371 }
372 this.indicesString = indicesString;
373 }
374 }
375
376 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.parser.microsoft.ooxml.xps;
18
19 import org.apache.poi.POIXMLDocument;
20 import org.apache.poi.POIXMLProperties;
21 import org.apache.poi.POIXMLTextExtractor;
22 import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
23 import org.apache.poi.openxml4j.opc.OPCPackage;
24 import org.apache.xmlbeans.XmlException;
25
26 import java.io.IOException;
27
28 /**
29 * Currently, mostly a pass-through class to hold pkg and properties
30 * and keep the general framework similar to our other POI-integrated
31 * extractors.
32 */
33 public class XPSTextExtractor extends POIXMLTextExtractor {
34
35 private final OPCPackage pkg;
36 private final POIXMLProperties properties;
37
38 public XPSTextExtractor(OPCPackage pkg) throws OpenXML4JException, XmlException, IOException {
39 super((POIXMLDocument)null);
40 this.pkg = pkg;
41 this.properties = new POIXMLProperties(pkg);
42
43 }
44
45 @Override
46 public OPCPackage getPackage() {
47 return pkg;
48 }
49
50 @Override
51 public String getText() {
52 return null;
53 }
54 public POIXMLProperties.CoreProperties getCoreProperties() {
55 return this.properties.getCoreProperties();
56 }
57
58 public POIXMLProperties.ExtendedProperties getExtendedProperties() {
59 return this.properties.getExtendedProperties();
60 }
61
62 public POIXMLProperties.CustomProperties getCustomProperties() {
63 return this.properties.getCustomProperties();
64 }
65 }
2828
2929 import org.apache.tika.io.IOUtils;
3030 import org.apache.tika.parser.ner.NERecogniser;
31 import org.json.JSONException;
32 import org.json.JSONObject;
31 import com.github.openjson.JSONException;
32 import com.github.openjson.JSONObject;
3333 import org.slf4j.Logger;
3434 import org.slf4j.LoggerFactory;
3535
1515 */
1616 package org.apache.tika.parser.ocr;
1717
18 import org.apache.commons.io.FilenameUtils;
19
1820 import java.io.File;
1921 import java.io.IOException;
2022 import java.io.InputStream;
2123 import java.io.Serializable;
24 import java.util.HashMap;
2225 import java.util.Locale;
26 import java.util.Map;
2327 import java.util.Properties;
28 import java.util.regex.Matcher;
29 import java.util.regex.Pattern;
2430
2531 /**
2632 * Configuration for TesseractOCRParser.
4046
4147 private static final long serialVersionUID = -4861942486845757891L;
4248
49 private static Pattern ALLOWABLE_PAGE_SEPARATORS_PATTERN =
50 Pattern.compile("(?i)^[-_/\\.A-Z0-9]+$");
51
52 private static Pattern ALLOWABLE_OTHER_PARAMS_PATTERN =
53 Pattern.compile("(?i)^[-_/\\.A-Z0-9]+$");
54
4355 public enum OUTPUT_TYPE {
4456 TXT,
4557 HOCR
7385 private int enableImageProcessing = 0;
7486
7587 // Path to ImageMagick program, if not on system path.
76 private String ImageMagickPath = "";
88 private String imageMagickPath = "";
7789
7890 // resolution of processed image (in dpi).
7991 private int density = 300;
90102 // factor by which image is to be scaled.
91103 private int resize = 900;
92104
105 // See setPageSeparator.
106 private String pageSeparator = "";
107
93108 // whether or not to preserve interword spacing
94109 private boolean preserveInterwordSpacing = false;
95110
96111 // whether or not to apply rotation calculated by the rotation.py script
97112 private boolean applyRotation = false;
113
114 // See addOtherTesseractConfig.
115 private Map<String, String> otherTesseractConfig = new HashMap<>();
98116
99117
100118 /**
148166 getProp(props, "maxFileSizeToOcr", getMaxFileSizeToOcr()));
149167 setTimeout(
150168 getProp(props, "timeout", getTimeout()));
151 String outputTypeString = props.getProperty("outputType");
152 if ("txt".equals(outputTypeString)) {
153 setOutputType(OUTPUT_TYPE.TXT);
154 } else if ("hocr".equals(outputTypeString)) {
155 setOutputType(OUTPUT_TYPE.HOCR);
156 }
169 setOutputType(getProp(props, "outputType", getOutputType().toString()));
157170 setPreserveInterwordSpacing(getProp(props, "preserveInterwordSpacing", false));
158171
159172 // set parameters for ImageMagick
174187 setApplyRotation(
175188 getProp(props, "applyRotation", getApplyRotation()));
176189
190 loadOtherTesseractConfig(props);
177191 }
178192
179193 /**
184198 }
185199
186200 /**
187 * Set the path to the Tesseract executable, needed if it is not on system path.
201 * Set the path to the Tesseract executable's directory, needed if it is not on system path.
188202 * <p>
189203 * Note that if you set this value, it is highly recommended that you also
190204 * set the path to the 'tessdata' folder using {@link #setTessdataPath}.
191205 * </p>
192206 */
193207 public void setTesseractPath(String tesseractPath) {
208
209 tesseractPath = FilenameUtils.normalize(tesseractPath);
194210 if (!tesseractPath.isEmpty() && !tesseractPath.endsWith(File.separator))
195211 tesseractPath += File.separator;
196212
210226 * (such as when Tesseract is built from source), it may be located elsewhere.
211227 */
212228 public void setTessdataPath(String tessdataPath) {
229 tessdataPath = FilenameUtils.normalize(tessdataPath);
213230 if (!tessdataPath.isEmpty() && !tessdataPath.endsWith(File.separator))
214231 tessdataPath += File.separator;
215232
255272 }
256273
257274 /**
275 * @see #setPageSeparator(String pageSeparator)
276 */
277 public String getPageSeparator() {
278 return pageSeparator;
279 }
280
281 /**
282 * The page separator to use in plain text output. This corresponds to Tesseract's page_separator config option.
283 * The default here is the empty string (i.e. no page separators). Note that this is also the default in
284 * Tesseract 3.x, but in Tesseract 4.0 the default is to use the form feed control character. We are overriding
285 * Tesseract 4.0's default here.
286 *
287 * @param pageSeparator
288 */
289 public void setPageSeparator(String pageSeparator) {
290 Matcher m = ALLOWABLE_PAGE_SEPARATORS_PATTERN.matcher(pageSeparator);
291 if (! m.find()) {
292 throw new IllegalArgumentException(pageSeparator + " contains illegal characters.\n"+
293 "If you trust this value, set it with setTrustedPageSeparator");
294 }
295 setTrustedPageSeparator(pageSeparator);
296 }
297
298 /**
299 * Same as {@link #setPageSeparator(String)} but does not perform
300 * any checks on the string.
301 * @param pageSeparator
302 */
303 public void setTrustedPageSeparator(String pageSeparator) {
304 this.pageSeparator = pageSeparator;
305 }
306
307 /**
258308 * Whether or not to maintain interword spacing. Default is <code>false</code>.
259309 *
260310 * @param preserveInterwordSpacing
318368
319369 /**
320370 * Set output type from ocr process. Default is "txt", but can be "hocr".
321 * Default value is 120s.
371 * Default value is {@link OUTPUT_TYPE#TXT}.
322372 */
323373 public void setOutputType(OUTPUT_TYPE outputType) {
324374 this.outputType = outputType;
375 }
376
377 public void setOutputType(String outputType) {
378 if (outputType == null) {
379 throw new IllegalArgumentException("outputType must not be null");
380 }
381 String lc = outputType.toLowerCase(Locale.US);
382 if ("txt".equals(lc)) {
383 setOutputType(OUTPUT_TYPE.TXT);
384 } else if ("hocr".equals(lc)) {
385 setOutputType(OUTPUT_TYPE.HOCR);
386 } else {
387 throw new IllegalArgumentException("outputType must be either 'txt' or 'hocr'");
388 }
389
390
325391 }
326392
327393 /**
399465 * Deafult value is gray.
400466 */
401467 public void setColorspace(String colorspace) {
402 if (!colorspace.equals(null)) {
403 this.colorspace = colorspace;
404 } else {
468 if (colorspace == null) {
405469 throw new IllegalArgumentException("Colorspace value cannot be null.");
406470 }
471 if (! colorspace.matches("(?i)^[-_A-Z0-9]+$")) {
472 throw new IllegalArgumentException("colorspace must match this pattern: (?i)^[-_A-Z0-9]+$");
473 }
474 this.colorspace = colorspace;
407475 }
408476
409477 /**
456524 }
457525
458526 /**
459 * @return path to ImageMagick file.
460 * @see #setImageMagickPath(String ImageMagickPath)
527 * @return path to ImageMagick executable directory.
528 * @see #setImageMagickPath(String imageMagickPath)
461529 */
462530 public String getImageMagickPath() {
463531
464 return ImageMagickPath;
465 }
466
467 /**
468 * Set the path to the ImageMagick executable, needed if it is not on system path.
469 *
470 * @param ImageMagickPath to ImageMagick file.
471 */
472 public void setImageMagickPath(String ImageMagickPath) {
473 if (!ImageMagickPath.isEmpty() && !ImageMagickPath.endsWith(File.separator))
474 ImageMagickPath += File.separator;
475
476 this.ImageMagickPath = ImageMagickPath;
532 return imageMagickPath;
533 }
534
535 /**
536 * Set the path to the ImageMagick executable directory, needed if it is not on system path.
537 *
538 * @param imageMagickPath to ImageMagick executable directory.
539 */
540 public void setImageMagickPath(String imageMagickPath) {
541 imageMagickPath = FilenameUtils.normalize(imageMagickPath);
542 if (!imageMagickPath.isEmpty() && !imageMagickPath.endsWith(File.separator))
543 imageMagickPath += File.separator;
544
545 this.imageMagickPath = imageMagickPath;
477546 }
478547
479548 /**
487556 /**
488557 * Sets whether or not a rotation value should be calculated and passed to ImageMagick.
489558 *
490 * @param true to calculate and apply rotation, false to skip. Default is false, true required Python installed.
559 * @param applyRotation to calculate and apply rotation, false to skip. Default is false, true required Python installed.
491560 */
492561 public void setApplyRotation(boolean applyRotation) {
493562 this.applyRotation = applyRotation;
563 }
564
565 /**
566 * @see #addOtherTesseractConfig(String, String)
567 */
568 public Map<String, String> getOtherTesseractConfig() {
569 return otherTesseractConfig;
570 }
571
572 /**
573 * Add a key-value pair to pass to Tesseract using its -c command line option.
574 * To see the possible options, run tesseract --print-parameters.
575 *
576 * You may also add these parameters in TesseractOCRConfig.properties; any
577 * key-value pair in the properties file where the key contains an underscore
578 * is passed directly to Tesseract.
579 *
580 * @param key
581 * @param value
582 */
583 public void addOtherTesseractConfig(String key, String value) {
584 if (key == null) {
585 throw new IllegalArgumentException("key must not be null");
586 }
587 if (value == null) {
588 throw new IllegalArgumentException("value must not be null");
589 }
590
591 Matcher m = ALLOWABLE_OTHER_PARAMS_PATTERN.matcher(key);
592 if (! m.find()) {
593 throw new IllegalArgumentException("Key contains illegal characters: "+key);
594 }
595 m.reset(value);
596 if (! m.find()) {
597 throw new IllegalArgumentException("Value contains illegal characters: "+value);
598 }
599
600 otherTesseractConfig.put(key.trim(), value.trim());
494601 }
495602
496603 /**
542649 property, propVal));
543650 }
544651
652 /**
653 * Populate otherTesseractConfig from the given properties.
654 * This assumes that any key-value pair where the key contains
655 * an underscore is an option to be passed opaquely to Tesseract.
656 *
657 * @param properties properties file to read from.
658 */
659 private void loadOtherTesseractConfig(Properties properties) {
660 for (String k : properties.stringPropertyNames()) {
661 if (k.contains("_")) {
662 addOtherTesseractConfig(k, properties.getProperty(k));
663 }
664 }
665 }
545666 }
3232 import java.io.Reader;
3333 import java.nio.charset.Charset;
3434 import java.nio.file.Files;
35 import java.nio.file.Paths;
3536 import java.nio.file.StandardCopyOption;
37 import java.util.ArrayList;
3638 import java.util.Arrays;
3739 import java.util.Collections;
3840 import java.util.HashMap;
5254 import org.apache.commons.exec.PumpStreamHandler;
5355 import org.apache.commons.io.FileUtils;
5456 import org.apache.commons.io.IOUtils;
57 import org.apache.commons.lang.SystemUtils;
5558 import org.apache.tika.config.Initializable;
5659 import org.apache.tika.config.InitializableProblemHandler;
5760 import org.apache.tika.config.Param;
109112 MediaType.image("jpx"), MediaType.image("x-portable-pixmap")
110113 })));
111114 private static Map<String,Boolean> TESSERACT_PRESENT = new HashMap<>();
115 private static Map<String,Boolean> IMAGE_MAGICK_PRESENT = new HashMap<>();
112116
113117
114118 @Override
143147 if (TESSERACT_PRESENT.containsKey(tesseract)) {
144148 return TESSERACT_PRESENT.get(tesseract);
145149 }
150 //prevent memory bloat
151 if (TESSERACT_PRESENT.size() > 100) {
152 TESSERACT_PRESENT.clear();
153 }
154 //check that the parent directory exists
155 if (! config.getTesseractPath().isEmpty() &&
156 ! Files.isDirectory(Paths.get(config.getTesseractPath()))) {
157 TESSERACT_PRESENT.put(tesseract, false);
158 return false;
159 }
160
146161 // Try running Tesseract from there, and see if it exists + works
147162 String[] checkCmd = { tesseract };
148163 boolean hasTesseract = ExternalParser.check(checkCmd);
153168
154169 private boolean hasImageMagick(TesseractOCRConfig config) {
155170 // Fetch where the config says to find ImageMagick Program
156 String ImageMagick = config.getImageMagickPath() + getImageMagickProg();
171 String ImageMagick = getImageMagickPath(config);
157172
158173 // Have we already checked for a copy of ImageMagick Program there?
159 if (TESSERACT_PRESENT.containsKey(ImageMagick)) {
160 return TESSERACT_PRESENT.get(ImageMagick);
174 if (IMAGE_MAGICK_PRESENT.containsKey(ImageMagick)) {
175 return IMAGE_MAGICK_PRESENT.get(ImageMagick);
176 }
177 //prevent memory bloat
178 if (IMAGE_MAGICK_PRESENT.size() > 100) {
179 IMAGE_MAGICK_PRESENT.clear();
180 }
181 //check that directory exists
182 if (!config.getImageMagickPath().isEmpty() &&
183 ! Files.isDirectory(Paths.get(config.getImageMagickPath()))) {
184 IMAGE_MAGICK_PRESENT.put(ImageMagick, false);
185 return false;
186 }
187 if (SystemUtils.IS_OS_WINDOWS && config.getImageMagickPath().isEmpty()) {
188 LOG.warn("Must specify path for imagemagick on Windows OS to avoid accidental confusion with convert.exe");
189 IMAGE_MAGICK_PRESENT.put(ImageMagick, false);
190 return false;
161191 }
162192
163193 // Try running ImageMagick program from there, and see if it exists + works
164194 String[] checkCmd = { ImageMagick };
165195 boolean hasImageMagick = ExternalParser.check(checkCmd);
166 TESSERACT_PRESENT.put(ImageMagick, hasImageMagick);
196 IMAGE_MAGICK_PRESENT.put(ImageMagick, hasImageMagick);
167197
168198 return hasImageMagick;
169199
170200 }
171201
202 private String getImageMagickPath(TesseractOCRConfig config) {
203 return config.getImageMagickPath() + getImageMagickProg();
204 }
205
172206 static boolean hasPython() {
173207 // check if python is installed and it has the required dependencies for the rotation program to run
174208 boolean hasPython = false;
175
209 TemporaryResources tmp = null;
176210 try {
177 TemporaryResources tmp = new TemporaryResources();
211 tmp = new TemporaryResources();
178212 File importCheck = tmp.createTemporaryFile();
179213 String prg = "import numpy, matplotlib, skimage";
180214 OutputStreamWriter out = new OutputStreamWriter(new FileOutputStream(importCheck), Charset.forName("UTF-8"));
186220 hasPython = true;
187221 }
188222
189 tmp.close();
190223
191224 } catch (Exception e) {
192225
226 } finally {
227 IOUtils.closeQuietly(tmp);
193228 }
194229
195230 return hasPython;
305340
306341 /**
307342 * This method is used to process the image to an OCR-friendly format.
308 * @param streamingObject input image to be processed
343 * @param scratchFile input image to be processed
309344 * @param config TesseractOCRconfig class to get ImageMagick properties
310345 * @throws IOException if an input error occurred
311346 * @throws TikaException if an exception timed out
312347 */
313 private void processImage(File streamingObject, TesseractOCRConfig config) throws IOException, TikaException {
348 private void processImage(File scratchFile, TesseractOCRConfig config) throws IOException, TikaException {
314349
315350 // fetch rotation script from resources
316351 InputStream in = getClass().getResourceAsStream("rotation.py");
317352 TemporaryResources tmp = new TemporaryResources();
318353 File rotationScript = tmp.createTemporaryFile();
319354 Files.copy(in, rotationScript.toPath(), StandardCopyOption.REPLACE_EXISTING);
320
321 String cmd = "python " + rotationScript.getAbsolutePath() + " -f " + streamingObject.getAbsolutePath();
355
356 CommandLine commandLine = new CommandLine("python");
357 String[] args = {"-W",
358 "ignore",
359 rotationScript.getAbsolutePath(),
360 "-f",
361 scratchFile.getAbsolutePath()};
362 commandLine.addArguments(args, true);
322363 String angle = "0";
323364
324365 DefaultExecutor executor = new DefaultExecutor();
327368 executor.setStreamHandler(streamHandler);
328369
329370 // determine the angle of rotation required to make the text horizontal
330 CommandLine cmdLine = CommandLine.parse(cmd);
331371 if(config.getApplyRotation() && hasPython()) {
332372 try {
333 executor.execute(cmdLine);
334 angle = outputStream.toString("UTF-8").trim();
373 executor.execute(commandLine);
374 String tmpAngle = outputStream.toString("UTF-8").trim();
375 //verify that you've gotten a numeric value out
376 Double.parseDouble(tmpAngle);
377 angle = tmpAngle;
335378 } catch(Exception e) {
336379
337380 }
338381 }
339382
340383 // process the image - parameter values can be set in TesseractOCRConfig.properties
341 String line = "convert -density " + config.getDensity() + " -depth " + config.getDepth() +
342 " -colorspace " + config.getColorspace() + " -filter " + config.getFilter() +
343 " -resize " + config.getResize() + "% -rotate "+ angle + " " + streamingObject.getAbsolutePath() +
344 " " + streamingObject.getAbsolutePath();
345 cmdLine = CommandLine.parse(line);
384 commandLine = new CommandLine(getImageMagickPath(config));
385 args = new String[]{
386 "-density", Integer.toString(config.getDensity()),
387 "-depth ", Integer.toString(config.getDepth()),
388 "-colorspace", config.getColorspace(),
389 "-filter", config.getFilter(),
390 "-resize", config.getResize() + "%",
391 "-rotate", angle,
392 scratchFile.getAbsolutePath(),
393 scratchFile.getAbsolutePath()
394 };
395 commandLine.addArguments(args, true);
346396 try {
347 executor.execute(cmdLine);
397 executor.execute(commandLine);
348398 } catch(Exception e) {
349399
350400 }
460510 * if an input error occurred
461511 */
462512 private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException {
463 String[] cmd = { config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l",
464 config.getLanguage(), "-psm", config.getPageSegMode(),
465 config.getOutputType().name().toLowerCase(Locale.US),
513 ArrayList<String> cmd = new ArrayList<>(Arrays.asList(
514 config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l",
515 config.getLanguage(), "--psm", config.getPageSegMode()
516 ));
517 for (Map.Entry<String, String> entry : config.getOtherTesseractConfig().entrySet()) {
518 cmd.add("-c");
519 cmd.add(entry.getKey() + "=" + entry.getValue());
520 }
521 cmd.addAll(Arrays.asList(
522 "-c", "page_separator=" + config.getPageSeparator(),
466523 "-c",
467 (config.getPreserveInterwordSpacing())? "preserve_interword_spaces=1" : "preserve_interword_spaces=0"};
524 (config.getPreserveInterwordSpacing())? "preserve_interword_spaces=1" : "preserve_interword_spaces=0",
525 config.getOutputType().name().toLowerCase(Locale.US)
526 ));
468527 ProcessBuilder pb = new ProcessBuilder(cmd);
469528 setEnv(config, pb);
470529 final Process process = pb.start();
131131 throws IOException, SAXException, TikaException {
132132
133133 PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig);
134 if (localConfig.getSetKCMS()) {
135 System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider");
136 }
134137
135138 PDDocument pdfDocument = null;
136139
221224 metadata.set(AccessPermissions.CAN_PRINT_DEGRADED,
222225 Boolean.toString(ap.canPrintDegraded()));
223226
227 if (document.getDocumentCatalog().getLanguage() != null) {
228 metadata.set(TikaCoreProperties.LANGUAGE, document.getDocumentCatalog().getLanguage());
229 }
224230
225231 //now go for the XMP
226232 Document dom = loadDOM(document.getDocumentCatalog().getMetadata(), metadata, context);
699705 }
700706
701707 @Field
708 void setSetKCMS(boolean setKCMS) {
709 defaultConfig.setSetKCMS(setKCMS);
710 }
711
712 @Field
702713 void setInitializableProblemHander(String name) {
703714 if ("ignore".equals(name)) {
704715 setInitializableProblemHandler(InitializableProblemHandler.IGNORE);
763774 }
764775 StringBuilder sb = new StringBuilder();
765776 try {
766 Class.forName("com.levigo.jbig2.JBIG2ImageReader");
767 } catch (ClassNotFoundException e) {
768 sb.append("JBIG2ImageReader not loaded. jbig2 files will be ignored\n");
769 sb.append("See https://pdfbox.apache.org/2.0/dependencies.html#jai-image-io\n");
770 sb.append("for optional dependencies.\n");
771 }
772 try {
773777 Class.forName("com.github.jaiimageio.impl.plugins.tiff.TIFFImageWriter");
774778 } catch (ClassNotFoundException e) {
775779 sb.append("TIFFImageWriter not loaded. tiff files will not be processed\n");
135135 private boolean extractActions = false;
136136
137137 private long maxMainMemoryBytes = -1;
138
139 private boolean setKCMS = false;
138140
139141 public PDFParserConfig() {
140142 init(this.getClass().getResourceAsStream("PDFParser.properties"));
214216
215217 setExtractActions(getBooleanProp(props.getProperty("extractActions"), false));
216218
219 setSetKCMS(getBooleanProp(props.getProperty("setKCMS"), false));
217220
218221 boolean checkExtractAccessPermission = getBooleanProp(props.getProperty("checkExtractAccessPermission"), false);
219222 boolean allowExtractionForAccessibility = getBooleanProp(props.getProperty("allowExtractionForAccessibility"), true);
685688
686689 public void setMaxMainMemoryBytes(int maxMainMemoryBytes) {
687690 this.maxMainMemoryBytes = maxMainMemoryBytes;
691 }
692
693 /**
694 * <p>
695 * Whether to call <code>System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider")</code>.
696 * KCMS is the unmaintained, legacy provider and is far faster than the newer replacement.
697 * However, there are stability and security risks with using the unmaintained legacy provider.
698 * </p>
699 * <p>
700 * Note, of course, that this is <b>not</b> thread safe. If the value is <code>false</code>
701 * in your first thread, and the second thread changes this to <code>true</code>,
702 * the system property in the first thread will now be <code>true</code>.
703 * </p>
704 * <p>
705 * Default is <code>false</code>.
706 * </p>
707 * @param setKCMS whether or not to set KCMS
708 */
709 public void setSetKCMS(boolean setKCMS) {
710 this.setKCMS = setKCMS;
711 }
712
713 public boolean getSetKCMS() {
714 return setKCMS;
688715 }
689716
690717 private ImageType parseImageType(String ocrImageType) {
2020 import java.io.BufferedInputStream;
2121 import java.io.IOException;
2222 import java.io.InputStream;
23 import java.util.Collections;
24 import java.util.HashMap;
25 import java.util.HashSet;
26 import java.util.Map;
2327 import java.util.Set;
2428
2529 import org.apache.commons.compress.MemoryLimitException;
7478 private static final MediaType ZLIB = MediaType.application("zlib");
7579 private static final MediaType LZMA = MediaType.application("x-lzma");
7680 private static final MediaType LZ4_FRAMED = MediaType.application("x-lz4");
77
78 private static final Set<MediaType> SUPPORTED_TYPES =
79 MediaType.set(BZIP, BZIP2, GZIP, GZIP_ALT, LZ4_FRAMED, COMPRESS,
80 XZ, PACK, SNAPPY_FRAMED, ZLIB, LZMA);
81 private static final MediaType ZSTD = MediaType.application("zstd");
82 private static final MediaType DEFLATE64= MediaType.application("deflate64");
83
84 private static Set<MediaType> SUPPORTED_TYPES;
85 private static Map<String, String> MIMES_TO_NAME;
86
    //Build SUPPORTED_TYPES once at class-load time.  Brotli and zstd
    //support depend on optional third-party jars, so those two types are
    //only advertised when the decoder class is found on the classpath.
    static {
        Set<MediaType> TMP_SET = new HashSet<>();
        TMP_SET.addAll(
                MediaType.set(BZIP, BZIP2, DEFLATE64, GZIP, GZIP_ALT, LZ4_FRAMED, COMPRESS,
                        XZ, PACK, SNAPPY_FRAMED, ZLIB, LZMA));
        try {
            //probe for the optional brotli decoder (org.brotli:dec)
            Class.forName("org.brotli.dec.BrotliInputStream");
            TMP_SET.add(BROTLI);
        } catch (NoClassDefFoundError|ClassNotFoundException e) {
            //swallow -- brotli is not available, so do not advertise it
        } 
        try {
            //probe for the optional zstd decoder (com.github.luben:zstd-jni)
            Class.forName("com.github.luben.zstd.ZstdInputStream");
            TMP_SET.add(ZSTD);
        } catch (NoClassDefFoundError|ClassNotFoundException e) {
            //swallow -- zstd is not available, so do not advertise it
        }
        SUPPORTED_TYPES = Collections.unmodifiableSet(TMP_SET);
    }
106
    static {
        //map the mime type strings to the compressor stream names
        //used by CompressorStreamFactory, so that a content type that was
        //already detected can be handed straight to the factory instead of
        //running detection a second time (see getStreamName(Metadata))
        Map<String, String> tmpMimesToName = new HashMap<>();
        tmpMimesToName.put(BZIP2.toString(), CompressorStreamFactory.BZIP2);
        tmpMimesToName.put(GZIP.toString(), CompressorStreamFactory.GZIP);
        tmpMimesToName.put(LZ4_FRAMED.toString(), CompressorStreamFactory.LZ4_FRAMED);
        tmpMimesToName.put(LZ4_BLOCK.toString(), CompressorStreamFactory.LZ4_BLOCK);
        tmpMimesToName.put(XZ.toString(), CompressorStreamFactory.XZ);
        tmpMimesToName.put(PACK.toString(), CompressorStreamFactory.PACK200);
        tmpMimesToName.put(SNAPPY_FRAMED.toString(), CompressorStreamFactory.SNAPPY_FRAMED);
        tmpMimesToName.put(ZLIB.toString(), CompressorStreamFactory.DEFLATE);
        tmpMimesToName.put(COMPRESS.toString(), CompressorStreamFactory.Z);
        tmpMimesToName.put(LZMA.toString(), CompressorStreamFactory.LZMA);
        tmpMimesToName.put(BROTLI.toString(), CompressorStreamFactory.BROTLI);
        tmpMimesToName.put(ZSTD.toString(), CompressorStreamFactory.ZSTANDARD);
        MIMES_TO_NAME = Collections.unmodifiableMap(tmpMimesToName);
    }
124
81125
82126 private int memoryLimitInKb = 100000;//100MB
83127
140184 return SNAPPY_RAW;
141185 } else if (CompressorStreamFactory.LZMA.equals(name)) {
142186 return LZMA;
187 } else if (CompressorStreamFactory.ZSTANDARD.equals(name)) {
188 return ZSTD;
189 } else if (CompressorStreamFactory.DEFLATE64.equals(name)) {
190 return DEFLATE64;
143191 } else {
144192 return MediaType.OCTET_STREAM;
145193 }
174222 });
175223 CompressorStreamFactory factory =
176224 new CompressorStreamFactory(options.decompressConcatenated(metadata), memoryLimitInKb);
177 cis = factory.createCompressorInputStream(stream);
225 //if we've already identified it via autodetect
226 //trust that and go with the appropriate name
227 //to avoid calling CompressorStreamFactory.detect() twice
228 String name = getStreamName(metadata);
229 if (name != null) {
230 cis = factory.createCompressorInputStream(name, stream);
231 } else {
232 cis = factory.createCompressorInputStream(stream);
233 MediaType type = getMediaType(cis);
234 if (!type.equals(MediaType.OCTET_STREAM)) {
235 metadata.set(CONTENT_TYPE, type.toString());
236 }
237 }
178238 } catch (CompressorException e) {
179239 if (e.getCause() != null && e.getCause() instanceof MemoryLimitException) {
180240 throw new TikaMemoryLimitException(e.getMessage());
182242 throw new TikaException("Unable to uncompress document stream", e);
183243 }
184244
185 MediaType type = getMediaType(cis);
186 if (!type.equals(MediaType.OCTET_STREAM)) {
187 metadata.set(CONTENT_TYPE, type.toString());
188 }
189245
190246 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
191247 xhtml.startDocument();
208264 name = name.substring(0, name.length() - 5);
209265 } else if (name.endsWith(".pack")) {
210266 name = name.substring(0, name.length() - 5);
267 } else if (name.endsWith(".br")) {
268 name = name.substring(0, name.length() - 3);
211269 } else if (name.length() > 0) {
212270 name = GzipUtils.getUncompressedFilename(name);
213271 }
227285 xhtml.endDocument();
228286 }
229287
    /**
     * Looks up the CompressorStreamFactory stream name that corresponds to
     * the content type already recorded in the metadata, so that detection
     * does not have to run twice.
     *
     * @param metadata metadata that may already contain a Content-Type value
     * @return CompressorStream name based on the content-type value
     *         in metadata or <code>null</code> if not found
     */
    private String getStreamName(Metadata metadata) {
        String mimeString = metadata.get(Metadata.CONTENT_TYPE);
        if (mimeString == null) {
            return null;
        }
        return MIMES_TO_NAME.get(mimeString);
    }
301
230302 @Field
231303 public void setMemoryLimitInKb(int memoryLimitInKb) {
232304 this.memoryLimitInKb = memoryLimitInKb;
251251 }
252252
253253 SevenZFile sevenz;
254 if (password == null) {
255 sevenz = new SevenZFile(tstream.getFile());
256 } else {
257 sevenz = new SevenZFile(tstream.getFile(), password.getBytes("UnicodeLittleUnmarked"));
254 try{
255 if (password == null) {
256 sevenz = new SevenZFile(tstream.getFile());
257 } else {
258 sevenz = new SevenZFile(tstream.getFile(), password.getBytes("UnicodeLittleUnmarked"));
259 }
260 }catch(PasswordRequiredException e){
261 throw new EncryptedDocumentException(e);
258262 }
259263
260264 // Pending a fix for COMPRESS-269 / TIKA-1525, this bit is a little nasty
2020 import java.io.ByteArrayInputStream;
2121 import java.io.IOException;
2222 import java.io.InputStream;
23 import java.nio.charset.StandardCharsets;
2324 import java.util.Enumeration;
2425 import java.util.HashSet;
2526 import java.util.Iterator;
5556 * formats to figure out exactly what the file is.
5657 */
5758 public class ZipContainerDetector implements Detector {
59
60 //Regrettably, some tiff files can be incorrectly identified
61 //as tar files. We need this ugly workaround to rule out TIFF.
62 //If commons-compress ever chooses to take over TIFF detection
63 //we can remove all of this. See TIKA-2591.
64 private final static MediaType TIFF = MediaType.image("tiff");
65 private final static byte[][] TIFF_SIGNATURES = new byte[3][];
66 static {
67 TIFF_SIGNATURES[0] = new byte[]{'M','M',0x00,0x2a};
68 TIFF_SIGNATURES[1] = new byte[]{'I','I',0x2a, 0x00};
69 TIFF_SIGNATURES[2] = new byte[]{'M','M', 0x00, 0x2b};
70 }
71
5872 private static final Pattern MACRO_TEMPLATE_PATTERN = Pattern.compile("macroenabledtemplate$", Pattern.CASE_INSENSITIVE);
5973
6074 // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes
6377 // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes
6478 private static final String STRICT_CORE_DOCUMENT =
6579 "http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument";
80
81 private static final String XPS_DOCUMENT =
82 "http://schemas.microsoft.com/xps/2005/06/fixedrepresentation";
6683
6784 /** Serial version UID */
6885 private static final long serialVersionUID = 2891763938430295453L;
8299 int length = tis.peek(prefix);
83100
84101 MediaType type = detectArchiveFormat(prefix, length);
85 if (PackageParser.isZipArchive(type)
86 && TikaInputStream.isTikaInputStream(input)) {
102
103 if (type == TIFF) {
104 return TIFF;
105 } else if (PackageParser.isZipArchive(type)
106 && TikaInputStream.isTikaInputStream(input)) {
87107 return detectZipFormat(tis);
88108 } else if (!type.equals(MediaType.OCTET_STREAM)) {
89109 return type;
108128 }
109129 }
110130
131 private static boolean isTiff(byte[] prefix) {
132 for (byte[] sig : TIFF_SIGNATURES) {
133 if(arrayStartWith(sig, prefix)) {
134 return true;
135 }
136 }
137 return false;
138 }
139
140 private static boolean arrayStartWith(byte[] needle, byte[] haystack) {
141 if (haystack.length < needle.length) {
142 return false;
143 }
144 for (int i = 0; i < needle.length; i++) {
145 if (haystack[i] != needle[i]) {
146 return false;
147 }
148 }
149 return true;
150 }
151
111152 private static MediaType detectArchiveFormat(byte[] prefix, int length) {
153 if (isTiff(prefix)) {
154 return TIFF;
155 }
112156 try {
113157 String name = ArchiveStreamFactory.detect(new ByteArrayInputStream(prefix, 0, length));
114158 return PackageParser.getMediaType(name);
212256 return null;
213257 } catch (IOException e) {
214258 return null;
259 } catch (SecurityException e) {
260 //TIKA-2571
261 throw e;
215262 } catch (RuntimeException e) {
216263 return null;
217264 } catch (InvalidFormatException e) {
244291 PackagePart corePart = pkg.getPart(core.getRelationship(0));
245292 String coreType = corePart.getContentType();
246293
294 if (coreType.contains(".xps")) {
295 return MediaType.application("vnd.ms-package.xps");
296 }
247297 // Turn that into the type of the overall document
248298 String docType = coreType.substring(0, coreType.lastIndexOf('.'));
249299
262312 /**
263313 * Detects Open XML Paper Specification (XPS)
264314 */
265 private static MediaType detectXPSOPC(OPCPackage pkg) {
315 public static MediaType detectXPSOPC(OPCPackage pkg) {
266316 PackageRelationshipCollection xps =
267317 pkg.getRelationshipsByType("http://schemas.microsoft.com/xps/2005/06/fixedrepresentation");
268318 if (xps.size() == 1) {
4545 import org.apache.tika.parser.ParseContext;
4646 import org.apache.tika.parser.recognition.ObjectRecogniser;
4747 import org.apache.tika.parser.recognition.RecognisedObject;
48 import org.json.JSONArray;
49 import org.json.JSONObject;
48 import com.github.openjson.JSONArray;
49 import com.github.openjson.JSONObject;
5050 import org.slf4j.Logger;
5151 import org.slf4j.LoggerFactory;
5252 import org.xml.sax.ContentHandler;
4646 import org.apache.tika.mime.MimeTypeException;
4747 import org.apache.tika.parser.ParseContext;
4848 import org.apache.tika.parser.recognition.RecognisedObject;
49 import org.json.JSONArray;
50 import org.json.JSONObject;
49 import com.github.openjson.JSONArray;
50 import com.github.openjson.JSONObject;
5151 import org.slf4j.Logger;
5252 import org.slf4j.LoggerFactory;
5353 import org.xml.sax.ContentHandler;
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.parser.utils;
18
19 import org.apache.tika.mime.MediaType;
20
21 import java.io.ByteArrayInputStream;
22 import java.io.InputStream;
23 import java.util.Arrays;
24 import java.util.Objects;
25
26 public class DataURIScheme {
27
28
29 private final String rawMediaTypeString;
30 private final boolean isBase64;
31 private final byte[] data;
32
33 DataURIScheme(String mediaTypeString, boolean isBase64, byte[] data) {
34 this.rawMediaTypeString = mediaTypeString;
35 this.isBase64 = isBase64;
36 this.data = data;
37 }
38
39 public InputStream getInputStream() {
40 return new ByteArrayInputStream(data);
41 }
42
43 /**
44 *
45 * @return parsed media type or <code>null</code> if parse fails or if media type string was
46 * not specified
47 */
48 public MediaType getMediaType() {
49 if (rawMediaTypeString != null) {
50 return MediaType.parse(rawMediaTypeString);
51 }
52 return null;
53 }
54
55 public boolean isBase64() {
56 return isBase64;
57 }
58
59 @Override
60 public boolean equals(Object o) {
61 if (this == o) return true;
62 if (!(o instanceof DataURIScheme)) return false;
63 DataURIScheme that = (DataURIScheme) o;
64 return isBase64() == that.isBase64() &&
65 Objects.equals(rawMediaTypeString, that.rawMediaTypeString) &&
66 Arrays.equals(data, that.data);
67 }
68
69 @Override
70 public int hashCode() {
71
72 int result = Objects.hash(rawMediaTypeString, isBase64());
73 result = 31 * result + Arrays.hashCode(data);
74 return result;
75 }
76 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.parser.utils;
18
19 import org.apache.tika.exception.TikaException;
20
21 public class DataURISchemeParseException extends TikaException {
22
23 public DataURISchemeParseException(String msg) {
24 super(msg);
25 }
26
27 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.parser.utils;
18
19 import org.apache.commons.codec.binary.Base64;
20 import org.apache.tika.mime.MediaType;
21
22 import java.nio.charset.Charset;
23 import java.nio.charset.IllegalCharsetNameException;
24 import java.nio.charset.StandardCharsets;
25 import java.util.ArrayList;
26 import java.util.Collections;
27 import java.util.List;
28 import java.util.regex.Matcher;
29 import java.util.regex.Pattern;
30
31 /**
32 * Not thread safe. Create a separate util for each thread.
33 */
34 public class DataURISchemeUtil {
35
36 public static String UNSPECIFIED_MEDIA_TYPE = "text/plain;charset=US-ASCII";
37
38 private static Pattern PARSE_PATTERN = Pattern.compile("(?s)data:([^,]*?)(base64)?,(.*)$");
39 private static Pattern EXTRACT_PATTERN =
40 Pattern.compile("(?s)data:([^,]*?)(base64)?,([^\"\']*)[\"\']");
41 private final Matcher parseMatcher = PARSE_PATTERN.matcher("");
42 private final Matcher extractMatcher = EXTRACT_PATTERN.matcher("");
43 Base64 base64 = new Base64();
44
45 public DataURIScheme parse(String string) throws DataURISchemeParseException {
46 parseMatcher.reset(string);
47 if (parseMatcher.find()) {
48 return build(parseMatcher.group(1), parseMatcher.group(2), parseMatcher.group(3));
49 }
50 throw new DataURISchemeParseException("Couldn't find expected pattern");
51 }
52
53 private DataURIScheme build(String mediaTypeString, String isBase64, String dataString) {
54 byte[] data = null;
55 //strip out back slashes as you might have in css
56 dataString = (dataString != null) ?
57 dataString.replaceAll("\\\\", " ") : dataString;
58
59 if (dataString == null || dataString.length() == 0) {
60 data = new byte[0];
61 } else if (isBase64 != null) {
62 data = base64.decode(dataString);
63 } else {
64 //TODO: handle encodings
65 MediaType mediaType = MediaType.parse(mediaTypeString);
66 Charset charset = StandardCharsets.UTF_8;
67 if (mediaType.hasParameters()) {
68 String charsetName = mediaType.getParameters().get("charset");
69 if (charsetName != null && Charset.isSupported(charsetName)) {
70 try {
71 charset = Charset.forName(charsetName);
72 } catch (IllegalCharsetNameException e) {
73 //swallow and default to UTF-8
74 }
75 }
76 }
77 data = dataString.getBytes(charset);
78 }
79 return new DataURIScheme(mediaTypeString, (isBase64 != null), data);
80 }
81
82 /**
83 * Extracts DataURISchemes from free text, as in javascript.
84 *
85 * @param string
86 * @return list of extracted DataURISchemes
87 */
88 public List<DataURIScheme> extract(String string) {
89 extractMatcher.reset(string);
90 List<DataURIScheme> list = null;
91 while (extractMatcher.find()) {
92 DataURIScheme dataURIScheme = build(extractMatcher.group(1),
93 extractMatcher.group(2), extractMatcher.group(3));
94 if (list == null) {
95 list = new ArrayList<>();
96 }
97 list.add(dataURIScheme);
98 }
99 return (list == null) ? Collections.EMPTY_LIST : list;
100 }
101
102 }
0 # Licensed to the Apache Software Foundation (ASF) under one or more
1 # contributor license agreements. See the NOTICE file distributed with
2 # this work for additional information regarding copyright ownership.
3 # The ASF licenses this file to You under the Apache License, Version 2.0
4 # (the "License"); you may not use this file except in compliance with
5 # the License. You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 646
15 737
16 775
17 813
18 819
19 858
20 874
21 8859_1
22 8859_13
23 8859_15
24 8859_2
25 8859_4
26 8859_5
27 8859_7
28 8859_9
29 912
30 914
31 915
32 920
33 923
34 ansi-1251
35 ascii
36 ascii7
37 cesu8
38 cp1250
39 cp1251
40 cp1252
41 cp1253
42 cp1254
43 cp1257
44 cp5346
45 cp5347
46 cp5348
47 cp5349
48 cp5350
49 cp5353
50 cp737
51 cp813
52 cp858
53 cp874
54 cp912
55 cp914
56 cp915
57 cp920
58 cp923
59 csibm862
60 csisolatin0
61 csisolatin9
62 cspcp855
63 default
64 ibm-437
65 ibm-737
66 ibm-775
67 ibm-813
68 ibm-819
69 ibm-850
70 ibm-852
71 ibm-855
72 ibm-857
73 ibm-862
74 ibm-866
75 ibm-874
76 ibm-912
77 ibm-914
78 ibm-915
79 ibm-920
80 ibm-923
81 ibm737
82 ibm813
83 ibm874
84 ibm912
85 ibm914
86 ibm915
87 ibm920
88 ibm923
89 iso8859-1
90 iso8859-13
91 iso8859-15
92 iso8859-2
93 iso8859-4
94 iso8859-5
95 iso8859-7
96 iso8859-9
97 iso8859_1
98 iso8859_13
99 iso8859_15
100 iso8859_15_fdis
101 iso8859_2
102 iso8859_4
103 iso8859_5
104 iso8859_7
105 iso8859_9
106 iso_8859-13
107 iso_8859_1
108 koi8
109 koi8_r
110 koi8_u
111 l9
112 latin0
113 latin9
114 sun_eu_greek
115 unicode
116 unicode-1-1-utf-8
117 unicodebig
118 unicodebigunmarked
119 unicodelittle
120 unicodelittleunmarked
121 utf-32be-bom
122 utf-32le-bom
123 utf16
124 utf32
125 utf8
126 utf_16
127 utf_16be
128 utf_16le
129 utf_32
130 utf_32be
131 utf_32be_bom
132 utf_32le
133 utf_32le_bom
134 windows-437
135 x-utf-16be
136 x-utf-16le
137 x-utf-32be
138 x-utf-32le
3636 ocrImageScale 2.0
3737 # Use up to 500MB when loading a pdf into a PDDocument
3838 maxMainMemoryBytes 524288000
39 #whether or not to set KCMS for faster (but legacy/unsupported) image rendering
40 setKCMS false
378378 // For spanned zip files, the .zip file doesn't have the header, it's the other parts
379379 assertTypeByData("application/octet-stream", "test-documents-spanned.zip");
380380 assertTypeByData("application/zip", "test-documents-spanned.z01");
381
382 assertTypeDetection("testZSTD.zstd", "application/zstd");
381383 }
382384
383385 @Test
896898 // MBOX
897899 assertTypeDetection("headers.mbox", "application/mbox");
898900
899 // Thunderbird - doesn't currently work by name
900 assertTypeByNameAndData("message/rfc822", "testThunderbirdEml.eml");
901 }
902
901 // Thunderbird
902 assertTypeDetection("testThunderbirdEml.eml", "message/rfc822");
903
904 //dkim header
905 assertTypeDetection("testThunderbirdEml.eml", "message/rfc822");
906
907 //x- custom header
908 assertTypeDetection("testRFC822_x-.eml", "message/rfc822");
909
910 //embedded xhtml and img
911 assertTypeDetection("testEML_embedded_xhtml_and_img.eml", "message/rfc822");
912
913 }
914
915 @Test
916 public void testMessageNews() throws Exception {
917 assertTypeByData("message/news", "testMessageNews.txt");
918 }
903919 @Test
904920 public void testAxCrypt() throws Exception {
905921 // test-TXT.txt encrypted with a key of "tika"
2828 import javax.xml.transform.sax.TransformerHandler;
2929 import javax.xml.transform.stream.StreamResult;
3030 import java.io.ByteArrayInputStream;
31 import java.io.ByteArrayOutputStream;
3132 import java.io.File;
3233 import java.io.IOException;
3334 import java.io.InputStream;
3839 import java.nio.file.Path;
3940 import java.nio.file.Paths;
4041 import java.util.ArrayList;
42 import java.util.Arrays;
4143 import java.util.HashMap;
4244 import java.util.List;
4345 import java.util.Map;
5153 import java.util.concurrent.Future;
5254 import java.util.regex.Pattern;
5355
56 import org.apache.commons.codec.binary.Base64;
5457 import org.apache.tika.Tika;
5558 import org.apache.tika.TikaTest;
5659 import org.apache.tika.config.ServiceLoader;
5861 import org.apache.tika.detect.AutoDetectReader;
5962 import org.apache.tika.detect.EncodingDetector;
6063 import org.apache.tika.exception.TikaException;
64 import org.apache.tika.io.IOUtils;
6165 import org.apache.tika.io.TikaInputStream;
6266 import org.apache.tika.metadata.Geographic;
6367 import org.apache.tika.metadata.Metadata;
12641268 }
12651269
    @Test
    public void testDataURI() throws Exception {
        //TIKA-2563: an image embedded in HTML via the data: URI scheme
        //should be extracted as its own attachment
        List<Metadata> metadataList = getRecursiveMetadata("testHTML_embedded_img.html");
        assertEquals(2, metadataList.size());
        String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
        assertContains("some content", content);
        //make sure that you've truncated the data: value
        assertContains("src=\"data:\"", content);
        Metadata imgMetadata = metadataList.get(1);
        assertEquals("image/jpeg", imgMetadata.get(Metadata.CONTENT_TYPE));
        assertContains("moscow-birds",
                Arrays.asList(imgMetadata.getValues(Metadata.SUBJECT)));
    }
1283
1284 @Test
1285 public void testDataURIInJS() throws Exception {
1286 InputStream is = getClass().getResourceAsStream("/org/apache/tika/parser/html/tika-config.xml");
1287 assertNotNull(is);
1288 TikaConfig tikaConfig = new TikaConfig(is);
1289 Parser p = new AutoDetectParser(tikaConfig);
1290 List<Metadata> metadataList = getRecursiveMetadata("testHTML_embedded_img_in_js.html", p);
1291 assertEquals(3, metadataList.size());
1292 String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
1293 assertContains("some content", content);
1294 Metadata imgMetadata = metadataList.get(1);
1295 assertEquals("image/jpeg", imgMetadata.get(Metadata.CONTENT_TYPE));
1296 assertContains("moscow-birds",
1297 Arrays.asList(imgMetadata.getValues(Metadata.SUBJECT)));
1298 }
1299
1300 @Test
12671301 public void testMultiThreadingEncodingDetection() throws Exception {
12681302 List<EncodingDetector> detectors = new ArrayList<>();
12691303 ServiceLoader loader =
13501384 }
13511385 }
13521386 }
1387
1388 @Test
1389 public void testCharsetsNotSupportedByIANA() throws Exception {
1390 assertContains("This is a sample text",
1391 getXML("testHTML_charset_utf8.html").xml);
1392
1393 assertContains("This is a sample text",
1394 getXML("testHTML_charset_utf16le.html").xml);
1395
1396 }
13531397 }
259259 metadata.get(Metadata.SUBJECT));
260260 }
261261
262 @Test
263 public void testMainBody() throws Exception {
264 //test that the first text or html chunk is processed in the main body
265 //not treated as an attachment. TIKA-2547
266 List<Metadata> metadataList = getRecursiveMetadata("testRFC822_oddfrom");
267 assertEquals(7, metadataList.size());
268 assertContains("Air Quality Planning", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
269
270 //Make sure text alternative doesn't get treated as an attachment
271 metadataList = getRecursiveMetadata("testRFC822_normal_zip");
272 assertEquals(3, metadataList.size());
273 assertContains("This is the HTML part", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
274 assertEquals("application/zip", metadataList.get(2).get(Metadata.CONTENT_TYPE));
275
276 metadataList = getRecursiveMetadata("testRFC822-txt-body");
277 assertEquals(2, metadataList.size());
278 assertContains("body 1", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
279 }
280
262281 /**
263282 * Test for TIKA-640, increase header max beyond 10k bytes
264283 */
670689 assertEquals("application/pdf", metadataList.get(1).get(Metadata.CONTENT_TYPE));
671690 assertEquals("/tzora-titan-4-hummer-xl-manual.pdf", metadataList.get(1).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
672691 }
692
693 @Test
694 public void testSimpleBodyInlined() throws Exception {
695 List<Metadata> metadataList = getRecursiveMetadata("testRFC822_simple_inline_body.txt");
696 assertEquals(1, metadataList.size());
697 assertContains("asked", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
698 }
673699 }
1919 import static org.junit.Assert.assertTrue;
2020 import static org.junit.Assert.fail;
2121
22 import java.io.File;
2223 import java.io.InputStream;
2324 import java.text.DecimalFormatSymbols;
2425 import java.util.List;
543544 getXML("testEXCEL_phonetic.xls", parser).xml);
544545
545546 }
547
548 @Test
549 public void testLabelsAreExtracted() throws Exception {
550 String xml = getXML("testEXCEL_labels-govdocs-515858.xls").xml;
551 assertContains("Morocco", xml);
552 }
546553 }
3030 import org.apache.tika.parser.ParseContext;
3131 import org.apache.tika.parser.RecursiveParserWrapper;
3232 import org.apache.tika.sax.BodyContentHandler;
33 import org.junit.Ignore;
3334 import org.junit.Test;
3435 import org.xml.sax.ContentHandler;
3536
6364 assertContains("<p>[1] This is a footnote.", xml);
6465 assertContains("<p>This is the header text.</p>", xml);
6566 assertContains("<p>This is the footer text.</p>", xml);
66 assertContains("<p>Here is a text box</p>", xml);
67 assertContainsCount("<p>Here is a text box</p>", xml, 1);
6768 assertContains("<p>Bold ", xml);
6869 assertContains("italic underline superscript subscript", xml);
6970 assertContains("underline", xml);
292293 public void testEncrypted() throws Exception {
293294 getXML("testPPT_protected_passtika.ppt");
294295 }
296
297 @Test
298 public void testGroups() throws Exception {
299 List<Metadata> metadataList = getRecursiveMetadata("testPPT_groups.ppt");
300 assertEquals(3, metadataList.size());
301 String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
302 //this tests that we're ignoring text shapes at depth=0
303 //i.e. POI has already included them in the slide's getTextParagraphs()
304 assertContainsCount("Text box1", content, 1);
305
306
307 //the WordArt and text box count tests will fail
308 //if this content is available via getTextParagraphs() of the slide in POI
309 //i.e. when POI is fixed, these tests will fail, and
310 //we'll have to remove the workaround in HSLFExtractor's extractGroupText(...)
311 assertContainsCount("WordArt1", content, 1);
312 assertContainsCount("WordArt2", content, 1);
313 assertContainsCount("Ungrouped text box", content, 1);//should only be 1
314 assertContains("Text box2", content);
315 assertContains("Text box3", content);
316 assertContains("Text box4", content);
317 assertContains("Text box5", content);
318
319 //see below -- need to extract hyperlinks
320 assertContains("tika", content);
321 assertContains("MyTitle", content);
322
323 assertEquals("/embedded-1",
324 metadataList.get(1).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
325
326 assertEquals("/embedded-2",
327 metadataList.get(2).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
328
329 }
330
331 @Ignore("until we add smart text extraction")
332 @Test
333 public void testSmartArtText() throws Exception {
334 String content = getXML("testPPT_groups.ppt").xml;
335 assertContains("smart1", content);
336 }
337
338 @Ignore("until we fix hyperlink extraction from text boxes")
339 @Test
340 public void testHyperlinksInTextBoxes() throws Exception {
341 String content = getXML("testPPT_groups.ppt").xml;
342 assertContains("href=\"http://tika.apache.org", content);
343 }
344
345 @Test
346 public void testEmbeddedXLSInOLEObject() throws Exception {
347 List<Metadata> metadataList = getRecursiveMetadata("testPPT_oleWorkbook.ppt");
348 assertEquals(3, metadataList.size());
349 Metadata xlsx = metadataList.get(1);
350 assertContains("<h1>Sheet1</h1>", xlsx.get(RecursiveParserWrapper.TIKA_CONTENT));
351 assertContains("<td>1</td>", xlsx.get(RecursiveParserWrapper.TIKA_CONTENT));
352 assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
353 xlsx.get(Metadata.CONTENT_TYPE));
354
355 }
295356 }
17401740 }
17411741
17421742 @Test
1743 public void testPPTXGroups() throws Exception {
1744 List<Metadata> metadataList = getRecursiveMetadata("testPPT_groups.pptx");
1745 assertEquals(3, metadataList.size());
1746 String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
1747 assertContains("WordArt1", content);
1748 assertContains("WordArt2", content);
1749 assertContainsCount("Ungrouped text box", content, 1);//should only be 1
1750 assertContains("Text box1", content);
1751 assertContains("Text box2", content);
1752 assertContains("Text box3", content);
1753 assertContains("Text box4", content);
1754 assertContains("Text box5", content);
1755
1756
1757 assertContains("href=\"http://tika.apache.org", content);
1758 assertContains("smart1", content);
1759 assertContains("MyTitle", content);
1760
1761 assertEquals("/image1.jpg",
1762 metadataList.get(1).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
1763
1764 assertEquals("/thumbnail.jpeg",
1765 metadataList.get(2).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
1766 }
1767
1768 @Test
17431769 public void testXLSXPhoneticStrings() throws Exception {
17441770 //This unit test and test file come from Apache POI 51519.xlsx
17451771
17891815 assertEquals("image/png", metadataList.get(2).get(Metadata.CONTENT_TYPE));
17901816 assertEquals("image/jpeg", metadataList.get(3).get(Metadata.CONTENT_TYPE));
17911817 }
1818
1819 @Test
1820 public void testEmbeddedXLSInOLEObject() throws Exception {
1821 List<Metadata> metadataList = getRecursiveMetadata("testPPT_oleWorkbook.pptx");
1822 assertEquals(4, metadataList.size());
1823 Metadata xlsx = metadataList.get(2);
1824 assertContains("<h1>Sheet1</h1>", xlsx.get(RecursiveParserWrapper.TIKA_CONTENT));
1825 assertContains("<td>1</td>", xlsx.get(RecursiveParserWrapper.TIKA_CONTENT));
1826 assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
1827 xlsx.get(Metadata.CONTENT_TYPE));
1828 }
17921829 }
17931830
17941831
590590 assertEquals("image/jpeg", metadataList.get(3).get(Metadata.CONTENT_TYPE));
591591
592592 }
593
594 @Test
595 public void testPPTXGroups() throws Exception {
596 List<Metadata> metadataList = getRecursiveMetadata("testPPT_groups.pptx", parseContext);
597 assertEquals(3, metadataList.size());
598 String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
599 assertContains("WordArt1", content);
600 assertContains("WordArt2", content);
601 assertContainsCount("Ungrouped text box", content, 1);//should only be 1
602 assertContains("Text box1", content);
603 assertContains("Text box2", content);
604 assertContains("Text box3", content);
605 assertContains("Text box4", content);
606 assertContains("Text box5", content);
607
608
609 assertContains("href=\"http://tika.apache.org", content);
610 assertContains("smart1", content);
611 assertContains("MyTitle", content);
612
613 assertEquals("/image1.jpg",
614 metadataList.get(1).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
615
616 assertEquals("/thumbnail.jpeg",
617 metadataList.get(2).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
618 }
619
593620 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.microsoft.ooxml.xps;
17
18 import org.apache.tika.TikaTest;
19 import org.apache.tika.metadata.Metadata;
20 import org.apache.tika.metadata.TikaCoreProperties;
21 import org.apache.tika.parser.RecursiveParserWrapper;
22 import org.junit.Test;
23
24 import java.util.List;
25
26 import static org.junit.Assert.assertEquals;
27
28 public class XPSParserTest extends TikaTest {
29
30 @Test
31 public void testBasic() throws Exception {
32 List<Metadata> metadataList = getRecursiveMetadata("testPPT.xps");
33 assertEquals(2, metadataList.size());
34
35 //metadata
36 assertEquals("Rajiv", metadataList.get(0).get(TikaCoreProperties.CREATOR));
37 assertEquals("2010-06-29T12:06:31Z", metadataList.get(0).get(TikaCoreProperties.CREATED));
38 assertEquals("2010-06-29T12:06:31Z", metadataList.get(0).get(TikaCoreProperties.MODIFIED));
39 assertEquals("Attachment Test", metadataList.get(0).get(TikaCoreProperties.TITLE));
40
41 String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
42 assertContains("<p>Attachment Test</p>", content);
43 assertContains("<div class=\"canvas\"><p>Different", content);
44
45 //I'd want this to be "tika content", but copy+paste in Windows yields tikacontent
46 assertContains("tikacontent", content);
47
48
49 assertEquals("image/jpeg", metadataList.get(1).get(Metadata.CONTENT_TYPE));
50 }
51
52 @Test
53 public void testVarious() throws Exception {
54 List<Metadata> metadataList = getRecursiveMetadata("testXPS_various.xps");
55 //confirm embedded images and thumbnails were extracted
56 assertEquals(4, metadataList.size());
57
58 //now check for content in the right order
59 String quickBrownFox = "\u0644\u062B\u0639\u0644\u0628\u0020" +
60 "\u0627\u0644\u0628\u0646\u064A\u0020" +
61 "\u0627\u0644\u0633\u0631\u064A\u0639";
62
63 String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
64 assertContains(quickBrownFox, content);
65
66 assertContains("The \u0627\u0644\u0628\u0646\u064A fox", content);
67
68 assertContains("\u0644\u062B\u0639\u0644\u0628 brown \u0627\u0644\u0633\u0631\u064A\u0639",
69 content);
70
71 //make sure the urls come through
72 assertContains("<a href=\"http://tika.apache.org/\">http://tika.apache.org/</a>",
73 content);
74
75 Metadata metadata = metadataList.get(0);
76 assertEquals("Allison, Timothy B.", metadata.get(TikaCoreProperties.CREATOR));
77 assertEquals("2017-12-12T11:15:38Z", metadata.get(TikaCoreProperties.CREATED));
78 assertEquals("2017-12-12T11:15:38Z", metadata.get(TikaCoreProperties.MODIFIED));
79
80
81 assertEquals("image/png", metadataList.get(1).get(Metadata.CONTENT_TYPE));
82
83 Metadata inlineJpeg = metadataList.get(2);
84 assertEquals("image/jpeg", inlineJpeg.get(Metadata.CONTENT_TYPE));
85 assertContains("INetCache", inlineJpeg.get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
86 assertEquals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString(),
87 inlineJpeg.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
88
89 assertEquals("image/jpeg", metadataList.get(3).get(Metadata.CONTENT_TYPE));
90 // assertEquals(TikaCoreProperties.EmbeddedResourceType.THUMBNAIL.toString(),
91 // inlineJpeg.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
92
93
94 }
95
96 }
170170 config.setResize(1000);
171171 }
172172
173 @Test(expected=IllegalArgumentException.class)
174 public void testDataPathCheck() {
175 TesseractOCRConfig config = new TesseractOCRConfig();
176 config.setTessdataPath("blah\u0000deblah");
177 }
178
179 @Test(expected=IllegalArgumentException.class)
180 public void testPathCheck() {
181 TesseractOCRConfig config = new TesseractOCRConfig();
182 config.setTesseractPath("blah\u0000deblah");
183 }
184
185 @Test(expected=IllegalArgumentException.class)
186 public void testBadOtherKey() {
187 TesseractOCRConfig config = new TesseractOCRConfig();
188 config.addOtherTesseractConfig("bad bad", "bad");
189
190 }
191
192 @Test(expected=IllegalArgumentException.class)
193 public void testBadOtherValue() {
194 TesseractOCRConfig config = new TesseractOCRConfig();
195 config.addOtherTesseractConfig("bad", "bad bad");
196 }
197
198 @Test(expected=IllegalArgumentException.class)
199 public void testBadOtherValueSlash() {
200 TesseractOCRConfig config = new TesseractOCRConfig();
201 config.addOtherTesseractConfig("bad", "bad\\bad");
202 }
203
204 @Test(expected=IllegalArgumentException.class)
205 public void testBadOtherValueControl() {
206 TesseractOCRConfig config = new TesseractOCRConfig();
207 config.addOtherTesseractConfig("bad", "bad\u0001bad");
208 }
209
210 @Test
211 public void testGoodOtherParameters() {
212 TesseractOCRConfig config = new TesseractOCRConfig();
213 config.addOtherTesseractConfig("good", "good");
214 }
215
216 @Test
217 public void testBogusPathCheck() {
218 //allow path that doesn't actually exist
219 TesseractOCRConfig config = new TesseractOCRConfig();
220 config.setTesseractPath("blahdeblahblah");
221 assertEquals("blahdeblahblah"+File.separator, config.getTesseractPath());
222 }
223
224 @Test
225 public void testTrailingSlashInPathBehavior() {
226
227 TesseractOCRConfig config = new TesseractOCRConfig();
228 config.setTesseractPath("blah");
229 assertEquals("blah"+File.separator, config.getTesseractPath());
230 config.setTesseractPath("blah"+File.separator);
231 assertEquals("blah"+File.separator, config.getTesseractPath());
232 config.setTesseractPath("");
233 assertEquals("", config.getTesseractPath());
234
235 config.setTessdataPath("blahdata");
236 assertEquals("blahdata"+File.separator, config.getTessdataPath());
237 config.setTessdataPath("blahdata"+File.separator);
238 assertEquals("blahdata"+File.separator, config.getTessdataPath());
239 config.setTessdataPath("");
240 assertEquals("", config.getTessdataPath());
241
242 config.setImageMagickPath("imagemagickpath");
243 assertEquals("imagemagickpath"+File.separator, config.getImageMagickPath());
244 config.setImageMagickPath("imagemagickpath"+File.separator);
245 assertEquals("imagemagickpath"+File.separator, config.getImageMagickPath());
246 config.setImageMagickPath("");
247 assertEquals("", config.getImageMagickPath());
248 }
249
250 @Test(expected=IllegalArgumentException.class)
251 public void testBadColorSpace() {
252 TesseractOCRConfig config = new TesseractOCRConfig();
253 config.setColorspace("someth!ng");
254 }
173255 }
13691369 assertFalse(path + " should have thrown exception", noEx);
13701370 }
13711371
1372 @Test
1373 public void testLanguageMetadata() throws Exception {
1374 assertEquals("de-CH", getXML("testPDF-custommetadata.pdf")
1375 .metadata.get(TikaCoreProperties.LANGUAGE));
1376 assertEquals("zh-CN", getXML("testPDFFileEmbInAnnotation.pdf")
1377 .metadata.get(TikaCoreProperties.LANGUAGE));
1378 }
1379
13721380 /**
13731381 * Simple class to count end of document events. If functionality is useful,
13741382 * move to org.apache.tika in src/test
2020 import static org.junit.Assert.assertEquals;
2121 import static org.junit.Assert.fail;
2222
23 import java.io.BufferedWriter;
24 import java.io.OutputStreamWriter;
25 import java.io.Writer;
26 import java.nio.charset.StandardCharsets;
27 import java.nio.file.Files;
28 import java.nio.file.Path;
29 import java.nio.file.Paths;
30 import java.nio.file.StandardOpenOption;
2331 import java.util.HashSet;
32 import java.util.List;
2433 import java.util.Set;
2534
2635 import org.apache.commons.compress.compressors.CompressorStreamFactory;
2736 import org.apache.tika.TikaTest;
2837 import org.apache.tika.metadata.Metadata;
38 import org.apache.tika.metadata.TikaCoreProperties;
2939 import org.apache.tika.mime.MediaType;
3040 import org.apache.tika.parser.ParseContext;
41 import org.apache.tika.parser.RecursiveParserWrapper;
3142 import org.junit.BeforeClass;
3243 import org.junit.Test;
3344
3849
3950 @BeforeClass
4051 public static void setUp() {
41 NOT_COVERED.add(MediaType.application("x-brotli"));
4252 NOT_COVERED.add(MediaType.application("x-lz4-block"));
4353 NOT_COVERED.add(MediaType.application("x-snappy-raw"));
54 NOT_COVERED.add(MediaType.application("deflate64"));
4455 }
4556
4657 @Test
5768 //xml parser throws an exception for test1.xml
5869 //for now, be content that the container file is correctly identified
5970 assertContains("test1.xml", r.xml);
71 }
72
73 @Test
74 public void testZstd() throws Exception {
75 XMLResult r = getXML("testZSTD.zstd");
76 assertContains("0123456789", r.xml);
77 }
78
79 @Test
80 public void testBrotli() throws Exception {
81 Metadata metadata = new Metadata();
82 metadata.set(Metadata.RESOURCE_NAME_KEY, "testBROTLI_compressed.br");
83 List<Metadata> metadataList = getRecursiveMetadata("testBROTLI_compressed.br", metadata);
84
85 assertContains("XXXXXXXXXXYYYYYYYYYY", metadataList.get(1).get(RecursiveParserWrapper.TIKA_CONTENT));
86 assertEquals("testBROTLI_compressed", metadataList.get(1).get(Metadata.RESOURCE_NAME_KEY));
6087 }
6188
6289 @Test
140140 }
141141
142142 assertTrue("test no password", ex);
143
144 // No password, will fail with EncryptedDocumentException
145 ex = false;
146 try (InputStream stream = Seven7ParserTest.class.getResourceAsStream(
147 "/test-documents/full_encrypted.7z")) {
148 parser.parse(stream, handler, metadata, recursingContext);
149 fail("Shouldn't be able to read a full password protected 7z without the password");
150 } catch (EncryptedDocumentException e) {
151 // Good
152 ex = true;
153 } catch (Exception e){
154 ex = false;
155 }
156
157 assertTrue("test no password for full encrypted 7z", ex);
143158
144159 ex = false;
145160
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.parser.pkg;
18
19
20 import org.apache.commons.compress.compressors.CompressorStreamFactory;
21 import org.apache.tika.TikaTest;
22 import org.apache.tika.io.TikaInputStream;
23 import org.apache.tika.metadata.Metadata;
24 import org.apache.tika.mime.MediaType;
25 import org.apache.tika.parser.ParseContext;
26 import org.junit.BeforeClass;
27 import org.junit.Test;
28
29 import java.io.InputStream;
30 import java.util.HashSet;
31 import java.util.Set;
32
33 import static org.junit.Assert.assertEquals;
34 import static org.junit.Assert.fail;
35
36 public class ZipContainerDetectorTest extends TikaTest {
37
38 @Test
39 public void testTiffWorkaround() throws Exception {
40 //TIKA-2591
41 ZipContainerDetector zipContainerDetector = new ZipContainerDetector();
42 Metadata metadata = new Metadata();
43 try (InputStream is = TikaInputStream.get(getResourceAsStream("/test-documents/testTIFF.tif"))) {
44 MediaType mt = zipContainerDetector.detect(is, metadata);
45 assertEquals(MediaType.image("tiff"), mt);
46 }
47 metadata = new Metadata();
48 try (InputStream is = TikaInputStream.get(getResourceAsStream("/test-documents/testTIFF_multipage.tif"))) {
49 MediaType mt = zipContainerDetector.detect(is, metadata);
50 assertEquals(MediaType.image("tiff"), mt);
51 }
52
53 }
54 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.parser.utils;
18
19 import org.apache.tika.TikaTest;
20 import org.apache.tika.io.IOUtils;
21 import org.apache.tika.mime.MediaType;
22 import org.junit.Test;
23
24 import java.io.ByteArrayOutputStream;
25 import java.io.InputStream;
26 import java.nio.charset.Charset;
27 import java.nio.charset.StandardCharsets;
28
29 import static org.junit.Assert.assertEquals;
30 import static org.junit.Assert.assertFalse;
31 import static org.junit.Assert.assertNull;
32 import static org.junit.Assert.assertTrue;
33
34 public class DataURISchemeParserTest extends TikaTest {
35 DataURISchemeUtil dataURISchemeUtil = new DataURISchemeUtil();
36
37 @Test
38 public void testEmpty() throws Exception {
39 DataURIScheme dataURIScheme = dataURISchemeUtil.parse("data:,");
40 assertFalse(dataURIScheme.isBase64());
41 assertNull(dataURIScheme.getMediaType());
42 assertEquals(-1, dataURIScheme.getInputStream().read());
43 }
44
45 @Test
46 public void testNewlines() throws Exception {
47 String data = "data:image/png;base64,R0lG\nODdh";
48 DataURIScheme dataURIScheme = dataURISchemeUtil.parse(data);
49 assertTrue(dataURIScheme.isBase64());
50 assertEquals(MediaType.image("png"), dataURIScheme.getMediaType());
51
52 String expected = "data:image/png;base64,R0lGODdh";
53 assertEquals(dataURISchemeUtil.parse(expected), dataURISchemeUtil.parse(data));
54
55 }
56
57 @Test
58 public void testBackslashNewlines() throws Exception {
59 //like you'd have in a css fragment
60 String data = "data:image/png;base64,R0lG\\\nODdh";
61 DataURIScheme dataURIScheme = dataURISchemeUtil.parse(data);
62 assertTrue(dataURIScheme.isBase64());
63 assertEquals(MediaType.image("png"), dataURIScheme.getMediaType());
64
65 String expected = "data:image/png;base64,R0lGODdh";
66 assertEquals(dataURISchemeUtil.parse(expected), dataURISchemeUtil.parse(data));
67 }
68
69 @Test
70 public void testUTF8() throws Exception {
71 String utf8 = "\u0628\u0631\u0646\u0633\u062A\u0648\u0646";
72 String data = "data:text/plain;charset=UTF-8;page=21,the%20data:"+utf8;
73 DataURIScheme dataURIScheme = dataURISchemeUtil.parse(data);
74 ByteArrayOutputStream bos = new ByteArrayOutputStream();
75 IOUtils.copy(dataURIScheme.getInputStream(), bos);
76 assertContains(utf8, new String(bos.toByteArray(), StandardCharsets.UTF_8));
77 }
78 }
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.17</version>
27 <version>1.18</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030
5252 <dependency>
5353 <groupId>com.google.code.gson</groupId>
5454 <artifactId>gson</artifactId>
55 <version>2.8.1</version>
55 <version>${gson.version}</version>
5656 </dependency>
5757
5858 <!-- Test dependencies -->
1515 FROM ubuntu:latest
1616 MAINTAINER Apache Tika Team
1717
18 ENV TIKA_VERSION 1.7
19 ENV TIKA_SERVER_URL https://www.apache.org/dist/tika/tika-server-$TIKA_VERSION.jar
20
2118 RUN apt-get update \
22 && apt-get install openjdk-7-jre-headless curl gdal-bin tesseract-ocr \
23 tesseract-ocr-eng tesseract-ocr-ita tesseract-ocr-fra tesseract-ocr-spa tesseract-ocr-deu -y \
24 && curl -sSL https://people.apache.org/keys/group/tika.asc -o /tmp/tika.asc \
25 && gpg --import /tmp/tika.asc \
26 && curl -sSL "$TIKA_SERVER_URL.asc" -o /tmp/tika-server-${TIKA_VERSION}.jar.asc \
27 && NEAREST_TIKA_SERVER_URL=$(curl -sSL http://www.apache.org/dyn/closer.cgi/${TIKA_SERVER_URL#https://www.apache.org/dist/}\?asjson\=1 \
28 | awk '/"path_info": / { pi=$2; }; /"preferred":/ { pref=$2; }; END { print pref " " pi; };' \
29 | sed -r -e 's/^"//; s/",$//; s/" "//') \
30 && echo "Nearest mirror: $NEAREST_TIKA_SERVER_URL" \
31 && curl -sSL "$NEAREST_TIKA_SERVER_URL" -o /tika-server-${TIKA_VERSION}.jar \
32 && gpg --verify /tmp/tika-server-${TIKA_VERSION}.jar.asc /tika-server-${TIKA_VERSION}.jar \
19 && apt-get install openjdk-8-jre-headless curl gdal-bin tesseract-ocr \
20 tesseract-ocr-eng tesseract-ocr-ita tesseract-ocr-fra tesseract-ocr-spa tesseract-ocr-deu -y \
3321 && apt-get clean -y && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
3422
23 ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64
24 RUN export JAVA_HOME
25
26 ARG JAR_FILE
27 ADD target/${JAR_FILE} /tika-server.jar
28
3529 EXPOSE 9998
36 ENTRYPOINT java -jar /tika-server-${TIKA_VERSION}.jar -h 0.0.0.0
30 ENTRYPOINT java -jar /tika-server.jar -h 0.0.0.0
31
1313 -s,--includeStack whether or not to return a stack trace
1414 if there is an exception during 'parse'
1515 ```
16 Running via Docker
17 ------------------
18 Assuming you have Docker installed, you can build your own local image using:
19
20 `mvn dockerfile:build`
21
22 The image will be named apache/tika-server with the tag being the version being built.
23 For example, building Apache Tika Server 1.17 will result in an image of `apache/tika-server:1.17`
24
25 You can then run this image by executing the following, replacing `1.17` with your build version:
26
27 `docker run -d -p 9998:9998 apache/tika-server:1.17`
28
29 This will load Apache Tika Server and expose its interface on:
30
31 `http://localhost:9998`
1632
1733 Usage
1834 -----
1919 <parent>
2020 <groupId>org.apache.tika</groupId>
2121 <artifactId>tika-parent</artifactId>
22 <version>1.17</version>
22 <version>1.18</version>
2323 <relativePath>../tika-parent/pom.xml</relativePath>
2424 </parent>
2525
258258 </configuration>
259259 </execution>
260260 </executions>
261 </plugin>
262 <plugin>
263 <groupId>com.spotify</groupId>
264 <artifactId>dockerfile-maven-plugin</artifactId>
265 <version>1.3.7</version>
266 <configuration>
267 <repository>apache/tika-server</repository>
268 <tag>${project.version}</tag>
269 <buildArgs>
270 <JAR_FILE>tika-server-${project.version}.jar</JAR_FILE>
271 </buildArgs>
272 </configuration>
261273 </plugin>
262274 <plugin>
263275 <groupId>org.apache.maven.plugins</groupId>
4848 import java.util.Locale;
4949 import java.util.Map;
5050 import java.util.Set;
51 import java.util.regex.Matcher;
52 import java.util.regex.Pattern;
5153
5254 import org.apache.commons.lang.StringUtils;
5355 import org.apache.cxf.jaxrs.ext.multipart.Attachment;
8183
8284 @Path("/tika")
8385 public class TikaResource {
86
87 private static Pattern ALLOWABLE_HEADER_CHARS = Pattern.compile("(?i)^[-/_\\.A-Z0-9 ]+$");
88
8489 public static final String GREETING = "This is Tika Server (" + new Tika().toString() + "). Please PUT\n";
8590 public static final String X_TIKA_OCR_HEADER_PREFIX = "X-Tika-OCR";
8691 public static final String X_TIKA_PDF_HEADER_PREFIX = "X-Tika-PDF";
189194 * @throws WebApplicationException thrown when field cannot be found.
190195 */
191196 private static void processHeaderConfig(MultivaluedMap<String, String> httpHeaders, Object object, String key, String prefix) {
192 try {
193 String property = StringUtils.removeStart(key, prefix);
194 Field field = object.getClass().getDeclaredField(StringUtils.uncapitalize(property));
195
196 field.setAccessible(true);
197 if (field.getType() == String.class) {
198 field.set(object, httpHeaders.getFirst(key));
199 } else if (field.getType() == int.class) {
200 field.setInt(object, Integer.parseInt(httpHeaders.getFirst(key)));
201 } else if (field.getType() == double.class) {
202 field.setDouble(object, Double.parseDouble(httpHeaders.getFirst(key)));
203 } else if (field.getType() == boolean.class) {
204 field.setBoolean(object, Boolean.parseBoolean(httpHeaders.getFirst(key)));
197
198 try {String property = StringUtils.removeStart(key, prefix);
199 Field field = null;
200 try {
201 field = object.getClass().getDeclaredField(StringUtils.uncapitalize(property));
202 } catch (NoSuchFieldException e) {
203 //swallow
204 }
205 String setter = property;
206 setter = "set"+setter.substring(0,1).toUpperCase(Locale.US)+setter.substring(1);
207 //default assume string class
208 //if there's a more specific type, e.g. double, int, boolean
209 //try that.
210 Class clazz = String.class;
211 if (field != null) {
212 if (field.getType() == int.class || field.getType() == Integer.class) {
213 clazz = int.class;
214 } else if (field.getType() == double.class) {
215 clazz = double.class;
216 } else if (field.getType() == Double.class) {
217 clazz = Double.class;
218 } else if (field.getType() == float.class) {
219 clazz = float.class;
220 } else if (field.getType() == Float.class) {
221 clazz = Float.class;
222 } else if (field.getType() == boolean.class) {
223 clazz = boolean.class;
224 } else if (field.getType() == Boolean.class) {
225 clazz = Boolean.class;
226 }
227 }
228
229 Method m = tryToGetMethod(object, setter, clazz);
230 //if you couldn't find more specific setter, back off
231 //to string setter and try that.
232 if (m == null && clazz != String.class) {
233 m = tryToGetMethod(object, setter, String.class);
234 }
235
236 if (m != null) {
237 String val = httpHeaders.getFirst(key);
238 val = val.trim();
239 if (clazz == String.class) {
240 checkTrustWorthy(setter, val);
241 m.invoke(object, val);
242 } else if (clazz == int.class || clazz == Integer.class) {
243 m.invoke(object, Integer.parseInt(val));
244 } else if (clazz == double.class || clazz == Double.class) {
245 m.invoke(object, Double.parseDouble(val));
246 } else if (clazz == boolean.class || clazz == Boolean.class) {
247 m.invoke(object, Boolean.parseBoolean(val));
248 } else if (clazz == float.class || clazz == Float.class) {
249 m.invoke(object, Float.parseFloat(val));
250 } else {
251 throw new IllegalArgumentException("setter must be String, int, float, double or boolean...for now");
252 }
205253 } else {
206 //couldn't find a directly accessible field
207 //try for setX(String s)
208 String setter = StringUtils.uncapitalize(property);
209 setter = "set"+setter.substring(0,1).toUpperCase(Locale.US)+setter.substring(1);
210 Method m = null;
211 try {
212 m = object.getClass().getMethod(setter, String.class);
213 } catch (NoSuchMethodException e) {
214 //swallow
215 }
216 if (m != null) {
217 m.invoke(object, httpHeaders.getFirst(key));
218 }
219 }
254 throw new NoSuchMethodException("Couldn't find: "+setter);
255 }
256
220257 } catch (Throwable ex) {
221258 throw new WebApplicationException(String.format(Locale.ROOT,
222259 "%s is an invalid %s header", key, X_TIKA_OCR_HEADER_PREFIX));
223260 }
261 }
262
263 private static void checkTrustWorthy(String setter, String val) {
264 if (setter == null || val == null) {
265 throw new IllegalArgumentException("setter and val must not be null");
266 }
267 if (setter.toLowerCase(Locale.US).contains("trusted")) {
268 throw new IllegalArgumentException("Can't call a trusted method via tika-server headers");
269 }
270 Matcher m = ALLOWABLE_HEADER_CHARS.matcher(val);
271 if (! m.find()) {
272 throw new IllegalArgumentException("Header val: "+val +" contains illegal characters. " +
273 "Must contain: TikaResource.ALLOWABLE_HEADER_CHARS");
274 }
275 }
276
277 /**
278 * Tries to get method. Silently swallows NoMethodException and returns
279 * <code>null</code> if not found.
280 * @param object
281 * @param method
282 * @param clazz
283 * @return
284 */
285 private static Method tryToGetMethod(Object object, String method, Class clazz) {
286 try {
287 return object.getClass().getMethod(method, clazz);
288 } catch (NoSuchMethodException e) {
289 //swallow
290 }
291 return null;
224292 }
225293
226294 @SuppressWarnings("serial")
1616
1717 package org.apache.tika.server;
1818
19 import static org.junit.Assert.assertEquals;
20 import static org.junit.Assert.assertFalse;
21 import static org.junit.Assert.assertTrue;
22
23 import javax.ws.rs.core.Response;
24 import java.io.InputStream;
25 import java.util.ArrayList;
26 import java.util.List;
27
2819 import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
2920 import org.apache.cxf.jaxrs.client.WebClient;
3021 import org.apache.cxf.jaxrs.ext.multipart.Attachment;
3425 import org.apache.tika.server.resource.TikaResource;
3526 import org.junit.Test;
3627
28 import javax.ws.rs.core.Response;
29 import java.io.InputStream;
30 import java.util.ArrayList;
31 import java.util.List;
32
33 import static org.junit.Assert.assertEquals;
34 import static org.junit.Assert.assertFalse;
35 import static org.junit.Assert.assertTrue;
36
3737 public class TikaResourceTest extends CXFTestBase {
3838 public static final String TEST_DOC = "test.doc";
3939 public static final String TEST_PASSWORD_PROTECTED = "password.xls";
278278 responseMsg
279279 );
280280 }
281
282 @Test
283 public void testDataIntegrityCheck() throws Exception {
284 Response response = WebClient.create(endPoint + TIKA_PATH)
285 .type("application/pdf")
286 .accept("text/plain")
287 .header(TikaResource.X_TIKA_OCR_HEADER_PREFIX +
288 "tesseractPath",
289
290 "C://tmp//hello.bat\u0000")
291 .put(ClassLoader.getSystemResourceAsStream("testOCR.pdf"));
292 assertEquals(500, response.getStatus());
293
294 response = WebClient.create(endPoint + TIKA_PATH)
295 .type("application/pdf")
296 .accept("text/plain")
297 .header(TikaResource.X_TIKA_OCR_HEADER_PREFIX +
298 "tesseractPath",
299 "bogus path")
300 .put(ClassLoader.getSystemResourceAsStream("testOCR.pdf"));
301 assertEquals(200, response.getStatus());
302 }
303
304 @Test
305 public void testTrustedMethodPrevention() {
306 Response response = WebClient.create(endPoint + TIKA_PATH)
307 .type("application/pdf")
308 .accept("text/plain")
309 .header(TikaResource.X_TIKA_OCR_HEADER_PREFIX +
310 "trustedPageSeparator",
311 "\u0010")
312 .put(ClassLoader.getSystemResourceAsStream("testOCR.pdf"));
313 assertEquals(500, response.getStatus());
314
315 }
316
317 @Test
318 public void testFloatInHeader() {
319 Response response = WebClient.create(endPoint + TIKA_PATH)
320 .type("application/pdf")
321 .accept("text/plain")
322 .header(TikaResource.X_TIKA_PDF_HEADER_PREFIX +
323 "averageCharTolerance",
324 "2.0")
325 .put(ClassLoader.getSystemResourceAsStream("testOCR.pdf"));
326 assertEquals(200, response.getStatus());
327
328 }
281329 }
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.17</version>
27 <version>1.18</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030
4949 <artifactId>microsoft-translator-java-api</artifactId>
5050 <version>0.6.2</version>
5151 <type>jar</type>
52 <exclusions>
53 <exclusion>
54 <groupId>com.googlecode.json-simple</groupId>
55 <artifactId>json-simple</artifactId>
56 </exclusion>
57 </exclusions>
58 </dependency>
59 <dependency>
60 <groupId>com.googlecode.json-simple</groupId>
61 <artifactId>json-simple</artifactId>
62 <version>1.1.1</version>
5263 </dependency>
5364 <dependency>
5465 <groupId>org.apache.cxf</groupId>
5566 <artifactId>cxf-rt-frontend-jaxrs</artifactId>
5667 <version>${cxf.version}</version>
68 <exclusions>
69 <!-- exclude because, as of 2.9.5, jaxb-annotations
70 is bringing in 2.9.0 of core's annotations
71 -->
72 <exclusion>
73 <groupId>com.fasterxml.jackson.core</groupId>
74 <artifactId>jackson-annotations</artifactId>
75 </exclusion>
76 </exclusions>
5777 </dependency>
5878 <dependency>
5979 <groupId>com.fasterxml.jackson.jaxrs</groupId>
6080 <artifactId>jackson-jaxrs-json-provider</artifactId>
61 <version>2.9.2</version>
81 <version>${jackson.version}</version>
82 <exclusions>
83 <!-- exclude because, as of 2.9.5, jaxrs-json-provider
84 is bringing in 2.9.0 of core's annotations
85 -->
86 <exclusion>
87 <groupId>com.fasterxml.jackson.core</groupId>
88 <artifactId>jackson-annotations</artifactId>
89 </exclusion>
90 </exclusions>
91 </dependency>
92 <dependency>
93 <groupId>com.fasterxml.jackson.core</groupId>
94 <artifactId>jackson-annotations</artifactId>
95 <version>${jackson.version}</version>
6296 </dependency>
6397
6498 <!-- Test dependencies -->
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.17</version>
27 <version>1.18</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030