tika / 5a028f8
New upstream version 1.19 (Emmanuel Bourg, 5 years ago)
226 changed files with 10773 additions and 2296 deletions.
0 Release 1.19 - 9/14/2018
1
2 * Require Java 8 (TIKA-2679).
3
4 * Enable building with Java 11 (TIKA-2668).
5
6 * Add an option to make tika-server robust against infinite loops,
7 OOMs, and memory leaks (TIKA-2725).
8
9 * Allow configuration of the Tesseract parser via the standard
10 tika-config.xml options (TIKA-2705).
11
12 * Improve handling of empty cells across table-based
13 formats (TIKA-2479).
14
15 * Add a standards-compliant HTML encoding detector
16 via Gerard Bouchar (TIKA-2673).
17
18 * Improved XML parsing -- limited default entity expansions to 20.
19 To raise this limit, add -Djdk.xml.entityExpansionLimit=XXX to
20 your command line.
21
22 * Mime magic improvements for Olympus RAW (TIKA-2658), interpreted
23 server-side languages via HTTP (TIKA-2648), and MHTML (TIKA-2723).
24
25 * Add an absolute timeout to the ForkParser rather than testing
26 for activity (TIKA-2656).
27
28 * Make the RecursiveParserWrapper work with the ForkParser (TIKA-2655).
29
30 * Allow the ForkParser to specify a directory containing tika-app.jar
31 for use by the ForkServer. This allows users to keep most of the
32 parser dependencies out of their code, and it allows for an easy
33 addition of optional jars for Parser dependencies,
34 such as the xerial sqlite jar (TIKA-2653).
35
36 * Use a pool for SAXParsers and DOMBuilders rather than creating
37 a new parser/builder for every parse.
38 For better performance, set XMLReaderUtils.setPoolSize() to the
39 number of threads you're using with Tika (TIKA-2645); see the sketch after this list.
40
41 * Add the RecursiveParserWrapperHandler to improve the RecursiveParserWrapper
42 API slightly (TIKA-2644); a usage sketch follows this list.
43
44 * Upgraded to Commons-Compress 1.18 (TIKA-2707).
45
46 * Upgraded to Apache POI 4.0.0 (TIKA-2552).
47
48 * Upgraded to Apache PDFBox 2.0.11 (TIKA-2681).
49
50 * Upgraded to deeplearning4j 1.0.0-beta2 (TIKA-2672).
51
52 * Upgraded jmatio to 1.4 (TIKA-2667).
53
54 * Upgraded Apache Lucene to 7.4.0 in tika-eval and tika-examples (TIKA-2695).
55
56 * Upgraded junrar to 1.0.1 (TIKA-2664).
57
58 * Numerous other upgrades (TIKA-2692).
59
60 * Excluded Spring as a transitive dependency (TIKA-2721).
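A minimal sketch of the pool tuning from the TIKA-2645 entry above. XMLReaderUtils.setPoolSize() is the static hook named in that entry (the XmlRootExtractor hunk below switches to the pooled XMLReaderUtils.parseSAX()); the pool size of 10 is an arbitrary example value:

    import org.apache.tika.exception.TikaException;
    import org.apache.tika.utils.XMLReaderUtils;

    public class PoolSizeConfig {
        public static void main(String[] args) throws TikaException {
            // Size the shared SAXParser/DocumentBuilder pool to match the
            // number of threads that will call into Tika concurrently.
            XMLReaderUtils.setPoolSize(10);
        }
    }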
61
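And a sketch of the reworked RecursiveParserWrapper flow from the TIKA-2644 entry, mirroring the pattern the TikaCLI and TikaGUI hunks below adopt; example.docx is a placeholder input:

    import java.io.InputStream;
    import java.io.StringWriter;
    import java.nio.file.Paths;
    import java.util.List;

    import org.apache.tika.io.TikaInputStream;
    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.metadata.serialization.JsonMetadataList;
    import org.apache.tika.parser.AutoDetectParser;
    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.parser.RecursiveParserWrapper;
    import org.apache.tika.sax.BasicContentHandlerFactory;
    import org.apache.tika.sax.RecursiveParserWrapperHandler;

    public class RecursiveParseExample {
        public static void main(String[] args) throws Exception {
            // The wrapper now takes only the parser; metadata collection has
            // moved from the wrapper into the handler.
            RecursiveParserWrapper wrapper = new RecursiveParserWrapper(new AutoDetectParser());
            RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
                    new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
                    -1); // -1: no limit on the number of embedded documents
            Metadata metadata = new Metadata();
            try (InputStream stream = TikaInputStream.get(Paths.get("example.docx"))) {
                wrapper.parse(stream, handler, metadata, new ParseContext());
            }
            List<Metadata> metadataList = handler.getMetadataList();
            StringWriter writer = new StringWriter();
            JsonMetadataList.toJson(metadataList, writer);
            System.out.println(writer);
        }
    }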
062 Release 1.18 - 4/20/2018
163
2 * Upgrade Jackson to 2.9.5 (TIKA-2634).
64 * Upgrade jackson to 2.9.5 (TIKA-2634).
365
466 * Add support for brotli (TIKA-2621).
567
57119 * Fixed bug where TesseractOCRParser ignores configured ImageMagickPath,
58120 and set rotation script to ignore Python warnings (TIKA-2509)
59121
60 * Upgrade geo-apis to 3.0.1 (TIKA-2535).
122 * Upgrade geo-apis to 3.0.1 (TIKA-2535)
123
124 * Mime definition and magic improvements for text-based programming
125 and config formats (TIKA-2554, TIKA-2567, TIKA-1141)
61126
62127 * Added local Docker image build using dockerfile-maven-plugin to allow
63128 images to be built from source (TIKA-1518).
129
130 * Support for SAS7BDAT data files (TIKA-2462)
131
132 * Handle .epub files using .htm rather than .html extensions for the
133 embedded contents (TIKA-1288)
134
135 * Mime magic for ACES Images (TIKA-2628) and DPX Images (TIKA-2629)
136
137 * For sparse XLSX and XLSB files, always output missing cells to
138 the left of filled ones (matching XLS), and optionally output
139 missing rows on all 3 formats if requested via the
140 OfficeParserContext (TIKA-2479)
64141
65142 Release 1.17 - 12/8/2017
66143
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.18</version>
27 <version>1.19</version>
2828 <relativePath>tika-parent/pom.xml</relativePath>
2929 </parent>
3030
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.18</version>
27 <version>1.19</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030
163163 <plugin>
164164 <groupId>org.apache.maven.plugins</groupId>
165165 <artifactId>maven-jar-plugin</artifactId>
166 <configuration>
167 <archive>
168 <manifestEntries>
169 <Automatic-Module-Name>org.apache.tika.app</Automatic-Module-Name>
170 </manifestEntries>
171 </archive>
172 </configuration>
166173 <executions>
167174 <execution>
168175 <goals>
5555 import java.util.Map.Entry;
5656 import java.util.Set;
5757 import java.util.TreeSet;
58 import java.util.UUID;
5859
5960 import org.apache.commons.io.FilenameUtils;
6061 import org.apache.commons.io.IOUtils;
7980 import org.apache.tika.io.TikaInputStream;
8081 import org.apache.tika.language.detect.LanguageHandler;
8182 import org.apache.tika.metadata.Metadata;
83 import org.apache.tika.metadata.TikaCoreProperties;
8284 import org.apache.tika.metadata.serialization.JsonMetadata;
8385 import org.apache.tika.metadata.serialization.JsonMetadataList;
8486 import org.apache.tika.mime.MediaType;
102104 import org.apache.tika.sax.BodyContentHandler;
103105 import org.apache.tika.sax.ContentHandlerFactory;
104106 import org.apache.tika.sax.ExpandedTitleContentHandler;
107 import org.apache.tika.sax.RecursiveParserWrapperHandler;
105108 import org.apache.tika.xmp.XMPMetadata;
106109 import org.slf4j.Logger;
107110 import org.slf4j.LoggerFactory;
442445 } else if (arg.equals("-d") || arg.equals("--detect")) {
443446 type = DETECT;
444447 } else if (arg.startsWith("--extract-dir=")) {
445 extractDir = new File(arg.substring("--extract-dir=".length()));
448 String dirPath = arg.substring("--extract-dir=".length());
449 //if the user accidentally doesn't include
450 //a directory, set the directory to the cwd
451 if (dirPath.length() == 0) {
452 dirPath = ".";
453 }
454 extractDir = new File(dirPath);
446455 } else if (arg.equals("-z") || arg.equals("--extract")) {
447456 extractInlineImagesFromPDFs();
448457 type = NO_OUTPUT;
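For reference, the invocation this option parsing supports (mirroring the parameters used in TikaCLITest further down; the directory and file names are placeholders):

    java -jar tika-app-1.19.jar --extract-dir=extracts -z input.xls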
501510
502511 private void handleRecursiveJson(URL url, OutputStream output) throws IOException, SAXException, TikaException {
503512 Metadata metadata = new Metadata();
504 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser, getContentHandlerFactory(type));
513 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser);
514 RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(getContentHandlerFactory(type), -1);
505515 try (InputStream input = TikaInputStream.get(url, metadata)) {
506 wrapper.parse(input, null, metadata, context);
516 wrapper.parse(input, handler, metadata, context);
507517 }
508518 JsonMetadataList.setPrettyPrinting(prettyPrint);
509519 Writer writer = getOutputWriter(output, encoding);
510520 try {
511 JsonMetadataList.toJson(wrapper.getMetadata(), writer);
521 JsonMetadataList.toJson(handler.getMetadataList(), writer);
512522 } finally {
513523 writer.flush();
514524 }
10451055 }
10461056 MediaType contentType = detector.detect(inputStream, metadata);
10471057
1048 if (name.indexOf('.')==-1 && contentType!=null) {
1049 try {
1050 name += config.getMimeRepository().forName(
1051 contentType.toString()).getExtension();
1052 } catch (MimeTypeException e) {
1053 e.printStackTrace();
1054 }
1055 }
1056
1057 String relID = metadata.get(Metadata.EMBEDDED_RELATIONSHIP_ID);
1058 if (relID != null && !name.startsWith(relID)) {
1059 name = relID + "_" + name;
1060 }
1061
1062 File outputFile = new File(extractDir, FilenameUtils.normalize(name));
1058 File outputFile = null;
1059 if (name == null) {
1060 name = "file" + count++;
1061 }
1062 outputFile = getOutputFile(name, metadata, contentType);
1063
1064
10631065 File parent = outputFile.getParentFile();
10641066 if (!parent.exists()) {
10651067 if (!parent.mkdirs()) {
10941096 );
10951097 LOG.warn(msg, e);
10961098 }
1099 }
1100
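    // getOutputFile (below) sanitizes the embedded-document name into a safe
    // child of extractDir: NUL bytes are replaced, ".." sequences are
    // normalized away, any absolute prefix ("C:/", "~/", "/") is stripped,
    // and a UUID is prepended on a name collision; see the
    // testZip_absolutePath.zip and testZip_overlappingNames.zip tests below.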
1101 private File getOutputFile(String name, Metadata metadata, MediaType contentType) {
1102 String ext = getExtension(contentType);
1103 if (name.indexOf('.')==-1 && contentType!=null) {
1104 name += ext;
1105 }
1106
1107 String relID = metadata.get(Metadata.EMBEDDED_RELATIONSHIP_ID);
1108 if (relID != null && !name.startsWith(relID)) {
1109 name = relID + "_" + name;
1110 }
1111 //defensively do this so that we don't get an exception
1112 //from FilenameUtils.normalize
1113 name = name.replaceAll("\u0000", " ");
1114 String normalizedName = FilenameUtils.normalize(name);
1115
1116 if (normalizedName == null) {
1117 normalizedName = FilenameUtils.getName(name);
1118 }
1119
1120 if (normalizedName == null) {
1121 normalizedName = "file"+count++ +ext;
1122 }
1123 //strip off initial C:/ or ~/ or /
1124 int prefixLength = FilenameUtils.getPrefixLength(normalizedName);
1125 if (prefixLength > -1) {
1126 normalizedName = normalizedName.substring(prefixLength);
1127 }
1128 File outputFile = new File(extractDir, normalizedName);
1129 //if file already exists, prepend uuid
1130 if (outputFile.exists()) {
1131 String fileName = FilenameUtils.getName(normalizedName);
1132 outputFile = new File(extractDir, UUID.randomUUID().toString()+"-"+fileName);
1133 }
1134 return outputFile;
1135 }
1136
1137 private String getExtension(MediaType contentType) {
1138 try {
1139 String ext = config.getMimeRepository().forName(
1140 contentType.toString()).getExtension();
1141 if (ext == null) {
1142 return ".bin";
1143 } else {
1144 return ext;
1145 }
1146 } catch (MimeTypeException e) {
1147 e.printStackTrace();
1148 }
1149 return ".bin";
1150
10971151 }
10981152
10991153 protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir)
11401194 } finally {
11411195 server.close();
11421196 }
1143 } catch (IOException e) {
1197 } catch (IOException e) {
11441198 e.printStackTrace();
11451199 }
11461200 }
11731227 }
11741228
11751229 }
1176
1230
11771231 private class NoDocumentMetHandler extends DefaultHandler {
11781232
11791233 protected final Metadata metadata;
7979 import org.apache.tika.sax.BasicContentHandlerFactory;
8080 import org.apache.tika.sax.BodyContentHandler;
8181 import org.apache.tika.sax.ContentHandlerDecorator;
82 import org.apache.tika.sax.RecursiveParserWrapperHandler;
8283 import org.apache.tika.sax.TeeContentHandler;
8384 import org.apache.tika.sax.XHTMLContentHandler;
8485 import org.xml.sax.Attributes;
394395 );
395396 }
396397 if (isReset) {
397 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser,
398 new BasicContentHandlerFactory(
399 BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1));
400 wrapper.parse(input, null, new Metadata(), new ParseContext());
398 RecursiveParserWrapperHandler recursiveParserWrapperHandler =
399 new RecursiveParserWrapperHandler(
400 new BasicContentHandlerFactory(
401 BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1),
402 -1);
403 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser);
404 wrapper.parse(input, recursiveParserWrapperHandler, new Metadata(), new ParseContext());
401405 StringWriter jsonBuffer = new StringWriter();
402406 JsonMetadataList.setPrettyPrinting(true);
403 JsonMetadataList.toJson(wrapper.getMetadata(), jsonBuffer);
407 JsonMetadataList.toJson(recursiveParserWrapperHandler.getMetadataList(), jsonBuffer);
404408 setText(json, jsonBuffer.toString());
405409 }
406410 layout.show(cards, "metadata");
6262 description="output directory for output"/> <!-- do we want to make this mandatory -->
6363 <option opt="recursiveParserWrapper"
6464 description="use the RecursiveParserWrapper or not (default = false)"/>
65 <option opt="streamOut" description="stream the output of the RecursiveParserWrapper (default = false)"/>
6566 <option opt="handleExisting" hasArg="true"
6667 description="if an output file already exists, do you want to: overwrite, rename or skip"/>
6768 <option opt="basicHandlerType" hasArg="true"
1919 import static java.nio.charset.StandardCharsets.UTF_8;
2020 import static org.junit.Assert.assertEquals;
2121 import static org.junit.Assert.assertNotNull;
22 import static org.junit.Assert.assertNull;
2223 import static org.junit.Assert.assertTrue;
2324
25 import java.io.BufferedWriter;
2426 import java.io.ByteArrayOutputStream;
27 import java.io.IOException;
28 import java.io.InputStream;
2529 import java.io.OutputStream;
30 import java.io.OutputStreamWriter;
2631 import java.io.PrintStream;
2732 import java.io.Reader;
33 import java.nio.charset.StandardCharsets;
2834 import java.nio.file.Files;
2935 import java.nio.file.Path;
3036 import java.nio.file.Paths;
3137 import java.util.List;
38 import java.util.logging.Handler;
3239
3340 import org.apache.commons.io.FileUtils;
3441 import org.apache.tika.metadata.Metadata;
3542 import org.apache.tika.metadata.serialization.JsonMetadataList;
43 import org.apache.tika.metadata.serialization.JsonStreamingSerializer;
44 import org.apache.tika.parser.AutoDetectParser;
45 import org.apache.tika.parser.ParseContext;
46 import org.apache.tika.parser.Parser;
3647 import org.apache.tika.parser.RecursiveParserWrapper;
48 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
49 import org.apache.tika.sax.BasicContentHandlerFactory;
50 import org.apache.tika.sax.ContentHandlerFactory;
51 import org.apache.tika.sax.RecursiveParserWrapperHandler;
3752 import org.junit.After;
3853 import org.junit.Before;
3954 import org.junit.Test;
55 import org.xml.sax.ContentHandler;
56 import org.xml.sax.SAXException;
4057
4158 public class TikaCLIBatchIntegrationTest {
4259
107124 try (Reader reader = Files.newBufferedReader(jsonFile, UTF_8)) {
108125 List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
109126 assertEquals(12, metadataList.size());
110 assertTrue(metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).contains("human events"));
127 assertTrue(metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).contains("human events"));
128 }
129 }
130
131 @Test
132 public void testStreamingJsonRecursiveBatchIntegration() throws Exception {
133 String[] params = {"-i", testInputDirForCommandLine,
134 "-o", tempOutputDirForCommandLine,
135 "-numConsumers", "10",
136 "-J", //recursive Json
137 "-t", //plain text in content
138 "-streamOut"
139 };
140 TikaCLI.main(params);
141
142 Path jsonFile = tempOutputDir.resolve("test_recursive_embedded.docx.json");
143 try (Reader reader = Files.newBufferedReader(jsonFile, UTF_8)) {
144 List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
145 assertEquals(12, metadataList.size());
146 assertTrue(metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).contains("human events"));
147 //test that the last written object has been bumped to the first by JsonMetadataList.fromJson()
148 assertNull( metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH));
111149 }
112150 }
113151
169207 Files.isRegularFile(path));
170208 }
171209
172
173210 }
1616 package org.apache.tika.cli;
1717
1818 import static java.nio.charset.StandardCharsets.UTF_8;
19 import static org.junit.Assert.assertEquals;
1920 import static org.junit.Assert.assertFalse;
2021 import static org.junit.Assert.assertTrue;
2122
2223 import java.io.ByteArrayOutputStream;
2324 import java.io.File;
25 import java.io.FileOutputStream;
26 import java.io.IOException;
27 import java.io.InvalidObjectException;
28 import java.io.OutputStream;
2429 import java.io.PrintStream;
2530 import java.net.URI;
31 import java.nio.charset.StandardCharsets;
32 import java.util.zip.ZipEntry;
33 import java.util.zip.ZipFile;
34 import java.util.zip.ZipOutputStream;
2635
2736 import org.apache.commons.io.FileUtils;
37 import org.apache.commons.io.FilenameUtils;
2838 import org.apache.tika.exception.TikaException;
2939 import org.junit.After;
3040 import org.junit.Before;
244254 }
245255
246256 @Test
247 public void testExtract() throws Exception {
257 public void testExtractSimple() throws Exception {
258 String[] expectedChildren = new String[]{
259 "MBD002B040A.cdx",
260 "file4.png",
261 "MBD002B0FA6_file5.bin",
262 "MBD00262FE3.txt",
263 "file0.emf"
264 };
265 testExtract("/coffee.xls", expectedChildren, 8);
266 }
267
268 @Test
269 public void testExtractAbsolute() throws Exception {
270 String[] expectedChildren = new String[] {
271 "dangerous/dont/touch.pl",
272 };
273 testExtract("testZip_absolutePath.zip", expectedChildren, 2);
274 }
275
276 @Test
277 public void testExtractRelative() throws Exception {
278 String[] expectedChildren = new String[] {
279 "touch.pl",
280 };
281 testExtract("testZip_relative.zip", expectedChildren);
282 }
283
284 @Test
285 public void testExtractOverlapping() throws Exception {
286 //there should be two files, one with a prepended uuid-f1.txt
287 String[] expectedChildren = new String[] {
288 "f1.txt",
289 };
290 testExtract("testZip_overlappingNames.zip", expectedChildren, 2);
291 }
292
293 @Test
294 public void testExtract0x00() throws Exception {
295 String[] expectedChildren = new String[] {
296 "dang erous.pl",
297 };
298 testExtract("testZip_zeroByte.zip", expectedChildren);
299 }
300
301 private void testExtract(String targetFile, String[] expectedChildrenFileNames) throws Exception {
302 testExtract(targetFile, expectedChildrenFileNames, expectedChildrenFileNames.length);
303 }
304 private void testExtract(String targetFile, String[] expectedChildrenFileNames, int expectedLength) throws Exception {
248305 File tempFile = File.createTempFile("tika-test-", "");
249306 tempFile.delete();
250 tempFile.mkdir(); // not really good method for production usage, but ok for tests
251 // google guava library has better solution
307 tempFile.mkdir();
252308
253309 try {
254 String[] params = {"--extract-dir="+tempFile.getAbsolutePath(),"-z", resourcePrefix + "/coffee.xls"};
310 String[] params = {"--extract-dir=" + tempFile.getAbsolutePath(), "-z", resourcePrefix + "/"+targetFile};
255311
256312 TikaCLI.main(params);
257313
258314 StringBuffer allFiles = new StringBuffer();
315 assertEquals(expectedLength, tempFile.list().length);
259316 for (String f : tempFile.list()) {
317
260318 if (allFiles.length() > 0) allFiles.append(" : ");
261319 allFiles.append(f);
262320 }
263321
264 // ChemDraw file
265 File expectedCDX = new File(tempFile, "MBD002B040A.cdx");
266 // Image of the ChemDraw molecule
267 File expectedIMG = new File(tempFile, "file4.png");
268 // OLE10Native
269 File expectedOLE10 = new File(tempFile, "MBD002B0FA6_file5.bin");
270 // Something that really isnt a text file... Not sure what it is???
271 File expected262FE3 = new File(tempFile, "MBD00262FE3.txt");
272 // Image of one of the embedded resources
273 File expectedEMF = new File(tempFile, "file0.emf");
274
275 assertExtracted(expectedCDX, allFiles.toString());
276 assertExtracted(expectedIMG, allFiles.toString());
277 assertExtracted(expectedOLE10, allFiles.toString());
278 assertExtracted(expected262FE3, allFiles.toString());
279 assertExtracted(expectedEMF, allFiles.toString());
322 for (String expectedChildName : expectedChildrenFileNames) {
323 assertExtracted(new File(tempFile, expectedChildName), allFiles.toString());
324 }
280325 } finally {
281326 FileUtils.deleteDirectory(tempFile);
282327 }
509554 assertFalse(content.contains("org.apache.tika.parser.executable.Executable"));
510555 }
511556
512
513557 }
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.18</version>
27 <version>1.19</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030
147147 <plugin>
148148 <groupId>org.apache.maven.plugins</groupId>
149149 <artifactId>maven-jar-plugin</artifactId>
150 <configuration>
151 <archive>
152 <manifestEntries>
153 <Automatic-Module-Name>org.apache.tika.batch</Automatic-Module-Name>
154 </manifestEntries>
155 </archive>
156 </configuration>
150157 <executions>
151158 <execution>
152159 <goals>
2121 import java.io.InputStream;
2222 import java.io.OutputStream;
2323 import java.io.UnsupportedEncodingException;
24 import java.nio.charset.Charset;
25 import java.nio.charset.StandardCharsets;
2426 import java.util.concurrent.ArrayBlockingQueue;
2527
2628 import org.apache.commons.io.IOUtils;
4446 public class BasicTikaFSConsumer extends AbstractFSConsumer {
4547
4648 private boolean parseRecursively = true;
47 private final ParserFactory parserFactory;
49 private final Parser parser;
4850 private final ContentHandlerFactory contentHandlerFactory;
4951 private final OutputStreamFactory fsOSFactory;
50 private final TikaConfig config;
51 private String outputEncoding = UTF_8.toString();
5252
53 private Charset outputEncoding = StandardCharsets.UTF_8;
5354
55 /**
56 * @param queue
57 * @param parserFactory
58 * @param contentHandlerFactory
59 * @param fsOSFactory
60 * @param tikaConfig
61 *
62 * @deprecated use {@link BasicTikaFSConsumer#BasicTikaFSConsumer(ArrayBlockingQueue, Parser, ContentHandlerFactory, OutputStreamFactory)}
63 */
64 @Deprecated
5465 public BasicTikaFSConsumer(ArrayBlockingQueue<FileResource> queue,
5566 ParserFactory parserFactory,
5667 ContentHandlerFactory contentHandlerFactory,
57 OutputStreamFactory fsOSFactory,
58 TikaConfig config) {
68 OutputStreamFactory fsOSFactory, TikaConfig tikaConfig) {
5969 super(queue);
60 this.parserFactory = parserFactory;
70 this.parser = parserFactory.getParser(tikaConfig);
6171 this.contentHandlerFactory = contentHandlerFactory;
6272 this.fsOSFactory = fsOSFactory;
63 this.config = config;
73 }
74
75 public BasicTikaFSConsumer(ArrayBlockingQueue<FileResource> queue,
76 Parser parser,
77 ContentHandlerFactory contentHandlerFactory,
78 OutputStreamFactory fsOSFactory) {
79 super(queue);
80 this.parser = parser;
81 this.contentHandlerFactory = contentHandlerFactory;
82 this.fsOSFactory = fsOSFactory;
6483 }
6584
6685 @Override
6786 public boolean processFileResource(FileResource fileResource) {
6887
69 Parser parser = parserFactory.getParser(config);
7088 ParseContext context = new ParseContext();
7189 if (parseRecursively) {
7290 context.set(Parser.class, parser);
86104 return false;
87105 }
88106 ContentHandler handler;
89 try {
90 handler = contentHandlerFactory.getNewContentHandler(os, getOutputEncoding());
91 } catch (UnsupportedEncodingException e) {
92 incrementHandledExceptions();
93 LOG.error(getXMLifiedLogMsg("output_encoding_ex", fileResource.getResourceId(), e));
94 flushAndClose(os);
95 throw new RuntimeException(e);
96 }
107 handler = contentHandlerFactory.getNewContentHandler(os, getOutputEncoding());
108
97109
98110 //now actually call parse!
99111 Throwable thrown = null;
114126 return true;
115127 }
116128
117 public String getOutputEncoding() {
129 public Charset getOutputEncoding() {
118130 return outputEncoding;
119131 }
120132
121 public void setOutputEncoding(String outputEncoding) {
122 this.outputEncoding = outputEncoding;
133 public void setOutputEncoding(Charset charset) {
134 this.outputEncoding = charset;
123135 }
124136 }
3636 import org.apache.tika.parser.Parser;
3737 import org.apache.tika.parser.RecursiveParserWrapper;
3838 import org.apache.tika.sax.ContentHandlerFactory;
39 import org.apache.tika.sax.RecursiveParserWrapperHandler;
3940 import org.apache.tika.utils.ExceptionUtils;
4041 import org.xml.sax.helpers.DefaultHandler;
4142
4243 /**
43 * Basic FileResourceConsumer that reads files from an input
44 * directory and writes content to the output directory.
45 * <p/>
46 * This tries to catch most of the common exceptions, log them and
47 * store them in the metadata list output.
44 * This runs a RecursiveParserWrapper against an input file
45 * and outputs the json metadata to an output file.
4846 */
4947 public class RecursiveParserWrapperFSConsumer extends AbstractFSConsumer {
5048
51
52 private final ParserFactory parserFactory;
49 private final Parser parser;
5350 private final ContentHandlerFactory contentHandlerFactory;
5451 private final OutputStreamFactory fsOSFactory;
55 private final TikaConfig tikaConfig;
5652 private String outputEncoding = "UTF-8";
5753
58
54 /**
55 *
56 * @param queue
57 * @param parser -- must be RecursiveParserWrapper or a ForkParser that wraps a RecursiveParserWrapper
58 * @param contentHandlerFactory
59 * @param fsOSFactory
60 */
5961 public RecursiveParserWrapperFSConsumer(ArrayBlockingQueue<FileResource> queue,
60 ParserFactory parserFactory,
62 Parser parser,
6163 ContentHandlerFactory contentHandlerFactory,
62 OutputStreamFactory fsOSFactory, TikaConfig tikaConfig) {
64 OutputStreamFactory fsOSFactory) {
6365 super(queue);
64 this.parserFactory = parserFactory;
6566 this.contentHandlerFactory = contentHandlerFactory;
6667 this.fsOSFactory = fsOSFactory;
67 this.tikaConfig = tikaConfig;
68 this.parser = parser;
6869 }
6970
7071 @Override
7172 public boolean processFileResource(FileResource fileResource) {
7273
73 Parser wrapped = parserFactory.getParser(tikaConfig);
74 RecursiveParserWrapper parser = new RecursiveParserWrapper(wrapped, contentHandlerFactory);
7574 ParseContext context = new ParseContext();
76
77 // if (parseRecursively == true) {
78 context.set(Parser.class, parser);
79 // }
8075
8176 //try to open outputstream first
8277 OutputStream os = getOutputStream(fsOSFactory, fileResource);
9994 Throwable thrown = null;
10095 List<Metadata> metadataList = null;
10196 Metadata containerMetadata = fileResource.getMetadata();
97 RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(contentHandlerFactory, -1);
10298 try {
103 parse(fileResource.getResourceId(), parser, is, new DefaultHandler(),
99 parse(fileResource.getResourceId(), parser, is, handler,
104100 containerMetadata, context);
105 metadataList = parser.getMetadata();
106101 } catch (Throwable t) {
107102 thrown = t;
108 metadataList = parser.getMetadata();
109 if (metadataList == null) {
110 metadataList = new LinkedList<>();
111 }
112 Metadata m = null;
113 if (metadataList.size() == 0) {
114 m = containerMetadata;
115 } else {
116 //take the top metadata item
117 m = metadataList.remove(0);
118 }
119 String stackTrace = ExceptionUtils.getFilteredStackTrace(t);
120 m.add(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX+"runtime", stackTrace);
121 metadataList.add(0, m);
122103 } finally {
104 metadataList = handler.getMetadataList();
123105 IOUtils.closeQuietly(is);
124106 }
125107
139121 if (thrown != null) {
140122 if (thrown instanceof Error) {
141123 throw (Error) thrown;
124 } else if (thrown instanceof SecurityException) {
125 throw (SecurityException)thrown;
142126 } else {
143127 return false;
144128 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.batch.fs;
18
19
20
21 import org.apache.commons.io.IOUtils;
22 import org.apache.tika.batch.FileResource;
23 import org.apache.tika.batch.OutputStreamFactory;
24 import org.apache.tika.batch.ParserFactory;
25 import org.apache.tika.config.TikaConfig;
26 import org.apache.tika.metadata.Metadata;
27 import org.apache.tika.metadata.TikaCoreProperties;
28 import org.apache.tika.metadata.serialization.JsonStreamingSerializer;
29 import org.apache.tika.parser.ParseContext;
30 import org.apache.tika.parser.Parser;
31 import org.apache.tika.parser.RecursiveParserWrapper;
32 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
33 import org.apache.tika.sax.ContentHandlerFactory;
34 import org.apache.tika.sax.RecursiveParserWrapperHandler;
35 import org.apache.tika.utils.ExceptionUtils;
36 import org.xml.sax.ContentHandler;
37 import org.xml.sax.SAXException;
38
39 import java.io.IOException;
40 import java.io.InputStream;
41 import java.io.OutputStream;
42 import java.io.OutputStreamWriter;
43 import java.nio.charset.StandardCharsets;
44 import java.util.concurrent.ArrayBlockingQueue;
45
46 /**
47 * This uses the {@link JsonStreamingSerializer} to write out a
48 * single metadata object at a time.
49 */
50 public class StreamOutRPWFSConsumer extends AbstractFSConsumer {
51
52 private final Parser parser;
53 private final ContentHandlerFactory contentHandlerFactory;
54 private final OutputStreamFactory fsOSFactory;
55 private String outputEncoding = "UTF-8";
56
57
58 public StreamOutRPWFSConsumer(ArrayBlockingQueue<FileResource> queue,
59 Parser parser,
60 ContentHandlerFactory contentHandlerFactory,
61 OutputStreamFactory fsOSFactory) {
62 super(queue);
63 this.contentHandlerFactory = contentHandlerFactory;
64 this.fsOSFactory = fsOSFactory;
65 this.parser = parser;
66 }
67
68 @Override
69 public boolean processFileResource(FileResource fileResource) {
70
71 ParseContext context = new ParseContext();
72
73 //try to open outputstream first
74 OutputStream os = getOutputStream(fsOSFactory, fileResource);
75
76 if (os == null) {
77 LOG.debug("Skipping: {}", fileResource.getMetadata().get(FSProperties.FS_REL_PATH));
78 return false;
79 }
80
81 //try to open the inputstream before the parse.
82 //if the parse hangs or throws a nasty exception, at least there will
83 //be a zero byte file there so that the batchrunner can skip that problematic
84 //file during the next run.
85 InputStream is = getInputStream(fileResource);
86 if (is == null) {
87 IOUtils.closeQuietly(os);
88 return false;
89 }
90
91 Metadata containerMetadata = fileResource.getMetadata();
92 JsonStreamingSerializer writer = new JsonStreamingSerializer(
93 new OutputStreamWriter(os, StandardCharsets.UTF_8));
94
95 WriteoutRPWHandler handler = new WriteoutRPWHandler(contentHandlerFactory, writer);
96 Throwable thrown = null;
97 try {
98 parse(fileResource.getResourceId(), parser, is, handler,
99 containerMetadata, context);
100 } catch (Throwable t) {
101 thrown = t;
102 } finally {
103 try {
104 writer.close();
105 } catch (IOException e) {
106 //this is a stop the world kind of thing
107 LOG.error("{}", getXMLifiedLogMsg(IO_OS + "json", fileResource.getResourceId(), e));
108 throw new RuntimeException(e);
109 } finally {
110 IOUtils.closeQuietly(is);
111 }
112 }
113 if (thrown != null) {
114 if (thrown instanceof Error) {
115 throw (Error) thrown;
116 } else if (thrown instanceof SecurityException) {
117 throw (SecurityException)thrown;
118 } else {
119 return false;
120 }
121 }
122 return true;
123 }
124
125 public String getOutputEncoding() {
126 return outputEncoding;
127 }
128
129 public void setOutputEncoding(String outputEncoding) {
130 this.outputEncoding = outputEncoding;
131 }
132
133 //extend AbstractRPWH instead of RecursiveParserWrapperHandler so that
134 //if we use the ForkParser, the output will not have to be streamed
135 //back to the proxy, but can
136 //be written straight to disk.
137 private class WriteoutRPWHandler extends AbstractRecursiveParserWrapperHandler {
138 private final JsonStreamingSerializer jsonWriter;
139
140 public WriteoutRPWHandler(ContentHandlerFactory contentHandlerFactory, JsonStreamingSerializer writer) {
141 super(contentHandlerFactory);
142 this.jsonWriter = writer;
143 }
144
145 @Override
146 public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException {
147 metadata.add(RecursiveParserWrapperHandler.TIKA_CONTENT, contentHandler.toString());
148 try {
149 jsonWriter.add(metadata);
150 } catch (IOException e) {
151 throw new SAXException(e);
152 }
153 }
154
155 @Override
156 public void endDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException {
157 endEmbeddedDocument(contentHandler, metadata);
158 }
159 }
160 }
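The serializer contract the consumer above relies on, in isolation; writing to System.out is just for the demo:

    import java.io.IOException;
    import java.io.OutputStreamWriter;
    import java.nio.charset.StandardCharsets;

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.metadata.serialization.JsonStreamingSerializer;

    public class StreamingJsonDemo {
        public static void main(String[] args) throws IOException {
            JsonStreamingSerializer writer = new JsonStreamingSerializer(
                    new OutputStreamWriter(System.out, StandardCharsets.UTF_8));
            Metadata m = new Metadata();
            m.set("author", "example");
            // Each add() serializes one metadata object immediately rather
            // than buffering the whole document tree in memory.
            writer.add(m);
            writer.close();
        }
    }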
3939 import org.apache.tika.batch.fs.FSOutputStreamFactory;
4040 import org.apache.tika.batch.fs.FSUtil;
4141 import org.apache.tika.batch.fs.RecursiveParserWrapperFSConsumer;
42 import org.apache.tika.batch.fs.StreamOutRPWFSConsumer;
4243 import org.apache.tika.config.TikaConfig;
44 import org.apache.tika.parser.Parser;
45 import org.apache.tika.parser.RecursiveParserWrapper;
4346 import org.apache.tika.sax.BasicContentHandlerFactory;
4447 import org.apache.tika.sax.ContentHandlerFactory;
4548 import org.apache.tika.util.ClassLoaderUtil;
6366 Node recursiveParserWrapperNode = node.getAttributes().getNamedItem("recursiveParserWrapper");
6467 if (recursiveParserWrapperNode != null) {
6568 recursiveParserWrapper = PropsUtil.getBoolean(recursiveParserWrapperNode.getNodeValue(), recursiveParserWrapper);
69 }
70 }
71
72 boolean streamOut = false;
73 String streamOutString = runtimeAttributes.get("streamOut");
74 if (streamOutString != null){
75 streamOut = PropsUtil.getBoolean(streamOutString, streamOut);
76 } else {
77 Node streamOutNode = node.getAttributes().getNamedItem("streamout");
78 if (streamOutNode != null) {
79 streamOut = PropsUtil.getBoolean(streamOutNode.getNodeValue(), streamOut);
6680 }
6781 }
6882
128142 OutputStreamFactory outputStreamFactory = getOutputStreamFactory(
129143 outputStreamFactoryNode, runtimeAttributes,
130144 contentHandlerFactory, recursiveParserWrapper);
131
145 Parser parser = parserFactory.getParser(config);
132146 if (recursiveParserWrapper) {
147 parser = new RecursiveParserWrapper(parser);
133148 for (int i = 0; i < numConsumers; i++) {
134 FileResourceConsumer c = new RecursiveParserWrapperFSConsumer(queue,
135 parserFactory, contentHandlerFactory, outputStreamFactory, config);
149 FileResourceConsumer c = null;
150 if (streamOut){
151 c = new StreamOutRPWFSConsumer(queue,
152 parser, contentHandlerFactory, outputStreamFactory);
153 } else {
154 c = new RecursiveParserWrapperFSConsumer(queue,
155 parser, contentHandlerFactory, outputStreamFactory);
156 }
136157 consumers.add(c);
137158 }
138159 } else {
139160 for (int i = 0; i < numConsumers; i++) {
140161 FileResourceConsumer c = new BasicTikaFSConsumer(queue,
141 parserFactory, contentHandlerFactory, outputStreamFactory, config);
162 parser, contentHandlerFactory, outputStreamFactory);
142163 consumers.add(c);
143164 }
144165 }
6262 description="output directory for output"/> <!-- do we want to make this mandatory -->
6363 <option opt="recursiveParserWrapper"
6464 description="use the RecursiveParserWrapper or not (default = false)"/>
65 <option opt="streamOut" description="stream the output of the RecursiveParserWrapper (default = false)"/>
6566 <option opt="handleExisting" hasArg="true"
6667 description="if an output file already exists, do you want to: overwrite, rename or skip"/>
6768 <option opt="basicHandlerType" hasArg="true"
7677 description="regex that specifies which files to avoid processing"/>
7778 <option opt="reporterSleepMillis" hasArg="true"
7879 description="millisecond between reports by the reporter"/>
80
7981 </commandline>
8082
8183
107109 <!--
108110 To wrap parser in RecursiveParserWrapper (tika-app's -J or tika-server's /rmeta),
109111 add attribute recursiveParserWrapper="true" to consumers element.
112 To stream the output of the RecursiveParserWrapper set "streamout" = true
113 in consumers element.
110114 -->
111115 <consumers builderClass="org.apache.tika.batch.fs.builders.BasicTikaFSConsumersBuilder"
112116 recursiveParserWrapper="false" consumersManagerMaxMillis="60000">
3434 import org.apache.tika.metadata.Metadata;
3535 import org.apache.tika.metadata.TikaCoreProperties;
3636 import org.apache.tika.metadata.serialization.JsonMetadataList;
37 import org.apache.tika.parser.Parser;
3738 import org.apache.tika.parser.RecursiveParserWrapper;
39 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
3840 import org.apache.tika.sax.BasicContentHandlerFactory;
3941 import org.junit.Test;
4042
6870 queue.add(new PoisonFileResource());
6971
7072 MockOSFactory mockOSFactory = new MockOSFactory();
73 Parser p = new RecursiveParserWrapper(new AutoDetectParserFactory().getParser(new TikaConfig()));
7174 RecursiveParserWrapperFSConsumer consumer = new RecursiveParserWrapperFSConsumer(
72 queue, new AutoDetectParserFactory(), new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
73 mockOSFactory, new TikaConfig());
75 queue, p, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
76 mockOSFactory);
7477
7578 IFileProcessorFutureResult result = consumer.call();
7679 mockOSFactory.getStreams().get(0).flush();
7982
8083 assertEquals(4, results.size());
8184 assertContains("another null pointer",
82 results.get(2).get(RecursiveParserWrapper.EMBEDDED_EXCEPTION));
85 results.get(2).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_EXCEPTION));
8386
8487 assertEquals("Nikolai Lobachevsky", results.get(0).get("author"));
8588 for (int i = 1; i < 4; i++) {
8689 assertEquals("embeddedAuthor"+i, results.get(i).get("author"));
87 assertContains("some_embedded_content"+i, results.get(i).get(RecursiveParserWrapper.TIKA_CONTENT));
90 assertContains("some_embedded_content"+i, results.get(i).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
8891 }
8992 }
9093
115118 queue.add(new PoisonFileResource());
116119
117120 MockOSFactory mockOSFactory = new MockOSFactory();
121 Parser p = new RecursiveParserWrapper(new AutoDetectParserFactory().getParser(new TikaConfig()));
118122 RecursiveParserWrapperFSConsumer consumer = new RecursiveParserWrapperFSConsumer(
119 queue, new AutoDetectParserFactory(), new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
120 mockOSFactory, new TikaConfig());
123 queue, p, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
124 mockOSFactory);
121125
122126 IFileProcessorFutureResult result = consumer.call();
123127 mockOSFactory.getStreams().get(0).flush();
128132 results.get(0).get(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "runtime"));
129133 assertEquals("Nikolai Lobachevsky", results.get(0).get("author"));
130134 assertEquals("embeddedAuthor", results.get(1).get("author"));
131 assertContains("some_embedded_content", results.get(1).get(RecursiveParserWrapper.TIKA_CONTENT));
135 assertContains("some_embedded_content", results.get(1).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
132136 }
133137
134138
0 package org.apache.tika.batch.fs;
1
20 /*
31 * Licensed to the Apache Software Foundation (ASF) under one or more
42 * contributor license agreements. See the NOTICE file distributed with
1513 * See the License for the specific language governing permissions and
1614 * limitations under the License.
1715 */
16 package org.apache.tika.batch.fs;
1817
1918 import static java.nio.charset.StandardCharsets.UTF_8;
2019 import static org.junit.Assert.assertEquals;
2827 import java.util.Map;
2928
3029 import org.apache.tika.batch.BatchProcessDriverCLI;
30 import org.junit.Ignore;
3131 import org.junit.Test;
3232
3333
114114 readFileToString(outputDir.resolve("test2_ok.xml.xml"), UTF_8));
115115 }
116116
117 @Test(timeout = 30000)
117 @Test(timeout = 60000)
118118 public void allHeavyHangsTestWithStarvedCrawler() throws Exception {
119119 //this tests that if all consumers are hung and the crawler is
120120 //waiting to add to the queue, there isn't deadlock. The BatchProcess should
205205 assertEquals(0, driver.getNumRestarts());
206206 }
207207
208 @Test(timeout = 60000)
209 public void testSystemExit() throws Exception {
210 Path outputDir = getNewOutputDir("system-exit");
211 Map<String, String> args = new HashMap<>();
212 args.put("-numConsumers", "1");
213
214 String[] commandLine = getDefaultCommandLineArgsArr("system_exit", outputDir, args);
215 BatchProcessDriverCLI driver = getNewDriver("/tika-batch-config-test.xml", commandLine);
216 driver.execute();
217 assertEquals(6, countChildren(outputDir));
218 assertTrue(driver.getNumRestarts() > 1);
219 for (int i = 0; i < 3; i++) {
220 assertEquals("problem with "+i, 0, Files.size(outputDir.resolve("test"+i+"_system_exit.xml.xml")));
221 }
222 //sys exit may prevent test3 from running successfully
223 for (int i = 5; i < 6; i++) {
224 assertContains("first test file",
225 readFileToString(outputDir.resolve("test"+i+"_ok.xml.xml"), UTF_8));
226 }
227 }
228
229 @Test(timeout = 60000)
230 @Ignore("Java 11-ea+23 makes outputstreams uninterruptible")
231 public void testThreadInterrupt() throws Exception {
232 Path outputDir = getNewOutputDir("thread-interrupt");
233 Map<String, String> args = new HashMap<>();
234 args.put("-numConsumers", "1");
235
236 String[] commandLine = getDefaultCommandLineArgsArr("thread_interrupt", outputDir, args);
237 BatchProcessDriverCLI driver = getNewDriver("/tika-batch-config-test.xml", commandLine);
238 driver.execute();
239 assertEquals(6, countChildren(outputDir));
240
241 for (int i = 0; i < 3; i++) {
242 assertEquals("problem with "+i, 0, Files.size(outputDir.resolve("test"+i+"_thread_interrupt.xml.xml")));
243 }
244 //sys exit may prevent test3 from running successfully
245 for (int i = 5; i < 6; i++) {
246 assertContains("first test file",
247 readFileToString(outputDir.resolve("test"+i+"_ok.xml.xml"), UTF_8));
248 }
249 }
208250
209251 }
3838 import java.util.concurrent.Future;
3939 import java.util.concurrent.TimeUnit;
4040
41 import org.apache.commons.io.FileUtils;
4142 import org.apache.commons.io.IOUtils;
4243 import org.apache.tika.TikaTest;
4344 import org.apache.tika.batch.BatchProcess;
4445 import org.apache.tika.batch.BatchProcessDriverCLI;
4546 import org.apache.tika.batch.ParallelFileProcessingResult;
4647 import org.apache.tika.batch.builders.BatchProcessBuilder;
48 import org.apache.tika.utils.ProcessUtils;
4749 import org.junit.AfterClass;
4850 import org.junit.BeforeClass;
4951
7981 //see caveat in TikaCLITest's textExtract
8082
8183 try {
82 deleteDirectory(outputRoot);
84
85 FileUtils.deleteDirectory(outputRoot.toFile());
8386 } catch (IOException e) {
8487 e.printStackTrace();
8588 }
171174 commandLine.add("-Xmx128m");
172175 commandLine.add("-cp");
173176 String cp = System.getProperty("java.class.path");
174 //need to test for " " on *nix, can't just add double quotes
175 //across platforms.
176 if (cp.contains(" ")){
177 cp = "\""+cp+"\"";
178 }
177 cp = ProcessUtils.escapeCommandLine(cp);
178
179179 commandLine.add(cp);
180180 commandLine.add("org.apache.tika.batch.fs.FSBatchProcessCLI");
181181
204204 String cp = System.getProperty("java.class.path");
205205 //need to test for " " on *nix, can't just add double quotes
206206 //across platforms.
207 if (cp.contains(" ")){
208 cp = "\""+cp+"\"";
209 }
207 cp = ProcessUtils.escapeCommandLine(cp);
208
210209 commandLine.add(cp);
211210 commandLine.add("org.apache.tika.batch.fs.FSBatchProcessCLI");
212211
263262 return sb.toString();
264263 }
265264
266 //TODO: move this into FileUtils
267 public static void deleteDirectory(Path dir) throws IOException {
268 Files.walkFileTree(dir, new SimpleFileVisitor<Path>() {
269 @Override
270 public FileVisitResult visitFile(Path file,
271 BasicFileAttributes attrs) throws IOException {
272 Files.delete(file);
273 return FileVisitResult.CONTINUE;
274 }
275
276 @Override
277 public FileVisitResult postVisitDirectory(Path dir,
278 IOException exc) throws IOException {
279 Files.delete(dir);
280 return FileVisitResult.CONTINUE;
281 }
282
283 });
284 }
285
286265 /**
287266 * helper method equivalent to File#listFiles()
288267 * grabs children only, does not walk recursively
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.18</version>
27 <version>1.19</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030
9898 <dependency>
9999 <groupId>org.apache.felix</groupId>
100100 <artifactId>org.apache.felix.framework</artifactId>
101 <version>5.6.4</version>
101 <version>5.6.10</version>
102102 <scope>test</scope>
103103 </dependency>
104104 <dependency>
110110 <dependency>
111111 <groupId>org.ops4j.pax.url</groupId>
112112 <artifactId>pax-url-aether</artifactId>
113 <version>2.5.2</version>
113 <version>2.5.4</version>
114114 <scope>test</scope>
115115 </dependency>
116116 <dependency>
143143 <configuration>
144144 <instructions>
145145 <_runsystempackages>com.sun.xml.bind.marshaller, com.sun.xml.internal.bind.marshaller</_runsystempackages>
146 <!-- The file below and the _include entry may be removed once Tika targets OpenJDK 9.0 or above -->
147 <_include>src/main/resources/META-INF/MANIFEST.MF</_include>
146148 <Bundle-Activator>
147149 org.apache.tika.parser.internal.Activator
148150 </Bundle-Activator>
169171 curvesapi|
170172 xmlbeans|
171173 jackcess|
174 jackcess-encrypt|
172175 commons-lang|
173176 tagsoup|
174177 asm|
197200 netcdf4|
198201 grib|
199202 cdm|
203 parso|
200204 httpservices|
201205 jcip-annotations|
202206 jmatio|
231235 com.github.openjson;resolution:=optional,
232236 com.google.protobuf;resolution:=optional,
233237 com.ibm.icu.text;resolution:=optional,
238 com.parso;resolution:=optional,
234239 com.sleepycat.je;resolution:=optional,
235240 com.sun.javadoc;resolution:=optional,
236241 com.sun.xml.bind.marshaller;resolution:=optional,
245250 javax.annotation;resolution:=optional,
246251 javax.mail;resolution:=optional,
247252 javax.mail.internet;resolution:=optional,
253 javax.net.ssl;resolution:=optional,
248254 javax.servlet.annotation;resolution:=optional,
249255 javax.servlet;resolution:=optional,
250256 javax.servlet.http;resolution:=optional,
251257 javax.measure.converter;resolution:=optional,
252258 javax.ws.rs.core;resolution:=optional,
259 javax.xml.bind;resolution:=optional,
260 javax.xml.bind.annotation;resolution:=optional,
261 javax.xml.bind.annotation.adapters;resolution:=optional,
262 javax.xml.bind.attachment;resolution:=optional,
263 javax.xml.bind.helpers;resolution:=optional,
264 javax.xml.bind.util;resolution:=optional,
253265 net.sf.ehcache;resolution:=optional,
254266 nu.xom;resolution:=optional,
255267 opendap.dap.http;resolution:=optional,
269281 org.apache.commons.httpclient.params;resolution:=optional,
270282 org.apache.commons.httpclient.protocol;resolution:=optional,
271283 org.apache.commons.httpclient.util;resolution:=optional,
284 org.apache.commons.math3.exception;resolution:=optional,
285 org.apache.commons.math3.linear;resolution:=optional,
272286 org.apache.commons.vfs2;resolution:=optional,
273287 org.apache.commons.vfs2.provider;resolution:=optional,
274288 org.apache.commons.vfs2.util;resolution:=optional,
505519 </systemPropertyVariables>
506520 </configuration>
507521 </plugin>
522 <plugin>
523 <groupId>org.apache.rat</groupId>
524 <artifactId>apache-rat-plugin</artifactId>
525 <configuration>
526 <excludes>
527 <exclude>src/main/resources/META-INF/MANIFEST.MF</exclude>
528 </excludes>
529 </configuration>
530 </plugin>
531
508532 </plugins>
509533 </build>
510534
0 Require-Capability: osgi.ee;filter:="(&(osgi.ee=JavaSE)(version=1.8))"
3232 import java.io.StringWriter;
3333 import java.io.Writer;
3434 import java.net.URISyntaxException;
35 import java.nio.file.Paths;
3536 import java.util.HashSet;
3637 import java.util.Set;
3738 import java.util.jar.Attributes;
4445 import org.apache.tika.detect.DefaultDetector;
4546 import org.apache.tika.detect.Detector;
4647 import org.apache.tika.fork.ForkParser;
48 import org.apache.tika.io.TikaInputStream;
4749 import org.apache.tika.metadata.Metadata;
4850 import org.apache.tika.mime.MediaType;
4951 import org.apache.tika.parser.CompositeParser;
255257 ParseContext context = new ParseContext();
256258 context.set(Parser.class, parser);
257259
258 try (InputStream stream =
259 new FileInputStream("src/test/resources/test-documents.zip")) {
260 try (InputStream stream = TikaInputStream.get(Paths.get("src/test/resources/test-documents.zip"))) {
260261 parser.parse(stream, handler, new Metadata(), context);
261262 }
262263
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.18</version>
27 <version>1.19</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030
9393 <plugin>
9494 <groupId>org.apache.maven.plugins</groupId>
9595 <artifactId>maven-jar-plugin</artifactId>
96 <configuration>
97 <archive>
98 <manifestEntries>
99 <Automatic-Module-Name>org.apache.tika.core</Automatic-Module-Name>
100 </manifestEntries>
101 </archive>
102 </configuration>
96103 <executions>
97104 <execution>
98105 <goals>
104111 <plugin>
105112 <groupId>org.codehaus.mojo</groupId>
106113 <artifactId>clirr-maven-plugin</artifactId>
114 <version>2.8</version>
107115 <executions>
108116 <execution>
109117 <phase>verify</phase>
2020 import java.io.IOException;
2121 import java.io.InputStream;
2222 import java.io.Writer;
23 import java.nio.Buffer;
2324 import java.nio.ByteBuffer;
2425 import java.nio.channels.Channels;
2526 import java.nio.channels.ReadableByteChannel;
104105 float max = -1;
105106 while (bytesRead != -1) {
106107
107 buf.flip(); // make buffer ready for read
108 ((Buffer)buf).flip(); // make buffer ready for read
108109
109110 while (buf.hasRemaining()) {
110111 byte byt = buf.get();
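The (Buffer) cast above is the standard guard for running Java 9+-compiled bytecode on Java 8: Java 9 added covariant overrides such as ByteBuffer.flip(), so a call site compiled against the newer JDK would otherwise fail on a Java 8 runtime with NoSuchMethodError. The pattern in isolation:

    import java.nio.Buffer;
    import java.nio.ByteBuffer;

    public class FlipCompat {
        public static void main(String[] args) {
            ByteBuffer buf = ByteBuffer.allocate(16);
            buf.put((byte) 42);
            // Calling flip() through the Buffer supertype binds the call site
            // to Buffer.flip(), which exists on Java 8 and Java 9+ alike.
            ((Buffer) buf).flip();
            System.out.println(buf.get()); // prints 42
        }
    }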
1818 import java.io.ByteArrayInputStream;
1919 import java.io.InputStream;
2020
21 import javax.xml.XMLConstants;
2221 import javax.xml.namespace.QName;
23 import javax.xml.parsers.SAXParserFactory;
22 import javax.xml.parsers.SAXParser;
2423
24 import org.apache.tika.exception.TikaException;
2525 import org.apache.tika.io.CloseShieldInputStream;
26 import org.apache.tika.parser.ParseContext;
2627 import org.apache.tika.sax.OfflineContentHandler;
28 import org.apache.tika.utils.XMLReaderUtils;
2729 import org.xml.sax.Attributes;
2830 import org.xml.sax.SAXException;
29 import org.xml.sax.SAXNotRecognizedException;
3031 import org.xml.sax.helpers.DefaultHandler;
3132
3233 /**
3637 * @since Apache Tika 0.4
3738 */
3839 public class XmlRootExtractor {
40 private static final ParseContext EMPTY_CONTEXT = new ParseContext();
3941
4042 public QName extractRootElement(byte[] data) {
4143 return extractRootElement(new ByteArrayInputStream(data));
4749 public QName extractRootElement(InputStream stream) {
4850 ExtractorHandler handler = new ExtractorHandler();
4951 try {
50 SAXParserFactory factory = SAXParserFactory.newInstance();
51 factory.setNamespaceAware(true);
52 factory.setValidating(false);
53 try {
54 factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
55 } catch (SAXNotRecognizedException e) {
56 // TIKA-271 and TIKA-1000: Some XML parsers do not support the secure-processing
57 // feature, even though it's required by JAXP in Java 5. Ignoring
58 // the exception is fine here, deployments without this feature
59 // are inherently vulnerable to XML denial-of-service attacks.
60 }
61 factory.newSAXParser().parse(
52 XMLReaderUtils.parseSAX(
6253 new CloseShieldInputStream(stream),
63 new OfflineContentHandler(handler));
54 new OfflineContentHandler(handler), EMPTY_CONTEXT);
6455 } catch (Exception ignore) {
6556 }
6657 return handler.rootElement;
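A usage sketch for the reworked extractor; the XML literal is an arbitrary example:

    import java.nio.charset.StandardCharsets;

    import javax.xml.namespace.QName;

    import org.apache.tika.detect.XmlRootExtractor;

    public class RootElementDemo {
        public static void main(String[] args) {
            byte[] xml = "<feed xmlns='http://www.w3.org/2005/Atom'/>"
                    .getBytes(StandardCharsets.UTF_8);
            // Parsing now goes through the shared XMLReaderUtils pool instead
            // of building a fresh SAXParserFactory on every call.
            QName root = new XmlRootExtractor().extractRootElement(xml);
            System.out.println(root); // {http://www.w3.org/2005/Atom}feed
        }
    }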
111111 // Receive the response
112112 if (input.readBoolean()) {
113113 byte[] data = readStream();
114 return defineClass(name, data, 0, data.length);
114 Class<?> clazz = defineClass(name, data, 0, data.length);
115 definePackageIfNecessary(name, clazz);
116 return clazz;
115117 } else {
116118 throw new ClassNotFoundException("Unable to find class " + name);
117119 }
118120 } catch (IOException e) {
119121 throw new ClassNotFoundException("Unable to load class " + name, e);
120122 }
123 }
124
125 private void definePackageIfNecessary(String className, Class<?> clazz) {
126 String packageName = toPackageName(className);
127 if (packageName != null && getPackage(packageName) == null) {
128 definePackage(packageName, null, null, null, null, null, null, null);
129 }
130 }
131
132 private String toPackageName(String className) {
133 int packageEndIndex = className.lastIndexOf('.');
134 if (packageEndIndex > 0) {
135 return className.substring(0, packageEndIndex);
136 }
137 return null;
121138 }
122139
123140 private byte[] readStream() throws IOException {
2424 import java.io.IOException;
2525 import java.io.InputStream;
2626 import java.io.NotSerializableException;
27 import java.nio.file.Path;
2728 import java.util.ArrayList;
2829 import java.util.List;
30 import java.util.concurrent.atomic.AtomicInteger;
2931 import java.util.jar.JarEntry;
3032 import java.util.jar.JarOutputStream;
3133 import java.util.zip.ZipEntry;
3234
3335 import org.apache.tika.exception.TikaException;
3436 import org.apache.tika.io.IOUtils;
37 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
38 import org.apache.tika.sax.RecursiveParserWrapperHandler;
39 import org.apache.tika.utils.ProcessUtils;
3540 import org.xml.sax.ContentHandler;
3641
3742 class ForkClient {
43 private static AtomicInteger CLIENT_COUNTER = new AtomicInteger(0);
3844
3945 private final List<ForkResource> resources = new ArrayList<>();
4046
4854
4955 private final DataInputStream input;
5056
51 private final InputStream error;
52
53 public ForkClient(ClassLoader loader, Object object, List<String> java, long serverPulseMillis)
57 //this is used for debugging/smoke testing
58 private final int id = CLIENT_COUNTER.incrementAndGet();
59
60 private volatile int filesProcessed = 0;
61
62 public ForkClient(Path tikaDir, ParserFactoryFactory parserFactoryFactory, List<String> java,
63 TimeoutLimits timeoutLimits) throws IOException, TikaException {
64 this(tikaDir, parserFactoryFactory, null, java, timeoutLimits);
65 }
66 /**
67 *
68 * @param tikaDir directory containing jars from which to start the child server and load the Parser
69 * @param parserFactoryFactory factory to send to child process to build parser upon arrival
70 * @param classLoader class loader to use for non-parser resource (content-handler, etc.)
71 * @param java java commandline to use for the commandline server
72 * @throws IOException
73 * @throws TikaException
74 */
75 public ForkClient(Path tikaDir, ParserFactoryFactory parserFactoryFactory, ClassLoader classLoader,
76 List<String> java, TimeoutLimits timeoutLimits) throws IOException, TikaException {
77 jar = null;
78 loader = null;
79 boolean ok = false;
80 ProcessBuilder builder = new ProcessBuilder();
81 List<String> command = new ArrayList<>();
82 command.addAll(java);
83 command.add("-cp");
84 String dirString = tikaDir.toAbsolutePath().toString();
85 if (!dirString.endsWith("/")) {
86 dirString += "/*";
87 } else {
88 dirString += "/";
89 }
90 dirString = ProcessUtils.escapeCommandLine(dirString);
91 command.add(dirString);
92 command.add("org.apache.tika.fork.ForkServer");
93 command.add(Long.toString(timeoutLimits.getPulseMS()));
94 command.add(Long.toString(timeoutLimits.getParseTimeoutMS()));
95 command.add(Long.toString(timeoutLimits.getWaitTimeoutMS()));
96 builder.command(command);
97 builder.redirectError(ProcessBuilder.Redirect.INHERIT);
98 try {
99 this.process = builder.start();
100
101 this.output = new DataOutputStream(process.getOutputStream());
102 this.input = new DataInputStream(process.getInputStream());
103
104 waitForStartBeacon();
105 if (classLoader != null) {
106 output.writeByte(ForkServer.INIT_PARSER_FACTORY_FACTORY_LOADER);
107 } else {
108 output.writeByte(ForkServer.INIT_PARSER_FACTORY_FACTORY);
109 }
110 output.flush();
111 sendObject(parserFactoryFactory, resources);
112 if (classLoader != null) {
113 sendObject(classLoader, resources);
114 }
115 waitForStartBeacon();
116 ok = true;
117 } catch (Throwable t) {
118 t.printStackTrace();
119 throw t;
120 } finally {
121 if (!ok) {
122 close();
123 }
124 }
125 }
126
127
128 public ForkClient(ClassLoader loader, Object object, List<String> java, TimeoutLimits timeoutLimits)
54129 throws IOException, TikaException {
55130 boolean ok = false;
56131 try {
62137 command.addAll(java);
63138 command.add("-jar");
64139 command.add(jar.getPath());
65 command.add(Long.toString(serverPulseMillis));
140 command.add(Long.toString(timeoutLimits.getPulseMS()));
141 command.add(Long.toString(timeoutLimits.getParseTimeoutMS()));
142 command.add(Long.toString(timeoutLimits.getWaitTimeoutMS()));
66143 builder.command(command);
144 builder.redirectError(ProcessBuilder.Redirect.INHERIT);
67145 this.process = builder.start();
68146
69147 this.output = new DataOutputStream(process.getOutputStream());
70148 this.input = new DataInputStream(process.getInputStream());
71 this.error = process.getErrorStream();
72149
73150 waitForStartBeacon();
74
151 output.writeByte(ForkServer.INIT_LOADER_PARSER);
152 output.flush();
75153 sendObject(loader, resources);
76154 sendObject(object, resources);
155 waitForStartBeacon();
77156
78157 ok = true;
79158 } finally {
85164
86165 private void waitForStartBeacon() throws IOException {
87166 while (true) {
88 consumeErrorStream();
89167 int type = input.read();
90168 if ((byte) type == ForkServer.READY) {
91 consumeErrorStream();
92169 return;
170 } else if ((byte)type == ForkServer.FAILED_TO_START) {
171 throw new IOException("Server had a catastrophic initialization failure");
172 } else if (type == -1) {
173 throw new IOException("EOF while waiting for start beacon");
174 } else {
175 //can't do this because of ForkParserIntegrationTest#testAttachingADebuggerOnTheForkedParserShouldWork
176 // throw new IOException("Unexpected byte while waiting for start beacon: "+type);
93177 }
94178 }
95179 }
99183 output.writeByte(ForkServer.PING);
100184 output.flush();
101185 while (true) {
102 consumeErrorStream();
103186 int type = input.read();
104187 if (type == ForkServer.PING) {
105 consumeErrorStream();
106188 return true;
107189 } else {
108190 return false;
116198
117199 public synchronized Throwable call(String method, Object... args)
118200 throws IOException, TikaException {
201 filesProcessed++;
119202 List<ForkResource> r = new ArrayList<>(resources);
120203 output.writeByte(ForkServer.CALL);
121204 output.writeUTF(method);
123206 sendObject(args[i], r);
124207 }
125208 return waitForResponse(r);
209 }
210
211 public int getFilesProcessed() {
212 return filesProcessed;
126213 }
127214
128215 /**
139226 if (object instanceof InputStream) {
140227 resources.add(new InputStreamResource((InputStream) object));
141228 object = new InputStreamProxy(n);
142 } else if (object instanceof ContentHandler) {
229 } else if (object instanceof RecursiveParserWrapperHandler) {
230 resources.add(new RecursiveMetadataContentHandlerResource((RecursiveParserWrapperHandler) object));
231 object = new RecursiveMetadataContentHandlerProxy(n, ((RecursiveParserWrapperHandler)object).getContentHandlerFactory());
232 } else if (object instanceof ContentHandler
233 && ! (object instanceof AbstractRecursiveParserWrapperHandler)) {
143234 resources.add(new ContentHandlerResource((ContentHandler) object));
144235 object = new ContentHandlerProxy(n);
145236 } else if (object instanceof ClassLoader) {
148239 }
149240
150241 try {
151 ForkObjectInputStream.sendObject(object, output);
242 ForkObjectInputStream.sendObject(object, output);
152243 } catch(NotSerializableException nse) {
153244 // Build a more friendly error message for this
154245 throw new TikaException(
167258 if (input != null) {
168259 input.close();
169260 }
170 if (error != null) {
171 error.close();
172 }
173261 } catch (IOException ignore) {
174262 }
175263 if (process != null) {
176 process.destroy();
264 process.destroyForcibly();
177265 try {
178266 //TIKA-1933
179267 process.waitFor();
190278 throws IOException {
191279 output.flush();
192280 while (true) {
193 consumeErrorStream();
194281 int type = input.read();
195282 if (type == -1) {
196 consumeErrorStream();
197283 throw new IOException(
198284 "Lost connection to a forked server process");
199285 } else if (type == ForkServer.RESOURCE) {
215301 }
216302
217303 /**
218 * Consumes all pending bytes from the standard error stream of the
219 * forked server process, and prints them out to the standard error
220 * stream of this process. This method should be called always before
221 * expecting some output from the server, to prevent the server from
222 * blocking due to a filled up pipe buffer of the error stream.
223 *
224 * @throws IOException if the error stream could not be read
225 */
226 private void consumeErrorStream() throws IOException {
227 int n;
228 while ((n = error.available()) > 0) {
229 byte[] b = new byte[n];
230 n = error.read(b);
231 if (n > 0) {
232 System.err.write(b, 0, n);
233 }
234 }
235 }
236
237 /**
238304 * Creates a temporary jar file that can be used to bootstrap the forked
239305 * server process. Remember to remove the file when no longer used.
240306 *
275341 MemoryURLConnection.class,
276342 MemoryURLStreamHandler.class,
277343 MemoryURLStreamHandlerFactory.class,
278 MemoryURLStreamRecord.class
344 MemoryURLStreamRecord.class, TikaException.class
279345 };
280346 ClassLoader loader = ForkServer.class.getClassLoader();
281347 for (Class<?> klass : bootstrap) {
287353 }
288354 }
289355 }
356
357 public int getId() {
358 return id;
359 }
290360 }
1717
1818 import java.io.IOException;
1919 import java.io.InputStream;
20 import java.nio.file.Path;
2021 import java.util.ArrayList;
2122 import java.util.Arrays;
2223 import java.util.Collections;
2526 import java.util.Queue;
2627 import java.util.Set;
2728
29 import org.apache.tika.config.Field;
2830 import org.apache.tika.exception.TikaException;
2931 import org.apache.tika.metadata.Metadata;
3032 import org.apache.tika.mime.MediaType;
3234 import org.apache.tika.parser.AutoDetectParser;
3335 import org.apache.tika.parser.ParseContext;
3436 import org.apache.tika.parser.Parser;
37 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
3538 import org.apache.tika.sax.TeeContentHandler;
3639 import org.xml.sax.ContentHandler;
3740 import org.xml.sax.SAXException;
4144 /** Serial version UID */
4245 private static final long serialVersionUID = -4962742892274663950L;
4346
47 //these are used in legacy mode (bootstrap jar)
4448 private final ClassLoader loader;
45
4649 private final Parser parser;
50
51 //these are used when the server builds a parser via a directory
52 //of jars, not via legacy bootstrap etc.
53 private final Path tikaBin;
54 private final ParserFactoryFactory parserFactoryFactory;
4755
4856 /** Java command line */
4957 private List<String> java = Arrays.asList("java", "-Xmx32m");
5058
5159 /** Process pool size */
60 @Field
5261 private int poolSize = 5;
5362
5463 private int currentlyInUse = 0;
5564
5665 private final Queue<ForkClient> pool = new LinkedList<>();
5766
58 private long serverPulseMillis = 5000;
67 @Field
68 private long serverPulseMillis = 1000;
69
70 @Field
71 private long serverParseTimeoutMillis = 60000;
72
73 @Field
74 private long serverWaitTimeoutMillis = 60000;
75
76 @Field
77 private int maxFilesProcessedPerClient = -1;
78
79 /**
80 * Use this constructor if you have a directory containing, say, tika-app.jar and you want
81 * the child process/server to build a parser and run it from that directory -- keeping
82 * all of those parser dependencies out of your client code.
83 *
84 * @param tikaBin directory containing the tika-app.jar or similar -- full jar including tika-core and all
85 * desired parsers and dependencies
86 * @param factoryFactory factory to send to the child process to build the parser
87 */
88 public ForkParser(Path tikaBin, ParserFactoryFactory factoryFactory) {
89 loader = null;
90 parser = null;
91 this.tikaBin = tikaBin;
92 this.parserFactoryFactory = factoryFactory;
93 }
94
95 /**
96 * <b>EXPERT</b>
97 * @param tikaBin directory containing the tika-app.jar or similar -- full jar including tika-core and all
98 * desired parsers and dependencies
99 * @param parserFactoryFactory -- the factory to use to generate the parser factory in the child process/server
100 * @param classLoader to use for all classes besides the parser in the child process/server
101 */
102 public ForkParser(Path tikaBin, ParserFactoryFactory parserFactoryFactory, ClassLoader classLoader) {
103 parser = null;
104 loader = classLoader;
105 this.tikaBin = tikaBin;
106 this.parserFactoryFactory = parserFactoryFactory;
107 }
59108
60109 /**
61110 * @param loader The ClassLoader to use
65114 if (parser instanceof ForkParser) {
66115 throw new IllegalArgumentException("The underlying parser of a ForkParser should not be a ForkParser, but a specific implementation.");
67116 }
117 this.tikaBin = null;
118 this.parserFactoryFactory = null;
68119 this.loader = loader;
69120 this.parser = parser;
70121 }
124175
125176 /**
126177 * Sets the command used to start the forked server process.
127 * The arguments "-jar" and "/path/to/bootstrap.jar" are
178 * The arguments "-jar" and "/path/to/bootstrap.jar"
179 * or "-cp" and "/path/to/tika_bin" are
128180 * appended to the given command when starting the process.
129181 * The default setting is {"java", "-Xmx32m"}.
130182 * <p/>
154206 return parser.getSupportedTypes(context);
155207 }
156208
209 /**
210 *
211 * This sends the objects to the forked server for parsing, and, via the
212 * proxies, the server acts on the handler as if it were updating it directly.
213 * <p>
214 * If using a RecursiveParserWrapper, there are two options:
215 * </p>
216 * <p>
217 * <ol>
218 * <li>Send in a class that extends {@link org.apache.tika.sax.RecursiveParserWrapperHandler},
219 * and the server will proxy back the data as best it can[0].</li>
220 * <li>Send in a class that extends {@link AbstractRecursiveParserWrapperHandler}
221 * and the server will act on the class but not proxy back the data. This
222 * can be used, for example, if all you want to do is write to disk: extend
223 * {@link AbstractRecursiveParserWrapperHandler} to write to disk when
224 * {@link AbstractRecursiveParserWrapperHandler#endDocument(ContentHandler, Metadata)}
225 * is called, and the server will take care of the writing via the handler.</li>
226 * </ol>
227 * </p>
228 * <p>
229 * <b>NOTE:</b>[0] &quot;the server will proxy back the data as best it can&quot;. If the handler
230 * implements Serializable and is actually serializable, the server will send it and the
231 * {@link Metadata} back upon {@link org.apache.tika.sax.RecursiveParserWrapperHandler#endEmbeddedDocument(ContentHandler, Metadata)}
232 * or {@link org.apache.tika.sax.RecursiveParserWrapperHandler#endDocument(ContentHandler, Metadata)}.
233 * If the handler does not implement {@link java.io.Serializable} or if there is a
234 * {@link java.io.NotSerializableException} thrown during serialization, the server will
235 * call {@link ContentHandler#toString()} on the ContentHandler and set that value with the
236 * {@link org.apache.tika.sax.RecursiveParserWrapperHandler#TIKA_CONTENT} key and then
237 * serialize and proxy that data back.
238 * </p>
239 *
240 * @param stream the document stream (input)
241 * @param handler handler for the XHTML SAX events (output)
242 * @param metadata document metadata (input and output)
243 * @param context parse context
244 * @throws IOException
245 * @throws SAXException
246 * @throws TikaException
247 */
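
To make option 1 concrete, a minimal sketch; the input file is illustrative and the ForkParser is assumed to have been configured as above:

    import java.io.InputStream;
    import java.nio.file.Files;
    import java.nio.file.Paths;
    import java.util.List;

    import org.apache.tika.fork.ForkParser;
    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.sax.BasicContentHandlerFactory;
    import org.apache.tika.sax.RecursiveParserWrapperHandler;

    public class ForkRecursiveExample {
        public static List<Metadata> parseAll(ForkParser forkParser) throws Exception {
            //option 1: the server proxies data back into this handler
            RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
                    new BasicContentHandlerFactory(
                            BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
            try (InputStream stream =
                    Files.newInputStream(Paths.get("test_recursive_embedded.docx"))) {
                forkParser.parse(stream, handler, new Metadata(), new ParseContext());
            }
            //one Metadata per container/embedded document
            return handler.getMetadataList();
        }
    }
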
157248 public void parse(
158249 InputStream stream, ContentHandler handler,
159250 Metadata metadata, ParseContext context)
167258 boolean alive = false;
168259 ForkClient client = acquireClient();
169260 try {
170 ContentHandler tee = new TeeContentHandler(
261 ContentHandler tee = (handler instanceof AbstractRecursiveParserWrapperHandler) ? handler :
262 new TeeContentHandler(
171263 handler, new MetadataContentHandler(metadata));
264
172265 t = client.call("parse", stream, tee, metadata, context);
173266 alive = true;
174267 } catch (TikaException te) {
213306
214307 // Create a new process if there's room in the pool
215308 if (client == null && currentlyInUse < poolSize) {
216 client = new ForkClient(loader, parser, java, serverPulseMillis);
309 client = newClient();
217310 }
218311
219312 // Ping the process, and get rid of it if it's inactive
236329 }
237330 }
238331
332 private ForkClient newClient() throws IOException, TikaException {
333 TimeoutLimits timeoutLimits = new TimeoutLimits(serverPulseMillis, serverParseTimeoutMillis, serverWaitTimeoutMillis);
334 if (loader == null && parser == null && tikaBin != null && parserFactoryFactory != null) {
335 return new ForkClient(tikaBin, parserFactoryFactory, java, timeoutLimits);
336 } else if (loader != null && parser != null && tikaBin == null && parserFactoryFactory == null) {
337 return new ForkClient(loader, parser, java, timeoutLimits);
338 } else if (loader != null && parser == null && tikaBin != null && parserFactoryFactory != null) {
339 return new ForkClient(tikaBin, parserFactoryFactory, loader, java, timeoutLimits);
340 } else {
341 //TODO: make this more useful
342 throw new IllegalStateException("Unexpected combination of state items");
343 }
344 }
345
239346 private synchronized void releaseClient(ForkClient client, boolean alive) {
240347 currentlyInUse--;
241348 if (currentlyInUse + pool.size() < poolSize && alive) {
242 pool.offer(client);
349 if (maxFilesProcessedPerClient > 0 && client.getFilesProcessed() >= maxFilesProcessedPerClient) {
350 client.close();
351 } else {
352 pool.offer(client);
353 }
243354 notifyAll();
244355 } else {
245356 client.close();
248359
249360 /**
250361 * The amount of time in milliseconds that the server
251 * should wait for any input or output. If it receives no
252 * input or output in this amount of time, it will shutdown.
362 * should wait before checking to see if the parse has timed out
363 * or if the wait has timed out.
253364 * The default is 1 second (1000 milliseconds).
254365 *
255366 * @param serverPulseMillis milliseconds to sleep before checking if there has been any activity
258369 this.serverPulseMillis = serverPulseMillis;
259370 }
260371
372 /**
373 * The maximum amount of time allowed for the server to try to parse a file.
374 * If more than this time elapses, the server shuts down, and the ForkParser
375 * throws an exception.
376 *
377 * @param serverParseTimeoutMillis maximum time in milliseconds to allow a single parse before the server shuts down
378 */
379 public void setServerParseTimeoutMillis(long serverParseTimeoutMillis) {
380 this.serverParseTimeoutMillis = serverParseTimeoutMillis;
381 }
382
383 /**
384 * The maximum amount of time allowed for the server to wait for a new request to parse
385 * a file. The server will shut down after this amount of time, and a new server will have
386 * to be started by a new client.
387 * @param serverWaitTimeoutMillis maximum idle time in milliseconds to wait for the next parse request
388 */
389 public void setServerWaitTimeoutMillis(long serverWaitTimeoutMillis) {
390 this.serverWaitTimeoutMillis = serverWaitTimeoutMillis;
391 }
392
393 /**
394 * If there is a slowly building memory leak in one of the parsers,
395 * it is useful to set a limit on the number of files processed
396 * by a server before it is shut down and restarted. The default value is -1.
397 *
398 * @param maxFilesProcessedPerClient maximum number of files that a server can handle
399 * before the parser shuts down a client and creates
400 * a new process. If set to -1, the server is never restarted
401 * because of the number of files handled.
402 */
403 public void setMaxFilesProcessedPerServer(int maxFilesProcessedPerClient) {
404 this.maxFilesProcessedPerClient = maxFilesProcessedPerClient;
405 }
406
261407 }
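
A sketch of the robustness knobs added above, using the legacy ClassLoader+Parser constructor; the values are illustrative, not recommendations:

    import org.apache.tika.fork.ForkParser;
    import org.apache.tika.parser.AutoDetectParser;

    public class ForkParserConfigExample {
        public static ForkParser configure() {
            ForkParser forkParser = new ForkParser(
                    ForkParserConfigExample.class.getClassLoader(), new AutoDetectParser());
            forkParser.setServerPulseMillis(1000);          //check the timers once a second
            forkParser.setServerParseTimeoutMillis(60000);  //give up on a parse after a minute
            forkParser.setServerWaitTimeoutMillis(60000);   //let an idle child exit after a minute
            forkParser.setMaxFilesProcessedPerServer(1000); //restart children to contain slow leaks
            return forkParser;
        }
    }
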
1515 */
1616 package org.apache.tika.fork;
1717
18 import org.apache.tika.exception.TikaException;
19 import org.apache.tika.parser.ParserFactory;
20 import org.xml.sax.SAXException;
21
1822 import java.io.ByteArrayInputStream;
1923 import java.io.DataInputStream;
2024 import java.io.DataOutputStream;
2226 import java.io.InputStream;
2327 import java.io.NotSerializableException;
2428 import java.io.OutputStream;
29 import java.io.Serializable;
2530 import java.lang.reflect.InvocationTargetException;
2631 import java.lang.reflect.Method;
2732 import java.net.URL;
2934 import java.util.zip.CheckedOutputStream;
3035 import java.util.zip.Checksum;
3136
32 import org.apache.tika.exception.TikaException;
33
34 class ForkServer implements Runnable, Checksum {
37 class ForkServer implements Runnable {
3538
3639 public static final byte ERROR = -1;
3740
4447 public static final byte RESOURCE = 3;
4548
4649 public static final byte READY = 4;
50
51 public static final byte FAILED_TO_START = 5;
52
53 public static final byte INIT_PARSER_FACTORY_FACTORY = 6;
54 public static final byte INIT_LOADER_PARSER = 7;
55 public static final byte INIT_PARSER_FACTORY_FACTORY_LOADER = 8;
4756
4857 //milliseconds to sleep before checking to see if there has been any reading/writing
4958 //If no reading or writing in this time, shutdown the server.
5059 private long serverPulseMillis = 5000;
60 private long serverParserTimeoutMillis = 60000;
61 private long serverWaitTimeoutMillis = 60000;
62
63 private Object[] lock = new Object[0];
64
5165 /**
5266 * Starts a forked server process using the standard input and output
5367 * streams for communication with the parent process. Any attempts by
5872 * @throws Exception if the server could not be started
5973 */
6074 public static void main(String[] args) throws Exception {
61 long serverPulseMillis = -1;
62 if (args.length > 0) {
63 serverPulseMillis = Long.parseLong(args[0]);
64 }
75 long serverPulseMillis = Long.parseLong(args[0]);
76 long serverParseTimeoutMillis = Long.parseLong(args[1]);
77 long serverWaitTimeoutMillis = Long.parseLong(args[2]);
78
6579 URL.setURLStreamHandlerFactory(new MemoryURLStreamHandlerFactory());
6680
67 ForkServer server = new ForkServer(System.in, System.out, serverPulseMillis);
81 ForkServer server = new ForkServer(System.in, System.out,
82 serverPulseMillis, serverParseTimeoutMillis, serverWaitTimeoutMillis);
6883 System.setIn(new ByteArrayInputStream(new byte[0]));
6984 System.setOut(System.err);
7085
8297 private final DataOutputStream output;
8398
8499 private volatile boolean active = true;
100
101 //can't be class Parser because then you'd
102 //have to include that in bootstrap jar (legacy mode)
103 private Object parser;
104 private ClassLoader classLoader;
105
106 private boolean parsing = false;
107 private long since;
85108
86109 /**
87110 * Sets up a forked server instance using the given stdin/out
91114 * @param output output stream for writing to the parent process
92115 * @throws IOException if the server instance could not be created
93116 */
94 public ForkServer(InputStream input, OutputStream output, long serverPulseMillis)
117 public ForkServer(InputStream input, OutputStream output,
118 long serverPulseMillis, long serverParserTimeoutMillis, long serverWaitTimeoutMillis)
95119 throws IOException {
96120 this.input =
97 new DataInputStream(new CheckedInputStream(input, this));
121 new DataInputStream(input);
98122 this.output =
99 new DataOutputStream(new CheckedOutputStream(output, this));
123 new DataOutputStream(output);
100124 this.serverPulseMillis = serverPulseMillis;
125 this.serverParserTimeoutMillis = serverParserTimeoutMillis;
126 this.serverWaitTimeoutMillis = serverWaitTimeoutMillis;
127 this.parsing = false;
128 this.since = System.currentTimeMillis();
101129 }
102130
103131 public void run() {
104132 try {
105 while (active) {
106 active = false;
133 while (true) {
134 synchronized (lock) {
135 long elapsed = System.currentTimeMillis()-since;
136 if (parsing && elapsed > serverParserTimeoutMillis) {
137 break;
138 } else if (!parsing && serverWaitTimeoutMillis > 0 && elapsed > serverWaitTimeoutMillis) {
139 break;
140 }
141 }
107142 Thread.sleep(serverPulseMillis);
108143 }
109144 System.exit(0);
112147 }
113148
114149 public void processRequests() {
150 //initialize
115151 try {
116 output.writeByte(READY);
117 output.flush();
118
119 ClassLoader loader = (ClassLoader) readObject(
120 ForkServer.class.getClassLoader());
121 Thread.currentThread().setContextClassLoader(loader);
122
123 Object object = readObject(loader);
152 initializeParserAndLoader();
153 } catch (Throwable t) {
154 t.printStackTrace();
155 System.err.flush();
156 try {
157 output.writeByte(FAILED_TO_START);
158 output.flush();
159 } catch (IOException e) {
160 e.printStackTrace();
161 System.err.flush();
162 }
163 return;
164 }
165 //main loop
166 try {
124167 while (true) {
125168 int request = input.read();
126169 if (request == -1) {
128171 } else if (request == PING) {
129172 output.writeByte(PING);
130173 } else if (request == CALL) {
131 call(loader, object);
174 call(classLoader, parser);
132175 } else {
133176 throw new IllegalStateException("Unexpected request");
134177 }
140183 System.err.flush();
141184 }
142185
186 private void initializeParserAndLoader() throws IOException, ClassNotFoundException,
187 TikaException, SAXException {
188 output.writeByte(READY);
189 output.flush();
190
191 int configIndex = input.read();
192 if (configIndex == -1) {
193 throw new TikaException("EOF while reading the initialization byte; pipe closed?");
194 }
195
196 Object firstObject = readObject(
197 ForkServer.class.getClassLoader());
198 switch (configIndex) {
199 case INIT_PARSER_FACTORY_FACTORY:
200 if (firstObject instanceof ParserFactoryFactory) {
201 //the user has submitted a parser factory, but no class loader
202 classLoader = ForkServer.class.getClassLoader();
203 ParserFactory parserFactory = ((ParserFactoryFactory) firstObject).build();
204 parser = parserFactory.build();
205 } else {
206 throw new IllegalArgumentException("Expecting only one object of class ParserFactoryFactory");
207 }
208 break;
209 case INIT_LOADER_PARSER:
210 if (firstObject instanceof ClassLoader) {
211 classLoader = (ClassLoader) firstObject;
212 Thread.currentThread().setContextClassLoader(classLoader);
213 //parser from parent process
214 parser = readObject(classLoader);
215 } else {
216 throw new IllegalArgumentException("Expecting ClassLoader followed by a Parser");
217 }
218 break;
219 case INIT_PARSER_FACTORY_FACTORY_LOADER:
220 if (firstObject instanceof ParserFactoryFactory) {
221 //the user has submitted a parser factory and a class loader
222 ParserFactory parserFactory = ((ParserFactoryFactory) firstObject).build();
223 parser = parserFactory.build();
224 classLoader = (ClassLoader) readObject(ForkServer.class.getClassLoader());
225 Thread.currentThread().setContextClassLoader(classLoader);
226 } else {
227 throw new IllegalStateException("Expecting ParserFactoryFactory followed by a class loader");
228 }
229 break;
230 }
231 output.writeByte(READY);
232 output.flush();
233 }
234
143235 private void call(ClassLoader loader, Object object) throws Exception {
144 Method method = getMethod(object, input.readUTF());
145 Object[] args =
146 new Object[method.getParameterTypes().length];
147 for (int i = 0; i < args.length; i++) {
148 args[i] = readObject(loader);
236 synchronized (lock) {
237 parsing = true;
238 since = System.currentTimeMillis();
149239 }
150240 try {
151 method.invoke(object, args);
152 output.write(DONE);
153 } catch (InvocationTargetException e) {
154 output.write(ERROR);
155
156 // Try to send the underlying Exception itself
157 Throwable toSend = e.getCause();
241 Method method = getMethod(object, input.readUTF());
242 Object[] args =
243 new Object[method.getParameterTypes().length];
244 for (int i = 0; i < args.length; i++) {
245 args[i] = readObject(loader);
246 }
158247 try {
159 ForkObjectInputStream.sendObject(toSend, output);
160 } catch (NotSerializableException nse) {
161 // Need to build a serializable version of it
162 TikaException te = new TikaException( toSend.getMessage() );
163 te.setStackTrace( toSend.getStackTrace() );
164 ForkObjectInputStream.sendObject(te, output);
248 method.invoke(object, args);
249 output.write(DONE);
250 } catch (InvocationTargetException e) {
251 output.write(ERROR);
252 // Try to send the underlying Exception itself
253 Throwable toSend = e.getCause();
254 try {
255 ForkObjectInputStream.sendObject(toSend, output);
256 } catch (NotSerializableException nse) {
257 // Need to build a serializable version of it
258 TikaException te = new TikaException(toSend.getMessage());
259 te.setStackTrace(toSend.getStackTrace());
260 ForkObjectInputStream.sendObject(te, output);
261 }
262
263 }
264 } finally {
265 synchronized (lock) {
266 parsing = false;
267 since = System.currentTimeMillis();
165268 }
166269 }
167270 }
186289 * is expected to be preceded by a size integer that is used for reading
187290 * the entire serialization into memory before deserializing it.
188291 *
189 * @param input input stream from which the serialized object is read
190292 * @param loader class loader to be used for loading referenced classes
191293 * @throws IOException if the object could not be deserialized
192294 * @throws ClassNotFoundException if a referenced class is not found
204306
205307 return object;
206308 }
207
208 //------------------------------------------------------------< Checksum >
209
210 public void update(int b) {
211 active = true;
212 }
213
214 public void update(byte[] b, int off, int len) {
215 active = true;
216 }
217
218 public long getValue() {
219 return 0;
220 }
221
222 public void reset() {
223 }
224
225309 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.fork;
18
19
20
21 import org.apache.tika.exception.TikaException;
22 import org.apache.tika.parser.ParserFactory;
23
24 import java.io.Serializable;
25 import java.lang.reflect.Constructor;
26 import java.util.Map;
27
28 /**
29 * Lightweight, easily serializable class that contains enough information
30 * to build a {@link ParserFactory}. The named factory class must have a public constructor that accepts a <code>Map&lt;String, String&gt;</code>.
31 */
32 public class ParserFactoryFactory implements Serializable {
33
34 /** Serial version UID */
35 private static final long serialVersionUID = 4710974869988895410L;
36
37 private final String className;
38 private final Map<String, String> args;
39
40 public ParserFactoryFactory(String className, Map<String, String> args) {
41 this.className = className;
42 this.args = args;
43 }
44
45 public ParserFactory build() throws TikaException {
46 try {
47 Class<?> clazz = Class.forName(className);
48 Constructor<?> con = clazz.getConstructor(Map.class);
49 return (ParserFactory) con.newInstance(args);
50 } catch (ReflectiveOperationException|IllegalStateException e) {
51 throw new TikaException("Couldn't create factory", e);
52 }
53 }
54
55 }
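
Because build() looks up a constructor that takes a Map, any factory used with this class must expose such a constructor. A hypothetical factory (EmptyParser is tika-core's no-op parser):

    import java.util.Map;

    import org.apache.tika.parser.EmptyParser;
    import org.apache.tika.parser.Parser;
    import org.apache.tika.parser.ParserFactory;

    //hypothetical example; the Map constructor is required by ParserFactoryFactory.build()
    public class EmptyParserFactory extends ParserFactory {

        public EmptyParserFactory(Map<String, String> args) {
            super(args);
        }

        @Override
        public Parser build() {
            return new EmptyParser();
        }
    }

It would then be referenced by its fully qualified class name in the ParserFactoryFactory constructor.
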
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.fork;
17
18 import org.apache.tika.metadata.Metadata;
19 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
20 import org.apache.tika.sax.ContentHandlerFactory;
21 import org.apache.tika.sax.RecursiveParserWrapperHandler;
22 import org.xml.sax.ContentHandler;
23 import org.xml.sax.SAXException;
24
25 import java.io.ByteArrayOutputStream;
26 import java.io.DataInputStream;
27 import java.io.DataOutputStream;
28 import java.io.IOException;
29 import java.io.NotSerializableException;
30 import java.io.ObjectOutputStream;
31 import java.io.Serializable;
32
33 /**
34 * <p>If the ContentHandler cannot be serialized, this class calls #toString() on it,
35 * inserts that value into the Metadata object and serializes the Metadata object.
36 * </p>
37 * Ideally, this would serialize the ContentHandler and the Metadata object as separate objects,
38 * but we can't guarantee that the ContentHandler is Serializable (e.g. the StringWriter in
39 * the WriteOutContentHandler).
40 */
41 class RecursiveMetadataContentHandlerProxy extends RecursiveParserWrapperHandler implements ForkProxy {
42
43 public static final byte EMBEDDED_DOCUMENT = 1;
44 public static final byte MAIN_DOCUMENT = 2;
45 public static final byte HANDLER_AND_METADATA = 3;
46 public static final byte METADATA_ONLY = 4;
47 public static final byte COMPLETE = 5;
48
49 /** Serial version UID */
50 private static final long serialVersionUID = 737511106054617524L;
51
52 private final int resource;
53
54 private transient DataOutputStream output;
55
56 public RecursiveMetadataContentHandlerProxy(int resource, ContentHandlerFactory contentHandlerFactory) {
57 super(contentHandlerFactory);
58 this.resource = resource;
59 }
60
61 public void init(DataInputStream input, DataOutputStream output) {
62 this.output = output;
63 }
64
65 @Override
66 public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException {
67 proxyBackToClient(EMBEDDED_DOCUMENT, contentHandler, metadata);
68 }
69 @Override
70 public void endDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException {
71 if (hasHitMaximumEmbeddedResources()) {
72 metadata.set(EMBEDDED_RESOURCE_LIMIT_REACHED, "true");
73 }
74 proxyBackToClient(MAIN_DOCUMENT, contentHandler, metadata);
75 }
76
77 private void proxyBackToClient(int embeddedOrMainDocument,
78 ContentHandler contentHandler, Metadata metadata) throws SAXException {
79 try {
80 output.write(ForkServer.RESOURCE);
81 output.writeByte(resource);
82 output.writeByte(embeddedOrMainDocument);
83 boolean success = false;
84 if (contentHandler instanceof Serializable) {
85 byte[] bytes = null;
86 try {
87 bytes = serialize(contentHandler);
88 success = true;
89 } catch (NotSerializableException e) {
90 //the object claimed to be Serializable but was not
91 }
92 if (success) {
93
94 output.write(HANDLER_AND_METADATA);
95 sendBytes(bytes);
96 send(metadata);
97 output.writeByte(COMPLETE);
98 return;
99 }
100 }
101 //if contenthandler is not allegedly or actually Serializable
102 //fall back to adding contentHandler.toString() to the metadata object
103 //and send that.
104 metadata.set(RecursiveParserWrapperHandler.TIKA_CONTENT, contentHandler.toString());
105 output.writeByte(METADATA_ONLY);
106 send(metadata);
107 output.writeByte(COMPLETE);
108 } catch (IOException e) {
109 throw new SAXException(e);
110 } finally {
111 doneSending();
112 }
113 }
114
115 private void send(Object object) throws IOException {
116 byte[] bytes = serialize(object);
117 sendBytes(bytes);
118 }
119
120 private void sendBytes(byte[] bytes) throws IOException {
121 output.writeInt(bytes.length);
122 output.write(bytes);
123 output.flush();
124 }
125
126 private byte[] serialize(Object object) throws IOException {
127 //serialize manually: for reasons not yet understood, using
128 //ForkObjectInputStream here throws an IllegalAccessException,
129 //but this manual serialization does not
130 ByteArrayOutputStream bos = new ByteArrayOutputStream();
131 ObjectOutputStream oos = new ObjectOutputStream(bos);
132 oos.writeObject(object);
133 oos.flush();
134 oos.close();
135 return bos.toByteArray();
136
137 }
138
139 private void doneSending() throws SAXException {
140 try {
141 output.flush();
142 } catch (IOException e) {
143 throw new SAXException("Unexpected fork proxy problem", e);
144 }
145 }
146 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.fork;
17
18 import org.apache.tika.metadata.Metadata;
19 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
20 import org.apache.tika.sax.RecursiveParserWrapperHandler;
21 import org.xml.sax.ContentHandler;
22 import org.xml.sax.SAXException;
23 import org.xml.sax.helpers.DefaultHandler;
24
25 import java.io.ByteArrayInputStream;
26 import java.io.DataInputStream;
27 import java.io.DataOutputStream;
28 import java.io.IOException;
29 import java.io.ObjectInputStream;
30 import java.util.Arrays;
31
32 class RecursiveMetadataContentHandlerResource implements ForkResource {
33
34 private static final ContentHandler DEFAULT_HANDLER = new DefaultHandler();
35 private final AbstractRecursiveParserWrapperHandler handler;
36
37 public RecursiveMetadataContentHandlerResource(RecursiveParserWrapperHandler handler) {
38 this.handler = handler;
39 }
40
41 public Throwable process(DataInputStream input, DataOutputStream output)
42 throws IOException {
43 try {
44 internalProcess(input);
45 return null;
46 } catch (SAXException e) {
47 return e;
48 }
49 }
50
51 private void internalProcess(DataInputStream input)
52 throws IOException, SAXException {
53 byte embeddedOrMain = input.readByte();
54 byte handlerAndMetadataOrMetadataOnly = input.readByte();
55
56 ContentHandler localContentHandler = DEFAULT_HANDLER;
57 if (handlerAndMetadataOrMetadataOnly == RecursiveMetadataContentHandlerProxy.HANDLER_AND_METADATA) {
58 localContentHandler = (ContentHandler)readObject(input);
59 } else if (handlerAndMetadataOrMetadataOnly != RecursiveMetadataContentHandlerProxy.METADATA_ONLY) {
60 throw new IllegalArgumentException("Expected HANDLER_AND_METADATA or METADATA_ONLY, but got: "
61 +handlerAndMetadataOrMetadataOnly);
62 }
63
64 Metadata metadata = (Metadata) readObject(input);
65 if (embeddedOrMain == RecursiveMetadataContentHandlerProxy.EMBEDDED_DOCUMENT) {
66 handler.endEmbeddedDocument(localContentHandler, metadata);
67 } else if (embeddedOrMain == RecursiveMetadataContentHandlerProxy.MAIN_DOCUMENT) {
68 handler.endDocument(localContentHandler, metadata);
69 } else {
70 throw new IllegalArgumentException("Expected either 0x01 or 0x02, but got: "+embeddedOrMain);
71 }
72 byte isComplete = input.readByte();
73 if (isComplete != RecursiveMetadataContentHandlerProxy.COMPLETE) {
74 throw new IOException("Expected the 'complete' signal, but got: "+isComplete);
75 }
76 }
77
78 private Object readObject(DataInputStream inputStream) throws IOException {
79 try {
80 return ForkObjectInputStream.readObject(inputStream, this.getClass().getClassLoader());
81 } catch (ClassNotFoundException e) {
82 throw new IOException(e);
83 }
84
85 }
86 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.fork;
17
18 class TimeoutLimits {
19
20 private final long pulseMS;
21 private final long parseTimeoutMS;
22 private final long waitTimeoutMS;
23
24
25 TimeoutLimits(long pulseMS, long parseTimeoutMS, long waitTimeoutMS) {
26 this.pulseMS = pulseMS;
27 this.parseTimeoutMS = parseTimeoutMS;
28 this.waitTimeoutMS = waitTimeoutMS;
29 }
30
31 public long getPulseMS() {
32 return pulseMS;
33 }
34
35 public long getParseTimeoutMS() {
36 return parseTimeoutMS;
37 }
38
39 public long getWaitTimeoutMS() {
40 return waitTimeoutMS;
41 }
42 }
468468 */
469469 private Object openContainer;
470470
471 private int consecutiveEOFs = 0;
472
471473 /**
472474 * Creates a TikaInputStream instance. This private constructor is used
473475 * by the static factory methods based on the available information.
672674 super.reset();
673675 position = mark;
674676 mark = -1;
677 consecutiveEOFs = 0;
675678 }
676679
677680 @Override
689692 }
690693
691694 @Override
692 protected void afterRead(int n) {
695 protected void afterRead(int n) throws IOException {
693696 if (n != -1) {
694697 position += n;
698 } else {
699 consecutiveEOFs++;
700 if (consecutiveEOFs > 1000) {
701 throw new IOException("Read too many -1 (EOFs); there could be an infinite loop. " +
702 "If you think your file is not corrupt, please open an issue on Tika's JIRA");
703 }
695704 }
696705 }
697706
1919 final static String PREFIX = "database"+Metadata.NAMESPACE_PREFIX_DELIMITER;
2020
2121 Property TABLE_NAME = Property.externalTextBag(PREFIX+"table_name");
22 Property COLUMN_COUNT = Property.externalText(PREFIX+"column_count");
22 Property ROW_COUNT = Property.externalInteger(PREFIX+"row_count");
23 Property COLUMN_COUNT = Property.externalInteger(PREFIX+"column_count");
2324 Property COLUMN_NAME = Property.externalTextBag(PREFIX+"column_name");
24 }
25 }
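
A sketch of reading these properties from a parse result; the Metadata object is assumed to have been populated by a database-aware parser:

    import org.apache.tika.metadata.Database;
    import org.apache.tika.metadata.Metadata;

    public class DatabaseMetadataExample {
        public static void dump(Metadata metadata) {
            for (String table : metadata.getValues(Database.TABLE_NAME)) {
                System.out.println("table: " + table);
            }
            //ROW_COUNT is new, and COLUMN_COUNT is now typed as an integer property
            System.out.println("rows: " + metadata.get(Database.ROW_COUNT));
            System.out.println("columns: " + metadata.get(Database.COLUMN_COUNT));
        }
    }
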
2020 import java.util.Map;
2121 import java.util.SortedSet;
2222 import java.util.TreeSet;
23 import java.util.concurrent.ConcurrentHashMap;
2324
2425 /**
2526 * Registry of known Internet media types.
4546 * as a mapping from the alias to the corresponding canonical type.
4647 */
4748 private final Map<MediaType, MediaType> registry =
48 new HashMap<MediaType, MediaType>();
49 new ConcurrentHashMap<>();
4950
5051 /**
5152 * Known type inheritance relationships. The mapping is from a media type
7374 * @return known aliases
7475 */
7576 public SortedSet<MediaType> getAliases(MediaType type) {
76 SortedSet<MediaType> aliases = new TreeSet<MediaType>();
77 SortedSet<MediaType> aliases = new TreeSet<>();
7778 for (Map.Entry<MediaType, MediaType> entry : registry.entrySet()) {
7879 if (entry.getValue().equals(type) && !entry.getKey().equals(type)) {
7980 aliases.add(entry.getKey());
110110 private List<String> extensions = null;
111111
112112 /**
113 * Whether this mime-type is used for server-side scripts,
114 * and thus cannot reliably be used for filename-based type detection
115 */
116 private boolean isInterpreted = false;
117
118 /**
113119 * Creates a media type with the give name and containing media type
114120 * registry. The name is expected to be valid and normalized to lower
115121 * case. This constructor should only be called by
302308 }
303309
304310 /**
311 * Whether the type is used as a server-side scripting technology.
312 */
313 boolean isInterpreted() {
314 return isInterpreted;
315 }
316
317 void setInterpreted(boolean interpreted) {
318 isInterpreted = interpreted;
319 }
320
321 /**
305322 * Defines a RootXML description. RootXML is made of a localName and/or a
306323 * namespaceURI.
307324 */
2929 import java.util.List;
3030 import java.util.Locale;
3131 import java.util.Map;
32 import java.util.concurrent.ConcurrentHashMap;
3233
3334 import javax.xml.namespace.QName;
3435
101102 private final MediaTypeRegistry registry = new MediaTypeRegistry();
102103
103104 /** All the registered MimeTypes indexed on their canonical names */
104 private final Map<MediaType, MimeType> types =
105 new HashMap<MediaType, MimeType>();
105 private final Map<MediaType, MimeType> types = new HashMap<>();
106106
107107 /** The patterns matcher */
108108 private Patterns patterns = new Patterns(registry);
424424 *
425425 * @return the minimum length of data to provide.
426426 * @see #getMimeType(byte[])
427 * @see #getMimeType(String, byte[])
428427 */
429428 public int getMinLength() {
430429 // This needs to be reasonably large to be able to correctly detect
500499 String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
501500 if (resourceName != null) {
502501 String name = null;
502 boolean isHttp = false;
503503
504504 // Deal with a URI or a path name in as the resource name
505505 try {
506506 URI uri = new URI(resourceName);
507 String scheme = uri.getScheme();
508 isHttp = scheme != null && scheme.startsWith("http"); // http or https
507509 String path = uri.getPath();
508510 if (path != null) {
509511 int slash = path.lastIndexOf('/');
517519
518520 if (name != null) {
519521 MimeType hint = getMimeType(name);
520
521 // If we have some types based on mime magic, try to specialise
522 // and/or select the type based on that
523 // Otherwise, use the type identified from the name
524 possibleTypes = applyHint(possibleTypes, hint);
522
523 // For server-side scripting languages, we cannot rely on the filename to detect the mime type
524 if (!(isHttp && hint.isInterpreted())) {
525 // If we have some types based on mime magic, try to specialise
526 // and/or select the type based on that
527 // Otherwise, use the type identified from the name
528 possibleTypes = applyHint(possibleTypes, hint);
529 }
525530 }
526531 }
527532
3232 import java.util.ArrayList;
3333 import java.util.Collections;
3434 import java.util.List;
35
35 import java.util.concurrent.ArrayBlockingQueue;
36 import java.util.concurrent.TimeUnit;
37 import java.util.concurrent.locks.ReentrantReadWriteLock;
38
39 import org.apache.tika.exception.TikaException;
3640 import org.w3c.dom.Document;
3741 import org.xml.sax.Attributes;
3842 import org.xml.sax.InputSource;
98102 * @see <a href="http://freedesktop.org/wiki/Standards_2fshared_2dmime_2dinfo_2dspec">http://freedesktop.org/wiki/Standards_2fshared_2dmime_2dinfo_2dspec</a>
99103 */
100104 public class MimeTypesReader extends DefaultHandler implements MimeTypesReaderMetKeys {
101 protected final MimeTypes types;
105 /**
106 * Parser pool size
107 */
108 private static int POOL_SIZE = 10;
109
110 private static final ReentrantReadWriteLock READ_WRITE_LOCK = new ReentrantReadWriteLock();
111
112 private static ArrayBlockingQueue<SAXParser> SAX_PARSERS = new ArrayBlockingQueue<>(POOL_SIZE);
113
114 static {
115 try {
116 setPoolSize(POOL_SIZE);
117 } catch (TikaException e) {
118 throw new RuntimeException("problem initializing SAXParser pool", e);
119 }
120 }
120 protected final MimeTypes types;
102121
103122 /** Current type */
104123 protected MimeType type = null;
112131 }
113132
114133 public void read(InputStream stream) throws IOException, MimeTypeException {
115 try {
116 SAXParserFactory factory = SAXParserFactory.newInstance();
117 factory.setNamespaceAware(false);
118 factory.setFeature(
119 XMLConstants.FEATURE_SECURE_PROCESSING, true);
120 SAXParser parser = factory.newSAXParser();
134 SAXParser parser = null;
135 try {
136
137 parser = acquireSAXParser();
121138 parser.parse(stream, this);
122 } catch (ParserConfigurationException e) {
139 } catch (TikaException e) {
123140 throw new MimeTypeException("Unable to create an XML parser", e);
124141 } catch (SAXException e) {
125142 throw new MimeTypeException("Invalid type configuration", e);
143 } finally {
144 releaseParser(parser);
126145 }
127146 }
128147
129148 public void read(Document document) throws MimeTypeException {
130149 try {
131150 TransformerFactory factory = TransformerFactory.newInstance();
151 factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
132152 Transformer transformer = factory.newTransformer();
133153 transformer.transform(new DOMSource(document), new SAXResult(this));
134154 } catch (TransformerException e) {
148168 if (type == null) {
149169 if (MIME_TYPE_TAG.equals(qName)) {
150170 String name = attributes.getValue(MIME_TYPE_TYPE_ATTR);
171 String interpretedAttr = attributes.getValue(INTERPRETED_ATTR);
172 boolean interpreted = "true".equals(interpretedAttr);
151173 try {
152174 type = types.forName(name);
175 type.setInterpreted(interpreted);
153176 } catch (MimeTypeException e) {
154177 handleMimeError(name, e, qName, attributes);
155178 }
290313 }
291314
292315 }
293
316 /**
317 * Acquire a SAXParser from the pool, blocking until one becomes
318 * available. Make sure to {@link #releaseParser(SAXParser)} in
319 * a <code>finally</code> block every time you call this.
320 *
321 * @return a SAXParser
322 * @throws TikaException if interrupted while waiting for a parser
323 */
324 private static SAXParser acquireSAXParser()
325 throws TikaException {
326 while (true) {
327 SAXParser parser = null;
328 try {
329 READ_WRITE_LOCK.readLock().lock();
330 parser = SAX_PARSERS.poll(10, TimeUnit.MILLISECONDS);
331 } catch (InterruptedException e) {
332 throw new TikaException("interrupted while waiting for SAXParser", e);
333 } finally {
334 READ_WRITE_LOCK.readLock().unlock();
335
336 }
337 if (parser != null) {
338 return parser;
339 }
340 }
341 }
342
343 /**
344 * Return parser to the pool for reuse
345 *
346 * @param parser parser to return
347 */
348 private static void releaseParser(SAXParser parser) {
349 try {
350 parser.reset();
351 } catch (UnsupportedOperationException e) {
352 //ignore
353 }
354 try {
355 READ_WRITE_LOCK.readLock().lock();
356 //if there are extra parsers (e.g. after a reset of the pool to a smaller size),
357 // this parser will not be added and will then be gc'd
358 SAX_PARSERS.offer(parser);
359 } finally {
360 READ_WRITE_LOCK.readLock().unlock();
361 }
362 }
363
364 /**
365 * Set the pool size for cached XML parsers.
366 *
367 * @param poolSize number of SAXParsers to keep in the pool
368 */
369 public static void setPoolSize(int poolSize) throws TikaException {
370 try {
371 //stop the world with a write lock
372 //parsers that are currently in use will be offered, but not
373 //accepted and will be gc'd
374 READ_WRITE_LOCK.writeLock().lock();
375 SAX_PARSERS = new ArrayBlockingQueue<>(poolSize);
376 for (int i = 0; i < poolSize; i++) {
377 SAX_PARSERS.offer(newSAXParser());
378 }
379 POOL_SIZE = poolSize;
380 } finally {
381 READ_WRITE_LOCK.writeLock().unlock();
382 }
383 }
384
385 private static SAXParser newSAXParser() throws TikaException {
386 SAXParserFactory factory = SAXParserFactory.newInstance();
387 factory.setNamespaceAware(false);
388 try {
389 factory.setFeature(
390 XMLConstants.FEATURE_SECURE_PROCESSING, true);
391 return factory.newSAXParser();
392 } catch (ParserConfigurationException|SAXException e) {
393 throw new TikaException("problem creating SAX parser factory", e);
394 }
395 }
294396 }
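
The pool is static and shared by every MimeTypesReader instance. A sketch of resizing it (the size is illustrative):

    import org.apache.tika.exception.TikaException;
    import org.apache.tika.mime.MimeTypesReader;

    public class PoolSizeExample {
        public static void main(String[] args) throws TikaException {
            //replaces the queue under the write lock; parsers still checked out
            //will fail to re-enter the new pool and are garbage collected
            MimeTypesReader.setPoolSize(20);
        }
    }

Note the locking design: acquireSAXParser and releaseParser both take the read lock because they only poll/offer on the queue, while setPoolSize takes the write lock because it swaps out the queue itself.
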
2525 String MIME_TYPE_TAG = "mime-type";
2626
2727 String MIME_TYPE_TYPE_ATTR = "type";
28
29 String INTERPRETED_ATTR = "interpreted";
2830
2931 String ACRONYM_TAG = "acronym";
3032
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.parser;
18
19 import org.apache.tika.config.TikaConfig;
20 import org.apache.tika.exception.TikaException;
21 import org.xml.sax.SAXException;
22
23 import java.io.IOException;
24 import java.io.InputStream;
25 import java.nio.file.Files;
26 import java.nio.file.Paths;
27 import java.util.Map;
28
29 /**
30 * Factory for an AutoDetectParser
31 */
32 public class AutoDetectParserFactory extends ParserFactory {
33
34 /**
35 * Path to a tika-config file. This must be an actual
36 * file on disk or findable on the classpath.
37 */
38 public static final String TIKA_CONFIG_PATH = "tika_config_path";
39
40 public AutoDetectParserFactory(Map<String, String> args) {
41 super(args);
42 }
43
44 @Override
45 public Parser build() throws IOException, SAXException, TikaException {
46 String tikaConfigPath = args.remove(TIKA_CONFIG_PATH);
47 TikaConfig tikaConfig = null;
48 if (tikaConfigPath != null) {
49 if (Files.isReadable(Paths.get(tikaConfigPath))) {
50 tikaConfig = new TikaConfig(Paths.get(tikaConfigPath));
51 } else if (this.getClass().getResource(tikaConfigPath) != null) {
52 try (InputStream is = this.getClass().getResourceAsStream(tikaConfigPath)) {
53 tikaConfig = new TikaConfig(is);
54 }
55 }
56 }
57 if (tikaConfig == null) {
58 tikaConfig = TikaConfig.getDefaultConfig();
59 }
60 return new AutoDetectParser(tikaConfig);
61 }
62 }
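
A sketch of building a parser through this factory; the config path is hypothetical, and if it is neither readable on disk nor on the classpath, build() silently falls back to the default config:

    import java.util.HashMap;
    import java.util.Map;

    import org.apache.tika.parser.AutoDetectParserFactory;
    import org.apache.tika.parser.Parser;

    public class AutoDetectFactoryExample {
        public static Parser load() throws Exception {
            Map<String, String> args = new HashMap<>();
            args.put(AutoDetectParserFactory.TIKA_CONFIG_PATH, "/etc/tika/tika-config.xml");
            return new AutoDetectParserFactory(args).build();
        }
    }
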
3636 import org.apache.tika.sax.OfflineContentHandler;
3737 import org.apache.tika.sax.TaggedContentHandler;
3838 import org.apache.tika.sax.TeeContentHandler;
39 import org.apache.tika.utils.XMLReaderUtils;
3940 import org.xml.sax.Attributes;
4041 import org.xml.sax.ContentHandler;
4142 import org.xml.sax.SAXException;
4243 import org.xml.sax.helpers.DefaultHandler;
44
45 import javax.xml.parsers.SAXParser;
4346
4447 public class NetworkParser extends AbstractParser {
4548
124127 TaggedContentHandler tagged = new TaggedContentHandler(
125128 new OfflineContentHandler(handler));
126129 try {
127 context.getSAXParser().parse(
130 XMLReaderUtils.parseSAX(
128131 stream, new TeeContentHandler(
129 tagged, new MetaHandler(metadata)));
132 tagged, new MetaHandler(metadata)), context);
130133 } catch (SAXException e) {
131134 tagged.throwIfCauseOf(e);
132135 throw new TikaException(
2222 import javax.xml.parsers.SAXParser;
2323 import javax.xml.parsers.SAXParserFactory;
2424 import javax.xml.stream.XMLInputFactory;
25 import javax.xml.stream.XMLResolver;
26 import javax.xml.stream.XMLStreamException;
2725 import javax.xml.transform.Transformer;
28 import javax.xml.transform.TransformerConfigurationException;
29 import javax.xml.transform.TransformerFactory;
30 import javax.xml.transform.TransformerFactoryConfigurationError;
31
32 import java.io.IOException;
26
27 import java.io.InputStream;
3328 import java.io.Serializable;
3429 import java.util.HashMap;
3530 import java.util.Map;
3934 import org.xml.sax.SAXNotRecognizedException;
4035 import org.xml.sax.SAXNotSupportedException;
4136 import org.xml.sax.XMLReader;
37 import org.xml.sax.helpers.DefaultHandler;
4238
4339 /**
4440 * Parse context. Used to pass context information to Tika parsers.
120116 /**
121117 * Returns the SAX parser specified in this parsing context. If a parser
122118 * is not explicitly specified, then one is created using the specified
123 * or the default SAX parser factory.
119 * or the default SAX parser factory. Consider using
120 * {@link XMLReaderUtils#parseSAX(InputStream, DefaultHandler, ParseContext)}
121 * for more efficient reuse of SAXParsers.
124122 *
125123 * @see #getSAXParserFactory()
126124 * @since Apache Tika 0.8
194192 * instance is created and returned. The builder instance is
195193 * configured to apply an {@link XMLReaderUtils#IGNORING_SAX_ENTITY_RESOLVER},
196194 * and it sets the ErrorHandler to <code>null</code>.
195 * Consider using {@link XMLReaderUtils#buildDOM(InputStream, ParseContext)}
196 * instead for more efficient reuse of document builders.
197197 *
198198 * @since Apache Tika 1.13
199199 * @return DOM Builder
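
A minimal sketch of the pooled helpers referenced in these javadocs; the surrounding class is illustrative, and the throws clauses follow the method references above:

    import java.io.IOException;
    import java.io.InputStream;

    import org.apache.tika.exception.TikaException;
    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.utils.XMLReaderUtils;
    import org.w3c.dom.Document;
    import org.xml.sax.SAXException;
    import org.xml.sax.helpers.DefaultHandler;

    public class PooledXmlExample {

        //borrows a SAXParser from the shared pool and returns it when done
        public static void sax(InputStream stream, DefaultHandler handler, ParseContext context)
                throws TikaException, IOException, SAXException {
            XMLReaderUtils.parseSAX(stream, handler, context);
        }

        //borrows a pooled DocumentBuilder and returns it when done
        public static Document dom(InputStream stream, ParseContext context)
                throws TikaException, IOException, SAXException {
            return XMLReaderUtils.buildDOM(stream, context);
        }
    }
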
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.parser;
18
19
20 import org.apache.tika.exception.TikaException;
21 import org.xml.sax.SAXException;
22
23 import java.io.IOException;
24 import java.util.Map;
25
26 public abstract class ParserFactory {
27
28 final Map<String, String> args;
29
30 public ParserFactory(Map<String, String> args) {
31 this.args = args;
32 }
33
34 public abstract Parser build() throws IOException, SAXException, TikaException;
35
36 }
1616 * limitations under the License.
1717 */
1818
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.util.Date;
22 import java.util.LinkedList;
23 import java.util.List;
24 import java.util.Set;
25
2619 import org.apache.tika.exception.TikaException;
2720 import org.apache.tika.io.FilenameUtils;
2821 import org.apache.tika.metadata.Metadata;
2922 import org.apache.tika.metadata.Property;
3023 import org.apache.tika.metadata.TikaCoreProperties;
31 import org.apache.tika.metadata.TikaMetadataKeys;
3224 import org.apache.tika.mime.MediaType;
25 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
3326 import org.apache.tika.sax.ContentHandlerFactory;
27 import org.apache.tika.sax.RecursiveParserWrapperHandler;
3428 import org.apache.tika.utils.ExceptionUtils;
29 import org.apache.tika.utils.ParserUtils;
3530 import org.xml.sax.ContentHandler;
3631 import org.xml.sax.SAXException;
37 import org.xml.sax.helpers.DefaultHandler;
32
33 import java.io.IOException;
34 import java.io.InputStream;
35 import java.util.List;
36 import java.util.Set;
3837
3938 /**
4039 * This is a helper class that wraps a parser in a recursive handler.
7675 */
7776 private static final long serialVersionUID = 9086536568120690938L;
7877
79 //move this to TikaCoreProperties?
80 public final static Property TIKA_CONTENT = Property.internalText(TikaCoreProperties.TIKA_META_PREFIX+"content");
81 public final static Property PARSE_TIME_MILLIS = Property.internalText(TikaCoreProperties.TIKA_META_PREFIX + "parse_time_millis");
78 /**
79 * @deprecated use {@link org.apache.tika.sax.RecursiveParserWrapperHandler#TIKA_CONTENT}
80 */
81 @Deprecated
82 public final static Property TIKA_CONTENT = AbstractRecursiveParserWrapperHandler.TIKA_CONTENT;
83 /**
84 * @deprecated use {@link org.apache.tika.sax.RecursiveParserWrapperHandler#PARSE_TIME_MILLIS}
85 */
86 @Deprecated
87 public final static Property PARSE_TIME_MILLIS = AbstractRecursiveParserWrapperHandler.PARSE_TIME_MILLIS;
88
89 /**
90 * @deprecated use {@link org.apache.tika.sax.RecursiveParserWrapperHandler#EMBEDDED_EXCEPTION}
91 */
92 @Deprecated
8293 public final static Property WRITE_LIMIT_REACHED =
83 Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "write_limit_reached");
84 public final static Property EMBEDDED_RESOURCE_LIMIT_REACHED =
85 Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_resource_limit_reached");
86
87 public final static Property EMBEDDED_EXCEPTION =
88 Property.internalText(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_exception");
89 //move this to TikaCoreProperties?
90 public final static Property EMBEDDED_RESOURCE_PATH =
91 Property.internalText(TikaCoreProperties.TIKA_META_PREFIX+"embedded_resource_path");
92
93 private final Parser wrappedParser;
94 private final ContentHandlerFactory contentHandlerFactory;
95 private final List<Metadata> metadatas = new LinkedList<>();
94 AbstractRecursiveParserWrapperHandler.WRITE_LIMIT_REACHED;
95 /**
96 * @deprecated use {@link org.apache.tika.sax.RecursiveParserWrapperHandler#EMBEDDED_RESOURCE_LIMIT_REACHED}
97 */
98 @Deprecated
99 public final static Property EMBEDDED_RESOURCE_LIMIT_REACHED =
100 AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_LIMIT_REACHED;
101
102 /**
103 * @deprecated use {@link org.apache.tika.sax.RecursiveParserWrapperHandler#EMBEDDED_EXCEPTION}
104 */
105 @Deprecated
106 public final static Property EMBEDDED_EXCEPTION = AbstractRecursiveParserWrapperHandler.EMBEDDED_EXCEPTION;
107
108 /**
109 * @deprecated use {@link org.apache.tika.sax.RecursiveParserWrapperHandler#EMBEDDED_RESOURCE_PATH}
110 */
111 @Deprecated
112 public final static Property EMBEDDED_RESOURCE_PATH = AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH;
113
114 /**
115 * @deprecated this should be passed in via the {@link RecursiveParserWrapperHandler}
116 */
117 @Deprecated
118 private ContentHandlerFactory contentHandlerFactory = null;
96119
97120 private final boolean catchEmbeddedExceptions;
98121
99 //used in naming embedded resources that don't have a name.
100 private int unknownCount = 0;
122 /**
123 * Set this on the RecursiveParserWrapperHandler instead.
124 * @deprecated this is here only for legacy behavior; it will be removed in 2.0 and/or 1.20
125 */
126 @Deprecated
101127 private int maxEmbeddedResources = -1;
102 private boolean hitMaxEmbeddedResources = false;
103
128 /**
129 * @deprecated this is here only for legacy behavior; it will be removed in 2.0 and/or 1.20
130 */
131 @Deprecated
132 private ParserState lastParseState = null;
133
134 /**
135 * Initialize the wrapper with {@link #catchEmbeddedExceptions} set
136 * to <code>true</code> as default.
137 *
138 * @param wrappedParser parser to use for the container documents and the embedded documents
139 */
140 public RecursiveParserWrapper(Parser wrappedParser) {
141 this(wrappedParser, true);
142 }
143
144 /**
145 *
146 * @param wrappedParser parser to wrap
147 * @param catchEmbeddedExceptions whether or not to catch+record embedded exceptions.
148 * If set to <code>false</code>, embedded exceptions will be thrown and
149 * the rest of the file will not be parsed
150 */
151 public RecursiveParserWrapper(Parser wrappedParser, boolean catchEmbeddedExceptions) {
152 super(wrappedParser);
153 this.catchEmbeddedExceptions = catchEmbeddedExceptions;
154 }
104155 /**
105156 * Initialize the wrapper with {@link #catchEmbeddedExceptions} set
106157 * to <code>true</code> as default.
108159 * @param wrappedParser parser to use for the container documents and the embedded documents
109160 * @param contentHandlerFactory factory to use to generate a new content handler for
110161 * the container document and each embedded document
111 */
162 * @deprecated use {@link RecursiveParserWrapper#RecursiveParserWrapper(Parser)}
163 */
164 @Deprecated
112165 public RecursiveParserWrapper(Parser wrappedParser, ContentHandlerFactory contentHandlerFactory) {
113166 this(wrappedParser, contentHandlerFactory, true);
114167 }
122175 * @param catchEmbeddedExceptions whether or not to catch the embedded exceptions.
123176 * If set to <code>true</code>, the stack traces will be stored in
124177 * the metadata object with key: {@link #EMBEDDED_EXCEPTION}.
125 */
178 * @deprecated use {@link RecursiveParserWrapper#RecursiveParserWrapper(Parser, boolean)}
179 */
180 @Deprecated
126181 public RecursiveParserWrapper(Parser wrappedParser,
127182 ContentHandlerFactory contentHandlerFactory, boolean catchEmbeddedExceptions) {
128183 super(wrappedParser);
129 this.wrappedParser = wrappedParser;
130184 this.contentHandlerFactory = contentHandlerFactory;
131185 this.catchEmbeddedExceptions = catchEmbeddedExceptions;
132186 }
133187
134188 @Override
135189 public Set<MediaType> getSupportedTypes(ParseContext context) {
136 return wrappedParser.getSupportedTypes(context);
190 return getWrappedParser().getSupportedTypes(context);
137191 }
138192
139193 /**
146200 * Make sure to call {@link #reset()} after each parse.
147201 */
148202 @Override
149 public void parse(InputStream stream, ContentHandler ignore,
203 public void parse(InputStream stream, ContentHandler recursiveParserWrapperHandler,
150204 Metadata metadata, ParseContext context) throws IOException,
151205 SAXException, TikaException {
152
153 EmbeddedParserDecorator decorator = new EmbeddedParserDecorator("/");
206 //this tracks the state of the parent parser, per call to #parse
207 //in future versions, we can remove lastParseState, and this will be thread-safe
208 ParserState parserState;
209 if (recursiveParserWrapperHandler instanceof AbstractRecursiveParserWrapperHandler) {
210 parserState = new ParserState((AbstractRecursiveParserWrapperHandler)recursiveParserWrapperHandler);
211 } else {
212 parserState = new ParserState(new RecursiveParserWrapperHandler(contentHandlerFactory, maxEmbeddedResources));
213 lastParseState = parserState;
214 }
215 EmbeddedParserDecorator decorator = new EmbeddedParserDecorator(getWrappedParser(), "/", parserState);
154216 context.set(Parser.class, decorator);
155 ContentHandler localHandler = contentHandlerFactory.getNewContentHandler();
156 long started = new Date().getTime();
217 ContentHandler localHandler = parserState.recursiveParserWrapperHandler.getNewContentHandler();
218 long started = System.currentTimeMillis();
219 parserState.recursiveParserWrapperHandler.startDocument();
157220 try {
158 wrappedParser.parse(stream, localHandler, metadata, context);
221 getWrappedParser().parse(stream, localHandler, metadata, context);
159222 } catch (SAXException e) {
160223 boolean wlr = isWriteLimitReached(e);
161224 if (wlr == false) {
162225 throw e;
163226 }
164 metadata.set(WRITE_LIMIT_REACHED, "true");
227 metadata.set(RecursiveParserWrapperHandler.WRITE_LIMIT_REACHED, "true");
228 } catch (Throwable e) {
229 //try our best to record the problem in the metadata object
230 //then rethrow
231 String stackTrace = ExceptionUtils.getFilteredStackTrace(e);
232 metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX+"runtime", stackTrace);
233 throw e;
165234 } finally {
166 long elapsedMillis = new Date().getTime() - started;
167 metadata.set(PARSE_TIME_MILLIS, Long.toString(elapsedMillis));
168 addContent(localHandler, metadata);
169
170 if (hitMaxEmbeddedResources) {
171 metadata.set(EMBEDDED_RESOURCE_LIMIT_REACHED, "true");
172 }
173 metadatas.add(0, deepCopy(metadata));
174 }
175 }
176
177 /**
178 *
179 * The first element in the returned list represents the
235 long elapsedMillis = System.currentTimeMillis() - started;
236 metadata.set(RecursiveParserWrapperHandler.PARSE_TIME_MILLIS, Long.toString(elapsedMillis));
237 parserState.recursiveParserWrapperHandler.endDocument(localHandler, metadata);
238 parserState.recursiveParserWrapperHandler.endDocument();
239
240 }
241 }
242
243 /**
244 *
245 * The first element in the returned list represents the
180246 * data from the outer container file. There is no guarantee
181247 * about the ordering of the list after that.
182 *
248 *
249 * @deprecated use a {@link RecursiveParserWrapperHandler} instead
250 *
183251 * @return list of Metadata objects that were gathered during the parse
184 */
252 * @throws IllegalStateException if you've used a {@link RecursiveParserWrapperHandler} in your last
253 * call to {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)}
254 */
255 @Deprecated
185256 public List<Metadata> getMetadata() {
186 return metadatas;
187 }
188
257 if (lastParseState != null) {
258 return ((RecursiveParserWrapperHandler) lastParseState.recursiveParserWrapperHandler).getMetadataList();
259 } else {
260 throw new IllegalStateException("This is deprecated; please use a RecursiveParserWrapperHandler instead");
261 }
262 }
263
189264 /**
190265 * Set the maximum number of embedded resources to store.
191266 * If the max is hit during parsing, the {@link #EMBEDDED_RESOURCE_LIMIT_REACHED}
193268 *
194269 * <p>
195270 * If this value is < 0 (the default), the wrapper will store all Metadata.
196 *
271 * @deprecated set this on a {@link RecursiveParserWrapperHandler}
197272 * @param max maximum number of embedded resources to store
198273 */
274 @Deprecated
199275 public void setMaxEmbeddedResources(int max) {
200276 maxEmbeddedResources = max;
201277 }
202278
203279
204280 /**
205 * This clears the metadata list and resets {@link #unknownCount} and
206 * {@link #hitMaxEmbeddedResources}
207 */
281 * This clears the last parser state (metadata list, unknown count, hit embedded resource count)
282 *
283 * @deprecated use a {@link org.apache.tika.sax.RecursiveParserWrapperHandler} instead
284 * @throws IllegalStateException if you used a {@link RecursiveParserWrapperHandler} in your call
285 * to {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)}
286 */
287 @Deprecated
208288 public void reset() {
209 metadatas.clear();
210 unknownCount = 0;
211 hitMaxEmbeddedResources = false;
289 if (lastParseState != null) {
290 lastParseState = new ParserState(new RecursiveParserWrapperHandler(contentHandlerFactory, maxEmbeddedResources));
291 } else {
292 throw new IllegalStateException("This is deprecated; please use a RecursiveParserWrapperHandler instead");
293 }
212294 }
213295
214296 /**
219301 * @return
220302 */
221303 private boolean isWriteLimitReached(Throwable t) {
222 if (t.getMessage() != null &&
304 if (t.getMessage() != null &&
223305 t.getMessage().indexOf("Your document contained more than") == 0) {
224306 return true;
225307 } else {
226308 return t.getCause() != null && isWriteLimitReached(t.getCause());
227309 }
228310 }
229
230 //defensive copy
231 private Metadata deepCopy(Metadata m) {
232 Metadata clone = new Metadata();
233
234 for (String n : m.names()){
235 if (! m.isMultiValued(n)) {
236 clone.set(n, m.get(n));
237 } else {
238 String[] vals = m.getValues(n);
239 for (int i = 0; i < vals.length; i++) {
240 clone.add(n, vals[i]);
241 }
242 }
243 }
244 return clone;
245 }
246
247 private String getResourceName(Metadata metadata) {
311
312 private String getResourceName(Metadata metadata, ParserState state) {
248313 String objectName = "";
249 if (metadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY) != null) {
250 objectName = metadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY);
251 } else if (metadata.get(TikaMetadataKeys.EMBEDDED_RELATIONSHIP_ID) != null) {
252 objectName = metadata.get(TikaMetadataKeys.EMBEDDED_RELATIONSHIP_ID);
253 } else {
254 objectName = "embedded-" + (++unknownCount);
255 }
256 //make sure that there isn't any path info in the objectName
257 //some parsers can return paths, not just file names
258 objectName = FilenameUtils.getName(objectName);
259 return objectName;
260 }
261
262 private void addContent(ContentHandler handler, Metadata metadata) {
263
264 if (handler.getClass().equals(DefaultHandler.class)){
265 //no-op: we can't rely on just testing for
266 //empty content because DefaultHandler's toString()
267 //returns e.g. "org.xml.sax.helpers.DefaultHandler@6c8b1edd"
314 if (metadata.get(Metadata.RESOURCE_NAME_KEY) != null) {
315 objectName = metadata.get(Metadata.RESOURCE_NAME_KEY);
316 } else if (metadata.get(Metadata.EMBEDDED_RELATIONSHIP_ID) != null) {
317 objectName = metadata.get(Metadata.EMBEDDED_RELATIONSHIP_ID);
268318 } else {
269 String content = handler.toString();
270 if (content != null && content.trim().length() > 0 ) {
271 metadata.add(TIKA_CONTENT, content);
272 }
273 }
274
319 objectName = "embedded-" + (++state.unknownCount);
320 }
321 //make sure that there isn't any path info in the objectName
322 //some parsers can return paths, not just file names
323 objectName = FilenameUtils.getName(objectName);
324 return objectName;
275325 }
276326
277327
280330 private static final long serialVersionUID = 207648200464263337L;
281331
282332 private String location = null;
333 private final ParserState parserState;
283334
284335
285 private EmbeddedParserDecorator(String location) {
286 super(wrappedParser);
336 private EmbeddedParserDecorator(Parser parser, String location, ParserState parseState) {
337 super(parser);
287338 this.location = location;
288339 if (! this.location.endsWith("/")) {
289340 this.location += "/";
290341 }
342 this.parserState = parseState;
291343 }
292344
293345 @Override
295347 Metadata metadata, ParseContext context) throws IOException,
296348 SAXException, TikaException {
297349 //Test to see if we should avoid parsing
298 if (maxEmbeddedResources > -1 &&
299 metadatas.size() >= maxEmbeddedResources) {
300 hitMaxEmbeddedResources = true;
350 if (parserState.recursiveParserWrapperHandler.hasHitMaximumEmbeddedResources()) {
301351 return;
302352 }
303353 // Work out what this thing is
304 String objectName = getResourceName(metadata);
354 String objectName = getResourceName(metadata, parserState);
305355 String objectLocation = this.location + objectName;
306356
307 metadata.add(EMBEDDED_RESOURCE_PATH, objectLocation);
308
309 //ignore the content handler that is passed in
310 //and get a fresh handler
311 ContentHandler localHandler = contentHandlerFactory.getNewContentHandler();
312
357 metadata.add(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH, objectLocation);
358
359
360 //get a fresh handler
361 ContentHandler localHandler = parserState.recursiveParserWrapperHandler.getNewContentHandler();
362 parserState.recursiveParserWrapperHandler.startEmbeddedDocument(localHandler, metadata);
363
313364 Parser preContextParser = context.get(Parser.class);
314 context.set(Parser.class, new EmbeddedParserDecorator(objectLocation));
315 long started = new Date().getTime();
365 context.set(Parser.class, new EmbeddedParserDecorator(getWrappedParser(), objectLocation, parserState));
366 long started = System.currentTimeMillis();
316367 try {
317368 super.parse(stream, localHandler, metadata, context);
318369 } catch (SAXException e) {
321372 metadata.add(WRITE_LIMIT_REACHED, "true");
322373 } else {
323374 if (catchEmbeddedExceptions) {
324 String trace = ExceptionUtils.getStackTrace(e);
325 metadata.set(EMBEDDED_EXCEPTION, trace);
375 ParserUtils.recordParserFailure(this, e, metadata);
326376 } else {
327377 throw e;
328378 }
329379 }
330380 } catch (TikaException e) {
331381 if (catchEmbeddedExceptions) {
332 String trace = ExceptionUtils.getStackTrace(e);
333 metadata.set(EMBEDDED_EXCEPTION, trace);
382 ParserUtils.recordParserFailure(this, e, metadata);
334383 } else {
335384 throw e;
336385 }
337386 } finally {
338387 context.set(Parser.class, preContextParser);
339 long elapsedMillis = new Date().getTime() - started;
340 metadata.set(PARSE_TIME_MILLIS, Long.toString(elapsedMillis));
388 long elapsedMillis = System.currentTimeMillis() - started;
389 metadata.set(RecursiveParserWrapperHandler.PARSE_TIME_MILLIS, Long.toString(elapsedMillis));
390 parserState.recursiveParserWrapperHandler.endEmbeddedDocument(localHandler, metadata);
341391 }
342
343 //Because of recursion, we need
344 //to re-test to make sure that we limit the
345 //number of stored resources
346 if (maxEmbeddedResources > -1 &&
347 metadatas.size() >= maxEmbeddedResources) {
348 hitMaxEmbeddedResources = true;
349 return;
350 }
351 addContent(localHandler, metadata);
352 metadatas.add(deepCopy(metadata));
353 }
354 }
355
356
392 }
393 }
394
395 /**
396 * This tracks the state of the parse of a single document.
397 * In future versions, this will allow the RecursiveParserWrapper to be thread safe.
398 */
399 private class ParserState {
400 private int unknownCount = 0;
401 private final AbstractRecursiveParserWrapperHandler recursiveParserWrapperHandler;
402 private ParserState(AbstractRecursiveParserWrapperHandler handler) {
403 this.recursiveParserWrapperHandler = handler;
404 }
405
406
407 }
357408 }
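Taken together, the deprecations above point callers at a single pattern: construct the wrapper with only the parser, hand a RecursiveParserWrapperHandler to parse(), and read results from the handler rather than from the wrapper. A minimal sketch of that pattern, assuming tika-core 1.19 on the classpath (the file name and the limit of 100 embedded resources are illustrative):

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.RecursiveParserWrapperHandler;

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;

public class RecursiveParserWrapperExample {
    public static void main(String[] args) throws Exception {
        RecursiveParserWrapper wrapper =
                new RecursiveParserWrapper(new AutoDetectParser());
        // the handler, not the wrapper, now owns the ContentHandlerFactory
        // and the max-embedded-resources limit
        RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
                new BasicContentHandlerFactory(
                        BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
                100);
        try (InputStream is = Files.newInputStream(Paths.get("test.docx"))) {
            wrapper.parse(is, handler, new Metadata(), new ParseContext());
        }
        // the first element is the container document; embedded documents follow
        List<Metadata> metadataList = handler.getMetadataList();
        System.out.println("parsed " + metadataList.size() + " documents");
    }
}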
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax;
17
18 import org.apache.tika.metadata.Metadata;
19 import org.apache.tika.metadata.Property;
20 import org.apache.tika.metadata.TikaCoreProperties;
21 import org.apache.tika.utils.ParserUtils;
22 import org.xml.sax.ContentHandler;
23 import org.xml.sax.SAXException;
24 import org.xml.sax.helpers.DefaultHandler;
25
26 import java.io.OutputStream;
27 import java.io.Serializable;
28 import java.nio.charset.Charset;
29
30 /**
31 * This is a special handler to be used only with the {@link org.apache.tika.parser.RecursiveParserWrapper}.
32 * It allows for finer-grained processing of embedded documents than in the legacy handlers.
33 * Subclasses can choose how to process individual embedded documents.
34 */
35 public abstract class AbstractRecursiveParserWrapperHandler extends DefaultHandler implements Serializable {
36
37 public final static Property TIKA_CONTENT = Property.internalText(TikaCoreProperties.TIKA_META_PREFIX+"content");
38 public final static Property PARSE_TIME_MILLIS = Property.internalText(TikaCoreProperties.TIKA_META_PREFIX + "parse_time_millis");
39 public final static Property WRITE_LIMIT_REACHED =
40 Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "write_limit_reached");
41 public final static Property EMBEDDED_RESOURCE_LIMIT_REACHED =
42 Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_resource_limit_reached");
43
44 public final static Property EMBEDDED_EXCEPTION = ParserUtils.EMBEDDED_EXCEPTION;
45
46 public final static Property EMBEDDED_RESOURCE_PATH =
47 Property.internalText(TikaCoreProperties.TIKA_META_PREFIX+"embedded_resource_path");
48
49 private final ContentHandlerFactory contentHandlerFactory;
50 private final int maxEmbeddedResources;
51 private int embeddedResources = 0;
52
53 public AbstractRecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory) {
54 this(contentHandlerFactory, -1);
55 }
56
57 public AbstractRecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory, int maxEmbeddedResources) {
58 this.contentHandlerFactory = contentHandlerFactory;
59 this.maxEmbeddedResources = maxEmbeddedResources;
60 }
61
62 public ContentHandler getNewContentHandler() {
63 return contentHandlerFactory.getNewContentHandler();
64 }
65
66 public ContentHandler getNewContentHandler(OutputStream os, Charset charset) {
67 return contentHandlerFactory.getNewContentHandler(os, charset);
68 }
69
70 /**
71 * This is called before parsing each embedded document. Override this
72 * for custom behavior. Make sure to call this in your custom classes
73 * because this tracks the number of embedded documents.
74 *
75 * @param contentHandler local handler to be used on this embedded document
76 * @param metadata embedded document's metadata
77 */
78 public void startEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException {
79 embeddedResources++;
80 }
81 /**
82 * This is called after parsing each embedded document. Override this
83 * for custom behavior. This is currently a no-op.
84 *
85 * @param contentHandler content handler that was used on this embedded document
86 * @param metadata metadata for this embedded document
87 * @throws SAXException
88 */
89 public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException {
90 }
91
92 /**
93 * This is called after the full parse has completed. Override this
94 * for custom behavior. Make sure to call this as <code>super.endDocument(...)</code>
95 * in subclasses because this records in the metadata whether the
96 * embedded resource maximum was hit.
97 *
98 * @param contentHandler content handler that was used on the main document
99 * @param metadata metadata that was gathered for the main document
100 * @throws SAXException
101 */
102 public void endDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException {
103 if (hasHitMaximumEmbeddedResources()) {
104 metadata.set(EMBEDDED_RESOURCE_LIMIT_REACHED, "true");
105 }
106 }
107
108 /**
109 *
110 * @return whether this handler has hit the maximum embedded resources during the parse
111 */
112 public boolean hasHitMaximumEmbeddedResources() {
113 if (maxEmbeddedResources > -1 && embeddedResources >= maxEmbeddedResources) {
114 return true;
115 }
116 return false;
117 }
118
119 public ContentHandlerFactory getContentHandlerFactory() {
120 return contentHandlerFactory;
121 }
122 }
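The class javadoc above notes that subclasses choose how to process individual embedded documents; where caching every Metadata object is too memory-hungry, a subclass can act on each document as it completes instead. A hedged sketch under that assumption (the class name and the println reporting are illustrative, not Tika API):

import org.apache.tika.metadata.Metadata;
import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
import org.apache.tika.sax.ContentHandlerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

public class StreamingRecursiveHandler extends AbstractRecursiveParserWrapperHandler {

    public StreamingRecursiveHandler(ContentHandlerFactory factory) {
        super(factory);
    }

    @Override
    public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata)
            throws SAXException {
        super.endEmbeddedDocument(contentHandler, metadata);
        // for text-producing handlers, toString() holds the extracted content
        System.out.println(metadata.get(EMBEDDED_RESOURCE_PATH) + " -> "
                + contentHandler.toString().length() + " chars");
    }

    @Override
    public void endDocument(ContentHandler contentHandler, Metadata metadata)
            throws SAXException {
        super.endDocument(contentHandler, metadata); // records the limit flag
        System.out.println("container -> " + contentHandler.toString().length() + " chars");
    }
}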
1818 import java.io.OutputStream;
1919 import java.io.OutputStreamWriter;
2020 import java.io.UnsupportedEncodingException;
21 import java.nio.charset.Charset;
2122 import java.util.Locale;
2223
2324 import org.xml.sax.ContentHandler;
115116
116117 @Override
117118 public ContentHandler getNewContentHandler(OutputStream os, String encoding) throws UnsupportedEncodingException {
119 return getNewContentHandler(os, Charset.forName(encoding));
120 }
121
122 @Override
123 public ContentHandler getNewContentHandler(OutputStream os, Charset charset) {
118124
119125 if (type == HANDLER_TYPE.IGNORE) {
120126 return new DefaultHandler();
121127 }
128 try {
129 if (writeLimit > -1) {
130 switch (type) {
131 case BODY:
132 return new WriteOutContentHandler(
133 new BodyContentHandler(
134 new OutputStreamWriter(os, charset)), writeLimit);
135 case TEXT:
136 return new WriteOutContentHandler(new ToTextContentHandler(os, charset.name()), writeLimit);
137 case HTML:
138 return new WriteOutContentHandler(new ToHTMLContentHandler(os, charset.name()), writeLimit);
139 case XML:
140 return new WriteOutContentHandler(new ToXMLContentHandler(os, charset.name()), writeLimit);
141 default:
142 return new WriteOutContentHandler(new ToTextContentHandler(os, charset.name()), writeLimit);
143 }
144 } else {
145 switch (type) {
146 case BODY:
147 return new BodyContentHandler(new OutputStreamWriter(os, charset));
148 case TEXT:
149 return new ToTextContentHandler(os, charset.name());
150 case HTML:
151 return new ToHTMLContentHandler(os, charset.name());
152 case XML:
153 return new ToXMLContentHandler(os, charset.name());
154 default:
155 return new ToTextContentHandler(os, charset.name());
122156
123 if (writeLimit > -1) {
124 switch(type) {
125 case BODY:
126 return new WriteOutContentHandler(
127 new BodyContentHandler(
128 new OutputStreamWriter(os, encoding)), writeLimit);
129 case TEXT:
130 return new WriteOutContentHandler(new ToTextContentHandler(os, encoding), writeLimit);
131 case HTML:
132 return new WriteOutContentHandler(new ToHTMLContentHandler(os, encoding), writeLimit);
133 case XML:
134 return new WriteOutContentHandler(new ToXMLContentHandler(os, encoding), writeLimit);
135 default:
136 return new WriteOutContentHandler(new ToTextContentHandler(os, encoding), writeLimit);
157 }
137158 }
138 } else {
139 switch (type) {
140 case BODY:
141 return new BodyContentHandler(new OutputStreamWriter(os, encoding));
142 case TEXT:
143 return new ToTextContentHandler(os, encoding);
144 case HTML:
145 return new ToHTMLContentHandler(os, encoding);
146 case XML:
147 return new ToXMLContentHandler(os, encoding);
148 default:
149 return new ToTextContentHandler(os, encoding);
150
151 }
159 } catch (UnsupportedEncodingException e) {
160 throw new RuntimeException("couldn't find charset for name: "+charset);
152161 }
153162 }
154163
1919 import org.xml.sax.ContentHandler;
2020
2121 import java.io.OutputStream;
22 import java.io.Serializable;
2223 import java.io.UnsupportedEncodingException;
24 import java.nio.charset.Charset;
2325
2426 /**
2527 * Interface to allow easier injection of code for getting a new ContentHandler
2628 */
27 public interface ContentHandlerFactory {
29 public interface ContentHandlerFactory extends Serializable {
2830 public ContentHandler getNewContentHandler();
31 /**
32 * @deprecated use {@link #getNewContentHandler(OutputStream, Charset)}
33 */
34 @Deprecated
2935 public ContentHandler getNewContentHandler(OutputStream os, String encoding) throws UnsupportedEncodingException;
36 public ContentHandler getNewContentHandler(OutputStream os, Charset charset);
3037
3138 }
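With the interface now Serializable and the String-encoding overload deprecated, callers that stream to an OutputStream are expected to pass a Charset. A minimal sketch using BasicContentHandlerFactory (the output path is illustrative):

import org.apache.tika.sax.BasicContentHandlerFactory;
import org.xml.sax.ContentHandler;

import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;

public class CharsetHandlerExample {
    public static void main(String[] args) throws Exception {
        BasicContentHandlerFactory factory = new BasicContentHandlerFactory(
                BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1);
        try (OutputStream os = Files.newOutputStream(Paths.get("out.html"))) {
            // no UnsupportedEncodingException to handle on this overload
            ContentHandler handler = factory.getNewContentHandler(os, StandardCharsets.UTF_8);
            // hand "handler" to a Parser#parse(...) call
        }
    }
}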
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.sax;
17
18 import org.apache.tika.metadata.Metadata;
19 import org.apache.tika.utils.ParserUtils;
20 import org.xml.sax.ContentHandler;
21 import org.xml.sax.SAXException;
22 import org.xml.sax.helpers.DefaultHandler;
23
24 import java.util.LinkedList;
25 import java.util.List;
26
27 /**
28 * This is the default implementation of {@link AbstractRecursiveParserWrapperHandler}.
29 * See its documentation for more details.
30 *
31 * This caches a metadata object for each embedded file and for the container file.
32 * It places the extracted content in the metadata object, with this key: {@link AbstractRecursiveParserWrapperHandler#TIKA_CONTENT}
33 * If memory is a concern, subclass AbstractRecursiveParserWrapperHandler to handle each
34 * embedded document.
35 * <p>
36 * <b>NOTE: This handler must only be used with the {@link org.apache.tika.parser.RecursiveParserWrapper}</b>
37 * </p>
38 */
39 public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrapperHandler {
40
41 protected final List<Metadata> metadataList = new LinkedList<>();
42
43 /**
44 * Create a handler with no limit on the number of embedded resources
45 */
46 public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory) {
47 super(contentHandlerFactory);
48 }
49
50 /**
51 * Create a handler that limits the number of embedded resources that will be
52 * parsed
53 * @param maxEmbeddedResources number of embedded resources that will be parsed
54 */
55 public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory, int maxEmbeddedResources) {
56 super(contentHandlerFactory, maxEmbeddedResources);
57 }
58
59 /**
60 * This is called before parsing an embedded document
61 *
62 * @param contentHandler - local content handler to use on the embedded document
63 * @param metadata metadata to use for the embedded document
64 * @throws SAXException
65 */
66 @Override
67 public void startEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException {
68 super.startEmbeddedDocument(contentHandler, metadata);
69 }
70
71 /**
72 * This is called after parsing an embedded document.
73 * @param contentHandler local contenthandler used on the embedded document
74 * @param metadata metadata from the embedded document
75 * @throws SAXException
76 */
77 @Override
78 public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException {
79 super.endEmbeddedDocument(contentHandler, metadata);
80 addContent(contentHandler, metadata);
81 metadataList.add(ParserUtils.cloneMetadata(metadata));
82 }
83
84 /**
85 *
86 * @param contentHandler content handler used on the main document
87 * @param metadata metadata from the main document
88 * @throws SAXException
89 */
90 @Override
91 public void endDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException {
92 super.endDocument(contentHandler, metadata);
93 addContent(contentHandler, metadata);
94
95 metadataList.add(0, ParserUtils.cloneMetadata(metadata));
96 }
97
98 /**
99 *
100 * @return a list of Metadata objects, one for the main document and one for each embedded document
101 */
102 public List<Metadata> getMetadataList() {
103 return metadataList;
104 }
105
106 void addContent(ContentHandler handler, Metadata metadata) {
107
108 if (handler.getClass().equals(DefaultHandler.class)){
109 //no-op: we can't rely on just testing for
110 //empty content because DefaultHandler's toString()
111 //returns e.g. "org.xml.sax.helpers.DefaultHandler@6c8b1edd"
112 } else {
113 String content = handler.toString();
114 if (content != null && content.trim().length() > 0 ) {
115 metadata.add(TIKA_CONTENT, content);
116 }
117 }
118 }
119 }
3737 */
3838 public TaggedSAXException(SAXException original, Object tag) {
3939 super(original.getMessage(), original);
40 initCause(original); // SAXException has it's own chaining mechanism!
4140 this.tag = tag;
4241 }
4342
137137
138138 // Call directly, so we don't go through our startElement(), which will
139139 // ignore these elements.
140 super.startElement(XHTML, "html", "html", EMPTY_ATTRIBUTES);
140 AttributesImpl htmlAttrs = new AttributesImpl();
141 String lang = metadata.get(Metadata.CONTENT_LANGUAGE);
142 if (lang != null) {
143 htmlAttrs.addAttribute("", "lang", "lang", "CDATA", lang);
144 }
145 super.startElement(XHTML, "html", "html", htmlAttrs);
141146 newline();
142147 super.startElement(XHTML, "head", "head", EMPTY_ATTRIBUTES);
143148 newline();
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.utils;
17
18
19 import org.apache.tika.metadata.Metadata;
20 import org.apache.tika.metadata.Property;
21 import org.apache.tika.metadata.TikaCoreProperties;
22 import org.apache.tika.parser.Parser;
23 import org.apache.tika.parser.ParserDecorator;
24
25 /**
26 * Helper util methods for Parsers themselves.
27 */
28 public class ParserUtils {
29 public final static String X_PARSED_BY = "X-Parsed-By";
30 public final static Property EMBEDDED_PARSER =
31 Property.internalText(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_parser");
32 public final static Property EMBEDDED_EXCEPTION =
33 Property.internalText(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_exception");
34
35 /**
36 * Does a deep clone of a Metadata object.
37 */
38 public static Metadata cloneMetadata(Metadata m) {
39 Metadata clone = new Metadata();
40
41 for (String n : m.names()){
42 if (! m.isMultiValued(n)) {
43 clone.set(n, m.get(n));
44 } else {
45 String[] vals = m.getValues(n);
46 for (int i = 0; i < vals.length; i++) {
47 clone.add(n, vals[i]);
48 }
49 }
50 }
51 return clone;
52 }
53
54 /**
55 * Identifies the real class name of the {@link Parser}, unwrapping
56 * any {@link ParserDecorator} decorations on top of it.
57 */
58 public static String getParserClassname(Parser parser) {
59 if (parser instanceof ParserDecorator){
60 return ((ParserDecorator) parser).getWrappedParser().getClass().getName();
61 } else {
62 return parser.getClass().getName();
63 }
64 }
65
66 /**
67 * Records details of the {@link Parser} used to the {@link Metadata},
68 * typically wanted where multiple parsers could be picked between
69 * or used.
70 */
71 public static void recordParserDetails(Parser parser, Metadata metadata) {
72 metadata.add(X_PARSED_BY, getParserClassname(parser));
73 }
74
75 /**
76 * Records details of a {@link Parser}'s failure to the
77 * {@link Metadata}, so you can check what went wrong even if the
78 * {@link Exception} wasn't immediately thrown (eg when several different
79 * Parsers are used)
80 */
81 public static void recordParserFailure(Parser parser, Throwable failure,
82 Metadata metadata) {
83 String trace = ExceptionUtils.getStackTrace(failure);
84 metadata.add(EMBEDDED_EXCEPTION, trace);
85 metadata.add(EMBEDDED_PARSER, getParserClassname(parser));
86 }
87
88 }
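A minimal sketch of the two utilities most callers will touch, recordParserDetails and cloneMetadata (the choice of AutoDetectParser is illustrative):

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.utils.ParserUtils;

public class ParserUtilsExample {
    public static void main(String[] args) {
        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        // records the real (undecorated) parser class under "X-Parsed-By"
        ParserUtils.recordParserDetails(parser, metadata);
        // defensive deep copy, e.g. before caching the metadata in a list
        Metadata snapshot = ParserUtils.cloneMetadata(metadata);
        System.out.println(snapshot.get(ParserUtils.X_PARSED_BY));
    }
}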
0 /**
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 * <p>
8 * http://www.apache.org/licenses/LICENSE-2.0
9 * <p>
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.utils;
17
18
19 public class ProcessUtils {
20
21 /**
22 * This puts double-quotes around an argument when ProcessBuilder
23 * would otherwise mishandle it (as it does for paths with
24 * spaces on Windows).
25 *
26 * @param arg argument to escape; may be <code>null</code>
27 * @return the escaped argument
28 */
29 public static String escapeCommandLine(String arg) {
30 if (arg == null) {
31 return arg;
32 }
33 //need to test for " " on windows, can't just add double quotes
34 //across platforms.
35 if (arg.contains(" ") && SystemUtils.IS_OS_WINDOWS) {
36 arg = "\"" + arg + "\"";
37 }
38 return arg;
39 }
40 }
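A minimal sketch of escapeCommandLine in use; the Tesseract path is purely illustrative:

import org.apache.tika.utils.ProcessUtils;

public class EscapeExample {
    public static void main(String[] args) {
        // quoted on Windows because of the space; returned unchanged elsewhere
        String exe = ProcessUtils.escapeCommandLine(
                "C:\\Program Files\\Tesseract-OCR\\tesseract.exe");
        System.out.println(exe);
        // e.g. new ProcessBuilder(exe, "--version").start();
    }
}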
0 /**
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.utils;
17
18 /**
19 * Copied from commons-lang to avoid requiring the dependency
20 */
21 public class SystemUtils {
22
23 private static final String OS_NAME_WINDOWS_PREFIX = "Windows";
24 public static final String OS_NAME = getSystemProperty("os.name");
25 public static final String OS_VERSION = getSystemProperty("os.version");
26 public static final boolean IS_OS_AIX = getOSMatchesName("AIX");
27 public static final boolean IS_OS_HP_UX = getOSMatchesName("HP-UX");
28 public static final boolean IS_OS_IRIX = getOSMatchesName("Irix");
29 public static final boolean IS_OS_LINUX = getOSMatchesName("Linux") || getOSMatchesName("LINUX");
30 public static final boolean IS_OS_MAC = getOSMatchesName("Mac");
31 public static final boolean IS_OS_MAC_OSX = getOSMatchesName("Mac OS X");
32 public static final boolean IS_OS_OS2 = getOSMatchesName("OS/2");
33 public static final boolean IS_OS_SOLARIS = getOSMatchesName("Solaris");
34 public static final boolean IS_OS_SUN_OS = getOSMatchesName("SunOS");
35 public static final boolean IS_OS_UNIX;
36 public static final boolean IS_OS_WINDOWS;
37
38
39 private static String getSystemProperty(String property) {
40 try {
41 return System.getProperty(property);
42 } catch (SecurityException var2) {
43 return null;
44 }
45 }
46
47 private static boolean getOSMatchesName(String osNamePrefix) {
48 return isOSNameMatch(OS_NAME, osNamePrefix);
49 }
50
51 static boolean isOSNameMatch(String osName, String osNamePrefix) {
52 return osName == null ? false : osName.startsWith(osNamePrefix);
53 }
54
55 static {
56 IS_OS_UNIX = IS_OS_AIX || IS_OS_HP_UX || IS_OS_IRIX || IS_OS_LINUX || IS_OS_MAC_OSX || IS_OS_SOLARIS || IS_OS_SUN_OS;
57 IS_OS_WINDOWS = getOSMatchesName(OS_NAME_WINDOWS_PREFIX);
58 }
59
60 }
1717 package org.apache.tika.utils;
1818
1919 import org.apache.tika.exception.TikaException;
20 import org.apache.tika.parser.ParseContext;
21 import org.w3c.dom.Document;
2022 import org.xml.sax.EntityResolver;
2123 import org.xml.sax.InputSource;
2224 import org.xml.sax.SAXException;
2325 import org.xml.sax.SAXNotRecognizedException;
2426 import org.xml.sax.SAXNotSupportedException;
2527 import org.xml.sax.XMLReader;
28 import org.xml.sax.helpers.DefaultHandler;
2629
2730 import javax.xml.XMLConstants;
2831 import javax.xml.parsers.DocumentBuilder;
3740 import javax.xml.transform.TransformerConfigurationException;
3841 import javax.xml.transform.TransformerFactory;
3942 import javax.xml.transform.TransformerFactoryConfigurationError;
40
4143 import java.io.IOException;
44 import java.io.InputStream;
45 import java.io.Serializable;
4246 import java.io.StringReader;
47 import java.lang.reflect.Method;
48 import java.util.Properties;
49 import java.util.concurrent.ArrayBlockingQueue;
50 import java.util.concurrent.TimeUnit;
51 import java.util.concurrent.locks.ReentrantReadWriteLock;
4352 import java.util.logging.Level;
4453 import java.util.logging.Logger;
4554
4857 * to use the {@link org.apache.tika.sax.OfflineContentHandler} to guard against
4958 * XML External Entity attacks.
5059 */
51 public class XMLReaderUtils {
60 public class XMLReaderUtils implements Serializable {
61
62 /**
63 * Serial version UID
64 */
65 private static final long serialVersionUID = 6110455808615143122L;
66
5267 private static final Logger LOG = Logger.getLogger(XMLReaderUtils.class.getName());
68
69 /**
70 * Parser pool size
71 */
72 private static int POOL_SIZE = 10;
73
74 private static long LAST_LOG = -1;
75
76 private static final String JAXP_ENTITY_EXPANSION_LIMIT_KEY = "jdk.xml.entityExpansionLimit";
77 private static final int DEFAULT_MAX_ENTITY_EXPANSIONS = 20;
78
79 private static int MAX_ENTITY_EXPANSIONS = determineMaxEntityExpansions();
80
81 private static int determineMaxEntityExpansions() {
82 Properties properties = System.getProperties();
83 if (properties != null && properties.containsKey(JAXP_ENTITY_EXPANSION_LIMIT_KEY)) {
84 try {
85 return Integer.parseInt(properties.getProperty(JAXP_ENTITY_EXPANSION_LIMIT_KEY));
86 } catch (NumberFormatException e) {
87 LOG.log(Level.WARNING, "Couldn't parse an integer for the entity expansion limit:"+
88 properties.getProperty(JAXP_ENTITY_EXPANSION_LIMIT_KEY)+
89 "; backing off to default: "+DEFAULT_MAX_ENTITY_EXPANSIONS);
90 }
91 }
92 return DEFAULT_MAX_ENTITY_EXPANSIONS;
93 }
94
95 //TODO: figure out if the rw lock is any better than a simple lock
96 private static final ReentrantReadWriteLock SAX_READ_WRITE_LOCK = new ReentrantReadWriteLock();
97 private static final ReentrantReadWriteLock DOM_READ_WRITE_LOCK = new ReentrantReadWriteLock();
98
99 private static ArrayBlockingQueue<SAXParser> SAX_PARSERS = new ArrayBlockingQueue<>(POOL_SIZE);
100 private static ArrayBlockingQueue<DocumentBuilder> DOM_BUILDERS = new ArrayBlockingQueue<>(POOL_SIZE);
101
102 static {
103 try {
104 setPoolSize(POOL_SIZE);
105 } catch (TikaException e) {
106 throw new RuntimeException("problem initializing SAXParser and DOMBuilder pools", e);
107 }
108 }
109
53110
54111 private static final EntityResolver IGNORING_SAX_ENTITY_RESOLVER = new EntityResolver() {
55112 public InputSource resolveEntity(String publicId, String systemId) throws SAXException, IOException {
67124 };
68125
69126 /**
127 * Set the maximum number of entity expansions allowable in SAX/DOM/StAX parsing.
128 * <b>NOTE:</b>A value less than or equal to zero indicates no limit.
129 * This will override the system property {@link #JAXP_ENTITY_EXPANSION_LIMIT_KEY}
130 * and the {@link #DEFAULT_MAX_ENTITY_EXPANSIONS} value for pa
131 *
132 * @param maxEntityExpansions -- maximum number of allowable entity expansions
133 * @since Apache Tika 1.19
134 */
135 public static void setMaxEntityExpansions(int maxEntityExpansions) {
136 MAX_ENTITY_EXPANSIONS = maxEntityExpansions;
137 }
138
139 /**
70140 * Returns the XMLReader specified in this parsing context. If a reader
71141 * is not explicitly specified, then one is created using the specified
72142 * or the default SAX parser.
73143 *
144 * @return XMLReader
145 * @throws TikaException
74146 * @see #getSAXParser()
75147 * @since Apache Tika 1.13
76 * @return XMLReader
77 * @throws TikaException
78148 */
79149 public static XMLReader getXMLReader() throws TikaException {
80150 XMLReader reader;
95165 * Make sure to wrap your handler in the {@link org.apache.tika.sax.OfflineContentHandler} to
96166 * prevent XML External Entity attacks
97167 * </p>
98
99 *
168 *
169 * @return SAX parser
170 * @throws TikaException if a SAX parser could not be created
100171 * @see #getSAXParserFactory()
101172 * @since Apache Tika 0.8
102 * @return SAX parser
103 * @throws TikaException if a SAX parser could not be created
104173 */
105174 public static SAXParser getSAXParser() throws TikaException {
106175 try {
107 return getSAXParserFactory().newSAXParser();
176 SAXParser parser = getSAXParserFactory().newSAXParser();
177 trySetXercesSecurityManager(parser);
178 return parser;
108179 } catch (ParserConfigurationException e) {
109180 throw new TikaException("Unable to configure a SAX parser", e);
110181 } catch (SAXException e) {
123194 * prevent XML External Entity attacks
124195 * </p>
125196 *
197 * @return SAX parser factory
126198 * @since Apache Tika 0.8
127 * @return SAX parser factory
128199 */
129200 public static SAXParserFactory getSAXParserFactory() {
130201 SAXParserFactory factory = SAXParserFactory.newInstance();
153224 * configured to be namespace-aware and to apply reasonable security
154225 * features.
155226 *
227 * @return DOM parser factory
156228 * @since Apache Tika 1.13
157 * @return DOM parser factory
158229 */
159230 public static DocumentBuilderFactory getDocumentBuilderFactory() {
160231 //borrowed from Apache POI
168239 trySetSAXFeature(factory, "http://xml.org/sax/features/external-parameter-entities", false);
169240 trySetSAXFeature(factory, "http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
170241 trySetSAXFeature(factory, "http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
242 trySetXercesSecurityManager(factory);
171243 return factory;
172244 }
173245
178250 * configured to apply an {@link #IGNORING_SAX_ENTITY_RESOLVER},
179251 * and it sets the ErrorHandler to <code>null</code>.
180252 *
253 * @return DOM Builder
181254 * @since Apache Tika 1.13
182 * @return DOM Builder
183255 */
184256 public static DocumentBuilder getDocumentBuilder() throws TikaException {
185257 try {
200272 * configured to be namespace-aware and to apply reasonable security
201273 * using the {@link #IGNORING_STAX_ENTITY_RESOLVER}.
202274 *
275 * @return StAX input factory
203276 * @since Apache Tika 1.13
204 * @return StAX input factory
205277 */
206278 public static XMLInputFactory getXMLInputFactory() {
207279 XMLInputFactory factory = XMLInputFactory.newFactory();
210282 tryToSetStaxProperty(factory, XMLInputFactory.IS_VALIDATING, false);
211283
212284 factory.setXMLResolver(IGNORING_STAX_ENTITY_RESOLVER);
285 trySetStaxSecurityManager(factory);
213286 return factory;
214287 }
215288
217290 try {
218291 documentBuilderFactory.setFeature(feature, enabled);
219292 } catch (Exception e) {
220 LOG.log(Level.WARNING, "SAX Feature unsupported: "+feature, e);
293 LOG.log(Level.WARNING, "SAX Feature unsupported: " + feature, e);
221294 } catch (AbstractMethodError ame) {
222 LOG.log(Level.WARNING, "Cannot set SAX feature because outdated XML parser in classpath: "+ feature, ame);
295 LOG.log(Level.WARNING, "Cannot set SAX feature because outdated XML parser in classpath: " + feature, ame);
223296 }
224297 }
225298
227300 try {
228301 factory.setProperty(key, value);
229302 } catch (IllegalArgumentException e) {
230 //swallow
303 LOG.log(Level.WARNING, "StAX Feature unsupported: " + key, e);
231304 }
232305 }
233306
234307 /**
235308 * Returns a new transformer
236 *
309 * <p>
237310 * The transformer instance is configured to to use
238311 * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}.
239312 *
240 * @since Apache Tika 1.17
241313 * @return Transformer
242314 * @throws TikaException when the transformer can not be created
315 * @since Apache Tika 1.17
243316 */
244317 public static Transformer getTransformer() throws TikaException {
245318 try {
248321 return transformerFactory.newTransformer();
249322 } catch (TransformerConfigurationException | TransformerFactoryConfigurationError e) {
250323 throw new TikaException("Transformer not available", e);
251 }
252 }
253
324 }
325 }
326
327 /**
328 * This checks context for a user specified {@link DocumentBuilder}.
329 * If one is not found, this reuses a DocumentBuilder from the pool.
330 *
331 * @since Apache Tika 1.19
332 * @param is InputStream to parse
333 * @param context context to use
334 * @return a document
335 * @throws TikaException
336 * @throws IOException
337 * @throws SAXException
338 */
339 public static Document buildDOM(InputStream is, ParseContext context) throws TikaException, IOException, SAXException {
340 DocumentBuilder builderFromContext = context.get(DocumentBuilder.class);
341 DocumentBuilder builder = (builderFromContext == null) ? acquireDOMBuilder() : builderFromContext;
342
343 try {
344 return builder.parse(is);
345 } finally {
346 if (builderFromContext == null) {
347 releaseDOMBuilder(builder);
348 }
349 }
350 }
351
352 /**
353 * This checks context for a user specified {@link SAXParser}.
354 * If one is not found, this reuses a SAXParser from the pool.
355 *
356 * @since Apache Tika 1.19
357 * @param is InputStream to parse
358 * @param contentHandler handler to use
359 * @param context context to use
361 * @throws TikaException
362 * @throws IOException
363 * @throws SAXException
364 */
365 public static void parseSAX(InputStream is, DefaultHandler contentHandler, ParseContext context)
366 throws TikaException, IOException, SAXException {
367 SAXParser contextParser = context.get(SAXParser.class);
368 SAXParser parser = (contextParser == null) ? acquireSAXParser() : contextParser;
369 try {
370 parser.parse(is, contentHandler);
371 } finally {
372 if (contextParser == null) {
373 releaseParser(parser);
374 }
375 }
376 }
377
378 /**
379 * Acquire a DocumentBuilder from the pool. Make sure to
380 * {@link #releaseDOMBuilder(DocumentBuilder)} in
381 * a <code>finally</code> block every time you call this.
382 *
383 * @return a DocumentBuilder
384 * @throws TikaException
385 */
386 private static DocumentBuilder acquireDOMBuilder()
387 throws TikaException {
388 int waiting = 0;
389 while (true) {
390 DocumentBuilder builder = null;
391 try {
392 DOM_READ_WRITE_LOCK.readLock().lock();
393 builder = DOM_BUILDERS.poll(100, TimeUnit.MILLISECONDS);
394 } catch (InterruptedException e) {
395 throw new TikaException("interrupted while waiting for DOMBuilder", e);
396 } finally {
397 DOM_READ_WRITE_LOCK.readLock().unlock();
398 }
399 if (builder != null) {
400 return builder;
401 }
402 waiting++;
403 if (waiting > 3000) {
404 //freshen the pool. Something went very wrong...
405 setPoolSize(POOL_SIZE);
406 //better to get an exception than have permahang by a bug in one of our parsers
407 throw new TikaException("Waited more than 5 minutes for a DocumentBuilder; " +
408 "This could indicate that a parser has not correctly released its DocumentBuilder. " +
409 "Please report this to the Tika team: dev@tika.apache.org");
410
411 }
412 }
413 }
414
415 /**
416 * Return parser to the pool for reuse.
417 *
418 * @param builder builder to return
419 */
420 private static void releaseDOMBuilder(DocumentBuilder builder) {
421 try {
422 builder.reset();
423 } catch (UnsupportedOperationException e) {
424 //ignore
425 }
426 try {
427 DOM_READ_WRITE_LOCK.readLock().lock();
428 //if there are extra parsers (e.g. after a reset of the pool to a smaller size),
429 // this parser will not be added and will then be gc'd
430 boolean success = DOM_BUILDERS.offer(builder);
431 if (! success) {
432 LOG.warning("DocumentBuilder not taken back into pool. If you haven't resized the pool, this could " +
433 "be a sign that there are more calls to 'acquire' than to 'release'");
434 }
435 } finally {
436 DOM_READ_WRITE_LOCK.readLock().unlock();
437 }
438 }
439
440
441 /**
442 * Acquire a SAXParser from the pool. Make sure to
443 * {@link #releaseParser(SAXParser)} in
444 * a <code>finally</code> block every time you call this.
445 *
446 * @return a SAXParser
447 * @throws TikaException
448 */
449 private static SAXParser acquireSAXParser()
450 throws TikaException {
451 int waiting = 0;
452 while (true) {
453 SAXParser parser = null;
454 try {
455 SAX_READ_WRITE_LOCK.readLock().lock();
456 parser = SAX_PARSERS.poll(100, TimeUnit.MILLISECONDS);
457 } catch (InterruptedException e) {
458 throw new TikaException("interrupted while waiting for SAXParser", e);
459 } finally {
460 SAX_READ_WRITE_LOCK.readLock().unlock();
461 }
462 if (parser != null) {
463 return parser;
464 }
465 waiting++;
466 if (waiting > 3000) {
467 //freshen the pool. Something went very wrong...
468 setPoolSize(POOL_SIZE);
469 //better to get an exception than have permahang by a bug in one of our parsers
470 throw new TikaException("Waited more than 5 minutes for a SAXParser; " +
471 "This could indicate that a parser has not correctly released its SAXParser. " +
472 "Please report this to the Tika team: dev@tika.apache.org");
473
474 }
475 }
476 }
477
478 /**
479 * Return parser to the pool for reuse
480 *
481 * @param parser parser to return
482 */
483 private static void releaseParser(SAXParser parser) {
484 try {
485 parser.reset();
486 } catch (UnsupportedOperationException e) {
487 //ignore
488 }
489 try {
490 SAX_READ_WRITE_LOCK.readLock().lock();
491 //if there are extra parsers (e.g. after a reset of the pool to a smaller size),
492 // this parser will not be added and will then be gc'd
493 boolean success = SAX_PARSERS.offer(parser);
494 if (! success) {
495 LOG.warning("SAXParser not taken back into pool. If you haven't resized the pool, this could " +
496 "be a sign that there are more calls to 'acquire' than to 'release'");
497 }
498 } finally {
499 SAX_READ_WRITE_LOCK.readLock().unlock();
500 }
501 }
502
503 /**
504 * Set the pool size for cached XML parsers.
505 *
506 * @since Apache Tika 1.19
507 * @param poolSize new size for the SAXParser and DocumentBuilder pools
508 */
509 public static void setPoolSize(int poolSize) throws TikaException {
510 try {
511 //stop the world with a write lock.
512 //parsers that are currently in use will be offered later (once the lock is released),
513 //but not accepted and will be gc'd. We have to do this locking and
514 //the read locking in case one thread resizes the pool when the
515 //parsers have already started. We could have an NPE on SAX_PARSERS
516 //if we didn't lock.
517 SAX_READ_WRITE_LOCK.writeLock().lock();
518 if (SAX_PARSERS.size() != poolSize) {
519 SAX_PARSERS = new ArrayBlockingQueue<>(poolSize);
520 for (int i = 0; i < poolSize; i++) {
521 SAX_PARSERS.offer(getSAXParser());
522 }
523 }
524 } finally {
525 SAX_READ_WRITE_LOCK.writeLock().unlock();
526 }
527 try {
528 DOM_READ_WRITE_LOCK.writeLock().lock();
529
530 if (DOM_BUILDERS.size() != poolSize) {
531 DOM_BUILDERS = new ArrayBlockingQueue<>(poolSize);
532 for (int i = 0; i < poolSize; i++) {
533 DOM_BUILDERS.offer(getDocumentBuilder());
534 }
535 }
536 } finally {
537 DOM_READ_WRITE_LOCK.writeLock().unlock();
538 }
539 POOL_SIZE = poolSize;
540 }
541
542 private static void trySetXercesSecurityManager(DocumentBuilderFactory factory) {
543 //from POI
544 // Try built-in JVM one first, standalone if not
545 for (String securityManagerClassName : new String[] {
546 //"com.sun.org.apache.xerces.internal.util.SecurityManager",
547 "org.apache.xerces.util.SecurityManager"
548 }) {
549 try {
550 Object mgr = Class.forName(securityManagerClassName).newInstance();
551 Method setLimit = mgr.getClass().getMethod("setEntityExpansionLimit", Integer.TYPE);
552 setLimit.invoke(mgr, MAX_ENTITY_EXPANSIONS);
553 factory.setAttribute("http://apache.org/xml/properties/security-manager", mgr);
554 // Stop once one can be setup without error
555 return;
556 } catch (ClassNotFoundException e) {
557 // continue without log, this is expected in some setups
558 } catch (Throwable e) { // NOSONAR - also catch things like NoClassDefError here
559 // throttle the log somewhat as it can spam the log otherwise
560 if(System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) {
561 LOG.log(Level.WARNING, "SAX Security Manager could not be setup [log suppressed for 5 minutes]", e);
562 LAST_LOG = System.currentTimeMillis();
563 }
564 }
565 }
566
567 // separate old version of Xerces not found => use the builtin way of setting the property
568 try {
569 factory.setAttribute("http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit", MAX_ENTITY_EXPANSIONS);
570 } catch (IllegalArgumentException e) { // NOSONAR - also catch things like NoClassDefError here
571 // throttle the log somewhat as it can spam the log otherwise
572 if(System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) {
573 LOG.log(Level.WARNING, "SAX Security Manager could not be setup [log suppressed for 5 minutes]", e);
574 LAST_LOG = System.currentTimeMillis();
575 }
576 }
577 }
578
579 private static void trySetXercesSecurityManager(SAXParser parser) {
580 //from POI
581 // Try built-in JVM one first, standalone if not
582 for (String securityManagerClassName : new String[] {
583 //"com.sun.org.apache.xerces.internal.util.SecurityManager",
584 "org.apache.xerces.util.SecurityManager"
585 }) {
586 try {
587 Object mgr = Class.forName(securityManagerClassName).newInstance();
588 Method setLimit = mgr.getClass().getMethod("setEntityExpansionLimit", Integer.TYPE);
589 setLimit.invoke(mgr, MAX_ENTITY_EXPANSIONS);
590 parser.setProperty("http://apache.org/xml/properties/security-manager", mgr);
591 // Stop once one can be setup without error
592 return;
593 } catch (ClassNotFoundException e) {
594 // continue without log, this is expected in some setups
595 } catch (Throwable e) { // NOSONAR - also catch things like NoClassDefError here
596 // throttle the log somewhat as it can spam the log otherwise
597 if(System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) {
598 LOG.log(Level.WARNING, "SAX Security Manager could not be setup [log suppressed for 5 minutes]", e);
599 LAST_LOG = System.currentTimeMillis();
600 }
601 }
602 }
603
604 // separate old version of Xerces not found => use the builtin way of setting the property
605 try {
606 parser.setProperty("http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit", MAX_ENTITY_EXPANSIONS);
607 } catch (SAXException e) { // NOSONAR - also catch things like NoClassDefError here
608 // throttle the log somewhat as it can spam the log otherwise
609 if(System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) {
610 LOG.log(Level.WARNING, "SAX Security Manager could not be setup [log suppressed for 5 minutes]", e);
611 LAST_LOG = System.currentTimeMillis();
612 }
613 }
614 }
615
616 private static void trySetStaxSecurityManager(XMLInputFactory inputFactory) {
617 try {
618 inputFactory.setProperty("com.ctc.wstx.maxEntityCount", MAX_ENTITY_EXPANSIONS);
619 } catch (IllegalArgumentException e) {
620 // throttle the log somewhat as it can spam the log otherwise
621 if(System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) {
622 LOG.log(Level.WARNING, "SAX Security Manager could not be setup [log suppressed for 5 minutes]", e);
623 LAST_LOG = System.currentTimeMillis();
624 }
625 }
626 }
254627 }
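Putting the pooling pieces together, a minimal sketch of the intended 1.19 usage: size the pool once to match your worker-thread count, optionally tighten the entity-expansion limit, then let parseSAX borrow and return pooled parsers (the thread count and XML payload are illustrative):

import org.apache.tika.parser.ParseContext;
import org.apache.tika.utils.XMLReaderUtils;
import org.xml.sax.helpers.DefaultHandler;

import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;

public class PooledXmlExample {
    public static void main(String[] args) throws Exception {
        XMLReaderUtils.setPoolSize(8);             // match your Tika thread count
        XMLReaderUtils.setMaxEntityExpansions(20); // the 1.19 default
        byte[] xml = "<doc>hello</doc>".getBytes(StandardCharsets.UTF_8);
        XMLReaderUtils.parseSAX(new ByteArrayInputStream(xml),
                new DefaultHandler(), new ParseContext());
    }
}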
403403 <mime-type type="application/mbox">
404404 <!-- MBOX files start with "From [sender] [date]" -->
405405 <!-- To avoid false matches, check for other headers after that -->
406
406407 <magic priority="70">
407408 <match value="From " type="string" offset="0">
408 <match value="\nFrom: " type="string" offset="32:256"/>
409 <match value="\nDate: " type="string" offset="32:256"/>
410 <match value="\nSubject: " type="string" offset="32:256"/>
411 <match value="\nDelivered-To: " type="string" offset="32:256"/>
412 <match value="\nReceived: by " type="string" offset="32:256"/>
413 <match value="\nReceived: via " type="string" offset="32:256"/>
414 <match value="\nReceived: from " type="string" offset="32:256"/>
415 <match value="\nMime-Version: " type="string" offset="32:256"/>
409 <match value="\nFrom: " type="string" offset="32:256"/>
410 <match value="\nDate: " type="string" offset="32:256"/>
411 <match value="\nSubject: " type="string" offset="32:256"/>
412 <match value="\nDelivered-To: " type="string" offset="32:256"/>
413 <match value="\nReceived: by " type="string" offset="32:256"/>
414 <match value="\nReceived: via " type="string" offset="32:256"/>
415 <match value="\nReceived: from " type="string" offset="32:256"/>
416 <match value="\nMime-Version: " type="string" offset="32:256"/>
417
418 <match value="\nX-" type="stringignorecase" offset="32:256">
419 <match value="\nFrom: " type="string" offset="32:8192"/>
420 <match value="\nDate: " type="string" offset="32:8192"/>
421 <match value="\nSubject: " type="string" offset="32:8192"/>
422 <match value="\nDelivered-To: " type="string" offset="32:8192"/>
423 <match value="\nReceived: by " type="string" offset="32:8192"/>
424 <match value="\nReceived: via " type="string" offset="32:8192"/>
425 <match value="\nReceived: from " type="string" offset="32:8192"/>
426 <match value="\nMime-Version: " type="string" offset="32:8192"/>
427 </match>
416428 </match>
417429 </magic>
418430 <glob pattern="*.mbox"/>
50065018 <glob pattern="*.xyz"/>
50075019 </mime-type>
50085020
5021 <mime-type type="image/aces">
5022 <_comment>ACES Image Container File</_comment>
5023 <magic priority="50">
5024 <match value="0x762F310102000000" type="string" offset="0"/>
5025 <match value="0x762F310102040000" type="string" offset="0"/>
5026 </magic>
5027 <glob pattern="*.exr"/>
5028 </mime-type>
5029
50095030 <mime-type type="image/bmp">
50105031 <alias type="image/x-bmp"/>
50115032 <alias type="image/x-ms-bmp"/>
50535074 </match>
50545075 </magic>
50555076 <glob pattern="*.cgm"/>
5077 </mime-type>
5078
5079 <mime-type type="image/x-dpx">
5080 <acronym>DPX</acronym>
5081 <_comment>Digital Picture Exchange from SMPTE</_comment>
5082 <magic priority="50">
5083 <match value="SDPX" type="string" offset="0" />
5084 <match value="XPDS" type="string" offset="0" />
5085 </magic>
5086 <glob pattern="*.dpx"/>
50565087 </mime-type>
50575088
50585089 <mime-type type="image/emf">
57645795 </magic>
57655796 <glob pattern="*.eml"/>
57665797 <glob pattern="*.mime"/>
5798 <sub-class-of type="text/x-tika-text-based-message"/>
5799 </mime-type>
5800
5801 <!-- TODO See TIKA-2723 for discussions on the mime type hierarchy -->
5802 <!-- and best parser structure for these email-like formats -->
5803 <mime-type type="multipart/related">
5804 <acronym>MHTML</acronym>
5805 <_comment>MIME Encapsulation of Aggregate HTML Documents</_comment>
5806 <tika:link>http://tools.ietf.org/html/rfc2557</tika:link>
5807 <alias type="application/x-mimearchive"/>
5808 <alias type="message/rfc2557"/>
5809 <!-- higher priority than message/rfc822 -->
5810 <magic priority="60">
5811 <match value="From: \x3cSaved by Windows Internet Explorer 8\x3e" type="stringignorecase" offset="0"/>
5812 <match value="From: \x22Saved by Internet Explorer 11\x22" type="stringignorecase" offset="0"/>
5813 <match value="MIME-Version: 1.0" type="string" offset="0">
5814 <match value="\nContent-Type: multipart/related" type="string" offset="16:512"/>
5815 </match>
5816 </magic>
57675817 <glob pattern="*.mht"/>
57685818 <glob pattern="*.mhtml"/>
5769 <sub-class-of type="text/x-tika-text-based-message"/>
5819 <sub-class-of type="message/rfc822"/>
57705820 </mime-type>
57715821
57725822 <mime-type type="message/s-http"/>
58685918 <mime-type type="multipart/header-set"/>
58695919 <mime-type type="multipart/mixed"/>
58705920 <mime-type type="multipart/parallel"/>
5871 <mime-type type="multipart/related"/>
58725921 <mime-type type="multipart/report"/>
58735922 <mime-type type="multipart/signed"/>
58745923 <mime-type type="multipart/voice-message"/>
59005949 <sub-class-of type="text/plain"/>
59015950 </mime-type>
59025951
5903 <mime-type type="text/asp">
5952 <mime-type type="text/asp" interpreted="true">
59045953 <_comment>Active Server Page</_comment>
59055954 <glob pattern="*.asp"/>
59065955 <sub-class-of type="text/plain"/>
59075956 </mime-type>
59085957
5909 <mime-type type="text/aspdotnet">
5958 <mime-type type="text/aspdotnet" interpreted="true">
59105959 <_comment>ASP .NET</_comment>
59115960 <glob pattern="*.aspx"/>
59125961 <sub-class-of type="text/plain"/>
62956344 <sub-class-of type="text/plain"/>
62966345 </mime-type>
62976346
6298 <mime-type type="text/x-cgi">
6347 <mime-type type="text/x-cgi" interpreted="true">
62996348 <_comment>CGI script</_comment>
63006349 <glob pattern="*.cgi"/>
63016350 <sub-class-of type="text/plain"/>
63496398 <sub-class-of type="text/plain"/>
63506399 </mime-type>
63516400
6352 <mime-type type="text/x-coldfusion">
6401 <mime-type type="text/x-coldfusion" interpreted="true">
63536402 <_comment>ColdFusion source code</_comment>
63546403 <glob pattern="*.cfm"/>
63556404 <glob pattern="*.cfml"/>
64656514 <sub-class-of type="text/plain"/>
64666515 </mime-type>
64676516
6468 <mime-type type="text/x-jsp">
6517 <mime-type type="text/x-jsp" interpreted="true">
64696518 <_comment>Java Server Page</_comment>
64706519 <alias type="application/x-httpd-jsp"/>
64716520 <sub-class-of type="text/plain"/>
65886637 <sub-class-of type="text/plain"/>
65896638 </mime-type>
65906639
6591 <mime-type type="text/x-php">
6640 <mime-type type="text/x-php" interpreted="true">
65926641 <_comment>PHP script</_comment>
65936642 <magic priority="50">
65946643 <match value="&lt;?php" type="string" offset="0"/>
1616
1717 package org.apache.tika;
1818
19 import org.apache.tika.detect.Detector;
20 import org.apache.tika.detect.XmlRootExtractor;
21 import org.apache.tika.exception.TikaException;
22 import org.apache.tika.io.TikaInputStream;
1923 import org.apache.tika.metadata.Metadata;
20 import org.apache.tika.parser.AutoDetectParser;
24 import org.apache.tika.mime.MediaType;
2125 import org.apache.tika.parser.ParseContext;
2226 import org.apache.tika.parser.Parser;
2327 import org.apache.tika.parser.RecursiveParserWrapper;
28 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
2429 import org.apache.tika.sax.BasicContentHandlerFactory;
25 import org.junit.Test;
26 import org.xml.sax.helpers.DefaultHandler;
30 import org.apache.tika.sax.RecursiveParserWrapperHandler;
31 import org.apache.tika.utils.XMLReaderUtils;
2732
2833 import java.io.FileFilter;
2934 import java.io.IOException;
3742 import java.nio.file.attribute.BasicFileAttributes;
3843 import java.util.ArrayList;
3944 import java.util.List;
40 import java.util.Locale;
4145 import java.util.Map;
4246 import java.util.Random;
4347 import java.util.concurrent.Callable;
4852 import java.util.concurrent.Executors;
4953 import java.util.concurrent.Future;
5054 import java.util.concurrent.TimeUnit;
55 import java.util.concurrent.atomic.AtomicInteger;
5156
5257 import static org.junit.Assert.assertEquals;
5358
5459 public class MultiThreadedTikaTest extends TikaTest {
5560 //TODO: figure out how to make failures reproducible a la Lucene/Solr with a seed
5661 //TODO: Consider randomizing the Locale and timezone, like Lucene/Solr...
5763
5864 /**
59 * This calls {@link #testEach(Path[], int, int)} and then {@link #testAll(Path[], int, int)}
65 * This calls {@link #testEach(Parser, Path[], ParseContext[], int, int)} and
66 * then {@link #testAll(Parser, Path[], ParseContext[], int, int)}
6067 *
61 * @param numThreads number of threads to use
68 * @param numThreads number of threads to use
6269 * @param numIterations number of iterations per thread
63 * @param filter file filter to select files from "/test-documents"; if <code>null</code>,
64 * all files will be used
70 * @param filter file filter to select files from "/test-documents"; if <code>null</code>,
71 * all files will be used
6572 * @throws Exception
6673 */
67 protected void testMultiThreaded(int numThreads, int numIterations, FileFilter filter) throws Exception {
74 protected void testMultiThreaded(Parser parser, ParseContext[] parseContext, int numThreads, int numIterations, FileFilter filter) throws Exception {
6875 Path[] allFiles = getTestFiles(filter);
69 //testEach(allFiles, numThreads, numIterations);
70 testAll(allFiles, numThreads, numIterations);
71 }
72
73 /**
74 * Test each file, one at a time in multiple threads.
75 * This was required to test TIKA-2519 in a reasonable
76 * amount of time. This forced the parser to use the
77 * same underlying memory structures because it was the same file.
78 * This is stricter than I think our agreement with clients is
79 * because this run tests on literally the same file and
80 * not a copy of the file per thread. Let's leave this as is
81 * unless there's a good reason to create a separate copy per thread.
82 *
83 * @param files files to test, one at a time
84 * @param numThreads number of threads to use
85 * @param numIterations number of iterations per thread
86 */
87 protected void testEach(Path[] files, int numThreads, int numIterations) {
76 testEach(parser, allFiles, parseContext, numThreads, numIterations);
77 testAll(parser, allFiles, parseContext, numThreads, numIterations);
78 }
79
80 public void testDetector(Detector detector, int numThreads, int numIterations, FileFilter filter, int randomlyResizeSAXPool) throws Exception {
81 Path[] files = getTestFiles(filter);
82 testDetectorEach(detector, files, numThreads, numIterations, randomlyResizeSAXPool);
83 testDetectorOnAll(detector, files, numThreads, numIterations, randomlyResizeSAXPool);
84 }
85
86 void testDetectorEach(Detector detector, Path[] files, int numThreads, int numIterations, int randomlyResizeSAXPool) {
8887 for (Path p : files) {
8988 Path[] toTest = new Path[1];
9089 toTest[0] = p;
91 testAll(toTest, numThreads, numIterations);
92 }
93 }
94
95 /**
96 * This tests all files together. Each parser randomly selects
97 * a file from the array. Two parsers could wind up parsing the
98 * same file at the same time. Good.
99 *
100 * In the current implementation, this gets ground truth only
101 * from files that do not throw exceptions. This will ignore
102 * files that cause exceptions.
103 *
104 * @param files files to parse
105 * @param numThreads number of parser threads
106 * @param numIterations number of iterations per parser
107 */
108 protected void testAll(Path[] files, int numThreads, int numIterations) {
109
110 Map<Path, Extract> truth = getBaseline(files);
90 testDetectorOnAll(detector, toTest, numThreads, numIterations, randomlyResizeSAXPool);
91 }
92 }
93
94 private void testDetectorOnAll(Detector detector, Path[] toTest, int numThreads, int numIterations, int randomlyResizeSAXPool) {
95 Map<Path, MediaType> truth = getBaselineDetection(detector, toTest);
11196 //if all files caused an exception
11297 if (truth.size() == 0) {
11398 return;
118103 for (Path testFile : truth.keySet()) {
119104 testFiles[j++] = testFile;
120105 }
121
122 ExecutorService ex = Executors.newFixedThreadPool(numThreads);
106 int actualThreadCount = numThreads + ((randomlyResizeSAXPool > 0) ? randomlyResizeSAXPool : 0);
107 ExecutorService ex = Executors.newFixedThreadPool(actualThreadCount);
123108 try {
124 _testAll(testFiles, numThreads, numIterations, truth, ex);
109 _testDetectorOnAll(detector, testFiles, numThreads, numIterations, truth, ex, randomlyResizeSAXPool);
125110 } finally {
126111 ex.shutdown();
127112 ex.shutdownNow();
128113 }
129114 }
130115
131 private void _testAll(Path[] testFiles, int numThreads, int numIterations,
132 Map<Path, Extract> truth, ExecutorService ex) {
133
116 private void _testDetectorOnAll(Detector detector, Path[] testFiles, int numThreads,
117 int numIterations, Map<Path, MediaType> truth, ExecutorService ex, int randomlyResizeSAXPool) {
134118 ExecutorCompletionService<Integer> executorCompletionService = new ExecutorCompletionService<>(ex);
135119
136 //use the same parser in all threads
137 Parser parser = new AutoDetectParser();
120 executorCompletionService.submit(new SAXPoolResizer(randomlyResizeSAXPool));
138121 for (int i = 0; i < numThreads; i++) {
139 executorCompletionService.submit(new TikaRunner(parser, numIterations, testFiles, truth));
122 executorCompletionService.submit(new TikaDetectorRunner(detector, numIterations, testFiles, truth));
140123 }
141124
142125 int completed = 0;
150133 future.get();//trigger exceptions from thread
151134 completed++;
152135 }
153 } catch (InterruptedException|ExecutionException e) {
136 } catch (InterruptedException | ExecutionException e) {
154137 throw new RuntimeException(e);
155138 }
156139 }
157 }
158
159 private static Path[] getTestFiles(final FileFilter fileFilter) throws URISyntaxException, IOException {
140 ex.shutdown();
141 ex.shutdownNow();
142 }
143
144 /**
145 * Test each file, one at a time in multiple threads.
146 * This was required to test TIKA-2519 in a reasonable
147 * amount of time. This forced the parser to use the
148 * same underlying memory structures because it was the same file.
149 * This is stricter than I think our agreement with clients is
150 * because this run tests on literally the same file and
151 * not a copy of the file per thread. Let's leave this as is
152 * unless there's a good reason to create a separate copy per thread.
153 *
154 * @param files files to test, one at a time
155 * @param numThreads number of threads to use
156 * @param numIterations number of iterations per thread
157 */
158 protected void testEach(Parser parser, Path[] files, ParseContext[] parseContext, int numThreads, int numIterations) {
159 for (Path p : files) {
160 Path[] toTest = new Path[1];
161 toTest[0] = p;
162 testAll(parser, toTest, parseContext, numThreads, numIterations);
163 }
164 }
165
166 /**
167 * This tests all files together. Each parser randomly selects
168 * a file from the array. Two parsers could wind up parsing the
169 * same file at the same time. Good.
170 * <p>
171 * In the current implementation, this gets ground truth only
172 * from files that do not throw exceptions. This will ignore
173 * files that cause exceptions.
174 *
175 * @param files files to parse
176 * @param numThreads number of parser threads
177 * @param numIterations number of iterations per parser
178 */
179 protected void testAll(Parser parser, Path[] files, ParseContext[] parseContext, int numThreads, int numIterations) {
180
181 Map<Path, Extract> truth = getBaseline(parser, files, parseContext[0]);
182 //if all files caused an exception
183 if (truth.size() == 0) {
184 return;
185 }
186 //only those that parsed without exception
187 Path[] testFiles = new Path[truth.size()];
188 int j = 0;
189 for (Path testFile : truth.keySet()) {
190 testFiles[j++] = testFile;
191 }
192
193 ExecutorService ex = Executors.newFixedThreadPool(numThreads);
194 try {
195 _testAll(parser, testFiles, parseContext, numThreads, numIterations, truth, ex);
196 } finally {
197 ex.shutdown();
198 ex.shutdownNow();
199 }
200 }
201
202 private void _testAll(Parser parser, Path[] testFiles, ParseContext[] parseContext, int numThreads, int numIterations,
203 Map<Path, Extract> truth, ExecutorService ex) {
204
205 ExecutorCompletionService<Integer> executorCompletionService = new ExecutorCompletionService<>(ex);
206
207 //use the same parser in all threads
208 for (int i = 0; i < numThreads; i++) {
209 executorCompletionService.submit(new TikaRunner(parser, parseContext[i], numIterations, testFiles, truth));
210 }
211
212 int completed = 0;
213 while (completed < numThreads) {
214 //TODO: add a maximum timeout threshold
215
216 Future<Integer> future = null;
217 try {
218 future = executorCompletionService.poll(1000, TimeUnit.MILLISECONDS);
219 if (future != null) {
220 future.get();//trigger exceptions from thread
221 completed++;
222 }
223 } catch (InterruptedException | ExecutionException e) {
224 throw new RuntimeException(e);
225 }
226 }
227 }
228
229 public static Path[] getTestFiles(final FileFilter fileFilter) throws URISyntaxException, IOException {
160230 Path root = Paths.get(
161231 MultiThreadedTikaTest.class.getResource("/test-documents").toURI());
162232 final List<Path> files = new ArrayList<>();
163233 Files.walkFileTree(root, new SimpleFileVisitor<Path>() {
164234 @Override
165235 public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
166 if (fileFilter != null && ! fileFilter.accept(file.toFile())) {
236 if (fileFilter != null && !fileFilter.accept(file.toFile())) {
167237 return FileVisitResult.CONTINUE;
168238 }
169239 if (!attrs.isDirectory()) {
170 if (files.size() < 20) {
171 files.add(file);
172 }
240 files.add(file);
173241 }
174242 return FileVisitResult.CONTINUE;
175243 }
177245 return files.toArray(new Path[files.size()]);
178246 }
179247
180 private static ConcurrentHashMap<Path, Extract> getBaseline(Path[] files) {
248 private static ConcurrentHashMap<Path, MediaType> getBaselineDetection(Detector detector, Path[] files) {
249
250 ConcurrentHashMap<Path, MediaType> baseline = new ConcurrentHashMap<>();
252 for (Path f : files) {
253 Metadata metadata = new Metadata();
254 try (TikaInputStream tis = TikaInputStream.get(f, metadata)) {
255 baseline.put(f, detector.detect(tis, metadata));
257 } catch (IOException e) {
258 e.printStackTrace();
259 }
260 }
261 return baseline;
262 }
263
264 private static ConcurrentHashMap<Path, Extract> getBaseline(Parser parser, Path[] files, ParseContext parseContext) {
181265 ConcurrentHashMap<Path, Extract> baseline = new ConcurrentHashMap<>();
266
182267 for (Path f : files) {
183
184 try {
185 Parser p = new AutoDetectParser();
186 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p,
187 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
188 try (InputStream is = Files.newInputStream(f)) {
189 wrapper.parse(is, new DefaultHandler(), new Metadata(), new ParseContext());
190 }
191 List<Metadata> metadataList = wrapper.getMetadata();
268 try (TikaInputStream is = TikaInputStream.get(f)) {
269
270 List<Metadata> metadataList = getRecursiveMetadata(is, parser, parseContext);
192271 baseline.put(f, new Extract(metadataList));
272
193273 } catch (Exception e) {
274 e.printStackTrace();
194275 //swallow
195276 }
196277 }
197278 return baseline;
198279 }
199280
200 private static List<Metadata> getRecursiveMetadata(InputStream is, Parser p) throws Exception {
281 private static List<Metadata> getRecursiveMetadata(InputStream is,
282 Parser parser, ParseContext parseContext) throws Exception {
201283 //different from parent TikaTest in that this extracts text.
202284 //can't extract xhtml because "tmp" file names wind up in
203285 //content's metadata and they'll differ by file.
204 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p,
205 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
206 wrapper.parse(is, new DefaultHandler(), new Metadata(), new ParseContext());
207 return wrapper.getMetadata();
208 }
209
210 //TODO: make this return something useful besides an integer
211 private class TikaRunner implements Callable<Integer> {
212 private final Parser parser;
287 RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
288 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
289 -1);
290 parser.parse(is, handler, new Metadata(), parseContext);
291 return handler.getMetadataList();
292 }
293
294 private class SAXPoolResizer implements Callable<Integer> {
295 private final int maxResize;
296 private final Random rand = new Random();
297 SAXPoolResizer(int maxResize) {
298 this.maxResize = maxResize;
299 }
300
301 public Integer call() throws TikaException {
302 int resized = 0;
303 while (true) {
304 try {
305 Thread.yield();
306 Thread.sleep(500);
307 } catch (InterruptedException e) {
308 return resized;
309 }
310 if (maxResize > 0 && rand.nextFloat() > 0.01) {
311 int sz = rand.nextInt(maxResize) + 1;
312 XMLReaderUtils.setPoolSize(sz);
313 resized++;
314 }
315 }
316 }
317 }
318
319 private class TikaDetectorRunner implements Callable<Integer> {
320 private final Detector detector;
213321 private final int iterations;
214322 private final Path[] files;
215 private final Map<Path, Extract> truth;
216
323 private final Map<Path, MediaType> truth;
217324 private final Random random = new Random();
218325
219 private TikaRunner(Parser parser, int iterations, Path[] files, Map<Path, Extract> truth) {
220 this.parser = parser;
326 private TikaDetectorRunner(Detector detector, int iterations, Path[] files, Map<Path, MediaType> truth) {
327 this.detector = detector;
221328 this.iterations = iterations;
222329 this.files = files;
223330 this.truth = truth;
228335 for (int i = 0; i < iterations; i++) {
229336 int randIndex = random.nextInt(files.length);
230337 Path testFile = files[randIndex];
338 Metadata metadata = new Metadata();
339 try (TikaInputStream tis = TikaInputStream.get(testFile, metadata)) {
340 MediaType mediaType = detector.detect(tis, metadata);
341 assertEquals("failed on: " + testFile.getFileName(), truth.get(testFile), mediaType);
342 }
343 }
344 return 1;
345 }
346
347 }
348
349
350 //TODO: make this return something useful besides an integer
351 private static class TikaRunner implements Callable<Integer> {
352 private static AtomicInteger threadCount = new AtomicInteger(0);
353 private final Parser parser;
354 private final int iterations;
355 private final Path[] files;
356 private final Map<Path, Extract> truth;
357 private final ParseContext parseContext;
358 private final Random random = new Random();
359 private final int threadNumber;
360 private TikaRunner(Parser parser, ParseContext parseContext, int iterations, Path[] files, Map<Path, Extract> truth) {
361 this.parser = parser;
362 this.iterations = iterations;
363 this.files = files;
364 this.truth = truth;
365 this.parseContext = parseContext;
366 threadNumber = threadCount.getAndIncrement();
367 }
368
369 @Override
370 public Integer call() throws Exception {
371 for (int i = 0; i < iterations; i++) {
372 int randIndex = random.nextInt(files.length);
373 Path testFile = files[randIndex];
374 List<Metadata> metadataList = null;
375 boolean success = false;
231376 try (InputStream is = Files.newInputStream(testFile)) {
232 List<Metadata> metadataList = getRecursiveMetadata(is, parser);
377 metadataList = getRecursiveMetadata(is, parser, parseContext);
378 success = true;
379 } catch (Exception e) {
380 //swallow
381 //throw new RuntimeException(testFile + " triggered this exception", e);
382 }
383 if (success) {
233384 assertExtractEquals(truth.get(testFile), new Extract(metadataList));
234 } catch (Exception e) {
235 throw new RuntimeException(testFile+" triggered this exception", e);
236385 }
237386 }
238387 return 1;
240389
241390 }
242391
243 private void assertExtractEquals(Extract extractA, Extract extractB) {
392 private static void assertExtractEquals(Extract extractA, Extract extractB) {
244393 //this currently only checks the basics
245394 //might want to add more checks
246395
252401 extractA.metadataList.get(i).size(), extractB.metadataList.get(i).size());
253402
254403 assertEquals("content in attachment: " + i,
255 extractA.metadataList.get(i).get(RecursiveParserWrapper.TIKA_CONTENT),
256 extractB.metadataList.get(i).get(RecursiveParserWrapper.TIKA_CONTENT));
404 extractA.metadataList.get(i).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT),
405 extractB.metadataList.get(i).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
257406 }
258407 }
259408
4343 import org.apache.tika.parser.RecursiveParserWrapper;
4444 import org.apache.tika.sax.BasicContentHandlerFactory;
4545 import org.apache.tika.sax.BodyContentHandler;
46 import org.apache.tika.sax.RecursiveParserWrapperHandler;
4647 import org.apache.tika.sax.ToXMLContentHandler;
4748 import org.xml.sax.ContentHandler;
4849 import org.xml.sax.helpers.DefaultHandler;
218219
219220 protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context, Metadata metadata) throws Exception {
220221 Parser p = new AutoDetectParser();
221 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p,
222 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p);
223 RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
224 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
225
226 try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) {
227 wrapper.parse(is, handler, metadata, context);
228 }
229 return handler.getMetadataList();
230 }
231
232 protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception {
233 Parser p = new AutoDetectParser();
234 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p);
235
236 RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
222237 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
223238 try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) {
224 wrapper.parse(is, new DefaultHandler(), metadata, context);
225 }
226 return wrapper.getMetadata();
227 }
228
229 protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception {
230 Parser p = new AutoDetectParser();
231 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p,
239 wrapper.parse(is, handler, new Metadata(), context);
240 }
241 return handler.getMetadataList();
242 }
243
244 protected List<Metadata> getRecursiveMetadata(String filePath, Parser parserToWrap) throws Exception {
245 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parserToWrap);
246 RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
232247 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
233248 try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) {
234 wrapper.parse(is, new DefaultHandler(), new Metadata(), context);
235 }
236 return wrapper.getMetadata();
237 }
238
239 protected List<Metadata> getRecursiveMetadata(String filePath, Parser parserToWrap) throws Exception {
240 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parserToWrap,
249 wrapper.parse(is, handler, new Metadata(), new ParseContext());
250 }
251 return handler.getMetadataList();
252 }
253
254 protected List<Metadata> getRecursiveMetadata(String filePath, Parser parserToWrap, ParseContext parseContext) throws Exception {
255 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parserToWrap);
256 RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
241257 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
258
242259 try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) {
243 wrapper.parse(is, new DefaultHandler(), new Metadata(), new ParseContext());
244 }
245 return wrapper.getMetadata();
246 }
247
248 protected List<Metadata> getRecursiveMetadata(String filePath, Parser parserToWrap, ParseContext parseContext) throws Exception {
249 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parserToWrap,
250 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
251 try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) {
252 wrapper.parse(is, new DefaultHandler(), new Metadata(), parseContext);
253 }
254 return wrapper.getMetadata();
260 wrapper.parse(is, handler, new Metadata(), parseContext);
261 }
262 return handler.getMetadataList();
255263 }
256264
257265
2020 import static org.junit.Assert.fail;
2121
2222 import java.io.ByteArrayInputStream;
23 import java.io.File;
2324 import java.io.IOException;
2425 import java.io.InputStream;
26 import java.io.OutputStream;
2527 import java.io.PipedInputStream;
2628 import java.io.PipedOutputStream;
29 import java.io.Reader;
30 import java.io.Serializable;
31 import java.io.StringWriter;
32 import java.io.UnsupportedEncodingException;
33 import java.nio.charset.Charset;
2734 import java.nio.charset.StandardCharsets;
35 import java.nio.file.Files;
36 import java.nio.file.Path;
37 import java.util.ArrayList;
38 import java.util.List;
2839 import java.util.concurrent.Semaphore;
2940
41 import org.apache.tika.TikaTest;
3042 import org.apache.tika.exception.TikaException;
43 import org.apache.tika.io.IOUtils;
44 import org.apache.tika.metadata.DublinCore;
3145 import org.apache.tika.metadata.Metadata;
46 import org.apache.tika.metadata.TikaCoreProperties;
47 import org.apache.tika.parser.AutoDetectParser;
3248 import org.apache.tika.parser.ParseContext;
49 import org.apache.tika.parser.Parser;
50 import org.apache.tika.parser.RecursiveParserWrapper;
3351 import org.apache.tika.parser.mock.MockParser;
52 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
53 import org.apache.tika.sax.BasicContentHandlerFactory;
3454 import org.apache.tika.sax.BodyContentHandler;
55 import org.apache.tika.sax.ContentHandlerFactory;
56 import org.apache.tika.sax.RecursiveParserWrapperHandler;
57 import org.junit.Ignore;
3558 import org.junit.Test;
3659 import org.xml.sax.ContentHandler;
60 import org.xml.sax.SAXException;
3761 import org.xml.sax.helpers.DefaultHandler;
3862
39 public class ForkParserTest {
63 public class ForkParserTest extends TikaTest {
4064
4165 @Test
4266 public void testHelloWorld() throws Exception {
182206 }
183207
184208 @Test
185 public void testPulse() throws Exception {
186 //test default 5000 ms
209 public void testPulseAndTimeouts() throws Exception {
210
187211 ForkParser forkParser = new ForkParser(ForkParserTest.class.getClassLoader(), new MockParser());
212 forkParser.setServerPulseMillis(500);
213 forkParser.setServerParseTimeoutMillis(5000);
214 forkParser.setServerWaitTimeoutMillis(60000);
188215 String sleepCommand = "<mock>\n" +
189216 " <write element=\"p\">Hello, World!</write>\n" +
190217 " <hang millis=\"11000\" heavy=\"false\" interruptible=\"false\" />\n" +
202229 //test setting very short pulse (10 ms) and a parser that takes at least 1000 ms
203230 forkParser = new ForkParser(ForkParserTest.class.getClassLoader(), new MockParser());
204231 forkParser.setServerPulseMillis(10);
232 forkParser.setServerParseTimeoutMillis(100);
205233 sleepCommand = "<mock>\n" +
206234 " <write element=\"p\">Hello, World!</write>\n" +
207235 " <hang millis=\"1000\" heavy=\"false\" interruptible=\"false\" />\n" +
217245 }
218246 }
219247
248 @Test
249 public void testPackageCanBeAccessed() throws Exception {
250 ForkParser parser = new ForkParser(
251 ForkParserTest.class.getClassLoader(),
252 new ForkTestParser.ForkTestParserAccessingPackage());
253 try {
254 Metadata metadata = new Metadata();
255 ContentHandler output = new BodyContentHandler();
256 InputStream stream = new ByteArrayInputStream(new byte[0]);
257 ParseContext context = new ParseContext();
258 parser.parse(stream, output, metadata, context);
259 assertEquals("Hello, World!", output.toString().trim());
260 assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
261 } finally {
262 parser.close();
263 }
264 }
265 @Test
266 public void testRecursiveParserWrapper() throws Exception {
267 Parser parser = new AutoDetectParser();
268 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser);
269 RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
270 new BasicContentHandlerFactory(
271 BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 20000));
272 ForkParser fork = new ForkParser(ForkParserTest.class.getClassLoader(), wrapper);
273 Metadata metadata = new Metadata();
274 ParseContext context = new ParseContext();
275 try (InputStream is = getClass().getResourceAsStream("/test-documents/basic_embedded.xml")) {
276 fork.parse(is, handler, metadata, context);
277 } finally {
278 fork.close();
279 }
280 List<Metadata> metadataList = handler.getMetadataList();
281 Metadata m0 = metadataList.get(0);
282 assertEquals("Nikolai Lobachevsky", m0.get(DublinCore.CREATOR));
283 assertContains("main_content", m0.get(RecursiveParserWrapperHandler.TIKA_CONTENT));
284 assertContains("embed1.xml", m0.get(RecursiveParserWrapperHandler.TIKA_CONTENT));
285
286 Metadata m1 = metadataList.get(1);
287 assertEquals("embeddedAuthor", m1.get(DublinCore.CREATOR));
288 assertContains("some_embedded_content", m1.get(RecursiveParserWrapperHandler.TIKA_CONTENT));
289 assertEquals("/embed1.xml", m1.get(RecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH));
290 }
291
292 @Test
293 public void testRPWWithEmbeddedNPE() throws Exception {
294 Parser parser = new AutoDetectParser();
295 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser);
296 RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
297 new BasicContentHandlerFactory(
298 BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 20000));
299 ForkParser fork = new ForkParser(ForkParserTest.class.getClassLoader(), wrapper);
300 Metadata metadata = new Metadata();
301 ParseContext context = new ParseContext();
302 try (InputStream is = getClass().getResourceAsStream("/test-documents/embedded_with_npe.xml")) {
303 fork.parse(is, handler, metadata, context);
304 } finally {
305 fork.close();
306 }
307 List<Metadata> metadataList = handler.getMetadataList();
308 Metadata m0 = metadataList.get(0);
309 assertEquals("Nikolai Lobachevsky", m0.get(DublinCore.CREATOR));
310 assertContains("main_content", m0.get(RecursiveParserWrapperHandler.TIKA_CONTENT));
311 assertContains("embed1.xml", m0.get(RecursiveParserWrapperHandler.TIKA_CONTENT));
312
313 Metadata m1 = metadataList.get(1);
314 assertEquals("embeddedAuthor", m1.get(DublinCore.CREATOR));
315 assertContains("some_embedded_content", m1.get(RecursiveParserWrapperHandler.TIKA_CONTENT));
316 assertEquals("/embed1.xml", m1.get(RecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH));
317 assertContains("another null pointer exception", m1.get(RecursiveParserWrapperHandler.EMBEDDED_EXCEPTION));
318 }
319
320 @Test
321 public void testRPWWithMainDocNPE() throws Exception {
322 Parser parser = new AutoDetectParser();
323 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser);
324 RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
325 new BasicContentHandlerFactory(
326 BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 20000));
327 ForkParser fork = new ForkParser(ForkParserTest.class.getClassLoader(), wrapper);
328 Metadata metadata = new Metadata();
329 ParseContext context = new ParseContext();
330 try (InputStream is = getClass().getResourceAsStream("/test-documents/embedded_then_npe.xml")) {
331 fork.parse(is, handler, metadata, context);
332 fail();
333 } catch (TikaException e) {
334 assertTrue(e.getCause() instanceof NullPointerException);
335 assertContains("another", e.getCause().getMessage());
336 } finally {
337 fork.close();
338 }
339 List<Metadata> metadataList = handler.getMetadataList();
340 Metadata m0 = metadataList.get(0);
341 assertEquals("Nikolai Lobachevsky", m0.get(DublinCore.CREATOR));
342 assertContains("main_content", m0.get(RecursiveParserWrapperHandler.TIKA_CONTENT));
343 assertContains("embed1.xml", m0.get(RecursiveParserWrapperHandler.TIKA_CONTENT));
344
345 Metadata m1 = metadataList.get(1);
346 assertEquals("embeddedAuthor", m1.get(DublinCore.CREATOR));
347 assertContains("some_embedded_content", m1.get(RecursiveParserWrapperHandler.TIKA_CONTENT));
348 assertEquals("/embed1.xml", m1.get(RecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH));
349 }
350
351 @Test
352 public void testToFileHandler() throws Exception {
353 //test that a server-side write-to-file works without proxying back the
354 //AbstractContentHandlerFactory
355 Path target = Files.createTempFile("fork-to-file-handler-", ".txt");
356 try {
357 ForkParser forkParser = null;
358 try (InputStream is = this.getClass().getResourceAsStream("/test-documents/basic_embedded.xml")) {
359 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(new AutoDetectParser());
360 ToFileHandler toFileHandler = new ToFileHandler(new SBContentHandlerFactory(), target.toFile());
361 forkParser = new ForkParser(ForkParserTest.class.getClassLoader(), wrapper);
362 Metadata m = new Metadata();
363 ParseContext context = new ParseContext();
364 forkParser.parse(is, toFileHandler, m, context);
365 } finally {
366 if (forkParser != null) {
367 forkParser.close();
368 }
369 }
370
371 String contents = null;
372 try (Reader reader = Files.newBufferedReader(target, StandardCharsets.UTF_8)) {
373 contents = IOUtils.toString(reader);
374 }
375 assertContainsCount("X-Parsed-By : org.apache.tika.parser.DefaultParser", contents, 2);
376 assertContainsCount("X-Parsed-By : org.apache.tika.parser.mock.MockParser", contents, 2);
377 assertContains("Nikolai Lobachevsky", contents);
378 assertContains("embeddedAuthor", contents);
379 assertContains("main_content", contents);
380 assertContains("some_embedded_content", contents);
381 assertContains("X-TIKA:embedded_resource_path : /embed1.xml", contents);
382 } finally {
383 Files.delete(target);
384 }
385 }
386
387 @Test
388 public void testRecursiveParserWrapperWithProxyingContentHandlersAndMetadata() throws Exception {
389 Parser parser = new AutoDetectParser();
390 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser);
391 BufferingHandler handler =
392 new BufferingHandler(new SBContentHandlerFactory());
393 ForkParser fork = new ForkParser(ForkParserTest.class.getClassLoader(), wrapper);
394 Metadata metadata = new Metadata();
395 ParseContext context = new ParseContext();
396 try (InputStream is = getClass().getResourceAsStream("/test-documents/basic_embedded.xml")) {
397 fork.parse(is, handler, metadata, context);
398 } finally {
399 fork.close();
400 }
401 List<Metadata> metadataList = handler.getMetadataList();
402 List<ContentHandler> contentHandlers = handler.getContentHandlers();
403 Metadata m0 = metadataList.get(0);
404 String content0 = contentHandlers.get(0).toString();
405 assertEquals("Nikolai Lobachevsky", m0.get(TikaCoreProperties.CREATOR));
406 assertContains("main_content", content0);
407 assertContains("embed1.xml", content0);
408
409 Metadata m1 = metadataList.get(1);
410 String content1 = contentHandlers.get(1).toString();
411 assertEquals("embeddedAuthor", m1.get(TikaCoreProperties.CREATOR));
412 assertContains("some_embedded_content", content1);
413 assertEquals("/embed1.xml", m1.get(RecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH));
414 }
415
416
417 @Test
418 public void testRPWWithNonSerializableContentHandler() throws Exception {
419 Parser parser = new AutoDetectParser();
420 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser);
421 RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
422 new NonSerializableHandlerFactory());
423 ForkParser fork = new ForkParser(ForkParserTest.class.getClassLoader(), wrapper);
424 Metadata metadata = new Metadata();
425 ParseContext context = new ParseContext();
426 try (InputStream is = getClass().getResourceAsStream("/test-documents/embedded_then_npe.xml")) {
427 fork.parse(is, handler, metadata, context);
428 fail();
429 } catch (TikaException e) {
430 assertTrue(e.getCause() instanceof NullPointerException);
431 assertContains("another", e.getCause().getMessage());
432 } finally {
433 fork.close();
434 }
435 List<Metadata> metadataList = handler.getMetadataList();
436 Metadata m0 = metadataList.get(0);
437 assertEquals("Nikolai Lobachevsky", m0.get(TikaCoreProperties.CREATOR));
438 assertContains("main_content", m0.get(RecursiveParserWrapperHandler.TIKA_CONTENT));
439 assertContains("embed1.xml", m0.get(RecursiveParserWrapperHandler.TIKA_CONTENT));
440
441 Metadata m1 = metadataList.get(1);
442 assertEquals("embeddedAuthor", m1.get(TikaCoreProperties.CREATOR));
443 assertContains("some_embedded_content", m1.get(RecursiveParserWrapperHandler.TIKA_CONTENT));
444 assertEquals("/embed1.xml", m1.get(RecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH));
445 }
446
447
448 //use this to test that the wrapper handler is acted upon by the server but not proxied back
449 private static class ToFileHandler extends AbstractRecursiveParserWrapperHandler {
450
451 //this needs to be a file because a File is serializable
452 private final File file;
453 private OutputStream os;
454
455 public ToFileHandler(ContentHandlerFactory contentHandlerFactory, File file) {
456 super(contentHandlerFactory);
457 this.file = file;
458 }
459
460 @Override
461 public void startDocument() throws SAXException {
462 try {
463 os = Files.newOutputStream(file.toPath());
464 } catch (IOException e) {
465 throw new SAXException(e);
466 }
467 }
468
469 @Override
470 public void endDocument() throws SAXException {
471 try {
472 os.flush();
473 os.close();
474 } catch (IOException e) {
475 throw new SAXException(e);
476 }
477 }
478
479 @Override
480 public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException {
481 try {
482 byte[] bytes = toString(contentHandler, metadata);
483 os.write(bytes, 0, bytes.length);
484 } catch (IOException e) {
485 throw new SAXException(e);
486 }
487 }
488 @Override
489 public void endDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException {
490 try {
491 byte[] bytes = toString(contentHandler, metadata);
492 os.write(bytes, 0, bytes.length);
493 } catch (IOException e) {
494 throw new SAXException(e);
495 }
496 }
497
498 private byte[] toString(ContentHandler contentHandler, Metadata metadata) {
499 StringBuilder sb = new StringBuilder();
500 for (String n : metadata.names()) {
501 for (String v : metadata.getValues(n)) {
502 sb.append(n).append(" : ").append(v).append("\n");
503 }
504 }
505 if (!contentHandler.getClass().equals(DefaultHandler.class)) {
506 sb.append("\n");
507 sb.append("CONTENT: " + contentHandler.toString());
508 sb.append("\n\n");
509 }
510 return sb.toString().getBytes(StandardCharsets.UTF_8);
511 }
512 }
513
514 private static class SBContentHandler extends DefaultHandler implements Serializable {
515 StringBuilder sb = new StringBuilder();
516
517 @Override
518 public void characters(char ch[], int start, int length)
519 throws SAXException {
520 sb.append(ch, start, length);
521 sb.append(" ");
522 }
523
524 @Override
525 public String toString() {
526 return sb.toString();
527 }
528 }
529
530 private static class SBContentHandlerFactory implements ContentHandlerFactory {
531
532 @Override
533 public ContentHandler getNewContentHandler() {
534 return new SBContentHandler();
535 }
536
537 @Override
538 public ContentHandler getNewContentHandler(OutputStream os, String encoding) throws UnsupportedEncodingException {
539 throw new IllegalArgumentException("can't use this option in this test class");
540 }
541
542 @Override
543 public ContentHandler getNewContentHandler(OutputStream os, Charset charset) {
544 throw new IllegalArgumentException("can't use this option in this test class");
545 }
546 }
547
548 private static class NonSerializableHandlerFactory implements ContentHandlerFactory {
549 @Override
550 public ContentHandler getNewContentHandler() {
551 return new LyingNonSerializableContentHandler();
552 }
553
554 @Override
555 public ContentHandler getNewContentHandler(OutputStream os, String encoding) throws UnsupportedEncodingException {
556 throw new IllegalArgumentException("can't use this option in this test class");
557 }
558
559 @Override
560 public ContentHandler getNewContentHandler(OutputStream os, Charset charset) {
561 throw new IllegalArgumentException("can't use this option in this test class");
562 }
563 }
564
565 private static class LyingNonSerializableContentHandler
566 extends DefaultHandler implements Serializable {
567 //StringWriter makes this class not actually Serializable
568 //as is.
569 StringWriter writer = new StringWriter();
570
571 @Override
572 public void characters(char ch[], int start, int length)
573 throws SAXException {
574 writer.write(ch, start, length);
575 }
576
577 @Override
578 public String toString() {
579 return writer.toString();
580 }
581 }
582
583 //use this to test that a handler that extends RecursiveParserWrapperHandler
584 //does have both contenthandlers and metadata objects proxied back from the
585 //server.
586 private static class BufferingHandler extends RecursiveParserWrapperHandler {
587 List<ContentHandler> contentHandlers = new ArrayList<>();
588
589 public BufferingHandler(ContentHandlerFactory contentHandlerFactory) {
590 super(contentHandlerFactory);
591 }
592
593
594 @Override
595 public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException {
596 contentHandlers.add(contentHandler);
597 metadataList.add(metadata);
598 }
599
600 @Override
601 public void endDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException {
602 contentHandlers.add(0, contentHandler);
603 metadataList.add(0, metadata);
604 }
605
606 public List<ContentHandler> getContentHandlers() {
607 return contentHandlers;
608 }
609
610 @Override
611 public List<Metadata> getMetadataList() {
612 return metadataList;
613 }
614
615 }
220616 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.fork;
17
18 import org.apache.tika.TikaTest;
19 import org.apache.tika.exception.TikaException;
20 import org.apache.tika.io.IOUtils;
21 import org.apache.tika.metadata.Metadata;
22 import org.apache.tika.metadata.TikaCoreProperties;
23 import org.apache.tika.parser.AutoDetectParserFactory;
24 import org.apache.tika.parser.ParseContext;
25 import org.apache.tika.sax.ToXMLContentHandler;
26 import org.junit.AfterClass;
27 import org.junit.BeforeClass;
28 import org.junit.Test;
29 import org.xml.sax.ContentHandler;
30 import org.xml.sax.SAXException;
31
32 import java.io.File;
33 import java.io.IOException;
34 import java.io.InputStream;
35 import java.io.OutputStream;
36 import java.net.URL;
37 import java.nio.charset.StandardCharsets;
38 import java.nio.file.Files;
39 import java.nio.file.Path;
40 import java.util.ArrayList;
41 import java.util.Collections;
42 import java.util.Enumeration;
43 import java.util.HashMap;
44 import java.util.List;
45 import java.util.Map;
46 import java.util.jar.JarEntry;
47 import java.util.jar.JarOutputStream;
48
49 import static org.junit.Assert.assertEquals;
50
51 public class ForkParserTikaBinTest extends TikaTest {
52 private static Path JAR_DIR;
53 private static final String JAR_FILE_NAME = "mock-tika-app.jar";
54 private static Path JAR_FILE;
55
56 @SuppressWarnings("unchecked")
57 private static final Map<String, String> EMPTY_MAP = Collections.EMPTY_MAP;
58
59 @BeforeClass
60 public static void bootstrapJar() throws Exception {
61 JAR_DIR = Files.createTempDirectory("tika-fork-tikabin-");
62 JAR_FILE = JAR_DIR.resolve(JAR_FILE_NAME);
63
64 try (JarOutputStream jarOs = new JarOutputStream(Files.newOutputStream(JAR_FILE))) {
65 ClassLoader loader = ForkServer.class.getClassLoader();
66 for (Class<?> klass : getClasses("org.apache.tika")) {
67 String path = klass.getName().replace('.', '/') + ".class";
68 try (InputStream input = loader.getResourceAsStream(path)) {
69 jarOs.putNextEntry(new JarEntry(path));
70 IOUtils.copy(input, jarOs);
71 }
72 }
73 try (InputStream input = ForkParserTikaBinTest.class.getResourceAsStream("/org/apache/tika/config/TIKA-2653-vowel-parser-ae.xml")) {
74 jarOs.putNextEntry(new JarEntry("org/apache/tika/parser/TIKA-2653-vowel-parser-ae.xml"));
75 IOUtils.copy(input, jarOs);
76 }
77 try (InputStream input = ForkParserTikaBinTest.class.getResourceAsStream("/org/apache/tika/mime/tika-mimetypes.xml")) {
78 jarOs.putNextEntry(new JarEntry("org/apache/tika/mime/tika-mimetypes.xml"));
79 IOUtils.copy(input, jarOs);
80 }
81 try (InputStream input = ForkParserTikaBinTest.class.getResourceAsStream("/org/apache/tika/mime/custom-mimetypes.xml")) {
82 jarOs.putNextEntry(new JarEntry("org/apache/tika/mime/custom-mimetypes.xml"));
83 IOUtils.copy(input, jarOs);
84 }
85
86 jarOs.putNextEntry(new JarEntry("META-INF/services/org.apache.tika.parser.Parser"));
87 jarOs.write("org.apache.tika.parser.mock.VowelParser\n".getBytes(StandardCharsets.UTF_8));
88 }
89
90 Path tikaConfigVowelParser = JAR_DIR.resolve("TIKA_2653-iou.xml");
91 try (InputStream is = ForkServer.class.getResourceAsStream("/org/apache/tika/config/TIKA-2653-vowel-parser-iou.xml");
92 OutputStream os = Files.newOutputStream(tikaConfigVowelParser)) {
93 IOUtils.copy(is, os);
94 }
95 }
96
97
98 @AfterClass
99 public static void tearDown() throws Exception {
100
101 Files.delete(JAR_DIR.resolve("TIKA_2653-iou.xml"));
102 Files.delete(JAR_FILE);
103 Files.delete(JAR_DIR);
104 }
105
106 @Test
107 public void testExplicitParserFactory() throws Exception {
108 XMLResult xmlResult = getXML(new ParserFactoryFactory("org.apache.tika.parser.mock.MockParserFactory",
109 EMPTY_MAP));
110 assertContains("hello world!", xmlResult.xml);
111 assertEquals("Nikolai Lobachevsky", xmlResult.metadata.get(TikaCoreProperties.CREATOR));
112 }
113
114 @Test
115 public void testVowelParserAsDefault() throws Exception {
116 ParserFactoryFactory pff = new ParserFactoryFactory(
117 "org.apache.tika.parser.AutoDetectParserFactory",
118 EMPTY_MAP);
119 XMLResult xmlResult = getXML(pff);
120 assertContains("eooeuiooueoeeao", xmlResult.xml);
121 assertEquals("Nikolai Lobachevsky", xmlResult.metadata.get(TikaCoreProperties.CREATOR));
122 }
123
124 @Test
125 public void testVowelParserInClassPath() throws Exception {
126 Map<String, String> args = new HashMap<>();
127 args.put(AutoDetectParserFactory.TIKA_CONFIG_PATH, "TIKA-2653-vowel-parser-ae.xml");
128 ParserFactoryFactory pff = new ParserFactoryFactory(
129 "org.apache.tika.parser.AutoDetectParserFactory",
130 args);
131 XMLResult xmlResult = getXML(pff);
132 assertContains("eeeeea", xmlResult.xml);
133 assertEquals("Nikolai Lobachevsky", xmlResult.metadata.get(TikaCoreProperties.CREATOR));
134 }
135
136 @Test
137 public void testVowelParserFromDirectory() throws Exception {
138 Map<String, String> args = new HashMap<>();
139 args.put(AutoDetectParserFactory.TIKA_CONFIG_PATH, JAR_DIR.resolve("TIKA_2653-iou.xml").toAbsolutePath().toString());
140 ParserFactoryFactory pff = new ParserFactoryFactory(
141 "org.apache.tika.parser.AutoDetectParserFactory",
142 args);
143 XMLResult xmlResult = getXML(pff);
144 assertContains("oouioouoo", xmlResult.xml);
145 assertEquals("Nikolai Lobachevsky", xmlResult.metadata.get(TikaCoreProperties.CREATOR));
146 }
147
148 @Test
149 public void testPFFWithClassLoaderFromParentProcess() throws Exception {
150 //The UpperCasingContentHandler is excluded from the test jar bootstrapped in @BeforeClass.
151 //This tests that the content handler was loaded from the parent process.
152
153 ParserFactoryFactory pff = new ParserFactoryFactory(
154 "org.apache.tika.parser.AutoDetectParserFactory",
155 EMPTY_MAP);
156 XMLResult xmlResult = getXML(pff, this.getClass().getClassLoader(), new UpperCasingContentHandler());
157 assertContains("EOOEUIOOUEOEEAO", xmlResult.xml);
158 assertEquals("Nikolai Lobachevsky", xmlResult.metadata.get(TikaCoreProperties.CREATOR));
159
160 }
161
162 private XMLResult getXML(ParserFactoryFactory pff) throws TikaException, SAXException, IOException {
163 return getXML(pff, null, null);
164 }
165
166 private XMLResult getXML(ParserFactoryFactory pff, ClassLoader classloader, ContentHandler contentHandler) throws TikaException, SAXException, IOException {
167
168 List<String> java = new ArrayList<>();
169 java.add("java");
170 ForkParser parser = null;
171 if (classloader != null) {
172 parser = new ForkParser(JAR_DIR, pff, classloader);
173 } else {
174 parser = new ForkParser(JAR_DIR, pff);
175 }
176 parser.setJavaCommand(java);
177 parser.setServerPulseMillis(10000);
178
179 ContentHandler handler = (contentHandler == null) ? new ToXMLContentHandler() : contentHandler;
180 Metadata m = new Metadata();
181 try (InputStream is = getClass().getResourceAsStream("/test-documents/example.xml")) {
182 parser.parse(is, handler, m, new ParseContext());
183 } finally {
184 parser.close();
185 }
186 return new XMLResult(handler.toString(), m);
187 }
188
189 private static List<Class> getClasses(String packageName)
190 throws ClassNotFoundException, IOException {
191 ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
192 String path = packageName.replace('.', '/');
193 Enumeration<URL> resources = classLoader.getResources(path);
194 List<File> dirs = new ArrayList<>();
195 while (resources.hasMoreElements()) {
196 URL resource = resources.nextElement();
197 dirs.add(new File(resource.getFile().replaceAll("%20", " ")));
198 }
199 ArrayList<Class> classes = new ArrayList<>();
200 for (File directory : dirs) {
201 classes.addAll(findClasses(directory, packageName));
202 }
203 return classes;
204 }
205
206 private static List<Class> findClasses(File dir, String packageName) throws ClassNotFoundException {
207 List<Class> classes = new ArrayList<>();
208 if (!dir.exists()) {
209 return classes;
210 }
211 File[] files = dir.listFiles();
212 for (File file : files) {
213 if (file.isDirectory()) {
214 classes.addAll(findClasses(file, packageName + "." + file.getName()));
215 } else if (file.getName().endsWith(".class")) {
216 //exclude TypeDetectionBenchmark because it is not serializable
217 //exclude UpperCasingContentHandler because we want to test that
218 //we can serialize it from the parent process into the child process
219 if (!file.getName().contains("TypeDetectionBenchmark") &&
220 !file.getName().contains("UpperCasingContentHandler")) {
221 classes.add(Class.forName(packageName + '.' + file.getName().substring(0, file.getName().length() - 6)));
222 }
223 }
224 }
225 return classes;
226 }
227 }
2121 import java.util.Set;
2222
2323 import org.apache.tika.exception.TikaException;
24 import org.apache.tika.fork.unusedpackage.ClassInUnusedPackage;
2425 import org.apache.tika.metadata.Metadata;
2526 import org.apache.tika.mime.MediaType;
2627 import org.apache.tika.parser.AbstractParser;
2728 import org.apache.tika.parser.ParseContext;
2829 import org.apache.tika.sax.XHTMLContentHandler;
30 import org.junit.Assert;
2931 import org.xml.sax.ContentHandler;
3032 import org.xml.sax.SAXException;
3133
5355 xhtml.endDocument();
5456 }
5557
58 static class ForkTestParserAccessingPackage extends ForkTestParser {
59 @Override
60 public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
61 ParseContext context) throws IOException, SAXException, TikaException {
62 Assert.assertNotNull(ClassInUnusedPackage.class.getPackage());
63 super.parse(stream, handler, metadata, context);
64 }
65 }
5666 }
0 /**
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 * <p>
8 * http://www.apache.org/licenses/LICENSE-2.0
9 * <p>
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.fork;
17
18 import org.xml.sax.SAXException;
19 import org.xml.sax.helpers.DefaultHandler;
20
21 import java.util.Locale;
22
23 public class UpperCasingContentHandler extends DefaultHandler {
24 StringBuilder sb = new StringBuilder();
25
26 @Override
27 public void characters(char[] ch, int start, int length)
28 throws SAXException {
29 String chars = new String(ch, start, length);
30 sb.append(chars.toUpperCase(Locale.US));
31 }
32
33 @Override
34 public String toString() {
35 return sb.toString();
36 }
37
38 }
0 /**
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 * <p>
8 * http://www.apache.org/licenses/LICENSE-2.0
9 * <p>
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.fork.unusedpackage;
17
18 public class ClassInUnusedPackage {
19 }
2222
2323 import org.junit.Test;
2424 import static org.junit.Assert.assertEquals;
25 import static org.junit.Assert.assertTrue;
2526
2627 import org.xml.sax.Attributes;
2728 import org.xml.sax.SAXException;
8889 assertEquals(1, reader.ignorePatterns.size());
8990 assertEquals(another.toString()+">>*"+hello.getExtension(),
9091 reader.ignorePatterns.get(0));
92 assertTrue("Server-side script type not detected", another.isInterpreted());
9193
9294 //System.out.println( mimeTypes.getMediaTypeRegistry().getTypes() );
9395 }
2828
2929 import org.apache.tika.config.TikaConfig;
3030 import org.apache.tika.metadata.Metadata;
31 import org.apache.tika.metadata.TikaCoreProperties;
3132 import org.junit.Before;
3233 import org.junit.Test;
3334
8182 }
8283
8384 @Test
86 testUrlWithoutContent("text/html", "test.html");
87 testUrlWithoutContent("text/html", "http://test.com/test.html");
88 testUrlWithoutContent("text/plain", "http://test.com/test.txt");
89
90 // If the url contains a filename referencing a server-side scripting language,
91 // the extension gives us no clue about the actual mime type of the response
92 testUrlWithoutContent("application/octet-stream", "http://test.com/test.php");
93 testUrlWithoutContent("application/octet-stream", "http://test.com/test.cgi");
94 testUrlWithoutContent("application/octet-stream", "http://test.com/test.jsp");
95 // But in case the protocol is not http or https, the script is probably not interpreted
96 testUrlWithoutContent("text/x-php", "ftp://test.com/test.php");
97 }
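
The helper behind these assertions (testUrlWithoutContent, further down) passes a null stream so that detection has to fall back on the resource name alone. A minimal sketch of that call pattern, assuming the default registry (fragment; needs org.apache.tika.metadata.Metadata, org.apache.tika.mime.MimeTypes and org.apache.tika.mime.MediaType imports, and detect() declares IOException):

    Metadata metadata = new Metadata();
    metadata.set(Metadata.RESOURCE_NAME_KEY, "http://test.com/test.php");
    // a null stream forces name-only detection; .php over http(s) yields
    // application/octet-stream because the response type cannot be known
    MediaType type = MimeTypes.getDefaultMimeTypes().detect(null, metadata);
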
98
99 @Test
84100 public void testByteOrderMark() throws Exception {
85101 assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect(
86102 new ByteArrayInputStream("\ufefftest".getBytes(UTF_16LE)),
132148 private void testUrlOnly(String expected, String url) throws IOException{
133149 InputStream in = new URL(url).openStream();
134150 testStream(expected, url, in);
151 }
152
153 private void testUrlWithoutContent(String expected, String url) throws IOException {
154 Metadata metadata = new Metadata();
155 metadata.set(Metadata.RESOURCE_NAME_KEY, url);
156 String mime = this.mimeTypes.detect(null, metadata).toString();
157 assertEquals(url + " is not properly detected using only resource name", expected, mime);
135158 }
136159
137160 private void testUrl(String expected, String url, String file) throws IOException{
2424 import java.io.ByteArrayInputStream;
2525 import java.lang.reflect.Field;
2626 import java.util.ArrayList;
27 import java.util.ConcurrentModificationException;
2728 import java.util.List;
2829 import java.util.Set;
30 import java.util.concurrent.Executors;
2931
3032 import org.apache.tika.config.TikaConfig;
3133 import org.apache.tika.metadata.Metadata;
4648 * update it to match the new state of the file!
4749 */
4850 public class MimeTypesReaderTest {
51
52 static volatile boolean stop = false;
4953
5054 private MimeTypes mimeTypes;
5155 private List<Magic> magics;
278282 assertEquals(name, mimeType.toString());
279283 assertEquals(".ditamap", mimeType.getExtension());
280284 }
285
286 @Test
287 public void testMultiThreaded() throws Exception {
288 MimeTypes mimeTypes = MimeTypes.getDefaultMimeTypes();
289 Executors.newSingleThreadExecutor().execute(()-> {
290 try {
291 for (int i = 0; i < 500 && !stop; i++) {
292 mimeTypes.forName("abc"+i+"/abc");
293 }
294 } catch (MimeTypeException e) {
295 e.printStackTrace();
296 }
297 });
298
299 for (int i = 0; i < 500 && !stop; i++) {
300 mimeTypes.getMediaTypeRegistry().getAliases(MediaType.APPLICATION_ZIP);
301 }
302 }
281303 }
6767
6868 @Override
6969 public Set<MediaType> getSupportedTypes(ParseContext context) {
70 Set<MediaType> types = new HashSet<MediaType>();
70 Set<MediaType> types = new HashSet<>();
7171 MediaType type = MediaType.application("mock+xml");
7272 types.add(type);
7373 return types;
7777 public void parse(InputStream stream, ContentHandler handler,
7878 Metadata metadata, ParseContext context) throws IOException,
7979 SAXException, TikaException {
80 if (Thread.currentThread().isInterrupted()) {
81 throw new TikaException("interrupted", new InterruptedException());
82 }
8083 Document doc = null;
8184 try {
8285 DocumentBuilder docBuilder = context.getDocumentBuilder();
120123 handleEmbedded(action, xhtml, context);
121124 } else if ("throwIllegalChars".equals(name)) {
122125 throwIllegalChars();
126 } else if ("system_exit".equals(name)) {
127 System.exit(1);
128 } else if ("thread_interrupt".equals(name)) {
129 Thread.currentThread().interrupt();
123130 } else {
124131 throw new IllegalArgumentException("Didn't recognize mock action: "+name);
125132 }
255262 }
256263 }
257264
258 private void write(Node action, XHTMLContentHandler xhtml) throws SAXException {
265 protected void write(Node action, XHTMLContentHandler xhtml) throws SAXException {
259266 NamedNodeMap attrs = action.getAttributes();
260267 Node eNode = attrs.getNamedItem("element");
261268 String elementType = "p";
304311 }
305312
306313 private void kabOOM() {
307 List<int[]> ints = new ArrayList<int[]>();
314 List<int[]> ints = new ArrayList<>();
308315
309316 while (true) {
310317 int[] intArr = new int[32000];
315322 private void hangHeavy(long maxMillis, long pulseCheckMillis, boolean interruptible) {
316323 //do some heavy computation and occasionally check for
317324 //whether time has exceeded maxMillis (see TIKA-1132 for inspiration)
318 //or whether the thread was interrupted
325 //or whether the thread was interrupted.
326 //By creating a new Date in the inner loop, we're also most likely
327 //triggering the gc, intentionally.
319328 long start = new Date().getTime();
320 int lastChecked = 0;
329 long lastChecked = start;
321330 while (true) {
322331 for (int i = 1; i < Integer.MAX_VALUE; i++) {
323332 for (int j = 1; j < Integer.MAX_VALUE; j++) {
324333 double div = (double) i / (double) j;
325 lastChecked++;
326 if (lastChecked > pulseCheckMillis) {
327 lastChecked = 0;
334
335 long elapsedSinceLastCheck = new Date().getTime()-lastChecked;
336 if (elapsedSinceLastCheck > pulseCheckMillis) {
337 lastChecked = new Date().getTime();
328338 if (interruptible && Thread.currentThread().isInterrupted()) {
329339 return;
330340 }
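
MockParser is driven by small XML action documents whose element names match the branches above (write, system_exit, thread_interrupt, ...). A hedged sketch of feeding one such document through the parser; the <mock> root and the write/element handling follow the code above, while the exact test-document conventions are assumptions (fragment; needs the tika parser/sax imports plus java.io.ByteArrayInputStream and java.nio.charset.StandardCharsets):

    // illustrative "mock" action document with a single write action
    String mock = "<mock><write element=\"p\">hello world</write></mock>";
    Parser parser = new MockParser();
    ContentHandler handler = new BodyContentHandler();
    parser.parse(new ByteArrayInputStream(mock.getBytes(StandardCharsets.UTF_8)),
            handler, new Metadata(), new ParseContext());
    System.out.println(handler.toString()); // "hello world"
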
0 package org.apache.tika.parser.mock;
1
2 /*
3 * Licensed to the Apache Software Foundation (ASF) under one or more
4 * contributor license agreements. See the NOTICE file distributed with
5 * this work for additional information regarding copyright ownership.
6 * The ASF licenses this file to You under the Apache License, Version 2.0
7 * (the "License"); you may not use this file except in compliance with
8 * the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
19
20 import org.apache.tika.exception.TikaException;
21 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
22 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
23 import org.apache.tika.io.IOExceptionWithCause;
24 import org.apache.tika.metadata.Metadata;
25 import org.apache.tika.metadata.TikaCoreProperties;
26 import org.apache.tika.mime.MediaType;
27 import org.apache.tika.parser.AbstractParser;
28 import org.apache.tika.parser.ParseContext;
29 import org.apache.tika.parser.Parser;
30 import org.apache.tika.parser.ParserFactory;
31 import org.apache.tika.sax.EmbeddedContentHandler;
32 import org.apache.tika.sax.XHTMLContentHandler;
33 import org.w3c.dom.Document;
34 import org.w3c.dom.NamedNodeMap;
35 import org.w3c.dom.Node;
36 import org.w3c.dom.NodeList;
37 import org.xml.sax.ContentHandler;
38 import org.xml.sax.SAXException;
39
40 import javax.xml.parsers.DocumentBuilder;
41 import java.io.ByteArrayInputStream;
42 import java.io.IOException;
43 import java.io.InputStream;
44 import java.lang.reflect.Constructor;
45 import java.util.ArrayList;
46 import java.util.Date;
47 import java.util.HashSet;
48 import java.util.List;
49 import java.util.Map;
50 import java.util.Set;
51
52 import static java.nio.charset.StandardCharsets.UTF_8;
53
54
55 public class MockParserFactory extends ParserFactory {
56
57 public MockParserFactory(Map<String, String> args) {
58 super(args);
59 }
60
61 @Override
62 public Parser build() throws IOException, TikaException {
63 return new MockParser();
64 }
65 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.parser.mock;
18
19
20
21
22 import org.apache.tika.config.Field;
23 import org.apache.tika.sax.XHTMLContentHandler;
24 import org.w3c.dom.NamedNodeMap;
25 import org.w3c.dom.Node;
26 import org.xml.sax.SAXException;
27
28 import java.util.regex.Matcher;
29 import java.util.regex.Pattern;
30
31 /**
32 * Only parses the vowels specified in the "vowel" field.
33 */
34 public class VowelParser extends MockParser {
35
36 private static final long serialVersionUID = 1L;
37
38 @Field
39 private String vowel = "aeiou";
40
41 protected void write(Node action, XHTMLContentHandler xhtml) throws SAXException {
42 NamedNodeMap attrs = action.getAttributes();
43 Node eNode = attrs.getNamedItem("element");
44 String elementType = "p";
45 if (eNode != null) {
46 elementType = eNode.getTextContent();
47 }
48 String text = action.getTextContent();
49 StringBuilder sb = new StringBuilder();
50 Matcher m = Pattern.compile("(?i)(["+vowel+"])").matcher(text);
51 while (m.find()) {
52 sb.append(m.group(1));
53 }
54 xhtml.startElement(elementType);
55 xhtml.characters(sb.toString());
56 xhtml.endElement(elementType);
57 }
58
59 }
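
In isolation, the regex above keeps only the (case-insensitive) vowels from the action's text, in input order. For example, with the default vowel set:

    Matcher m = Pattern.compile("(?i)([aeiou])").matcher("Hello World");
    StringBuilder sb = new StringBuilder();
    while (m.find()) {
        sb.append(m.group(1));
    }
    System.out.println(sb); // prints "eoo" (case is preserved from the input)
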
1919 import static org.junit.Assert.assertEquals;
2020
2121 import java.io.ByteArrayOutputStream;
22 import java.io.InputStream;
2223 import java.io.OutputStream;
2324
25 import org.apache.tika.TikaTest;
2426 import org.apache.tika.metadata.Metadata;
27 import org.apache.tika.parser.AutoDetectParser;
28 import org.apache.tika.parser.ParseContext;
29 import org.apache.tika.parser.Parser;
30 import org.apache.tika.parser.mock.MockParser;
2531 import org.junit.Test;
2632
2733 /**
2834 * Test cases for the {@link BodyContentHandler} class.
2935 */
30 public class BodyContentHandlerTest {
36 public class BodyContentHandlerTest extends TikaTest {
3137
3238 /**
3339 * Test that the conversion to an {@link OutputStream} doesn't leave
4854 assertEquals("Test text\n", buffer.toString(UTF_8.name()));
4955 }
5056
57 @Test
58 public void testLimit() throws Exception {
59 //TIKA-2668 - java 11-ea
60 Parser p = new MockParser();
61 WriteOutContentHandler handler = new WriteOutContentHandler(15);
62 Metadata metadata = new Metadata();
63 ParseContext parseContext = new ParseContext();
64 Parser[] parsers = new Parser[1];
65 parsers[0] = p;
66 Parser autoDetectParser = new AutoDetectParser(parsers);
67 try (InputStream is = getResourceAsStream("/test-documents/example.xml")) {
68 autoDetectParser.parse(is, handler, metadata, parseContext);
69 } catch (Exception e) {
70 tryToFindIllegalStateException(e);
71 }
72 assertEquals("hello wo", handler.toString().trim());
73 }
74
75 private void tryToFindIllegalStateException(Throwable e) throws Exception {
76 if (e instanceof IllegalStateException) {
77 throw (Exception)e;
78 }
79 if (e.getCause() != null) {
80 tryToFindIllegalStateException(e.getCause());
81 }
82 }
5183 }
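
testLimit exercises the standard Tika 1.x write-limit pattern: WriteOutContentHandler aborts the parse once the limit is hit, and the caller asks the handler whether a caught exception was merely the limit being reached. A minimal sketch of that pattern (fragment; docBytes is an illustrative stand-in for real input bytes):

    WriteOutContentHandler handler = new WriteOutContentHandler(15); // keep first ~15 chars
    try (InputStream is = new ByteArrayInputStream(docBytes)) {
        new AutoDetectParser().parse(is, new BodyContentHandler(handler),
                new Metadata(), new ParseContext());
    } catch (SAXException e) {
        if (!handler.isWriteLimitReached(e)) {
            throw e; // a genuine parse failure, not just truncation
        }
    }
    String truncated = handler.toString();
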
2323 <parent>
2424 <groupId>org.apache.tika</groupId>
2525 <artifactId>tika-parent</artifactId>
26 <version>1.18</version>
26 <version>1.19</version>
2727 <relativePath>../tika-parent/pom.xml</relativePath>
2828 </parent>
2929
3535
3636 <properties>
3737 <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
38 <dl4j.version>0.8.0</dl4j.version>
39 <dl4j.model.version>0.8.0-2</dl4j.model.version>
38 <dl4j.version>1.0.0-beta2</dl4j.version>
4039 </properties>
4140
4241 <dependencies>
5049 <groupId>joda-time</groupId>
5150 <artifactId>joda-time</artifactId>
5251 </exclusion>
52 <exclusion>
53 <groupId>org.apache.commons</groupId>
54 <artifactId>commons-compress</artifactId>
55 </exclusion>
56 <exclusion>
57 <groupId>com.google.guava</groupId>
58 <artifactId>guava</artifactId>
59 </exclusion>
60 <exclusion>
61 <groupId>commons-io</groupId>
62 <artifactId>commons-io</artifactId>
63 </exclusion>
5364 </exclusions>
5465 </dependency>
5566 <dependency>
5970 </dependency>
6071 <dependency>
6172 <groupId>org.deeplearning4j</groupId>
62 <artifactId>deeplearning4j-keras</artifactId>
63 <version>${dl4j.model.version}</version>
64 <!-- exclude this because of non-ASF friendly "do no evil" license.
65 Because this relies on tika-parsers, that should have ted-dunning's drop-in
66 -->
67 <exclusions>
68 <exclusion>
69 <groupId>org.json</groupId>
70 <artifactId>json</artifactId>
71 </exclusion>
72 <exclusion>
73 <groupId>com.google.guava</groupId>
74 <artifactId>guava</artifactId>
75 </exclusion>
73 <artifactId>deeplearning4j-zoo</artifactId>
74 <version>${dl4j.version}</version>
75 <exclusions>
7676 <exclusion>
7777 <groupId>org.deeplearning4j</groupId>
78 <artifactId>deeplearning4j-modelimport</artifactId>
79 </exclusion>
80 <exclusion>
81 <groupId>org.apache.commons</groupId>
82 <artifactId>commons-compress</artifactId>
83 </exclusion>
84 <exclusion>
85 <groupId>org.apache.commons</groupId>
86 <artifactId>commons-math3</artifactId>
87 </exclusion>
88 <exclusion>
89 <groupId>commons-io</groupId>
90 <artifactId>commons-io</artifactId>
78 <artifactId>deeplearning4j-nn</artifactId>
79 </exclusion>
80 <exclusion>
81 <groupId>org.nd4j</groupId>
82 <artifactId>nd4j-api</artifactId>
83 </exclusion>
84 <exclusion>
85 <groupId>org.nd4j</groupId>
86 <artifactId>jackson</artifactId>
87 </exclusion>
88 <exclusion>
89 <groupId>com.google.guava</groupId>
90 <artifactId>guava</artifactId>
9191 </exclusion>
9292 </exclusions>
9393 </dependency>
9999 <dependency>
100100 <groupId>org.deeplearning4j</groupId>
101101 <artifactId>deeplearning4j-modelimport</artifactId>
102 <version>${dl4j.model.version}</version>
103 <exclusions>
104 <exclusion>
105 <groupId>org.deeplearning4j</groupId>
106 <artifactId>deeplearning4j-keras</artifactId>
107 </exclusion>
102 <version>${dl4j.version}</version>
103 <exclusions>
104 <exclusion>
105 <groupId>org.deeplearning4j</groupId>
106 <artifactId>deeplearning4j-keras</artifactId>
107 </exclusion>
108 <exclusion>
109 <groupId>org.nd4j</groupId>
110 <artifactId>nd4j-api</artifactId>
111 </exclusion>
112 <exclusion>
113 <groupId>org.nd4j</groupId>
114 <artifactId>jackson</artifactId>
115 </exclusion>
108116 <exclusion>
109117 <groupId>org.bytedeco</groupId>
110118 <artifactId>javacpp</artifactId>
116124 </exclusions>
117125 </dependency>
118126 <dependency>
127 <groupId>org.deeplearning4j</groupId>
128 <artifactId>deeplearning4j-nn</artifactId>
129 <version>${dl4j.version}</version>
130 <exclusions>
131 <exclusion>
132 <groupId>org.nd4j</groupId>
133 <artifactId>nd4j-api</artifactId>
134 </exclusion>
135 <exclusion>
136 <groupId>org.nd4j</groupId>
137 <artifactId>jackson</artifactId>
138 </exclusion>
139 <exclusion>
140 <groupId>org.apache.commons</groupId>
141 <artifactId>commons-compress</artifactId>
142 </exclusion>
143 <exclusion>
144 <groupId>commons-codec</groupId>
145 <artifactId>commons-codec</artifactId>
146 </exclusion>
147 <exclusion>
148 <groupId>org.apache.commons</groupId>
149 <artifactId>commons-math3</artifactId>
150 </exclusion>
151 <exclusion>
152 <groupId>com.google.guava</groupId>
153 <artifactId>guava</artifactId>
154 </exclusion>
155 <exclusion>
156 <groupId>org.datavec</groupId>
157 <artifactId>datavec-data-image</artifactId>
158 </exclusion>
159 <exclusion>
160 <groupId>commons-io</groupId>
161 <artifactId>commons-io</artifactId>
162 </exclusion>
163 </exclusions>
164 </dependency>
165 <dependency>
119166 <groupId>org.datavec</groupId>
120167 <artifactId>datavec-data-image</artifactId>
121168 <version>${dl4j.version}</version>
122169 <exclusions>
123170 <exclusion>
171 <groupId>org.nd4j</groupId>
172 <artifactId>nd4j-api</artifactId>
173 </exclusion>
174 <exclusion>
124175 <groupId>com.google.guava</groupId>
125176 <artifactId>guava</artifactId>
126177 </exclusion>
143194 </exclusions>
144195 </dependency>
145196 <dependency>
197 <groupId>org.datavec</groupId>
198 <artifactId>datavec-api</artifactId>
199 <version>${dl4j.version}</version>
200 <exclusions>
201 <exclusion>
202 <groupId>org.apache.commons</groupId>
203 <artifactId>commons-compress</artifactId>
204 </exclusion>
205 <exclusion>
206 <groupId>org.apache.commons</groupId>
207 <artifactId>commons-lang3</artifactId>
208 </exclusion>
209 <exclusion>
210 <groupId>commons-codec</groupId>
211 <artifactId>commons-codec</artifactId>
212 </exclusion>
213 <exclusion>
214 <groupId>joda-time</groupId>
215 <artifactId>joda-time</artifactId>
216 </exclusion>
217 <exclusion>
218 <groupId>org.nd4j</groupId>
219 <artifactId>nd4j-api</artifactId>
220 </exclusion>
221 <exclusion>
222 <groupId>org.nd4j</groupId>
223 <artifactId>jackson</artifactId>
224 </exclusion>
225 <exclusion>
226 <groupId>commons-io</groupId>
227 <artifactId>commons-io</artifactId>
228 </exclusion>
229 <exclusion>
230 <groupId>commons-lang</groupId>
231 <artifactId>commons-lang</artifactId>
232 </exclusion>
233 </exclusions>
234 </dependency>
235 <dependency>
236 <groupId>org.objenesis</groupId>
237 <artifactId>objenesis</artifactId>
238 <version>2.6</version>
239 </dependency>
240 <dependency>
241 <groupId>org.nd4j</groupId>
242 <artifactId>nd4j-api</artifactId>
243 <version>${dl4j.version}</version>
244 <exclusions>
245 <exclusion>
246 <groupId>org.objenesis</groupId>
247 <artifactId>objenesis</artifactId>
248 </exclusion>
249 <exclusion>
250 <groupId>com.google.guava</groupId>
251 <artifactId>guava</artifactId>
252 </exclusion>
253 <exclusion>
254 <groupId>org.apache.commons</groupId>
255 <artifactId>commons-math3</artifactId>
256 </exclusion>
257 </exclusions>
258 </dependency>
259 <dependency>
146260 <groupId>org.nd4j</groupId>
147261 <artifactId>nd4j-native-platform</artifactId>
148262 <version>${dl4j.version}</version>
149263 <exclusions>
150264 <exclusion>
265 <groupId>org.nd4j</groupId>
266 <artifactId>nd4j-api</artifactId>
267 </exclusion>
268 <exclusion>
151269 <groupId>org.bytedeco</groupId>
152270 <artifactId>javacpp</artifactId>
153271 </exclusion>
154272 </exclusions>
155273 </dependency>
156274 <dependency>
157 <groupId>org.bytedeco</groupId>
158 <artifactId>javacpp</artifactId>
159 <version>1.3.2</version>
160 </dependency>
161 <dependency>
162 <groupId>org.apache.commons</groupId>
163 <artifactId>commons-compress</artifactId>
164 <version>${commons.compress.version}</version>
275 <groupId>org.nd4j</groupId>
276 <artifactId>jackson</artifactId>
277 <version>${dl4j.version}</version>
278 <exclusions>
279 <exclusion>
280 <groupId>joda-time</groupId>
281 <artifactId>joda-time</artifactId>
282 </exclusion>
283 <exclusion>
284 <groupId>org.codehaus.woodstox</groupId>
285 <artifactId>stax2-api</artifactId>
286 </exclusion>
287 </exclusions>
288 </dependency>
289 <dependency>
290 <groupId>org.apache.commons</groupId>
291 <artifactId>commons-compress</artifactId>
292 <version>${commons.compress.version}</version>
293 </dependency>
294 <dependency>
295 <groupId>org.projectlombok</groupId>
296 <artifactId>lombok</artifactId>
297 <version>1.16.22</version>
298 </dependency>
299 <dependency>
300 <groupId>com.google.guava</groupId>
301 <artifactId>guava</artifactId>
302 <version>20.0</version>
303 </dependency>
304 <dependency>
305 <groupId>joda-time</groupId>
306 <artifactId>joda-time</artifactId>
307 <version>2.9.2</version>
308 </dependency>
309 <dependency>
310 <groupId>commons-io</groupId>
311 <artifactId>commons-io</artifactId>
312 <version>2.6</version>
165313 </dependency>
166314 </dependencies>
167315
193341 </executions>
194342 </plugin>
195343 <plugin>
344 <groupId>org.apache.maven.plugins</groupId>
345 <artifactId>maven-jar-plugin</artifactId>
346 <configuration>
347 <archive>
348 <manifestEntries>
349 <Automatic-Module-Name>org.apache.tika.dl</Automatic-Module-Name>
350 </manifestEntries>
351 </archive>
352 </configuration>
353 </plugin>
354
355 <plugin>
196356 <groupId>org.apache.rat</groupId>
197357 <artifactId>apache-rat-plugin</artifactId>
198358 <configuration>
199 <excludes>
200 <exclude>src/main/resources/org/apache/tika/dl/imagerec/*.json</exclude>
201 </excludes>
202359 </configuration>
203360 </plugin>
204361
205362 </plugins>
206363 </build>
207 <repositories>
208 <repository>
209 <id>oss-sonatype</id>
210 <name>oss-sonatype</name>
211 <url>https://oss.sonatype.org/content/repositories/snapshots/</url>
212 <snapshots>
213 <enabled>true</enabled>
214 </snapshots>
215 </repository>
216 </repositories>
217364 </project>
1515 */
1616 package org.apache.tika.dl.imagerec;
1717
18 import org.apache.commons.io.FileUtils;
19 import org.apache.tika.config.Field;
20 import org.apache.tika.config.InitializableProblemHandler;
21 import org.apache.tika.config.Param;
22 import org.apache.tika.exception.TikaConfigException;
23 import org.apache.tika.exception.TikaException;
24 import org.apache.tika.io.IOUtils;
25 import org.apache.tika.metadata.Metadata;
26 import org.apache.tika.mime.MediaType;
27 import org.apache.tika.parser.ParseContext;
28 import org.apache.tika.parser.recognition.ObjectRecogniser;
29 import org.apache.tika.parser.recognition.RecognisedObject;
30 import org.datavec.image.loader.NativeImageLoader;
31 import org.deeplearning4j.nn.graph.ComputationGraph;
32 import org.deeplearning4j.nn.modelimport.keras.KerasModel;
33 import org.deeplearning4j.nn.modelimport.keras.exceptions.InvalidKerasConfigurationException;
34 import org.deeplearning4j.nn.modelimport.keras.exceptions.UnsupportedKerasConfigurationException;
35 import org.deeplearning4j.nn.modelimport.keras.utils.KerasModelBuilder;
36 import org.json.simple.JSONArray;
37 import org.json.simple.JSONObject;
38 import org.json.simple.parser.JSONParser;
39 import org.json.simple.parser.ParseException;
40 import org.nd4j.linalg.api.ndarray.INDArray;
41 import org.slf4j.Logger;
42 import org.slf4j.LoggerFactory;
43 import org.xml.sax.ContentHandler;
44 import org.xml.sax.SAXException;
45
1846 import java.io.File;
1947 import java.io.FileInputStream;
2048 import java.io.FileNotFoundException;
3159 import java.util.Map;
3260 import java.util.Set;
3361
34 import org.apache.commons.io.FileUtils;
35 import org.apache.tika.config.Field;
36 import org.apache.tika.config.InitializableProblemHandler;
37 import org.apache.tika.config.Param;
38 import org.apache.tika.exception.TikaConfigException;
39 import org.apache.tika.exception.TikaException;
40 import org.apache.tika.io.IOUtils;
41 import org.apache.tika.metadata.Metadata;
42 import org.apache.tika.mime.MediaType;
43 import org.apache.tika.parser.ParseContext;
44 import org.apache.tika.parser.recognition.ObjectRecogniser;
45 import org.apache.tika.parser.recognition.RecognisedObject;
46 import org.datavec.image.loader.NativeImageLoader;
47 import org.deeplearning4j.nn.graph.ComputationGraph;
48 import org.deeplearning4j.nn.modelimport.keras.InvalidKerasConfigurationException;
49 import org.deeplearning4j.nn.modelimport.keras.KerasModelImport;
50 import org.deeplearning4j.nn.modelimport.keras.UnsupportedKerasConfigurationException;
51 import org.json.simple.JSONArray;
52 import org.json.simple.JSONObject;
53 import org.json.simple.parser.JSONParser;
54 import org.json.simple.parser.ParseException;
55 import org.nd4j.linalg.api.ndarray.INDArray;
56 import org.slf4j.Logger;
57 import org.slf4j.LoggerFactory;
58 import org.xml.sax.ContentHandler;
59 import org.xml.sax.SAXException;
60
6162 /**
6263 * {@link DL4JInceptionV3Net} is an implementation of {@link ObjectRecogniser}.
6364 * This object recogniser is powered by <a href="http://deeplearning4j.org">Deeplearning4j</a>.
6869 * for advanced users who are interested in tweaking the settings, the following fields are configurable:
6970 * <ul>
7071 * <li>{@link #modelWeightsPath}</li>
71 * <li>{@link #modelJsonPath}</li>
7272 * <li>{@link #labelFile}</li>
7373 * <li>{@link #labelLang}</li>
7474 * <li>{@link #cacheDir}</li>
9090 private static final Set<MediaType> MEDIA_TYPES
9191 = Collections.singleton(MediaType.image("jpeg"));
9292 private static final Logger LOG = LoggerFactory.getLogger(DL4JInceptionV3Net.class);
93 private static final String DEF_WEIGHTS_URL = "https://raw.githubusercontent.com/USCDataScience/dl4j-kerasimport-examples/98ec48b56a5b8fb7d54a2994ce9cb23bfefac821/dl4j-import-example/data/inception-model-weights.h5";
94 public static final String DEF_MODEL_JSON = "org/apache/tika/dl/imagerec/inceptionv3-model.json";
95 public static final String DEF_LABEL_MAPPING = "org/apache/tika/dl/imagerec/imagenet_incpetionv3_class_index.json";
93 private static final String DEF_WEIGHTS_URL = "https://github.com/USCDataScience/tika-dockers/releases/download/v0.2/inception_v3_keras_2.h5";
94 private static final String DEF_LABEL_MAPPING_URL = "https://github.com/USCDataScience/tika-dockers/releases/download/v0.2/imagenet_class_index.json";
95 private static final String BASE_DIR = System.getProperty("user.home") + File.separator + ".tika-dl" +
96 File.separator + "models" + File.separator + "keras";
97 private static final String MODEL_DIR = BASE_DIR + File.separator + "inception-v3";
9698
9799 /**
98100 * Cache dir to be used for downloading the weights file.
99101 * This is used to download the model.
100102 */
101103 @Field
102 private File cacheDir = new File(".tmp-inception");
104 private File cacheDir = new File(MODEL_DIR);
103105
104106 /**
105107 * Path to a HDF5 file that contains weights of the Keras network
111113 @Field
112114 private String modelWeightsPath = DEF_WEIGHTS_URL;
113115
114 /**
115 * Path to a JSON file that contains network (graph) structure exported from Keras.
116 * <p>
117 * <br/>
118 * Default is retrieved from {@value DEF_MODEL_JSON}
119 */
120 @Field
121 private String modelJsonPath = DEF_MODEL_JSON;
122116 /***
123117 * Path to file that tells how to map node index to human readable label names
124118 * <br/>
125 * The default is retrieved from {@value DEF_LABEL_MAPPING}
126 */
127 @Field
128 private String labelFile = DEF_LABEL_MAPPING;
119 * The default is retrieved from {@value DEF_LABEL_MAPPING_URL}
120 */
121 @Field
122 private String labelFile = DEF_LABEL_MAPPING_URL;
129123
130124 /**
131125 * Language name of the labels.
137131
138132 @Field
139133 private int imgHeight = 299;
134
140135 @Field
141136 private int imgWidth = 299;
137
142138 @Field
143139 private int imgChannels = 3;
144140 /***
168164 " Asking the classloader", path);
169165 URL url = getClass().getClassLoader().getResource(path);
170166 if (url == null) {
171 LOG.debug("Classloader does not knows the file {}", path);
167 LOG.debug("Classloader does not know the file {}", path);
172168 file = null;
173169 } else {
174 LOG.debug("Class loader knows the file {}", path);
170 LOG.debug("Classloader knows the file {}", path);
175171 try {
176172 file = cachedDownload(cacheDir, url.toURI());
177173 } catch (URISyntaxException | IOException e) {
191187 return getClass().getClassLoader().getResourceAsStream(path);
192188 }
193189
194 public static synchronized File cachedDownload(File cacheDir, URI uri)
190 private static synchronized File cachedDownload(File cacheDir, URI uri)
195191 throws IOException {
196192
197193 if ("file".equals(uri.getScheme()) || uri.getScheme() == null) {
221217 return cacheFile;
222218 }
223219
220 private String mayBeDownloadFile(String path) throws TikaConfigException {
221 String resolvedFilePath;
222 if (path.startsWith("http://") || path.startsWith("https://")) {
223 LOG.debug("Config instructed to download the file, doing so.");
224 try {
225 resolvedFilePath = cachedDownload(cacheDir, URI.create(path)).getAbsolutePath();
226 } catch (IOException e) {
227 throw new TikaConfigException(e.getMessage(), e);
228 }
229 } else {
230 File file = retrieveFile(path);
231 if (!file.exists()) {
232 LOG.error("File does not exist at :: {}", path);
233 }
234 resolvedFilePath = file.getAbsolutePath();
235 }
236 return resolvedFilePath;
237 }
238
224239 @Override
225240 public void initialize(Map<String, Param> params)
226241 throws TikaConfigException {
242
227243 //STEP 1: resolve weights file, download if necessary
228 if (modelWeightsPath.startsWith("http://") || modelWeightsPath.startsWith("https://")) {
229 LOG.debug("Config instructed to download the weights file, doing so.");
230 try {
231 modelWeightsPath = cachedDownload(cacheDir, URI.create(modelWeightsPath)).getAbsolutePath();
232 } catch (IOException e) {
233 throw new TikaConfigException(e.getMessage(), e);
234 }
235 } else {
236 File modelFile = retrieveFile(modelWeightsPath);
237 if (!modelFile.exists()) {
238 LOG.error("modelWeights does not exist at :: {}", modelWeightsPath);
239 return;
240 }
241 modelWeightsPath = modelFile.getAbsolutePath();
242 }
243
244 //STEP 2: resolve model JSON
245 File modelJsonFile = retrieveFile(modelJsonPath);
246 if (modelJsonFile == null || !modelJsonFile.exists()) {
247 LOG.error("Could not locate file {}", modelJsonPath);
248 return;
249 }
250 modelJsonPath = modelJsonFile.getAbsolutePath();
251
252 //STEP 3: Load labels map
253 try (InputStream stream = retrieveResource(labelFile)) {
244 modelWeightsPath = mayBeDownloadFile(modelWeightsPath);
245
246 //STEP 2: Load labels map
247 try (InputStream stream = retrieveResource(mayBeDownloadFile(labelFile))) {
254248 this.labelMap = loadClassIndex(stream);
255249 } catch (IOException | ParseException e) {
256250 LOG.error("Could not load labels map", e);
257251 return;
258252 }
259253
260 //STEP 4: initialize the graph
254 //STEP 3: initialize the graph
261255 try {
262256 this.imageLoader = new NativeImageLoader(imgHeight, imgWidth, imgChannels);
263257 LOG.info("Going to load Inception network...");
264258 long st = System.currentTimeMillis();
265 this.graph = KerasModelImport.importKerasModelAndWeights(modelJsonPath,
266 modelWeightsPath, false);
259
260 KerasModelBuilder builder = new KerasModel().modelBuilder().modelHdf5Filename(modelWeightsPath)
261 .enforceTrainingConfig(false);
262 builder.inputShape(new int[]{imgHeight, imgWidth, 3});
263 KerasModel model = builder.buildModel();
264 this.graph = model.getComputationGraph();
265
267266 long time = System.currentTimeMillis() - st;
268267 LOG.info("Loaded the Inception model. Time taken={}ms", time);
269268 } catch (IOException | InvalidKerasConfigurationException
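
Net effect of the refactor above: with deeplearning4j 1.0.0-beta2 the whole Inception graph is rebuilt from the weights HDF5 alone via the KerasModel builder, so the separate model-JSON resource is no longer needed. Condensed, the load path reads (the local path is illustrative; the builder calls declare IOException and the Keras configuration exceptions):

    KerasModelBuilder builder = new KerasModel().modelBuilder()
            .modelHdf5Filename("/path/to/inception_v3_keras_2.h5")
            .enforceTrainingConfig(false);
    builder.inputShape(new int[]{299, 299, 3}); // height, width, channels
    ComputationGraph graph = builder.buildModel().getComputationGraph();
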
1717 package org.apache.tika.dl.imagerec;
1818
1919 import org.apache.tika.config.Field;
20 import org.apache.tika.config.InitializableProblemHandler;
2021 import org.apache.tika.config.Param;
21 import org.apache.tika.config.InitializableProblemHandler;
2222 import org.apache.tika.exception.TikaConfigException;
2323 import org.apache.tika.exception.TikaException;
2424 import org.apache.tika.metadata.Metadata;
2828 import org.apache.tika.parser.recognition.RecognisedObject;
2929 import org.datavec.image.loader.NativeImageLoader;
3030 import org.deeplearning4j.nn.graph.ComputationGraph;
31 import org.deeplearning4j.nn.modelimport.keras.trainedmodels.TrainedModelHelper;
32 import org.deeplearning4j.nn.modelimport.keras.trainedmodels.TrainedModels;
3331 import org.deeplearning4j.util.ModelSerializer;
32 import org.deeplearning4j.zoo.PretrainedType;
33 import org.deeplearning4j.zoo.ZooModel;
34 import org.deeplearning4j.zoo.model.VGG16;
35 import org.deeplearning4j.zoo.util.imagenet.ImageNetLabels;
3436 import org.nd4j.linalg.api.ndarray.INDArray;
3537 import org.nd4j.linalg.dataset.api.preprocessor.DataNormalization;
3638 import org.nd4j.linalg.dataset.api.preprocessor.VGG16ImagePreProcessor;
3941 import org.slf4j.LoggerFactory;
4042 import org.xml.sax.ContentHandler;
4143 import org.xml.sax.SAXException;
42 import org.deeplearning4j.nn.modelimport.keras.trainedmodels.Utils.ImageNetLabels;
44
4345 import java.io.File;
4446 import java.io.IOException;
4547 import java.io.InputStream;
5355
5456 private static final Logger LOG = LoggerFactory.getLogger(DL4JVGG16Net.class);
5557 public static final Set<MediaType> SUPPORTED_MIMES = Collections.singleton(MediaType.image("jpeg"));
56 private static final String HOME_DIR = System.getProperty("user.home");
57 private static final String BASE_DIR = ".dl4j" + File.separator + "trainedmodels";
58 private static String MODEL_DIR = HOME_DIR + File.separator + BASE_DIR;
59 private static String MODEL_DIR_PREPROCESSED = MODEL_DIR + File.separator + "tikaPreprocessed" + File.separator;
60 private static TrainedModelHelper MODEL_HELPER = new TrainedModelHelper(TrainedModels.VGG16);
58 private static final String BASE_DIR = System.getProperty("user.home") + File.separator + ".tika-dl" +
59 File.separator + "models" + File.separator + "dl4j";
60 private static final String MODEL_DIR = BASE_DIR + File.separator + "vgg-16";
6161
6262 @Field
63 private File modelFile = new File(MODEL_DIR_PREPROCESSED + File.separator + "vgg16.zip");
64
65 @Field
66 private File locationToSave = new File(MODEL_DIR + File.separator
67 + "tikaPreprocessed" + File.separator + "vgg16.zip");
63 private File cacheDir = new File(MODEL_DIR + File.separator + "vgg16.zip");
6864
6965 @Field
7066 private boolean serialize = true;
67
7168 @Field
7269 private int topN;
70
7371 private NativeImageLoader imageLoader = new NativeImageLoader(224, 224, 3);
7472 private DataNormalization preProcessor = new VGG16ImagePreProcessor();
7573 private boolean available = false;
7775 public Set<MediaType> getSupportedMimes() {
7876 return SUPPORTED_MIMES;
7977 }
78 private ImageNetLabels imageNetLabels;
8079
8180 @Override
8281 public boolean isAvailable() {
8584
8685 @Override
8786 public void checkInitialization(InitializableProblemHandler problemHandler) throws TikaConfigException {
88 //TODO: what do we want to check here?
87 //TODO: what do we want to check here?
8988 }
9089
9190 @Override
9291 public void initialize(Map<String, Param> params) throws TikaConfigException {
9392 try {
9493 if (serialize) {
95 if (locationToSave.exists()) {
96 model = ModelSerializer.restoreComputationGraph(locationToSave);
97 LOG.info("Preprocessed Model Loaded from {}", locationToSave);
94 if (cacheDir.exists()) {
95 model = ModelSerializer.restoreComputationGraph(cacheDir);
96 LOG.info("Preprocessed Model Loaded from {}", cacheDir);
9897 } else {
99 LOG.warn("Preprocessed Model doesn't exist at {}", locationToSave);
100 locationToSave.getParentFile().mkdirs();
101 model = MODEL_HELPER.loadModel();
98 LOG.warn("Preprocessed Model doesn't exist at {}", cacheDir);
99 cacheDir.getParentFile().mkdirs();
100 ZooModel zooModel = VGG16.builder().build();
101 model = (ComputationGraph)zooModel.initPretrained(PretrainedType.IMAGENET);
102102 LOG.info("Saving the Loaded model for future use. Saved models are more optimised to consume less resources.");
103 ModelSerializer.writeModel(model, locationToSave, true);
103 ModelSerializer.writeModel(model, cacheDir, true);
104104 }
105105 } else {
106106 LOG.info("Weight graph model loaded via dl4j Helper functions");
107 model = MODEL_HELPER.loadModel();
107 ZooModel zooModel = VGG16.builder().build();
108 model = (ComputationGraph)zooModel.initPretrained(PretrainedType.IMAGENET);
108109 }
110 imageNetLabels = new ImageNetLabels();
109111 available = true;
110112 } catch (Exception e) {
111113 available = false;
125127 }
126128 private List<RecognisedObject> predict(INDArray predictions)
127129 {
128 ArrayList<String> labels;
129 labels=ImageNetLabels.getLabels();
130130 List<RecognisedObject> objects = new ArrayList<>();
131131 int[] topNPredictions = new int[topN];
132132 float[] topNProb = new float[topN];
139139 topNPredictions[i] = Nd4j.argMax(currentBatch, 1).getInt(0, 0);
140140 topNProb[i] = currentBatch.getFloat(batch, topNPredictions[i]);
141141 currentBatch.putScalar(0, topNPredictions[i], 0);
142 outLabels[i]= labels.get(topNPredictions[i]);
142 outLabels[i]= imageNetLabels.getLabel(topNPredictions[i]);
143143 objects.add(new RecognisedObject(outLabels[i], "eng", outLabels[i], topNProb[i]));
144144 i++;
145145 }
+0
-1
tika-dl/src/main/resources/org/apache/tika/dl/imagerec/imagenet_incpetionv3_class_index.json (deleted)
0 [removed: a single-line JSON map from ImageNet class index to [WordNet id, label] pairs, e.g. {"0":["n01440764","tench"],"1":["n01443537","goldfish"], ...}; the full 1000-class mapping is now downloaded at runtime from DEF_LABEL_MAPPING_URL rather than bundled as a resource]
763":["n04086273","revolver"],"764":["n04090263","rifle"],"765":["n04099969","rocking_chair"],"766":["n04111531","rotisserie"],"767":["n04116512","rubber_eraser"],"768":["n04118538","rugby_ball"],"769":["n04118776","rule"],"770":["n04120489","running_shoe"],"771":["n04125021","safe"],"772":["n04127249","safety_pin"],"773":["n04131690","saltshaker"],"774":["n04133789","sandal"],"775":["n04136333","sarong"],"776":["n04141076","sax"],"777":["n04141327","scabbard"],"778":["n04141975","scale"],"779":["n04146614","school_bus"],"780":["n04147183","schooner"],"781":["n04149813","scoreboard"],"782":["n04152593","screen"],"783":["n04153751","screw"],"784":["n04154565","screwdriver"],"785":["n04162706","seat_belt"],"786":["n04179913","sewing_machine"],"787":["n04192698","shield"],"788":["n04200800","shoe_shop"],"789":["n04201297","shoji"],"790":["n04204238","shopping_basket"],"791":["n04204347","shopping_cart"],"792":["n04208210","shovel"],"793":["n04209133","shower_cap"],"794":["n04209239","shower_curtain"],"795":["n04228054","ski"],"796":["n04229816","ski_mask"],"797":["n04235860","sleeping_bag"],"798":["n04238763","slide_rule"],"799":["n04239074","sliding_door"],"800":["n04243546","slot"],"801":["n04251144","snorkel"],"802":["n04252077","snowmobile"],"803":["n04252225","snowplow"],"804":["n04254120","soap_dispenser"],"805":["n04254680","soccer_ball"],"806":["n04254777","sock"],"807":["n04258138","solar_dish"],"808":["n04259630","sombrero"],"809":["n04263257","soup_bowl"],"810":["n04264628","space_bar"],"811":["n04265275","space_heater"],"812":["n04266014","space_shuttle"],"813":["n04270147","spatula"],"814":["n04273569","speedboat"],"815":["n04275548","spider_web"],"816":["n04277352","spindle"],"817":["n04285008","sports_car"],"818":["n04286575","spotlight"],"819":["n04296562","stage"],"820":["n04310018","steam_locomotive"],"821":["n04311004","steel_arch_bridge"],"822":["n04311174","steel_drum"],"823":["n04317175","stethoscope"],"824":["n04325704","stole"],"825":["n04326547","stone_wall"],"826":["n04328186","stopwatch"],"827":["n04330267","stove"],"828":["n04332243","strainer"],"829":["n04335435","streetcar"],"830":["n04336792","stretcher"],"831":["n04344873","studio_couch"],"832":["n04346328","stupa"],"833":["n04347754","submarine"],"834":["n04350905","suit"],"835":["n04355338","sundial"],"836":["n04355933","sunglass"],"837":["n04356056","sunglasses"],"838":["n04357314","sunscreen"],"839":["n04366367","suspension_bridge"],"840":["n04367480","swab"],"841":["n04370456","sweatshirt"],"842":["n04371430","swimming_trunks"],"843":["n04371774","swing"],"844":["n04372370","switch"],"845":["n04376876","syringe"],"846":["n04380533","table_lamp"],"847":["n04389033","tank"],"848":["n04392985","tape_player"],"849":["n04398044","teapot"],"850":["n04399382","teddy"],"851":["n04404412","television"],"852":["n04409515","tennis_ball"],"853":["n04417672","thatch"],"854":["n04418357","theater_curtain"],"855":["n04423845","thimble"],"856":["n04428191","thresher"],"857":["n04429376","throne"],"858":["n04435653","tile_roof"],"859":["n04442312","toaster"],"860":["n04443257","tobacco_shop"],"861":["n04447861","toilet_seat"],"862":["n04456115","torch"],"863":["n04458633","totem_pole"],"864":["n04461696","tow_truck"],"865":["n04462240","toyshop"],"866":["n04465501","tractor"],"867":["n04467665","trailer_truck"],"868":["n04476259","tray"],"869":["n04479046","trench_coat"],"870":["n04482393","tricycle"],"871":["n04483307","trimaran"],"872":["n04485082","tripod"],"873":["n04486054","triumphal_arch"],"874":["n04487081","trolley
bus"],"875":["n04487394","trombone"],"876":["n04493381","tub"],"877":["n04501370","turnstile"],"878":["n04505470","typewriter_keyboard"],"879":["n04507155","umbrella"],"880":["n04509417","unicycle"],"881":["n04515003","upright"],"882":["n04517823","vacuum"],"883":["n04522168","vase"],"884":["n04523525","vault"],"885":["n04525038","velvet"],"886":["n04525305","vending_machine"],"887":["n04532106","vestment"],"888":["n04532670","viaduct"],"889":["n04536866","violin"],"890":["n04540053","volleyball"],"891":["n04542943","waffle_iron"],"892":["n04548280","wall_clock"],"893":["n04548362","wallet"],"894":["n04550184","wardrobe"],"895":["n04552348","warplane"],"896":["n04553703","washbasin"],"897":["n04554684","washer"],"898":["n04557648","water_bottle"],"899":["n04560804","water_jug"],"900":["n04562935","water_tower"],"901":["n04579145","whiskey_jug"],"902":["n04579432","whistle"],"903":["n04584207","wig"],"904":["n04589890","window_screen"],"905":["n04590129","window_shade"],"906":["n04591157","Windsor_tie"],"907":["n04591713","wine_bottle"],"908":["n04592741","wing"],"909":["n04596742","wok"],"910":["n04597913","wooden_spoon"],"911":["n04599235","wool"],"912":["n04604644","worm_fence"],"913":["n04606251","wreck"],"914":["n04612504","yawl"],"915":["n04613696","yurt"],"916":["n06359193","web_site"],"917":["n06596364","comic_book"],"918":["n06785654","crossword_puzzle"],"919":["n06794110","street_sign"],"920":["n06874185","traffic_light"],"921":["n07248320","book_jacket"],"922":["n07565083","menu"],"923":["n07579787","plate"],"924":["n07583066","guacamole"],"925":["n07584110","consomme"],"926":["n07590611","hot_pot"],"927":["n07613480","trifle"],"928":["n07614500","ice_cream"],"929":["n07615774","ice_lolly"],"930":["n07684084","French_loaf"],"931":["n07693725","bagel"],"932":["n07695742","pretzel"],"933":["n07697313","cheeseburger"],"934":["n07697537","hotdog"],"935":["n07711569","mashed_potato"],"936":["n07714571","head_cabbage"],"937":["n07714990","broccoli"],"938":["n07715103","cauliflower"],"939":["n07716358","zucchini"],"940":["n07716906","spaghetti_squash"],"941":["n07717410","acorn_squash"],"942":["n07717556","butternut_squash"],"943":["n07718472","cucumber"],"944":["n07718747","artichoke"],"945":["n07720875","bell_pepper"],"946":["n07730033","cardoon"],"947":["n07734744","mushroom"],"948":["n07742313","Granny_Smith"],"949":["n07745940","strawberry"],"950":["n07747607","orange"],"951":["n07749582","lemon"],"952":["n07753113","fig"],"953":["n07753275","pineapple"],"954":["n07753592","banana"],"955":["n07754684","jackfruit"],"956":["n07760859","custard_apple"],"957":["n07768694","pomegranate"],"958":["n07802026","hay"],"959":["n07831146","carbonara"],"960":["n07836838","chocolate_sauce"],"961":["n07860988","dough"],"962":["n07871810","meat_loaf"],"963":["n07873807","pizza"],"964":["n07875152","potpie"],"965":["n07880968","burrito"],"966":["n07892512","red_wine"],"967":["n07920052","espresso"],"968":["n07930864","cup"],"969":["n07932039","eggnog"],"970":["n09193705","alp"],"971":["n09229709","bubble"],"972":["n09246464","cliff"],"973":["n09256479","coral_reef"],"974":["n09288635","geyser"],"975":["n09332890","lakeside"],"976":["n09399592","promontory"],"977":["n09421951","sandbar"],"978":["n09428293","seashore"],"979":["n09468604","valley"],"980":["n09472597","volcano"],"981":["n09835506","ballplayer"],"982":["n10148035","groom"],"983":["n10565667","scuba_diver"],"984":["n11879895","rapeseed"],"985":["n11939491","daisy"],"986":["n12057211","yellow_lady's_slipper"],"987":["n12144580","corn"],"98
8":["n12267677","acorn"],"989":["n12620546","hip"],"990":["n12768682","buckeye"],"991":["n12985857","coral_fungus"],"992":["n12998815","agaric"],"993":["n13037406","gyromitra"],"994":["n13040303","stinkhorn"],"995":["n13044778","earthstar"],"996":["n13052670","hen-of-the-woods"],"997":["n13054560","bolete"],"998":["n13133613","ear"],"999":["n15075141","toilet_tissue"]}
+0 -1
tika-dl/src/main/resources/org/apache/tika/dl/imagerec/inceptionv3-model.json
0 {"class_name":"Model","keras_version":"1.1.0","config":{"layers":[{"class_name":"InputLayer","config":{"batch_input_shape":[null,299,299,3],"input_dtype":"float32","sparse":false,"name":"input_1"},"inbound_nodes":[],"name":"input_1"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_1","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":3,"subsample":[2,2],"init":"glorot_uniform","bias":true,"nb_filter":32,"border_mode":"valid","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":3},"inbound_nodes":[[["input_1",0,0]]],"name":"convolution2d_1"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_1","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_1",0,0]]],"name":"batchnormalization_1"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_2","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":3,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":32,"border_mode":"valid","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":3},"inbound_nodes":[[["batchnormalization_1",0,0]]],"name":"convolution2d_2"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_2","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_2",0,0]]],"name":"batchnormalization_2"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_3","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":3,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":64,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":3},"inbound_nodes":[[["batchnormalization_2",0,0]]],"name":"convolution2d_3"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_3","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_3",0,0]]],"name":"batchnormalization_3"},{"class_name":"MaxPooling2D","config":{"name":"maxpooling2d_1","trainable":true,"dim_ordering":"tf","pool_size":[3,3],"strides":[2,2],"border_mode":"valid"},"inbound_nodes":[[["batchnormalization_3",0,0]]],"name":"maxpooling2d_1"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_4","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":80,"border_mode":"valid","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["maxpooling2d_1",0,0]]],"name":"convolution2d_4"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_4","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_4",0,0]]],"name":"batchnormalization_4"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_5","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":3,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":192,"border_mode":"valid","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":3},"inbound_
nodes":[[["batchnormalization_4",0,0]]],"name":"convolution2d_5"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_5","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_5",0,0]]],"name":"batchnormalization_5"},{"class_name":"MaxPooling2D","config":{"name":"maxpooling2d_2","trainable":true,"dim_ordering":"tf","pool_size":[3,3],"strides":[2,2],"border_mode":"valid"},"inbound_nodes":[[["batchnormalization_5",0,0]]],"name":"maxpooling2d_2"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_9","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":64,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["maxpooling2d_2",0,0]]],"name":"convolution2d_9"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_9","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_9",0,0]]],"name":"batchnormalization_9"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_7","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":48,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["maxpooling2d_2",0,0]]],"name":"convolution2d_7"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_10","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":3,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":96,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":3},"inbound_nodes":[[["batchnormalization_9",0,0]]],"name":"convolution2d_10"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_7","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_7",0,0]]],"name":"batchnormalization_7"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_10","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_10",0,0]]],"name":"batchnormalization_10"},{"class_name":"AveragePooling2D","config":{"name":"averagepooling2d_1","trainable":true,"dim_ordering":"tf","pool_size":[3,3],"strides":[1,1],"border_mode":"same"},"inbound_nodes":[[["maxpooling2d_2",0,0]]],"name":"averagepooling2d_1"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_6","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":64,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["maxpooling2d_2",0,0]]],"name":"convolution2d_6"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_8","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":5,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":64,"border_mode":"same","b_regularizer":
null,"W_regularizer":null,"activation":"relu","nb_row":5},"inbound_nodes":[[["batchnormalization_7",0,0]]],"name":"convolution2d_8"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_11","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":3,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":96,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":3},"inbound_nodes":[[["batchnormalization_10",0,0]]],"name":"convolution2d_11"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_12","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":32,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["averagepooling2d_1",0,0]]],"name":"convolution2d_12"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_6","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_6",0,0]]],"name":"batchnormalization_6"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_8","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_8",0,0]]],"name":"batchnormalization_8"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_11","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_11",0,0]]],"name":"batchnormalization_11"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_12","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_12",0,0]]],"name":"batchnormalization_12"},{"class_name":"Merge","config":{"name":"mixed0","concat_axis":3,"mode_type":"raw","dot_axes":-1,"mode":"concat","output_shape":null,"output_shape_type":"raw"},"inbound_nodes":[[["batchnormalization_6",0,0],["batchnormalization_8",0,0],["batchnormalization_11",0,0],["batchnormalization_12",0,0]]],"name":"mixed0"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_16","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":64,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["mixed0",0,0]]],"name":"convolution2d_16"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_16","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_16",0,0]]],"name":"batchnormalization_16"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_14","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":48,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["mixed0",0,0]]],"name":"convolution2d_14"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_17"
,"activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":3,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":96,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":3},"inbound_nodes":[[["batchnormalization_16",0,0]]],"name":"convolution2d_17"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_14","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_14",0,0]]],"name":"batchnormalization_14"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_17","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_17",0,0]]],"name":"batchnormalization_17"},{"class_name":"AveragePooling2D","config":{"name":"averagepooling2d_2","trainable":true,"dim_ordering":"tf","pool_size":[3,3],"strides":[1,1],"border_mode":"same"},"inbound_nodes":[[["mixed0",0,0]]],"name":"averagepooling2d_2"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_13","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":64,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["mixed0",0,0]]],"name":"convolution2d_13"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_15","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":5,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":64,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":5},"inbound_nodes":[[["batchnormalization_14",0,0]]],"name":"convolution2d_15"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_18","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":3,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":96,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":3},"inbound_nodes":[[["batchnormalization_17",0,0]]],"name":"convolution2d_18"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_19","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":32,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["averagepooling2d_2",0,0]]],"name":"convolution2d_19"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_13","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_13",0,0]]],"name":"batchnormalization_13"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_15","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_15",0,0]]],"name":"batchnormalization_15"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_18","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_18",0,0]]],"name":"batchnorm
alization_18"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_19","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_19",0,0]]],"name":"batchnormalization_19"},{"class_name":"Merge","config":{"name":"mixed1","concat_axis":3,"mode_type":"raw","dot_axes":-1,"mode":"concat","output_shape":null,"output_shape_type":"raw"},"inbound_nodes":[[["batchnormalization_13",0,0],["batchnormalization_15",0,0],["batchnormalization_18",0,0],["batchnormalization_19",0,0]]],"name":"mixed1"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_23","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":64,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["mixed1",0,0]]],"name":"convolution2d_23"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_23","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_23",0,0]]],"name":"batchnormalization_23"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_21","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":48,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["mixed1",0,0]]],"name":"convolution2d_21"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_24","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":3,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":96,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":3},"inbound_nodes":[[["batchnormalization_23",0,0]]],"name":"convolution2d_24"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_21","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_21",0,0]]],"name":"batchnormalization_21"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_24","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_24",0,0]]],"name":"batchnormalization_24"},{"class_name":"AveragePooling2D","config":{"name":"averagepooling2d_3","trainable":true,"dim_ordering":"tf","pool_size":[3,3],"strides":[1,1],"border_mode":"same"},"inbound_nodes":[[["mixed1",0,0]]],"name":"averagepooling2d_3"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_20","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":64,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["mixed1",0,0]]],"name":"convolution2d_20"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_22","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":5,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":64,"border_mode":"sam
e","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":5},"inbound_nodes":[[["batchnormalization_21",0,0]]],"name":"convolution2d_22"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_25","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":3,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":96,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":3},"inbound_nodes":[[["batchnormalization_24",0,0]]],"name":"convolution2d_25"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_26","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":32,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["averagepooling2d_3",0,0]]],"name":"convolution2d_26"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_20","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_20",0,0]]],"name":"batchnormalization_20"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_22","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_22",0,0]]],"name":"batchnormalization_22"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_25","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_25",0,0]]],"name":"batchnormalization_25"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_26","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_26",0,0]]],"name":"batchnormalization_26"},{"class_name":"Merge","config":{"name":"mixed2","concat_axis":3,"mode_type":"raw","dot_axes":-1,"mode":"concat","output_shape":null,"output_shape_type":"raw"},"inbound_nodes":[[["batchnormalization_20",0,0],["batchnormalization_22",0,0],["batchnormalization_25",0,0],["batchnormalization_26",0,0]]],"name":"mixed2"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_28","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":64,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["mixed2",0,0]]],"name":"convolution2d_28"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_28","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_28",0,0]]],"name":"batchnormalization_28"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_29","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":3,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":96,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":3},"inbound_nodes":[[["batchnormalization_28",0,0]]],"name":"convolution2d_29"},{"class_name":"BatchNormalization","config":{"gamma_regulariz
er":null,"name":"batchnormalization_29","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_29",0,0]]],"name":"batchnormalization_29"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_27","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":3,"subsample":[2,2],"init":"glorot_uniform","bias":true,"nb_filter":384,"border_mode":"valid","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":3},"inbound_nodes":[[["mixed2",0,0]]],"name":"convolution2d_27"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_30","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":3,"subsample":[2,2],"init":"glorot_uniform","bias":true,"nb_filter":96,"border_mode":"valid","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":3},"inbound_nodes":[[["batchnormalization_29",0,0]]],"name":"convolution2d_30"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_27","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_27",0,0]]],"name":"batchnormalization_27"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_30","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_30",0,0]]],"name":"batchnormalization_30"},{"class_name":"MaxPooling2D","config":{"name":"maxpooling2d_3","trainable":true,"dim_ordering":"tf","pool_size":[3,3],"strides":[2,2],"border_mode":"valid"},"inbound_nodes":[[["mixed2",0,0]]],"name":"maxpooling2d_3"},{"class_name":"Merge","config":{"name":"mixed3","concat_axis":3,"mode_type":"raw","dot_axes":-1,"mode":"concat","output_shape":null,"output_shape_type":"raw"},"inbound_nodes":[[["batchnormalization_27",0,0],["batchnormalization_30",0,0],["maxpooling2d_3",0,0]]],"name":"mixed3"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_35","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":128,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["mixed3",0,0]]],"name":"convolution2d_35"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_35","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_35",0,0]]],"name":"batchnormalization_35"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_36","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":128,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":7},"inbound_nodes":[[["batchnormalization_35",0,0]]],"name":"convolution2d_36"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_36","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_36",0,0]]],"name":"batchnormalization_36"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_3
2","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":128,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["mixed3",0,0]]],"name":"convolution2d_32"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_37","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":7,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":128,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["batchnormalization_36",0,0]]],"name":"convolution2d_37"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_32","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_32",0,0]]],"name":"batchnormalization_32"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_37","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_37",0,0]]],"name":"batchnormalization_37"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_33","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":7,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":128,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["batchnormalization_32",0,0]]],"name":"convolution2d_33"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_38","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":128,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":7},"inbound_nodes":[[["batchnormalization_37",0,0]]],"name":"convolution2d_38"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_33","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_33",0,0]]],"name":"batchnormalization_33"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_38","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_38",0,0]]],"name":"batchnormalization_38"},{"class_name":"AveragePooling2D","config":{"name":"averagepooling2d_4","trainable":true,"dim_ordering":"tf","pool_size":[3,3],"strides":[1,1],"border_mode":"same"},"inbound_nodes":[[["mixed3",0,0]]],"name":"averagepooling2d_4"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_31","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":192,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["mixed3",0,0]]],"name":"convolution2d_31"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_34","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":192,"border_
mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":7},"inbound_nodes":[[["batchnormalization_33",0,0]]],"name":"convolution2d_34"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_39","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":7,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":192,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["batchnormalization_38",0,0]]],"name":"convolution2d_39"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_40","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":192,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["averagepooling2d_4",0,0]]],"name":"convolution2d_40"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_31","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_31",0,0]]],"name":"batchnormalization_31"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_34","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_34",0,0]]],"name":"batchnormalization_34"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_39","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_39",0,0]]],"name":"batchnormalization_39"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_40","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_40",0,0]]],"name":"batchnormalization_40"},{"class_name":"Merge","config":{"name":"mixed4","concat_axis":3,"mode_type":"raw","dot_axes":-1,"mode":"concat","output_shape":null,"output_shape_type":"raw"},"inbound_nodes":[[["batchnormalization_31",0,0],["batchnormalization_34",0,0],["batchnormalization_39",0,0],["batchnormalization_40",0,0]]],"name":"mixed4"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_45","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":160,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["mixed4",0,0]]],"name":"convolution2d_45"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_45","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_45",0,0]]],"name":"batchnormalization_45"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_46","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":160,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":7},"inbound_nodes":[[["batchnormalization_45",0,0]]],"name":"convolution2d_46"},{"class_name":"BatchNormalization","config":{"g
amma_regularizer":null,"name":"batchnormalization_46","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_46",0,0]]],"name":"batchnormalization_46"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_42","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":160,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["mixed4",0,0]]],"name":"convolution2d_42"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_47","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":7,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":160,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["batchnormalization_46",0,0]]],"name":"convolution2d_47"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_42","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_42",0,0]]],"name":"batchnormalization_42"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_47","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_47",0,0]]],"name":"batchnormalization_47"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_43","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":7,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":160,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["batchnormalization_42",0,0]]],"name":"convolution2d_43"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_48","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":160,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":7},"inbound_nodes":[[["batchnormalization_47",0,0]]],"name":"convolution2d_48"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_43","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_43",0,0]]],"name":"batchnormalization_43"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_48","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_48",0,0]]],"name":"batchnormalization_48"},{"class_name":"AveragePooling2D","config":{"name":"averagepooling2d_5","trainable":true,"dim_ordering":"tf","pool_size":[3,3],"strides":[1,1],"border_mode":"same"},"inbound_nodes":[[["mixed4",0,0]]],"name":"averagepooling2d_5"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_41","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":192,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row"
:1},"inbound_nodes":[[["mixed4",0,0]]],"name":"convolution2d_41"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_44","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":192,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":7},"inbound_nodes":[[["batchnormalization_43",0,0]]],"name":"convolution2d_44"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_49","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":7,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":192,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["batchnormalization_48",0,0]]],"name":"convolution2d_49"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_50","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":192,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["averagepooling2d_5",0,0]]],"name":"convolution2d_50"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_41","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_41",0,0]]],"name":"batchnormalization_41"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_44","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_44",0,0]]],"name":"batchnormalization_44"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_49","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_49",0,0]]],"name":"batchnormalization_49"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_50","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_50",0,0]]],"name":"batchnormalization_50"},{"class_name":"Merge","config":{"name":"mixed5","concat_axis":3,"mode_type":"raw","dot_axes":-1,"mode":"concat","output_shape":null,"output_shape_type":"raw"},"inbound_nodes":[[["batchnormalization_41",0,0],["batchnormalization_44",0,0],["batchnormalization_49",0,0],["batchnormalization_50",0,0]]],"name":"mixed5"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_55","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":160,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["mixed5",0,0]]],"name":"convolution2d_55"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_55","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_55",0,0]]],"name":"batchnormalization_55"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_56","activity_regularizer":null,"trainable"
:true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":160,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":7},"inbound_nodes":[[["batchnormalization_55",0,0]]],"name":"convolution2d_56"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_56","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_56",0,0]]],"name":"batchnormalization_56"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_52","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":160,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["mixed5",0,0]]],"name":"convolution2d_52"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_57","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":7,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":160,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["batchnormalization_56",0,0]]],"name":"convolution2d_57"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_52","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_52",0,0]]],"name":"batchnormalization_52"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_57","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_57",0,0]]],"name":"batchnormalization_57"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_53","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":7,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":160,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["batchnormalization_52",0,0]]],"name":"convolution2d_53"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_58","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":160,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":7},"inbound_nodes":[[["batchnormalization_57",0,0]]],"name":"convolution2d_58"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_53","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_53",0,0]]],"name":"batchnormalization_53"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_58","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_58",0,0]]],"name":"batchnormalization_58"},{"class_name":"AveragePooling2D","config":{"name":"averagepooling2d_6","trainable":true,"dim_ordering":"tf","pool_size":[3,3],"strides":[1,1],"border_mode":"same"},"inbound_nodes":[[["mixed5",0,0]]],"name":"averagepooling2d_6"},{"class_name":"Co
nvolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_51","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":192,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["mixed5",0,0]]],"name":"convolution2d_51"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_54","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":192,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":7},"inbound_nodes":[[["batchnormalization_53",0,0]]],"name":"convolution2d_54"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_59","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":7,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":192,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["batchnormalization_58",0,0]]],"name":"convolution2d_59"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_60","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":192,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["averagepooling2d_6",0,0]]],"name":"convolution2d_60"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_51","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_51",0,0]]],"name":"batchnormalization_51"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_54","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_54",0,0]]],"name":"batchnormalization_54"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_59","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_59",0,0]]],"name":"batchnormalization_59"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_60","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_60",0,0]]],"name":"batchnormalization_60"},{"class_name":"Merge","config":{"name":"mixed6","concat_axis":3,"mode_type":"raw","dot_axes":-1,"mode":"concat","output_shape":null,"output_shape_type":"raw"},"inbound_nodes":[[["batchnormalization_51",0,0],["batchnormalization_54",0,0],["batchnormalization_59",0,0],["batchnormalization_60",0,0]]],"name":"mixed6"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_65","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":160,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["mixed6",0,0]]],"name":"convolution2d_65"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormal
ization_65","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_65",0,0]]],"name":"batchnormalization_65"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_66","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":192,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":7},"inbound_nodes":[[["batchnormalization_65",0,0]]],"name":"convolution2d_66"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_66","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_66",0,0]]],"name":"batchnormalization_66"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_62","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":192,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["mixed6",0,0]]],"name":"convolution2d_62"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_67","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":7,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":192,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["batchnormalization_66",0,0]]],"name":"convolution2d_67"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_62","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_62",0,0]]],"name":"batchnormalization_62"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_67","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_67",0,0]]],"name":"batchnormalization_67"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_63","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":7,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":192,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["batchnormalization_62",0,0]]],"name":"convolution2d_63"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_68","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":192,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":7},"inbound_nodes":[[["batchnormalization_67",0,0]]],"name":"convolution2d_68"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_63","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_63",0,0]]],"name":"batchnormalization_63"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_68","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis"
:3},"inbound_nodes":[[["convolution2d_68",0,0]]],"name":"batchnormalization_68"},{"class_name":"AveragePooling2D","config":{"name":"averagepooling2d_7","trainable":true,"dim_ordering":"tf","pool_size":[3,3],"strides":[1,1],"border_mode":"same"},"inbound_nodes":[[["mixed6",0,0]]],"name":"averagepooling2d_7"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_61","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":192,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["mixed6",0,0]]],"name":"convolution2d_61"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_64","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":192,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":7},"inbound_nodes":[[["batchnormalization_63",0,0]]],"name":"convolution2d_64"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_69","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":7,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":192,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["batchnormalization_68",0,0]]],"name":"convolution2d_69"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_70","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":192,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["averagepooling2d_7",0,0]]],"name":"convolution2d_70"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_61","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_61",0,0]]],"name":"batchnormalization_61"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_64","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_64",0,0]]],"name":"batchnormalization_64"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_69","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_69",0,0]]],"name":"batchnormalization_69"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_70","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_70",0,0]]],"name":"batchnormalization_70"},{"class_name":"Merge","config":{"name":"mixed7","concat_axis":3,"mode_type":"raw","dot_axes":-1,"mode":"concat","output_shape":null,"output_shape_type":"raw"},"inbound_nodes":[[["batchnormalization_61",0,0],["batchnormalization_64",0,0],["batchnormalization_69",0,0],["batchnormalization_70",0,0]]],"name":"mixed7"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_73","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","
nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":192,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["mixed7",0,0]]],"name":"convolution2d_73"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_73","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_73",0,0]]],"name":"batchnormalization_73"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_74","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":7,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":192,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["batchnormalization_73",0,0]]],"name":"convolution2d_74"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_74","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_74",0,0]]],"name":"batchnormalization_74"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_71","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":192,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["mixed7",0,0]]],"name":"convolution2d_71"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_75","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":192,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":7},"inbound_nodes":[[["batchnormalization_74",0,0]]],"name":"convolution2d_75"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_71","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_71",0,0]]],"name":"batchnormalization_71"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_75","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_75",0,0]]],"name":"batchnormalization_75"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_72","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":3,"subsample":[2,2],"init":"glorot_uniform","bias":true,"nb_filter":320,"border_mode":"valid","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":3},"inbound_nodes":[[["batchnormalization_71",0,0]]],"name":"convolution2d_72"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_76","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":3,"subsample":[2,2],"init":"glorot_uniform","bias":true,"nb_filter":192,"border_mode":"valid","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":3},"inbound_nodes":[[["batchnormalization_75",0,0]]],"name":"convolution2d_76"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_72","epsilon":1e-05,"trainable":tr
ue,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_72",0,0]]],"name":"batchnormalization_72"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_76","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_76",0,0]]],"name":"batchnormalization_76"},{"class_name":"AveragePooling2D","config":{"name":"averagepooling2d_8","trainable":true,"dim_ordering":"tf","pool_size":[3,3],"strides":[2,2],"border_mode":"valid"},"inbound_nodes":[[["mixed7",0,0]]],"name":"averagepooling2d_8"},{"class_name":"Merge","config":{"name":"mixed8","concat_axis":3,"mode_type":"raw","dot_axes":-1,"mode":"concat","output_shape":null,"output_shape_type":"raw"},"inbound_nodes":[[["batchnormalization_72",0,0],["batchnormalization_76",0,0],["averagepooling2d_8",0,0]]],"name":"mixed8"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_81","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":448,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["mixed8",0,0]]],"name":"convolution2d_81"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_81","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_81",0,0]]],"name":"batchnormalization_81"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_78","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":384,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["mixed8",0,0]]],"name":"convolution2d_78"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_82","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":3,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":384,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":3},"inbound_nodes":[[["batchnormalization_81",0,0]]],"name":"convolution2d_82"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_78","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_78",0,0]]],"name":"batchnormalization_78"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_82","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_82",0,0]]],"name":"batchnormalization_82"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_79","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":3,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":384,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["batchnormalization_78",0,0]]],"name":"convolution2d_79"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_80","activity_regularizer":null,"trainable":true,"dim_or
dering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":384,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":3},"inbound_nodes":[[["batchnormalization_78",0,0]]],"name":"convolution2d_80"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_83","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":3,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":384,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["batchnormalization_82",0,0]]],"name":"convolution2d_83"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_84","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":384,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":3},"inbound_nodes":[[["batchnormalization_82",0,0]]],"name":"convolution2d_84"},{"class_name":"AveragePooling2D","config":{"name":"averagepooling2d_9","trainable":true,"dim_ordering":"tf","pool_size":[3,3],"strides":[1,1],"border_mode":"same"},"inbound_nodes":[[["mixed8",0,0]]],"name":"averagepooling2d_9"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_77","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":320,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["mixed8",0,0]]],"name":"convolution2d_77"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_79","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_79",0,0]]],"name":"batchnormalization_79"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_80","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_80",0,0]]],"name":"batchnormalization_80"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_83","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_83",0,0]]],"name":"batchnormalization_83"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_84","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_84",0,0]]],"name":"batchnormalization_84"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_85","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":192,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["averagepooling2d_9",0,0]]],"name":"convolution2d_85"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_77","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_77",0,0]]],"name":"batchnormalization_77"},{"class_name":"Merge","config":{"
name":"mixed9_0","concat_axis":3,"mode_type":"raw","dot_axes":-1,"mode":"concat","output_shape":null,"output_shape_type":"raw"},"inbound_nodes":[[["batchnormalization_79",0,0],["batchnormalization_80",0,0]]],"name":"mixed9_0"},{"class_name":"Merge","config":{"name":"merge_1","concat_axis":3,"mode_type":"raw","dot_axes":-1,"mode":"concat","output_shape":null,"output_shape_type":"raw"},"inbound_nodes":[[["batchnormalization_83",0,0],["batchnormalization_84",0,0]]],"name":"merge_1"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_85","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_85",0,0]]],"name":"batchnormalization_85"},{"class_name":"Merge","config":{"name":"mixed9","concat_axis":3,"mode_type":"raw","dot_axes":-1,"mode":"concat","output_shape":null,"output_shape_type":"raw"},"inbound_nodes":[[["batchnormalization_77",0,0],["mixed9_0",0,0],["merge_1",0,0],["batchnormalization_85",0,0]]],"name":"mixed9"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_90","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":448,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["mixed9",0,0]]],"name":"convolution2d_90"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_90","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_90",0,0]]],"name":"batchnormalization_90"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_87","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":384,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["mixed9",0,0]]],"name":"convolution2d_87"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_91","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":3,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":384,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":3},"inbound_nodes":[[["batchnormalization_90",0,0]]],"name":"convolution2d_91"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_87","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_87",0,0]]],"name":"batchnormalization_87"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_91","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_91",0,0]]],"name":"batchnormalization_91"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_88","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":3,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":384,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["batchnormalization_87",0,0]]],"name":"convolution2d_88"},{"class_name":"Convolution2D","conf
ig":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_89","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":384,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":3},"inbound_nodes":[[["batchnormalization_87",0,0]]],"name":"convolution2d_89"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_92","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":3,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":384,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["batchnormalization_91",0,0]]],"name":"convolution2d_92"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_93","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":384,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":3},"inbound_nodes":[[["batchnormalization_91",0,0]]],"name":"convolution2d_93"},{"class_name":"AveragePooling2D","config":{"name":"averagepooling2d_10","trainable":true,"dim_ordering":"tf","pool_size":[3,3],"strides":[1,1],"border_mode":"same"},"inbound_nodes":[[["mixed9",0,0]]],"name":"averagepooling2d_10"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_86","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":320,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["mixed9",0,0]]],"name":"convolution2d_86"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_88","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_88",0,0]]],"name":"batchnormalization_88"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_89","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_89",0,0]]],"name":"batchnormalization_89"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_92","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_92",0,0]]],"name":"batchnormalization_92"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_93","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_93",0,0]]],"name":"batchnormalization_93"},{"class_name":"Convolution2D","config":{"W_constraint":null,"b_constraint":null,"name":"convolution2d_94","activity_regularizer":null,"trainable":true,"dim_ordering":"tf","nb_col":1,"subsample":[1,1],"init":"glorot_uniform","bias":true,"nb_filter":192,"border_mode":"same","b_regularizer":null,"W_regularizer":null,"activation":"relu","nb_row":1},"inbound_nodes":[[["averagepooling2d_10",0,0]]],"name":"convolution2d_94"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_86","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum"
:0.99,"axis":3},"inbound_nodes":[[["convolution2d_86",0,0]]],"name":"batchnormalization_86"},{"class_name":"Merge","config":{"name":"mixed9_1","concat_axis":3,"mode_type":"raw","dot_axes":-1,"mode":"concat","output_shape":null,"output_shape_type":"raw"},"inbound_nodes":[[["batchnormalization_88",0,0],["batchnormalization_89",0,0]]],"name":"mixed9_1"},{"class_name":"Merge","config":{"name":"merge_2","concat_axis":3,"mode_type":"raw","dot_axes":-1,"mode":"concat","output_shape":null,"output_shape_type":"raw"},"inbound_nodes":[[["batchnormalization_92",0,0],["batchnormalization_93",0,0]]],"name":"merge_2"},{"class_name":"BatchNormalization","config":{"gamma_regularizer":null,"name":"batchnormalization_94","epsilon":1e-05,"trainable":true,"mode":0,"beta_regularizer":null,"momentum":0.99,"axis":3},"inbound_nodes":[[["convolution2d_94",0,0]]],"name":"batchnormalization_94"},{"class_name":"Merge","config":{"name":"mixed10","concat_axis":3,"mode_type":"raw","dot_axes":-1,"mode":"concat","output_shape":null,"output_shape_type":"raw"},"inbound_nodes":[[["batchnormalization_86",0,0],["mixed9_1",0,0],["merge_2",0,0],["batchnormalization_94",0,0]]],"name":"mixed10"},{"class_name":"AveragePooling2D","config":{"name":"avg_pool","trainable":true,"dim_ordering":"tf","pool_size":[8,8],"strides":[8,8],"border_mode":"valid"},"inbound_nodes":[[["mixed10",0,0]]],"name":"avg_pool"},{"class_name":"Flatten","config":{"trainable":true,"name":"flatten"},"inbound_nodes":[[["avg_pool",0,0]]],"name":"flatten"},{"class_name":"Dense","config":{"W_constraint":null,"b_constraint":null,"name":"predictions","activity_regularizer":null,"trainable":true,"init":"glorot_uniform","bias":true,"input_dim":null,"b_regularizer":null,"W_regularizer":null,"activation":"softmax","output_dim":1000},"inbound_nodes":[[["flatten",0,0]]],"name":"predictions"}],"input_layers":[["input_1",0,0]],"output_layers":[["predictions",0,0]],"name":"model_1"}}
1515 */
1616 package org.apache.tika.dl.imagerec;
1717
18 import static org.junit.Assert.assertTrue;
19
2018 import org.apache.tika.Tika;
2119 import org.apache.tika.config.TikaConfig;
22 import org.apache.tika.exception.TikaConfigException;
2320 import org.apache.tika.metadata.Metadata;
2421 import org.junit.Test;
22
23 import static org.junit.Assert.assertTrue;
2524
2625 public class DL4JInceptionV3NetTest {
2726
3837 return;
3938 }
4039 }
41 Tika tika = new Tika(config);
42 Metadata md = new Metadata();
43 tika.parse(getClass().getResourceAsStream("cat.jpg"), md);
44 String[] objects = md.getValues("OBJECT");
45 boolean found = false;
46 for (String object : objects) {
47 if (object.contains("_cat")){
48 found = true;
40 if (config != null) {
41 Tika tika = new Tika(config);
42 Metadata md = new Metadata();
43 tika.parse(getClass().getResourceAsStream("cat.jpg"), md);
44 String[] objects = md.getValues("OBJECT");
45 boolean found = false;
46 for (String object : objects) {
47 if (object.contains("_cat")) {
48 found = true;
49 }
4950 }
51 assertTrue(found);
5052 }
51 assertTrue(found);
5253 }
5354 }
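
The hunk above guards the InceptionV3 assertions behind a null check on the loaded TikaConfig, so the test degrades to a no-op instead of failing when the optional DL4J model cannot be initialized. A minimal sketch of that guard pattern follows; the config and image resource names are illustrative, not taken from the diff:

    import org.apache.tika.Tika;
    import org.apache.tika.config.TikaConfig;
    import org.apache.tika.metadata.Metadata;
    import org.junit.Test;

    public class GuardedRecognitionTest { // hypothetical test class
        @Test
        public void recognise() throws Exception {
            TikaConfig config = null;
            try {
                // illustrative resource name
                config = new TikaConfig(getClass().getResourceAsStream("example-config.xml"));
            } catch (Exception e) {
                // optional model/native dependencies are absent: skip rather than fail
                return;
            }
            if (config != null) {
                Tika tika = new Tika(config);
                Metadata md = new Metadata();
                // illustrative input; parsing populates md as in the test above
                tika.parse(getClass().getResourceAsStream("example.jpg"), md);
                // assertions against md.getValues("OBJECT") would go here
            }
        }
    }
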
1515 */
1616 package org.apache.tika.dl.imagerec;
1717
18 import static org.junit.Assert.assertTrue;
19
2018 import org.apache.tika.Tika;
2119 import org.apache.tika.config.TikaConfig;
22 import org.apache.tika.exception.TikaConfigException;
2320 import org.apache.tika.metadata.Metadata;
2421 import org.junit.Test;
22
23 import java.io.InputStream;
24
25 import static org.junit.Assert.assertTrue;
2526
2627 public class DL4JVGG16NetTest {
2728
2829 @Test
2930 public void recognise() throws Exception {
3031 TikaConfig config = null;
32 InputStream is = getClass().getResourceAsStream("dl4j-vgg16-config.xml");
3133 try {
3234 config = new TikaConfig(getClass().getResourceAsStream("dl4j-vgg16-config.xml"));
3335 } catch (Exception e) {
3941 }
4042 }
4143
42 if(config != null)
43 {
44 if(config != null) {
4445 Tika tika = new Tika(config);
4546 Metadata md = new Metadata();
4647 tika.parse(getClass().getResourceAsStream("lion.jpg"), md);
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.18</version>
27 <version>1.19</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030
3434
3535 <properties>
3636 <cli.version>1.4</cli.version> <!--sync version with tika-server or move to parent? -->
37 <!-- upgrade to 6.x or something more modern once Tika requires Java 1.8 -->
38 <lucene.version>5.5.4</lucene.version>
39 <poi.version>3.17</poi.version>
37 <lucene.version>7.4.0</lucene.version>
38 <poi.version>4.0.0</poi.version>
4039 </properties>
4140
4241 <dependencies>
224223 <plugin>
225224 <groupId>org.apache.maven.plugins</groupId>
226225 <artifactId>maven-jar-plugin</artifactId>
226 <configuration>
227 <archive>
228 <manifestEntries>
229 <Automatic-Module-Name>org.apache.tika.eval</Automatic-Module-Name>
230 </manifestEntries>
231 </archive>
232 </configuration>
227233 <executions>
228234 <execution>
229235 <goals>
6060 import org.apache.tika.metadata.PagedText;
6161 import org.apache.tika.metadata.TikaCoreProperties;
6262 import org.apache.tika.parser.RecursiveParserWrapper;
63 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
6364 import org.apache.tika.utils.ExceptionUtils;
6465 import org.slf4j.Logger;
6566 import org.slf4j.LoggerFactory;
253254 data.put(Cols.FILE_NAME, fps.getRelativeSourceFilePath().getFileName().toString());
254255 } else {
255256 data.put(Cols.IS_EMBEDDED, TRUE);
256 data.put(Cols.FILE_NAME, getFileName(m.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH)));
257 data.put(Cols.FILE_NAME, getFileName(m.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH)));
257258 }
258259 String ext = FilenameUtils.getExtension(data.get(Cols.FILE_NAME));
259260 ext = (ext == null) ? "" : ext.toLowerCase(Locale.US);
390391 String getTime(Metadata m) {
391392 String elapsed = "-1";
392393
393 String v = m.get(RecursiveParserWrapper.PARSE_TIME_MILLIS);
394 String v = m.get(AbstractRecursiveParserWrapperHandler.PARSE_TIME_MILLIS);
394395 if (v != null) {
395396 return v;
396397 }
413414 String fullTrace = metadata.get(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "runtime");
414415
415416 if (fullTrace == null) {
416 fullTrace = metadata.get(RecursiveParserWrapper.EMBEDDED_EXCEPTION);
417 fullTrace = metadata.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_EXCEPTION);
417418 }
418419
419420 if (fullTrace != null) {
474475 if (metadata == null) {
475476 return "";
476477 }
477 String c = metadata.get(RecursiveParserWrapper.TIKA_CONTENT);
478 String c = metadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
478479 if (c == null) {
479480 return "";
480481 }
722723
723724 Map<String, Integer> counts = new HashMap<>();
724725 for (int i = 1; i < list.size(); i++) {
725 String path = list.get(i).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH);
726 String path = list.get(i).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH);
726727 if (path == null) {
727728 //shouldn't ever happen
728729 continue;
744745 }
745746
746747 for (int i = 1; i < list.size(); i++) {
747 Integer count = counts.get(list.get(i).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
748 Integer count = counts.get(list.get(i).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH));
748749 if (count == null) {
749750 count = 0;
750751 }
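
The changes in this file are a mechanical migration: as of 1.19 the output properties (TIKA_CONTENT, PARSE_TIME_MILLIS, EMBEDDED_RESOURCE_PATH, EMBEDDED_EXCEPTION) are referenced via AbstractRecursiveParserWrapperHandler rather than RecursiveParserWrapper. A hedged sketch of reading those properties from a parsed metadata list; the helper class is hypothetical:

    import java.util.List;

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;

    // Same reads as before 1.19; only the constants' home changed.
    public class EmbeddedPathDump { // hypothetical helper
        public static void dump(List<Metadata> metadataList) {
            for (Metadata m : metadataList) {
                String path = m.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH);
                String content = m.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
                System.out.println((path == null ? "<container>" : path)
                        + " -> " + (content == null ? 0 : content.length()) + " chars");
            }
        }
    }
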
4444 import org.apache.tika.metadata.Metadata;
4545 import org.apache.tika.metadata.TikaCoreProperties;
4646 import org.apache.tika.parser.RecursiveParserWrapper;
47 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
4748
4849 public class ExtractComparer extends AbstractProfiler {
4950
351352 String pathA = null;
352353 String pathB = null;
353354 if (mA != null) {
354 pathA = mA.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH);
355 pathA = mA.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH);
355356 }
356357 if (mB != null) {
357 pathB = mB.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH);
358 pathB = mB.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH);
358359 }
359360 if (pathA != null) {
360361 Map<Cols, String> d = new HashMap<>();
389390
390391
391392 /**
392 * Try to find the matching metadata based on the RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH
393 * Try to find the matching metadata based on the AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH
393394 * If you can't find it, return -1;
394395 *
395396 * @param i index for match in metadataListA
418419
419420 //assume same embedded resource path. Not always true!
420421 Metadata thisMetadata = metadataListA.get(i);
421 String embeddedPath = thisMetadata.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH);
422 String embeddedPath = thisMetadata.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH);
422423 if (embeddedPath != null) {
423424 for (int j = 0; j < metadataListB.size(); j++) {
424425 String thatEmbeddedPath = metadataListB.get(j).get(
425 RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH);
426 AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH);
426427 if (embeddedPath.equals(thatEmbeddedPath)) {
427428 return j;
428429 }
3636 import org.apache.tika.eval.io.IDBWriter;
3737 import org.apache.tika.metadata.Metadata;
3838 import org.apache.tika.parser.RecursiveParserWrapper;
39 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
3940
4041 public class ExtractProfiler extends AbstractProfiler {
4142
246247 Map<Cols, String> data = new HashMap<>();
247248 data.put(Cols.ID, fileId);
248249 data.put(Cols.EMBEDDED_FILE_PATH,
249 m.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
250 m.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH));
250251 try {
251252 writer.writeRow(embeddedFilePathTable, data);
252253 } catch (IOException e) {
2222 import org.apache.tika.metadata.serialization.JsonMetadataList;
2323 import org.apache.tika.mime.MediaType;
2424 import org.apache.tika.parser.RecursiveParserWrapper;
25 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
2526 import org.slf4j.Logger;
2627 import org.slf4j.LoggerFactory;
2728
147148 Metadata containerMetadata = metadataList.get(0);
148149 for (int i = 0; i < metadataList.size(); i++) {
149150 Metadata m = metadataList.get(i);
150 String c = m.get(RecursiveParserWrapper.TIKA_CONTENT);
151 String c = m.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
151152 if (c != null) {
152153 sb.append(c);
153154 sb.append(" ");
154155 }
155156 }
156 containerMetadata.set(RecursiveParserWrapper.TIKA_CONTENT, sb.toString());
157 containerMetadata.set(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT, sb.toString());
157158 while (metadataList.size() > 1) {
158159 metadataList.remove(metadataList.size()-1);
159160 }
177178 List<Metadata> metadataList = new ArrayList<>();
178179 String content = IOUtils.toString(reader);
179180 Metadata m = new Metadata();
180 m.set(RecursiveParserWrapper.TIKA_CONTENT, content);
181 m.set(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT, content);
181182 //Let's hope the file name has a suffix that can
182183 //be used to determine the mime. Could be wrong or missing,
183184 //but better than nothing.
1818 import java.io.IOException;
1919 import java.util.Map;
2020
21 import org.apache.lucene.analysis.FilteringTokenFilter;
2122 import org.apache.lucene.analysis.TokenStream;
2223 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
23 import org.apache.lucene.analysis.util.FilteringTokenFilter;
2424 import org.apache.lucene.analysis.util.TokenFilterFactory;
2525
2626 /**
1717
1818
1919 import java.io.IOException;
20 import java.io.Reader;
2120 import java.lang.reflect.Type;
2221 import java.util.Collections;
2322 import java.util.HashMap;
3231 import com.google.gson.JsonObject;
3332 import com.google.gson.JsonParseException;
3433 import org.apache.lucene.analysis.Analyzer;
35 import org.apache.lucene.analysis.TokenStream;
36 import org.apache.lucene.analysis.Tokenizer;
37 import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory;
34 import org.apache.lucene.analysis.custom.CustomAnalyzer;
3835 import org.apache.lucene.analysis.util.CharFilterFactory;
3936 import org.apache.lucene.analysis.util.ClasspathResourceLoader;
40 import org.apache.lucene.analysis.util.ResourceLoaderAware;
4137 import org.apache.lucene.analysis.util.TokenFilterFactory;
42 import org.apache.lucene.analysis.util.TokenizerFactory;
4338
4439 class AnalyzerDeserializer implements JsonDeserializer<Map<String, Analyzer>> {
4540
9691 throw new IllegalArgumentException("Expecting map of charfilter, tokenizer, tokenfilters");
9792 }
9893 JsonObject aRoot = (JsonObject)value;
99 CharFilterFactory[] charFilters = new CharFilterFactory[0];
100 TokenizerFactory tokenizerFactory = null;
101 TokenFilterFactory[] tokenFilterFactories = new TokenFilterFactory[0];
94 CustomAnalyzer.Builder builder = CustomAnalyzer.builder(new ClasspathResourceLoader(AnalyzerDeserializer.class));
10295 for ( Map.Entry<String, JsonElement> e : aRoot.entrySet()) {
10396 String k = e.getKey();
10497 if (k.equals(CHAR_FILTERS)) {
105 charFilters = buildCharFilters(e.getValue(), analyzerName);
98 buildCharFilters(e.getValue(), analyzerName, builder);
10699 } else if (k.equals(TOKEN_FILTERS)) {
107 tokenFilterFactories = buildTokenFilterFactories(e.getValue(), analyzerName, maxTokens);
100 buildTokenFilterFactories(e.getValue(), analyzerName, maxTokens, builder);
108101 } else if (k.equals(TOKENIZER)) {
109 tokenizerFactory = buildTokenizerFactory(e.getValue(), analyzerName);
102 buildTokenizerFactory(e.getValue(), analyzerName, builder);
110103 } else if (! k.equals(COMMENT)) {
111104 throw new IllegalArgumentException("Should have one of three values here:"+
112105 CHAR_FILTERS + ", "+
115108 ". I don't recognize: "+k);
116109 }
117110 }
118 if (tokenizerFactory == null) {
119 throw new IllegalArgumentException("Must specify at least a tokenizer factory for an analyzer!");
120 }
121 return new MyTokenizerChain(charFilters, tokenizerFactory, tokenFilterFactories);
122 }
123
124 private static TokenizerFactory buildTokenizerFactory(JsonElement map, String analyzerName) throws IOException {
111 return builder.build();
112 }
113
114 private static void buildTokenizerFactory(JsonElement map, String analyzerName,
115 CustomAnalyzer.Builder builder) throws IOException {
125116 if (!(map instanceof JsonObject)) {
126117 throw new IllegalArgumentException("Expecting a map with \"factory\" string and " +
127118 "\"params\" map in tokenizer factory;"+
138129
139130 JsonElement paramsEl = ((JsonObject)map).get(PARAMS);
140131 Map<String, String> params = mapify(paramsEl);
141 String spiName = "";
142 for (String s : TokenizerFactory.availableTokenizers()) {
143 Class clazz = TokenizerFactory.lookupClass(s);
144 if (clazz.getName().equals(factoryName)) {
145 spiName = s;
146 break;
147 }
148 }
149 if (spiName.equals("")) {
150 throw new IllegalArgumentException("A SPI class of type org.apache.lucene.analysis.util.TokenizerFactory with name"+
151 "'"+factoryName+"' does not exist.");
152 }
153 try {
154 TokenizerFactory tokenizerFactory = TokenizerFactory.forName(spiName, params);
155 if (tokenizerFactory instanceof ResourceLoaderAware) {
156 ((ResourceLoaderAware) tokenizerFactory).inform(new ClasspathResourceLoader(AnalyzerDeserializer.class));
157 }
158
159 return tokenizerFactory;
160 } catch (IllegalArgumentException e) {
161 throw new IllegalArgumentException("While working on "+analyzerName, e);
162 }
163 }
164
165 private static CharFilterFactory[] buildCharFilters(JsonElement el, String analyzerName) throws IOException {
132 builder.withTokenizer(factoryName, params);
133 }
134
135 private static void buildCharFilters(JsonElement el,
136 String analyzerName, CustomAnalyzer.Builder builder) throws IOException {
166137 if (el == null || el.isJsonNull()) {
167 return null;
138 return;
168139 }
169140 if (! el.isJsonArray()) {
170141 throw new IllegalArgumentException("Expecting array for charfilters, but got:"+el.toString() +
187158
188159 JsonElement paramsEl = ((JsonObject)filterMap).get(PARAMS);
189160 Map<String, String> params = mapify(paramsEl);
190 String spiName = "";
191 for (String s : CharFilterFactory.availableCharFilters()) {
192 Class clazz = CharFilterFactory.lookupClass(s);
193 if (clazz.getName().equals(factoryName)) {
194 spiName = s;
195 break;
196 }
197 }
198 if (spiName.equals("")) {
199 throw new IllegalArgumentException("A SPI class of type org.apache.lucene.analysis.util.CharFilterFactory with name"+
200 "'"+factoryName+"' does not exist.");
201 }
202
203 try {
204 CharFilterFactory charFilterFactory = CharFilterFactory.forName(spiName, params);
205 if (charFilterFactory instanceof ResourceLoaderAware) {
206 ((ResourceLoaderAware) charFilterFactory).inform(new ClasspathResourceLoader(AnalyzerDeserializer.class));
207 }
208 ret.add(charFilterFactory);
209 } catch (IllegalArgumentException e) {
210 throw new IllegalArgumentException("While trying to load "+
211 analyzerName + ": "+ e.getMessage(), e);
212 }
213 }
214 if (ret.size() == 0) {
215 return new CharFilterFactory[0];
216 }
217 return ret.toArray(new CharFilterFactory[ret.size()]);
218 }
219
220 private static TokenFilterFactory[] buildTokenFilterFactories(JsonElement el,
221 String analyzerName, int maxTokens) throws IOException {
161 builder.addCharFilter(factoryName, params);
162 }
163 }
164
165 private static void buildTokenFilterFactories(JsonElement el,
166 String analyzerName,
167 int maxTokens, CustomAnalyzer.Builder builder) throws IOException {
222168 if (el == null || el.isJsonNull()) {
223 return null;
169 return;
224170 }
225171 if (! el.isJsonArray()) {
226172 throw new IllegalArgumentException(
241187 factoryName = factoryName.startsWith("oala.") ?
242188 factoryName.replaceFirst("oala.", "org.apache.lucene.analysis.") :
243189 factoryName;
244
245190 JsonElement paramsEl = ((JsonObject)filterMap).get(PARAMS);
246191 Map<String, String> params = mapify(paramsEl);
247 String spiName = "";
248 for (String s : TokenFilterFactory.availableTokenFilters()) {
249 Class clazz = TokenFilterFactory.lookupClass(s);
250 if (clazz.getName().equals(factoryName)) {
251 spiName = s;
252 break;
253 }
254 }
255 if (spiName.equals("")) {
256 throw new IllegalArgumentException("A SPI class of type org.apache.lucene.analysis.util.TokenFilterFactory with name"+
257 "'"+factoryName+"' does not exist.");
258 }
259
260 try {
261 TokenFilterFactory tokenFilterFactory = TokenFilterFactory.forName(spiName, params);
262 if (tokenFilterFactory instanceof ResourceLoaderAware) {
263 ((ResourceLoaderAware) tokenFilterFactory).inform(new ClasspathResourceLoader(AnalyzerDeserializer.class));
264 }
265 ret.add(tokenFilterFactory);
266 } catch (IllegalArgumentException e) {
267 throw new IllegalArgumentException("While loading "+analyzerName, e);
268 }
192 builder.addTokenFilter(factoryName, params);
269193 }
270194
271195 if (maxTokens > -1) {
272196 Map<String, String> m = new HashMap<>();
273197 m.put("maxTokenCount", Integer.toString(maxTokens));
274 ret.add(new LimitTokenCountFilterFactory(m));
275 }
276
277 if (ret.size() == 0) {
278 return new TokenFilterFactory[0];
279 }
280 return ret.toArray(new TokenFilterFactory[ret.size()]);
198 builder.addTokenFilter(
199 "limittokencount",
200 m);
201 }
281202 }
282203
283204 private static Map<String, String> mapify(JsonElement paramsEl) {
298219 }
299220 return params;
300221 }
301
302 /**
303 * Plagiarized verbatim from Solr!
304 */
305 private static class MyTokenizerChain extends Analyzer {
306
307 final private CharFilterFactory[] charFilters;
308 final private TokenizerFactory tokenizer;
309 final private TokenFilterFactory[] filters;
310
311 public MyTokenizerChain(TokenizerFactory tokenizer, TokenFilterFactory[] filters) {
312 this(null, tokenizer, filters);
313 }
314
315 public MyTokenizerChain(CharFilterFactory[] charFilters, TokenizerFactory tokenizer, TokenFilterFactory[] filters) {
316 this.charFilters = charFilters;
317 this.tokenizer = tokenizer;
318 this.filters = filters;
319 }
320
321 public CharFilterFactory[] getCharFilterFactories() {
322 return charFilters;
323 }
324
325 public TokenizerFactory getTokenizerFactory() {
326 return tokenizer;
327 }
328
329 public TokenFilterFactory[] getTokenFilterFactories() {
330 return filters;
331 }
332
333 @Override
334 public Reader initReader(String fieldName, Reader reader) {
335
336 if (charFilters != null && charFilters.length > 0) {
337 Reader cs = reader;
338 for (CharFilterFactory charFilter : charFilters) {
339 cs = charFilter.create(cs);
340 }
341 reader = cs;
342 }
343
344 return reader;
345 }
346
347 @Override
348 protected TokenStreamComponents createComponents(String fieldName) {
349 Tokenizer tk = tokenizer.create();
350 TokenStream ts = tk;
351 for (TokenFilterFactory filter : filters) {
352 ts = filter.create(ts);
353 }
354
355 return new TokenStreamComponents(tk, ts);
356 }
357 }
358
359222 }
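
The refactor above drops the hand-rolled MyTokenizerChain (noted in the removed code as plagiarized from Solr) and the manual SPI lookups, in favor of Lucene's CustomAnalyzer.Builder, which resolves short factory names and wires char filters, tokenizer, and token filters itself. A minimal sketch of that builder API under Lucene 7.4; the tokenizer and filter choices are illustrative and assume lucene-analyzers-common is on the classpath:

    import java.io.IOException;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.custom.CustomAnalyzer;

    // SPI short names are resolved case-insensitively against the classpath.
    public class AnalyzerSketch {
        public static Analyzer build() throws IOException {
            return CustomAnalyzer.builder()
                    .withTokenizer("uax29urlemail")        // UAX29URLEmailTokenizerFactory
                    .addTokenFilter("limittokencount",     // LimitTokenCountFilterFactory
                            "maxTokenCount", "200000",
                            "consumeAllTokens", "false")
                    .build();
        }
    }
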
2222 import org.apache.lucene.analysis.cjk.CJKBigramFilter;
2323 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
2424 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
25 import org.apache.lucene.analysis.util.FilteringTokenFilter;
25 import org.apache.lucene.analysis.FilteringTokenFilter;
2626 import org.apache.lucene.analysis.util.TokenFilterFactory;
2727
2828 /**
128128
129129
130130 if (is == null) {
131 LOG.warn("Couldn't find common tokens file for: '" + langCode + "': " +
132 p.toAbsolutePath());
131 String path = (p == null) ? "resource on class path: /common_tokens/"+langCode
132 : p.toAbsolutePath().toString();
133 LOG.warn("Couldn't find common tokens file for: '" + langCode + "' tried here: " +
134 path);
133135 alreadyTriedToLoad.add(langCode);
134136 return;
135137 }
22 "general": {
33 "charfilters": [
44 {
5 "factory": "oala.charfilter.MappingCharFilterFactory",
5 "factory": "mapping",
66 "params": {
77 "mapping": "/lucene-char-mapping.txt"
88 }
99 }
1010 ],
1111 "tokenizer": {
12 "factory": "oala.standard.UAX29URLEmailTokenizerFactory",
12 "factory": "uax29urlemail",
1313 "params": {}
1414 },
1515 "tokenfilters": [
1616 {
17 "factory": "oala.icu.ICUFoldingFilterFactory",
17 "factory": "icufolding",
1818 "params": {}
1919 },
2020 {
21 "factory": "oala.cjk.CJKBigramFilterFactory",
21 "factory": "cjkbigram",
2222 "params": {
2323 "outputUnigrams": "false"
2424 }
2929 "_comment" : "Use this analyzer for counting common tokens in a corpus.",
3030 "_comment" : "This isn't used by tika-eval during profiling or comparing",
3131 "tokenizer": {
32 "factory": "oala.standard.UAX29URLEmailTokenizerFactory",
32 "factory": "uax29urlemail",
3333 "params": {}
3434 },
3535 "tokenfilters": [
3636 {
37 "factory": "oala.icu.ICUFoldingFilterFactory",
37 "factory": "icufolding",
3838 "params": {}
3939 },
4040 {
41 "factory": "org.apache.tika.eval.tokens.AlphaIdeographFilterFactory",
41 "factory": "alphaideograph",
4242 "params": {}
4343 },
4444 {
45 "factory": "oala.pattern.PatternReplaceFilterFactory",
45 "factory": "patternreplace",
4646 "params": {
4747 "pattern": "^[\\w+\\.]{1,30}@(?:\\w+\\.){1,10}\\w+$",
4848 "replacement": "___email___",
5050 }
5151 },
5252 {
53 "factory": "oala.pattern.PatternReplaceFilterFactory",
53 "factory": "patternreplace",
5454 "params": {
5555 "pattern": "^(?:(?:ftp|https?):\\/\\/)?(?:\\w+\\.){1,10}\\w+$",
5656 "replacement": "___url___",
5858 }
5959 },
6060 {
61 "factory": "oala.cjk.CJKBigramFilterFactory",
61 "factory": "cjkbigram",
6262 "params": {
6363 "outputUnigrams": "false"
6464 }
6565 },
6666 {
67 "factory": "org.apache.tika.eval.tokens.CJKBigramAwareLengthFilterFactory",
67 "factory": "cjkbigramawarelength",
6868 "params": {
6969 "min": 4,
7070 "max": 20
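
These JSON configs switch from fully qualified factory class names (including the old "oala." shorthand for org.apache.lucene.analysis.) to Lucene SPI short names, which is what CustomAnalyzer.Builder expects. A small sketch for discovering the short names registered on the current classpath, which is handy when porting such a config:

    import org.apache.lucene.analysis.util.CharFilterFactory;
    import org.apache.lucene.analysis.util.TokenFilterFactory;
    import org.apache.lucene.analysis.util.TokenizerFactory;

    // Lists the SPI short names visible at runtime; useful for mapping
    // old fully qualified factory class names to the new form.
    public class ListAnalysisFactories {
        public static void main(String[] args) {
            System.out.println("charfilters:  " + CharFilterFactory.availableCharFilters());
            System.out.println("tokenizers:   " + TokenizerFactory.availableTokenizers());
            System.out.println("tokenfilters: " + TokenFilterFactory.availableTokenFilters());
        }
    }
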
4040 import org.apache.tika.eval.util.LanguageIDWrapper;
4141 import org.apache.tika.metadata.Metadata;
4242 import org.apache.tika.parser.RecursiveParserWrapper;
43 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
4344 import org.junit.Before;
4445 import org.junit.Ignore;
4546 import org.junit.Test;
183184 @Test
184185 public void testGetContent() throws Exception {
185186 Metadata m = new Metadata();
186 m.add(RecursiveParserWrapper.TIKA_CONTENT, "0123456789");
187 m.add(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT, "0123456789");
187188 Map<Cols, String> data = new HashMap<>();
188189 String content = getContent(m, 10, data);
189190 assertEquals(10, content.length());
232233 public void testAttachmentCounts() {
233234 List<Metadata> list = new ArrayList<>();
234235 Metadata m0 = new Metadata();
235 m0.set(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH, "dir1/dir2/file.zip");//bad data should be ignored
236 m0.set(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH, "dir1/dir2/file.zip");//bad data should be ignored
236237 //in the first metadata object
237238 list.add(m0);
238239 Metadata m1 = new Metadata();
239 m1.set(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH, "/f1.docx/f2.zip/text1.txt");
240 m1.set(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH, "/f1.docx/f2.zip/text1.txt");
240241 list.add(m1);
241242 Metadata m2 = new Metadata();
242 m2.set(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH, "/f1.docx/f2.zip/text2.txt");
243 m2.set(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH, "/f1.docx/f2.zip/text2.txt");
243244 list.add(m2);
244245 Metadata m3 = new Metadata();
245 m3.set(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH, "/f1.docx/f2.zip");
246 m3.set(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH, "/f1.docx/f2.zip");
246247 list.add(m3);
247248 Metadata m4 = new Metadata();
248 m4.set(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH, "/f1.docx");
249 m4.set(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH, "/f1.docx");
249250 list.add(m4);
250251 Metadata m5 = new Metadata();
251 m5.set(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH, "/f1.docx/text3.txt");
252 m5.set(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH, "/f1.docx/text3.txt");
252253 list.add(m5);
253254
254255 List<Integer> counts = AbstractProfiler.countAttachments(list);
2525 import org.apache.tika.TikaTest;
2626 import org.apache.tika.metadata.Metadata;
2727 import org.apache.tika.parser.RecursiveParserWrapper;
28 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
2829 import org.junit.Before;
2930 import org.junit.Test;
3031
4647 List<Metadata> metadataList = extractReader.loadExtract(testJsonFile);
4748
4849 assertEquals(2, metadataList.size());
49 assertEquals(1, metadataList.get(0).getValues(RecursiveParserWrapper.TIKA_CONTENT).length);
50 assertEquals(1, metadataList.get(1).getValues(RecursiveParserWrapper.TIKA_CONTENT).length);
51 assertContains("fox", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
52 assertContains("attachment", metadataList.get(1).get(RecursiveParserWrapper.TIKA_CONTENT));
50 assertEquals(1, metadataList.get(0).getValues(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).length);
51 assertEquals(1, metadataList.get(1).getValues(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).length);
52 assertContains("fox", metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
53 assertContains("attachment", metadataList.get(1).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
5354
5455 extractReader = new ExtractReader(ExtractReader.ALTER_METADATA_LIST.FIRST_ONLY);
5556 metadataList = extractReader.loadExtract(testJsonFile);
5657 assertEquals(1, metadataList.size());
57 assertEquals(1, metadataList.get(0).getValues(RecursiveParserWrapper.TIKA_CONTENT).length);
58 assertContains("fox", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
59 assertNotContained("attachment", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
58 assertEquals(1, metadataList.get(0).getValues(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).length);
59 assertContains("fox", metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
60 assertNotContained("attachment", metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
6061
6162 extractReader = new ExtractReader(ExtractReader.ALTER_METADATA_LIST.CONCATENATE_CONTENT_INTO_FIRST);
6263 metadataList = extractReader.loadExtract(testJsonFile);
6364 assertEquals(1, metadataList.size());
64 assertEquals(1, metadataList.get(0).getValues(RecursiveParserWrapper.TIKA_CONTENT).length);
65 assertContains("fox", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
66 assertContains("attachment", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
65 assertEquals(1, metadataList.get(0).getValues(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).length);
66 assertContains("fox", metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
67 assertContains("attachment", metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
6768 }
6869
6970 @Test
7273 List<Metadata> metadataList = extractReader.loadExtract(testTxtFile);
7374 assertEquals(1, metadataList.size());
7475 Metadata m = metadataList.get(0);
75 assertEquals(1, m.getValues(RecursiveParserWrapper.TIKA_CONTENT).length);
76 assertEquals(1, m.getValues(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).length);
7677 assertEquals("the quick brown fox fox fox jumped over the lazy lazy dog\n",
77 m.get(RecursiveParserWrapper.TIKA_CONTENT));
78 m.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
7879
7980 //test that the mime is inferred from the file extension
8081 assertEquals("application/msword", m.get(Metadata.CONTENT_TYPE));
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.18</version>
27 <version>1.19</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030
8888 <dependency>
8989 <groupId>org.apache.jackrabbit</groupId>
9090 <artifactId>jackrabbit-jcr-server</artifactId>
91 <version>2.3.6</version>
91 <version>2.17.4</version>
9292 <exclusions>
9393 <exclusion>
9494 <groupId>org.apache.tika</groupId>
107107 <dependency>
108108 <groupId>org.apache.jackrabbit</groupId>
109109 <artifactId>jackrabbit-core</artifactId>
110 <version>2.3.6</version>
110 <version>2.17.4</version>
111111 <exclusions>
112112 <exclusion>
113113 <groupId>org.apache.tika</groupId>
126126 <dependency>
127127 <groupId>org.apache.lucene</groupId>
128128 <artifactId>lucene-core</artifactId>
129 <version>3.5.0</version>
129 <version>7.4.0</version>
130130 </dependency>
131131 <dependency>
132132 <groupId>commons-io</groupId>
2020 import java.io.InputStream;
2121 import java.nio.file.Files;
2222 import java.nio.file.Path;
23 import java.util.UUID;
2324
2425 import org.apache.commons.io.FilenameUtils;
2526 import org.apache.tika.config.TikaConfig;
8182 //make sure to select only the file name (not any directory paths
8283 //that might be included in the name) and make sure
8384 //to normalize the name
85 name = name.replaceAll("\u0000", " ");
86 int prefix = FilenameUtils.getPrefixLength(name);
87 if (prefix > -1) {
88 name = name.substring(prefix);
89 }
8490 name = FilenameUtils.normalize(FilenameUtils.getName(name));
8591 }
8692
95101 e.printStackTrace();
96102 }
97103 }
98 //should add check to make sure that you aren't overwriting a file
104
99105 Path outputFile = outputDir.resolve(name);
100 //do a better job than this of checking
106 if (Files.exists(outputFile)) {
107 outputFile = outputDir.resolve(UUID.randomUUID().toString()+"-"+name);
108 }
101109 Files.createDirectories(outputFile.getParent());
102110 Files.copy(stream, outputFile);
103111 }
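
The hunk above hardens embedded-file extraction: NUL bytes are replaced, Windows drive/UNC prefixes are stripped via FilenameUtils.getPrefixLength, the name is reduced to a normalized base name, and an existing target is avoided with a random UUID prefix instead of being overwritten. The same steps as a standalone sketch; the helper class is hypothetical:

    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.util.UUID;

    import org.apache.commons.io.FilenameUtils;

    // Mirrors the sanitization and collision-avoidance steps in the hunk above.
    public class SafeOutputName { // hypothetical helper
        public static Path resolve(Path outputDir, String name) {
            name = name.replaceAll("\u0000", " ");             // strip NUL bytes
            int prefix = FilenameUtils.getPrefixLength(name);  // e.g. "C:\\" or "//host/"
            if (prefix > -1) {
                name = name.substring(prefix);
            }
            name = FilenameUtils.normalize(FilenameUtils.getName(name));
            Path outputFile = outputDir.resolve(name);
            if (Files.exists(outputFile)) {                    // avoid silent overwrite
                outputFile = outputDir.resolve(UUID.randomUUID().toString() + "-" + name);
            }
            return outputFile;
        }
    }
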
+0 -210 tika-example/src/main/java/org/apache/tika/example/LazyTextExtractorField.java
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.example;
18
19 import java.io.InputStream;
20 import java.io.Reader;
21 import java.util.concurrent.Executor;
22
23 import org.apache.jackrabbit.core.query.lucene.FieldNames;
24 import org.apache.jackrabbit.core.value.InternalValue;
25 import org.apache.lucene.analysis.TokenStream;
26 import org.apache.lucene.document.AbstractField;
27 import org.apache.lucene.document.Field;
28 import org.apache.lucene.document.Field.Store;
29 import org.apache.lucene.document.Field.TermVector;
30 import org.apache.tika.metadata.Metadata;
31 import org.apache.tika.parser.ParseContext;
32 import org.apache.tika.parser.Parser;
33 import org.slf4j.Logger;
34 import org.slf4j.LoggerFactory;
35 import org.xml.sax.ContentHandler;
36 import org.xml.sax.SAXException;
37 import org.xml.sax.helpers.DefaultHandler;
38
39 /**
40 * <code>LazyTextExtractorField</code> implements a Lucene field with a String
41 * value that is lazily initialized from a given {@link Reader}. In addition
42 * this class provides a method to find out whether the purpose of the reader is
43 * to extract text and whether the extraction process is already finished.
44 *
45 * @see #isExtractorFinished()
46 */
47 @SuppressWarnings("serial")
48 public class LazyTextExtractorField extends AbstractField {
49 /**
50 * The logger instance for this class.
51 */
52 private static final Logger LOG = LoggerFactory.getLogger(LazyTextExtractorField.class);
53
54 /**
55 * The exception used to forcibly terminate the extraction process when the
56 * maximum field length is reached.
57 * <p>
58 * Such exceptions shouldn't be logged, since their stack traces are meaningless.
59 */
60 private static final SAXException STOP = new SAXException("max field length reached");
61
62 /**
63 * The extracted text content of the given binary value. Set to non-null
64 * when the text extraction task finishes.
65 */
66 private volatile String extract = null;
67
68 /**
69 * Creates a new <code>LazyTextExtractorField</code> that lazily extracts
70 * its string value from the given binary <code>value</code>.
71 *
72 * @param parser the parser used to extract the text content
73 * @param value the binary value from which to extract the text
74 * @param highlighting set to <code>true</code> to enable result highlighting support
75 */
76 public LazyTextExtractorField(Parser parser, InternalValue value,
77 Metadata metadata, Executor executor, boolean highlighting,
78 int maxFieldLength) {
79 super(FieldNames.FULLTEXT, highlighting ? Store.YES : Store.NO,
80 Field.Index.ANALYZED, highlighting ? TermVector.WITH_OFFSETS
81 : TermVector.NO);
82 executor.execute(new ParsingTask(parser, value, metadata,
83 maxFieldLength));
84 }
85
86 /**
87 * Returns the extracted text. This method blocks until the text extraction
88 * task has been completed.
89 *
90 * @return the string value of this field
91 */
92 public synchronized String stringValue() {
93 try {
94 while (!isExtractorFinished()) {
95 wait();
96 }
97 return extract;
98 } catch (InterruptedException e) {
99 LOG.error("Text extraction thread was interrupted", e);
100 return "";
101 }
102 }
103
104 /**
105 * @return always <code>null</code>
106 */
107 public Reader readerValue() {
108 return null;
109 }
110
111 /**
112 * @return always <code>null</code>
113 */
114 public byte[] binaryValue() {
115 return null;
116 }
117
118 /**
119 * @return always <code>null</code>
120 */
121 public TokenStream tokenStreamValue() {
122 return null;
123 }
124
125 /**
126 * Checks whether the text extraction task has finished.
127 *
128 * @return <code>true</code> if the extracted text is available
129 */
130 public boolean isExtractorFinished() {
131 return extract != null;
132 }
133
134 private synchronized void setExtractedText(String value) {
135 extract = value;
136 notify();
137 }
138
139 /**
140 * Releases all resources associated with this field.
141 */
142 public void dispose() {
143 // TODO: Cause the ContentHandler below to throw an exception
144 }
145
146 /**
147 * The background task for extracting text from a binary value.
148 */
149 private class ParsingTask extends DefaultHandler implements Runnable {
150 private final Parser parser;
151
152 private final InternalValue value;
153
154 private final Metadata metadata;
155
156 private final int maxFieldLength;
157
158 private final StringBuilder builder = new StringBuilder();
159
160 private final ParseContext context = new ParseContext();
161
162 // NOTE: not a part of the original Jackrabbit code; added for this example
163 private final ContentHandler handler = new DefaultHandler();
164
165 public ParsingTask(Parser parser, InternalValue value,
166 Metadata metadata, int maxFieldLength) {
167 this.parser = parser;
168 this.value = value;
169 this.metadata = metadata;
170 this.maxFieldLength = maxFieldLength;
171 }
172
173 public void run() {
174 try {
175 try (InputStream stream = value.getStream()) {
176 parser.parse(stream, handler, metadata, context);
177 }
178 } catch (LinkageError e) {
179 // Capture and ignore
180 } catch (Throwable t) {
181 if (t != STOP) {
182 LOG.debug("Failed to extract text.", t);
183 setExtractedText("TextExtractionError");
184 return;
185 }
186 } finally {
187 value.discard();
188 }
189 setExtractedText(handler.toString());
190
191 }
192
193 @Override
194 public void characters(char[] ch, int start, int length)
195 throws SAXException {
196 builder.append(ch, start,
197 Math.min(length, maxFieldLength - builder.length()));
198 if (builder.length() >= maxFieldLength) {
199 throw STOP;
200 }
201 }
202
203 @Override
204 public void ignorableWhitespace(char[] ch, int start, int length)
205 throws SAXException {
206 characters(ch, start, length);
207 }
208 }
209 }
1919 import java.io.File;
2020
2121 import org.apache.lucene.document.Document;
22 import org.apache.lucene.document.Field;
23 import org.apache.lucene.document.Field.Index;
2422 import org.apache.lucene.document.Field.Store;
23 import org.apache.lucene.document.TextField;
2524 import org.apache.lucene.index.IndexWriter;
2625 import org.apache.tika.Tika;
2726
3736
3837 public void indexDocument(File file) throws Exception {
3938 Document document = new Document();
40 document.add(new Field("filename", file.getName(), Store.YES, Index.ANALYZED));
41 document.add(new Field("fulltext", tika.parseToString(file), Store.NO, Index.ANALYZED));
39 document.add(new TextField("filename", file.getName(), Store.YES));
40 document.add(new TextField("fulltext", tika.parseToString(file), Store.NO));
4241 writer.addDocument(document);
4342 }
4443 }
1818
1919 import java.io.File;
2020 import java.io.Reader;
21 import java.nio.file.Paths;
2122
2223 import org.apache.lucene.analysis.standard.StandardAnalyzer;
2324 import org.apache.lucene.document.Document;
24 import org.apache.lucene.document.Field;
25 import org.apache.lucene.document.Field.Index;
2625 import org.apache.lucene.document.Field.Store;
26 import org.apache.lucene.document.TextField;
2727 import org.apache.lucene.index.IndexWriter;
28 import org.apache.lucene.index.IndexWriter.MaxFieldLength;
29 import org.apache.lucene.store.SimpleFSDirectory;
30 import org.apache.lucene.util.Version;
28 import org.apache.lucene.index.IndexWriterConfig;
29 import org.apache.lucene.store.FSDirectory;
3130 import org.apache.tika.Tika;
3231
3332 @SuppressWarnings("deprecation")
4241 }
4342
4443 public static void main(String[] args) throws Exception {
45 try (IndexWriter writer = new IndexWriter(
46 new SimpleFSDirectory(new File(args[0])),
47 new StandardAnalyzer(Version.LUCENE_30),
48 MaxFieldLength.UNLIMITED)) {
44 IndexWriterConfig indexWriterConfig = new IndexWriterConfig(new StandardAnalyzer());
45 try (IndexWriter writer =
46 new IndexWriter(FSDirectory.open(Paths.get(args[0])),
47 indexWriterConfig)) {
4948 LuceneIndexer indexer = new LuceneIndexer(new Tika(), writer);
5049 for (int i = 1; i < args.length; i++) {
5150 indexer.indexDocument(new File(args[i]));
5655 public void indexDocument(File file) throws Exception {
5756 try (Reader fulltext = tika.parse(file)) {
5857 Document document = new Document();
59 document.add(new Field("filename", file.getName(), Store.YES, Index.ANALYZED));
60 document.add(new Field("fulltext", fulltext));
58 document.add(new TextField("filename", file.getName(), Store.YES));
59 document.add(new TextField("fulltext", fulltext));
6160 writer.addDocument(document);
6261 }
6362 }
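
A minimal end-to-end sketch of the Lucene 7.4 API that the hunks above migrate to; the index path and input file are placeholders:

    import java.io.File;
    import java.nio.file.Paths;

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field.Store;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.tika.Tika;

    public class IndexOneFile {
        public static void main(String[] args) throws Exception {
            Tika tika = new Tika();
            IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
            try (IndexWriter writer =
                    new IndexWriter(FSDirectory.open(Paths.get("/tmp/index")), config)) {
                File file = new File("/tmp/sample.pdf");
                Document document = new Document();
                // TextField(name, value, Store) replaces the removed
                // Field(name, value, Store, Index.ANALYZED) constructor
                document.add(new TextField("filename", file.getName(), Store.YES));
                document.add(new TextField("fulltext", tika.parseToString(file), Store.NO));
                writer.addDocument(document);
            }
        }
    }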
2222 import java.util.Date;
2323
2424 import org.apache.lucene.document.Document;
25 import org.apache.lucene.document.Field;
26 import org.apache.lucene.document.Field.Index;
2725 import org.apache.lucene.document.Field.Store;
26 import org.apache.lucene.document.TextField;
2827 import org.apache.lucene.index.IndexWriter;
2928 import org.apache.tika.Tika;
3029 import org.apache.tika.metadata.DublinCore;
5352 for (String key : met.names()) {
5453 String[] values = met.getValues(key);
5554 for (String val : values) {
56 document.add(new Field(key, val, Store.YES, Index.ANALYZED));
55 document.add(new TextField(key, val, Store.YES));
5756 }
5857 writer.addDocument(document);
5958 }
7877 for (String key : met.names()) {
7978 String[] values = met.getValues(key);
8079 for (String val : values) {
81 document.add(new Field(key, val, Store.YES, Index.ANALYZED));
80 document.add(new TextField(key, val, Store.YES));
8281 }
8382 writer.addDocument(document);
8483 }
3838 import org.apache.tika.sax.BasicContentHandlerFactory;
3939 import org.apache.tika.sax.BodyContentHandler;
4040 import org.apache.tika.sax.ContentHandlerFactory;
41 import org.apache.tika.sax.RecursiveParserWrapperHandler;
4142 import org.xml.sax.SAXException;
4243 import org.xml.sax.helpers.DefaultHandler;
4344
162163 ContentHandlerFactory factory = new BasicContentHandlerFactory(
163164 BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1);
164165
165 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p, factory);
166 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p);
166167 Metadata metadata = new Metadata();
167168 metadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded.docx");
168169 ParseContext context = new ParseContext();
169
170 RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(factory, -1);
170171 try (InputStream stream = ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx")) {
171 wrapper.parse(stream, new DefaultHandler(), metadata, context);
172 }
173 return wrapper.getMetadata();
172 wrapper.parse(stream, handler, metadata, context);
173 }
174
175 return handler.getMetadataList();
174176 }
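
The 1.19 calling pattern above, as a self-contained sketch; the input path is a placeholder:

    import java.io.InputStream;
    import java.nio.file.Files;
    import java.nio.file.Paths;
    import java.util.List;

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.parser.AutoDetectParser;
    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.parser.RecursiveParserWrapper;
    import org.apache.tika.sax.BasicContentHandlerFactory;
    import org.apache.tika.sax.RecursiveParserWrapperHandler;

    public class RecursiveParseSketch {
        public static void main(String[] args) throws Exception {
            RecursiveParserWrapper wrapper = new RecursiveParserWrapper(new AutoDetectParser());
            // the handler, not the wrapper, now owns the ContentHandlerFactory
            RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
                    new BasicContentHandlerFactory(
                            BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1), -1);
            try (InputStream stream = Files.newInputStream(Paths.get("test_recursive_embedded.docx"))) {
                wrapper.parse(stream, handler, new Metadata(), new ParseContext());
            }
            // one Metadata per document: index 0 is the container, the rest are attachments
            List<Metadata> metadataList = handler.getMetadataList();
            System.out.println(metadataList.size() + " documents parsed");
        }
    }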
175177
176178 /**
1616
1717 package org.apache.tika.example;
1818
19 import java.io.File;
2019 import java.io.IOException;
20 import java.nio.file.Path;
2121 import java.text.SimpleDateFormat;
2222 import java.util.Date;
2323 import java.util.GregorianCalendar;
2727 import org.apache.jackrabbit.util.ISO8601;
2828 import org.apache.lucene.document.Document;
2929 import org.apache.lucene.index.CorruptIndexException;
30 import org.apache.lucene.index.DirectoryReader;
3031 import org.apache.lucene.index.IndexReader;
3132 import org.apache.lucene.search.IndexSearcher;
3233 import org.apache.lucene.search.ScoreDoc;
3334 import org.apache.lucene.search.TermRangeQuery;
3435 import org.apache.lucene.search.TopScoreDocCollector;
35 import org.apache.lucene.store.SimpleFSDirectory;
36 import org.apache.lucene.store.FSDirectory;
37 import org.apache.lucene.util.BytesRef;
3638 import org.apache.tika.metadata.DublinCore;
3739 import org.apache.tika.metadata.Metadata;
3840
4850 private SimpleDateFormat rssDateFormat = new SimpleDateFormat(
4951 "E, dd MMM yyyy HH:mm:ss z", Locale.getDefault());
5052
51 public String generateRSS(File indexFile) throws CorruptIndexException,
53 public String generateRSS(Path indexFile) throws CorruptIndexException,
5254 IOException {
5355 StringBuffer output = new StringBuffer();
5456 output.append(getRSSHeaders());
5557 IndexSearcher searcher = null;
5658 try {
57 reader = IndexReader.open(new SimpleFSDirectory(indexFile));
59 reader = DirectoryReader.open(FSDirectory.open(indexFile));
5860 searcher = new IndexSearcher(reader);
5961 GregorianCalendar gc = new java.util.GregorianCalendar(TimeZone.getDefault(), Locale.getDefault());
6062 gc.setTime(new Date());
6163 String nowDateTime = ISO8601.format(gc);
6264 gc.add(java.util.GregorianCalendar.MINUTE, -5);
6365 String fiveMinsAgo = ISO8601.format(gc);
64 TermRangeQuery query = new TermRangeQuery(Metadata.DATE.toString(),
65 fiveMinsAgo, nowDateTime, true, true);
66 TopScoreDocCollector collector = TopScoreDocCollector.create(20,
67 true);
66 TermRangeQuery query = new TermRangeQuery(
67 Metadata.DATE.toString(),
68 new BytesRef(fiveMinsAgo), new BytesRef(nowDateTime),
69 true, true);
70 TopScoreDocCollector collector = TopScoreDocCollector.create(20);
6871 searcher.search(query, collector);
6972 ScoreDoc[] hits = collector.topDocs().scoreDocs;
7073 for (int i = 0; i < hits.length; i++) {
7477
7578 } finally {
7679 if (reader != null) reader.close();
77 if (searcher != null) searcher.close();
7880 }
7981
8082 output.append(getRSSFooters());
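
For reference, the Lucene 7 form of the range query above takes BytesRef bounds, and IndexSearcher no longer has a close() method (only the reader is closed); the timestamps here are placeholders:

    // a fragment, assuming the imports shown in the hunk above
    TermRangeQuery query = new TermRangeQuery(
            "date",                                // the code above uses Metadata.DATE.toString()
            new BytesRef("2018-09-14T12:00:00Z"),  // lower bound, inclusive
            new BytesRef("2018-09-14T12:05:00Z"),  // upper bound, inclusive
            true, true);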
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.18</version>
27 <version>1.19</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.18</version>
27 <version>1.19</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030
3939 <artifactId>tika-core</artifactId>
4040 <version>${project.version}</version>
4141 </dependency>
42
43 <!-- for java 10 -->
44 <dependency>
45 <groupId>javax.xml.bind</groupId>
46 <artifactId>jaxb-api</artifactId>
47 <version>${jaxb.version}</version>
48 </dependency>
49 <dependency>
50 <groupId>com.sun.xml.bind</groupId>
51 <artifactId>jaxb-core</artifactId>
52 <version>${jaxb.version}</version>
53 </dependency>
54 <dependency>
55 <groupId>com.sun.xml.bind</groupId>
56 <artifactId>jaxb-impl</artifactId>
57 <version>${jaxb.version}</version>
58 </dependency>
59 <dependency>
60 <groupId>javax.activation</groupId>
61 <artifactId>activation</artifactId>
62 <version>1.1.1</version>
63 </dependency>
64
4265 <dependency>
4366 <groupId>com.optimaize.languagedetector</groupId>
4467 <artifactId>language-detector</artifactId>
115138 <plugin>
116139 <groupId>org.apache.maven.plugins</groupId>
117140 <artifactId>maven-jar-plugin</artifactId>
141 <configuration>
142 <archive>
143 <manifestEntries>
144 <Automatic-Module-Name>org.apache.tika.langdetect</Automatic-Module-Name>
145 </manifestEntries>
146 </archive>
147 </configuration>
118148 <executions>
119149 <execution>
120150 <goals>
2929 import org.apache.tika.language.detect.LanguageNames;
3030 import org.apache.tika.language.detect.LanguageResult;
3131
32 import com.google.common.collect.ImmutableList;
33 import com.google.common.collect.ImmutableSet;
3234 import com.optimaize.langdetect.DetectedLanguage;
3335 import com.optimaize.langdetect.LanguageDetectorBuilder;
3436 import com.optimaize.langdetect.i18n.LdLocale;
4345 */
4446 public class OptimaizeLangDetector extends LanguageDetector {
4547
48 private static final List<LanguageProfile> DEFAULT_LANGUAGE_PROFILES;
49 private static final ImmutableSet<String> DEFAULT_LANGUAGES;
50 private static final com.optimaize.langdetect.LanguageDetector DEFAULT_DETECTOR;
51
52
53 static {
54 try {
55 DEFAULT_LANGUAGE_PROFILES = ImmutableList.copyOf(new LanguageProfileReader().readAllBuiltIn());
56
57 ImmutableSet.Builder<String> builder = new ImmutableSet.Builder<>();
58 for (LanguageProfile profile : DEFAULT_LANGUAGE_PROFILES) {
59 builder.add(makeLanguageName(profile.getLocale()));
60 }
61 DEFAULT_LANGUAGES = builder.build();
62
63 DEFAULT_DETECTOR = createDetector(DEFAULT_LANGUAGE_PROFILES, null);
64 } catch (IOException e) {
65 throw new RuntimeException("can't initialize OptimaizeLangDetector", e);
66 }
67 }
68
4669 private static final int MAX_CHARS_FOR_DETECTION = 20000;
4770 private static final int MAX_CHARS_FOR_SHORT_DETECTION = 200;
4871
5073 private CharArrayWriter writer;
5174 private Set<String> languages;
5275 private Map<String, Float> languageProbabilities;
53
76
5477 public OptimaizeLangDetector() {
5578 super();
5679
5881 }
5982
6083 @Override
61 public LanguageDetector loadModels() throws IOException {
62 List<LanguageProfile> languageProfiles = new LanguageProfileReader().readAllBuiltIn();
63
84 public LanguageDetector loadModels() {
6485 // FUTURE when the "language-detector" project supports short profiles, check if
6586 // isShortText() returns true and switch to those.
66
67 languages = new HashSet<>();
68 for (LanguageProfile profile : languageProfiles) {
69 languages.add(makeLanguageName(profile.getLocale()));
70 }
71
72 detector = createDetector(languageProfiles);
73
87
88 languages = DEFAULT_LANGUAGES;
89
90 if (languageProbabilities != null) {
91 detector = createDetector(DEFAULT_LANGUAGE_PROFILES, languageProbabilities);
92 } else {
93 detector = DEFAULT_DETECTOR;
94 }
95
7496 return this;
7597
7698 }
7799
78 private String makeLanguageName(LdLocale locale) {
100 private static String makeLanguageName(LdLocale locale) {
79101 return LanguageNames.makeName(locale.getLanguage(), locale.getScript().orNull(), locale.getRegion().orNull());
80102 }
81103
97119 }
98120 }
99121
100 detector = createDetector(new LanguageProfileReader().readBuiltIn(locales));
122 detector = createDetector(new LanguageProfileReader().readBuiltIn(locales), languageProbabilities);
101123
102124 return this;
103125 }
104126
105 private com.optimaize.langdetect.LanguageDetector createDetector(List<LanguageProfile> languageProfiles) {
127 private static com.optimaize.langdetect.LanguageDetector createDetector(List<LanguageProfile> languageProfiles, Map<String, Float> languageProbabilities) {
106128 // FUTURE currently the short text algorithm doesn't normalize probabilities until the end, which
107129 // means you can often get 0 probabilities. So we pick a very short length for this limit.
108130 LanguageDetectorBuilder builder = LanguageDetectorBuilder.create(NgramExtractors.standard())
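
Because the built-in profiles are now loaded once in the static initializer, loadModels() is cheap to call per instance; a minimal usage sketch:

    import org.apache.tika.langdetect.OptimaizeLangDetector;
    import org.apache.tika.language.detect.LanguageDetector;
    import org.apache.tika.language.detect.LanguageResult;

    public class DetectSketch {
        public static void main(String[] args) throws Exception {
            LanguageDetector detector = new OptimaizeLangDetector().loadModels();
            LanguageResult result = detector.detect("Alles hat ein Ende, nur die Wurst hat zwei.");
            System.out.println(result.getLanguage()); // expected: "de"
        }
    }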
2323 <parent>
2424 <groupId>org.apache.tika</groupId>
2525 <artifactId>tika-parent</artifactId>
26 <version>1.18</version>
26 <version>1.19</version>
2727 <relativePath>../tika-parent/pom.xml</relativePath>
2828 </parent>
2929
4343 <artifactId>tika-parsers</artifactId>
4444 <version>${project.version}</version>
4545 <scope>provided</scope>
46 </dependency>
47 <dependency>
48 <groupId>junit</groupId>
49 <artifactId>junit</artifactId>
50 <scope>test</scope>
5146 </dependency>
5247
5348 <dependency>
136131 <groupId>org.apache.avro</groupId>
137132 <artifactId>avro</artifactId>
138133 </exclusion>
134 <exclusion>
135 <groupId>javax.ws.rs</groupId>
136 <artifactId>javax.ws.rs-api</artifactId>
137 </exclusion>
138 <exclusion>
139 <groupId>javax.annotation</groupId>
140 <artifactId>javax.annotation-api</artifactId>
141 </exclusion>
139142 </exclusions>
140143 </dependency>
141144 <dependency>
201204 </dependency>
202205 <!-- Test dependencies -->
203206 <dependency>
207 <groupId>org.mockito</groupId>
208 <artifactId>mockito-core</artifactId>
209 <version>${mockito.version}</version>
210 <scope>test</scope>
211 </dependency>
212 <dependency>
213 <groupId>org.slf4j</groupId>
214 <artifactId>slf4j-log4j12</artifactId>
215 <scope>test</scope>
216 </dependency>
217 <dependency>
204218 <groupId>junit</groupId>
205219 <artifactId>junit</artifactId>
206 </dependency>
207 <dependency>
208 <groupId>org.mockito</groupId>
209 <artifactId>mockito-core</artifactId>
210 <version>2.15.0</version>
211 <scope>test</scope>
212 </dependency>
213 <dependency>
214 <groupId>org.slf4j</groupId>
215 <artifactId>slf4j-log4j12</artifactId>
216 <scope>test</scope>
217 </dependency>
218
220 <scope>test</scope>
221 </dependency>
222
219223 </dependencies>
220224
221225 <build>
3030
3131 <groupId>org.apache.tika</groupId>
3232 <artifactId>tika-parent</artifactId>
33 <version>1.18</version>
33 <version>1.19</version>
3434 <packaging>pom</packaging>
3535
3636 <name>Apache Tika parent</name>
300300 </dependencyManagement>
301301
302302 <properties>
303 <maven.compiler.source>1.7</maven.compiler.source>
304 <maven.compiler.target>1.7</maven.compiler.target>
303 <maven.compiler.source>1.8</maven.compiler.source>
304 <maven.compiler.target>1.8</maven.compiler.target>
305305 <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
306306 <project.reporting.outputEncoding>${project.build.sourceEncoding}</project.reporting.outputEncoding>
307307 <!-- NOTE: sync tukaani version with commons-compress in tika-parsers -->
308 <commons.compress.version>1.16.1</commons.compress.version>
308 <commons.compress.version>1.18</commons.compress.version>
309309 <commons.io.version>2.6</commons.io.version>
310 <gson.version>2.8.1</gson.version>
311 <cxf.version>3.0.16</cxf.version>
312 <slf4j.version>1.7.24</slf4j.version>
313 <jackson.version>2.9.5</jackson.version>
310 <gson.version>2.8.5</gson.version>
311 <cxf.version>3.2.6</cxf.version>
312 <slf4j.version>1.7.25</slf4j.version>
313 <jackson.version>2.9.6</jackson.version>
314 <jaxb.version>2.3.0</jaxb.version>
315 <mockito.version>2.20.0</mockito.version>
314316 </properties>
315317
316318 <build>
352354 <plugin>
353355 <groupId>org.apache.felix</groupId>
354356 <artifactId>maven-bundle-plugin</artifactId>
355 <version>3.3.0</version>
357 <version>3.5.1</version>
356358 </plugin>
357359 <plugin>
358360 <groupId>org.apache.maven.plugins</groupId>
459461 <connection>scm:git:https://github.com/apache/</connection>
460462 <developerConnection>scm:git:https://github.com/apache/</developerConnection>
461463 <url>https://github.com/apache/tika</url>
462 <tag>1.18-rc3</tag>
464 <tag>1.19-rc1</tag>
463465 </scm>
464466 </project>
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.18</version>
27 <version>1.19</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030
3434 <url>http://tika.apache.org/</url>
3535
3636 <properties>
37 <poi.version>3.17</poi.version>
37 <poi.version>4.0.0</poi.version>
3838 <!-- NOTE: sync codec version with POI -->
39 <codec.version>1.10</codec.version>
39 <codec.version>1.11</codec.version>
4040 <!-- NOTE: sync tukaani version with commons-compress in tika-parent-->
4141 <tukaani.version>1.8</tukaani.version>
4242 <!-- NOTE: sync brotli version with commons-compress in tika-parent-->
4343 <brotli.version>0.1.2</brotli.version>
44 <mime4j.version>0.8.1</mime4j.version>
44 <mime4j.version>0.8.2</mime4j.version>
4545 <vorbis.version>0.8</vorbis.version>
46 <pdfbox.version>2.0.9</pdfbox.version>
47 <jempbox.version>1.8.13</jempbox.version>
46 <pdfbox.version>2.0.11</pdfbox.version>
47 <jempbox.version>1.8.15</jempbox.version>
4848 <netcdf-java.version>4.5.5</netcdf-java.version>
4949 <sis.version>0.8</sis.version>
50 <parso.version>2.0.9</parso.version>
5051 <!-- used by POI, PDFBox and Jackcess ...try to sync -->
51 <bouncycastle.version>1.54</bouncycastle.version>
52 <bouncycastle.version>1.60</bouncycastle.version>
5253 <commonsexec.version>1.3</commonsexec.version>
53 <httpcomponents.version>4.5.4</httpcomponents.version>
54 <httpcomponents.version>4.5.6</httpcomponents.version>
5455 </properties>
5556
5657 <dependencies>
7778 <scope>test</scope>
7879 </dependency>
7980
81 <!-- for java 10 -->
82 <dependency>
83 <groupId>javax.xml.bind</groupId>
84 <artifactId>jaxb-api</artifactId>
85 <version>${jaxb.version}</version>
86 </dependency>
87 <dependency>
88 <groupId>com.sun.xml.bind</groupId>
89 <artifactId>jaxb-core</artifactId>
90 <version>${jaxb.version}</version>
91 </dependency>
92 <dependency>
93 <groupId>com.sun.xml.bind</groupId>
94 <artifactId>jaxb-impl</artifactId>
95 <version>${jaxb.version}</version>
96 </dependency>
97 <dependency>
98 <groupId>javax.activation</groupId>
99 <artifactId>activation</artifactId>
100 <version>1.1.1</version>
101 </dependency>
102
80103 <!-- Externally Maintained Parsers -->
81104 <dependency>
82105 <groupId>org.gagravarr</groupId>
92115 <dependency>
93116 <groupId>com.healthmarketscience.jackcess</groupId>
94117 <artifactId>jackcess</artifactId>
95 <version>2.1.10</version>
118 <version>2.1.12</version>
96119 <exclusions>
97120 <exclusion>
98121 <groupId>commons-logging</groupId>
129152 <dependency>
130153 <groupId>org.tallison</groupId>
131154 <artifactId>jmatio</artifactId>
132 <version>1.2</version>
155 <version>1.4</version>
133156 </dependency>
134157 <dependency>
135158 <groupId>org.apache.james</groupId>
152175 <version>${tukaani.version}</version>
153176 </dependency>
154177 <dependency>
178 <groupId>com.epam</groupId>
179 <artifactId>parso</artifactId>
180 <version>${parso.version}</version>
181 </dependency>
182 <dependency>
155183 <groupId>org.brotli</groupId>
156184 <artifactId>dec</artifactId>
157185 <version>${brotli.version}</version>
159187 <dependency>
160188 <groupId>com.github.luben</groupId>
161189 <artifactId>zstd-jni</artifactId>
162 <version>1.3.3-3</version>
190 <version>1.3.5-3</version>
163191 <scope>provided</scope>
164192 </dependency>
165193
216244 <groupId>org.apache.poi</groupId>
217245 <artifactId>poi</artifactId>
218246 <version>${poi.version}</version>
247 <exclusions>
248 <exclusion>
249 <groupId>commons-codec</groupId>
250 <artifactId>commons-codec</artifactId>
251 </exclusion>
252 </exclusions>
219253 </dependency>
220254 <dependency>
221255 <groupId>org.apache.poi</groupId>
245279 <dependency>
246280 <groupId>org.ow2.asm</groupId>
247281 <artifactId>asm</artifactId>
248 <version>5.0.4</version>
282 <version>6.2</version>
249283 </dependency>
250284 <dependency>
251285 <groupId>com.googlecode.mp4parser</groupId>
252286 <artifactId>isoparser</artifactId>
253 <version>1.1.18</version>
287 <version>1.1.22</version>
254288 </dependency>
255289 <dependency>
256290 <groupId>com.drewnoakes</groupId>
257291 <artifactId>metadata-extractor</artifactId>
258 <version>2.10.1</version>
292 <version>2.11.0</version>
259293 </dependency>
260294 <dependency>
261295 <groupId>de.l3s.boilerpipe</groupId>
286320 <dependency>
287321 <groupId>org.codelibs</groupId>
288322 <artifactId>jhighlight</artifactId>
289 <version>1.0.2</version>
323 <version>1.0.3</version>
324 <exclusions>
325 <exclusion>
326 <groupId>commons-io</groupId>
327 <artifactId>commons-io</artifactId>
328 </exclusion>
329 </exclusions>
290330 </dependency>
291331 <!-- can't upgrade to java-libpst 0.9.3 because it requires Java 8
292332 and is buggy with OST TIKA-2415 -->
298338 <dependency>
299339 <groupId>com.github.junrar</groupId>
300340 <artifactId>junrar</artifactId>
301 <version>0.7</version>
341 <version>2.0.0</version>
302342 <exclusions>
303343 <exclusion>
304344 <groupId>commons-logging</groupId>
340380 <dependency>
341381 <groupId>org.apache.opennlp</groupId>
342382 <artifactId>opennlp-tools</artifactId>
343 <version>1.8.4</version>
383 <version>1.9.0</version>
344384 </dependency>
345385
346386 <dependency>
399439 <dependency>
400440 <groupId>org.mockito</groupId>
401441 <artifactId>mockito-core</artifactId>
402 <version>2.15.0</version>
442 <version>${mockito.version}</version>
403443 <scope>test</scope>
404444 </dependency>
405445 <dependency>
423463 <groupId>org.jdom</groupId>
424464 <artifactId>jdom2</artifactId>
425465 </exclusion>
466 <!--TIKA-2672: exclude jna to resolve the dependency convergence with tika-dl's
467 deeplearning4j-nn:1.0.0-SNAPSHOT-->
468 <exclusion>
469 <groupId>net.java.dev.jna</groupId>
470 <artifactId>jna</artifactId>
471 </exclusion>
426472 </exclusions>
427473 </dependency>
428474 <dependency>
444490 </exclusion>
445491 </exclusions>
446492 </dependency>
493 <!--TIKA-2672: include a later version of jna as a direct dependency to resolve dependency convergence with tika-dl's
494 deeplearning4j-nn:1.0.0-SNAPSHOT -->
495 <dependency>
496 <groupId>net.java.dev.jna</groupId>
497 <artifactId>jna</artifactId>
498 <version>4.3.0</version>
499 </dependency>
500
447501 <!-- grib's current jsoup is vulnerable to xss
448502 exclude and import a more modern version TIKA-2561-->
449503 <dependency>
450504 <groupId>org.jsoup</groupId>
451505 <artifactId>jsoup</artifactId>
452 <version>1.11.2</version>
453 </dependency> <dependency>
506 <version>1.11.3</version>
507 </dependency>
508 <dependency>
454509 <groupId>edu.ucar</groupId>
455510 <artifactId>cdm</artifactId>
456511 <version>${netcdf-java.version}</version>
506561 <groupId>commons-logging</groupId>
507562 <artifactId>commons-logging</artifactId>
508563 </exclusion>
564 <exclusion>
565 <groupId>commons-codec</groupId>
566 <artifactId>commons-codec</artifactId>
567 </exclusion>
509568 </exclusions>
510569 </dependency>
511570 <dependency>
518577 <dependency>
519578 <groupId>org.apache.commons</groupId>
520579 <artifactId>commons-csv</artifactId>
521 <version>1.0</version>
580 <version>1.5</version>
522581 </dependency>
523582
524583 <dependency>
663722 <groupId>commons-io</groupId>
664723 <artifactId>commons-io</artifactId>
665724 </exclusion>
725 <exclusion>
726 <groupId>commons-logging</groupId>
727 <artifactId>commons-logging-api</artifactId>
728 </exclusion>
729 <exclusion>
730 <groupId>commons-logging</groupId>
731 <artifactId>commons-logging</artifactId>
732 </exclusion>
733 <exclusion>
734 <groupId>org.springframework</groupId>
735 <artifactId>spring-context</artifactId>
736 </exclusion>
737 <exclusion>
738 <groupId>org.springframework</groupId>
739 <artifactId>spring-beans</artifactId>
740 </exclusion>
741 <exclusion>
742 <groupId>org.springframework</groupId>
743 <artifactId>spring-core</artifactId>
744 </exclusion>
666745 </exclusions>
667746 </dependency>
668747 <!-- need to specify this to avoid
708787 <dependency>
709788 <groupId>org.apache.pdfbox</groupId>
710789 <artifactId>jbig2-imageio</artifactId>
711 <version>3.0.0</version>
790 <version>3.0.1</version>
712791 </dependency>
713792
714793 <!-- jai-imageio-core is allowed since LEGAL-304 -->
715794 <dependency>
716795 <groupId>com.github.jai-imageio</groupId>
717796 <artifactId>jai-imageio-core</artifactId>
718 <version>1.3.1</version>
797 <version>1.4.0</version>
719798 </dependency>
720799 <!-- For legal reasons (incompatible license), jai-imageio-jpeg2000 is to be used
721800 only in the tests and may not be distributed. See also LEGAL-195 -->
731810 </exclusion>
732811 </exclusions>
733812 </dependency>
734
735813 </dependencies>
736814 <build>
737815 <plugins>
767845 <plugin>
768846 <groupId>org.apache.maven.plugins</groupId>
769847 <artifactId>maven-jar-plugin</artifactId>
848 <configuration>
849 <archive>
850 <manifestEntries>
851 <Automatic-Module-Name>org.apache.tika.parsers</Automatic-Module-Name>
852 </manifestEntries>
853 </archive>
854 </configuration>
770855 <executions>
771856 <execution>
772857 <goals>
3131 import javax.sound.sampled.AudioSystem;
3232 import javax.sound.sampled.UnsupportedAudioFileException;
3333
34 import org.apache.commons.io.IOUtils;
3435 import org.apache.tika.exception.TikaException;
36 import org.apache.tika.io.ProxyInputStream;
3537 import org.apache.tika.metadata.Metadata;
3638 import org.apache.tika.metadata.XMPDM;
3739 import org.apache.tika.mime.MediaType;
6567 if (!stream.markSupported()) {
6668 stream = new BufferedInputStream(stream);
6769 }
70 stream = new SkipFullyInputStream(stream);
6871 try {
6972 AudioFileFormat fileFormat = AudioSystem.getAudioFileFormat(stream);
7073 Type type = fileFormat.getType();
136139 }
137140 }
138141
142 private static class SkipFullyInputStream extends ProxyInputStream {
143
144 public SkipFullyInputStream(InputStream proxy) {
145 super(proxy);
146 }
147
148 @Override
149 public long skip(long ln) throws IOException {
150 IOUtils.skipFully(in, ln);
151 return ln;
152 }
153 }
154
139155 }
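
The wrapper exists because javax.sound's readers assume skip(n) skips exactly n bytes, while the InputStream contract only promises at most n; commons-io's skipFully closes that gap:

    // a fragment: in is any InputStream
    long n = in.skip(1000);          // contract: n may be anywhere in [0, 1000]
    IOUtils.skipFully(in, 1000);     // loops until 1000 bytes are skipped,
                                     // or throws EOFException if the stream ends first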
3232 import org.apache.tika.sax.OfflineContentHandler;
3333 import org.apache.tika.sax.TaggedContentHandler;
3434 import org.apache.tika.sax.XHTMLContentHandler;
35 import org.apache.tika.utils.XMLReaderUtils;
3536 import org.xml.sax.ContentHandler;
3637 import org.xml.sax.SAXException;
38
39 import javax.xml.parsers.SAXParser;
3740
3841 public class DIFParser extends AbstractParser {
3942
6164 xhtml.startElement("p");
6265 TaggedContentHandler tagged = new TaggedContentHandler(handler);
6366 try {
64 context.getSAXParser().parse(
67 XMLReaderUtils.parseSAX(
6568 new CloseShieldInputStream(stream),
6669 new OfflineContentHandler(new EmbeddedContentHandler(
67 getContentHandler(tagged, metadata, context))));
70 getContentHandler(tagged, metadata, context))), context);
6871 } catch (SAXException e) {
6972 tagged.throwIfCauseOf(e);
7073 throw new TikaException("XML parse error", e);
1515 */
1616 package org.apache.tika.parser.epub;
1717
18 import javax.xml.parsers.SAXParser;
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.util.Collections;
22 import java.util.Set;
23
2418 import org.apache.commons.io.input.CloseShieldInputStream;
2519 import org.apache.tika.exception.TikaException;
2620 import org.apache.tika.metadata.Metadata;
2822 import org.apache.tika.parser.AbstractParser;
2923 import org.apache.tika.parser.ParseContext;
3024 import org.apache.tika.sax.OfflineContentHandler;
25 import org.apache.tika.utils.XMLReaderUtils;
3126 import org.xml.sax.ContentHandler;
3227 import org.xml.sax.SAXException;
28
29 import java.io.IOException;
30 import java.io.InputStream;
31 import java.util.Collections;
32 import java.util.Set;
3333
3434 /**
3535 * Parser for EPUB OPS <code>*.html</code> files.
4747 Metadata metadata, ParseContext context)
4848 throws IOException, SAXException, TikaException {
4949
50 SAXParser parser = context.getSAXParser();
51 parser.parse(
50 XMLReaderUtils.parseSAX(
5251 new CloseShieldInputStream(stream),
53 new OfflineContentHandler(handler));
52 new OfflineContentHandler(handler), context);
5453 }
5554
5655 }
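
Both parsers now route XML parsing through XMLReaderUtils.parseSAX, which borrows a SAXParser from a shared pool instead of building a new one per parse; the calling pattern is simply:

    // a fragment of the pattern used in both hunks above
    XMLReaderUtils.parseSAX(
            new CloseShieldInputStream(stream),   // keep the caller's stream open
            new OfflineContentHandler(handler),   // block external entity/DTD resolution
            context);                             // the per-parse ParseContext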
104104 meta.parse(zip, new DefaultHandler(), metadata, context);
105105 } else if (entry.getName().endsWith(".opf")) {
106106 meta.parse(zip, new DefaultHandler(), metadata, context);
107 } else if (entry.getName().endsWith(".html") ||
107 } else if (entry.getName().endsWith(".htm") ||
108 entry.getName().endsWith(".html") ||
108109 entry.getName().endsWith(".xhtml")) {
109110 content.parse(zip, childHandler, metadata, context);
110111 }
8282
8383
8484 private static final Pattern HTTP_META_PATTERN = Pattern.compile(
85 "(?is)<\\s*meta\\s+([^<>]+)"
85 "(?is)<\\s*meta(?:/|\\s+)([^<>]+)"
8686 );
8787
8888 //this should match both the older:
9696 //For a more general "not" matcher, try:
9797 //("(?is)charset\\s*=\\s*['\\\"]?\\s*([^<>\\s'\\\";]+)")
9898 private static final Pattern FLEXIBLE_CHARSET_ATTR_PATTERN = Pattern.compile(
99 ("(?is)charset\\s*=\\s*(?:['\\\"]\\s*)?([-_:\\.a-z0-9]+)")
99 ("(?is)\\bcharset\\s*=\\s*(?:['\\\"]\\s*)?([-_:\\.a-z0-9]+)")
100100 );
101101
102102 private static final Charset ASCII = Charset.forName("US-ASCII");
153153 if (CHARSETS_UNSUPPORTED_BY_IANA.contains(candCharset.toLowerCase(Locale.US))) {
154154 continue;
155155 }
156 if ("x-user-defined".equalsIgnoreCase(candCharset)) {
157 candCharset = "windows-1252";
158 }
159
156160 if (CharsetUtils.isSupported(candCharset)) {
157161 try {
158162 return CharsetUtils.forName(candCharset);
118118 String uri, String local, String name, Attributes atts)
119119 throws SAXException {
120120
121 if ("HTML".equals(name) && atts.getValue("lang") != null) {
122 metadata.set(Metadata.CONTENT_LANGUAGE, atts.getValue("lang"));
123 }
121124 if ("SCRIPT".equals(name)) {
122125 scriptLevel++;
123126 }
2828 import org.apache.tika.detect.AutoDetectReader;
2929 import org.apache.tika.detect.EncodingDetector;
3030 import org.apache.tika.exception.TikaException;
31 import org.apache.tika.io.TemporaryResources;
32 import org.apache.tika.io.TikaInputStream;
3133 import org.apache.tika.metadata.Metadata;
3234 import org.apache.tika.mime.MediaType;
3335 import org.apache.tika.parser.AbstractEncodingDetectorParser;
3941 import org.xml.sax.ContentHandler;
4042 import org.xml.sax.SAXException;
4143
44
4246 /**
4347 * HTML parser. Uses TagSoup to turn the input document to HTML SAX events,
4448 * and post-processes the events to produce XHTML and metadata expected by
8993 Metadata metadata, ParseContext context)
9094 throws IOException, SAXException, TikaException {
9195
96 TemporaryResources tmp = null;
97 try {
98 if (!TikaInputStream.isTikaInputStream(stream)) {
99 tmp = new TemporaryResources();
100 stream = TikaInputStream.get(stream, tmp);
101 }
102 //AutoDetectReader can throw exceptions during
103 //initialization. If we just created a
104 //TemporaryResources, we need to make sure to close it.
105 parseImpl(stream, handler, metadata, context);
106 } finally {
107 if (tmp != null) {
108 tmp.close();
109 }
110 }
111
112 }
113
114
115 private void parseImpl(InputStream stream, ContentHandler handler,
116 Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
92117 // Automatically detect the character encoding
93118 try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream),
94119 metadata, getEncodingDetector(context))) {
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.html.charsetdetector;
17
18
19 import org.apache.tika.parser.html.charsetdetector.charsets.ReplacementCharset;
20 import org.apache.tika.parser.html.charsetdetector.charsets.XUserDefinedCharset;
21
22 import java.nio.charset.Charset;
23 import java.nio.charset.IllegalCharsetNameException;
24 import java.nio.charset.StandardCharsets;
25 import java.nio.charset.UnsupportedCharsetException;
26 import java.util.HashMap;
27 import java.util.Locale;
28 import java.util.Map;
29
30 /**
31 * Singleton class that associates standard charset names to java charset implementations
32 * https://encoding.spec.whatwg.org/#ref-for-iso-8859-8-i
33 */
34 final class CharsetAliases {
35
36 private static final Map<String, Charset> charsetsByLabel = new HashMap<>();
37
38 private CharsetAliases() {
39 }
40
41 /**
42 * @param label a charset name
43 * @return the corresponding java charset, if there is one. Otherwise, null
44 */
45 static Charset getCharsetByLabel(String label) {
46 if (label == null) return null;
47 synchronized (charsetsByLabel) {
48 // Lazy initialization
49 if (charsetsByLabel.isEmpty()) addAll();
50 }
51 label = label.trim().toLowerCase(Locale.US);
52 return charsetsByLabel.get(label);
53 }
54
55 private static void addAll() {
56 addCharset(charset("Big5"), "big5", "big5-hkscs", "cn-big5", "csbig5", "x-x-big5");
57 addCharset(charset("EUC-JP"), "cseucpkdfmtjapanese", "euc-jp", "x-euc-jp");
58 addCharset(charset("EUC-KR"), "cseuckr", "csksc56011987", "euc-kr", "iso-ir-149", "korean",
59 "ks_c_5601-1987", "ks_c_5601-1989", "ksc5601", "ksc_5601", "windows-949");
60 addCharset(charset("GBK"), "chinese", "csgb2312", "csiso58gb231280", "gb2312", "gb_2312",
61 "gb_2312-80", "gbk", "iso-ir-58", "x-gbk");
62 addCharset(charset("IBM866"), "866", "cp866", "csibm866", "ibm866");
63 addCharset(charset("ISO-2022-JP"), "csiso2022jp", "iso-2022-jp");
64 addCharset(charset("ISO-8859-10", "ISO-8859-4"), "csisolatin6", "iso-8859-10", "iso-ir-157",
65 "iso8859-10", "iso885910", "l6", "latin6");
66 addCharset(charset("ISO-8859-13"), "iso-8859-13", "iso8859-13", "iso885913");
67 addCharset(charset("ISO-8859-14", "ISO-8859-1"), "iso-8859-14", "iso8859-14", "iso885914");
68 addCharset(charset("ISO-8859-15"), "csisolatin9", "iso-8859-15", "iso8859-15", "iso885915",
69 "iso_8859-15", "l9");
70 addCharset(charset("ISO-8859-16", "ISO-8859-1"), "iso-8859-16");
71 addCharset(charset("ISO-8859-2"), "csisolatin2", "iso-8859-2", "iso-ir-101", "iso8859-2",
72 "iso88592", "iso_8859-2", "iso_8859-2:1987", "l2", "latin2");
73 addCharset(charset("ISO-8859-3"), "csisolatin3", "iso-8859-3", "iso-ir-109", "iso8859-3",
74 "iso88593", "iso_8859-3", "iso_8859-3:1988", "l3", "latin3");
75 addCharset(charset("ISO-8859-4"), "csisolatin4", "iso-8859-4", "iso-ir-110", "iso8859-4",
76 "iso88594", "iso_8859-4", "iso_8859-4:1988", "l4", "latin4");
77 addCharset(charset("ISO-8859-5"), "csisolatincyrillic", "cyrillic", "iso-8859-5", "iso-ir-144",
78 "iso8859-5", "iso88595", "iso_8859-5", "iso_8859-5:1988");
79 addCharset(charset("ISO-8859-6"), "arabic", "asmo-708", "csiso88596e", "csiso88596i",
80 "csisolatinarabic", "ecma-114", "iso-8859-6", "iso-8859-6-e", "iso-8859-6-i", "iso-ir-127",
81 "iso8859-6", "iso88596", "iso_8859-6", "iso_8859-6:1987");
82 addCharset(charset("ISO-8859-7"), "csisolatingreek", "ecma-118", "elot_928", "greek", "greek8",
83 "iso-8859-7", "iso-ir-126", "iso8859-7", "iso88597", "iso_8859-7", "iso_8859-7:1987", "sun_eu_greek");
84 // ISO-8859-8 should actually influence the layout direction
85 // (text should be decoded in visual order). However, this is not implemented in Tika.
86 addCharset(charset("ISO-8859-8"), "csiso88598e", "csisolatinhebrew", "hebrew", "iso-8859-8",
87 "iso-8859-8-e", "iso-ir-138", "iso8859-8", "iso88598", "iso_8859-8", "iso_8859-8:1988", "visual");
88 addCharset(charset("ISO-8859-8-I", "ISO-8859-8"), "csiso88598i", "iso-8859-8-i", "logical");
89 addCharset(charset("KOI8-R"), "cskoi8r", "koi", "koi8", "koi8-r", "koi8_r");
90 addCharset(charset("KOI8-U"), "koi8-ru", "koi8-u");
91 addCharset(charset("Shift_JIS"), "csshiftjis", "ms932", "ms_kanji", "shift-jis", "shift_jis",
92 "sjis", "windows-31j", "x-sjis");
93 addCharset(charset("UTF-16BE"), "utf-16be");
94 addCharset(charset("UTF-16LE"), "utf-16", "utf-16le");
95 addCharset(charset("UTF-8"), "unicode-1-1-utf-8", "utf-8", "utf8");
96 addCharset(charset("gb18030"), "gb18030");
97 addCharset(charset("windows-1250"), "cp1250", "windows-1250", "x-cp1250");
98 addCharset(charset("windows-1251"), "cp1251", "windows-1251", "x-cp1251");
99 addCharset(charset("windows-1252"), "ansi_x3.4-1968", "ascii", "cp1252", "cp819", "csisolatin1",
100 "ibm819", "iso-8859-1", "iso-ir-100", "iso8859-1", "iso88591", "iso_8859-1", "iso_8859-1:1987",
101 "l1", "latin1", "us-ascii", "windows-1252", "x-cp1252");
102 addCharset(charset("windows-1253"), "cp1253", "windows-1253", "x-cp1253");
103 addCharset(charset("windows-1254"), "cp1254", "csisolatin5", "iso-8859-9", "iso-ir-148",
104 "iso8859-9", "iso88599", "iso_8859-9", "iso_8859-9:1989", "l5", "latin5", "windows-1254", "x-cp1254");
105 addCharset(charset("windows-1255"), "cp1255", "windows-1255", "x-cp1255");
106 addCharset(charset("windows-1256"), "cp1256", "windows-1256", "x-cp1256");
107 addCharset(charset("windows-1257"), "cp1257", "windows-1257", "x-cp1257");
108 addCharset(charset("windows-1258"), "cp1258", "windows-1258", "x-cp1258");
109 addCharset(charset("windows-874"), "dos-874", "iso-8859-11", "iso8859-11", "iso885911",
110 "tis-620", "windows-874");
111 addCharset(charset("x-MacCyrillic"), "x-mac-cyrillic", "x-mac-ukrainian");
112 addCharset(charset("x-MacRoman"), "csmacintosh", "mac", "macintosh", "x-mac-roman");
113 // The "replacement" charset is a dummy charset. It is present to mitigate wrong-charset attacks
114 addCharset(new ReplacementCharset(), "csiso2022kr", "hz-gb-2312", "iso-2022-cn", "iso-2022-cn-ext",
115 "iso-2022-kr", "replacement");
116 // The x-user-defined charset is not present in java
117 addCharset(new XUserDefinedCharset(), "x-user-defined");
118 }
119
120 /**
121 * @param names jvm charset names
122 * @return the first of the given charsets that exists in the current JVM,
123 * or ISO_8859_1 if none exists
124 */
125 private static Charset charset(String... names) {
126 for (String name : names) {
127 try {
128 return Charset.forName(name);
129 } catch (IllegalCharsetNameException | UnsupportedCharsetException e) {/* pass */}
130 }
131 // The only single-byte extended charset that is guaranteed to be present on every Java platform
132 return StandardCharsets.ISO_8859_1;
133 }
134
135 /**
136 * @param charset name of the charset in the JVM
137 * @param names standard W3C charset names
138 */
139 private static void addCharset(Charset charset, String... names) {
140 for (String name : names) {
141 charsetsByLabel.put(name, charset);
142 }
143 }
144 }
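
Labels are trimmed and lower-cased before lookup, so the WHATWG aliases resolve regardless of case; a sketch (the class is package-private, so same-package access is assumed):

    Charset cs1 = CharsetAliases.getCharsetByLabel(" LATIN1 ");  // -> windows-1252, per the WHATWG table
    Charset cs2 = CharsetAliases.getCharsetByLabel("utf8");      // -> UTF-8
    Charset cs3 = CharsetAliases.getCharsetByLabel("no-such");   // -> null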
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.html.charsetdetector;
17
18 import java.nio.charset.Charset;
19
20 import static java.nio.charset.StandardCharsets.UTF_16BE;
21 import static java.nio.charset.StandardCharsets.UTF_16LE;
22 import static java.nio.charset.StandardCharsets.UTF_8;
23
24
25 /**
26 * A detection may either not find a charset, find an invalid charset, or find a valid charset
27 */
28 class CharsetDetectionResult {
29 private boolean found = false;
30 private Charset charset = null;
31
32 private CharsetDetectionResult() { /* default result: not found */}
33
34 static CharsetDetectionResult notFound() {
35 return new CharsetDetectionResult();
36 }
37
38 public boolean isFound() {
39 return found;
40 }
41
42 public void find(String charsetName) {
43 this.found = true;
44 charsetName = charsetName.trim();
45 if ("x-user-defined".equals(charsetName)) charsetName = "windows-1252";
46 this.charset = CharsetAliases.getCharsetByLabel(charsetName);
47 // The specification states: If charset is a UTF-16 encoding, then set charset to UTF-8.
48 if (UTF_16LE.equals(charset) || UTF_16BE.equals(charset)) charset = UTF_8;
49 }
50
51 public Charset getCharset() {
52 // the result may be null even when found is true: a charset was specified,
53 // but it is invalid
54 return charset;
55 }
56
57 public void setCharset(Charset charset) {
58 this.found = true;
59 this.charset = charset;
60 }
61 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.html.charsetdetector;
17
18 import java.util.HashSet;
19 import java.util.Map;
20 import java.util.Set;
21
22 import static org.apache.tika.parser.html.charsetdetector.PreScanner.getEncodingFromMeta;
23
24
25 /**
26 * A class to process the attributes of an HTML meta tag in order to extract a character set.
27 * The user should repeatedly call {@link #processAttribute} on each attribute of the tag,
28 * then update its current detection result with {@link #updateDetectedCharset(CharsetDetectionResult)}
29 * <p>
30 * The algorithm implemented is meant to match the one described by the W3C here:
31 * https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
32 */
33 class MetaProcessor {
34 private Set<String> attributeNames = new HashSet<>();
35 private boolean gotPragma = false;
36 private Boolean needPragma = null; // needPragma can be null, true, or false
37 private CharsetDetectionResult detectionResult = CharsetDetectionResult.notFound();
38
39 void updateDetectedCharset(CharsetDetectionResult currentDetectionResult) {
40 if (detectionResult.isFound() &&
41 needPragma != null &&
42 !(needPragma && !gotPragma)) {
43 currentDetectionResult.setCharset(detectionResult.getCharset());
44 }
45 }
46
47 void processAttribute(Map.Entry<String, String> attribute) {
48 // Ignore duplicate attributes
49 if (attributeNames.contains(attribute.getKey())) return;
50
51 attributeNames.add(attribute.getKey());
52
53 // Handle charset-related attributes
54 switch (attribute.getKey()) {
55 case "http-equiv":
56 if (attribute.getValue().equals("content-type"))
57 gotPragma = true;
58 break;
59 case "content":
60 String charsetName = getEncodingFromMeta(attribute.getValue());
61 if (!detectionResult.isFound() && charsetName != null) {
62 detectionResult.find(charsetName);
63 needPragma = true;
64 }
65 break;
66 case "charset":
67 detectionResult.find(attribute.getValue());
68 needPragma = false;
69 break;
70 default: // Ignore non-charset related attributes
71 }
72 }
73 }
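
A minimal sketch of the calling protocol described above, for a tag such as <meta charset="utf-8"> (same-package access assumed; java.util.AbstractMap supplies the entry type):

    MetaProcessor metaProcessor = new MetaProcessor();
    metaProcessor.processAttribute(new AbstractMap.SimpleEntry<>("charset", "utf-8"));

    CharsetDetectionResult result = CharsetDetectionResult.notFound();
    metaProcessor.updateDetectedCharset(result);
    // result.isFound() is now true and result.getCharset() is UTF-8;
    // a charset attribute needs no http-equiv="content-type" pragma (needPragma == false)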
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.html.charsetdetector;
17
18 import java.io.BufferedInputStream;
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.nio.charset.Charset;
22 import java.nio.charset.StandardCharsets;
23 import java.util.AbstractMap;
24 import java.util.BitSet;
25 import java.util.Map;
26 import java.util.regex.Matcher;
27 import java.util.regex.Pattern;
28
29 /**
30 * A scanner meant to detect charset meta tags in a byte stream
31 * See: https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
32 */
33 class PreScanner {
34
35 private static final Pattern CHARSET_PATTERN = Pattern.compile("charset\\s*=\\s*([\"']?)([^\"'\\s;]+)\\1");
36 private static final byte[] COMMENT_START = {(byte) '<', (byte) '!', (byte) '-', (byte) '-'};
37 private static final byte[] COMMENT_END = {(byte) '-', (byte) '-', (byte) '>'};
38 private static final byte[] META_TAG_START = {(byte) '<', (byte) 'm', (byte) 'e', (byte) 't', (byte) 'a'};
39 private static final byte SLASH = (byte) '/';
40 private static final byte EQUAL = (byte) '=';
41 private static final byte TAG_START = (byte) '<';
42 private static final byte TAG_END = (byte) '>';
43 private static final BitSet QUOTE = bitSet('"', '\'');
44
45 private static final BitSet WHITESPACE = bitSet(0x09, 0x0A, 0x0C, 0x0D, 0x20);
46 private static final BitSet SPACE_OR_TAG_END = bitSet(WHITESPACE, TAG_END);
47 private static final BitSet SPACE_OR_SLASH = bitSet(WHITESPACE, SLASH);
48 private static final BitSet SPECIAL_TAGS = bitSet('!', '/', '?');
49
50 private static final byte[] UTF8_BOM = {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF};
51 private static final byte[] UTF16_BE_BOM = {(byte) 0xFE, (byte) 0xFF};
52 private static final byte[] UTF16_LE_BOM = {(byte) 0xFF, (byte) 0xFE};
53 private static final byte LOWER_A = (byte) 'a';
54 private static final byte LOWER_Z = (byte) 'z';
55 private static final byte UPPER_A = (byte) 'A';
56 private static final byte UPPER_Z = (byte) 'Z';
57 private BufferedInputStream stream;
58 private CharsetDetectionResult detectedCharset = CharsetDetectionResult.notFound();
59
60 PreScanner(InputStream inputStream) {
61 this.stream = new BufferedInputStream(inputStream);
62 }
63
64 private static BitSet bitSet(int... bs) {
65 BitSet bitSet = new BitSet(0xFF);
66 for (int b : bs) bitSet.set(b);
67 return bitSet;
68 }
69
70 private static BitSet bitSet(BitSet base, int... bs) {
71 BitSet bitSet = (BitSet) base.clone();
72 for (int b : bs) bitSet.set(b);
73 return bitSet;
74 }
75
76 static String getEncodingFromMeta(String attributeValue) {
77 Matcher matcher = CHARSET_PATTERN.matcher(attributeValue);
78 if (!matcher.find()) return null;
79 return matcher.group(2);
80 }
81
82 private static boolean contains(BitSet bitSet, byte b) {
83 return bitSet.get(b & 0xFF);
84 }
85
86 Charset scan() {
87 while (processAtLeastOneByte()) {
88 if (detectedCharset.isFound()) {
89 return detectedCharset.getCharset();
90 }
91 }
92 return null;
93 }
94
95 Charset detectBOM() {
96 try {
97 if (expect(UTF8_BOM)) return StandardCharsets.UTF_8;
98 else if (expect(UTF16_BE_BOM)) return StandardCharsets.UTF_16BE;
99 else if (expect(UTF16_LE_BOM)) return StandardCharsets.UTF_16LE;
100 } catch (IOException e) { /* stream could not be read, also return null */ }
101 return null;
102 }
103
104 private boolean processAtLeastOneByte() {
105 try {
106 return processComment() ||
107 processMeta() ||
108 processTag() ||
109 processSpecialTag() ||
110 processAny();
111 } catch (IOException e) {
112 return false;
113 }
114 }
115
116 private boolean processAny() throws IOException {
117 int read = stream.read();
118 return read != -1;
119 }
120
121 private boolean processTag() throws IOException {
122 stream.mark(3);
123 if (read() == TAG_START) {
124 int read = stream.read();
125 if (read == SLASH) read = stream.read();
126 if ((LOWER_A <= read && read <= LOWER_Z) ||
127 (UPPER_A <= read && read <= UPPER_Z)) {
128 do stream.mark(1);
129 while (!contains(SPACE_OR_TAG_END, read()));
130 stream.reset();
131 while (getAttribute() != null) {/* ignore the attribute*/}
132 return true;
133 }
134 }
135 stream.reset();
136 return false;
137 }
138
139 private boolean processSpecialTag() throws IOException {
140 stream.mark(2);
141 if (read() == TAG_START && contains(SPECIAL_TAGS, read())) {
142 skipUntil(TAG_END);
143 return true;
144 }
145 stream.reset();
146 return false;
147 }
148
149 private boolean processMeta() throws IOException {
150 stream.mark(6); // len("<meta ") == 6
151 if (readCaseInsensitive(META_TAG_START) && contains(SPACE_OR_SLASH, read())) {
152 MetaProcessor metaProcessor = new MetaProcessor();
153 for (Map.Entry<String, String> attribute = getAttribute(); attribute != null; attribute = getAttribute()) {
154 metaProcessor.processAttribute(attribute);
155 }
156 metaProcessor.updateDetectedCharset(detectedCharset);
157 return true;
158 }
159 stream.reset();
160 return false;
161 }
162
163 /**
164 * Read an attribute from the stream
165 *
166 * @return the attribute as a Map.Entry, where the key is the attribute's name and
167 * the value is the attribute's value. If there is no attribute, return null
168 */
169 private Map.Entry<String, String> getAttribute() throws IOException {
170 String name = getAttributeName();
171 if (name == null) return null;
172
173 if (!expect(EQUAL)) return new AbstractMap.SimpleEntry<>(name, "");
174 skipAll(WHITESPACE);
175
176 String value = getAttributeValue();
177 return new AbstractMap.SimpleEntry<>(name, value);
178 }
179
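// Reads an attribute name, lower-cased, stopping at '=', '/', '>' or
// whitespace; returns null only when the tag closes before any attribute.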
180 private String getAttributeName() throws IOException {
181 skipAll(SPACE_OR_SLASH);
182 if (expect(TAG_END)) return null;
183 StringBuilder name = new StringBuilder();
184 while (!(peek() == EQUAL && name.length() > 0) &&
185 !(peek() == TAG_END || peek() == SLASH) &&
186 !skipAll(WHITESPACE)) {
187 name.append((char) getLowerCaseChar());
188 }
189 return name.toString();
190 }
191
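// Reads an attribute value: either a quoted string terminated by the same
// quote it opened with, or an unquoted token ending at the first space or
// tag-end byte, which is pushed back onto the stream.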
192 private String getAttributeValue() throws IOException {
193 StringBuilder value = new StringBuilder();
194 stream.mark(1);
195 byte quote = read();
196 if (contains(QUOTE, quote)) {
197 for (byte b = getLowerCaseChar(); b != quote; b = getLowerCaseChar()) {
198 value.append((char) b);
199 }
200 } else {
201 stream.reset();
202 for (byte b = getLowerCaseChar(); !contains(SPACE_OR_TAG_END, b); b = getLowerCaseChar()) {
203 value.append((char) b);
204 stream.mark(1);
205 }
206 stream.reset(); // unread the space or tag end
207 }
208 return value.toString();
209 }
210
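// Skips every leading byte contained in the given set and reports whether
// anything was skipped; the first non-matching byte is left unread.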
211 private boolean skipAll(BitSet bitSet) throws IOException {
212 boolean skipped = false;
213 stream.mark(1);
214 for (byte read = read(); contains(bitSet, read); read = read()) {
215 skipped = true;
216 stream.mark(1);
217 }
218 stream.reset();
219 return skipped;
220 }
221
222 private byte getLowerCaseChar() throws IOException {
223 byte nextPoint = read();
224 if (nextPoint >= 'A' && nextPoint <= 'Z') nextPoint += 0x20; // lowercase
225 return nextPoint;
226 }
227
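// Consumes a comment: after the opening sequence (presumably "<!--"), an
// immediate '>' closes it at once, so "<!-->" counts as a complete comment;
// otherwise bytes are skipped until the closing sequence or end of stream.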
228 private boolean processComment() throws IOException {
229 if (!expect(COMMENT_START)) return false;
230 if (!expect(TAG_END)) skipUntil(COMMENT_END);
231 return true;
232 }
233
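// Consumes the given byte sequence if and only if it comes next in the
// stream; on any mismatch the stream is reset and false is returned.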
234 private boolean expect(byte... expected) throws IOException {
235 stream.mark(expected.length);
236 for (byte b : expected) {
237 byte read = read();
238 if (read != b) {
239 stream.reset();
240 return false;
241 }
242 }
243 return true;
244 }
245
246 private void skipUntil(byte... expected) throws IOException {
247 while (!expect(expected)) {
248 if (stream.read() == -1) return;
249 }
250 }
251
252 private boolean readCaseInsensitive(byte... bs) throws IOException {
253 for (byte b : bs) if (getLowerCaseChar() != b) return false;
254 return true;
255 }
256
257 private byte read() throws IOException {
258 int r = stream.read();
259 if (r == -1) throw new IOException("Unexpected end of stream");
260 return (byte) r;
261 }
262
263 private byte peek() throws IOException {
264 stream.mark(1);
265 byte b = read();
266 stream.reset();
267 return b;
268 }
269 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.html.charsetdetector;
17
18 import org.apache.commons.io.input.BoundedInputStream;
19 import org.apache.tika.config.Field;
20 import org.apache.tika.detect.EncodingDetector;
21 import org.apache.tika.metadata.Metadata;
22 import org.apache.tika.mime.MediaType;
23
24 import java.io.IOException;
25 import java.io.InputStream;
26 import java.nio.charset.Charset;
27
28 import static org.apache.tika.parser.html.charsetdetector.CharsetAliases.getCharsetByLabel;
29
30 /**
31 * An encoding detector that tries to respect the spirit of the HTML spec
32 * part 12.2.3 "The input byte stream", or at least the part that is compatible with
33 * the implementation of Tika.
34 * <p>
35 * https://html.spec.whatwg.org/multipage/parsing.html#the-input-byte-stream
36 * <p>
37 * If a resource was fetched over HTTP, then the HTTP headers should be added to the Tika metadata
38 * when using {@link #detect}, especially {@link Metadata#CONTENT_TYPE}, as it may contain charset information.
39 * <p>
40 * This encoding detector may return null if no encoding is detected.
41 * It is meant to be used inside a {@link org.apache.tika.detect.CompositeEncodingDetector}.
42 * For instance:
43 * <pre> {@code
44 * EncodingDetector detector = new CompositeEncodingDetector(Arrays.asList(
45 * new StandardHtmlEncodingDetector(),
46 * new Icu4jEncodingDetector()
47 * ));
48 * }</pre>
49 * <p>
50 */
51 public final class StandardHtmlEncodingDetector implements EncodingDetector {
52 private static final int META_TAG_BUFFER_SIZE = 8192;
53
54 @Field
55 private int markLimit = META_TAG_BUFFER_SIZE;
56
57 /**
58 * Extracts a charset from a Content-Type HTTP header.
59 *
60 * @param metadata parser metadata
61 * @return a charset if there is one specified, or null
62 */
63 private static Charset charsetFromContentType(Metadata metadata) {
64 String contentType = metadata.get(Metadata.CONTENT_TYPE);
65 MediaType mediatype = MediaType.parse(contentType);
66 if (mediatype == null) return null;
67 String charsetLabel = mediatype.getParameters().get("charset");
68 return getCharsetByLabel(charsetLabel);
69 }
70
71 @Override
72 public Charset detect(InputStream input, Metadata metadata) throws IOException {
73 int limit = getMarkLimit();
74 input.mark(limit);
75 // Never read more than the first markLimit bytes
76 InputStream limitedStream = new BoundedInputStream(input, limit);
77 PreScanner preScanner = new PreScanner(limitedStream);
78
79 // The order of priority for detection is:
80 // 1. Byte Order Mark
81 Charset detectedCharset = preScanner.detectBOM();
82 // 2. Transport-level information (Content-Type HTTP header)
83 if (detectedCharset == null) detectedCharset = charsetFromContentType(metadata);
84 // 3. HTML <meta> tag
85 if (detectedCharset == null) detectedCharset = preScanner.scan();
86
87 input.reset();
88 return detectedCharset;
89 }
90
91 public int getMarkLimit() {
92 return markLimit;
93 }
94
95 /**
96 * How far into the stream to read for charset detection; bytes beyond
97 * this limit are never buffered or examined. Default is 8192.
98 */
99 @Field
100 public void setMarkLimit(int markLimit) {
101 this.markLimit = markLimit;
102 }
103 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.html.charsetdetector.charsets;
17
18 import java.nio.ByteBuffer;
19 import java.nio.CharBuffer;
20 import java.nio.charset.Charset;
21 import java.nio.charset.CharsetDecoder;
22 import java.nio.charset.CharsetEncoder;
23 import java.nio.charset.CoderResult;
24
25 /**
26 * An implementation of the standard "replacement" charset defined by the W3C.
27 * See: https://encoding.spec.whatwg.org/#replacement
28 */
29 public class ReplacementCharset extends Charset {
30
31 public ReplacementCharset() {
32 super("replacement", null);
33 }
34
35 @Override
36 public boolean contains(Charset cs) {
37 return cs.equals(this);
38 }
39
40 public CharsetDecoder newDecoder() {
41 return new CharsetDecoder(this, Float.MIN_VALUE, 1) {
42 private boolean replacementErrorReturned = false;
43
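// The first time any input is seen, report one malformed-input error
// spanning the buffered bytes; everything after that is consumed silently.
// With CodingErrorAction.REPLACE this yields a single U+FFFD for the whole
// stream, as the WHATWG "replacement" encoding requires.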
44 @Override
45 protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
46 if (in.hasRemaining() && !replacementErrorReturned) {
47 replacementErrorReturned = true;
48 return CoderResult.malformedForLength(in.remaining());
49 }
50 in.position(in.limit());
51 return CoderResult.UNDERFLOW;
52 }
53
54 @Override
55 protected void implReset() {
56 replacementErrorReturned = false;
57 }
58 };
59 }
60
61 public CharsetEncoder newEncoder() {
62 throw new UnsupportedOperationException("This charset does not support encoding");
63 }
64 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.html.charsetdetector.charsets;
17
18 import org.apache.commons.lang.NotImplementedException;
19
20 import java.nio.ByteBuffer;
21 import java.nio.CharBuffer;
22 import java.nio.charset.Charset;
23 import java.nio.charset.CharsetDecoder;
24 import java.nio.charset.CharsetEncoder;
25 import java.nio.charset.CoderResult;
26 import java.nio.charset.StandardCharsets;
27
28 public class XUserDefinedCharset extends Charset {
29
30 public XUserDefinedCharset() {
31 super("x-user-defined", null);
32 }
33
34 @Override
35 public boolean contains(Charset cs) {
36 return cs.equals(StandardCharsets.US_ASCII);
37 }
38
39 public CharsetDecoder newDecoder() {
40 return new CharsetDecoder(this, 1, 1) {
41 @Override
42 protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
43 while (true) {
44 if (!in.hasRemaining()) return CoderResult.UNDERFLOW;
45 if (!out.hasRemaining()) return CoderResult.OVERFLOW;
46 byte b = in.get();
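// ASCII bytes decode to themselves; bytes 0x80-0xFF map into the private
// use area at U+F780-U+F7FF, as x-user-defined specifies
// (0xF700 + b for b in 0x80-0xFF lands exactly in that range).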
47 out.append((char) ((b >= 0) ? b : 0xF700 + (b & 0xFF)));
48 }
49 }
50 };
51 }
52
53 public CharsetEncoder newEncoder() {
54 throw new NotImplementedException("Encoding to x-user-defined is not implemented");
55 }
56 }
7070 * As of 2.8.0 the library supports webp.
7171 */
7272 public class ImageMetadataExtractor {
73
73 //TODO: pass the ParseContext from the actual parse call through to these signatures
74 private static final ParseContext EMPTY_PARSE_CONTEXT = new ParseContext();
7475 private static final String GEO_DECIMAL_FORMAT_STRING = "#.######"; // 6 dp seems to be reasonable
7576 private final Metadata metadata;
7677 private DirectoryHandler[] handlers;
183184 try (InputStream decoded =
184185 new ByteArrayInputStream(xmpData)
185186 ) {
186 Document dom = XMLReaderUtils.getDocumentBuilder().parse(decoded);
187 Document dom = XMLReaderUtils.buildDOM(decoded, EMPTY_PARSE_CONTEXT);
187188 if (dom != null) {
188189 xmp = new XMPMetadata(dom);
189190 }
4242
4343 public class JempboxExtractor {
4444
45
45 //TODO: change the signature to require the ParseContext from the parse call
46 private static final ParseContext EMPTY_PARSE_CONTEXT = new ParseContext();
4647 private static volatile int MAX_EVENT_HISTORY_IN_XMPMM = 1024;
4748
4849 // The XMP spec says it must be unicode, but for most file formats it specifies "must be encoded in UTF-8"
6667 try (InputStream decoded =
6768 new ByteArrayInputStream(xmpraw.toByteArray())
6869 ) {
69 Document dom = XMLReaderUtils.getDocumentBuilder().parse(decoded);
70 Document dom = XMLReaderUtils.buildDOM(decoded, EMPTY_PARSE_CONTEXT);
7071 if (dom != null) {
7172 xmp = new XMPMetadata(dom);
7273 }
462462 while ((val_next != LT) && (val_next != CR) && (val_next != LF)) { // less than delimiter (\x3c) and not EOL
463463 bdy_heading += (char)(val_next & 0xff); // convert the byte to an unsigned int
464464 val_next = (read < value.length) ? value[read++] : 0x00;
465 if (read > value.length) { break; } // shouldn't ever hit this, but save a NPE
465 if (read >= value.length) { break; } // shouldn't ever hit this, but save a NPE
466466 }
467467 if (val_next == LT) {
468468 // hit the delimiter, carry on
507507 while ((val_next != LT) && (val_next != CT) && (val_next != CR) && (val_next != LF)) { // less than delimiter (\x3c), or carat (\x5e) and not EOL
508508 bdy_title += (char)(val_next & 0xff); // convert the byte to an unsigned int
509509 val_next = (read < value.length) ? value[read++] : 0x00;
510 if (read > value.length) { break; } // shouldn't ever hit this, but save a NPE
510 if (read >= value.length) { break; } // shouldn't ever hit this, but save a NPE
511511 }
512512
513513 if (val_next == CT) { // start of a new section , when first didn't finish cleanly
574574 // less than delimiter (\x3c), maybe also badly formed with just new line
575575 tmp_line += (char)(val_next & 0xff); // convert the byte to an unsigned int
576576 val_next = (read < value.length) ? value[read++] : 0x00;
577 if (read > value.length) { break; } // shouldn't ever hit this, but save a NPE
577 if (read >= value.length) { break; } // shouldn't ever hit this, but save a NPE
578578 }
579579
580580 if (val_next == CT) { // start of a new section , when first didn't finish cleanly
673673 // read until the train runs out of tracks
674674 bdy_body += (char)(val_next & 0xff); // convert the byte to an unsigned int
675675 val_next = (read < value.length) ? value[read++] : 0x00;
676 if (read > value.length) { break; } // shouldn't ever hit this, but save a NPE
676 if (read >= value.length) { break; } // shouldn't ever hit this, but save a NPE
677677 }
678678
679679 }
712712 ftr_source += (char)(val_next & 0xff); // convert the byte to an unsigned int
713713 val_next = (read < value.length) ? value[read] : 0x00; // attempt to read until end of stream
714714 read++;
715 if (read > value.length) { break; } // shouldn't ever hit this, but save a NPE
715 if (read >= value.length) { break; } // shouldn't ever hit this, but save a NPE
716716 }
717717
718718 while ((val_next != LT) && (val_next != CR) && (val_next != LF) && (val_next != 0)) { // get as much timedate as possible
719719 // this is an American format, so it arrives as mm-dd-yy HHiizzz
720720 ftr_datetime += (char)(val_next & 0xff); // convert the byte to an unsigned int
721721 val_next = (read < value.length) ? value[read++] : 0x00; // skip the new lines
722 if (read > value.length) { break; } // shouldn't ever hit this, but save a NPE
722 if (read >= value.length) { break; } // shouldn't ever hit this, but save a NPE
723723 }
724724 if (val_next == LT) {
725725 // hit the delimiter, carry on
1515 */
1616 package org.apache.tika.parser.iwork;
1717
18 import java.io.BufferedInputStream;
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.util.Arrays;
22 import java.util.Collections;
23 import java.util.HashSet;
24 import java.util.Set;
25
26 import javax.xml.namespace.QName;
27
2818 import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
2919 import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
3020 import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
3828 import org.apache.tika.parser.ParseContext;
3929 import org.apache.tika.sax.OfflineContentHandler;
4030 import org.apache.tika.sax.XHTMLContentHandler;
31 import org.apache.tika.utils.XMLReaderUtils;
4132 import org.xml.sax.ContentHandler;
4233 import org.xml.sax.SAXException;
34
35 import javax.xml.namespace.QName;
36 import java.io.BufferedInputStream;
37 import java.io.IOException;
38 import java.io.InputStream;
39 import java.util.Arrays;
40 import java.util.Collections;
41 import java.util.HashSet;
42 import java.util.Set;
4343
4444 /**
4545 * A parser for the IWork container files. This includes *.key, *.pages and *.numbers files.
201201
202202 metadata.add(Metadata.CONTENT_TYPE, type.getType().toString());
203203 xhtml.startDocument();
204 if (contentHandler != null) {
205 context.getSAXParser().parse(
206 new CloseShieldInputStream(entryStream),
207 new OfflineContentHandler(contentHandler)
208 );
209 }
204 if (contentHandler != null) {
205 XMLReaderUtils.parseSAX(
206 new CloseShieldInputStream(entryStream),
207 new OfflineContentHandler(contentHandler),
208 context
209 );
210 }
210211 xhtml.endDocument();
211212 }
212213
2525 import org.apache.tika.exception.TikaException;
2626 import org.apache.tika.metadata.Metadata;
2727 import org.apache.tika.parser.ParseContext;
28 import org.apache.tika.utils.XMLReaderUtils;
2829 import org.w3c.dom.Document;
2930 import org.w3c.dom.Element;
3031 import org.w3c.dom.NamedNodeMap;
3940
4041 public Metadata parse(String source, ParseContext parseContext) throws TikaException, SAXException, IOException {
4142
42 Document root = parseContext.getDocumentBuilder().parse(
43 new ByteArrayInputStream(source.getBytes(StandardCharsets.UTF_8))
44 );
43 Document root = XMLReaderUtils.buildDOM(
44 new ByteArrayInputStream(source.getBytes(StandardCharsets.UTF_8)), parseContext);
45
4546 Metadata metadata = new Metadata();
4647 createGrobidMetadata(source, root.getDocumentElement(), metadata);
4748 return metadata;
7373
7474 //Extract information from header file
7575 MatFileReader mfr = new MatFileReader(tis.getFile()); //input .mat file
76
7677 MatFileHeader hdr = mfr.getMatFileHeader(); //.mat header information
7778
7879 // Example header: "MATLAB 5.0 MAT-file, Platform: MACI64, Created on: Sun Mar 2 23:41:57 2014"
1515 */
1616 package org.apache.tika.parser.microsoft;
1717
18 import java.awt.*;
18 import java.awt.Point;
1919 import java.io.IOException;
2020 import java.text.NumberFormat;
2121 import java.util.ArrayList;
4141 import org.apache.poi.hssf.record.DateWindow1904Record;
4242 import org.apache.poi.hssf.record.DrawingGroupRecord;
4343 import org.apache.poi.hssf.record.EOFRecord;
44 import org.apache.poi.hssf.record.ExtSSTRecord;
4544 import org.apache.poi.hssf.record.ExtendedFormatRecord;
4645 import org.apache.poi.hssf.record.FooterRecord;
4746 import org.apache.poi.hssf.record.FormatRecord;
6463 import org.apache.poi.poifs.filesystem.DirectoryNode;
6564 import org.apache.poi.poifs.filesystem.DocumentInputStream;
6665 import org.apache.poi.poifs.filesystem.Entry;
67 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
66 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
6867 import org.apache.tika.exception.EncryptedDocumentException;
6968 import org.apache.tika.exception.TikaException;
7069 import org.apache.tika.io.TikaInputStream;
139138 * or writing the extracted content
140139 */
141140 protected void parse(
142 NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml,
141 POIFSFileSystem filesystem, XHTMLContentHandler xhtml,
143142 Locale locale) throws IOException, SAXException, TikaException {
144143 parse(filesystem.getRoot(), xhtml, locale);
145144 }
273272 * @throws IOException on any IO errors.
274273 * @throws SAXException on any SAX parsing errors.
275274 */
276 public void processFile(NPOIFSFileSystem filesystem, boolean listenForAllRecords)
275 public void processFile(POIFSFileSystem filesystem, boolean listenForAllRecords)
277276 throws IOException, SAXException, TikaException {
278277 processFile(filesystem.getRoot(), listenForAllRecords);
279278 }
280279
281280 public void processFile(DirectoryNode root, boolean listenForAllRecords)
282281 throws IOException, SAXException, TikaException {
283
284282 // Set up listener and register the records we want to process
285283 HSSFRequest hssfRequest = new HSSFRequest();
286284 if (listenForAllRecords) {
493491 HeaderRecord headerRecord = (HeaderRecord) record;
494492 addTextCell(record, headerRecord.getText());
495493 }
496 break;
494 break;
497495
498496 case FooterRecord.sid:
499497 if (extractor.officeParserConfig.getIncludeHeadersAndFooters()) {
500498 FooterRecord footerRecord = (FooterRecord) record;
501499 addTextCell(record, footerRecord.getText());
502500 }
503 break;
504
501 break;
505502 }
506503
507504 previousSid = record.getSid();
598595 handler.startElement("tr");
599596 handler.startElement("td");
600597 for (Map.Entry<Point, Cell> entry : currentSheet.entrySet()) {
601 while (currentRow < entry.getKey().y) {
602 handler.endElement("td");
603 handler.endElement("tr");
604 handler.startElement("tr");
605 handler.startElement("td");
606 currentRow++;
598 if (currentRow != entry.getKey().y) {
599 // We've moved onto a new row, possibly skipping some
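// Close out the previous row and, if includeMissingRows is enabled,
// emit one empty placeholder row per skipped row index; otherwise the
// skipped rows collapse into a single row boundary.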
600 do {
601 handler.endElement("td");
602 handler.endElement("tr");
603 handler.startElement("tr");
604 handler.startElement("td");
605 currentRow++;
606 } while (officeParserConfig.getIncludeMissingRows() &&
607 currentRow < entry.getKey().y);
608 currentRow = entry.getKey().y;
607609 currentColumn = 0;
608610 }
609611
2323
2424 import org.apache.poi.common.usermodel.Hyperlink;
2525 import org.apache.poi.hslf.exceptions.EncryptedPowerPointFileException;
26 import org.apache.poi.hslf.model.Comment;
2726 import org.apache.poi.hslf.model.HeadersFooters;
28 import org.apache.poi.hslf.model.OLEShape;
2927 import org.apache.poi.hslf.record.DocInfoListContainer;
3028 import org.apache.poi.hslf.record.RecordTypes;
3129 import org.apache.poi.hslf.record.VBAInfoAtom;
3432 import org.apache.poi.hslf.usermodel.HSLFMasterSheet;
3533 import org.apache.poi.hslf.usermodel.HSLFNotes;
3634 import org.apache.poi.hslf.usermodel.HSLFObjectData;
35 import org.apache.poi.hslf.usermodel.HSLFObjectShape;
3736 import org.apache.poi.hslf.usermodel.HSLFPictureData;
3837 import org.apache.poi.hslf.usermodel.HSLFShape;
3938 import org.apache.poi.hslf.usermodel.HSLFSlide;
4544 import org.apache.poi.hslf.usermodel.HSLFTextRun;
4645 import org.apache.poi.hslf.usermodel.HSLFTextShape;
4746 import org.apache.poi.poifs.filesystem.DirectoryNode;
48 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
47 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
48 import org.apache.poi.sl.usermodel.Comment;
49 import org.apache.poi.sl.usermodel.SimpleShape;
4950 import org.apache.tika.exception.TikaException;
5051 import org.apache.tika.exception.EncryptedDocumentException;
5152 import org.apache.tika.extractor.EmbeddedDocumentUtil;
6667 }
6768
6869 protected void parse(
69 NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
70 POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
7071 throws IOException, SAXException, TikaException {
7172 parse(filesystem.getRoot(), xhtml);
7273 }
268269 long persistId = vbaAtom.getPersistIdRef();
269270 for (HSLFObjectData objData : ppt.getEmbeddedObjects()) {
270271 if (objData.getExOleObjStg().getPersistId() == persistId) {
271 try (NPOIFSFileSystem npoifsFileSystem = new NPOIFSFileSystem(objData.getData())) {
272 try (POIFSFileSystem poifsFileSystem = new POIFSFileSystem(objData.getInputStream())) {
272273 try {
273 OfficeParser.extractMacros(npoifsFileSystem, xhtml,
274 OfficeParser.extractMacros(poifsFileSystem, xhtml,
274275 EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context));
275276 } catch (IOException|SAXException inner) {
276277 EmbeddedDocumentUtil.recordException(inner, parentMetadata);
294295
295296 xhtml.startElement("div", "class", "slide-master-content");
296297 for (HSLFShape shape : shapes) {
297 if (shape != null && !HSLFMasterSheet.isPlaceholder(shape)) {
298 if (shape != null && ! isPlaceholder(shape)) {
298299 if (shape instanceof HSLFTextShape) {
299300 HSLFTextShape tsh = (HSLFTextShape) shape;
300301 String text = tsh.getText();
305306 }
306307 }
307308 xhtml.endElement("div");
309 }
310
311 private boolean isPlaceholder(HSLFShape shape) {
312 return shape instanceof SimpleShape && ((SimpleShape)shape).isPlaceholder();
308313 }
309314
310315 private void extractTableText(XHTMLContentHandler xhtml, HSLFTable shape) throws SAXException {
448453 }
449454
450455 for (HSLFShape shape : shapes) {
451 if (shape instanceof OLEShape) {
452 OLEShape oleShape = (OLEShape) shape;
456 if (shape instanceof HSLFObjectShape) {
457 HSLFObjectShape oleShape = (HSLFObjectShape) shape;
453458 HSLFObjectData data = null;
454459 try {
455460 data = oleShape.getObjectData();
473478 xhtml.endElement("div");
474479 InputStream dataStream = null;
475480 try {
476 dataStream = data.getData();
481 dataStream = data.getInputStream();
477482 } catch (Exception e) {
478483 EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
479484 continue;
480485 }
481486 try (TikaInputStream stream = TikaInputStream.get(dataStream)) {
482487 String mediaType = null;
483 if ("Excel.Chart.8".equals(oleShape.getProgID())) {
488 if ("Excel.Chart.8".equals(oleShape.getProgId())) {
484489 mediaType = "application/vnd.ms-excel";
485490 } else {
486491 MediaType mt = getTikaConfig().getDetector().detect(stream, new Metadata());
488493 }
489494 if (mediaType.equals("application/x-tika-msoffice-embedded; format=comp_obj")
490495 || mediaType.equals("application/x-tika-msoffice")) {
491 NPOIFSFileSystem npoifs = null;
496 POIFSFileSystem poifs = null;
492497
493498 try {
494 npoifs = new NPOIFSFileSystem(new CloseShieldInputStream(stream));
499 poifs = new POIFSFileSystem(new CloseShieldInputStream(stream));
495500 } catch (RuntimeException e) {
496501 throw new IOExceptionWithCause(e);
497502 }
498503 try {
499 handleEmbeddedOfficeDoc(npoifs.getRoot(), objID, xhtml);
504 handleEmbeddedOfficeDoc(poifs.getRoot(), objID, xhtml);
500505 } finally {
501 if (npoifs != null) {
502 npoifs.close();
506 if (poifs != null) {
507 poifs.close();
503508 }
504509 }
505510 } else {
0 /*
1 Copyright (c) 2013 James Ahlborn
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 */
15
16 package org.apache.tika.parser.microsoft;
17
18 import com.healthmarketscience.jackcess.RuntimeIOException;
19 import com.healthmarketscience.jackcess.impl.ByteUtil;
20 import com.healthmarketscience.jackcess.impl.CustomToStringStyle;
21 import com.healthmarketscience.jackcess.util.MemFileChannel;
22 import com.healthmarketscience.jackcess.util.OleBlob;
23 import org.apache.commons.lang.builder.ToStringBuilder;
24 import org.apache.poi.poifs.filesystem.DirectoryEntry;
25 import org.apache.poi.poifs.filesystem.DocumentEntry;
26 import org.apache.poi.poifs.filesystem.DocumentInputStream;
27 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
28
29 import java.io.FileNotFoundException;
30 import java.io.IOException;
31 import java.io.InputStream;
32 import java.io.OutputStream;
33 import java.io.UnsupportedEncodingException;
34 import java.net.URLDecoder;
35 import java.net.URLEncoder;
36 import java.nio.ByteBuffer;
37 import java.util.ArrayList;
38 import java.util.Iterator;
39 import java.util.List;
40
41 /**
42 * Temporary copy/paste from Jackcess to allow upgrade to POI 4.0.0.
43 * This class will be removed once POI 4.0.0 is released and jackcess
44 * updates to the most recent version of POI.
45 * @deprecated -- this class will be removed in Tika >= 1.20
46 */
47 @Deprecated
48 class JackcessCompoundOleUtil implements JackcessOleUtil.CompoundPackageFactory {
49 private static final String ENTRY_NAME_CHARSET = "UTF-8";
50 private static final String ENTRY_SEPARATOR = "/";
51 private static final String CONTENTS_ENTRY = "CONTENTS";
52
53 static {
54 // force a poi class to be loaded to ensure that when this class is
55 // loaded, we know that the poi classes are available
56 POIFSFileSystem.class.getName();
57 }
58
59 public JackcessCompoundOleUtil() {
60 }
61
62 /**
63 * Creates a new CompoundContent for the given blob information.
64 */
65 public JackcessOleUtil.ContentImpl createCompoundPackageContent(
66 JackcessOleUtil.OleBlobImpl blob, String prettyName, String className, String typeName,
67 ByteBuffer blobBb, int dataBlockLen) {
68 return new CompoundContentImpl(blob, prettyName, className, typeName,
69 blobBb.position(), dataBlockLen);
70 }
71
72 /**
73 * Gets a DocumentEntry from compound storage based on a fully qualified,
74 * encoded entry name.
75 *
76 * @param entryName fully qualified, encoded entry name
77 * @param dir root directory of the compound storage
78 * @return the relevant DocumentEntry
79 * @throws FileNotFoundException if the entry does not exist
80 * @throws IOException if some other io error occurs
81 */
82 public static DocumentEntry getDocumentEntry(String entryName,
83 DirectoryEntry dir)
84 throws IOException {
85 // split entry name into individual components and decode them
86 List<String> entryNames = new ArrayList<String>();
87 for (String str : entryName.split(ENTRY_SEPARATOR)) {
88 if (str.length() == 0) {
89 continue;
90 }
91 entryNames.add(decodeEntryName(str));
92 }
93
94 DocumentEntry entry = null;
95 Iterator<String> iter = entryNames.iterator();
96 while (iter.hasNext()) {
97 org.apache.poi.poifs.filesystem.Entry tmpEntry = dir.getEntry(iter.next());
98 if (tmpEntry instanceof DirectoryEntry) {
99 dir = (DirectoryEntry) tmpEntry;
100 } else if (!iter.hasNext() && (tmpEntry instanceof DocumentEntry)) {
101 entry = (DocumentEntry) tmpEntry;
102 } else {
103 break;
104 }
105 }
106
107 if (entry == null) {
108 throw new FileNotFoundException("Could not find document " + entryName);
109 }
110
111 return entry;
112 }
113
114 private static String encodeEntryName(String name) {
115 try {
116 return URLEncoder.encode(name, ENTRY_NAME_CHARSET);
117 } catch (UnsupportedEncodingException e) {
118 throw new RuntimeException(e);
119 }
120 }
121
122 private static String decodeEntryName(String name) {
123 try {
124 return URLDecoder.decode(name, ENTRY_NAME_CHARSET);
125 } catch (UnsupportedEncodingException e) {
126 throw new RuntimeException(e);
127 }
128 }
129
130 private static final class CompoundContentImpl
131 extends JackcessOleUtil.EmbeddedPackageContentImpl
132 implements OleBlob.CompoundContent {
133 private POIFSFileSystem _fs;
134
135 private CompoundContentImpl(
136 JackcessOleUtil.OleBlobImpl blob, String prettyName, String className,
137 String typeName, int position, int length) {
138 super(blob, prettyName, className, typeName, position, length);
139 }
140
141 public OleBlob.ContentType getType() {
142 return OleBlob.ContentType.COMPOUND_STORAGE;
143 }
144
145 private POIFSFileSystem getFileSystem() throws IOException {
146 if (_fs == null) {
147 _fs = new POIFSFileSystem(MemFileChannel.newChannel(getStream(), "r"));
148 }
149 return _fs;
150 }
151
152 public Iterator<Entry> iterator() {
153 try {
154 return getEntries(new ArrayList<Entry>(), getFileSystem().getRoot(),
155 ENTRY_SEPARATOR).iterator();
156 } catch (IOException e) {
157 throw new RuntimeIOException(e);
158 }
159 }
160
161 public EntryImpl getEntry(String entryName) throws IOException {
162 return new EntryImpl(entryName,
163 getDocumentEntry(entryName, getFileSystem().getRoot()));
164 }
165
166 public boolean hasContentsEntry() throws IOException {
167 return getFileSystem().getRoot().hasEntry(CONTENTS_ENTRY);
168 }
169
170 public EntryImpl getContentsEntry() throws IOException {
171 return getEntry(CONTENTS_ENTRY);
172 }
173
174 private List<Entry> getEntries(List<Entry> entries, DirectoryEntry dir,
175 String prefix) {
176 for (org.apache.poi.poifs.filesystem.Entry entry : dir) {
177 if (entry instanceof DirectoryEntry) {
178 // .. recurse into this directory
179 getEntries(entries, (DirectoryEntry) entry, prefix + ENTRY_SEPARATOR);
180 } else if (entry instanceof DocumentEntry) {
181 // grab the entry name/details
182 DocumentEntry de = (DocumentEntry) entry;
183 String entryName = prefix + encodeEntryName(entry.getName());
184 entries.add(new EntryImpl(entryName, de));
185 }
186 }
187 return entries;
188 }
189
190 @Override
191 public void close() {
192 ByteUtil.closeQuietly(_fs);
193 _fs = null;
194 super.close();
195 }
196
197 @Override
198 public String toString() {
199 ToStringBuilder sb = toString(CustomToStringStyle.builder(this));
200
201 try {
202 sb.append("hasContentsEntry", hasContentsEntry());
203 sb.append("entries", getEntries(new ArrayList<Entry>(),
204 getFileSystem().getRoot(),
205 ENTRY_SEPARATOR));
206 } catch (IOException e) {
207 sb.append("entries", "<" + e + ">");
208 }
209
210 return sb.toString();
211 }
212
213 private final class EntryImpl implements OleBlob.CompoundContent.Entry {
214 private final String _name;
215 private final DocumentEntry _docEntry;
216
217 private EntryImpl(String name, DocumentEntry docEntry) {
218 _name = name;
219 _docEntry = docEntry;
220 }
221
222 public OleBlob.ContentType getType() {
223 return OleBlob.ContentType.UNKNOWN;
224 }
225
226 public String getName() {
227 return _name;
228 }
229
230 public CompoundContentImpl getParent() {
231 return CompoundContentImpl.this;
232 }
233
234 public JackcessOleUtil.OleBlobImpl getBlob() {
235 return getParent().getBlob();
236 }
237
238 public long length() {
239 return _docEntry.getSize();
240 }
241
242 public InputStream getStream() throws IOException {
243 return new DocumentInputStream(_docEntry);
244 }
245
246 public void writeTo(OutputStream out) throws IOException {
247 InputStream in = null;
248 try {
249 ByteUtil.copy(in = getStream(), out);
250 } finally {
251 ByteUtil.closeQuietly(in);
252 }
253 }
254
255 @Override
256 public String toString() {
257 return CustomToStringStyle.valueBuilder(this)
258 .append("name", _name)
259 .append("length", length())
260 .toString();
261 }
262 }
263 }
264 }
265
266
267
4040 import com.healthmarketscience.jackcess.Table;
4141 import com.healthmarketscience.jackcess.query.Query;
4242 import com.healthmarketscience.jackcess.util.OleBlob;
43 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
43 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
4444 import org.apache.tika.exception.TikaException;
4545 import org.apache.tika.extractor.EmbeddedDocumentUtil;
4646 import org.apache.tika.io.IOUtils;
301301 }
302302 }
303303
304
304305 private void handleOLE(Row row, String cName, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
305 OleBlob blob = row.getBlob(cName);
306 OleBlob blob = getBlob(row, cName);
306307 //lifted shamelessly from Jackcess's OleBlobTest
307308 if (blob == null)
308309 return;
366367 }
367368 }
368369
370 /*
371 Temporary workaround until Jackcess upgrades to POI 4.0.0.
372 This is copy/pasted from Jackcess.
373 */
374 private OleBlob getBlob(Row row, String cName) {
375 byte[] bytes = row.getBytes(cName);
376 if (bytes == null) {
377 return null;
378 }
379 return JackcessOleUtil.parseBlob(bytes);
380 }
381
369382 private void handleCompoundContent(OleBlob.CompoundContent cc, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
370383 InputStream is = null;
371 NPOIFSFileSystem nfs = null;
384 POIFSFileSystem fileSystem = null;
372385 try {
373386 try {
374387 is = cc.getStream();
378391 }
379392
380393 try {
381 nfs = new NPOIFSFileSystem(is);
394 fileSystem = new POIFSFileSystem(is);
382395 } catch (Exception e) {
383396 EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
384397 return;
385398 }
386399
387 handleEmbeddedOfficeDoc(nfs.getRoot(), xhtml);
400 handleEmbeddedOfficeDoc(fileSystem.getRoot(), xhtml);
388401
389402 } finally {
390 if (nfs != null) {
403 if (fileSystem != null) {
391404 try {
392 nfs.close();
405 fileSystem.close();
393406 } catch (IOException e) {
394407 //swallow
395408 }
413426 }
414427 return shortDateTimeFormatter.format(d);
415428 }
429
416430 }
417431
0 /*
1 Copyright (c) 2013 James Ahlborn
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 */
15
16 package org.apache.tika.parser.microsoft;
17
18 import java.io.ByteArrayInputStream;
19 import java.io.Closeable;
20 import java.io.FileInputStream;
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.io.OutputStream;
24 import java.nio.ByteBuffer;
25 import java.nio.charset.Charset;
26 import java.sql.Blob;
27 import java.sql.SQLException;
28 import java.sql.SQLFeatureNotSupportedException;
29 import java.text.Normalizer;
30 import java.util.EnumSet;
31 import java.util.Set;
32 import java.util.regex.Pattern;
33
34 import com.healthmarketscience.jackcess.DataType;
35 import com.healthmarketscience.jackcess.util.OleBlob;
36 import static com.healthmarketscience.jackcess.util.OleBlob.*;
37 import org.apache.commons.lang.builder.ToStringBuilder;
38
39 import com.healthmarketscience.jackcess.impl.ByteUtil;
40 import com.healthmarketscience.jackcess.impl.CustomToStringStyle;
41 import com.healthmarketscience.jackcess.impl.PageChannel;
42
43 /**
44 * Utility code for working with OLE data.
45 * Temporary workaround until Jackcess is updated to POI 4.0.0.
46 *
47 *
48 * @author James Ahlborn
49 * @usage _advanced_class_
50 * @deprecated this class will be removed in Tika >= 1.20
51 */
52 @Deprecated
53 class JackcessOleUtil {
54
55
56 /**
57 * Interface used to allow optional inclusion of the poi library for working
58 * with compound ole data.
59 */
60 interface CompoundPackageFactory
61 {
62 public ContentImpl createCompoundPackageContent(
63 OleBlobImpl blob, String prettyName, String className, String typeName,
64 ByteBuffer blobBb, int dataBlockLen);
65 }
66
67 private static final int PACKAGE_SIGNATURE = 0x1C15;
68 private static final Charset OLE_CHARSET = Charset.forName("US-ASCII");
69 private static final Charset OLE_UTF_CHARSET = Charset.forName("UTF-16LE");
70 private static final byte[] COMPOUND_STORAGE_SIGNATURE =
71 {(byte)0xd0,(byte)0xcf,(byte)0x11,(byte)0xe0,
72 (byte)0xa1,(byte)0xb1,(byte)0x1a,(byte)0xe1};
73 private static final String SIMPLE_PACKAGE_TYPE = "Package";
74 private static final int PACKAGE_OBJECT_TYPE = 0x02;
75 private static final int OLE_VERSION = 0x0501;
76 private static final int OLE_FORMAT = 0x02;
77 private static final int PACKAGE_STREAM_SIGNATURE = 0x02;
78 private static final int PS_EMBEDDED_FILE = 0x030000;
79 private static final int PS_LINKED_FILE = 0x010000;
80 private static final Set<ContentType> WRITEABLE_TYPES = EnumSet.of(
81 ContentType.LINK, ContentType.SIMPLE_PACKAGE, ContentType.OTHER);
82 private static final byte[] NO_DATA = new byte[0];
83 private static final int LINK_HEADER = 0x01;
84 private static final byte[] PACKAGE_FOOTER = {
85 0x01, 0x05, 0x00, 0x00, 0x00, 0x00,
86 0x00, 0x00, 0x01, (byte)0xAD, 0x05, (byte)0xFE
87 };
88
89 // regex pattern which matches all the crazy extra stuff in unicode
90 private static final Pattern UNICODE_ACCENT_PATTERN =
91 Pattern.compile("[\\p{InCombiningDiacriticalMarks}\\p{IsLm}\\p{IsSk}]+");
92
93 private static final CompoundPackageFactory COMPOUND_FACTORY;
94
95 static {
96 CompoundPackageFactory compoundFactory = null;
97 try {
98 compoundFactory = (CompoundPackageFactory)
99 Class.forName("org.apache.tika.parser.microsoft.JackcessCompoundOleUtil")
100 .newInstance();
101 } catch(Throwable t) {
102 // must not have poi, will load compound ole data as "other"
103 }
104 COMPOUND_FACTORY = compoundFactory;
105 }
106
107 /**
108 * Parses an access database blob structure and returns an appropriate
109 * OleBlob instance.
110 */
111 public static OleBlob parseBlob(byte[] bytes) {
112 return new OleBlobImpl(bytes);
113 }
114
115 /**
116 * Creates a new OlBlob instance using the given information.
117 */
118 public static OleBlob createBlob(Builder oleBuilder)
119 throws IOException
120 {
121 try {
122
123 if(!WRITEABLE_TYPES.contains(oleBuilder.getType())) {
124 throw new IllegalArgumentException(
125 "Cannot currently create ole values of type " +
126 oleBuilder.getType());
127 }
128
129 long contentLen = oleBuilder.getContentLength();
130 byte[] contentBytes = oleBuilder.getBytes();
131 InputStream contentStream = oleBuilder.getStream();
132 byte[] packageStreamHeader = NO_DATA;
133 byte[] packageStreamFooter = NO_DATA;
134
135 switch(oleBuilder.getType()) {
136 case LINK:
137 packageStreamHeader = writePackageStreamHeader(oleBuilder);
138
139 // link "content" is file path
140 contentBytes = getZeroTermStrBytes(oleBuilder.getFilePath());
141 contentLen = contentBytes.length;
142 break;
143
144 case SIMPLE_PACKAGE:
145 packageStreamHeader = writePackageStreamHeader(oleBuilder);
146 packageStreamFooter = writePackageStreamFooter(oleBuilder);
147 break;
148
149 case OTHER:
150 // nothing more to do
151 break;
152 default:
153 throw new RuntimeException("unexpected type " + oleBuilder.getType());
154 }
155
156 long payloadLen = packageStreamHeader.length + packageStreamFooter.length +
157 contentLen;
158 byte[] packageHeader = writePackageHeader(oleBuilder, payloadLen);
159
160 long totalOleLen = packageHeader.length + PACKAGE_FOOTER.length +
161 payloadLen;
162 if(totalOleLen > DataType.OLE.getMaxSize()) {
163 throw new IllegalArgumentException("Content size of " + totalOleLen +
164 " is too large for ole column");
165 }
166
167 byte[] oleBytes = new byte[(int)totalOleLen];
168 ByteBuffer bb = PageChannel.wrap(oleBytes);
169 bb.put(packageHeader);
170 bb.put(packageStreamHeader);
171
172 if(contentLen > 0L) {
173 if(contentBytes != null) {
174 bb.put(contentBytes);
175 } else {
176 byte[] buf = new byte[8192];
177 int numBytes = 0;
178 while((numBytes = contentStream.read(buf)) >= 0) {
179 bb.put(buf, 0, numBytes);
180 }
181 }
182 }
183
184 bb.put(packageStreamFooter);
185 bb.put(PACKAGE_FOOTER);
186
187 return parseBlob(oleBytes);
188
189 } finally {
190 ByteUtil.closeQuietly(oleBuilder.getStream());
191 }
192 }
193
194 private static byte[] writePackageHeader(Builder oleBuilder,
195 long contentLen) {
196
197 byte[] prettyNameBytes = getZeroTermStrBytes(oleBuilder.getPrettyName());
198 String className = oleBuilder.getClassName();
199 String typeName = oleBuilder.getTypeName();
200 if(className == null) {
201 className = typeName;
202 } else if(typeName == null) {
203 typeName = className;
204 }
205 byte[] classNameBytes = getZeroTermStrBytes(className);
206 byte[] typeNameBytes = getZeroTermStrBytes(typeName);
207
208 int packageHeaderLen = 20 + prettyNameBytes.length + classNameBytes.length;
209
210 int oleHeaderLen = 24 + typeNameBytes.length;
211
212 byte[] headerBytes = new byte[packageHeaderLen + oleHeaderLen];
213
214 ByteBuffer bb = PageChannel.wrap(headerBytes);
215
216 // write outer package header
217 bb.putShort((short)PACKAGE_SIGNATURE);
218 bb.putShort((short)packageHeaderLen);
219 bb.putInt(PACKAGE_OBJECT_TYPE);
220 bb.putShort((short)prettyNameBytes.length);
221 bb.putShort((short)classNameBytes.length);
222 int prettyNameOff = bb.position() + 8;
223 bb.putShort((short)prettyNameOff);
224 bb.putShort((short)(prettyNameOff + prettyNameBytes.length));
225 bb.putInt(-1);
226 bb.put(prettyNameBytes);
227 bb.put(classNameBytes);
228
229 // put ole header
230 bb.putInt(OLE_VERSION);
231 bb.putInt(OLE_FORMAT);
232 bb.putInt(typeNameBytes.length);
233 bb.put(typeNameBytes);
234 bb.putLong(0L);
235 bb.putInt((int)contentLen);
236
237 return headerBytes;
238 }
239
240 private static byte[] writePackageStreamHeader(Builder oleBuilder) {
241
242 byte[] fileNameBytes = getZeroTermStrBytes(oleBuilder.getFileName());
243 byte[] filePathBytes = getZeroTermStrBytes(oleBuilder.getFilePath());
244
245 int headerLen = 6 + fileNameBytes.length + filePathBytes.length;
246
247 if(oleBuilder.getType() == ContentType.SIMPLE_PACKAGE) {
248
249 headerLen += 8 + filePathBytes.length;
250
251 } else {
252
253 headerLen += 2;
254 }
255
256 byte[] headerBytes = new byte[headerLen];
257 ByteBuffer bb = PageChannel.wrap(headerBytes);
258 bb.putShort((short)PACKAGE_STREAM_SIGNATURE);
259 bb.put(fileNameBytes);
260 bb.put(filePathBytes);
261
262 if(oleBuilder.getType() == ContentType.SIMPLE_PACKAGE) {
263 bb.putInt(PS_EMBEDDED_FILE);
264 bb.putInt(filePathBytes.length);
265 bb.put(filePathBytes, 0, filePathBytes.length);
266 bb.putInt((int) oleBuilder.getContentLength());
267 } else {
268 bb.putInt(PS_LINKED_FILE);
269 bb.putShort((short)LINK_HEADER);
270 }
271
272 return headerBytes;
273 }
274
275 private static byte[] writePackageStreamFooter(Builder oleBuilder) {
276
277 // note, these are _not_ zero terminated
278 byte[] fileNameBytes = oleBuilder.getFileName().getBytes(OLE_UTF_CHARSET);
279 byte[] filePathBytes = oleBuilder.getFilePath().getBytes(OLE_UTF_CHARSET);
280
281 int footerLen = 12 + (filePathBytes.length * 2) + fileNameBytes.length;
282
283 byte[] footerBytes = new byte[footerLen];
284 ByteBuffer bb = PageChannel.wrap(footerBytes);
285
286 bb.putInt(filePathBytes.length/2);
287 bb.put(filePathBytes);
288 bb.putInt(fileNameBytes.length/2);
289 bb.put(fileNameBytes);
290 bb.putInt(filePathBytes.length/2);
291 bb.put(filePathBytes);
292
293 return footerBytes;
294 }
295
296 /**
297 * creates the appropriate ContentImpl for the given blob.
298 */
299 private static ContentImpl parseContent(OleBlobImpl blob)
300 throws IOException
301 {
302 ByteBuffer bb = PageChannel.wrap(blob.getBytes());
303
304 if((bb.remaining() < 2) || (bb.getShort() != PACKAGE_SIGNATURE)) {
305 return new UnknownContentImpl(blob);
306 }
307
308 // read outer package header
309 int headerSize = bb.getShort();
310 /* int objType = */ bb.getInt();
311 int prettyNameLen = bb.getShort();
312 int classNameLen = bb.getShort();
313 int prettyNameOff = bb.getShort();
314 int classNameOff = bb.getShort();
315 /* int objSize = */ bb.getInt();
316 String prettyName = readStr(bb, prettyNameOff, prettyNameLen);
317 String className = readStr(bb, classNameOff, classNameLen);
318 bb.position(headerSize);
319
320 // read ole header
321 int oleVer = bb.getInt();
322 /* int format = */ bb.getInt();
323
324 if(oleVer != OLE_VERSION) {
325 return new UnknownContentImpl(blob);
326 }
327
328 int typeNameLen = bb.getInt();
329 String typeName = readStr(bb, bb.position(), typeNameLen);
330 bb.getLong(); // unused
331 int dataBlockLen = bb.getInt();
332 int dataBlockPos = bb.position();
333
334
335 if(SIMPLE_PACKAGE_TYPE.equalsIgnoreCase(typeName)) {
336 return createSimplePackageContent(
337 blob, prettyName, className, typeName, bb, dataBlockLen);
338 }
339
340 // if COMPOUND_FACTORY is null, the poi library isn't available, so just
341 // load compound data as "other"
342 if((COMPOUND_FACTORY != null) &&
343 (bb.remaining() >= COMPOUND_STORAGE_SIGNATURE.length) &&
344 ByteUtil.matchesRange(bb, bb.position(), COMPOUND_STORAGE_SIGNATURE)) {
345 return COMPOUND_FACTORY.createCompoundPackageContent(
346 blob, prettyName, className, typeName, bb, dataBlockLen);
347 }
348
349 // this is either some other "special" (as yet unhandled) format, or it is
350 // simply an embedded file (or it is compound data and poi isn't available)
351 return new OtherContentImpl(blob, prettyName, className,
352 typeName, dataBlockPos, dataBlockLen);
353 }
354
355 private static ContentImpl createSimplePackageContent(
356 OleBlobImpl blob, String prettyName, String className, String typeName,
357 ByteBuffer blobBb, int dataBlockLen) {
358
359 int dataBlockPos = blobBb.position();
360 ByteBuffer bb = PageChannel.narrowBuffer(blobBb, dataBlockPos,
361 dataBlockPos + dataBlockLen);
362
363 int packageSig = bb.getShort();
364 if(packageSig != PACKAGE_STREAM_SIGNATURE) {
365 return new OtherContentImpl(blob, prettyName, className,
366 typeName, dataBlockPos, dataBlockLen);
367 }
368
369 String fileName = readZeroTermStr(bb);
370 String filePath = readZeroTermStr(bb);
371 int packageType = bb.getInt();
372
373 if(packageType == PS_EMBEDDED_FILE) {
374
375 int localFilePathLen = bb.getInt();
376 String localFilePath = readStr(bb, bb.position(), localFilePathLen);
377 int dataLen = bb.getInt();
378 int dataPos = bb.position();
379 bb.position(dataLen + dataPos);
380
381 // remaining strings are in "reverse" order (local file path, file name,
382 // file path). these strings use a real UTF charset, and therefore can
383 // "fix" problems with ascii based names (so we prefer these strings to
384 // the original strings we found)
385 int strNum = 0;
386 while(true) {
387
388 int rem = bb.remaining();
389 if(rem < 4) {
390 break;
391 }
392
393 int strLen = bb.getInt();
394 String remStr = readStr(bb, bb.position(), strLen * 2, OLE_UTF_CHARSET);
395
396 switch(strNum) {
397 case 0:
398 localFilePath = remStr;
399 break;
400 case 1:
401 fileName = remStr;
402 break;
403 case 2:
404 filePath = remStr;
405 break;
406 default:
407 // ignore
408 }
409
410 ++strNum;
411 }
412
413 return new SimplePackageContentImpl(
414 blob, prettyName, className, typeName, dataPos, dataLen,
415 fileName, filePath, localFilePath);
416 }
417
418 if(packageType == PS_LINKED_FILE) {
419
420 bb.getShort(); //unknown
421 String linkStr = readZeroTermStr(bb);
422
423 return new LinkContentImpl(blob, prettyName, className, typeName,
424 fileName, linkStr, filePath);
425 }
426
427 return new OtherContentImpl(blob, prettyName, className,
428 typeName, dataBlockPos, dataBlockLen);
429 }
430
431 private static String readStr(ByteBuffer bb, int off, int len) {
432 return readStr(bb, off, len, OLE_CHARSET);
433 }
434
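// Reads up to and including the first NUL byte (or the end of the buffer);
// readStr() below strips the trailing NUL when one is present.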
435 private static String readZeroTermStr(ByteBuffer bb) {
436 int off = bb.position();
437 while(bb.hasRemaining()) {
438 byte b = bb.get();
439 if(b == 0) {
440 break;
441 }
442 }
443 int len = bb.position() - off;
444 return readStr(bb, off, len);
445 }
446
447 private static String readStr(ByteBuffer bb, int off, int len,
448 Charset charset) {
449 String str = new String(bb.array(), off, len, charset);
450 bb.position(off + len);
451 if(str.charAt(str.length() - 1) == '\0') {
452 str = str.substring(0, str.length() - 1);
453 }
454 return str;
455 }
456
457 private static byte[] getZeroTermStrBytes(String str) {
458 // since we are converting to ascii, try to make "nicer" versions of crazy
459 // chars (e.g. convert "u with an umlaut" to just "u"). this may not
460 // ultimately help anything but it is what ms access does.
461
462 // decompose complex chars into combos of char and accent
463 str = Normalizer.normalize(str, Normalizer.Form.NFD);
464 // strip the accents
465 str = UNICODE_ACCENT_PATTERN.matcher(str).replaceAll("");
466 // (re)normalize what is left
467 str = Normalizer.normalize(str, Normalizer.Form.NFC);
468
469 return (str + '\0').getBytes(OLE_CHARSET);
470 }
471
472
473 static final class OleBlobImpl implements OleBlob
474 {
475 private byte[] _bytes;
476 private ContentImpl _content;
477
478 private OleBlobImpl(byte[] bytes) {
479 _bytes = bytes;
480 }
481
482 public void writeTo(OutputStream out) throws IOException {
483 out.write(_bytes);
484 }
485
486 public Content getContent() throws IOException {
487 if(_content == null) {
488 _content = parseContent(this);
489 }
490 return _content;
491 }
492
493 public InputStream getBinaryStream() throws SQLException {
494 return new ByteArrayInputStream(_bytes);
495 }
496
497 public InputStream getBinaryStream(long pos, long len)
498 throws SQLException
499 {
500 return new ByteArrayInputStream(_bytes, fromJdbcOffset(pos), (int)len);
501 }
502
503 public long length() throws SQLException {
504 return _bytes.length;
505 }
506
507 public byte[] getBytes() throws IOException {
508 if(_bytes == null) {
509 throw new IOException("blob is closed");
510 }
511 return _bytes;
512 }
513
514 public byte[] getBytes(long pos, int len) throws SQLException {
515 return ByteUtil.copyOf(_bytes, fromJdbcOffset(pos), len);
516 }
517
518 public long position(byte[] pattern, long start) throws SQLException {
519 int pos = ByteUtil.findRange(PageChannel.wrap(_bytes),
520 fromJdbcOffset(start), pattern);
521 return((pos >= 0) ? toJdbcOffset(pos) : pos);
522 }
523
524 public long position(Blob pattern, long start) throws SQLException {
525 return position(pattern.getBytes(1L, (int)pattern.length()), start);
526 }
527
528 public OutputStream setBinaryStream(long position) throws SQLException {
529 throw new SQLFeatureNotSupportedException();
530 }
531
532 public void truncate(long len) throws SQLException {
533 throw new SQLFeatureNotSupportedException();
534 }
535
536 public int setBytes(long pos, byte[] bytes) throws SQLException {
537 throw new SQLFeatureNotSupportedException();
538 }
539
540 public int setBytes(long pos, byte[] bytes, int offset, int len)
541 throws SQLException {
542 throw new SQLFeatureNotSupportedException();
543 }
544
545 public void free() {
546 close();
547 }
548
549 public void close() {
550 _bytes = null;
551 ByteUtil.closeQuietly(_content);
552 _content = null;
553 }
554
555 private static int toJdbcOffset(int off) {
556 return off + 1;
557 }
558
559 private static int fromJdbcOffset(long off) {
560 return (int)off - 1;
561 }
562
563 @Override
564 public String toString() {
565 ToStringBuilder sb = CustomToStringStyle.builder(this);
566 if(_content != null) {
567 sb.append("content", _content);
568 } else {
569 sb.append("bytes", _bytes);
570 sb.append("content", "(uninitialized)");
571 }
572 return sb.toString();
573 }
574 }
575
576 static abstract class ContentImpl implements Content, Closeable
577 {
578 protected final OleBlobImpl _blob;
579
580 protected ContentImpl(OleBlobImpl blob) {
581 _blob = blob;
582 }
583
584 public OleBlobImpl getBlob() {
585 return _blob;
586 }
587
588 protected byte[] getBytes() throws IOException {
589 return getBlob().getBytes();
590 }
591
592 public void close() {
593 // base does nothing
594 }
595
596 protected ToStringBuilder toString(ToStringBuilder sb) {
597 sb.append("type", getType());
598 return sb;
599 }
600 }
601
602 static abstract class EmbeddedContentImpl extends ContentImpl
603 implements EmbeddedContent
604 {
605 private final int _position;
606 private final int _length;
607
608 protected EmbeddedContentImpl(OleBlobImpl blob, int position, int length)
609 {
610 super(blob);
611 _position = position;
612 _length = length;
613 }
614
615 public long length() {
616 return _length;
617 }
618
619 public InputStream getStream() throws IOException {
620 return new ByteArrayInputStream(getBytes(), _position, _length);
621 }
622
623 public void writeTo(OutputStream out) throws IOException {
624 out.write(getBytes(), _position, _length);
625 }
626
627 @Override
628 protected ToStringBuilder toString(ToStringBuilder sb) {
629 super.toString(sb);
630 if(_position >= 0) {
631 sb.append("content", ByteBuffer.wrap(_blob._bytes, _position, _length));
632 }
633 return sb;
634 }
635 }
636
637 static abstract class EmbeddedPackageContentImpl
638 extends EmbeddedContentImpl
639 implements PackageContent
640 {
641 private final String _prettyName;
642 private final String _className;
643 private final String _typeName;
644
645 protected EmbeddedPackageContentImpl(
646 OleBlobImpl blob, String prettyName, String className,
647 String typeName, int position, int length)
648 {
649 super(blob, position, length);
650 _prettyName = prettyName;
651 _className = className;
652 _typeName = typeName;
653 }
654
655 public String getPrettyName() {
656 return _prettyName;
657 }
658
659 public String getClassName() {
660 return _className;
661 }
662
663 public String getTypeName() {
664 return _typeName;
665 }
666
667 @Override
668 protected ToStringBuilder toString(ToStringBuilder sb) {
669 sb.append("prettyName", _prettyName)
670 .append("className", _className)
671 .append("typeName", _typeName);
672 super.toString(sb);
673 return sb;
674 }
675 }
676
677 private static final class LinkContentImpl
678 extends EmbeddedPackageContentImpl
679 implements LinkContent
680 {
681 private final String _fileName;
682 private final String _linkPath;
683 private final String _filePath;
684
685 private LinkContentImpl(OleBlobImpl blob, String prettyName,
686 String className, String typeName,
687 String fileName, String linkPath,
688 String filePath)
689 {
690 super(blob, prettyName, className, typeName, -1, -1);
691 _fileName = fileName;
692 _linkPath = linkPath;
693 _filePath = filePath;
694 }
695
696 public ContentType getType() {
697 return ContentType.LINK;
698 }
699
700 public String getFileName() {
701 return _fileName;
702 }
703
704 public String getLinkPath() {
705 return _linkPath;
706 }
707
708 public String getFilePath() {
709 return _filePath;
710 }
711
712 public InputStream getLinkStream() throws IOException {
713 return new FileInputStream(getLinkPath());
714 }
715
716 @Override
717 public String toString() {
718 return toString(CustomToStringStyle.builder(this))
719 .append("fileName", _fileName)
720 .append("linkPath", _linkPath)
721 .append("filePath", _filePath)
722 .toString();
723 }
724 }
725
726 private static final class SimplePackageContentImpl
727 extends EmbeddedPackageContentImpl
728 implements SimplePackageContent
729 {
730 private final String _fileName;
731 private final String _filePath;
732 private final String _localFilePath;
733
734 private SimplePackageContentImpl(OleBlobImpl blob, String prettyName,
735 String className, String typeName,
736 int position, int length,
737 String fileName, String filePath,
738 String localFilePath)
739 {
740 super(blob, prettyName, className, typeName, position, length);
741 _fileName = fileName;
742 _filePath = filePath;
743 _localFilePath = localFilePath;
744 }
745
746 public ContentType getType() {
747 return ContentType.SIMPLE_PACKAGE;
748 }
749
750 public String getFileName() {
751 return _fileName;
752 }
753
754 public String getFilePath() {
755 return _filePath;
756 }
757
758 public String getLocalFilePath() {
759 return _localFilePath;
760 }
761
762 @Override
763 public String toString() {
764 return toString(CustomToStringStyle.builder(this))
765 .append("fileName", _fileName)
766 .append("filePath", _filePath)
767 .append("localFilePath", _localFilePath)
768 .toString();
769 }
770 }
771
772 private static final class OtherContentImpl
773 extends EmbeddedPackageContentImpl
774 implements OtherContent
775 {
776 private OtherContentImpl(
777 OleBlobImpl blob, String prettyName, String className,
778 String typeName, int position, int length)
779 {
780 super(blob, prettyName, className, typeName, position, length);
781 }
782
783 public ContentType getType() {
784 return ContentType.OTHER;
785 }
786
787 @Override
788 public String toString() {
789 return toString(CustomToStringStyle.builder(this))
790 .toString();
791 }
792 }
793
794 private static final class UnknownContentImpl extends ContentImpl
795 {
796 private UnknownContentImpl(OleBlobImpl blob) {
797 super(blob);
798 }
799
800 public ContentType getType() {
801 return ContentType.UNKNOWN;
802 }
803
804 @Override
805 public String toString() {
806 return toString(CustomToStringStyle.builder(this))
807 .append("content", _blob._bytes)
808 .toString();
809 }
810 }
811
812 }
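Aside (an illustrative sketch, not part of the diff): the toJdbcOffset/fromJdbcOffset helpers above exist because the JDBC Blob API counts positions from 1 while Java arrays count from 0. A minimal example of that convention, using only the standard java.sql.Blob API:

    import java.sql.Blob;
    import java.sql.SQLException;

    class BlobOffsets {
        // Read the first 16 bytes of a Blob: JDBC positions are 1-based,
        // so the start of the blob is position 1, not 0.
        static byte[] head(Blob blob) throws SQLException {
            return blob.getBytes(1L, 16);
        }
    }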
3535 import org.apache.poi.poifs.filesystem.DirectoryEntry;
3636 import org.apache.poi.poifs.filesystem.DirectoryNode;
3737 import org.apache.poi.poifs.filesystem.Entry;
38 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
3938 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
4039 import org.apache.poi.poifs.macros.VBAMacroReader;
4140 import org.apache.poi.util.IOUtils;
104103
105104 final DirectoryNode root;
106105 TikaInputStream tstream = TikaInputStream.cast(stream);
107 NPOIFSFileSystem mustCloseFs = null;
106 POIFSFileSystem mustCloseFs = null;
108107 try {
109108 if (tstream == null) {
110 mustCloseFs = new NPOIFSFileSystem(new CloseShieldInputStream(stream));
109 mustCloseFs = new POIFSFileSystem(new CloseShieldInputStream(stream));
111110 root = mustCloseFs.getRoot();
112111 } else {
113112 final Object container = tstream.getOpenContainer();
114 if (container instanceof NPOIFSFileSystem) {
115 root = ((NPOIFSFileSystem) container).getRoot();
113 if (container instanceof POIFSFileSystem) {
114 root = ((POIFSFileSystem) container).getRoot();
116115 } else if (container instanceof DirectoryNode) {
117116 root = (DirectoryNode) container;
118117 } else {
119 NPOIFSFileSystem fs = null;
118 POIFSFileSystem fs = null;
120119 if (tstream.hasFile()) {
121 fs = new NPOIFSFileSystem(tstream.getFile(), true);
120 fs = new POIFSFileSystem(tstream.getFile(), true);
122121 } else {
123 fs = new NPOIFSFileSystem(new CloseShieldInputStream(tstream));
122 fs = new POIFSFileSystem(new CloseShieldInputStream(tstream));
124123 }
125124 //tstream will close the fs, no need to close this below
126125 tstream.setOpenContainer(fs);
273272 return detectType(fs.getRoot());
274273 }
275274
276 public static POIFSDocumentType detectType(NPOIFSFileSystem fs) {
277 return detectType(fs.getRoot());
278 }
279
280275 public static POIFSDocumentType detectType(DirectoryEntry node) {
281276 Set<String> names = new HashSet<String>();
282277 for (Entry entry : node) {
312307 * @throws IOException if an IOException occurs during extraction of the embedded doc
313308 * @throws SAXException if a SAXException occurs while writing to the xhtml
314309 */
315 public static void extractMacros(NPOIFSFileSystem fs, ContentHandler xhtml, EmbeddedDocumentExtractor
316 embeddedDocumentExtractor) throws IOException, SAXException {
310 public static void extractMacros(POIFSFileSystem fs, ContentHandler xhtml,
311 EmbeddedDocumentExtractor embeddedDocumentExtractor) throws IOException, SAXException {
317312
318313 VBAMacroReader reader = null;
319314 Map<String, String> macros = null;
321316 reader = new VBAMacroReader(fs);
322317 macros = reader.readMacros();
323318 } catch (Exception e) {
324 //swallow
319 if (e instanceof SecurityException) {
320 throw e;
321 }
322 Metadata m = new Metadata();
323 m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
324 m.set(Metadata.CONTENT_TYPE, "text/x-vbasic");
325 EmbeddedDocumentUtil.recordException(e, m);
326 if (embeddedDocumentExtractor.shouldParseEmbedded(m)) {
327 embeddedDocumentExtractor.parseEmbedded(
328 //pass in space character so that we don't trigger a zero-byte exception
329 new ByteArrayInputStream(new byte[]{'\u0020'}), xhtml, m, true);
330 }
325331 return;
326332 }
327333 for (Map.Entry<String, String> e : macros.entrySet()) {
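Aside (a sketch under stated assumptions, not part of the diff): with the change above, a failed macro read is no longer silently swallowed; it is surfaced to the caller's EmbeddedDocumentExtractor as a one-space stub document carrying the recorded exception. A hypothetical extractor that merely inspects such stubs:

    import java.io.IOException;
    import java.io.InputStream;

    import org.apache.tika.extractor.EmbeddedDocumentExtractor;
    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.metadata.TikaCoreProperties;
    import org.xml.sax.ContentHandler;
    import org.xml.sax.SAXException;

    class MacroStubInspector implements EmbeddedDocumentExtractor {
        public boolean shouldParseEmbedded(Metadata metadata) {
            return true;
        }

        public void parseEmbedded(InputStream stream, ContentHandler handler,
                                  Metadata metadata, boolean outputHtml)
                throws SAXException, IOException {
            // For a failed macro read, the parser sets these fields before
            // handing over the one-space stub stream (see the hunk above).
            System.out.println(metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); // MACRO
            System.out.println(metadata.get(Metadata.CONTENT_TYPE));                     // text/x-vbasic
        }
    }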
2828 private boolean includeMoveFromContent = false;
2929 private boolean includeShapeBasedContent = true;
3030 private boolean includeHeadersAndFooters = true;
31 private boolean includeMissingRows = false;
3132 private boolean concatenatePhoneticRuns = true;
3233
3334 private boolean useSAXDocxExtractor = false;
187188 this.extractAllAlternativesFromMSG = extractAllAlternativesFromMSG;
188189 }
189190
190
191191 public boolean getExtractAllAlternativesFromMSG() {
192192 return extractAllAlternativesFromMSG;
193193 }
194
195 /**
196 * For table-like formats, and tables within other formats, should
197 * missing rows in sparse tables be output where detected?
198 * The default is to output only the rows defined within the file, which
199 * avoids lots of blank lines but means the layout isn't preserved.
200 */
201 public void setIncludeMissingRows(boolean includeMissingRows) {
202 this.includeMissingRows = includeMissingRows;
203 }
204
205 public boolean getIncludeMissingRows() {
206 return includeMissingRows;
207 }
194208 }
195209
196210
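A usage sketch for the new option above (the AutoDetectParser wiring and the file name are illustrative assumptions, not part of the diff):

    import java.io.InputStream;
    import java.nio.file.Files;
    import java.nio.file.Paths;

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.parser.AutoDetectParser;
    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.parser.microsoft.OfficeParserConfig;
    import org.apache.tika.sax.BodyContentHandler;

    public class SparseRowsDemo {
        public static void main(String[] args) throws Exception {
            OfficeParserConfig config = new OfficeParserConfig();
            config.setIncludeMissingRows(true); // emit one empty row per gap

            ParseContext context = new ParseContext();
            context.set(OfficeParserConfig.class, config);

            try (InputStream is = Files.newInputStream(Paths.get("sparse.xlsx"))) {
                BodyContentHandler handler = new BodyContentHandler(-1);
                new AutoDetectParser().parse(is, handler, new Metadata(), context);
                System.out.println(handler);
            }
        }
    }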
5050 import org.apache.poi.hsmf.datatypes.Types;
5151 import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
5252 import org.apache.poi.poifs.filesystem.DirectoryNode;
53 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
53 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
5454 import org.apache.poi.util.CodePageUtil;
55 import org.apache.tika.config.Field;
5655 import org.apache.tika.exception.TikaException;
5756 import org.apache.tika.extractor.EmbeddedDocumentUtil;
5857 import org.apache.tika.io.TikaInputStream;
7372 import org.apache.tika.sax.BodyContentHandler;
7473 import org.apache.tika.sax.EmbeddedContentHandler;
7574 import org.apache.tika.sax.XHTMLContentHandler;
76 import org.bouncycastle.cms.Recipient;
7775 import org.xml.sax.SAXException;
7876
7977 /**
127125
128126 private final boolean extractAllAlternatives;
129127
130 public OutlookExtractor(NPOIFSFileSystem filesystem, ParseContext context) throws TikaException {
128 public OutlookExtractor(POIFSFileSystem filesystem, ParseContext context) throws TikaException {
131129 this(filesystem.getRoot(), context);
132130 }
133131
148146 msg.setReturnNullOnMissingChunk(true);
149147
150148 try {
151 metadata.set(Office.MAPI_MESSAGE_CLASS, getMessageClass(msg.getMessageClass()));
149 metadata.set(Office.MAPI_MESSAGE_CLASS, msg.getMessageClassEnum().name());
152150 } catch (ChunkNotFoundException e){}
153151
154152 // If the message contains strings that aren't stored
484482 metadata.add(property, chunks.get(0).toString());
485483 }
486484
487 //TODO: replace this with getMessageClassEnum when we upgrade POI
485 //Still needed by PSTParser
488486 public static String getMessageClass(String messageClass){
489487 if (messageClass == null || messageClass.trim().length() == 0) {
490488 return "UNSPECIFIED";
3232 import org.apache.poi.poifs.filesystem.DocumentInputStream;
3333 import org.apache.poi.poifs.filesystem.DocumentNode;
3434 import org.apache.poi.poifs.filesystem.Entry;
35 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
35 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
3636 import org.apache.tika.detect.Detector;
3737 import org.apache.tika.io.TikaInputStream;
3838 import org.apache.tika.metadata.Metadata;
384384 File file = stream.getFile();
385385
386386 try {
387 NPOIFSFileSystem fs = new NPOIFSFileSystem(file, true);
387 POIFSFileSystem fs = new POIFSFileSystem(file, true);
388388
389389 // Optimize a possible later parsing process by keeping
390390 // a reference to the already opened POI file system
422422 Set<String> names = null;
423423 if (tis != null) {
424424 Object container = tis.getOpenContainer();
425 if (container instanceof NPOIFSFileSystem) {
426 names = getTopLevelNames(((NPOIFSFileSystem) container).getRoot());
425 if (container instanceof POIFSFileSystem) {
426 names = getTopLevelNames(((POIFSFileSystem) container).getRoot());
427427 } else if (container instanceof DirectoryNode) {
428428 names = getTopLevelNames((DirectoryNode) container);
429429 }
453453 // Detect based on the names (as available)
454454 if (tis != null &&
455455 tis.getOpenContainer() != null &&
456 tis.getOpenContainer() instanceof NPOIFSFileSystem) {
457 return detect(names, ((NPOIFSFileSystem) tis.getOpenContainer()).getRoot());
456 tis.getOpenContainer() instanceof POIFSFileSystem) {
457 return detect(names, ((POIFSFileSystem) tis.getOpenContainer()).getRoot());
458458 } else {
459459 return detect(names, null);
460460 }
3131 import org.apache.poi.poifs.filesystem.DirectoryNode;
3232 import org.apache.poi.poifs.filesystem.DocumentEntry;
3333 import org.apache.poi.poifs.filesystem.DocumentInputStream;
34 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
34 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
3535 import org.apache.tika.exception.TikaException;
3636 import org.apache.tika.metadata.MSOffice;
3737 import org.apache.tika.metadata.Metadata;
6262 this.metadata = metadata;
6363 }
6464
65 public void parseSummaries(NPOIFSFileSystem filesystem)
65 public void parseSummaries(POIFSFileSystem filesystem)
6666 throws IOException, TikaException {
6767 parseSummaries(filesystem.getRoot());
6868 }
9393 // no property stream, just skip it
9494 } catch (UnexpectedPropertySetTypeException e) {
9595 throw new TikaException("Unexpected HPSF document", e);
96 } catch (MarkUnsupportedException e) {
97 throw new TikaException("Invalid DocumentInputStream", e);
9896 } catch (Exception e) {
9997 LOG.warn("Ignoring unexpected exception while parsing summary entry {}", entryName, e);
10098 }
6363 HwmfPicture picture = new HwmfPicture(stream);
6464 //TODO: make x/y info public in POI so that we can use it here
6565 //to determine when to keep two text parts on the same line
66 Charset charset = LocaleUtil.CHARSET_1252;
6667 for (HwmfRecord record : picture.getRecords()) {
67 Charset charset = LocaleUtil.CHARSET_1252;
6868 //this is pure hackery for specifying the font
6969 //TODO: do what Graphics does by maintaining the stack, etc.!
7070 //This fix should be done within POI
4848 import org.apache.poi.poifs.filesystem.DirectoryEntry;
4949 import org.apache.poi.poifs.filesystem.DirectoryNode;
5050 import org.apache.poi.poifs.filesystem.Entry;
51 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
51 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
5252 import org.apache.tika.exception.EncryptedDocumentException;
5353 import org.apache.tika.exception.TikaException;
5454 import org.apache.tika.io.TikaInputStream;
144144 }
145145
146146 protected void parse(
147 NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
147 POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
148148 throws IOException, SAXException, TikaException {
149149 parse(filesystem.getRoot(), xhtml);
150150 }
660660 }
661661
662662 protected void parseWord6(
663 NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
663 POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
664664 throws IOException, SAXException, TikaException {
665665 parseWord6(filesystem.getRoot(), xhtml);
666666 }
2424 import java.net.URI;
2525 import java.util.HashMap;
2626 import java.util.HashSet;
27 import java.util.Iterator;
2827 import java.util.List;
2928 import java.util.Map;
3029 import java.util.Set;
3130
32 import org.apache.poi.POIXMLDocument;
33 import org.apache.poi.POIXMLTextExtractor;
31 import org.apache.poi.extractor.POITextExtractor;
32 import org.apache.poi.ooxml.POIXMLDocument;
33 import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
3434 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
3535 import org.apache.poi.openxml4j.opc.OPCPackage;
3636 import org.apache.poi.openxml4j.opc.PackagePart;
4040 import org.apache.poi.openxml4j.opc.TargetMode;
4141 import org.apache.poi.openxml4j.opc.internal.FileHelper;
4242 import org.apache.poi.poifs.filesystem.DirectoryNode;
43 import org.apache.poi.poifs.filesystem.DocumentEntry;
44 import org.apache.poi.poifs.filesystem.Entry;
45 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
4643 import org.apache.poi.poifs.filesystem.Ole10Native;
4744 import org.apache.poi.poifs.filesystem.Ole10NativeException;
4845 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
6259 import org.apache.tika.sax.OfflineContentHandler;
6360 import org.apache.tika.sax.XHTMLContentHandler;
6461 import org.apache.tika.utils.ExceptionUtils;
62 import org.apache.tika.utils.XMLReaderUtils;
6563 import org.apache.xmlbeans.XmlException;
6664 import org.xml.sax.ContentHandler;
6765 import org.xml.sax.SAXException;
115113 * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getDocument()
116114 */
117115 public POIXMLDocument getDocument() {
118 return extractor.getDocument();
116 return (POIXMLDocument)extractor.getDocument();
119117 }
120118
121119 /**
418416
419417 if (officeParserConfig.getExtractMacros()) {
420418 try (InputStream is = macroPart.getInputStream()) {
421 try (NPOIFSFileSystem npoifs = new NPOIFSFileSystem(is)) {
419 try (POIFSFileSystem poifs = new POIFSFileSystem(is)) {
422420 //Macro reading exceptions are already swallowed here
423 OfficeParser.extractMacros(npoifs, handler, embeddedExtractor);
421 OfficeParser.extractMacros(poifs, handler, embeddedExtractor);
424422 }
425423 } catch (IOException e) {
426424 throw new TikaException("Broken OOXML file", e);
515513 try {
516514 PackagePart relatedPartPart = parentPart.getRelatedPart(relatedPartPackageRelationship);
517515 try (InputStream stream = relatedPartPart.getInputStream()) {
518 context.getSAXParser().parse(stream,
519 new OfflineContentHandler(new EmbeddedContentHandler(contentHandler)));
516 XMLReaderUtils.parseSAX(stream,
517 new OfflineContentHandler(new EmbeddedContentHandler(contentHandler)), context);
520518
521519 } catch (IOException|TikaException e) {
522520 parentMetadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
523521 ExceptionUtils.getStackTrace(e));
524522 }
525
526523 } catch (InvalidFormatException e) {
527524 parentMetadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
528525 ExceptionUtils.getStackTrace(e));
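The hunk above (and several like it below) replaces direct SAXParser use with XMLReaderUtils.parseSAX, which reuses parsers instead of creating one per call. The call shape, in a minimal standalone sketch (the input file is an illustrative assumption):

    import java.io.InputStream;
    import java.nio.file.Files;
    import java.nio.file.Paths;

    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.sax.OfflineContentHandler;
    import org.apache.tika.utils.XMLReaderUtils;
    import org.xml.sax.helpers.DefaultHandler;

    public class ParseSaxDemo {
        public static void main(String[] args) throws Exception {
            ParseContext context = new ParseContext();
            try (InputStream is = Files.newInputStream(Paths.get("part.xml"))) {
                // OfflineContentHandler suppresses external entity resolution;
                // the bare DefaultHandler stands in for a real content handler.
                XMLReaderUtils.parseSAX(is,
                        new OfflineContentHandler(new DefaultHandler()), context);
            }
        }
    }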
1515 */
1616 package org.apache.tika.parser.microsoft.ooxml;
1717
18 import java.math.BigDecimal;
19 import java.util.Date;
20
21 import org.apache.poi.POIXMLProperties.CoreProperties;
22 import org.apache.poi.POIXMLProperties.CustomProperties;
23 import org.apache.poi.POIXMLProperties.ExtendedProperties;
24 import org.apache.poi.POIXMLTextExtractor;
18 import org.apache.poi.ooxml.POIXMLProperties;
19 import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
2520 import org.apache.poi.openxml4j.opc.internal.PackagePropertiesPart;
26 import org.apache.poi.openxml4j.util.Nullable;
2721 import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
2822 import org.apache.tika.exception.TikaException;
2923 import org.apache.tika.metadata.MSOffice;
4236 import org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperty;
4337 import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
4438
39 import java.math.BigDecimal;
40 import java.util.Date;
41 import java.util.Optional;
42
4543 /**
4644 * OOXML metadata extractor.
4745 * <p/>
7068 }
7169 }
7270
73 private void extractMetadata(CoreProperties properties, Metadata metadata) {
71 private void extractMetadata(POIXMLProperties.CoreProperties properties, Metadata metadata) {
7472 PackagePropertiesPart propsHolder = properties
7573 .getUnderlyingProperties();
7674
77 addProperty(metadata, OfficeOpenXMLCore.CATEGORY, propsHolder.getCategoryProperty());
78 addProperty(metadata, OfficeOpenXMLCore.CONTENT_STATUS, propsHolder
75 setProperty(metadata, OfficeOpenXMLCore.CATEGORY, propsHolder.getCategoryProperty());
76 setProperty(metadata, OfficeOpenXMLCore.CONTENT_STATUS, propsHolder
7977 .getContentStatusProperty());
80 addProperty(metadata, TikaCoreProperties.CREATED, propsHolder
78 setProperty(metadata, TikaCoreProperties.CREATED, propsHolder
8179 .getCreatedProperty());
8280 addMultiProperty(metadata, TikaCoreProperties.CREATOR, propsHolder
8381 .getCreatorProperty());
84 addProperty(metadata, TikaCoreProperties.DESCRIPTION, propsHolder
82 setProperty(metadata, TikaCoreProperties.DESCRIPTION, propsHolder
8583 .getDescriptionProperty());
86 addProperty(metadata, TikaCoreProperties.IDENTIFIER, propsHolder
84 setProperty(metadata, TikaCoreProperties.IDENTIFIER, propsHolder
8785 .getIdentifierProperty());
86 addProperty(metadata, OfficeOpenXMLCore.SUBJECT,
87 propsHolder.getSubjectProperty());
8888 addProperty(metadata, TikaCoreProperties.KEYWORDS, propsHolder
8989 .getKeywordsProperty());
90 addProperty(metadata, TikaCoreProperties.LANGUAGE, propsHolder
90 setProperty(metadata, TikaCoreProperties.LANGUAGE, propsHolder
9191 .getLanguageProperty());
92 addProperty(metadata, TikaCoreProperties.MODIFIER, propsHolder
92 setProperty(metadata, TikaCoreProperties.MODIFIER, propsHolder
9393 .getLastModifiedByProperty());
94 addProperty(metadata, TikaCoreProperties.PRINT_DATE, propsHolder
94 setProperty(metadata, TikaCoreProperties.PRINT_DATE, propsHolder
9595 .getLastPrintedProperty());
96 addProperty(metadata, Metadata.LAST_MODIFIED, propsHolder
96 setProperty(metadata, TikaCoreProperties.MODIFIED, propsHolder
9797 .getModifiedProperty());
98 addProperty(metadata, TikaCoreProperties.MODIFIED, propsHolder
99 .getModifiedProperty());
100 addProperty(metadata, OfficeOpenXMLCore.REVISION, propsHolder
98 setProperty(metadata, OfficeOpenXMLCore.REVISION, propsHolder
10199 .getRevisionProperty());
102100 // TODO: Move to OO subject in Tika 2.0
103 addProperty(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT,
101 setProperty(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT,
104102 propsHolder.getSubjectProperty());
105 addProperty(metadata, TikaCoreProperties.TITLE, propsHolder.getTitleProperty());
106 addProperty(metadata, OfficeOpenXMLCore.VERSION, propsHolder.getVersionProperty());
103 setProperty(metadata, TikaCoreProperties.TITLE, propsHolder.getTitleProperty());
104 setProperty(metadata, OfficeOpenXMLCore.VERSION, propsHolder.getVersionProperty());
107105
108106 // Legacy Tika-1.0 style stats
109107 // TODO Remove these in Tika 2.0
110 addProperty(metadata, Metadata.CATEGORY, propsHolder.getCategoryProperty());
111 addProperty(metadata, Metadata.CONTENT_STATUS, propsHolder
108 setProperty(metadata, Metadata.CATEGORY, propsHolder.getCategoryProperty());
109 setProperty(metadata, Metadata.CONTENT_STATUS, propsHolder
112110 .getContentStatusProperty());
113 addProperty(metadata, Metadata.REVISION_NUMBER, propsHolder
111 setProperty(metadata, Metadata.REVISION_NUMBER, propsHolder
114112 .getRevisionProperty());
115 addProperty(metadata, Metadata.VERSION, propsHolder.getVersionProperty());
116 }
117
118 private void extractMetadata(ExtendedProperties properties,
113 setProperty(metadata, Metadata.VERSION, propsHolder.getVersionProperty());
114 }
115
116 private void extractMetadata(POIXMLProperties.ExtendedProperties properties,
119117 Metadata metadata) {
120118 CTProperties propsHolder = properties.getUnderlyingProperties();
121119
129127 } catch (XmlValueOutOfRangeException e) {
130128 //swallow for now
131129 }
132 addProperty(metadata, OfficeOpenXMLExtended.APPLICATION, propsHolder.getApplication());
133 addProperty(metadata, OfficeOpenXMLExtended.APP_VERSION, propsHolder.getAppVersion());
134 addProperty(metadata, TikaCoreProperties.PUBLISHER, propsHolder.getCompany());
135 addProperty(metadata, OfficeOpenXMLExtended.COMPANY, propsHolder.getCompany());
130 setProperty(metadata, OfficeOpenXMLExtended.APPLICATION, propsHolder.getApplication());
131 setProperty(metadata, OfficeOpenXMLExtended.APP_VERSION, propsHolder.getAppVersion());
132 setProperty(metadata, TikaCoreProperties.PUBLISHER, propsHolder.getCompany());
133 setProperty(metadata, OfficeOpenXMLExtended.COMPANY, propsHolder.getCompany());
136134 SummaryExtractor.addMulti(metadata, OfficeOpenXMLExtended.MANAGER, propsHolder.getManager());
137 addProperty(metadata, OfficeOpenXMLExtended.NOTES, propsHolder.getNotes());
138 addProperty(metadata, OfficeOpenXMLExtended.PRESENTATION_FORMAT, propsHolder.getPresentationFormat());
139 addProperty(metadata, OfficeOpenXMLExtended.TEMPLATE, propsHolder.getTemplate());
140 addProperty(metadata, OfficeOpenXMLExtended.TOTAL_TIME, totalTime);
135 setProperty(metadata, OfficeOpenXMLExtended.NOTES, propsHolder.getNotes());
136 setProperty(metadata, OfficeOpenXMLExtended.PRESENTATION_FORMAT, propsHolder.getPresentationFormat());
137 setProperty(metadata, OfficeOpenXMLExtended.TEMPLATE, propsHolder.getTemplate());
138 setProperty(metadata, OfficeOpenXMLExtended.TOTAL_TIME, totalTime);
141139
142140 if (propsHolder.getPages() > 0) {
143141 metadata.set(PagedText.N_PAGES, propsHolder.getPages());
146144 }
147145
148146 // Process the document statistics
149 addProperty(metadata, Office.PAGE_COUNT, propsHolder.getPages());
150 addProperty(metadata, Office.SLIDE_COUNT, propsHolder.getSlides());
151 addProperty(metadata, Office.PARAGRAPH_COUNT, propsHolder.getParagraphs());
152 addProperty(metadata, Office.LINE_COUNT, propsHolder.getLines());
153 addProperty(metadata, Office.WORD_COUNT, propsHolder.getWords());
154 addProperty(metadata, Office.CHARACTER_COUNT, propsHolder.getCharacters());
155 addProperty(metadata, Office.CHARACTER_COUNT_WITH_SPACES, propsHolder.getCharactersWithSpaces());
147 setProperty(metadata, Office.PAGE_COUNT, propsHolder.getPages());
148 setProperty(metadata, Office.SLIDE_COUNT, propsHolder.getSlides());
149 setProperty(metadata, Office.PARAGRAPH_COUNT, propsHolder.getParagraphs());
150 setProperty(metadata, Office.LINE_COUNT, propsHolder.getLines());
151 setProperty(metadata, Office.WORD_COUNT, propsHolder.getWords());
152 setProperty(metadata, Office.CHARACTER_COUNT, propsHolder.getCharacters());
153 setProperty(metadata, Office.CHARACTER_COUNT_WITH_SPACES, propsHolder.getCharactersWithSpaces());
156154
157155 // Legacy Tika-1.0 style stats
158156 // TODO Remove these in Tika 2.0
159 addProperty(metadata, Metadata.APPLICATION_NAME, propsHolder.getApplication());
160 addProperty(metadata, Metadata.APPLICATION_VERSION, propsHolder.getAppVersion());
161 addProperty(metadata, Metadata.MANAGER, propsHolder.getManager());
162 addProperty(metadata, Metadata.NOTES, propsHolder.getNotes());
163 addProperty(metadata, Metadata.PRESENTATION_FORMAT, propsHolder.getPresentationFormat());
164 addProperty(metadata, Metadata.TEMPLATE, propsHolder.getTemplate());
165 addProperty(metadata, Metadata.TOTAL_TIME, totalTime);
166 addProperty(metadata, MSOffice.PAGE_COUNT, propsHolder.getPages());
167 addProperty(metadata, MSOffice.SLIDE_COUNT, propsHolder.getSlides());
168 addProperty(metadata, MSOffice.PARAGRAPH_COUNT, propsHolder.getParagraphs());
169 addProperty(metadata, MSOffice.LINE_COUNT, propsHolder.getLines());
170 addProperty(metadata, MSOffice.WORD_COUNT, propsHolder.getWords());
171 addProperty(metadata, MSOffice.CHARACTER_COUNT, propsHolder.getCharacters());
172 addProperty(metadata, MSOffice.CHARACTER_COUNT_WITH_SPACES, propsHolder.getCharactersWithSpaces());
173 }
174
175 private void extractMetadata(CustomProperties properties,
157 setProperty(metadata, Metadata.APPLICATION_NAME, propsHolder.getApplication());
158 setProperty(metadata, Metadata.APPLICATION_VERSION, propsHolder.getAppVersion());
159 setProperty(metadata, Metadata.MANAGER, propsHolder.getManager());
160 setProperty(metadata, Metadata.NOTES, propsHolder.getNotes());
161 setProperty(metadata, Metadata.PRESENTATION_FORMAT, propsHolder.getPresentationFormat());
162 setProperty(metadata, Metadata.TEMPLATE, propsHolder.getTemplate());
163 setProperty(metadata, Metadata.TOTAL_TIME, totalTime);
164 setProperty(metadata, MSOffice.PAGE_COUNT, propsHolder.getPages());
165 setProperty(metadata, MSOffice.SLIDE_COUNT, propsHolder.getSlides());
166 setProperty(metadata, MSOffice.PARAGRAPH_COUNT, propsHolder.getParagraphs());
167 setProperty(metadata, MSOffice.LINE_COUNT, propsHolder.getLines());
168 setProperty(metadata, MSOffice.WORD_COUNT, propsHolder.getWords());
169 setProperty(metadata, MSOffice.CHARACTER_COUNT, propsHolder.getCharacters());
170 setProperty(metadata, MSOffice.CHARACTER_COUNT_WITH_SPACES, propsHolder.getCharactersWithSpaces());
171 }
172
173 private void extractMetadata(POIXMLProperties.CustomProperties properties,
176174 Metadata metadata) {
177175 org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperties
178176 props = properties.getUnderlyingProperties();
256254 }
257255 }
258256
259 private <T> void addProperty(Metadata metadata, Property property, Nullable<T> nullableValue) {
260 T value = nullableValue.getValue();
261 if (value != null) {
262 if (value instanceof Date) {
263 metadata.set(property, (Date) value);
264 } else if (value instanceof String) {
265 metadata.set(property, (String) value);
266 } else if (value instanceof Integer) {
267 metadata.set(property, (Integer) value);
268 } else if (value instanceof Double) {
269 metadata.set(property, (Double) value);
270 }
271 }
272 }
273
274 private void addProperty(Metadata metadata, String name, Nullable<?> value) {
275 if (value.getValue() != null) {
276 addProperty(metadata, name, value.getValue().toString());
277 }
278 }
279
280 private void addProperty(Metadata metadata, Property property, String value) {
257 private <T> void setProperty(Metadata metadata, Property property, Optional<T> nullableValue) {
258 if (!nullableValue.isPresent()) {
259 return;
260 }
261 T value = nullableValue.get();
262 if (value instanceof Date) {
263 metadata.set(property, (Date) value);
264 } else if (value instanceof String) {
265 metadata.set(property, (String) value);
266 } else if (value instanceof Integer) {
267 metadata.set(property, (Integer) value);
268 } else if (value instanceof Double) {
269 metadata.set(property, (Double) value);
270 }
271 }
272
273 private <T> void addProperty(Metadata metadata, Property property, Optional<T> nullableValue) {
274 if (!nullableValue.isPresent()) {
275 return;
276 }
277 T value = nullableValue.get();
278 if (value instanceof String) {
279 metadata.add(property, (String) value);
280 } else {
281 throw new IllegalArgumentException("Can't add property of class: " + value.getClass());
282 }
283 }
284
285 private void setProperty(Metadata metadata, String property, Optional<String> nullableValue) {
286 if (!nullableValue.isPresent()) {
287 return;
288 }
289 String value = nullableValue.get();
290 metadata.set(property, value);
291 }
292
293 private void setProperty(Metadata metadata, Property property, String value) {
281294 if (value != null) {
282295 metadata.set(property, value);
283296 }
284297 }
285298
286 private void addProperty(Metadata metadata, String name, String value) {
299 private void setProperty(Metadata metadata, String name, String value) {
287300 if (value != null) {
288301 metadata.set(name, value);
289302 }
290303 }
291304
292 private void addProperty(Metadata metadata, Property property, int value) {
305 private void setProperty(Metadata metadata, Property property, int value) {
293306 if (value > 0) {
294307 metadata.set(property, value);
295308 }
296309 }
297310
298 private void addProperty(Metadata metadata, String name, int value) {
311 private void setProperty(Metadata metadata, String name, int value) {
299312 if (value > 0) {
300313 metadata.set(name, Integer.toString(value));
301314 }
302315 }
303316
304 private void addMultiProperty(Metadata metadata, Property property, Nullable<String> value) {
305 if (value == null) {
317 private void addMultiProperty(Metadata metadata, Property property, Optional<String> value) {
318 if (!value.isPresent()) {
306319 return;
307320 }
308 SummaryExtractor.addMulti(metadata, property, value.getValue());
321 SummaryExtractor.addMulti(metadata, property, value.get());
309322 }
310323
311324 }
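The helper rename above (addProperty to setProperty) makes the underlying Metadata semantics explicit: most core properties are single-valued and use set(), while creators stay multi-valued through addMultiProperty/add(). A minimal illustration of that distinction (not from the diff):

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.metadata.TikaCoreProperties;

    public class SetVsAddDemo {
        public static void main(String[] args) {
            Metadata metadata = new Metadata();

            metadata.set(TikaCoreProperties.TITLE, "draft");
            metadata.set(TikaCoreProperties.TITLE, "final");
            System.out.println(metadata.get(TikaCoreProperties.TITLE)); // set() replaces: "final"

            metadata.add(TikaCoreProperties.CREATOR, "alice");
            metadata.add(TikaCoreProperties.CREATOR, "bob");
            System.out.println(metadata.getValues(TikaCoreProperties.CREATOR).length); // add() appends: 2
        }
    }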
1717
1818 import java.io.IOException;
1919
20 import org.apache.poi.POIXMLDocument;
21 import org.apache.poi.POIXMLTextExtractor;
20 import org.apache.poi.ooxml.POIXMLDocument;
2221 import org.apache.tika.exception.TikaException;
2322 import org.apache.tika.metadata.Metadata;
2423 import org.apache.tika.parser.ParseContext;
2928 /**
3029 * Interface implemented by all Tika OOXML extractors.
3130 *
32 * @see org.apache.poi.POIXMLTextExtractor
31 * @see org.apache.poi.ooxml.extractor.POIXMLTextExtractor
3332 */
3433 public interface OOXMLExtractor {
3534
3635 /**
3736 * Returns the opened document.
3837 *
39 * @see POIXMLTextExtractor#getDocument()
38 * @see org.apache.poi.ooxml.extractor.POIXMLTextExtractor#getDocument()
4039 */
4140 POIXMLDocument getDocument();
4241
4342 /**
44 * {@link POIXMLTextExtractor#getMetadataTextExtractor()} not yet supported
43 * {@link org.apache.poi.ooxml.extractor.POIXMLTextExtractor#getMetadataTextExtractor()} not yet supported
4544 * for OOXML by POI.
4645 */
4746 MetadataExtractor getMetadataExtractor();
2020 import java.util.Locale;
2121
2222 import org.apache.commons.io.input.CloseShieldInputStream;
23 import org.apache.poi.POIXMLDocument;
24 import org.apache.poi.POIXMLTextExtractor;
25 import org.apache.poi.extractor.ExtractorFactory;
23 import org.apache.poi.ooxml.POIXMLDocument;
24 import org.apache.poi.ooxml.extractor.ExtractorFactory;
25 import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
2626 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
2727 import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
2828 import org.apache.poi.openxml4j.opc.OPCPackage;
3030 import org.apache.poi.openxml4j.opc.PackagePart;
3131 import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
3232 import org.apache.poi.util.LocaleUtil;
33 import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
3334 import org.apache.poi.xslf.usermodel.XMLSlideShow;
3435 import org.apache.poi.xslf.usermodel.XSLFRelation;
36 import org.apache.poi.xslf.usermodel.XSLFSlideShow;
3537 import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
3638 import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
3739 import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
102104 if (config.getUseSAXDocxExtractor()) {
103105 poiExtractor = trySXWPF(pkg);
104106 }
105 if (poiExtractor == null && config.getUseSAXPptxExtractor()) {
106 poiExtractor = trySXSLF(pkg);
107 if (poiExtractor == null) {
108 poiExtractor = tryXSLF(pkg, config.getUseSAXPptxExtractor());
107109 }
108110 if (type.equals(OOXMLParser.XPS)) {
109111 poiExtractor = new XPSTextExtractor(pkg);
110112 }
111113
112114 if (poiExtractor == null) {
113 poiExtractor = ExtractorFactory.createExtractor(pkg);
115 poiExtractor = (POIXMLTextExtractor) ExtractorFactory.createExtractor(pkg);
114116 }
115117
116118 POIXMLDocument document = poiExtractor.getDocument();
189191 return null;
190192 }
191193
192 private static POIXMLTextExtractor trySXSLF(OPCPackage pkg) throws XmlException, OpenXML4JException, IOException {
194 private static POIXMLTextExtractor tryXSLF(OPCPackage pkg, boolean eventBased) throws XmlException, OpenXML4JException, IOException {
193195
194196 PackageRelationshipCollection packageRelationshipCollection = pkg.getRelationshipsByType("http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument");
195197 if (packageRelationshipCollection.size() == 0) {
207209 for (int i = 0; i < xslfRelations.length; i++) {
208210 XSLFRelation xslfRelation = xslfRelations[i];
209211 if (xslfRelation.getContentType().equals(targetContentType)) {
212 if (eventBased) {
213 return new XSLFEventBasedPowerPointExtractor(pkg);
214 } else {
215 return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
216 }
217 }
218 }
219
220 if (XSLFRelation.THEME_MANAGER.getContentType().equals(targetContentType)) {
221 if (eventBased) {
210222 return new XSLFEventBasedPowerPointExtractor(pkg);
211 }
212 }
213
214 if (XSLFRelation.THEME_MANAGER.getContentType().equals(targetContentType)) {
215 return new XSLFEventBasedPowerPointExtractor(pkg);
223 } else {
224 return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
225 }
216226 }
217227 return null;
218228 }
1818 import java.util.ArrayList;
1919 import java.util.List;
2020
21 import org.apache.poi.POIXMLTextExtractor;
21 import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
2222 import org.apache.poi.openxml4j.opc.PackagePart;
2323 import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
2424 import org.apache.tika.parser.ParseContext;
4343 import org.apache.tika.sax.OfflineContentHandler;
4444 import org.apache.tika.sax.XHTMLContentHandler;
4545 import org.apache.tika.utils.ExceptionUtils;
46 import org.apache.tika.utils.XMLReaderUtils;
4647 import org.xml.sax.Attributes;
4748 import org.xml.sax.ContentHandler;
4849 import org.xml.sax.SAXException;
4950 import org.xml.sax.helpers.DefaultHandler;
51
52 import javax.xml.parsers.SAXParser;
5053
5154 /**
5255 * SAX/Streaming pptx extractor
158161 continue;
159162 }
160163 try (InputStream stream = commentAuthorsPart.getInputStream()) {
161 context.getSAXParser().parse(
164 XMLReaderUtils.parseSAX(
162165 new CloseShieldInputStream(stream),
163 new OfflineContentHandler(new XSLFCommentAuthorHandler()));
166 new OfflineContentHandler(new XSLFCommentAuthorHandler()),
167 context);
164168
165169 } catch (TikaException | SAXException | IOException e) {
166170 metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
176180 // Map<String, String> hyperlinks = loadHyperlinkRelationships(packagePart);
177181 xhtml.startElement("div", "class", "slide-content");
178182 try (InputStream stream = slidePart.getInputStream()) {
179 context.getSAXParser().parse(
183 XMLReaderUtils.parseSAX(
180184 new CloseShieldInputStream(stream),
181185 new OfflineContentHandler(new EmbeddedContentHandler(
182186 new OOXMLWordAndPowerPointTextHandler(
183 new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships))));
184
185 } catch (TikaException e) {
187 new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships))),
188 context);
189
190 } catch (TikaException|IOException e) {
186191 metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
187192 ExceptionUtils.getStackTrace(e));
188193 }
4343 import org.apache.tika.sax.OfflineContentHandler;
4444 import org.apache.tika.sax.XHTMLContentHandler;
4545 import org.apache.tika.utils.ExceptionUtils;
46 import org.apache.tika.utils.XMLReaderUtils;
4647 import org.apache.xmlbeans.XmlException;
4748 import org.xml.sax.SAXException;
49
50 import javax.xml.parsers.SAXParser;
4851
4952 /**
5053 * This is an experimental, alternative extractor for docx files.
187190
188191 Map<String, String> linkedRelationships = loadLinkedRelationships(packagePart, true, metadata);
189192 try (InputStream stream = packagePart.getInputStream()) {
190 context.getSAXParser().parse(
193 XMLReaderUtils.parseSAX(
191194 new CloseShieldInputStream(stream),
192195 new OfflineContentHandler(new EmbeddedContentHandler(
193196 new OOXMLWordAndPowerPointTextHandler(
194197 new OOXMLTikaBodyPartHandler(xhtml, styles, listManager,
195 config), linkedRelationships, config.getIncludeShapeBasedContent(), config.getConcatenatePhoneticRuns()))));
196 } catch (TikaException e) {
198 config), linkedRelationships, config.getIncludeShapeBasedContent(), config.getConcatenatePhoneticRuns()))),
199 context);
200 } catch (TikaException|IOException e) {
197201 metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
198202 ExceptionUtils.getStackTrace(e));
199
200203 }
201204
202205 }
2929 import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
3030 import org.apache.poi.openxml4j.opc.PackagingURIHelper;
3131 import org.apache.poi.openxml4j.opc.TargetMode;
32 import org.apache.poi.sl.extractor.SlideShowExtractor;
3233 import org.apache.poi.sl.usermodel.Placeholder;
3334 import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
3435 import org.apache.poi.xslf.usermodel.XMLSlideShow;
36 import org.apache.poi.xslf.usermodel.XSLFComment;
3537 import org.apache.poi.xslf.usermodel.XSLFCommentAuthors;
3638 import org.apache.poi.xslf.usermodel.XSLFComments;
3739 import org.apache.poi.xslf.usermodel.XSLFGraphicFrame;
5860 import org.apache.tika.sax.XHTMLContentHandler;
5961 import org.apache.xmlbeans.XmlException;
6062 import org.apache.xmlbeans.XmlObject;
61 import org.openxmlformats.schemas.presentationml.x2006.main.CTComment;
62 import org.openxmlformats.schemas.presentationml.x2006.main.CTCommentAuthor;
6363 import org.openxmlformats.schemas.presentationml.x2006.main.CTPicture;
6464 import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdList;
6565 import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
135135 }
136136
137137 // comments (if present)
138 XSLFComments comments = slide.getComments();
138 List<XSLFComment> comments = slide.getComments();
139139 if (comments != null) {
140140 StringBuilder authorStringBuilder = new StringBuilder();
141 for (int i = 0; i < comments.getNumberOfComments(); i++) {
141 for (int i = 0; i < comments.size(); i++) {
142142 authorStringBuilder.setLength(0);
143 CTComment comment = comments.getCommentAt(i);
143 XSLFComment comment = comments.get(i);
144144 xhtml.startElement("p", "class", "slide-comment");
145 CTCommentAuthor cta = commentAuthors.getAuthorById(comment.getAuthorId());
146 if (cta != null) {
147 if (cta.getName() != null) {
148 authorStringBuilder.append(cta.getName());
149 }
150 if (cta.getInitials() != null) {
145 if (comment.getAuthor() != null) {
146 authorStringBuilder.append(comment.getAuthor());
147 }
148 if (comment.getAuthorInitials() != null) {
151149 if (authorStringBuilder.length() > 0) {
152150 authorStringBuilder.append(" ");
153151 }
154 authorStringBuilder.append("("+cta.getInitials()+")");
152 authorStringBuilder.append("("+comment.getAuthorInitials()+")");
155153 }
156154 if (comment.getText() != null && authorStringBuilder.length() > 0) {
157155 authorStringBuilder.append(" - ");
161159 xhtml.characters(authorStringBuilder.toString());
162160 xhtml.endElement("b");
163161 }
164 }
162
165163 xhtml.characters(comment.getText());
166164 xhtml.endElement("p");
167165 }
1717
1818 import java.io.IOException;
1919 import java.io.InputStream;
20 import java.util.ArrayList;
21 import java.util.HashMap;
2220 import java.util.List;
2321 import java.util.Locale;
2422
25 import org.apache.poi.POIXMLTextExtractor;
23 import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
2624 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
2725 import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
2826 import org.apache.poi.openxml4j.opc.OPCPackage;
2927 import org.apache.poi.openxml4j.opc.PackagePart;
30 import org.apache.poi.openxml4j.opc.PackagePartName;
31 import org.apache.poi.openxml4j.opc.PackageRelationship;
32 import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
33 import org.apache.poi.openxml4j.opc.PackagingURIHelper;
34 import org.apache.poi.openxml4j.opc.TargetMode;
3528 import org.apache.poi.xssf.binary.XSSFBCommentsTable;
3629 import org.apache.poi.xssf.binary.XSSFBSharedStringsTable;
3730 import org.apache.poi.xssf.binary.XSSFBSheetHandler;
3932 import org.apache.poi.xssf.eventusermodel.XSSFBReader;
4033 import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
4134 import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
42 import org.apache.poi.xssf.usermodel.XSSFDrawing;
43 import org.apache.poi.xssf.usermodel.XSSFRelation;
4435 import org.apache.poi.xssf.usermodel.XSSFShape;
45 import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
4636 import org.apache.tika.exception.TikaException;
4737 import org.apache.tika.metadata.Metadata;
4838 import org.apache.tika.metadata.TikaCoreProperties;
5040 import org.apache.tika.parser.ParseContext;
5141 import org.apache.tika.sax.XHTMLContentHandler;
5242 import org.apache.xmlbeans.XmlException;
53 import org.openxmlformats.schemas.drawingml.x2006.main.CTHyperlink;
54 import org.openxmlformats.schemas.drawingml.x2006.main.CTNonVisualDrawingProps;
55 import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShape;
56 import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShapeNonVisual;
5743 import org.xml.sax.ContentHandler;
5844 import org.xml.sax.SAXException;
5945
117103 addDrawingHyperLinks(sheetPart);
118104 sheetParts.add(sheetPart);
119105
120 SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(config.getIncludeHeadersAndFooters(), xhtml);
106 SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(config, xhtml);
121107 XSSFBCommentsTable comments = iter.getXSSFBSheetComments();
122108
123109 // Start, and output the sheet name
165151 }
166152 }
167153
168 private void extractHyperLinks(PackagePart sheetPart, XHTMLContentHandler xhtml) throws SAXException {
169 try {
170 for (PackageRelationship rel : sheetPart.getRelationshipsByType(XSSFRelation.SHEET_HYPERLINKS.getRelation())) {
171 xhtml.startElement("a", "href", rel.getTargetURI().toString());
172 xhtml.characters(rel.getTargetURI().toString());
173 xhtml.endElement("a");
174 }
175 } catch (InvalidFormatException e) {
176 //swallow
177 }
178 }
179
180 private void processShapes(List<XSSFShape> shapes, XHTMLContentHandler xhtml) throws SAXException {
181 if (shapes == null) {
182 return;
183 }
184 for (XSSFShape shape : shapes) {
185 if (shape instanceof XSSFSimpleShape) {
186 String sText = ((XSSFSimpleShape) shape).getText();
187 if (sText != null && sText.length() > 0) {
188 xhtml.element("p", sText);
189 }
190 extractHyperLinksFromShape(((XSSFSimpleShape)shape).getCTShape(), xhtml);
191 }
192 XSSFDrawing drawing = shape.getDrawing();
193 if (drawing != null) {
194 //dump diagram data
195 handleGeneralTextContainingPart(
196 AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA,
197 "diagram-data",
198 drawing.getPackagePart(),
199 metadata,
200 new OOXMLWordAndPowerPointTextHandler(
201 new OOXMLTikaBodyPartHandler(xhtml),
202 new HashMap<String, String>()//empty
203 )
204 );
205 //dump chart data
206 handleGeneralTextContainingPart(
207 XSSFRelation.CHART.getRelation(),
208 "chart",
209 drawing.getPackagePart(),
210 metadata,
211 new OOXMLWordAndPowerPointTextHandler(
212 new OOXMLTikaBodyPartHandler(xhtml),
213 new HashMap<String, String>()//empty
214 )
215 );
216 }
217 }
218 }
219
220 private void extractHyperLinksFromShape(CTShape ctShape, XHTMLContentHandler xhtml) throws SAXException {
221
222 if (ctShape == null)
223 return;
224
225 CTShapeNonVisual nvSpPR = ctShape.getNvSpPr();
226 if (nvSpPR == null)
227 return;
228
229 CTNonVisualDrawingProps cNvPr = nvSpPR.getCNvPr();
230 if (cNvPr == null)
231 return;
232
233 CTHyperlink ctHyperlink = cNvPr.getHlinkClick();
234 if (ctHyperlink == null)
235 return;
236
237 String url = drawingHyperlinks.get(ctHyperlink.getId());
238 if (url != null) {
239 xhtml.startElement("a", "href", url);
240 xhtml.characters(url);
241 xhtml.endElement("a");
242 }
243
244 CTHyperlink ctHoverHyperlink = cNvPr.getHlinkHover();
245 if (ctHoverHyperlink == null)
246 return;
247
248 url = drawingHyperlinks.get(ctHoverHyperlink.getId());
249 if (url != null) {
250 xhtml.startElement("a", "href", url);
251 xhtml.characters(url);
252 xhtml.endElement("a");
253 }
254
255 }
256154
257155 private void processSheet(
258156 SheetContentsHandler sheetContentsExtractor,
273171 );
274172 xssfbSheetHandler.parse();
275173 }
276
277 /**
278 * In Excel files, sheets have things embedded in them,
279 * and sheet drawings which have the images
280 */
281 @Override
282 protected List<PackagePart> getMainDocumentParts() throws TikaException {
283 List<PackagePart> parts = new ArrayList<PackagePart>();
284 for (PackagePart part : sheetParts) {
285 // Add the sheet
286 parts.add(part);
287
288 // If it has drawings, return those too
289 try {
290 for (PackageRelationship rel : part.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation())) {
291 if (rel.getTargetMode() == TargetMode.INTERNAL) {
292 PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
293 parts.add(rel.getPackage().getPart(relName));
294 }
295 }
296 for (PackageRelationship rel : part.getRelationshipsByType(XSSFRelation.VML_DRAWINGS.getRelation())) {
297 if (rel.getTargetMode() == TargetMode.INTERNAL) {
298 PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
299 parts.add(rel.getPackage().getPart(relName));
300 }
301 }
302 } catch (InvalidFormatException e) {
303 throw new TikaException("Broken OOXML file", e);
304 }
305 }
306
307 //add main document so that macros can be extracted
308 //by AbstractOOXMLExtractor
309 for (PackagePart part : extractor.getPackage().
310 getPartsByRelationshipType(PackageRelationshipTypes.CORE_DOCUMENT)) {
311 parts.add(part);
312 }
313
314 return parts;
315 }
316174 }
1515 */
1616 package org.apache.tika.parser.microsoft.ooxml;
1717
18 import javax.xml.parsers.SAXParser;
18
1919 import java.io.IOException;
2020 import java.io.InputStream;
2121 import java.util.ArrayList;
2222 import java.util.HashMap;
23 import java.util.HashSet;
2324 import java.util.List;
2425 import java.util.Locale;
2526 import java.util.Map;
26
27 import org.apache.poi.POIXMLDocument;
28 import org.apache.poi.POIXMLTextExtractor;
27 import java.util.Set;
28
2929 import org.apache.poi.hssf.extractor.ExcelExtractor;
30 import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
3031 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
3132 import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
3233 import org.apache.poi.openxml4j.opc.OPCPackage;
3839 import org.apache.poi.openxml4j.opc.TargetMode;
3940 import org.apache.poi.ss.usermodel.DataFormatter;
4041 import org.apache.poi.ss.usermodel.HeaderFooter;
42 import org.apache.poi.ss.util.CellReference;
4143 import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
4244 import org.apache.poi.xssf.eventusermodel.XSSFReader;
4345 import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
5658 import org.apache.tika.metadata.TikaCoreProperties;
5759 import org.apache.tika.metadata.TikaMetadataKeys;
5860 import org.apache.tika.parser.ParseContext;
61 import org.apache.tika.parser.microsoft.OfficeParserConfig;
5962 import org.apache.tika.parser.microsoft.TikaExcelDataFormatter;
6063 import org.apache.tika.sax.OfflineContentHandler;
6164 import org.apache.tika.sax.XHTMLContentHandler;
65 import org.apache.tika.utils.XMLReaderUtils;
6266 import org.apache.xmlbeans.XmlException;
6367 import org.openxmlformats.schemas.drawingml.x2006.main.CTHyperlink;
6468 import org.openxmlformats.schemas.drawingml.x2006.main.CTNonVisualDrawingProps;
6670 import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShapeNonVisual;
6771 import org.xml.sax.Attributes;
6872 import org.xml.sax.ContentHandler;
69 import org.xml.sax.InputSource;
7073 import org.xml.sax.Locator;
7174 import org.xml.sax.SAXException;
72 import org.xml.sax.XMLReader;
7375 import org.xml.sax.helpers.DefaultHandler;
7476
7577 public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
144146 }
145147
146148 while (iter.hasNext()) {
147
148 SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(config.getIncludeHeadersAndFooters(), xhtml);
149 SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(config, xhtml);
149150 PackagePart sheetPart = null;
150151 try (InputStream stream = iter.next()) {
151152 sheetPart = iter.getSheetPart();
195196
196197 //consider adding this back to POI
197198 try (InputStream wbData = xssfReader.getWorkbookData()) {
198 SAXParser parser = parseContext.getSAXParser();
199 parser.parse(wbData, new OfflineContentHandler(new AbsPathExtractorHandler()));
199 XMLReaderUtils.parseSAX(wbData, new OfflineContentHandler(new AbsPathExtractorHandler()), parseContext);
200200 } catch (InvalidFormatException|TikaException e) {
201201 //swallow
202202 }
228228 }
229229
230230
231 private void extractHyperLinks(PackagePart sheetPart, XHTMLContentHandler xhtml) throws SAXException {
231 protected void extractHyperLinks(PackagePart sheetPart, XHTMLContentHandler xhtml) throws SAXException {
232232 try {
233233 for (PackageRelationship rel : sheetPart.getRelationshipsByType(XSSFRelation.SHEET_HYPERLINKS.getRelation())) {
234234 xhtml.startElement("a", "href", rel.getTargetURI().toString());
249249 }
250250 }
251251
252 private void processShapes(List<XSSFShape> shapes, XHTMLContentHandler xhtml) throws SAXException {
252 protected void processShapes(List<XSSFShape> shapes, XHTMLContentHandler xhtml) throws SAXException {
253253 if (shapes == null) {
254254 return;
255255 }
256 //We don't currently have an obvious way to get drawings
257 //directly from sheetIter. Therefore, we grab the shapes and process those.
258 //To get the diagrams and charts, we need to get the parent drawing for each
259 //shape, and we need to make sure that we only process each parent drawing once!
260 //SEE TIKA-2703 TODO: add unit test
261 Set<String> seenParentDrawings = new HashSet<>();
256262 for (XSSFShape shape : shapes) {
257263 if (shape instanceof XSSFSimpleShape) {
258264 String sText = ((XSSFSimpleShape) shape).getText();
261267 }
262268 extractHyperLinksFromShape(((XSSFSimpleShape)shape).getCTShape(), xhtml);
263269 }
264 XSSFDrawing drawing = shape.getDrawing();
265 if (drawing != null) {
266 //dump diagram data
267 handleGeneralTextContainingPart(
268 AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA,
269 "diagram-data",
270 drawing.getPackagePart(),
271 metadata,
272 new OOXMLWordAndPowerPointTextHandler(
273 new OOXMLTikaBodyPartHandler(xhtml),
274 new HashMap<String, String>()//empty
275 )
276 );
277 //dump chart data
278 handleGeneralTextContainingPart(
279 XSSFRelation.CHART.getRelation(),
280 "chart",
281 drawing.getPackagePart(),
282 metadata,
283 new OOXMLWordAndPowerPointTextHandler(
284 new OOXMLTikaBodyPartHandler(xhtml),
285 new HashMap<String, String>()//empty
286 )
287 );
270
271 XSSFDrawing parentDrawing = shape.getDrawing();
272 if (parentDrawing != null) {
273 if (! seenParentDrawings.contains(parentDrawing.getPackagePart().getPartName().toString())) {
274 //dump diagram data
275 handleGeneralTextContainingPart(
276 AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA,
277 "diagram-data",
278 parentDrawing.getPackagePart(),
279 metadata,
280 new OOXMLWordAndPowerPointTextHandler(
281 new OOXMLTikaBodyPartHandler(xhtml),
282 new HashMap<String, String>()//empty
283 )
284 );
285 //dump chart data
286 handleGeneralTextContainingPart(
287 XSSFRelation.CHART.getRelation(),
288 "chart",
289 parentDrawing.getPackagePart(),
290 metadata,
291 new OOXMLWordAndPowerPointTextHandler(
292 new OOXMLTikaBodyPartHandler(xhtml),
293 new HashMap<String, String>()//empty
294 )
295 );
296 }
297 seenParentDrawings.add(parentDrawing.getPackagePart().getPartName().toString());
288298 }
289299 }
290300 }
333343 ReadOnlySharedStringsTable strings,
334344 InputStream sheetInputStream)
335345 throws IOException, SAXException {
336 InputSource sheetSource = new InputSource(sheetInputStream);
337346 try {
338 XMLReader sheetParser = parseContext.getXMLReader();
347
339348 XSSFSheetInterestingPartsCapturer handler =
340349 new XSSFSheetInterestingPartsCapturer(new XSSFSheetXMLHandler(
341350 styles, comments, strings, sheetContentsExtractor, formatter, false));
342 sheetParser.setContentHandler(handler);
343 sheetParser.parse(sheetSource);
351 XMLReaderUtils.parseSAX(sheetInputStream, handler, parseContext);
344352 sheetInputStream.close();
345353
346354 if (handler.hasProtection) {
397405 protected static class SheetTextAsHTML implements SheetContentsHandler {
398406 private XHTMLContentHandler xhtml;
399407 private final boolean includeHeadersFooters;
408 private final boolean includeMissingRows;
400409 protected List<String> headers;
401410 protected List<String> footers;
402
403 protected SheetTextAsHTML(boolean includeHeaderFooters, XHTMLContentHandler xhtml) {
404 this.includeHeadersFooters = includeHeaderFooters;
411 private int lastSeenRow = -1;
412 private int lastSeenCol = -1;
413
414 protected SheetTextAsHTML(OfficeParserConfig config, XHTMLContentHandler xhtml) {
415 this.includeHeadersFooters = config.getIncludeHeadersAndFooters();
416 this.includeMissingRows = config.getIncludeMissingRows();
405417 this.xhtml = xhtml;
406418 headers = new ArrayList<String>();
407419 footers = new ArrayList<String>();
409421
410422 public void startRow(int rowNum) {
411423 try {
424 // If requested, output a single empty row for each missing row
425 if (includeMissingRows && rowNum > (lastSeenRow+1)) {
426 for (int rn=lastSeenRow+1; rn<rowNum; rn++) {
427 xhtml.startElement("tr");
428 xhtml.startElement("td");
429 xhtml.endElement("td");
430 xhtml.endElement("tr");
431 }
432 }
433
434 // Start the new row
412435 xhtml.startElement("tr");
436 lastSeenCol = -1; lastSeenRow = rowNum; // also track the row, or the same gap would be re-padded on every later row
413437 } catch (SAXException e) {
414438 }
415439 }
423447
424448 public void cell(String cellRef, String formattedValue, XSSFComment comment) {
425449 try {
450 // Handle any missing cells
451 int colNum = (new CellReference(cellRef)).getCol();
452 for (int cn=lastSeenCol+1; cn<colNum; cn++) {
453 xhtml.startElement("td");
454 xhtml.endElement("td");
455 }
456 lastSeenCol = colNum;
457
458 // Start this cell
426459 xhtml.startElement("td");
427460
428461 // Main cell contents
489522 * Captures information on interesting tags, whilst
490523 * delegating the main work to the formatting handler
491524 */
492 protected static class XSSFSheetInterestingPartsCapturer implements ContentHandler {
525 protected static class XSSFSheetInterestingPartsCapturer extends DefaultHandler {
493526 private ContentHandler delegate;
494527 private boolean hasProtection = false;
495528
1616
1717 package org.apache.tika.parser.microsoft.ooxml.xps;
1818
19 import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
1920 import org.apache.commons.io.IOUtils;
2021 import org.apache.commons.io.input.CloseShieldInputStream;
21 import org.apache.poi.POIXMLDocument;
22 import org.apache.poi.POIXMLTextExtractor;
22 import org.apache.poi.ooxml.POIXMLDocument;
23 import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
2324 import org.apache.poi.openxml4j.opc.PackagePart;
2425 import org.apache.poi.openxml4j.opc.PackageRelationship;
2526 import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
2829 import org.apache.tika.exception.TikaException;
2930 import org.apache.tika.extractor.EmbeddedDocumentUtil;
3031 import org.apache.tika.metadata.Metadata;
31 import org.apache.tika.metadata.TikaCoreProperties;
3232 import org.apache.tika.parser.ParseContext;
3333 import org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor;
3434 import org.apache.tika.sax.EmbeddedContentHandler;
3535 import org.apache.tika.sax.OfflineContentHandler;
3636 import org.apache.tika.sax.XHTMLContentHandler;
37 import org.apache.tika.utils.ExceptionUtils;
37 import org.apache.tika.utils.XMLReaderUtils;
3838 import org.xml.sax.Attributes;
3939 import org.xml.sax.SAXException;
4040 import org.xml.sax.helpers.DefaultHandler;
4646 import java.util.HashMap;
4747 import java.util.List;
4848 import java.util.Map;
49 import java.util.zip.ZipEntry;
5049
5150 public class XPSExtractorDecorator extends AbstractOOXMLExtractor {
5251
127126
128127 private void handleDocuments(PackageRelationship packageRelationship,
129128 XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
130
131129 try (InputStream stream = pkg.getPart(packageRelationship).getInputStream()) {
132 context.getSAXParser().parse(
130 XMLReaderUtils.parseSAX(
133131 new CloseShieldInputStream(stream),
134132 new OfflineContentHandler(new EmbeddedContentHandler(
135 new FixedDocSeqHandler(xhtml))));
133 new FixedDocSeqHandler(xhtml))),
134 context);
136135 }
137136 }
138137
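Several hunks in this diff replace per-call parser creation (context.getSAXParser().parse(...)) with XMLReaderUtils.parseSAX(...), which borrows a SAXParser from a shared pool. A minimal sketch of the new call shape, assuming only the Tika 1.19 API visible in these hunks (a DOM counterpart, XMLReaderUtils.buildDOM(is, context), appears in the PDF parser hunk further down):

    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.sax.OfflineContentHandler;
    import org.apache.tika.utils.XMLReaderUtils;
    import org.xml.sax.helpers.DefaultHandler;

    import java.io.InputStream;

    public class PooledSaxParseSketch {
        // Parse XML with a pooled SAXParser; the pool is sized via
        // XMLReaderUtils.setPoolSize(n), ideally matching the thread count.
        static void parse(InputStream is, DefaultHandler handler, ParseContext context)
                throws Exception {
            // OfflineContentHandler keeps the parse from resolving external entities.
            XMLReaderUtils.parseSAX(is, new OfflineContentHandler(handler), context);
        }
    }

Reusing pooled parsers avoids rebuilding a SAXParser for every document, which is the point of this repeated migration.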
179178 String zipPath = (docRef.startsWith("/") ? docRef.substring(1) : docRef);
180179 if (pkg instanceof ZipPackage) {
181180 try (InputStream stream = getZipStream(zipPath, pkg)) {
182 context.getSAXParser().parse(
181 XMLReaderUtils.parseSAX(
183182 new CloseShieldInputStream(stream),
184183 new OfflineContentHandler(new EmbeddedContentHandler(
185 new PageContentPartHandler(relativeRoot, xhtml))));
184 new PageContentPartHandler(relativeRoot, xhtml))),
185 context);
186186
187187 } catch (IOException | TikaException e) {
188188 throw new SAXException(new TikaException("IOException trying to read: " + docRef));
226226 pagePath = pagePath.substring(1);
227227 }
228228 try (InputStream stream = getZipStream(pagePath, pkg)) {
229 context.getSAXParser().parse(
229 XMLReaderUtils.parseSAX(
230230 new CloseShieldInputStream(stream),
231231 new OfflineContentHandler(
232232 new XPSPageContentHandler(xhtml, embeddedImages)
233 )
233 ),
234 context
234235 );
235236 } catch (TikaException | IOException e) {
236237 throw new SAXException(e);
244245 private static InputStream getZipStream(String zipPath, ZipPackage zipPackage) throws IOException, TikaException {
245246 String targPath = (zipPath.length() > 1 && zipPath.startsWith("/") ? zipPath.substring(1) : zipPath);
246247 ZipEntrySource zipEntrySource = zipPackage.getZipArchive();
247 Enumeration<? extends ZipEntry> zipEntryEnumeration = zipEntrySource.getEntries();
248 ZipEntry zipEntry = null;
248 Enumeration<? extends ZipArchiveEntry> zipEntryEnumeration = zipEntrySource.getEntries();
249 ZipArchiveEntry zipEntry = null;
249250 while (zipEntryEnumeration.hasMoreElements()) {
250 ZipEntry ze = zipEntryEnumeration.nextElement();
251 ZipArchiveEntry ze = zipEntryEnumeration.nextElement();
251252 if (ze.getName().equals(targPath)) {
252253 zipEntry = ze;
253254 break;
1616
1717 package org.apache.tika.parser.microsoft.ooxml.xps;
1818
19 import org.apache.poi.POIXMLDocument;
20 import org.apache.poi.POIXMLProperties;
21 import org.apache.poi.POIXMLTextExtractor;
19
20 import org.apache.poi.ooxml.POIXMLDocument;
21 import org.apache.poi.ooxml.POIXMLProperties;
22 import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
2223 import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
2324 import org.apache.poi.openxml4j.opc.OPCPackage;
2425 import org.apache.xmlbeans.XmlException;
1919 import java.io.IOException;
2020 import java.util.Date;
2121
22 import org.apache.poi.POIXMLDocument;
23 import org.apache.poi.POIXMLProperties;
24 import org.apache.poi.POIXMLTextExtractor;
22 import org.apache.poi.ooxml.POIXMLDocument;
23 import org.apache.poi.ooxml.POIXMLProperties;
24 import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
2525 import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
2626 import org.apache.poi.openxml4j.opc.OPCPackage;
2727 import org.apache.tika.parser.microsoft.ooxml.OOXMLWordAndPowerPointTextHandler;
2525 import java.util.Map;
2626
2727 import org.apache.commons.io.input.CloseShieldInputStream;
28 import org.apache.poi.POIXMLDocument;
29 import org.apache.poi.POIXMLProperties;
30 import org.apache.poi.POIXMLTextExtractor;
28 import org.apache.poi.ooxml.POIXMLDocument;
29 import org.apache.poi.ooxml.POIXMLProperties;
30 import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
31 import org.apache.poi.ooxml.util.SAXHelper;
3132 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
3233 import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
3334 import org.apache.poi.openxml4j.opc.OPCPackage;
3435 import org.apache.poi.openxml4j.opc.PackagePart;
3536 import org.apache.poi.openxml4j.opc.PackageRelationship;
3637 import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
37 import org.apache.poi.util.SAXHelper;
3838 import org.apache.poi.xwpf.usermodel.XWPFNumbering;
3939 import org.apache.poi.xwpf.usermodel.XWPFRelation;
4040 import org.apache.tika.parser.microsoft.ooxml.OOXMLWordAndPowerPointTextHandler;
2626 import org.apache.tika.parser.ParseContext;
2727 import org.apache.tika.parser.microsoft.ooxml.OOXMLWordAndPowerPointTextHandler;
2828 import org.apache.tika.sax.OfflineContentHandler;
29 import org.apache.tika.utils.XMLReaderUtils;
2930 import org.xml.sax.Attributes;
3031 import org.xml.sax.SAXException;
3132 import org.xml.sax.helpers.DefaultHandler;
33
34 import javax.xml.parsers.SAXParser;
3235
3336 /**
3437 * For Tika, all we need (so far) is a mapping between styleId and a style's name.
5760 }
5861
5962 private void onDocumentLoad(ParseContext parseContext, InputStream stream) throws TikaException, IOException, SAXException {
60 parseContext.getSAXParser().parse(stream,
61 new OfflineContentHandler(new StylesStripper()));
63 XMLReaderUtils.parseSAX(stream,
64 new OfflineContentHandler(new StylesStripper()), parseContext);
6265 }
6366
6467 /**
3030 import org.apache.tika.sax.EmbeddedContentHandler;
3131 import org.apache.tika.sax.OfflineContentHandler;
3232 import org.apache.tika.sax.XHTMLContentHandler;
33 import org.apache.tika.utils.XMLReaderUtils;
3334 import org.xml.sax.ContentHandler;
3435 import org.xml.sax.SAXException;
36
37 import javax.xml.parsers.SAXParser;
3538
3639
3740 public class Word2006MLParser extends AbstractOfficeParser {
5457 new XHTMLContentHandler(handler, metadata);
5558
5659 xhtml.startDocument();
57
5860 try {
59 context.getSAXParser().parse(
61 XMLReaderUtils.parseSAX(
6062 new CloseShieldInputStream(stream),
6163 new OfflineContentHandler(new EmbeddedContentHandler(
62 new Word2006MLDocHandler(xhtml, metadata, context))));
64 new Word2006MLDocHandler(xhtml, metadata, context))),
65 context);
6366 } catch (SAXException e) {
6467 throw new TikaException("XML parse error", e);
65 } finally {
68 }
6669 xhtml.endDocument();
67 }
6870 }
69
7071 }
3434 import org.apache.tika.sax.TaggedContentHandler;
3535 import org.apache.tika.sax.TeeContentHandler;
3636 import org.apache.tika.sax.XHTMLContentHandler;
37 import org.apache.tika.utils.XMLReaderUtils;
3738 import org.xml.sax.Attributes;
3839 import org.xml.sax.ContentHandler;
3940 import org.xml.sax.SAXException;
4041 import org.xml.sax.helpers.AttributesImpl;
42
43 import javax.xml.parsers.SAXParser;
4144
4245
4346 public abstract class AbstractXML2003Parser extends AbstractParser {
9497
9598 TaggedContentHandler tagged = new TaggedContentHandler(xhtml);
9699 try {
97 context.getSAXParser().parse(
100 XMLReaderUtils.parseSAX(
98101 new CloseShieldInputStream(stream),
99102 new OfflineContentHandler(new EmbeddedContentHandler(
100 getContentHandler(tagged, metadata, context))));
103 getContentHandler(tagged, metadata, context))),
104 context);
101105 } catch (SAXException e) {
102106 tagged.throwIfCauseOf(e);
103107 throw new TikaException("XML parse error", e);
1515 */
1616 package org.apache.tika.parser.mp3;
1717
18 import org.apache.poi.util.IOUtils;
19
20 import java.io.EOFException;
1821 import java.io.IOException;
1922 import java.io.InputStream;
2023 import java.io.PushbackInputStream;
156159 {
157160 if (currentHeader != null)
158161 {
159 skipStream(in, currentHeader.getLength() - HEADER_SIZE);
162 long toSkip = currentHeader.getLength() - HEADER_SIZE;
163 long skipped = IOUtils.skipFully(in, toSkip);
164 if (skipped < toSkip) {
165 throw new EOFException("EOF: tried to skip "+toSkip +
166 " but could only skip "+skipped);
167 }
160168 currentHeader = null;
161169 return true;
162170 }
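InputStream.skip() is allowed to skip fewer bytes than requested, so the replacement above leans on POI's IOUtils.skipFully and treats a short skip as truncation. The same guard in isolation, assuming only org.apache.poi.util.IOUtils:

    import org.apache.poi.util.IOUtils;

    import java.io.EOFException;
    import java.io.IOException;
    import java.io.InputStream;

    public class SkipFullySketch {
        // Skips exactly toSkip bytes or throws, instead of silently under-skipping.
        static void skipOrThrow(InputStream in, long toSkip) throws IOException {
            long skipped = IOUtils.skipFully(in, toSkip);
            if (skipped < toSkip) {
                throw new EOFException("tried to skip " + toSkip
                        + " but could only skip " + skipped);
            }
        }
    }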
265273 unread(field.toArray());
266274 }
267275
268 /**
269 * Skips the given number of bytes from the specified input stream.
270 *
271 * @param in the input stream
272 * @param count the number of bytes to skip
273 * @throws IOException if an IO error occurs
274 */
275 private static void skipStream(InputStream in, long count)
276 throws IOException
277 {
278 long size = count;
279 long skipped = 0;
280 while (size > 0 && skipped >= 0)
281 {
282 skipped = in.skip(size);
283 if (skipped != -1)
284 {
285 size -= skipped;
286 }
287 }
288 }
289
290276 /**
291277 * Calculates the bit rate based on the given parameters.
292278 *
427413 * index. E.g. ''from'' = 0, ''to'' = 3 will return the value of the
428414 * first 4 bits.
429415 *
430 * @param the from index
416 * @param from the from index
431417 * @param to the to index
432418 * @return the value of this group of bits
433419 */
7070 private String pageSegMode = "1";
7171
7272 // Minimum file size to submit file to ocr.
73 private int minFileSizeToOcr = 0;
73 private long minFileSizeToOcr = 0;
7474
7575 // Maximum file size to submit file to ocr.
76 private int maxFileSizeToOcr = Integer.MAX_VALUE;
76 private long maxFileSizeToOcr = Integer.MAX_VALUE;
7777
7878 // Maximum time (seconds) to wait for the ocring process termination
7979 private int timeout = 120;
321321 return preserveInterwordSpacing;
322322 }
323323 /**
324 * @see #setMinFileSizeToOcr(int minFileSizeToOcr)
325 */
326 public int getMinFileSizeToOcr() {
324 * @see #setMinFileSizeToOcr(long minFileSizeToOcr)
325 */
326 public long getMinFileSizeToOcr() {
327327 return minFileSizeToOcr;
328328 }
329329
331331 * Set minimum file size to submit file to ocr.
332332 * Default is 0.
333333 */
334 public void setMinFileSizeToOcr(int minFileSizeToOcr) {
334 public void setMinFileSizeToOcr(long minFileSizeToOcr) {
335335 this.minFileSizeToOcr = minFileSizeToOcr;
336336 }
337337
338338 /**
339 * @see #setMaxFileSizeToOcr(int maxFileSizeToOcr)
340 */
341 public int getMaxFileSizeToOcr() {
339 * @see #setMaxFileSizeToOcr(long maxFileSizeToOcr)
340 */
341 public long getMaxFileSizeToOcr() {
342342 return maxFileSizeToOcr;
343343 }
344344
346346 * Set maximum file size to submit file to ocr.
347347 * Default is Integer.MAX_VALUE.
348348 */
349 public void setMaxFileSizeToOcr(int maxFileSizeToOcr) {
349 public void setMaxFileSizeToOcr(long maxFileSizeToOcr) {
350350 this.maxFileSizeToOcr = maxFileSizeToOcr;
351351 }
352352
629629 * @param defaultMissing default parameter to use.
630630 * @return the value.
631631 */
632 private long getProp(Properties properties, String property, long defaultMissing) {
633 String p = properties.getProperty(property);
634 if (p == null || p.isEmpty()) {
635 return defaultMissing;
636 }
637 try {
638 return Long.parseLong(p);
639 } catch (Throwable ex) {
640 throw new RuntimeException(String.format(Locale.ROOT, "Cannot parse TesseractOCRConfig variable %s, invalid long value",
641 property), ex);
642 }
643 }
644
645
646 /**
647 * Get property from the properties file passed in.
648 *
649 * @param properties properties file to read from.
650 * @param property the property to fetch.
651 * @param defaultMissing default parameter to use.
652 * @return the value.
653 */
632654 private String getProp(Properties properties, String property, String defaultMissing) {
633655 return properties.getProperty(property, defaultMissing);
634656 }
1515 */
1616 package org.apache.tika.parser.ocr;
1717
18 import static java.nio.charset.StandardCharsets.UTF_8;
18 import org.apache.commons.exec.CommandLine;
19 import org.apache.commons.exec.DefaultExecutor;
20 import org.apache.commons.exec.PumpStreamHandler;
21 import org.apache.commons.io.FileUtils;
22 import org.apache.commons.io.IOUtils;
23 import org.apache.commons.lang.SystemUtils;
24 import org.apache.tika.config.Field;
25 import org.apache.tika.config.Initializable;
26 import org.apache.tika.config.InitializableProblemHandler;
27 import org.apache.tika.config.Param;
28 import org.apache.tika.exception.TikaConfigException;
29 import org.apache.tika.exception.TikaException;
30 import org.apache.tika.io.TemporaryResources;
31 import org.apache.tika.io.TikaInputStream;
32 import org.apache.tika.metadata.Metadata;
33 import org.apache.tika.mime.MediaType;
34 import org.apache.tika.mime.MediaTypeRegistry;
35 import org.apache.tika.parser.AbstractParser;
36 import org.apache.tika.parser.CompositeParser;
37 import org.apache.tika.parser.ParseContext;
38 import org.apache.tika.parser.Parser;
39 import org.apache.tika.parser.external.ExternalParser;
40 import org.apache.tika.parser.image.ImageParser;
41 import org.apache.tika.parser.image.TiffParser;
42 import org.apache.tika.parser.jpeg.JpegParser;
43 import org.apache.tika.sax.OfflineContentHandler;
44 import org.apache.tika.sax.XHTMLContentHandler;
45 import org.apache.tika.utils.XMLReaderUtils;
46 import org.slf4j.Logger;
47 import org.slf4j.LoggerFactory;
48 import org.xml.sax.Attributes;
49 import org.xml.sax.ContentHandler;
50 import org.xml.sax.SAXException;
51 import org.xml.sax.helpers.DefaultHandler;
1952
2053 import javax.imageio.ImageIO;
21 import javax.xml.parsers.SAXParser;
2254 import java.awt.Image;
2355 import java.awt.image.BufferedImage;
2456 import java.io.ByteArrayOutputStream;
4981 import java.util.concurrent.TimeUnit;
5082 import java.util.concurrent.TimeoutException;
5183
52 import org.apache.commons.exec.CommandLine;
53 import org.apache.commons.exec.DefaultExecutor;
54 import org.apache.commons.exec.PumpStreamHandler;
55 import org.apache.commons.io.FileUtils;
56 import org.apache.commons.io.IOUtils;
57 import org.apache.commons.lang.SystemUtils;
58 import org.apache.tika.config.Initializable;
59 import org.apache.tika.config.InitializableProblemHandler;
60 import org.apache.tika.config.Param;
61 import org.apache.tika.exception.TikaConfigException;
62 import org.apache.tika.exception.TikaException;
63 import org.apache.tika.io.TemporaryResources;
64 import org.apache.tika.io.TikaInputStream;
65 import org.apache.tika.metadata.Metadata;
66 import org.apache.tika.mime.MediaType;
67 import org.apache.tika.mime.MediaTypeRegistry;
68 import org.apache.tika.parser.AbstractParser;
69 import org.apache.tika.parser.CompositeParser;
70 import org.apache.tika.parser.ParseContext;
71 import org.apache.tika.parser.Parser;
72 import org.apache.tika.parser.external.ExternalParser;
73 import org.apache.tika.parser.image.ImageParser;
74 import org.apache.tika.parser.image.TiffParser;
75 import org.apache.tika.parser.jpeg.JpegParser;
76 import org.apache.tika.sax.OfflineContentHandler;
77 import org.apache.tika.sax.XHTMLContentHandler;
78 import org.slf4j.Logger;
79 import org.slf4j.LoggerFactory;
80 import org.xml.sax.Attributes;
81 import org.xml.sax.ContentHandler;
82 import org.xml.sax.SAXException;
83 import org.xml.sax.helpers.DefaultHandler;
84 import static java.nio.charset.StandardCharsets.UTF_8;
8485
8586 /**
8687 * TesseractOCRParser powered by tesseract-ocr engine. To enable this parser,
104105
105106
106107 private static final long serialVersionUID = -8167538283213097265L;
107 private static final TesseractOCRConfig DEFAULT_CONFIG = new TesseractOCRConfig();
108108 private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
109109 new HashSet<>(Arrays.asList(new MediaType[]{
110110 MediaType.image("png"), MediaType.image("jpeg"), MediaType.image("tiff"),
111111 MediaType.image("bmp"), MediaType.image("gif"), MediaType.image("jp2"),
112112 MediaType.image("jpx"), MediaType.image("x-portable-pixmap")
113113 })));
114 private final TesseractOCRConfig defaultConfig = new TesseractOCRConfig();
115
114116 private static Map<String,Boolean> TESSERACT_PRESENT = new HashMap<>();
115117 private static Map<String,Boolean> IMAGE_MAGICK_PRESENT = new HashMap<>();
116118
118120 @Override
119121 public Set<MediaType> getSupportedTypes(ParseContext context) {
120122 // If Tesseract is installed, offer our supported image types
121 TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
123 TesseractOCRConfig config = context.get(TesseractOCRConfig.class, defaultConfig);
122124 if (hasTesseract(config)) {
123125 return SUPPORTED_TYPES;
124126 }
256258 @Override
257259 public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext parseContext)
258260 throws IOException, SAXException, TikaException {
259 TesseractOCRConfig config = parseContext.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
261 TesseractOCRConfig config = parseContext.get(TesseractOCRConfig.class, defaultConfig);
260262 // If Tesseract is not on the path with the current config, do not try to run OCR
261263 // getSupportedTypes shouldn't have listed us as handling it, so this should only
262264 // occur if someone directly calls this parser, not via DefaultParser or similar
469471 //by sending in a bogus tesseract path via a custom TesseractOCRConfig.
470472 //TODO: figure out how to solve that.
471473 if (! hasWarned()) {
472 if (hasTesseract(DEFAULT_CONFIG)) {
474 if (hasTesseract(defaultConfig)) {
473475 problemHandler.handleInitializableProblem(this.getClass().getName(),
474476 "Tesseract OCR is installed and will be automatically applied to image files unless\n" +
475477 "you've excluded the TesseractOCRParser from the default parser.\n"+
591593 if (parseContext == null) {
592594 parseContext = new ParseContext();
593595 }
594 SAXParser parser = parseContext.getSAXParser();
596
595597 xhtml.startElement("div", "class", "ocr");
596 parser.parse(is, new OfflineContentHandler(new HOCRPassThroughHandler(xhtml)));
598 XMLReaderUtils.parseSAX(is, new OfflineContentHandler(new HOCRPassThroughHandler(xhtml)), parseContext);
597599 xhtml.endElement("div");
600
598601 }
599602
600603 /**
693696 protected void warn() {
694697 HAS_WARNED = true;
695698 }
699
700 @Field
701 public void setTesseractPath(String tesseractPath) {
702 defaultConfig.setTesseractPath(tesseractPath);
703 }
704
705 @Field
706 public void setTessdataPath(String tessdataPath) {
707 defaultConfig.setTessdataPath(tessdataPath);
708 }
709
710 @Field
711 public void setLanguage(String language) {
712 defaultConfig.setLanguage(language);
713 }
714
715 @Field
716 public void setPageSegMode(String pageSegMode) {
717 defaultConfig.setPageSegMode(pageSegMode);
718 }
719
720 @Field
721 public void setMinFileSizeToOcr(long minFileSizeToOcr) {
722 defaultConfig.setMinFileSizeToOcr(minFileSizeToOcr);
723 }
724
725 @Field
726 public void setTimeout(int timeout) {
727 defaultConfig.setTimeout(timeout);
728 }
729
730 @Field
731 public void setOutputType(String outputType) {
732 defaultConfig.setOutputType(outputType);
733 }
734
735 @Field
736 public void setPreserveInterwordSpacing(boolean preserveInterwordSpacing) {
737 defaultConfig.setPreserveInterwordSpacing(preserveInterwordSpacing);
738 }
739
740 @Field
741 public void setEnableImageProcessing(int enableImageProcessing) {
742 defaultConfig.setEnableImageProcessing(enableImageProcessing);
743 }
744
745 @Field
746 public void setImageMagickPath(String imageMagickPath) {
747 defaultConfig.setImageMagickPath(imageMagickPath);
748 }
749
750 @Field
751 public void setDensity(int density) {
752 defaultConfig.setDensity(density);
753 }
754
755 @Field
756 public void setDepth(int depth) {
757 defaultConfig.setDepth(depth);
758 }
759
760 @Field
761 public void setColorspace(String colorspace) {
762 defaultConfig.setColorspace(colorspace);
763 }
764
765 @Field
766 public void setFilter(String filter) {
767 defaultConfig.setFilter(filter);
768 }
769
770 @Field
771 public void setResize(int resize) {
772 defaultConfig.setResize(resize);
773 }
774
775 @Field
776 public void setApplyRotation(boolean applyRotation) {
777 defaultConfig.setApplyRotation(applyRotation);
778 }
779
780 public TesseractOCRConfig getDefaultConfig() {
781 return defaultConfig;
782 }
696783 }
697784
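The new @Field setters simply forward into defaultConfig, which is what lets the Tesseract parser be configured declaratively (each @Field maps to a parser parameter) instead of only through a TesseractOCRConfig placed on the ParseContext. A programmatic sketch using only the setters added above; the values are illustrative:

    import org.apache.tika.parser.ocr.TesseractOCRConfig;
    import org.apache.tika.parser.ocr.TesseractOCRParser;

    public class TesseractDefaultsSketch {
        static TesseractOCRConfig configure() {
            TesseractOCRParser parser = new TesseractOCRParser();
            parser.setLanguage("eng");
            parser.setPageSegMode("1");
            parser.setTimeout(300);
            parser.setMinFileSizeToOcr(1_024L);
            // These defaults apply whenever no TesseractOCRConfig is set on the ParseContext.
            return parser.getDefaultConfig();
        }
    }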
1515 */
1616 package org.apache.tika.parser.odf;
1717
18 import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
19
20 import javax.xml.namespace.QName;
21 import javax.xml.parsers.SAXParser;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.util.BitSet;
25 import java.util.Collections;
26 import java.util.HashMap;
27 import java.util.Map;
28 import java.util.Set;
29 import java.util.Stack;
30
3118 import org.apache.commons.io.input.CloseShieldInputStream;
3219 import org.apache.tika.exception.TikaException;
3320 import org.apache.tika.metadata.Metadata;
3825 import org.apache.tika.sax.ElementMappingContentHandler.TargetElement;
3926 import org.apache.tika.sax.OfflineContentHandler;
4027 import org.apache.tika.sax.XHTMLContentHandler;
28 import org.apache.tika.utils.XMLReaderUtils;
4129 import org.xml.sax.Attributes;
4230 import org.xml.sax.ContentHandler;
4331 import org.xml.sax.SAXException;
4432 import org.xml.sax.helpers.AttributesImpl;
4533 import org.xml.sax.helpers.DefaultHandler;
34
35 import javax.xml.namespace.QName;
36 import java.io.IOException;
37 import java.io.InputStream;
38 import java.util.BitSet;
39 import java.util.Collections;
40 import java.util.HashMap;
41 import java.util.Map;
42 import java.util.Set;
43 import java.util.Stack;
44
45 import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
4646
4747 /**
4848 * Parser for ODF <code>content.xml</code> files.
595595 DefaultHandler dh = new OpenDocumentElementMappingContentHandler(handler, MAPPINGS);
596596
597597
598 SAXParser parser = context.getSAXParser();
599 parser.parse(
598 XMLReaderUtils.parseSAX(
600599 new CloseShieldInputStream(stream),
601600 new OfflineContentHandler(
602 new NSNormalizerContentHandler(dh)));
601 new NSNormalizerContentHandler(dh)),
602 context);
603603 }
604604
605605 }
2525 import java.util.HashSet;
2626 import java.util.Set;
2727 import java.util.zip.ZipEntry;
28 import java.util.zip.ZipException;
2829 import java.util.zip.ZipFile;
2930 import java.util.zip.ZipInputStream;
3031
174175
175176 private void handleZipStream(ZipInputStream zipStream, Metadata metadata, ParseContext context, EndDocumentShieldingContentHandler handler) throws IOException, TikaException, SAXException {
176177 ZipEntry entry = zipStream.getNextEntry();
177 while (entry != null) {
178 if (entry == null) {
179 throw new IOException("No entries found in ZipInputStream");
180 }
181 do {
178182 handleZipEntry(entry, zipStream, metadata, context, handler);
179183 entry = zipStream.getNextEntry();
180 }
184 } while (entry != null);
181185 }
182186
183187 private void handleZipFile(ZipFile zipFile, Metadata metadata,
1515 */
1616 package org.apache.tika.parser.pdf;
1717
18 import javax.xml.parsers.DocumentBuilder;
1918 import javax.xml.stream.XMLStreamException;
2019 import java.io.ByteArrayInputStream;
2120 import java.io.IOException;
6867 import org.apache.tika.parser.image.xmp.JempboxExtractor;
6968 import org.apache.tika.parser.ocr.TesseractOCRParser;
7069 import org.apache.tika.sax.XHTMLContentHandler;
70 import org.apache.tika.utils.XMLReaderUtils;
7171 import org.w3c.dom.Document;
7272 import org.xml.sax.ContentHandler;
73 import org.xml.sax.ErrorHandler;
7473 import org.xml.sax.SAXException;
7574
7675 /**
103102
104103 private static volatile boolean HAS_WARNED = false;
105104 private static final Object[] LOCK = new Object[0];
105 //the old "created" metadata. This will go away in Tika 2.0
106 private static final Property DEPRECATED_CREATED = Property.externalDate("created");
106107
107108 /**
108109 * Metadata key for giving the document password to the parser.
266267 addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
267268 addMetadata(metadata, "trapped", info.getTrapped());
268269 addMetadata(metadata, PDF.DOC_INFO_TRAPPED, info.getTrapped());
269 // TODO Remove these in Tika 2.0
270 addMetadata(metadata, "created", info.getCreationDate());
271 addMetadata(metadata, PDF.DOC_INFO_CREATED, info.getCreationDate());
272 addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate());
270 // TODO Remove these in Tika 2.0
271 Calendar created = info.getCreationDate();
272 addMetadata(metadata, DEPRECATED_CREATED, created);
273 addMetadata(metadata, PDF.DOC_INFO_CREATED, created);
274 addMetadata(metadata, TikaCoreProperties.CREATED, created);
273275 Calendar modified = info.getModificationDate();
274276 addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
275277 addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
276 addMetadata(metadata, PDF.DOC_INFO_MODIFICATION_DATE, info.getModificationDate());
278 addMetadata(metadata, PDF.DOC_INFO_MODIFICATION_DATE, modified);
277279
278280 // All remaining metadata is custom
279281 // Copy this over as-is
484486 return value;
485487 }
486488
487 private void addMetadata(Metadata metadata, String name, Calendar value) {
488 if (value != null) {
489 metadata.set(name, value.getTime().toString());
490 }
491 }
492489
493490 private void addMetadata(Metadata metadata, Property property, Calendar value) {
494491 if (value != null) {
495 metadata.set(property, value.getTime());
492 metadata.set(property, value);
496493 }
497494 }
498495
738735 EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
739736 return null;
740737 }
741 DocumentBuilder documentBuilder = context.getDocumentBuilder();
742 documentBuilder.setErrorHandler((ErrorHandler)null);
743 return documentBuilder.parse(is);
738 return XMLReaderUtils.buildDOM(is, context);
744739 } catch (IOException|SAXException|TikaException e) {
745740 EmbeddedDocumentUtil.recordException(e, metadata);
746741 } finally {
2222
2323 import com.github.junrar.Archive;
2424 import com.github.junrar.exception.RarException;
25 import com.github.junrar.impl.FileVolumeManager;
2526 import com.github.junrar.rarfile.FileHeader;
2627 import org.apache.tika.exception.EncryptedDocumentException;
2728 import org.apache.tika.exception.TikaException;
6465 Archive rar = null;
6566 try (TemporaryResources tmp = new TemporaryResources()) {
6667 TikaInputStream tis = TikaInputStream.get(stream, tmp);
67 rar = new Archive(tis.getFile());
68 rar = new Archive(new FileVolumeManager(tis.getFile()));
6869
6970 if (rar.isEncrypted()) {
7071 throw new EncryptedDocumentException();
1515 */
1616 package org.apache.tika.parser.pkg;
1717
18 import static java.nio.charset.StandardCharsets.UTF_8;
19
20 import java.io.ByteArrayInputStream;
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.nio.charset.StandardCharsets;
24 import java.util.Enumeration;
25 import java.util.HashSet;
26 import java.util.Iterator;
27 import java.util.Locale;
28 import java.util.Set;
29 import java.util.regex.Pattern;
30
3118 import org.apache.commons.compress.archivers.ArchiveException;
3219 import org.apache.commons.compress.archivers.ArchiveStreamFactory;
3320 import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
3522 import org.apache.commons.compress.compressors.CompressorException;
3623 import org.apache.commons.compress.compressors.CompressorStreamFactory;
3724 import org.apache.commons.io.IOUtils;
25 import org.apache.poi.UnsupportedFileFormatException;
3826 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
3927 import org.apache.poi.openxml4j.opc.OPCPackage;
4028 import org.apache.poi.openxml4j.opc.PackageAccess;
4129 import org.apache.poi.openxml4j.opc.PackagePart;
4230 import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
4331 import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
32 import org.apache.poi.openxml4j.util.ZipEntrySource;
33 import org.apache.poi.openxml4j.util.ZipFileZipEntrySource;
4434 import org.apache.tika.detect.Detector;
4535 import org.apache.tika.exception.TikaException;
4636 import org.apache.tika.io.TemporaryResources;
5040 import org.apache.tika.parser.iwork.IWorkPackageParser;
5141 import org.apache.tika.parser.iwork.IWorkPackageParser.IWORKDocumentType;
5242 import org.apache.tika.parser.iwork.iwana.IWork13PackageParser;
43
44 import java.io.ByteArrayInputStream;
45 import java.io.IOException;
46 import java.io.InputStream;
47 import java.util.Enumeration;
48 import java.util.HashSet;
49 import java.util.Iterator;
50 import java.util.Locale;
51 import java.util.Set;
52 import java.util.regex.Pattern;
53
54 import static java.nio.charset.StandardCharsets.UTF_8;
5355
5456 /**
5557 * A detector that works on Zip documents and other archive and compression
171173 if (type != null) {
172174 return type;
173175 }
174
175176 ZipFile zip = new ZipFile(tis.getFile()); // TODO: hasFile()?
176177 try {
177178 type = detectOpenDocument(zip);
233234 }
234235
235236 private static MediaType detectOPCBased(TikaInputStream stream) {
236 try {
237 // if (zip.getEntry("_rels/.rels") != null
238 // || zip.getEntry("[Content_Types].xml") != null) {
239 // Use POI to open and investigate it for us
240 OPCPackage pkg = OPCPackage.open(stream.getFile().getPath(), PackageAccess.READ);
241 stream.setOpenContainer(pkg);
242
243 // Is it an OOXML format?
244 MediaType type = detectOfficeOpenXML(pkg);
245 if (type != null) return type;
246
237
238 ZipEntrySource zipEntrySource = null;
239 try {
240 zipEntrySource = new ZipFileZipEntrySource(new ZipFile(stream.getFile()));
241 } catch (IOException e) {
242 return null;
243 }
244
245 //if (zip.getEntry("_rels/.rels") != null
246 // || zip.getEntry("[Content_Types].xml") != null) {
247 // Use POI to open and investigate it for us
248 //Unfortunately, POI can throw a RuntimeException...so we
249 //have to catch that.
250 OPCPackage pkg = null;
251 try {
252 pkg = OPCPackage.open(zipEntrySource);
253 } catch (SecurityException e) {
254 closeQuietly(zipEntrySource);
255 //TIKA-2571
256 throw e;
257 } catch (InvalidFormatException|RuntimeException e) {
258 closeQuietly(zipEntrySource);
259 return null;
260 }
261
262 MediaType type = null;
263 try {
264
265 // Is it an OOXML format?
266 type = detectOfficeOpenXML(pkg);
267 if (type == null) {
247268 // Is it XPS format?
248269 type = detectXPSOPC(pkg);
249 if (type != null) return type;
250
270 }
271 if (type == null) {
251272 // Is it an AutoCAD format?
252273 type = detectAutoCADOPC(pkg);
253 if (type != null) return type;
254
255 // We don't know what it is, sorry
256 return null;
257 } catch (IOException e) {
258 return null;
274 }
275
259276 } catch (SecurityException e) {
277 closeQuietly(zipEntrySource);
260278 //TIKA-2571
261279 throw e;
262280 } catch (RuntimeException e) {
263 return null;
264 } catch (InvalidFormatException e) {
265 return null;
281 closeQuietly(zipEntrySource);
282 return null;
283 }
284 //only set the open container if we made it here
285 stream.setOpenContainer(pkg);
286 // Return whatever we detected; null means we don't know what it is
287 return type;
288 }
289
290 private static void closeQuietly(ZipEntrySource zipEntrySource) {
291 if (zipEntrySource == null) {
292 return;
293 }
294 try {
295 zipEntrySource.close();
296 } catch (IOException e) {
297 //swallow
266298 }
267299 }
268300 /**
3131 import org.apache.poi.poifs.filesystem.DocumentEntry;
3232 import org.apache.poi.poifs.filesystem.DocumentInputStream;
3333 import org.apache.poi.poifs.filesystem.Entry;
34 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
34 import org.apache.poi.poifs.filesystem.FileMagic;
35 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
3536 import org.apache.poi.poifs.filesystem.Ole10Native;
3637 import org.apache.poi.poifs.filesystem.Ole10NativeException;
3738 import org.apache.poi.util.IOUtils;
114115 ByteArrayInputStream embIs = new ByteArrayInputStream(embObjBytes);
115116 boolean hasPoifs = false;
116117 try {
117 hasPoifs = NPOIFSFileSystem.hasPOIFSHeader(embIs);
118 hasPoifs = hasPOIFSHeader(embIs);
118119 } catch (IOException e) {
119120 EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
120121 return embObjBytes;
138139 throws IOException {
139140
140141 byte[] ret = null;
141 try (NPOIFSFileSystem fs = new NPOIFSFileSystem(is)) {
142 try (POIFSFileSystem fs = new POIFSFileSystem(is)) {
142143
143144 DirectoryNode root = fs.getRoot();
144145
327328 return new byte[(int) len];
328329
329330 }
331
332 private static boolean hasPOIFSHeader(InputStream is) throws IOException {
333 return FileMagic.valueOf(is) == FileMagic.OLE2;
334 }
330335 }
331336
1919 import java.io.IOException;
2020 import java.io.InputStream;
2121 import java.io.PushbackInputStream;
22 import java.nio.Buffer;
2223 import java.nio.ByteBuffer;
2324 import java.nio.CharBuffer;
2425 import java.nio.charset.Charset;
237238
238239 // Used when we decode bytes -> chars using CharsetDecoder:
239240 private final char[] outputArray = new char[128];
240 private final CharBuffer outputBuffer = CharBuffer.wrap(outputArray);
241 private final Buffer outputCharBuffer = CharBuffer.wrap(outputArray);
241242 // Holds the font table from this RTF doc, mapping
242243 // the font number (from \fN control word) to the
243244 // corresponding charset:
261262 // for text output:
262263 private byte[] pendingBytes = new byte[16];
263264 private int pendingByteCount;
264 private ByteBuffer pendingByteBuffer = ByteBuffer.wrap(pendingBytes);
265 private Buffer pendingByteBuffer = ByteBuffer.wrap(pendingBytes);
265266 // Holds pending chars for text output
266267 private char[] pendingChars = new char[10];
267268 private int pendingCharCount;
662663 final CharsetDecoder decoder = getDecoder();
663664 pendingByteBuffer.limit(pendingByteCount);
664665 assert pendingByteBuffer.position() == 0;
665 assert outputBuffer.position() == 0;
666 assert outputCharBuffer.position() == 0;
666667
667668 while (true) {
668669 // We pass true for endOfInput because, when
669670 // we are called, we should have seen a
670671 // complete sequence of characters for this
671672 // charset:
672 final CoderResult result = decoder.decode(pendingByteBuffer, outputBuffer, true);
673
674 final int pos = outputBuffer.position();
673 final CoderResult result = decoder.decode((ByteBuffer)pendingByteBuffer, (CharBuffer) outputCharBuffer, true);
674
675 final int pos = outputCharBuffer.position();
675676 if (pos > 0) {
676677 if (inHeader || fieldState == 1) {
677678 pendingBuffer.append(outputArray, 0, pos);
679680 lazyStartParagraph();
680681 out.characters(outputArray, 0, pos);
681682 }
682 outputBuffer.position(0);
683 outputCharBuffer.position(0);
683684 }
684685
685686 if (result == CoderResult.UNDERFLOW) {
688689 }
689690
690691 while (true) {
691 final CoderResult result = decoder.flush(outputBuffer);
692
693 final int pos = outputBuffer.position();
692 final CoderResult result = decoder.flush((CharBuffer) outputCharBuffer);
693
694 final int pos = outputCharBuffer.position();
694695 if (pos > 0) {
695696 if (inHeader || fieldState == 1) {
696697 pendingBuffer.append(outputArray, 0, pos);
698699 lazyStartParagraph();
699700 out.characters(outputArray, 0, pos);
700701 }
701 outputBuffer.position(0);
702 outputCharBuffer.position(0);
702703 }
703704
704705 if (result == CoderResult.UNDERFLOW) {
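The outputBuffer/pendingByteBuffer fields above are retyped from CharBuffer/ByteBuffer to java.nio.Buffer, with casts at the call sites. This is the standard guard against Java 9's covariant buffer overrides, presumably so sources compiled with a newer JDK still run on Java 8: code compiled on JDK 9+ against CharBuffer.position(int) links to a method that does not exist on a Java 8 runtime and fails with NoSuchMethodError. A minimal sketch of the pattern (the field name is illustrative):

    import java.nio.Buffer;
    import java.nio.CharBuffer;

    public class BufferCompatSketch {
        // Declared as Buffer so the compiled call site is Buffer.position(int),
        // which exists on Java 8 as well as Java 9+.
        private final Buffer outputBuffer = CharBuffer.wrap(new char[128]);

        void resetAndWrite() {
            outputBuffer.position(0);              // binds to Buffer.position(int)
            ((CharBuffer) outputBuffer).put('x');  // cast back for CharBuffer-only methods
        }
    }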
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.sas;
17
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.util.Collections;
21 import java.util.Set;
22
23 import org.apache.tika.exception.TikaException;
24 import org.apache.tika.metadata.Database;
25 import org.apache.tika.metadata.HttpHeaders;
26 import org.apache.tika.metadata.Metadata;
27 import org.apache.tika.metadata.OfficeOpenXMLExtended;
28 import org.apache.tika.metadata.PagedText;
29 import org.apache.tika.metadata.TikaCoreProperties;
30 import org.apache.tika.mime.MediaType;
31 import org.apache.tika.parser.AbstractParser;
32 import org.apache.tika.parser.ParseContext;
33 import org.apache.tika.parser.executable.MachineMetadata;
34 import org.apache.tika.sax.XHTMLContentHandler;
35 import org.xml.sax.ContentHandler;
36 import org.xml.sax.SAXException;
37
38 import com.epam.parso.Column;
39 import com.epam.parso.DataWriterUtil;
40 import com.epam.parso.SasFileProperties;
41 import com.epam.parso.SasFileReader;
42 import com.epam.parso.impl.SasFileReaderImpl;
43
44 /**
45 * Processes the SAS7BDAT columnar data file format used by SAS and
46 * other similar tools.
47 */
48 public class SAS7BDATParser extends AbstractParser {
49 private static final long serialVersionUID = -2775485539937983150L;
50
51 private static final MediaType TYPE_SAS7BDAT =
52 MediaType.application("x-sas-data");
53 private static final Set<MediaType> SUPPORTED_TYPES =
54 Collections.singleton(TYPE_SAS7BDAT);
55
56 @Override
57 public Set<MediaType> getSupportedTypes(ParseContext context) {
58 return SUPPORTED_TYPES;
59 }
60
61 @Override
62 public void parse(InputStream stream, ContentHandler handler,
63 Metadata metadata, ParseContext context)
64 throws IOException, SAXException, TikaException {
65 metadata.set(Metadata.CONTENT_TYPE, TYPE_SAS7BDAT.toString());
66
67 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
68 xhtml.startDocument();
69
70 SasFileReader sas = new SasFileReaderImpl(stream);
71 SasFileProperties props = sas.getSasFileProperties();
72
73 // Record the interesting parts of the file's metadata
74 metadata.set(TikaCoreProperties.TITLE, props.getName());
75 metadata.set(TikaCoreProperties.CREATED, props.getDateCreated());
76 metadata.set(TikaCoreProperties.MODIFIED, props.getDateModified());
77
78 metadata.set(PagedText.N_PAGES, (int)props.getPageCount());
79 metadata.set(Database.COLUMN_COUNT, (int)props.getColumnsCount());
80 metadata.set(Database.ROW_COUNT, (int)props.getRowCount());
81
82 // TODO Can we find more general properties for these / move
83 // these to more general places?
84 metadata.set(HttpHeaders.CONTENT_ENCODING, props.getEncoding());
85 metadata.set(OfficeOpenXMLExtended.APPLICATION, props.getServerType());
86 metadata.set(OfficeOpenXMLExtended.APP_VERSION, props.getSasRelease());
87 metadata.set(MachineMetadata.ARCHITECTURE_BITS,
88 props.isU64() ? "64" : "32");
89 metadata.set(MachineMetadata.ENDIAN, props.getEndianness() == 1 ?
90 MachineMetadata.Endian.LITTLE.getName() :
91 MachineMetadata.Endian.BIG.getName());
92
93 // The following SAS Metadata fields are currently ignored:
94 // compressionMethod
95 // sessionEncoding
96 // fileType
97 // osName
98 // osType
99 // mixPageRowCount
100 // headerLength
101 // pageLength
102 // rowLength
103
104 // Process the column metadata
105 // TODO Find keys to record the format and the type
106 for (Column c : sas.getColumns()) {
107 String name = c.getLabel();
108 if (name == null || name.isEmpty()) name = c.getName();
109 metadata.add(Database.COLUMN_NAME, name);
110 }
111
112
113 // Output file contents as a table
114 xhtml.element("h1", props.getName());
115 xhtml.startElement("table");
116 xhtml.newline();
117
118 // Do the column headings
119 xhtml.startElement("tr");
120 for (Column c : sas.getColumns()) {
121 String label = c.getLabel();
122 if (label == null || label.isEmpty()) label = c.getName();
123
124 xhtml.startElement("th", "title", c.getName());
125 xhtml.characters(label);
126 xhtml.endElement("th");
127 }
128 xhtml.endElement("tr");
129 xhtml.newline();
130
131 // Process each row in turn
132 Object[] row = null;
133 while ((row = sas.readNext()) != null) {
134 xhtml.startElement("tr");
135 for (String val : DataWriterUtil.getRowValues(sas.getColumns(), row)) {
136 // Use explicit start/end, rather than element, to
137 // ensure that empty cells still get output
138 xhtml.startElement("td");
139 xhtml.characters(val);
140 xhtml.endElement("td");
141 }
142 xhtml.endElement("tr");
143 xhtml.newline();
144 }
145
146 // Finish
147 xhtml.endElement("table");
148 xhtml.endDocument();
149 }
150 }
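A short usage sketch for the new SAS parser; the file path is hypothetical and BodyContentHandler is Tika's stock text-collecting handler:

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.parser.sas.SAS7BDATParser;
    import org.apache.tika.sax.BodyContentHandler;

    import java.io.InputStream;
    import java.nio.file.Files;
    import java.nio.file.Paths;

    public class Sas7bdatSketch {
        public static void main(String[] args) throws Exception {
            SAS7BDATParser parser = new SAS7BDATParser();
            Metadata metadata = new Metadata();
            BodyContentHandler handler = new BodyContentHandler(-1); // no write limit
            try (InputStream is = Files.newInputStream(Paths.get("data.sas7bdat"))) {
                parser.parse(is, handler, metadata, new ParseContext());
            }
            System.out.println(handler);  // the table content as text
            System.out.println(metadata); // title, dates, column/row counts, etc.
        }
    }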
3333 import org.apache.tika.sax.TaggedContentHandler;
3434 import org.apache.tika.sax.TextContentHandler;
3535 import org.apache.tika.sax.XHTMLContentHandler;
36 import org.apache.tika.utils.XMLReaderUtils;
3637 import org.xml.sax.ContentHandler;
3738 import org.xml.sax.SAXException;
39
40 import javax.xml.parsers.SAXParser;
3841
3942 /**
4043 * XML parser.
6871
6972 TaggedContentHandler tagged = new TaggedContentHandler(handler);
7073 try {
71 context.getSAXParser().parse(
74 XMLReaderUtils.parseSAX(
7275 new CloseShieldInputStream(stream),
7376 new OfflineContentHandler(new EmbeddedContentHandler(
74 getContentHandler(tagged, metadata, context))));
77 getContentHandler(tagged, metadata, context))), context);
7578 } catch (SAXException e) {
7679 tagged.throwIfCauseOf(e);
7780 throw new TikaException("XML parse error", e);
5757 org.apache.tika.parser.pkg.PackageParser
5858 org.apache.tika.parser.pkg.RarParser
5959 org.apache.tika.parser.rtf.RTFParser
60 org.apache.tika.parser.sas.SAS7BDATParser
6061 org.apache.tika.parser.txt.TXTParser
6162 org.apache.tika.parser.video.FLVParser
6263 org.apache.tika.parser.wordperfect.QuattroProParser
0 # Licensed to the Apache Software Foundation (ASF) under one or more
1 # contributor license agreements. See the NOTICE file distributed with
2 # this work for additional information regarding copyright ownership.
3 # The ASF licenses this file to You under the Apache License, Version 2.0
4 # (the "License"); you may not use this file except in compliance with
5 # the License. You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 #
15 # label encoding fallback
unicode-1-1-utf-8 UTF-8
16 utf-8 UTF-8
17 utf8 UTF-8
18 866 IBM866
19 cp866 IBM866
20 csibm866 IBM866
21 ibm866 IBM866
22 csisolatin2 ISO-8859-2
23 iso-8859-2 ISO-8859-2
24 iso-ir-101 ISO-8859-2
25 iso8859-2 ISO-8859-2
26 iso88592 ISO-8859-2
27 iso_8859-2 ISO-8859-2
28 iso_8859-2:1987 ISO-8859-2
29 l2 ISO-8859-2
30 latin2 ISO-8859-2
31 csisolatin3 ISO-8859-3
32 iso-8859-3 ISO-8859-3
33 iso-ir-109 ISO-8859-3
34 iso8859-3 ISO-8859-3
35 iso88593 ISO-8859-3
36 iso_8859-3 ISO-8859-3
37 iso_8859-3:1988 ISO-8859-3
38 l3 ISO-8859-3
39 latin3 ISO-8859-3
40 csisolatin4 ISO-8859-4
41 iso-8859-4 ISO-8859-4
42 iso-ir-110 ISO-8859-4
43 iso8859-4 ISO-8859-4
44 iso88594 ISO-8859-4
45 iso_8859-4 ISO-8859-4
46 iso_8859-4:1988 ISO-8859-4
47 l4 ISO-8859-4
48 latin4 ISO-8859-4
49 csisolatincyrillic ISO-8859-5
50 cyrillic ISO-8859-5
51 iso-8859-5 ISO-8859-5
52 iso-ir-144 ISO-8859-5
53 iso8859-5 ISO-8859-5
54 iso88595 ISO-8859-5
55 iso_8859-5 ISO-8859-5
56 iso_8859-5:1988 ISO-8859-5
57 arabic ISO-8859-6
58 asmo-708 ISO-8859-6
59 csiso88596e ISO-8859-6
60 csiso88596i ISO-8859-6
61 csisolatinarabic ISO-8859-6
62 ecma-114 ISO-8859-6
63 iso-8859-6 ISO-8859-6
64 iso-8859-6-e ISO-8859-6
65 iso-8859-6-i ISO-8859-6
66 iso-ir-127 ISO-8859-6
67 iso8859-6 ISO-8859-6
68 iso88596 ISO-8859-6
69 iso_8859-6 ISO-8859-6
70 iso_8859-6:1987 ISO-8859-6
71 csisolatingreek ISO-8859-7
72 ecma-118 ISO-8859-7
73 elot_928 ISO-8859-7
74 greek ISO-8859-7
75 greek8 ISO-8859-7
76 iso-8859-7 ISO-8859-7
77 iso-ir-126 ISO-8859-7
78 iso8859-7 ISO-8859-7
79 iso88597 ISO-8859-7
80 iso_8859-7 ISO-8859-7
81 iso_8859-7:1987 ISO-8859-7
82 sun_eu_greek ISO-8859-7
83 csiso88598e ISO-8859-8
84 csisolatinhebrew ISO-8859-8
85 hebrew ISO-8859-8
86 iso-8859-8 ISO-8859-8
87 iso-8859-8-e ISO-8859-8
88 iso-ir-138 ISO-8859-8
89 iso8859-8 ISO-8859-8
90 iso88598 ISO-8859-8
91 iso_8859-8 ISO-8859-8
92 iso_8859-8:1988 ISO-8859-8
93 visual ISO-8859-8
94 csiso88598i ISO-8859-8-I ISO-8859-8
95 iso-8859-8-i ISO-8859-8-I ISO-8859-8
96 logical ISO-8859-8-I ISO-8859-8
97 csisolatin6 ISO-8859-10 ISO-8859-4
98 iso-8859-10 ISO-8859-10 ISO-8859-4
99 iso-ir-157 ISO-8859-10 ISO-8859-4
100 iso8859-10 ISO-8859-10 ISO-8859-4
101 iso885910 ISO-8859-10 ISO-8859-4
102 l6 ISO-8859-10 ISO-8859-4
103 latin6 ISO-8859-10 ISO-8859-4
104 iso-8859-13 ISO-8859-13
105 iso8859-13 ISO-8859-13
106 iso885913 ISO-8859-13
107 iso-8859-14 ISO-8859-14 ISO-8859-1
108 iso8859-14 ISO-8859-14 ISO-8859-1
109 iso885914 ISO-8859-14 ISO-8859-1
110 csisolatin9 ISO-8859-15
111 iso-8859-15 ISO-8859-15
112 iso8859-15 ISO-8859-15
113 iso885915 ISO-8859-15
114 iso_8859-15 ISO-8859-15
115 l9 ISO-8859-15
116 iso-8859-16 ISO-8859-16 ISO-8859-1
117 cskoi8r KOI8-R
118 koi KOI8-R
119 koi8 KOI8-R
120 koi8-r KOI8-R
121 koi8_r KOI8-R
122 koi8-ru KOI8-U
123 koi8-u KOI8-U
124 csmacintosh x-MacRoman
125 mac x-MacRoman
126 macintosh x-MacRoman
127 x-mac-roman x-MacRoman
128 dos-874 windows-874
129 iso-8859-11 windows-874
130 iso8859-11 windows-874
131 iso885911 windows-874
132 tis-620 windows-874
133 windows-874 windows-874
134 cp1250 windows-1250
135 windows-1250 windows-1250
136 x-cp1250 windows-1250
137 cp1251 windows-1251
138 windows-1251 windows-1251
139 x-cp1251 windows-1251
140 ansi_x3.4-1968 windows-1252
141 ascii windows-1252
142 cp1252 windows-1252
143 cp819 windows-1252
144 csisolatin1 windows-1252
145 ibm819 windows-1252
146 iso-8859-1 windows-1252
147 iso-ir-100 windows-1252
148 iso8859-1 windows-1252
149 iso88591 windows-1252
150 iso_8859-1 windows-1252
151 iso_8859-1:1987 windows-1252
152 l1 windows-1252
153 latin1 windows-1252
154 us-ascii windows-1252
155 windows-1252 windows-1252
156 x-cp1252 windows-1252
157 cp1253 windows-1253
158 windows-1253 windows-1253
159 x-cp1253 windows-1253
160 cp1254 windows-1254
161 csisolatin5 windows-1254
162 iso-8859-9 windows-1254
163 iso-ir-148 windows-1254
164 iso8859-9 windows-1254
165 iso88599 windows-1254
166 iso_8859-9 windows-1254
167 iso_8859-9:1989 windows-1254
168 l5 windows-1254
169 latin5 windows-1254
170 windows-1254 windows-1254
171 x-cp1254 windows-1254
172 cp1255 windows-1255
173 windows-1255 windows-1255
174 x-cp1255 windows-1255
175 cp1256 windows-1256
176 windows-1256 windows-1256
177 x-cp1256 windows-1256
178 cp1257 windows-1257
179 windows-1257 windows-1257
180 x-cp1257 windows-1257
181 cp1258 windows-1258
182 windows-1258 windows-1258
183 x-cp1258 windows-1258
184 x-mac-cyrillic x-MacCyrillic
185 x-mac-ukrainian x-MacCyrillic
186 chinese GBK
187 csgb2312 GBK
188 csiso58gb231280 GBK
189 gb2312 GBK
190 gb_2312 GBK
191 gb_2312-80 GBK
192 gbk GBK
193 iso-ir-58 GBK
194 x-gbk GBK
195 gb18030 gb18030
196 big5 Big5
197 big5-hkscs Big5
198 cn-big5 Big5
199 csbig5 Big5
200 x-x-big5 Big5
201 cseucpkdfmtjapanese EUC-JP
202 euc-jp EUC-JP
203 x-euc-jp EUC-JP
204 csiso2022jp ISO-2022-JP
205 iso-2022-jp ISO-2022-JP
206 csshiftjis Shift_JIS
207 ms932 Shift_JIS
208 ms_kanji Shift_JIS
209 shift-jis Shift_JIS
210 shift_jis Shift_JIS
211 sjis Shift_JIS
212 windows-31j Shift_JIS
213 x-sjis Shift_JIS
214 cseuckr EUC-KR
215 csksc56011987 EUC-KR
216 euc-kr EUC-KR
217 iso-ir-149 EUC-KR
218 korean EUC-KR
219 ks_c_5601-1987 EUC-KR
220 ks_c_5601-1989 EUC-KR
221 ksc5601 EUC-KR
222 ksc_5601 EUC-KR
223 windows-949 EUC-KR
224 csiso2022kr replacement
225 hz-gb-2312 replacement
226 iso-2022-cn replacement
227 iso-2022-cn-ext replacement
228 iso-2022-kr replacement
229 replacement replacement
230 utf-16be UTF-16BE
231 utf-16 UTF-16LE
232 utf-16le UTF-16LE
233 x-user-defined x-user-defined
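This table is whitespace-delimited: the first column is a lowercase label, the second the canonical encoding, and an optional third column names a fallback to use when the canonical charset is unavailable. A sketch of a lookup over rows in this shape; the helper class is hypothetical, not the detector's actual loader:

    import java.io.BufferedReader;
    import java.io.IOException;
    import java.nio.charset.Charset;
    import java.util.HashMap;
    import java.util.Locale;
    import java.util.Map;

    public class CharsetLabelTableSketch {
        private final Map<String, String[]> rows = new HashMap<>();

        // Loads "label encoding [fallback]" rows, skipping comments and blanks.
        void load(BufferedReader reader) throws IOException {
            String line;
            while ((line = reader.readLine()) != null) {
                line = line.trim();
                if (line.isEmpty() || line.startsWith("#")) continue;
                String[] cols = line.split("\\s+");
                rows.put(cols[0].toLowerCase(Locale.ROOT), cols);
            }
        }

        // Resolves a label to a supported Charset, preferring the canonical name.
        Charset lookup(String label) {
            String[] cols = rows.get(label.trim().toLowerCase(Locale.ROOT));
            if (cols == null) return null;
            for (int i = 1; i < cols.length; i++) {
                try {
                    return Charset.forName(cols[i]);
                } catch (Exception e) {
                    // canonical name unsupported on this JVM; try the fallback column
                }
            }
            return null;
        }
    }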
112112 @Ignore("ignore for regular builds; run occasionally")
113113 public void testAllMultiThreaded() throws Exception {
114114 //this runs against all files in /test-documents
115 testMultiThreaded(10, 100, null);
115 //testMultiThreaded(10, 100, null);
116116 }
117117 }
2525 import java.io.ByteArrayInputStream;
2626 import java.nio.charset.StandardCharsets;
2727
28 import static org.apache.tika.XMLTestBase.injectXML;
29 import static org.apache.tika.XMLTestBase.parse;
3028 import static org.junit.Assert.assertTrue;
31 import static org.junit.Assert.fail;
3229
3330 /**
3431 * Tests to confirm defenses against entity expansion attacks.
3532 */
3633 @Ignore("initial draft, needs more work")
37 public class TestXMLEntityExpansion
38 {
34 public class TestXMLEntityExpansion extends XMLTestBase {
35
3936 private static final byte[] ENTITY_EXPANSION_BOMB = new String(
4037 "<!DOCTYPE kaboom [ " +
4138 "<!ENTITY a \"1234567890\" > " +
6057 "]> " +
6158 "<kaboom>&s;</kaboom>").getBytes(StandardCharsets.UTF_8);
6259
63 //a truly vulnerable parser, say xerces2, doesn't oom, it thrashes with gc.
6460 //Set a reasonable amount of time as the timeout
61 //Make sure that the test apparatus actually works.
6562 @Test(timeout = 20000)
66 public void testInjectedXML() throws Exception {
63 public void testVulnerableParser() throws Exception {
6764 byte[] bytes = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><document>blah</document>".getBytes(StandardCharsets.UTF_8);
6865 byte[] injected = injectXML(bytes, ENTITY_EXPANSION_BOMB);
69 parse("injected", new ByteArrayInputStream(injected), new XMLTestBase.VulnerableSAXParser());
66
67 Thread thread = new Thread() {
68 @Override
69 public void run() {
70 try {
71 parse("injected", new ByteArrayInputStream(injected), new XMLTestBase.VulnerableSAXParser());
72 } catch (Exception e) {
73 throw new RuntimeException(e);
74 }
75 }
76 };
77 thread.start();
78 Thread.sleep(10000);
79 assertTrue(thread.isAlive());
80 thread.interrupt();
81
7082 }
7183
7284 @Test(timeout = 20000)//
2020 import static org.junit.Assert.assertTrue;
2121
2222 import java.io.File;
23 import java.io.FileFilter;
2324 import java.io.FilenameFilter;
2425 import java.io.IOException;
2526 import java.io.InputStream;
26
27 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
27 import java.util.Random;
28
29 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
30 import org.apache.tika.MultiThreadedTikaTest;
31 import org.apache.tika.Tika;
2832 import org.apache.tika.config.TikaConfig;
33 import org.apache.tika.exception.TikaException;
2934 import org.apache.tika.io.TikaInputStream;
3035 import org.apache.tika.metadata.Metadata;
3136 import org.apache.tika.mime.MediaType;
3237 import org.apache.tika.mime.MimeTypes;
3338 import org.apache.tika.parser.iwork.iwana.IWork13PackageParser;
39 import org.apache.tika.utils.XMLReaderUtils;
40 import org.junit.After;
3441 import org.junit.Test;
3542
3643 /**
37 * Junit test class for {@link ContainerAwareDetector}
44 * Junit test class for {@link org.apache.tika.parser.microsoft.POIFSContainerDetector}
3845 */
39 public class TestContainerAwareDetector {
46 public class TestContainerAwareDetector extends MultiThreadedTikaTest {
4047 private final TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
4148 private final MimeTypes mimeTypes = tikaConfig.getMimeRepository();
4249 private final Detector detector = new DefaultDetector(mimeTypes);
50
51 @After
52 public void tearDown() throws TikaException {
53 //make sure to reset pool size because it is being randomly resized during the tests
54 XMLReaderUtils.setPoolSize(10);
55 }
4356
4457 private void assertTypeByData(String file, String type) throws Exception {
4558 assertTypeByNameAndData(file, null, type);
169182 assertEquals(
170183 MediaType.parse("application/vnd.ms-powerpoint"),
171184 detector.detect(stream, new Metadata()));
172 assertTrue(stream.getOpenContainer() instanceof NPOIFSFileSystem);
185 assertTrue(stream.getOpenContainer() instanceof POIFSFileSystem);
173186 }
174187 }
175188
434447 }
435448 }
436449
450 @Test
451 public void testXMLMultiThreaded() throws Exception {
452 Detector detector = new Tika().getDetector();
453 FileFilter filter = new FileFilter() {
454 @Override
455 public boolean accept(File pathname) {
456 if (pathname.getName().endsWith(".xml")) {
457 return true;
458 }
459 return false;
460 }
461 };
462 int numThreads = 1;
463 XMLReaderUtils.setPoolSize(numThreads);
464 testDetector(detector, numThreads, 20, filter, numThreads * 2);
465 }
466
467 @Test
468 public void testAllMultithreaded() throws Exception {
469
470 Detector detector = new Tika().getDetector();
471 FileFilter filter = new FileFilter() {
472 //TODO: create proper randomized framework that will record seed, etc...
473 private final Random random = new Random();
474 //increase this to the number of files for a true smoke test
475 //for now, randomly pick 20 files.
476 int toProcess = 20;
477 int processed = 0;
478 @Override
479 public boolean accept(File pathname) {
480 if (processed >= toProcess) {
481 return false;
482 } else if (random.nextBoolean()) {
483 processed++;
484 return true;
485 }
486 return false;
487 }
488 };
489 int numThreads = 20;
490 XMLReaderUtils.setPoolSize(numThreads);
491
492 testDetector(detector, numThreads, 50, filter, numThreads*3);
493 }
494
437495 }
897897
898898 // MBOX
899899 assertTypeDetection("headers.mbox", "application/mbox");
900
900
901 // MBOX
902 assertTypeDetection("testMBOX_lengthy_x-headers.mbox", "application/mbox");
903
904
901905 // Thunderbird
902906 assertTypeDetection("testThunderbirdEml.eml", "message/rfc822");
903907
1818
1919
2020 import static org.apache.tika.TikaTest.assertContains;
21 import static org.apache.tika.TikaTest.debug;
2122 import static org.junit.Assert.assertEquals;
2223 import static org.junit.Assert.assertNull;
2324 import static org.junit.Assert.assertTrue;
3132 import org.apache.tika.exception.TikaException;
3233 import org.apache.tika.io.TikaInputStream;
3334 import org.apache.tika.metadata.Metadata;
34 import org.apache.tika.metadata.TikaMetadataKeys;
35 import org.apache.tika.metadata.TikaCoreProperties;
3536 import org.apache.tika.parser.utils.CommonsDigester;
37 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
3638 import org.apache.tika.sax.BasicContentHandlerFactory;
3739 import org.apache.tika.sax.ContentHandlerFactory;
40 import org.apache.tika.sax.RecursiveParserWrapperHandler;
41 import org.apache.tika.utils.ParserUtils;
3842 import org.junit.Test;
3943 import org.xml.sax.helpers.DefaultHandler;
4044
4549 List<Metadata> list = getMetadata(new Metadata(),
4650 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
4751 Metadata container = list.get(0);
48 String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
52 String content = container.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
4953 //not much differentiates html from xml in this test file
5054 assertTrue(content.indexOf("<p class=\"header\" />") > -1);
5155 }
5559 List<Metadata> list = getMetadata(new Metadata(),
5660 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1));
5761 Metadata container = list.get(0);
58 String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
62 String content = container.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
5963 //not much differentiates html from xml in this test file
6064 assertTrue(content.indexOf("<p class=\"header\"></p>") > -1);
6165 }
6569 List<Metadata> list = getMetadata(new Metadata(),
6670 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
6771 Metadata container = list.get(0);
68 String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
72 String content = container.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
6973 assertTrue(content.indexOf("<p ") < 0);
7074 assertTrue(content.indexOf("embed_0") > -1);
7175 }
7579 List<Metadata> list = getMetadata(new Metadata(),
7680 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
7781 Metadata container = list.get(0);
78 String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
82 String content = container.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
7983 assertNull(content);
8084 }
8185
8690 Metadata metadata = new Metadata();
8791
8892 Parser wrapped = new AutoDetectParser();
89 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
93 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped);
94 InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream(
95 "/test-documents/test_recursive_embedded.docx");
96 RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
9097 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 60));
91 InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream(
92 "/test-documents/test_recursive_embedded.docx");
93 wrapper.parse(stream, new DefaultHandler(), metadata, context);
94 List<Metadata> list = wrapper.getMetadata();
98 wrapper.parse(stream, handler, metadata, context);
99 List<Metadata> list = handler.getMetadataList();
95100
96101 assertEquals(5, list.size());
97102
98103 int wlr = 0;
99104 for (Metadata m : list) {
100 String limitReached = m.get(RecursiveParserWrapper.WRITE_LIMIT_REACHED);
105 String limitReached = m.get(AbstractRecursiveParserWrapperHandler.WRITE_LIMIT_REACHED);
101106 if (limitReached != null && limitReached.equals("true")) {
102107 wlr++;
103108 }
106111
107112 }
108113
109 @Test
110 public void testMaxEmbedded() throws Exception {
114 /**
115 * @deprecated this will be removed in 1.20 or 2.0
116 * @throws Exception
117 */
118 @Test
119 public void testMaxEmbeddedLegacy() throws Exception {
111120 int maxEmbedded = 4;
112121 int totalNoLimit = 12;//including outer container file
113122 ParseContext context = new ParseContext();
125134 //test default
126135 assertEquals(totalNoLimit, list.size());
127136
128 limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
137 limitReached = list.get(0).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_LIMIT_REACHED);
129138 assertNull(limitReached);
130139
131140
141150 list = wrapper.getMetadata();
142151
143152 //add 1 for outer container file
144 assertEquals(maxEmbedded + 1, list.size());
145
146 limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
153 assertEquals(maxEmbedded+1, list.size());
154
155 limitReached = list.get(0).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_LIMIT_REACHED);
147156 assertEquals("true", limitReached);
148157
149158 wrapper.reset();
156165
157166 wrapper.setMaxEmbeddedResources(-2);
158167 wrapper.parse(stream, new DefaultHandler(), metadata, context);
168 assertEquals(totalNoLimit, wrapper.getMetadata().size());
169 limitReached = wrapper.getMetadata().get(0).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_LIMIT_REACHED);
170 assertNull(limitReached);
171 }
172
173 @Test
174 public void testMaxEmbedded() throws Exception {
175 int maxEmbedded = 4;
176 int totalNoLimit = 12;//including outer container file
177 ParseContext context = new ParseContext();
178 Metadata metadata = new Metadata();
179 String limitReached = null;
180
181 Parser wrapped = new AutoDetectParser();
182 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped);
183
184 InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream(
185 "/test-documents/test_recursive_embedded.docx");
186 RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
187 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
189 wrapper.parse(stream, handler, metadata, context);
190 List<Metadata> list = handler.getMetadataList();
191 //test default
159192 assertEquals(totalNoLimit, list.size());
160 limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
193
194 limitReached = list.get(0).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_LIMIT_REACHED);
161195 assertNull(limitReached);
162 }
196
197 stream.close();
198
199 //test setting value
200 metadata = new Metadata();
201 stream = RecursiveParserWrapperTest.class.getResourceAsStream(
202 "/test-documents/test_recursive_embedded.docx");
203 handler = new RecursiveParserWrapperHandler(
204 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), maxEmbedded);
205 wrapper.parse(stream, handler, metadata, context);
206 list = handler.getMetadataList();
207 //add 1 for outer container file
208 assertEquals(maxEmbedded+1, list.size());
209
210 limitReached = list.get(0).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_LIMIT_REACHED);
211 assertEquals("true", limitReached);
212
213 stream.close();
214
215 //test setting value < 0
216 metadata = new Metadata();
217 stream = RecursiveParserWrapperTest.class.getResourceAsStream(
218 "/test-documents/test_recursive_embedded.docx");
219 handler = new RecursiveParserWrapperHandler(
220 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), -2);
221 wrapper.parse(stream, handler, metadata, context);
222 list = handler.getMetadataList();
223 assertEquals(totalNoLimit, list.size());
224 limitReached = list.get(0).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_LIMIT_REACHED);
225 assertNull(limitReached);
226 }
227
163228
164229 @Test
165230 public void testEmbeddedResourcePath() throws Exception {
182247 List<Metadata> list = getMetadata(metadata,
183248 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
184249 Metadata container = list.get(0);
185 String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
250 String content = container.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
186251 assertTrue(content.indexOf("<p class=\"header\" />") > -1);
187252
188253 Set<String> seen = new HashSet<String>();
189254 for (Metadata m : list) {
190 String path = m.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH);
255 String path = m.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH);
191256 if (path != null) {
192257 seen.add(path);
193258 }
205270 //is to catch the exception
206271 assertEquals(13, list.size());
207272 Metadata mockNPEMetadata = list.get(10);
208 assertContains("java.lang.NullPointerException", mockNPEMetadata.get(RecursiveParserWrapper.EMBEDDED_EXCEPTION));
273 assertContains("java.lang.NullPointerException", mockNPEMetadata.get(ParserUtils.EMBEDDED_EXCEPTION));
209274
210275 metadata = new Metadata();
211276 metadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded_npe.docx");
215280
216281 //Composite parser swallows caught TikaExceptions, IOExceptions and SAXExceptions
217282 //and just doesn't bother to report that there was an exception.
218 assertEquals(12, list.size());
283 assertEquals(13, list.size());
219284 }
220285
221286 @Test
229294
230295 ParseContext context = new ParseContext();
231296 Parser wrapped = new AutoDetectParser();
232 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
233 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), true);
297 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped, true);
298 RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
299 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
300
234301 String path = "/test-documents/mock/embedded_then_npe.xml";
235302
236303 InputStream stream = null;
238305 try {
239306 stream = RecursiveParserWrapperTest.class.getResourceAsStream(
240307 path);
241 wrapper.parse(stream, new DefaultHandler(), metadata, context);
308 wrapper.parse(stream, handler, metadata, context);
242309 } catch (TikaException e) {
243310 if (e.getCause().getClass().equals(NullPointerException.class)) {
244311 npe = true;
248315 }
249316 assertTrue("npe", npe);
250317
251 List<Metadata> metadataList = wrapper.getMetadata();
318 List<Metadata> metadataList = handler.getMetadataList();
252319 assertEquals(2, metadataList.size());
253320 Metadata outerMetadata = metadataList.get(0);
254321 Metadata embeddedMetadata = metadataList.get(1);
255 assertContains("main_content", outerMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
256 assertEquals("embedded_then_npe.xml", outerMetadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
322 assertContains("main_content", outerMetadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
323 assertEquals("embedded_then_npe.xml", outerMetadata.get(Metadata.RESOURCE_NAME_KEY));
257324 assertEquals("Nikolai Lobachevsky", outerMetadata.get("author"));
258325
259 assertContains("some_embedded_content", embeddedMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
260 assertEquals("embed1.xml", embeddedMetadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
326 assertContains("some_embedded_content", embeddedMetadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
327 assertEquals("embed1.xml", embeddedMetadata.get(Metadata.RESOURCE_NAME_KEY));
261328 assertEquals("embeddedAuthor", embeddedMetadata.get("author"));
262329 }
263330
267334 metadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded.docx");
268335 List<Metadata> list = getMetadata(metadata,
269336 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
270 true, new CommonsDigester(100000, CommonsDigester.DigestAlgorithm.MD5));
337 true, new CommonsDigester(100000, "md5"));
271338 int i = 0;
272339 Metadata m0 = list.get(0);
273340 Metadata m6 = list.get(6);
285352 if (digester != null) {
286353 wrapped = new DigestingParser(wrapped, digester);
287354 }
288 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
289 contentHandlerFactory, catchEmbeddedExceptions);
355 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped, catchEmbeddedExceptions);
290356 String path = metadata.get(Metadata.RESOURCE_NAME_KEY);
291357 if (path == null) {
292358 path = "/test-documents/test_recursive_embedded.docx";
294360 path = "/test-documents/" + path;
295361 }
296362 InputStream stream = null;
363 RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(contentHandlerFactory);
297364 try {
298365 stream = TikaInputStream.get(RecursiveParserWrapperTest.class.getResource(path).toURI());
299 wrapper.parse(stream, new DefaultHandler(), metadata, context);
366 wrapper.parse(stream, handler, metadata, context);
300367 } finally {
301368 IOUtils.closeQuietly(stream);
302369 }
303 return wrapper.getMetadata();
370 return handler.getMetadataList();
304371
305372 }
306373
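The test changes above migrate from the deprecated RecursiveParserWrapper constructor, which took a ContentHandlerFactory and exposed getMetadata(), to the RecursiveParserWrapperHandler introduced for TIKA-2644: the handler now owns the factory and the collected metadata. A minimal sketch of the new call pattern (the resource path is a hypothetical placeholder):

    import java.io.InputStream;
    import java.util.List;

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.parser.AutoDetectParser;
    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.parser.RecursiveParserWrapper;
    import org.apache.tika.sax.BasicContentHandlerFactory;
    import org.apache.tika.sax.RecursiveParserWrapperHandler;

    public class RecursiveParseSketch {
        public static void main(String[] args) throws Exception {
            RecursiveParserWrapper wrapper = new RecursiveParserWrapper(new AutoDetectParser());
            RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
                    new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
            try (InputStream stream =
                    RecursiveParseSketch.class.getResourceAsStream("/example.docx")) { // hypothetical
                wrapper.parse(stream, handler, new Metadata(), new ParseContext());
            }
            List<Metadata> list = handler.getMetadataList(); // container first, then embedded docs
        }
    }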
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser;
17
18
19 import static org.junit.Assert.assertEquals;
20 import static org.junit.Assert.assertTrue;
21
22 import java.util.Arrays;
23 import java.util.List;
24 import java.util.regex.Pattern;
25
26 import org.apache.tika.TikaTest;
27 import org.junit.Test;
28
29 /**
30 * Ensure that our various Table-based formats produce consistent,
31 * broadly similar output.
32 * This is mostly focused on the XHTML output
33 */
34 public class TabularFormatsTest extends TikaTest {
35 protected static final String[] columnNames = new String[] {
36 "recnum","square","desc","pctdone","pctincr",
37 "date","datetime","time"
38 };
39 protected static final String[] columnLabels = new String[] {
40 "Record Number","Square of the Record Number",
41 "Description of the Row","Percent Done",
42 "Percent Increment","date","datetime","time"
43 };
44
45 /**
46 * Expected values, by <em>column</em>
47 */
48 protected static final Object[][] table = new Object[][] {
49 new String[] {
50 "0","1","2","3","4","5","6","7","8","9","10"
51 },
52 new String[] {
53 "0","1","4","9","16","25","36","49","64","81","100"
54 },
55 new String[] {}, // Generated later
56 new String[] {
57 "0%","10%","20%","30%","40%","50%",
58 "60%","70%","80%","90%","100%"
59 },
60 new String[] {
61 "","0.0%","50.0%","66.7%",
62 "75.0%","80.0%","83.3%","85.7%",
63 "87.5%","88.9%","90.0%"
64 },
65 new Pattern[] {
66 Pattern.compile("0?1-01-1960"),
67 Pattern.compile("0?2-01-1960"),
68 Pattern.compile("17-01-1960"),
69 Pattern.compile("22-03-1960"),
70 Pattern.compile("13-09-1960"),
71 Pattern.compile("17-09-1961"),
72 Pattern.compile("20-07-1963"),
73 Pattern.compile("29-07-1966"),
74 Pattern.compile("20-03-1971"),
75 Pattern.compile("18-12-1977"),
76 Pattern.compile("19-05-1987"),
77 },
78 new Pattern[] {
79 Pattern.compile("01(JAN|Jan)(60|1960)[:\\s]00:00:01(.00)?"),
80 Pattern.compile("01(JAN|Jan)(60|1960)[:\\s]00:00:10(.00)?"),
81 Pattern.compile("01(JAN|Jan)(60|1960)[:\\s]00:01:40(.00)?"),
82 Pattern.compile("01(JAN|Jan)(60|1960)[:\\s]00:16:40(.00)?"),
83 Pattern.compile("01(JAN|Jan)(60|1960)[:\\s]02:46:40(.00)?"),
84 Pattern.compile("02(JAN|Jan)(60|1960)[:\\s]03:46:40(.00)?"),
85 Pattern.compile("12(JAN|Jan)(60|1960)[:\\s]13:46:40(.00)?"),
86 Pattern.compile("25(APR|Apr)(60|1960)[:\\s]17:46:40(.00)?"),
87 Pattern.compile("03(MAR|Mar)(63|1963)[:\\s]09:46:40(.00)?"),
88 Pattern.compile("09(SEP|Sep)(91|1991)[:\\s]01:46:40(.00)?"),
89 Pattern.compile("19(NOV|Nov)(76|2276)[:\\s]17:46:40(.00)?")
90 },
91 new Pattern[] {
92 Pattern.compile("0?0:00:01(.\\d\\d)?"),
93 Pattern.compile("0?0:00:03(.\\d\\d)?"),
94 Pattern.compile("0?0:00:09(.\\d\\d)?"),
95 Pattern.compile("0?0:00:27(.\\d\\d)?"),
96 Pattern.compile("0?0:01:21(.\\d\\d)?"),
97 Pattern.compile("0?0:04:03(.\\d\\d)?"),
98 Pattern.compile("0?0:12:09(.\\d\\d)?"),
99 Pattern.compile("0?0:36:27(.\\d\\d)?"),
100 Pattern.compile("0?1:49:21(.\\d\\d)?"),
101 Pattern.compile("0?5:28:03(.\\d\\d)?"),
102 Pattern.compile("16:24:09(.\\d\\d)?")
103 }
104 };
105 static {
106 // Row text in 3rd column
107 table[2] = new String[table[0].length];
108 for (int i=0; i<table[0].length; i++) {
109 table[2][i] = "This is row " + i + " of 10";
110 }
111 }
112 // Which columns hold percentages? Not all parsers
113 // correctly format these...
114 protected static final List<Integer> percentageColumns =
115 Arrays.asList(new Integer[] { 3, 4 });
116
117 protected static String[] toCells(String row, boolean isTH) {
118 // Split into cells, ignoring stuff before first cell
119 String[] cells;
120 if (isTH) {
121 cells = row.split("<th");
122 } else {
123 cells = row.split("<td");
124 }
125 cells = Arrays.copyOfRange(cells, 1, cells.length);
126
127 // Ignore the closing tag onwards, and normalise whitespace
128 for (int i=0; i<cells.length; i++) {
129 cells[i] = cells[i].trim();
130 if (cells[i].equals("/>")) {
131 cells[i] = "";
132 continue;
133 }
134
135 int splitAt = cells[i].lastIndexOf("</");
136 cells[i] = cells[i].substring(0, splitAt).trim();
137 cells[i] = cells[i].replaceAll("\\s+", " ");
138 }
139 return cells;
140 }
141
142 protected void assertHeaders(String xml, boolean isTH, boolean hasLabel, boolean hasName) {
143 // Find the first row
144 int splitAt = xml.indexOf("</tr>");
145 String hRow = xml.substring(0, splitAt);
146 splitAt = xml.indexOf("<tr>");
147 hRow = hRow.substring(splitAt+4);
148
149 // Split into cells, ignoring stuff before first cell
150 String[] cells = toCells(hRow, isTH);
151
152 // Check we got the right number
153 assertEquals("Wrong number of cells in header row " + hRow,
154 columnLabels.length, cells.length);
155
156 // Check we got the right stuff
157 for (int i=0; i<cells.length; i++) {
158 if (hasLabel && hasName) {
159 assertContains("title=\"" + columnNames[i] + "\"", cells[i]);
160 assertContains(">" + columnLabels[i], cells[i]);
161 } else if (hasName) {
162 assertContains(">" + columnNames[i], cells[i]);
163 } else {
164 assertContains(">" + columnLabels[i], cells[i]);
165 }
166 }
167 }
168 protected void assertContents(String xml, boolean hasHeader, boolean doesPercents) {
169 // Ignore anything before the first <tr>
170 // Ignore the header row if there is one
171 int ignores = 1;
172 if (hasHeader) ignores++;
173
174 // Split into rows, and discard the row closing (and anything after)
175 String[] rows = xml.split("<tr>");
176 rows = Arrays.copyOfRange(rows, ignores, rows.length);
177 for (int i=0; i<rows.length; i++) {
178 rows[i] = rows[i].split("</tr>")[0].trim();
179 }
180
181 // Check we got the right number of rows
182 for (int cn=0; cn<table.length; cn++) {
183 assertEquals("Wrong number of rows found compared to column " + (cn+1),
184 table[cn].length, rows.length);
185 }
186
187 // Check each row's values
188 for (int rn=0; rn<rows.length; rn++) {
189 String[] cells = toCells(rows[rn], false);
190 assertEquals("Wrong number of values in row " + (rn+1),
191 table.length, cells.length);
192
193 for (int cn=0; cn<table.length; cn++) {
194 String val = cells[cn];
195
196 // If the parser doesn't know about % formats,
197 // skip the cell if the column in a % one
198 if (!doesPercents && percentageColumns.contains(cn)) continue;
199
200 // Ignore cell attributes
201 if (! val.isEmpty()) val = val.split(">")[1];
202 // Check
203 String error = "Wrong text in row " + (rn+1) + " and column " +
204 (cn+1) + " - " + table[cn][rn] + " vs " + val;
205 if (table[cn][rn] instanceof String) {
206 assertEquals(error, table[cn][rn], val);
207 } else {
208 assertTrue(error, ((Pattern)table[cn][rn]).matcher(val).matches());
209 }
210 }
211 }
212 }
213
214 @Test
215 public void testSAS7BDAT() throws Exception {
216 XMLResult result = getXML("test-columnar.sas7bdat");
217 String xml = result.xml;
218 assertHeaders(xml, true, true, true);
219 // TODO Wait for https://github.com/epam/parso/issues/28 to be fixed
220 // then check the % formats again
221 assertContents(xml, true, false);
222 }
223 @Test
224 public void testXLS() throws Exception {
225 XMLResult result = getXML("test-columnar.xls");
226 String xml = result.xml;
227 assertHeaders(xml, false, true, false);
228 assertContents(xml, true, false);
229 }
230 @Test
231 public void testXLSX() throws Exception {
232 XMLResult result = getXML("test-columnar.xlsx");
233 String xml = result.xml;
234 assertHeaders(xml, false, true, false);
235 assertContents(xml, true, false);
236 }
237 @Test
238 public void testXLSB() throws Exception {
239 XMLResult result = getXML("test-columnar.xlsb");
240 String xml = result.xml;
241 assertHeaders(xml, false, true, false);
242 assertContents(xml, true, false);
243 }
244
245 // TODO Fix the ODS test - currently failing with
246 // org.xml.sax.SAXException: Namespace http://www.w3.org/1999/xhtml not declared
247 // @Test
248 // public void testODS() throws Exception {
249 // XMLResult result = getXML("test-columnar.ods");
250 // String xml = result.xml;
251 // assertHeaders(xml, false, true, false);
252 // assertContents(xml, true, true);
253 // }
254
255 // TODO Test other formats, eg Database formats
256
257 /**
258 * Note - we don't have a dedicated CSV parser
259 *
260 * This means we don't get proper HTML out...
261 */
262 @Test
263 public void testCSV() throws Exception {
264 XMLResult result = getXML("test-columnar.csv");
265 String xml = result.xml;
266 // Normalise whitespace before testing
267 xml = xml.replaceAll("\\s+", " ");
268
269 for (String label : columnLabels) {
270 assertContains(label, xml);
271 }
272 for (Object[] vals : table) {
273 for (Object val : vals) {
274 if (val instanceof String)
275 assertContains((String)val, xml);
276 else if (val instanceof Pattern)
277 assertTrue("Not matched: " + val,
278 ((Pattern)val).matcher(xml).find());
279 }
280 }
281 }
282 }
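Each format test above follows the same three steps: extract the XHTML, check the header row, check the cell contents. A sketch of how a further format would plug in (the fixture name and flag values are hypothetical; assertHeaders takes isTH, hasLabel, hasName, and assertContents takes hasHeader, doesPercents):

    @Test
    public void testSomeFutureFormat() throws Exception {
        // hypothetical fixture following the test-columnar.* naming used above
        XMLResult result = getXML("test-columnar.ext");
        String xml = result.xml;
        assertHeaders(xml, false, true, false);
        assertContents(xml, true, false);
    }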
1818 import org.apache.tika.MultiThreadedTikaTest;
1919 import org.apache.tika.exception.TikaException;
2020 import org.apache.tika.metadata.Metadata;
21 import org.apache.tika.parser.AutoDetectParser;
2122 import org.apache.tika.parser.ParseContext;
2223 import org.apache.tika.parser.Parser;
24 import org.apache.tika.parser.RecursiveParserWrapper;
2325 import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
2426 import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
2527 import org.apache.tika.parser.chm.core.ChmExtractor;
213215
214216 @Test
215217 public void testMultiThreaded() throws Exception {
216 testMultiThreaded(10, 10, new FileFilter() {
218 ParseContext[] parseContexts = new ParseContext[10];
219 for (int i = 0; i < parseContexts.length; i++) {
220 parseContexts[i] = new ParseContext();
221 }
222 Parser p = new AutoDetectParser();
223 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p);
224 testMultiThreaded(wrapper, parseContexts, 10, 10, new FileFilter() {
217225 @Override
218226 public boolean accept(File pathname) {
219227 if (pathname.getName().toLowerCase(Locale.ENGLISH).endsWith(".chm")) {
2121 import static org.junit.Assert.assertNotNull;
2222 import static org.junit.Assert.fail;
2323
24 import java.io.File;
25 import java.io.FileFilter;
2426 import java.io.IOException;
2527 import java.io.InputStream;
2628 import java.io.NotSerializableException;
2830 import java.util.HashSet;
2931 import java.util.Set;
3032
33 import org.apache.commons.io.filefilter.TrueFileFilter;
34 import org.apache.tika.MultiThreadedTikaTest;
3135 import org.apache.tika.Tika;
3236 import org.apache.tika.TikaTest;
3337 import org.apache.tika.detect.Detector;
3842 import org.apache.tika.parser.EmptyParser;
3943 import org.apache.tika.parser.ParseContext;
4044 import org.apache.tika.parser.Parser;
45 import org.apache.tika.parser.RecursiveParserWrapper;
4146 import org.apache.tika.sax.BodyContentHandler;
47 import org.junit.Ignore;
4248 import org.junit.Test;
4349 import org.xml.sax.ContentHandler;
4450 import org.xml.sax.SAXException;
4753 * Test that the ForkParser correctly behaves when
4854 * wired in to the regular Parsers and their test data
4955 */
50 public class ForkParserIntegrationTest extends TikaTest {
56 public class ForkParserIntegrationTest extends MultiThreadedTikaTest {
5157
5258 private Tika tika = new Tika(); // TODO Use TikaConfig instead, when it works
5359
286292 parser.close();
287293 }
288294 }
295
296 @Test
297 @Ignore("use for development/one off testing. This is a beast and takes enormous resources and time")
298 public void smokeTest() throws Exception {
299 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(tika.getParser());
300 int numThreads = 5;
301 ForkParser parser = new ForkParser(ForkParserIntegrationTest.class.getClassLoader(),
302 wrapper);
303 parser.setServerPulseMillis(500);
304 parser.setServerParseTimeoutMillis(1000);
305 parser.setPoolSize(numThreads);
306 ParseContext[] parseContexts = new ParseContext[numThreads];
307 for (int i = 0; i < numThreads; i++) {
308 parseContexts[i] = new ParseContext();
309 }
310 try {
311 super.testMultiThreaded(parser, parseContexts, numThreads, 5,
312 new FileFilter() {
313 @Override
314 public boolean accept(File pathname) {
315 // for now, restrict the smoke test to the lightweight mock documents.
316 // To run against the full corpus, widen this filter; problem files
317 // (e.g. 11_hang.rar, radar_profiles_2009.mat) can be excluded here.
318 return pathname.getAbsolutePath().contains("mock");
319 }
327 });
328 } catch (Throwable t) {
329 t.printStackTrace();
330 }
331 }
332
289333 }
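The ignored smoke test above exercises the new ForkParser robustness settings: a server heartbeat interval, an absolute per-parse timeout (TIKA-2656), and a pool of forked server processes. A minimal configuration sketch; the millisecond and pool values are illustrative, not recommendations:

    import org.apache.tika.fork.ForkParser;
    import org.apache.tika.parser.AutoDetectParser;

    public class ForkParserSketch {
        public static void main(String[] args) throws Exception {
            ForkParser parser = new ForkParser(
                    ForkParserSketch.class.getClassLoader(), new AutoDetectParser());
            parser.setServerPulseMillis(500);           // watchdog heartbeat for the fork server
            parser.setServerParseTimeoutMillis(60000);  // hard cap on a single parse; illustrative
            parser.setPoolSize(5);                      // number of forked server processes
            try {
                // ... parser.parse(stream, handler, metadata, context) as usual ...
            } finally {
                parser.close();                         // shut the forked JVMs down
            }
        }
    }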
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.parser.html;
18
19
20 import org.apache.tika.metadata.Metadata;
21 import org.junit.Ignore;
22 import org.junit.Test;
23
24 import java.io.ByteArrayInputStream;
25 import java.io.IOException;
26 import java.io.InputStream;
27 import java.nio.charset.Charset;
28 import java.nio.charset.StandardCharsets;
29
30 import static org.junit.Assert.*;
31
32 public class HtmlEncodingDetectorTest {
33
34 @Test
35 public void basic() throws IOException {
36 assertWindows1252("<meta charset='WINDOWS-1252'>");
37 }
38
39 @Test
40 @Ignore("until we can prove this harms detection")
41 public void utf16() throws IOException {
42 // According to the specification 'If charset is a UTF-16 encoding, then set charset to UTF-8.'
43 assertCharset("<meta charset='UTF-16BE'>", StandardCharsets.UTF_8);
44 }
45
46 @Test
47 public void xUserDefined() throws IOException {
48 // According to the specification 'If charset is x-user-defined, then set charset to windows-1252.'
49 assertWindows1252("<meta charset='x-user-defined'>");
50 }
51
52 @Test
53 public void withSlash() throws IOException {
54 assertWindows1252("<meta/charset='WINDOWS-1252'>");
55 }
56
57 @Test
58 @Ignore("until we do a full parse")
59 public void insideTag() throws IOException {
60 assertWindows1252("<meta name='description'" +
61 "content='If I write charset=UTF-8 here, it doesnt mean the page is in UTF-8'/>" +
62 "<meta charset='WINDOWS-1252'>");
63 }
64
65 @Test
66 @Ignore("until we do a full parse")
67 public void missingAttribute() throws IOException {
68 assertWindows1252(
69 "<meta content='charset=UTF-8'>" + // missing http-equiv attribute
70 "<meta charset='WINDOWS-1252'>" // valid declaration
71 );
72 }
73
74 @Test
75 @Ignore("until we do a full parse")
76 public void insideSpecialTag() throws IOException {
77 // Content inside <?, <!, and </ should be ignored
78 for (byte b : "?!/".getBytes(StandardCharsets.US_ASCII))
79 assertWindows1252(
80 "<" + (char) b + // start comment
81 "<meta charset='UTF-8'>" + // inside special tag
82 "<meta charset='WINDOWS-1252'>" // real charset declaration
83 );
84 }
85
86 @Test
87 @Ignore("until we can prove this harms detection")
88 public void spaceBeforeTag() throws IOException {
89 assertWindows1252(
90 "< meta charset='UTF-8'>" + // invalid charset declaration
91 "<meta charset='WINDOWS-1252'>" // real charset declaration
92 );
93 }
94
95 @Test
96 public void invalidAttribute() throws IOException {
97 assertWindows1252(
98 "<meta " +
99 "badcharset='UTF-8' " + // invalid charset declaration
100 "charset='WINDOWS-1252'>" // real charset declaration
101 );
102 }
103
104 @Test
105 @Ignore("until we can prove this harms detection")
106 public void unmatchedQuote() throws IOException {
107 assertWindows1252(
108 "<meta http-equiv='content-type' content='charset=\"UTF-8'>" + // invalid charset declaration
109 "<meta charset='WINDOWS-1252'>" // real charset declaration
110 );
111 }
112
113
114 @Test
115 @Ignore("until we do a full parse")
116 public void withCompactComment() throws IOException {
117 // <!--> is a valid comment
118 assertWindows1252(
119 "<!--" + // start comment
120 "<meta charset='UTF-8'>" + // inside comment
121 "-->" + // end comment
122 "<!-->" + // compact comment
123 "<meta charset='WINDOWS-1252'>" // outside comment, charset declaration
124 );
125 }
126
127 private void assertWindows1252(String html) throws IOException {
128 assertCharset(html, Charset.forName("WINDOWS-1252"));
129 }
130
131 private void assertCharset(String html, Charset charset) throws IOException {
132 assertEquals(html + " should be detected as " + charset,
133 charset, detectCharset(html));
134 }
135
136 private Charset detectCharset(String test) throws IOException {
137 Metadata metadata = new Metadata();
138 InputStream inStream = new ByteArrayInputStream(test.getBytes(StandardCharsets.UTF_8));
139 return new HtmlEncodingDetector().detect(inStream, metadata);
140 }
141 }
7070 import org.apache.tika.parser.ParseContext;
7171 import org.apache.tika.parser.Parser;
7272 import org.apache.tika.parser.RecursiveParserWrapper;
73 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
7374 import org.apache.tika.sax.BodyContentHandler;
7475 import org.apache.tika.sax.LinkContentHandler;
7576 import org.apache.tika.sax.TeeContentHandler;
738739 String content = sw.toString();
739740
740741 // Should have <html>, <head>, <title>, <body> elements
741 assertTrue(Pattern.matches("(?s).*<html xmlns=\"http://www.w3.org/1999/xhtml\">.*</html>.*$", content));
742 assertTrue(Pattern.matches("(?s).*<html xmlns=\"http://www.w3.org/1999/xhtml\" lang=\"en\">.*</html>.*$", content));
742743 assertTrue(Pattern.matches("(?s).*<head>.*</head>.*$", content));
743744 assertTrue(Pattern.matches("(?s).*<title>Title</title>.*$", content));
744745 assertTrue(Pattern.matches("(?s).*<body>.*</body>.*$", content));
869870 String result = handler.toString();
870871
871872 assertTrue(Pattern.matches("\tone\n\n", result));
873 }
874
875 /**
876 * Test case for TIKA-2100
877 * @see <a href="https://issues.apache.org/jira/browse/TIKA-2100">TIKA-2100</a>
878 */
879 @Test
880 public void testHtmlLanguage() throws Exception {
881 final String html = "<html lang=\"fr\"></html>";
882
883 StringWriter sw = new StringWriter();
884 Metadata metadata = new Metadata();
885 new HtmlParser().parse(
886 new ByteArrayInputStream(html.getBytes(UTF_8)),
887 makeHtmlTransformer(sw), metadata, new ParseContext());
888
889 assertEquals("fr", metadata.get(Metadata.CONTENT_LANGUAGE));
890 assertTrue("Missing HTML lang attribute",
891 Pattern.matches("(?s)<html[^>]* lang=\"fr\".*", sw.toString()));
872892 }
873893
874894 /**
12481268 assertEquals(2, metadataList.size());
12491269 assertEquals("MACRO", metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
12501270 assertContains("cool",
1251 metadataList.get(1).get(RecursiveParserWrapper.TIKA_CONTENT));
1252 assertNotContained("cool", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
1271 metadataList.get(1).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
1272 assertNotContained("cool", metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
12531273 }
12541274
12551275 @Test
12621282 assertEquals(2, metadataList.size());
12631283 assertEquals("MACRO", metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
12641284 assertContains("cool",
1265 metadataList.get(1).get(RecursiveParserWrapper.TIKA_CONTENT));
1266 assertNotContained("cool", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
1285 metadataList.get(1).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
1286 assertNotContained("cool", metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
12671287
12681288 }
12691289
12711291 public void testDataURI() throws Exception {
12721292 List<Metadata> metadataList = getRecursiveMetadata("testHTML_embedded_img.html");
12731293 assertEquals(2, metadataList.size());
1274 String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
1294 String content = metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
12751295 assertContains("some content", content);
12761296 //make sure that you've truncated the data: value
12771297 assertContains("src=\"data:\"", content);
12891309 Parser p = new AutoDetectParser(tikaConfig);
12901310 List<Metadata> metadataList = getRecursiveMetadata("testHTML_embedded_img_in_js.html", p);
12911311 assertEquals(3, metadataList.size());
1292 String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
1312 String content = metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
12931313 assertContains("some content", content);
12941314 Metadata imgMetadata = metadataList.get(1);
12951315 assertEquals("image/jpeg", imgMetadata.get(Metadata.CONTENT_TYPE));
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.parser.html;
18
19
20 import org.apache.tika.metadata.Metadata;
21 import org.apache.tika.parser.html.charsetdetector.StandardHtmlEncodingDetector;
22 import org.apache.tika.parser.html.charsetdetector.charsets.ReplacementCharset;
23 import org.junit.Before;
24 import org.junit.Test;
25
26 import java.io.BufferedInputStream;
27 import java.io.ByteArrayInputStream;
28 import java.io.IOException;
29 import java.io.InputStream;
30 import java.io.SequenceInputStream;
31 import java.nio.charset.Charset;
32 import java.nio.charset.StandardCharsets;
33
34 import static org.junit.Assert.assertArrayEquals;
35 import static org.junit.Assert.assertEquals;
36
37 public class StandardHtmlEncodingDetectorTest {
38 private Metadata metadata = new Metadata();
39
40 @Before
41 public void setUp() {
42 this.metadata = new Metadata();
43 }
44
45 @Test
46 public void basic() throws IOException {
47 assertWindows1252("<meta charset=WINDOWS-1252>");
48 }
49
50 @Test
51 public void quoted() throws IOException {
52 assertWindows1252("<meta charset='WINDOWS-1252'>");
53 }
54
55 @Test
56 public void duplicateMeta() throws IOException {
57 assertWindows1252("<meta charset='WINDOWS-1252'>" +
58 "<meta charset='UTF-8'>");
59 }
60
61 @Test
62 public void duplicateAttribute() throws IOException {
63 assertWindows1252("<meta charset='WINDOWS-1252' charset='UTF-8'>");
64 }
65
66 @Test
67 public void invalidThenValid() throws IOException {
68 assertCharset("<meta charset=blah>" +
69 "<meta charset=WINDOWS-1252>", null);
70 }
71
72 @Test
73 public void spacesInAttributes() throws IOException {
74 assertWindows1252("<meta charset\u000C= \t WINDOWS-1252>");
75 }
76
77 @Test
78 public void httpEquiv() throws IOException {
79 assertWindows1252("<meta " +
80 "http-equiv='content-type' " +
81 "content='text/html; charset=\"WINDOWS-1252\"'>"); // quotes around the charset are allowed
82 assertWindows1252("<meta " +
83 "content=' charset = WINDOWS-1252' " + // The charset may be anywhere in the content attribute
84 "http-equiv='content-type' >");
85 }
86
87 @Test
88 public void emptyAttributeEnd() throws IOException {
89 assertWindows1252("<meta charset=WINDOWS-1252 a>");
90 }
91
92 @Test
93 public void httpEquivDuplicateCharset() throws IOException {
94 assertWindows1252("<meta " +
95 "http-equiv='content-type' " +
96 "content='charset=WINDOWS-1252;" + // The detection should stop after the semicolon
97 "charset=UTF-8'>");
98 }
99
100 @Test
101 public void htmlFragment() throws IOException {
102 assertWindows1252("<!doctype html><html class=nojs><head><meta charset='WINDOWS-1252'>");
103 }
104
105 @Test
106 public void veryBadHtml() throws IOException {
107 // check that the parser is not confused by garbage before the declaration
108 assertWindows1252("<< l \" == / '=x\n >" +
109 "<!--> " +
110 "< <x'/ <=> " +
111 "<meta/>" +
112 "<meta>" +
113 "<a x/>" +
114 "<meta charset='WINDOWS-1252'>");
115 }
116
117 @Test
118 public void specialTag() throws IOException {
119 // special tags cannot have arguments, any '>' ends them
120 assertWindows1252("<? x='><meta charset='WINDOWS-1252'>");
121 }
122
123 @Test
124 public void longHtml() throws IOException {
125 StringBuilder sb = new StringBuilder("<!doctype html>\n" +
126 "<html>\n" +
127 "<head>\n" +
128 "<title>Hello world</title>\n");
129 String repeated = "<meta x='y' />\n";
130 String charsetMeta = "<meta charset='windows-1252'>";
131
132 while (sb.length() + repeated.length() + charsetMeta.length() < 1024) sb.append(repeated);
133
134 sb.append(charsetMeta);
135
136 assertWindows1252(sb.toString());
137 }
138
139 @Test
140 public void tooLong() throws IOException {
141 // Create a string with 1 MB of '\0' followed by a meta declaration
142 String padded = new String(new byte[1000000], StandardCharsets.US_ASCII) + "<meta charset='windows-1252'>";
143 // Only the start of the stream is prescanned, so the algorithm should stop before reaching the meta tag
144 assertCharset(padded, null);
145 }
146
147 @Test
148 public void incompleteMeta() throws IOException {
149 assertCharset("<meta charset='WINDOWS-1252'", null); // missing '>' at the end
150 }
151
152 @Test
153 public void charsetWithWhiteSpaces() throws IOException {
154 assertWindows1252("<meta charset=' \t\n WINDOWS-1252 \t\n'>");
155 }
156
157 @Test
158 public void mixedCase() throws IOException {
159 assertWindows1252("<mEtA chArsEt='WInDOWs-1252'>");
160 }
161
162 @Test
163 public void utf16() throws IOException {
164 // According to the specification 'If charset is a UTF-16 encoding, then set charset to UTF-8.'
165 assertCharset("<meta charset='UTF-16BE'>", StandardCharsets.UTF_8);
166 }
167
168 @Test
169 public void xUserDefined() throws IOException {
170 // According to the specification 'If charset is x-user-defined, then set charset to windows-1252.'
171 assertWindows1252("<meta charset='x-user-defined'>");
172 }
173
174 @Test
175 public void replacement() throws IOException {
176 // Several dangerous charsets are aliases of 'replacement' in the spec
177 String inString = "<meta charset='iso-2022-cn'>";
178 assertCharset(new ByteArrayInputStream(inString.getBytes(StandardCharsets.US_ASCII)), new ReplacementCharset());
179 }
180
181 @Test
182 public void iso88591() throws IOException {
183 // In the spec, iso-8859-1 is an alias for WINDOWS-1252
184 assertWindows1252("<meta charset='iso-8859-1'>");
185 }
186
187 @Test
188 public void macintoshEncoding() throws IOException {
189 // The mac roman encoding exists in java, but under the name x-MacRoman
190 assertCharset("<meta charset='macintosh'>", Charset.forName("x-MacRoman"));
191 }
192
193 @Test
194 public void bom() throws IOException {
195 // A BOM should have precedence over the meta
196 assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_8);
197 assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_16LE);
198 assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_16BE);
199 }
200
201 @Test
202 public void withSlash() throws IOException {
203 assertWindows1252("<meta/charset='WINDOWS-1252'>");
204 }
205
206 @Test
207 public void insideDescription() throws IOException {
208 assertWindows1252("<meta name='description'" +
209 "content='If I write charset=UTF-8 here, it doesnt mean the page is in UTF-8'/>" +
210 "<meta charset='WINDOWS-1252'>");
211 }
212
213 @Test
214 public void insideTag() throws IOException {
215 assertWindows1252("<tag " +
216 "attribute=\"<meta charset='UTF-8'>\" " + // inside attribute
217 "<meta charset='UTF-8' " + // still inside tag
218 "/>" + // tag end
219 "<meta charset='WINDOWS-1252'>");
220 }
221
222 @Test
223 public void missingAttribute() throws IOException {
224 assertWindows1252(
225 "<meta content='charset=UTF-8'>" + // missing http-equiv attribute
226 "<meta charset='WINDOWS-1252'>" // valid declaration
227 );
228 }
229
230 @Test
231 public void insideSpecialTag() throws IOException {
232 // Content inside <?, <!, and </ should be ignored
233 for (byte b : "?!/".getBytes(StandardCharsets.US_ASCII))
234 assertWindows1252(
235 "<" + (char) b + // start comment
236 "<meta charset='UTF-8'>" + // inside special tag
237 "<meta charset='WINDOWS-1252'>" // real charset declaration
238 );
239 }
240
241 @Test
242 public void spaceBeforeTag() throws IOException {
243 assertWindows1252(
244 "< meta charset='UTF-8'>" + // invalid charset declaration
245 "<meta charset='WINDOWS-1252'>" // real charset declaration
246 );
247 }
248
249 @Test
250 public void invalidAttribute() throws IOException {
251 assertWindows1252(
252 "<meta " +
253 "badcharset='UTF-8' " + // invalid charset declaration
254 "charset='WINDOWS-1252'>" // real charset declaration
255 );
256 }
257
258 @Test
259 public void unmatchedQuote() throws IOException {
260 assertWindows1252(
261 "<meta http-equiv='content-type' content='charset=\"UTF-8'>" + // invalid charset declaration
262 "<meta charset='WINDOWS-1252'>" // real charset declaration
263 );
264 }
265
266 @Test
267 public void realWorld() throws IOException {
268 assertWindows1252("<!DOCTYPE html>\n" +
269 "<html lang=\"fr\">\n" +
270 "<head>\n" +
271 "<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':\n" +
272 "\t\t\tnew Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],\n" +
273 "\t\t\tj=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=\n" +
274 "\t\t\t'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);\n" +
275 "\t\t\t})(window,document,'script','dataLayer','GTM-PNX8H8X');</script>\n" +
276 "<title>Horaires Transilien 2018 - Lignes A B C D E H J K L N P R U</title>\n" +
277 "<meta name=\"description\" content=\"Consultez les horaires du Transilien en temps réel. Lignes A et B du RER. Lignes C D E H J K L N P R U du Transilien.\">\n" +
278 "<meta name=\"keywords\" content=\"horaires transilien\">\n" +
279 "<meta charset=\"windows-1252\">\n" +
280 "<meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n" +
281 "<meta name=\"robots\" content=\"follow, index\">\n" +
282 "<base hr");
283 }
284
285 @Test
286 public void withCompactComment() throws IOException {
287 // <!--> is a valid comment
288 assertWindows1252(
289 "<!--" + // start comment
290 "<meta charset='UTF-8'>" + // inside comment
291 "-->" + // end comment
292 "<!-->" + // compact comment
293 "<meta charset='WINDOWS-1252'>" // outside comment, charset declaration
294 );
295 }
296
297 @Test
298 public void withCharsetInContentType() throws IOException {
299 metadata.set(Metadata.CONTENT_TYPE, "text/html; Charset=ISO-8859-1");
300 // ISO-8859-1 is an alias for WINDOWS-1252, even if it's set at the transport layer level
301 assertWindows1252("");
302 assertWindows1252("<meta charset='UTF-8'>");
303 assertWindows1252("<meta http-equiv='content-type' content='charset=utf-8'>");
304 // if a BOM is present, it has precedence over transport layer information
305 assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_8);
306 assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_16LE);
307 assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_16BE);
308 }
309
310 @Test
311 public void throwResistance() throws IOException {
312 // The preprocessing should return right after having found the charset
313 // So if an error is thrown in the stream AFTER the declaration,
314 // it shouldn't see it
315 assertWindows1252(throwAfter("<meta charset='WINDOWS-1252'>"));
316 assertWindows1252(throwAfter("<meta charset='WINDOWS-1252'><some other tag"));
317
318 // But if an error is thrown before the end of the meta tag, it should see it
319 // and return unsuccessfully
320 assertCharset(throwAfter("<meta charset='WINDOWS-1252'"), null);
321
322 // If there is no meta, but an error is thrown, the detector simply returns
323 // unsuccessfully (it should not throw runtime errors)
324 assertCharset(throwAfter("<"), null);
325 assertCharset(throwAfter("<!"), null);
326 assertCharset(throwAfter("<!doctype"), null);
327 assertCharset(throwAfter("<!doctype html><html"), null);
328 assertCharset(throwAfter("<!doctype html><html attr"), null);
329 assertCharset(throwAfter("<!doctype html><html attr="), null);
330 assertCharset(throwAfter("<!doctype html><html attr=x"), null);
331 assertCharset(throwAfter("<!doctype html><html attr='x"), null);
332 }
333
334 @Test
335 public void streamReset() throws IOException {
336 // The stream should be reset after detection
337 byte[] inBytes = {0,1,2,3,4};
338 byte[] outBytes = new byte[5];
339 InputStream inStream = new ByteArrayInputStream(inBytes);
340 detectCharset(inStream);
341 // The stream should still be readable from the beginning after detection
342 inStream.read(outBytes);
343 assertArrayEquals(inBytes, outBytes);
344 }
345
346 private void assertWindows1252(String html) throws IOException {
347 assertCharset(html, Charset.forName("WINDOWS-1252"));
348 }
349
350 private void assertWindows1252(InputStream inStream) throws IOException {
351 assertCharset(inStream, Charset.forName("WINDOWS-1252"));
352 }
353
354 private void assertCharset(String html, Charset charset) throws IOException {
355 final Charset contentsCharset = (charset == null) ? StandardCharsets.UTF_8 : charset;
356 InputStream inStream = new ByteArrayInputStream(html.getBytes(contentsCharset));
357 final Charset detected = detectCharset(inStream);
358 assertEquals(html + " should be detected as " + charset, charset, detected);
359 }
360
361 private void assertCharset(InputStream inStream, Charset charset) throws IOException {
362 final Charset detected = detectCharset(inStream);
363 assertEquals(charset, detected);
364 }
365
366 private Charset detectCharset(InputStream inStream) throws IOException {
367 return new StandardHtmlEncodingDetector().detect(inStream, metadata);
368 }
369
370 private InputStream throwAfter(String html) {
371 byte[] contents = html.getBytes(StandardCharsets.UTF_8);
372 InputStream contentsInStream = new ByteArrayInputStream(contents);
373 InputStream errorThrowing = new InputStream() {
374 @Override
375 public int read() throws IOException {
376 throw new IOException("test exception");
377 }
378 };
379 return new BufferedInputStream(new SequenceInputStream(contentsInStream, errorThrowing));
380 }
381 }
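As the tests above establish, the standards-based detector gives a byte-order mark precedence over everything, then honours a transport-layer charset from the Metadata CONTENT_TYPE, and otherwise prescans only the start of the stream for a meta declaration, resetting the stream afterwards. A minimal usage sketch (the HTML snippet is illustrative):

    import java.io.ByteArrayInputStream;
    import java.io.InputStream;
    import java.nio.charset.Charset;
    import java.nio.charset.StandardCharsets;

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.parser.html.charsetdetector.StandardHtmlEncodingDetector;

    public class DetectorSketch {
        public static void main(String[] args) throws Exception {
            byte[] html = "<meta charset='windows-1252'>".getBytes(StandardCharsets.US_ASCII);
            try (InputStream in = new ByteArrayInputStream(html)) {
                Charset detected = new StandardHtmlEncodingDetector().detect(in, new Metadata());
                System.out.println(detected); // windows-1252; null would mean no declaration found
                // the stream has been reset, so it can be handed on to a parser
            }
        }
    }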
2020
2121 import java.io.InputStream;
2222
23 import org.apache.tika.TikaTest;
2324 import org.apache.tika.metadata.Metadata;
2425 import org.apache.tika.parser.AutoDetectParser;
2526 import org.apache.tika.parser.ParseContext;
3031 /**
3132 * Test cases to exercise the {@link MatParser}.
3233 */
33 public class MatParserTest {
34 public class MatParserTest extends TikaTest {
3435 @Test
3536 public void testParser() throws Exception {
3637 AutoDetectParser parser = new AutoDetectParser();
3839 Metadata metadata = new Metadata();
3940 String path = "/test-documents/breidamerkurjokull_radar_profiles_2009.mat";
4041
41 try (InputStream stream = MatParser.class.getResourceAsStream(path)) {
42 try (InputStream stream = getResourceAsStream(path)) {
4243 parser.parse(stream, handler, metadata, new ParseContext());
4344 }
4445
6869 Metadata metadata = new Metadata();
6970 String path = "/test-documents/test_mat_text.mat";
7071
71 try (InputStream stream = MatParser.class.getResourceAsStream(path)) {
72 try (InputStream stream = getResourceAsStream(path)) {
7273 parser.parse(stream, handler, metadata, new ParseContext());
7374 }
7475
2929 import org.apache.tika.parser.ParseContext;
3030 import org.apache.tika.parser.Parser;
3131 import org.apache.tika.parser.RecursiveParserWrapper;
32 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
3233 import org.apache.tika.sax.BodyContentHandler;
3334 import org.junit.Before;
3435 import org.junit.Test;
4041 private Parser autoDetectParser;
4142 private TypeDetector typeDetector;
4243 private MboxParser mboxParser;
43
44 private static InputStream getStream(String name) {
45 return MboxParserTest.class.getClass().getResourceAsStream(name);
46 }
4744
4845 @Before
4946 public void setUp() throws Exception {
6158 ContentHandler handler = new BodyContentHandler();
6259 Metadata metadata = new Metadata();
6360
64 try (InputStream stream = getStream("/test-documents/simple.mbox")) {
61 try (InputStream stream = getResourceAsStream("/test-documents/simple.mbox")) {
6562 mboxParser.parse(stream, handler, metadata, recursingContext);
6663 }
6764
8784 ContentHandler handler = new BodyContentHandler();
8885 Metadata metadata = new Metadata();
8986
90 try (InputStream stream = getStream("/test-documents/headers.mbox")) {
87 try (InputStream stream = getResourceAsStream("/test-documents/headers.mbox")) {
9188 mboxParser.parse(stream, handler, metadata, recursingContext);
9289 }
9390
110107 ContentHandler handler = new BodyContentHandler();
111108 Metadata metadata = new Metadata();
112109
113 try (InputStream stream = getStream("/test-documents/multiline.mbox")) {
110 try (InputStream stream = getResourceAsStream("/test-documents/multiline.mbox")) {
114111 mboxParser.parse(stream, handler, metadata, recursingContext);
115112 }
116113
125122 ContentHandler handler = new BodyContentHandler();
126123 Metadata metadata = new Metadata();
127124
128 try (InputStream stream = getStream("/test-documents/quoted.mbox")) {
125 try (InputStream stream = getResourceAsStream("/test-documents/quoted.mbox")) {
129126 mboxParser.parse(stream, handler, metadata, recursingContext);
130127 }
131128
138135 ContentHandler handler = new BodyContentHandler();
139136 Metadata metadata = new Metadata();
140137
141 try (InputStream stream = getStream("/test-documents/complex.mbox")) {
138 try (InputStream stream = getResourceAsStream("/test-documents/complex.mbox")) {
142139 mboxParser.parse(stream, handler, metadata, recursingContext);
143140 }
144141
161158 ParseContext context = new ParseContext();
162159 context.set(Parser.class, new AutoDetectParser());
163160
164 try (InputStream stream = getStream("/test-documents/single_mail.mbox")) {
161 try (InputStream stream = getResourceAsStream("/test-documents/single_mail.mbox")) {
165162 mboxParser.parse(stream, handler, metadata, context);
166163 }
167164
175172 assertEquals(2, metadataList.size());
176173 assertEquals("application/mbox", metadataList.get(0).get(Metadata.CONTENT_TYPE));
177174 assertEquals("message/rfc822", metadataList.get(1).get(Metadata.CONTENT_TYPE));
178 assertContains("body 2", metadataList.get(1).get(RecursiveParserWrapper.TIKA_CONTENT));
179 assertNotContained("body 1", metadataList.get(1).get(RecursiveParserWrapper.TIKA_CONTENT));
175 assertContains("body 2", metadataList.get(1).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
176 assertNotContained("body 1", metadataList.get(1).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
180177 }
181178 }
1919 import static org.junit.Assert.assertTrue;
2020 import static org.junit.Assert.fail;
2121
22 import java.io.File;
2322 import java.io.InputStream;
2423 import java.text.DecimalFormatSymbols;
2524 import java.util.List;
8079 assertNotContained("9.0", content);
8180 assertContains("196", content);
8281 assertNotContained("196.0", content);
82
83
84 // Won't include missing rows by default
85 assertContains("Numbers and their Squares\n\t\tNumber", content);
86 assertContains("\tSquare\n\t\t1", content);
87 }
88
89 // Request with missing rows
90 try (InputStream input = ExcelParserTest.class.getResourceAsStream(
91 "/test-documents/testEXCEL.xls")) {
92 OfficeParserConfig config = new OfficeParserConfig();
93 config.setIncludeMissingRows(true);
94
95 Metadata metadata = new Metadata();
96 ContentHandler handler = new BodyContentHandler();
97 ParseContext context = new ParseContext();
98 context.set(Locale.class, Locale.US);
99 context.set(OfficeParserConfig.class, config);
100 new OfficeParser().parse(input, handler, metadata, context);
101
102 // Will now have the missing rows, each with a single empty cell
103 String content = handler.toString();
104 assertContains("Numbers and their Squares\n\t\n\t\n\t\tNumber", content);
105 assertContains("\tSquare\n\t\n\t\t1", content);
83106 }
84107 }
85108
3535 import org.apache.tika.parser.PasswordProvider;
3636 import org.apache.tika.parser.RecursiveParserWrapper;
3737 import org.apache.tika.sax.BasicContentHandlerFactory;
38 import org.apache.tika.sax.RecursiveParserWrapperHandler;
3839 import org.junit.Test;
3940 import org.xml.sax.helpers.DefaultHandler;
4041
4546
4647 Parser p = new AutoDetectParser();
4748
48 RecursiveParserWrapper w = new RecursiveParserWrapper(p,
49 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
49 RecursiveParserWrapper w = new RecursiveParserWrapper(p);
5050
5151 for (String fName : new String[]{"testAccess2.accdb", "testAccess2_2000.mdb",
5252 "testAccess2_2002-2003.mdb"}) {
5353 InputStream is = null;
54 RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
55 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)
56 );
5457 try {
5558 is = this.getResourceAsStream("/test-documents/" + fName);
5659
5760 Metadata meta = new Metadata();
5861 ParseContext c = new ParseContext();
59 w.parse(is, new DefaultHandler(), meta, c);
62 w.parse(is, handler, meta, c);
6063 } finally {
6164 IOUtils.closeQuietly(is);
6265 }
63 List<Metadata> list = w.getMetadata();
66 List<Metadata> list = handler.getMetadataList();
6467 assertEquals(4, list.size());
6568 String mainContent = list.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
6669
7174 assertContains("<th>ShortTextField</th>", mainContent);
7275
7376 //test date format
74 assertContains("6/24/15", mainContent);
77 // Java 8 renders this date as 6/24/15; Java 10 renders it as 2015-06-24
78 assertTrue(mainContent.contains("6/24/15") || mainContent.contains("2015-06-24"));
7579
7680 //test that markup is stripped
7781 assertContains("over the bold italic dog", mainContent);
8286 //test embedded document handling
8387 assertContains("Test Document with embedded pdf",
8488 list.get(3).get(RecursiveParserWrapper.TIKA_CONTENT));
85
86 w.reset();
8789 }
8890 }
8991
270270
271271 // As the HTML version should have been processed, ensure
272272 // we got some of the links
273 String content = sw.toString().replaceAll("<p>\\s+", "<p>");
273 String content = sw.toString().replaceAll("[\\r\\n\\t]+", " ").replaceAll(" +", " ");
274274 assertContains("<dd>New Outlook User</dd>", content);
275275 assertContains("designed <i>to help you", content);
276 assertContains("<p><a href=\"http://r.office.microsoft.com/r/rlidOutlookWelcomeMail10?clid=1033\">Cached Exchange Mode</a>", content);
276 assertContains("<p> <a href=\"http://r.office.microsoft.com/r/rlidOutlookWelcomeMail10?clid=1033\">Cached Exchange Mode</a>", content);
277277
278278 // Link - check text around it, and the link itself
279279 assertContains("sign up for a free subscription", content);
136136 // Make sure boilerplate text didn't come through:
137137 assertEquals(-1, content.indexOf("Click to edit Master"));
138138
139 //TIKA-1171
139 //TIKA-1171, POI-62591
140140 assertEquals(-1, content.indexOf("*"));
141141 }
142142
160160 // Make sure boilerplate text didn't come through:
161161 assertEquals(-1, content.indexOf("Click to edit Master"));
162162
163 //TIKA-1171
163 //TIKA-1171, POI-62591
164164 assertEquals(-1, content.indexOf("*"));
165165 }
166166
179179
180180 // Make sure boilerplate text didn't come through:
181181 assertEquals(-1, content.indexOf("Click to edit Master"));
182 //TIKA-1171
182 //TIKA-1171, POI-62591
183183 assertEquals(-1, content.indexOf("*"));
184184 }
185185
2828
2929 @Test
3030 public void testTextExtractionWindows() throws Exception {
31 List<Metadata> metadataList = getRecursiveMetadata("testXLSX_Thumbnail.xlsx");
32 Metadata wmfMetadata = metadataList.get(1);
33 assertEquals("image/wmf", wmfMetadata.get(Metadata.CONTENT_TYPE));
34 assertContains("This file contains an embedded thumbnail",
35 wmfMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
31 testTextExtraction("testXLSX_Thumbnail.xlsx", 1, "This file contains an embedded thumbnail");
3632 }
3733
38 //TODO fix wmf text extraction in "testRTFEmbeddedFiles.rtf"
39 //Chinese is garbled.
34 @Test
35 public void testTextExtractionShiftJISEncoding() throws Exception {
36 testTextExtraction("testWMF_charset.wmf", 0, "普林斯");
37 }
38
39 private void testTextExtraction(String fileName, int metaDataItemIndex, String expectedText) throws Exception {
40 List<Metadata> metadataList = getRecursiveMetadata(fileName);
41 Metadata wmfMetadata = metadataList.get(metaDataItemIndex);
42
43 assertEquals("image/wmf", wmfMetadata.get(Metadata.CONTENT_TYPE));
44 assertContains(expectedText, wmfMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
45 }
4046 }
4147
18261826 assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
18271827 xlsx.get(Metadata.CONTENT_TYPE));
18281828 }
1829
1830 @Test(expected = org.apache.tika.exception.TikaException.class)
1831 public void testCorruptedZip() throws Exception {
1832 //TIKA-2446
1833 getRecursiveMetadata("testZIP_corrupted_oom.zip");
1834 }
18291835 }
18301836
18311837
1717
1818 import static org.apache.tika.parser.ocr.TesseractOCRParser.getTesseractProg;
1919 import static org.junit.Assert.assertEquals;
20 import static org.junit.Assert.assertNotNull;
2021 import static org.junit.Assert.assertTrue;
2122 import static org.junit.Assume.assumeTrue;
2223
2526 import java.util.regex.Matcher;
2627 import java.util.regex.Pattern;
2728
29 import org.apache.tika.Tika;
2830 import org.apache.tika.TikaTest;
31 import org.apache.tika.config.TikaConfig;
2932 import org.apache.tika.metadata.Metadata;
3033 import org.apache.tika.mime.MediaType;
3134 import org.apache.tika.parser.AutoDetectParser;
35 import org.apache.tika.parser.CompositeParser;
3236 import org.apache.tika.parser.DefaultParser;
3337 import org.apache.tika.parser.ParseContext;
3438 import org.apache.tika.parser.Parser;
3640 import org.apache.tika.parser.external.ExternalParser;
3741 import org.apache.tika.parser.image.ImageParser;
3842 import org.apache.tika.parser.pdf.PDFParserConfig;
43 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
3944 import org.apache.tika.sax.BasicContentHandlerFactory;
4045 import org.junit.Test;
4146 import org.xml.sax.helpers.DefaultHandler;
179184
180185 StringBuilder contents = new StringBuilder();
181186 for (Metadata m : metadataList) {
182 contents.append(m.get(RecursiveParserWrapper.TIKA_CONTENT));
187 contents.append(m.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
183188 }
184189
185190 for (String needle : nonOCRContains) {
301306 assertContains("Its had resolving otherwise she contented therefore", ocr);
302307 }
303308 }
309
310 @Test
311 public void testConfig() throws Exception {
312 TikaConfig config = new TikaConfig(getResourceAsStream("/org/apache/tika/config/TIKA-2705-tesseract.xml"));
313 Parser p = config.getParser();
314 Parser tesseractOCRParser = findParser(p, org.apache.tika.parser.ocr.TesseractOCRParser.class);
315 assertNotNull(tesseractOCRParser);
316
317 TesseractOCRConfig tesseractOCRConfig = ((TesseractOCRParser)tesseractOCRParser).getDefaultConfig();
318 assertEquals(241, tesseractOCRConfig.getTimeout());
319 assertEquals(TesseractOCRConfig.OUTPUT_TYPE.HOCR, tesseractOCRConfig.getOutputType());
320 assertEquals("ceb", tesseractOCRConfig.getLanguage());
321 assertEquals(false, tesseractOCRConfig.getApplyRotation());
322 assertContains("myspecial", tesseractOCRConfig.getTesseractPath());
323 }
324
325 private Parser findParser(Parser parser, Class clazz) {
326 if (parser instanceof CompositeParser) {
327 for (Parser child : ((CompositeParser)parser).getAllComponentParsers()) {
328 Parser found = findParser(child, clazz);
329 if (found != null) {
330 return found;
331 }
332 }
333 } else if (clazz.isInstance(parser)) {
334 return parser;
335 }
336 return null;
337 }
304338 }
1818 import static org.junit.Assert.assertEquals;
1919 import static org.junit.Assert.assertTrue;
2020
21 import java.io.IOException;
2122 import java.io.InputStream;
2223 import java.util.List;
2324
405406 assertEquals(3, metadataList.size());
406407 }
407408
409 @Test(expected = IOException.class)
410 public void testInvalidFromStream() throws Exception {
411 try (InputStream is = this.getClass().getResource(
412 "/test-documents/testODTnotaZipFile.odt").openStream()) {
413 OpenDocumentParser parser = new OpenDocumentParser();
414 Metadata metadata = new Metadata();
415 ContentHandler handler = new BodyContentHandler();
416 parser.parse(is, handler, metadata, new ParseContext());
417 }
418 }
419
420 @Test(expected = IOException.class)
421 public void testInvalidFromFile() throws Exception {
422 try (TikaInputStream tis = TikaInputStream.get(this.getClass().getResource(
423 "/test-documents/testODTnotaZipFile.odt"))) {
424 OpenDocumentParser parser = new OpenDocumentParser();
425 Metadata metadata = new Metadata();
426 ContentHandler handler = new BodyContentHandler();
427 parser.parse(tis, handler, metadata, new ParseContext());
428 }
429 }
430
408431 private ParseContext getNonRecursingParseContext() {
409432 ParseContext parseContext = new ParseContext();
410433 parseContext.set(Parser.class, new EmptyParser());
12461246 context.set(Parser.class, new AutoDetectParser());
12471247 //make sure everything works with regular xml _and_ with recursive
12481248 XMLResult xmlResult = getXML("testPDFEmbeddingAndEmbedded.docx", context);
1249 assertContains("pdf_haystack", xmlResult.xml);
1249 //OCR can yield "dehaystack" instead of "pdf_haystack", depending on the Tesseract version and/or image preprocessing
1250 if (xmlResult.xml.contains("pdf_haystack") || xmlResult.xml.contains("dehaystack")) {
1251 //great
1252 } else {
1253 fail("couldn't find pdf_haystack or its variants");
1254 }
12501255 assertContains("Haystack", xmlResult.xml);
12511256 assertContains("Needle", xmlResult.xml);
12521257 if (! strategy.equals(PDFParserConfig.OCR_STRATEGY.NO_OCR)) {
12531258 // Tesseract may see the t in haystack as a ! some times...
1254 String div = "<div class=\"ocr\">pdf_hays";
1255 if (xmlResult.xml.contains(div+"!ack")) {
1256 assertContains(div+"!ack", xmlResult.xml);
1259 //or it might see dehayslack...
1260 //TODO: figure out how to make this test less hacky
1261 String div = "<div class=\"ocr\">";
1262 if (xmlResult.xml.contains(div+"pdf_hays!ack")) {
1263 } else if (xmlResult.xml.contains(div+"pdf_haystack")) {
1264 } else if (xmlResult.xml.contains(div+"dehayslack")) {
12571265 } else {
1258 assertContains(div+"tack", xmlResult.xml);
1266 fail("couldn't find acceptable variants of haystack");
12591267 }
12601268 } else {
12611269 assertNotContained("<div class=\"ocr\">pdf_haystack", xmlResult.xml);
3737 import org.apache.tika.parser.ParseContext;
3838 import org.apache.tika.parser.Parser;
3939 import org.apache.tika.parser.RecursiveParserWrapper;
40 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
4041 import org.apache.tika.sax.BodyContentHandler;
4142 import org.junit.Test;
4243 import org.xml.sax.ContentHandler;
211212 assertContains("EncryptedDocumentException: stream (encrypted.txt) is encrypted", values[0]);
212213
213214
214 assertContains("hello world", metadataList.get(1).get(RecursiveParserWrapper.TIKA_CONTENT));
215 assertContains("hello world", metadataList.get(1).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
215216 }
216217
217218 @Test
4848 import org.apache.tika.parser.ParseContext;
4949 import org.apache.tika.parser.Parser;
5050 import org.apache.tika.parser.RecursiveParserWrapper;
51 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
5152 import org.apache.tika.sax.BasicContentHandlerFactory;
5253 import org.apache.tika.sax.BodyContentHandler;
54 import org.apache.tika.sax.RecursiveParserWrapperHandler;
5355 import org.apache.tika.sax.WriteOutContentHandler;
5456 import org.junit.Test;
5557 import org.xml.sax.ContentHandler;
443445 //directory: _1457338524/HW.txt
444446 assertEquals("filename equals ",
445447 p.fileName, FilenameUtils.getName(
446 metadata.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH)));
448 metadata.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH)));
447449
448450 assertEquals(p.mimeType, metadata.get(Metadata.CONTENT_TYPE));
449451 }
456458 public void testRegularImages() throws Exception {
457459 Parser base = new AutoDetectParser();
458460 ParseContext ctx = new ParseContext();
459 RecursiveParserWrapper parser = new RecursiveParserWrapper(base,
460 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
461 ContentHandler handler = new BodyContentHandler();
461 RecursiveParserWrapper parser = new RecursiveParserWrapper(base);
462 RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
463 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1),-1);
462464 Metadata rootMetadata = new Metadata();
463465 rootMetadata.add(Metadata.RESOURCE_NAME_KEY, "testRTFRegularImages.rtf");
464466 try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFRegularImages.rtf"))) {
465467 parser.parse(tis, handler, rootMetadata, ctx);
466468 }
467 List<Metadata> metadatas = parser.getMetadata();
469 List<Metadata> metadatas = handler.getMetadataList();
468470
469471 Metadata meta_jpg_exif = metadatas.get(1);//("testJPEG_EXIF_\u666E\u6797\u65AF\u987F.jpg");
470472 Metadata meta_jpg = metadatas.get(3);//("testJPEG_\u666E\u6797\u65AF\u987F.jpg");
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.parser.sas;
17
18 import static org.junit.Assert.assertEquals;
19
20 import java.io.InputStream;
21 import java.util.Arrays;
22
23 import org.apache.tika.TikaTest;
24 import org.apache.tika.metadata.Database;
25 import org.apache.tika.metadata.HttpHeaders;
26 import org.apache.tika.metadata.Metadata;
27 import org.apache.tika.metadata.OfficeOpenXMLExtended;
28 import org.apache.tika.metadata.PagedText;
29 import org.apache.tika.metadata.TikaCoreProperties;
30 import org.apache.tika.parser.AutoDetectParser;
31 import org.apache.tika.parser.ParseContext;
32 import org.apache.tika.parser.Parser;
33 import org.apache.tika.parser.executable.MachineMetadata;
34 import org.apache.tika.sax.BodyContentHandler;
35 import org.junit.Test;
36 import org.xml.sax.ContentHandler;
37
38 public class SAS7BDATParserTest extends TikaTest {
39 private Parser parser = new SAS7BDATParser();
40
41 @Test
42 public void testSimpleFile() throws Exception {
43 ContentHandler handler = new BodyContentHandler();
44 Metadata metadata = new Metadata();
45
46 try (InputStream stream = SAS7BDATParserTest.class.getResourceAsStream(
47 "/test-documents/testSAS.sas7bdat")) {
48 parser.parse(stream, handler, metadata, new ParseContext());
49 }
50
51 assertEquals("application/x-sas-data", metadata.get(Metadata.CONTENT_TYPE));
52 assertEquals("TESTING", metadata.get(TikaCoreProperties.TITLE));
53
54 // Mon Jan 30 07:31:47 GMT 2017
55 assertEquals("2017-01-30T07:31:47Z", metadata.get(TikaCoreProperties.CREATED));
56 assertEquals("2017-01-30T07:31:47Z", metadata.get(TikaCoreProperties.MODIFIED));
57
58 assertEquals("1", metadata.get(PagedText.N_PAGES));
59 assertEquals("2", metadata.get(Database.COLUMN_COUNT));
60 assertEquals("11", metadata.get(Database.ROW_COUNT));
61 assertEquals("windows-1252", metadata.get(HttpHeaders.CONTENT_ENCODING));
62 assertEquals("W32_7PRO", metadata.get(OfficeOpenXMLExtended.APPLICATION));
63 assertEquals("9.0301M2", metadata.get(OfficeOpenXMLExtended.APP_VERSION));
64 assertEquals("32", metadata.get(MachineMetadata.ARCHITECTURE_BITS));
65 assertEquals("Little", metadata.get(MachineMetadata.ENDIAN));
66 assertEquals(Arrays.asList("recnum","label"),
67 Arrays.asList(metadata.getValues(Database.COLUMN_NAME)));
68
69 String content = handler.toString();
70 assertContains("TESTING", content);
71 assertContains("\t3\t", content);
72 assertContains("\t10\t", content);
73 assertContains("\tThis is row", content);
74 assertContains(" of ", content);
75 }
76
77 @Test
78 public void testMultiColumns() throws Exception {
79 Parser parser = new AutoDetectParser(); // Should auto-detect!
80 ContentHandler handler = new BodyContentHandler();
81 Metadata metadata = new Metadata();
82
83 try (InputStream stream = SAS7BDATParserTest.class.getResourceAsStream(
84 "/test-documents/test-columnar.sas7bdat")) {
85 parser.parse(stream, handler, metadata, new ParseContext());
86 }
87
88 assertEquals("application/x-sas-data", metadata.get(Metadata.CONTENT_TYPE));
89 assertEquals("TESTING", metadata.get(TikaCoreProperties.TITLE));
90
91 assertEquals("2018-05-18T11:38:30Z", metadata.get(TikaCoreProperties.CREATED));
92 assertEquals("2018-05-18T11:38:30Z", metadata.get(TikaCoreProperties.MODIFIED));
93
94 assertEquals("1", metadata.get(PagedText.N_PAGES));
95 assertEquals("8", metadata.get(Database.COLUMN_COUNT));
96 assertEquals("11", metadata.get(Database.ROW_COUNT));
97 assertEquals("windows-1252", metadata.get(HttpHeaders.CONTENT_ENCODING));
98 assertEquals("X64_7PRO", metadata.get(OfficeOpenXMLExtended.APPLICATION));
99 assertEquals("9.0401M5", metadata.get(OfficeOpenXMLExtended.APP_VERSION));
100 assertEquals("32", metadata.get(MachineMetadata.ARCHITECTURE_BITS));
101 assertEquals("Little", metadata.get(MachineMetadata.ENDIAN));
102 assertEquals(Arrays.asList("Record Number","Square of the Record Number",
103 "Description of the Row","Percent Done",
104 "Percent Increment","date","datetime","time"),
105 Arrays.asList(metadata.getValues(Database.COLUMN_NAME)));
106
107 String content = handler.toString();
108 assertContains("TESTING", content);
109 assertContains("0\t0\tThis", content);
110 assertContains("2\t4\tThis", content);
111 assertContains("4\t16\tThis", content);
112 assertContains("\t01-01-1960\t", content);
113 assertContains("\t01Jan1960:00:00", content);
114 }
115
116 @Test
117 public void testHTML() throws Exception {
118 XMLResult result = getXML("testSAS.sas7bdat");
119 String xml = result.xml;
120
121 // Check the title came through
122 assertContains("<h1>TESTING</h1>", xml);
123 // Check the headings
124 assertContains("<th title=\"recnum\">recnum</th>", xml);
125 assertContains("<th title=\"label\">label</th>", xml);
126 // Check some rows
127 assertContains("<td>3</td>", xml);
128 assertContains("<td>This is row", xml);
129 assertContains("10</td>", xml);
130 }
131
132 @Test
133 public void testHTML2() throws Exception {
134 XMLResult result = getXML("test-columnar.sas7bdat");
135 String xml = result.xml;
136
137 // Check the title came through
138 assertContains("<h1>TESTING</h1>", xml);
139 // Check the headings
140 assertContains("<th title=\"recnum\">Record Number</th>", xml);
141 assertContains("<th title=\"square\">Square of the Record Number</th>", xml);
142 assertContains("<th title=\"date\">date</th>", xml);
143 // Check formatting of dates
144 assertContains("<td>01-01-1960</td>", xml);
145 assertContains("<td>01Jan1960:00:00:10.00</td>", xml);
146 }
147 }
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.18</version>
27 <version>1.19</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030
6767 <plugin>
6868 <groupId>org.apache.maven.plugins</groupId>
6969 <artifactId>maven-jar-plugin</artifactId>
70 <configuration>
71 <archive>
72 <manifestEntries>
73 <Automatic-Module-Name>org.apache.tika.serialization</Automatic-Module-Name>
74 </manifestEntries>
75 </archive>
76 </configuration>
7077 <executions>
7178 <execution>
7279 <goals>
2727 import com.google.gson.reflect.TypeToken;
2828 import org.apache.tika.exception.TikaException;
2929 import org.apache.tika.metadata.Metadata;
30 import org.apache.tika.sax.RecursiveParserWrapperHandler;
3031
3132 public class JsonMetadataList extends JsonMetadataBase {
3233
6970 //covers both io and parse exceptions
7071 throw new TikaException(e.getMessage());
7172 }
73 if (ms == null) {
74 return null;
75 }
76 //if the last object is the main document,
77 //as happens with the streaming serializer,
78 //flip it to be the first element.
79 if (ms.size() > 1) {
80 Metadata last = ms.get(ms.size()-1);
81 String embResourcePath = last.get(RecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH);
82 if (embResourcePath == null &&
83 ms.get(0).get(RecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH) != null) {
84 ms.add(0, ms.remove(ms.size()-1));
85 }
86 }
7287 return ms;
7388 }
7489
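Illustrative sketch (not from the Tika sources) of the flip above, assuming hypothetical Metadata objects embedded1 (which carries an embedded resource path) and mainDoc (which does not):

    List<Metadata> ms = new ArrayList<>();
    ms.add(embedded1);
    ms.add(mainDoc); // the streaming serializer writes the container document last
    // fromJson: the last element has no EMBEDDED_RESOURCE_PATH but the first does,
    // so the container document is moved back to index 0
    ms.add(0, ms.remove(ms.size() - 1)); // ms is now [mainDoc, embedded1]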
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.metadata.serialization;
17
18 import com.google.gson.stream.JsonWriter;
19 import org.apache.tika.metadata.Metadata;
20
21 import java.io.IOException;
22 import java.io.Writer;
23 import java.util.Arrays;
24
25
26 public class JsonStreamingSerializer implements AutoCloseable {
27
28 private final JsonWriter jsonWriter;
29 boolean hasStartedArray = false;
30 public JsonStreamingSerializer(Writer writer) {
31 this.jsonWriter = new JsonWriter(writer);
32 }
33
34 public void add(Metadata metadata) throws IOException {
35 if (!hasStartedArray) {
36 jsonWriter.beginArray();
37 hasStartedArray = true;
38 }
39 String[] names = metadata.names();
40 Arrays.sort(names);
41 jsonWriter.beginObject();
42 for (String n : names) {
43 jsonWriter.name(n);
44 String[] values = metadata.getValues(n);
45 if (values.length == 1) {
46 jsonWriter.value(values[0]);
47 } else {
48 jsonWriter.beginArray();
49 for (String v : values) {
50 jsonWriter.value(v);
51 }
52 jsonWriter.endArray();
53 }
54 }
55 jsonWriter.endObject();
56 }
57
58 @Override
59 public void close() throws IOException {
60 jsonWriter.endArray();
61 jsonWriter.flush();
62 jsonWriter.close();
63 }
64 }
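A minimal usage sketch of the streaming serializer (illustrative, not from the Tika sources; it additionally needs java.io.StringWriter, java.io.StringReader, java.util.List and org.apache.tika.sax.RecursiveParserWrapperHandler):

    StringWriter writer = new StringWriter();
    Metadata embedded = new Metadata();
    embedded.add(RecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH, "/embedded-1");
    Metadata container = new Metadata();
    container.add("dc:title", "example"); // hypothetical key/value
    // close() ends the JSON array and flushes, so use try-with-resources
    try (JsonStreamingSerializer serializer = new JsonStreamingSerializer(writer)) {
        serializer.add(embedded);
        serializer.add(container); // container document goes last when streaming
    }
    List<Metadata> roundTripped =
        JsonMetadataList.fromJson(new StringReader(writer.toString()));
    // fromJson restores the container document to index 0 (see JsonMetadataList above)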
2525 return -1;
2626 }
2727
28 //this is stinky. This should reference RecursiveParserWrapper.TIKA_CONTENT
28 //this is stinky. This should reference AbstractRecursiveParserWrapperHandler.TIKA_CONTENT
2929 //but that would require making core a dependency of serialization...
3030 //do we want to do that?
3131 if (s1.equals("tika:content")) {
2020 import static org.junit.Assert.assertNull;
2121 import static org.junit.Assert.assertTrue;
2222
23 import java.io.Reader;
2324 import java.io.StringReader;
2425 import java.io.StringWriter;
26 import java.util.ArrayList;
2527 import java.util.LinkedList;
2628 import java.util.List;
2729
2830 import org.apache.tika.metadata.Metadata;
31 import org.apache.tika.sax.RecursiveParserWrapperHandler;
2932 import org.junit.Test;
3033
3134 public class JsonMetadataListTest {
5659 JsonMetadataList.toJson(metadataList, writer);
5760 List<Metadata> deserialized = JsonMetadataList.fromJson(new StringReader(writer.toString()));
5861 assertEquals(metadataList, deserialized);
62
63 //now test streaming serializer
64 writer = new StringWriter();
65 try(JsonStreamingSerializer streamingSerializer = new JsonStreamingSerializer(writer)) {
66 streamingSerializer.add(m1);
67 streamingSerializer.add(m2);
68 }
69 deserialized = JsonMetadataList.fromJson(new StringReader(writer.toString()));
70 assertEquals(metadataList, deserialized);
71
5972 }
6073
6174 @Test
119132 JsonMetadataList.toJson(metadataList, writer);
120133 assertTrue(writer.toString().startsWith("[{\"tika:content\":\"this is the content\",\"zk1\":[\"v1\",\"v2\","));
121134 }
135
136 @Test
137 public void testSwitchingOrderOfMainDoc() throws Exception {
138 Metadata m1 = new Metadata();
139 m1.add("k1", "v1");
140 m1.add("k1", "v2");
141 m1.add("k1", "v3");
142 m1.add("k1", "v4");
143 m1.add("k1", "v4");
144 m1.add("k2", "v1");
145 m1.add(RecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH, "/embedded-1");
146
147 Metadata m2 = new Metadata();
148 m2.add("k3", "v1");
149 m2.add("k3", "v2");
150 m2.add("k3", "v3");
151 m2.add("k3", "v4");
152 m2.add("k3", "v4");
153 m2.add("k4", "v1");
154
155 List<Metadata> truth = new ArrayList<>();
156 truth.add(m2);
157 truth.add(m1);
158 StringWriter stringWriter = new StringWriter();
159 try(JsonStreamingSerializer serializer = new JsonStreamingSerializer(stringWriter)) {
160 serializer.add(m1);
161 serializer.add(m2);
162 }
163 Reader reader = new StringReader(stringWriter.toString());
164 List<Metadata> deserialized = JsonMetadataList.fromJson(reader);
165 assertEquals(truth, deserialized);
166
167 }
122168 }
1919 <parent>
2020 <groupId>org.apache.tika</groupId>
2121 <artifactId>tika-parent</artifactId>
22 <version>1.18</version>
22 <version>1.19</version>
2323 <relativePath>../tika-parent/pom.xml</relativePath>
2424 </parent>
2525
5353 <groupId>${project.groupId}</groupId>
5454 <artifactId>tika-langdetect</artifactId>
5555 <version>${project.version}</version>
56 <exclusions>
57 <exclusion>
58 <groupId>javax.activation</groupId>
59 <artifactId>activation</artifactId>
60 </exclusion>
61 </exclusions>
5662 </dependency>
5763 <dependency>
5864 <groupId>${project.groupId}</groupId>
9096 <artifactId>cxf-rt-rs-security-cors</artifactId>
9197 <version>${cxf.version}</version>
9298 </dependency>
93 <dependency>
94 <groupId>javax.mail</groupId>
95 <artifactId>mail</artifactId>
96 <version>1.4.4</version>
97 </dependency>
99
98100 <dependency>
99101 <groupId>commons-cli</groupId>
100102 <artifactId>commons-cli</artifactId>
274276 <plugin>
275277 <groupId>org.apache.maven.plugins</groupId>
276278 <artifactId>maven-jar-plugin</artifactId>
279 <configuration>
280 <archive>
281 <manifestEntries>
282 <Automatic-Module-Name>org.apache.tika.server</Automatic-Module-Name>
283 </manifestEntries>
284 </archive>
285 </configuration>
277286 <executions>
278287 <execution>
279288 <goals>
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.server;
17
18 import org.slf4j.Logger;
19 import org.slf4j.LoggerFactory;
20
21 import java.time.Instant;
22 import java.util.HashMap;
23 import java.util.Map;
24 import java.util.concurrent.atomic.AtomicLong;
25
26 public class ServerStatus {
27
28 enum DIRECTIVES {
29 PING((byte)0),
30 PING_ACTIVE_SERVER_TASKS((byte)1),
31 SHUTDOWN((byte)2);
32
33 private final byte b;
34 DIRECTIVES(byte b) {
35 this.b = b;
36 }
37 byte getByte() { return b;}
38 }
39
40 public enum STATUS {
41 OPERATING(0),
42 HIT_MAX(1),
43 TIMEOUT(2),
44 ERROR(3),
45 PARENT_REQUESTED_SHUTDOWN(4),
46 PARENT_EXCEPTION(5);
47
48 private final int shutdownCode;
49
50 static STATUS lookup(int i) {
51 STATUS[] values = STATUS.values();
52 if (i < 0 || i >= values.length) {
53 throw new ArrayIndexOutOfBoundsException(i +
54 " is not acceptable for an array of length "+values.length);
55 }
56 return STATUS.values()[i];
57 }
58
59 STATUS(int shutdownCode) {
60 this.shutdownCode = shutdownCode;
61 }
62 int getShutdownCode() {
63 return shutdownCode;
64 }
65 byte getByte() { return (byte) shutdownCode;}
66
67 }
68 public enum TASK {
69 PARSE,
70 DETECT,
71 TRANSLATE
72 };
73 private static final Logger LOG = LoggerFactory.getLogger(ServerStatus.class);
74
75 private AtomicLong counter = new AtomicLong(0);
76 private Map<Long, TaskStatus> tasks = new HashMap<>();
77
78 private STATUS status = STATUS.OPERATING;
79
80 public synchronized long start(TASK task, String fileName) {
81 long taskId = counter.incrementAndGet();
82 tasks.put(taskId, new TaskStatus(task, Instant.now(), fileName));
83 return taskId;
84 }
85
86 /**
87 * Removes the task from the collection of currently running tasks.
88 *
89 * @param taskId
90 * @throws IllegalArgumentException if there is no task by that taskId in the collection
91 */
92 public synchronized void complete(long taskId) throws IllegalArgumentException {
93 TaskStatus status = tasks.remove(taskId);
94 if (status == null) {
95 throw new IllegalArgumentException("TaskId is not in map:"+taskId);
96 }
97 }
98
99 public synchronized void setStatus(STATUS status) {
100 this.status = status;
101 }
102
103 public synchronized STATUS getStatus() {
104 return status;
105 }
106
107 public synchronized Map<Long, TaskStatus> getTasks() {
108 Map<Long, TaskStatus> ret = new HashMap<>();
109 ret.putAll(tasks);
110 return ret;
111 }
112
113 public synchronized long getFilesProcessed() {
114 return counter.get();
115 }
116
117 public synchronized boolean isOperating() {
118 return status == STATUS.OPERATING;
119 }
120
121 }
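A sketch of the intended task-accounting lifecycle, mirroring how DetectorResource below uses this class (the file name is made up):

    ServerStatus serverStatus = new ServerStatus();
    long taskId = serverStatus.start(ServerStatus.TASK.PARSE, "example.pdf");
    try {
        // ... run the parse/detect/translate work ...
    } catch (OutOfMemoryError e) {
        serverStatus.setStatus(ServerStatus.STATUS.ERROR); // the status watcher will shut the child down
        throw e;
    } finally {
        serverStatus.complete(taskId); // throws IllegalArgumentException for unknown ids
    }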
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.server;
18
19 import org.slf4j.Logger;
20 import org.slf4j.LoggerFactory;
21
22 import java.io.DataInputStream;
23 import java.io.DataOutputStream;
24 import java.io.InputStream;
25 import java.io.OutputStream;
26 import java.time.Duration;
27 import java.time.Instant;
28
29 public class ServerStatusWatcher implements Runnable {
30
31
32 private static final Logger LOG = LoggerFactory.getLogger(ServerStatusWatcher.class);
33 private final ServerStatus serverStatus;
34 private final DataInputStream fromParent;
35 private final DataOutputStream toParent;
36 private final long maxFiles;
37 private final ServerTimeouts serverTimeouts;
38
39
40 private volatile Instant lastPing = null;
41
42 public ServerStatusWatcher(ServerStatus serverStatus,
43 InputStream inputStream, OutputStream outputStream,
44 long maxFiles,
45 ServerTimeouts serverTimeouts) {
46 this.serverStatus = serverStatus;
47 this.maxFiles = maxFiles;
48 this.serverTimeouts = serverTimeouts;
49
50 this.fromParent = new DataInputStream(inputStream);
51 this.toParent = new DataOutputStream(outputStream);
52 Thread statusWatcher = new Thread(new StatusWatcher());
53 statusWatcher.setDaemon(true);
54 statusWatcher.start();
55 }
56
57 @Override
58 public void run() {
59 //let parent know child is alive
60 try {
61 toParent.writeByte(ServerStatus.STATUS.OPERATING.getByte());
62 toParent.flush();
63 } catch (Exception e) {
64 LOG.warn("Exception writing startup ping to parent", e);
65 serverStatus.setStatus(ServerStatus.STATUS.PARENT_EXCEPTION);
66 shutdown(ServerStatus.STATUS.PARENT_EXCEPTION);
67 }
68
69 byte directive = (byte)-1;
70 while (true) {
71 try {
72 directive = fromParent.readByte();
73 lastPing = Instant.now();
74 } catch (Exception e) {
75 LOG.warn("Exception reading from parent", e);
76 serverStatus.setStatus(ServerStatus.STATUS.PARENT_EXCEPTION);
77 shutdown(ServerStatus.STATUS.PARENT_EXCEPTION);
78 }
79 if (directive == ServerStatus.DIRECTIVES.PING.getByte()) {
80 if (serverStatus.getStatus().equals(ServerStatus.STATUS.OPERATING)) {
81 checkForHitMaxFiles();
82 checkForTaskTimeouts();
83 }
84 try {
85 toParent.writeByte(serverStatus.getStatus().getByte());
86 toParent.flush();
87 } catch (Exception e) {
88 LOG.warn("Exception writing to parent", e);
89 serverStatus.setStatus(ServerStatus.STATUS.PARENT_EXCEPTION);
90 shutdown(ServerStatus.STATUS.PARENT_EXCEPTION);
91 }
92 } else if (directive == ServerStatus.DIRECTIVES.SHUTDOWN.getByte()) {
93 LOG.info("Parent requested shutdown");
94 serverStatus.setStatus(ServerStatus.STATUS.PARENT_REQUESTED_SHUTDOWN);
95 shutdown(ServerStatus.STATUS.PARENT_REQUESTED_SHUTDOWN);
96 } else if (directive == ServerStatus.DIRECTIVES.PING_ACTIVE_SERVER_TASKS.getByte()) { try {
97 toParent.writeInt(serverStatus.getTasks().size());
98 toParent.flush();
99 } catch (Exception e) {
100 LOG.warn("Exception writing to parent", e);
101 serverStatus.setStatus(ServerStatus.STATUS.PARENT_EXCEPTION);
102 shutdown(ServerStatus.STATUS.PARENT_EXCEPTION);
103 }
104 }
105 }
106 }
107
108 private void checkForHitMaxFiles() {
109 if (maxFiles < 0) {
110 return;
111 }
112 long filesProcessed = serverStatus.getFilesProcessed();
113 if (filesProcessed >= maxFiles) {
114 serverStatus.setStatus(ServerStatus.STATUS.HIT_MAX);
115 }
116 }
117
118 private void checkForTaskTimeouts() {
119 Instant now = Instant.now();
120 for (TaskStatus status : serverStatus.getTasks().values()) {
121 long millisElapsed = Duration.between(status.started, now).toMillis();
122 if (millisElapsed > serverTimeouts.getTaskTimeoutMillis()) {
123 serverStatus.setStatus(ServerStatus.STATUS.TIMEOUT);
124 if (status.fileName.isPresent()) {
125 LOG.error("Timeout task {}, millis elapsed {}, file {}",
126 status.task.toString(), Long.toString(millisElapsed), status.fileName.get());
127 } else {
128 LOG.error("Timeout task {}, millis elapsed {}",
129 status.task.toString(), Long.toString(millisElapsed));
130 }
131 }
132 }
133 }
134
135 private void shutdown(ServerStatus.STATUS status) {
136 LOG.info("Shutting down child process with status: " +status.name());
137 System.exit(status.getShutdownCode());
138 }
139
140 //This is an internal thread that pulses every pingPulseMillis
141 //within the child to see if the child should shut itself down.
142 private class StatusWatcher implements Runnable {
143
144 @Override
145 public void run() {
146 while (true) {
147 ServerStatus.STATUS currStatus = serverStatus.getStatus();
148
149 if (currStatus != ServerStatus.STATUS.OPERATING) {
150 LOG.warn("child process observed "+currStatus.name()+ " and is shutting down.");
151 shutdown(currStatus);
152 }
153
154 if (lastPing != null) {
155 long elapsed = Duration.between(lastPing, Instant.now()).toMillis();
156 if (elapsed > serverTimeouts.getPingTimeoutMillis()) {
157 serverStatus.setStatus(ServerStatus.STATUS.PARENT_EXCEPTION);
158 shutdown(ServerStatus.STATUS.PARENT_EXCEPTION);
159 }
160 }
161 try {
162 Thread.sleep(serverTimeouts.getPingPulseMillis());
163 } catch (InterruptedException e) {
164 return;
165 }
166 }
167 }
168 }
169 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.server;
17
18 public class ServerTimeouts {
19
20 /*
21 TODO: integrate these settings:
22 * Number of milliseconds to wait to start child process.
23 public static final long DEFAULT_CHILD_PROCESS_STARTUP_MILLIS = 60000;
24
25 * Maximum number of milliseconds to wait to shutdown child process to allow
26 * for current parses to complete.
27 public static final long DEFAULT_CHILD_PROCESS_SHUTDOWN_MILLIS = 30000;
28
29 private long childProcessStartupMillis = DEFAULT_CHILD_PROCESS_STARTUP_MILLIS;
30
31 private long childProcessShutdownMillis = DEFAULT_CHILD_PROCESS_SHUTDOWN_MILLIS;
32
33 */
34
35
36
37 /**
38 * If the child doesn't receive a ping or the parent doesn't
39 * hear back from a ping in this amount of time, kill and restart the child.
40 */
41 public static final long DEFAULT_PING_TIMEOUT_MILLIS = 30000;
42
43 /**
44 * How often should the parent try to ping the child to check status
45 */
46 public static final long DEFAULT_PING_PULSE_MILLIS = 500;
47
48 /**
49 * Number of milliseconds to wait per server task (parse, detect, unpack, translate,
50 * etc.) before timing out and shutting down the child process.
51 */
52 public static final long DEFAULT_TASK_TIMEOUT_MILLIS = 120000;
53
54 private long taskTimeoutMillis = DEFAULT_TASK_TIMEOUT_MILLIS;
55
56 private long pingTimeoutMillis = DEFAULT_PING_TIMEOUT_MILLIS;
57
58 private long pingPulseMillis = DEFAULT_PING_PULSE_MILLIS;
59
60
61 /**
62 * How long to wait for a task before shutting down the child server process
63 * and restarting it.
64 * @return task timeout in milliseconds
65 */
66 public long getTaskTimeoutMillis() {
67 return taskTimeoutMillis;
68 }
69
70 /**
71 *
72 * @param taskTimeoutMillis number of milliseconds to allow per task
73 * (parse, detection, unzipping, etc.)
74 */
75 public void setTaskTimeoutMillis(long taskTimeoutMillis) {
76 this.taskTimeoutMillis = taskTimeoutMillis;
77 }
78
79 public long getPingTimeoutMillis() {
80 return pingTimeoutMillis;
81 }
82
83 /**
84 *
85 * @param pingTimeoutMillis if the parent doesn't receive a response
86 * in this amount of time, or
87 * if the child doesn't receive a ping
88 * in this amount of time, restart the child process
89 */
90 public void setPingTimeoutMillis(long pingTimeoutMillis) {
91 this.pingTimeoutMillis = pingTimeoutMillis;
92 }
93
94 public long getPingPulseMillis() {
95 return pingPulseMillis;
96 }
97
98 /**
99 *
100 * @param pingPulseMillis how often to test that the parent and/or child is alive
101 */
102 public void setPingPulseMillis(long pingPulseMillis) {
103 this.pingPulseMillis = pingPulseMillis;
104 }
105 }
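configureServerTimeouts in TikaServerCli (below) maps the -taskTimeoutMillis, -pingTimeoutMillis and -pingPulseMillis options onto these setters; a direct sketch with illustrative values:

    ServerTimeouts timeouts = new ServerTimeouts();
    timeouts.setTaskTimeoutMillis(60000);  // allow one minute per parse/detect/translate task
    timeouts.setPingTimeoutMillis(20000);  // restart the child if pings stall for 20s
    timeouts.setPingPulseMillis(200);      // check liveness five times per second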
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.server;
17
18 import java.time.Instant;
19 import java.util.Optional;
20 import java.util.concurrent.ConcurrentHashMap;
21 import java.util.concurrent.atomic.AtomicInteger;
22
23 public class TaskStatus {
24 final ServerStatus.TASK task;
25 final Instant started;
26 final Optional<String> fileName;
27
28 TaskStatus(ServerStatus.TASK task, Instant started, String fileName) {
29 this.task = task;
30 this.started = started;
31 this.fileName = Optional.ofNullable(fileName);
32 }
33
34
35 @Override
36 public String toString() {
37 return "";
38 }
39
40 }
1616
1717 package org.apache.tika.server;
1818
19 import java.io.ByteArrayInputStream;
1920 import java.util.ArrayList;
2021 import java.util.Arrays;
2122 import java.util.HashSet;
6162 import org.slf4j.LoggerFactory;
6263
6364 public class TikaServerCli {
65
66
67 //used in spawn-child mode
68 private static final long DEFAULT_MAX_FILES = 100000;
69
70
6471 public static final int DEFAULT_PORT = 9998;
6572 private static final int DEFAULT_DIGEST_MARK_LIMIT = 20*1024*1024;
6673 public static final String DEFAULT_HOST = "localhost";
7380 "as tika-server. Users could request and receive a sensitive file from your\n" +
7481 "drive or a webpage from your intranet. See CVE-2015-3271.\n"+
7582 "Please make sure you know what you are doing.";
83
84 private static final List<String> ONLY_IN_SPAWN_CHILD_MODE =
85 Arrays.asList(new String[] { "taskTimeoutMillis", "taskPulseMillis",
86 "pingTimeoutMillis", "pingPulseMillis", "maxFiles"});
7687
7788 private static Options getOptions() {
7889 Options options = new Options();
8798 options.addOption("?", "help", false, "this help message");
8899 options.addOption("enableUnsecureFeatures", false, "this is required to enable fileUrl.");
89100 options.addOption("enableFileUrl", false, "allows user to pass in fileUrl instead of InputStream.");
90
101 options.addOption("spawnChild", false, "whether or not to spawn a child process for robustness");
102 options.addOption("taskTimeoutMillis", true, "Only in spawn child mode: how long to wait for a task (e.g. parse) to finish");
103 options.addOption("taskPulseMillis", true, "Only in spawn child mode: how often to check if a task has timed out.");
104 options.addOption("pingTimeoutMillis", true, "Only in spawn child mode: how long to wait to wait for a ping and/or ping response.");
105 options.addOption("pingPulseMillis", true, "Only in spawn child mode: how often to check if a ping has timed out.");
106
107 options.addOption("maxFiles", false, "Only in spawn child mode: shutdown server after this many files -- use only in 'spawnChild' mode");
108 options.addOption("child", false, "this process is a child process -- EXPERT -- " +
109 "should normally only be invoked by parent process");
91110 return options;
92111 }
93112
94113 public static void main(String[] args) {
95114 LOG.info("Starting {} server", new Tika());
96
97115 try {
98 Options options = getOptions();
99
100 CommandLineParser cliParser = new GnuParser();
101 CommandLine line = cliParser.parse(options, args);
102
116 execute(args);
117 } catch (Exception e) {
118 e.printStackTrace();
119 LOG.error("Can't start", e);
120 System.exit(-1);
121 }
122 }
123
124 private static void execute(String[] args) throws Exception {
125 Options options = getOptions();
126
127 CommandLineParser cliParser = new GnuParser();
128
129 //need to strip out -J (child jvm opts) from this parse
130 //they'll be processed correctly in args in the watch dog
131 //and they won't be needed in legacy.
132 CommandLine line = cliParser.parse(options, stripChildArgs(args));
133 if (line.hasOption("spawnChild")) {
134 TikaServerWatchDog watchDog = new TikaServerWatchDog();
135 watchDog.execute(args, configureServerTimeouts(line));
136 } else {
137 if (! line.hasOption("child")) {
138 //make sure the user didn't misunderstand the options
139 for (String childOnly : ONLY_IN_SPAWN_CHILD_MODE) {
140 if (line.hasOption(childOnly)) {
141 System.err.println("The option '" + childOnly +
142 "' can only be used with '-spawnChild'");
143 usage(options);
144 }
145 }
146 }
147 executeLegacy(line, options);
148 }
149 }
150
151 private static String[] stripChildArgs(String[] args) {
152 List<String> ret = new ArrayList<>();
153 for (int i = 0; i < args.length; i++) {
154 if (! args[i].startsWith("-J")) {
155 ret.add(args[i]);
156 }
157 }
158 return ret.toArray(new String[ret.size()]);
159 }
160
161 private static void executeLegacy(CommandLine line, Options options) throws Exception {
103162 if (line.hasOption("help")) {
104 HelpFormatter helpFormatter = new HelpFormatter();
105 helpFormatter.printHelp("tikaserver", options);
106 System.exit(-1);
163 usage(options);
107164 }
108165
109166 String host = DEFAULT_HOST;
195252 inputStreamFactory = new DefaultInputStreamFactory();
196253 }
197254
198 TikaResource.init(tika, digester, inputStreamFactory);
255 ServerStatus serverStatus = new ServerStatus();
256 //if this is a child process
257 if (line.hasOption("child")) {
258 long maxFiles = DEFAULT_MAX_FILES;
259 if (line.hasOption("maxFiles")) {
260 maxFiles = Long.parseLong(line.getOptionValue("maxFiles"));
261 }
262
263 ServerTimeouts serverTimeouts = configureServerTimeouts(line);
264 Thread serverThread =
265 new Thread(new ServerStatusWatcher(serverStatus, System.in,
266 System.out, maxFiles, serverTimeouts));
267 serverThread.start();
268 System.setIn(new ByteArrayInputStream(new byte[0]));
269 System.setOut(System.err);
270 }
271 TikaResource.init(tika, digester, inputStreamFactory, serverStatus);
199272 JAXRSServerFactoryBean sf = new JAXRSServerFactoryBean();
200273
201274 List<ResourceProvider> rCoreProviders = new ArrayList<>();
202275 rCoreProviders.add(new SingletonResourceProvider(new MetadataResource()));
203276 rCoreProviders.add(new SingletonResourceProvider(new RecursiveMetadataResource()));
204 rCoreProviders.add(new SingletonResourceProvider(new DetectorResource()));
277 rCoreProviders.add(new SingletonResourceProvider(new DetectorResource(serverStatus)));
205278 rCoreProviders.add(new SingletonResourceProvider(new LanguageResource()));
206 rCoreProviders.add(new SingletonResourceProvider(new TranslateResource()));
279 rCoreProviders.add(new SingletonResourceProvider(new TranslateResource(serverStatus)));
207280 rCoreProviders.add(new SingletonResourceProvider(new TikaResource()));
208281 rCoreProviders.add(new SingletonResourceProvider(new UnpackerResource()));
209282 rCoreProviders.add(new SingletonResourceProvider(new TikaMimeTypes()));
240313 manager.registerBindingFactory(JAXRSBindingFactory.JAXRS_BINDING_ID, factory);
241314 sf.create();
242315 LOG.info("Started Apache Tika server at {}", url);
243 } catch (Exception ex) {
244 LOG.error("Can't start", ex);
245 System.exit(-1);
246 }
247 }
316 }
317
318 private static void usage(Options options) {
319 HelpFormatter helpFormatter = new HelpFormatter();
320 helpFormatter.printHelp("tikaserver", options);
321 System.exit(-1);
322 }
323
324 private static ServerTimeouts configureServerTimeouts(CommandLine line) {
325 ServerTimeouts serverTimeouts = new ServerTimeouts();
326 /*TODO -- add these in
327 if (line.hasOption("childProcessStartupMillis")) {
328 serverTimeouts.setChildProcessStartupMillis(
329 Long.parseLong(line.getOptionValue("childProcessStartupMillis")));
330 }
331 if (line.hasOption("childProcessShutdownMillis")) {
332 serverTimeouts.setChildProcessShutdownMillis(
333 Long.parseLong(line.getOptionValue("childProcesShutdownMillis")));
334 }*/
335 if (line.hasOption("taskTimeoutMillis")) {
336 serverTimeouts.setTaskTimeoutMillis(
337 Long.parseLong(line.getOptionValue("taskTimeoutMillis")));
338 }
339 if (line.hasOption("pingTimeoutMillis")) {
340 serverTimeouts.setPingTimeoutMillis(
341 Long.parseLong(line.getOptionValue("pingTimeoutMillis")));
342 }
343 if (line.hasOption("pingPulseMillis")) {
344 serverTimeouts.setPingPulseMillis(
345 Long.parseLong(line.getOptionValue("pingPulseMillis")));
346 }
347
348 return serverTimeouts;
349 }
350
248351 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.tika.server;
18
19 import org.slf4j.Logger;
20 import org.slf4j.LoggerFactory;
21
22 import java.io.DataInputStream;
23 import java.io.DataOutputStream;
24 import java.io.IOException;
25 import java.time.Duration;
26 import java.time.Instant;
27 import java.util.ArrayList;
28 import java.util.List;
29 import java.util.concurrent.TimeUnit;
30
31 public class TikaServerWatchDog {
32
33 private enum CHILD_STATUS {
34 INITIALIZING,
35 RUNNING,
36 SHUTTING_DOWN
37 }
38
39 private static final Logger LOG = LoggerFactory.getLogger(TikaServerWatchDog.class);
40
41 private Object[] childStatusLock = new Object[0];
42 private volatile CHILD_STATUS childStatus = CHILD_STATUS.INITIALIZING;
43 private volatile Instant lastPing = null;
44 private ChildProcess childProcess = null;
45 int restarts = 0;
46
47 public void execute(String[] args, ServerTimeouts serverTimeouts) throws Exception {
48 //If the child process is stuck in a stop-the-world pause and isn't
49 //responding to pings, this thread checks that the parent's pings
50 //are being sent and answered often enough.
51 //If they aren't, it forcibly destroys the child process.
52 Thread pingTimer = new Thread(new Runnable() {
53 @Override
54 public void run() {
55 while (true) {
56 long tmpLastPing = -1L;
57 synchronized (childStatusLock) {
58 if (childStatus == CHILD_STATUS.RUNNING) {
59 tmpLastPing = lastPing.toEpochMilli();
60 }
61 }
62 if (tmpLastPing > 0) {
63 long elapsed = Duration.between(Instant.ofEpochMilli(tmpLastPing), Instant.now()).toMillis();
64 if (elapsed > serverTimeouts.getPingTimeoutMillis()) {
65 Process processToDestroy = null;
66 try {
67 processToDestroy = childProcess.process;
68 } catch (NullPointerException e) {
69 //ignore
70 }
71 destroyChildForcibly(processToDestroy);
72 }
73 }
74 try {
75 Thread.sleep(serverTimeouts.getPingPulseMillis());
76 } catch (InterruptedException e) {
77 //swallow
78 }
79 }
80 }
81 }
82 );
83 pingTimer.setDaemon(true);
84 pingTimer.start();
85 try {
86 childProcess = new ChildProcess(args);
87 setChildStatus(CHILD_STATUS.RUNNING);
88 while (true) {
89
90 if (!childProcess.ping()) {
91 setChildStatus(CHILD_STATUS.INITIALIZING);
92 lastPing = null;
93 childProcess.close();
94 LOG.info("About to restart the child process");
95 childProcess = new ChildProcess(args);
96 LOG.info("Successfully restarted child process -- {} restarts so far)", ++restarts);
97 setChildStatus(CHILD_STATUS.RUNNING);
98 }
99 Thread.sleep(serverTimeouts.getPingPulseMillis());
100 }
101 } catch (InterruptedException e) {
102 //interrupted...shutting down
103 } finally {
104 setChildStatus(CHILD_STATUS.SHUTTING_DOWN);
105 if (childProcess != null) {
106 childProcess.close();
107 }
108 }
109 }
110
111 private void setChildStatus(CHILD_STATUS status) {
112 synchronized (childStatusLock) {
113 childStatus = status;
114 }
115 }
116
117 private static List<String> extractArgs(String[] args) {
118 List<String> argList = new ArrayList<>();
119 for (int i = 0; i < args.length; i++) {
120 if (args[i].startsWith("-J") || args[i].equals("-spawnChild") || args[i].equals("--spawnChild")) {
121 continue;
122 }
123 argList.add(args[i]);
124 }
125 return argList;
126 }
127
128 private static List<String> extractJVMArgs(String[] args) {
129 List<String> jvmArgs = new ArrayList<>();
130 for (int i = 0; i < args.length; i++) {
131 if (args[i].startsWith("-J")) {
132 jvmArgs.add("-"+args[i].substring(2));
133 }
134 }
135 return jvmArgs;
136 }
137
138 private class ChildProcess {
139 private Thread SHUTDOWN_HOOK = null;
140
141 Process process;
142 DataInputStream fromChild;
143 DataOutputStream toChild;
144
145
146
147 private ChildProcess(String[] args) throws Exception {
148 this.process = startProcess(args);
149
150 this.fromChild = new DataInputStream(process.getInputStream());
151 this.toChild = new DataOutputStream(process.getOutputStream());
152 byte status = fromChild.readByte();
153 if (status != ServerStatus.STATUS.OPERATING.getByte()) {
154 throw new IOException("bad status from child process: "+
155 ServerStatus.STATUS.lookup(status));
156 }
157 lastPing = Instant.now();
158 }
159
160 public boolean ping() {
161 lastPing = Instant.now();
162 try {
163 toChild.writeByte(ServerStatus.DIRECTIVES.PING.getByte());
164 toChild.flush();
165 } catch (Exception e) {
166 LOG.warn("Exception pinging child process", e);
167 return false;
168 }
169 try {
170 byte status = fromChild.readByte();
171 if (status != ServerStatus.STATUS.OPERATING.getByte()) {
172 LOG.warn("Received status from child: {}",
173 ServerStatus.STATUS.lookup(status));
174 return false;
175 }
176 } catch (Exception e) {
177 LOG.warn("Exception receiving status from child", e);
178 return false;
179 }
180 return true;
181 }
182
183 private void close() {
184 try {
185 toChild.writeByte(ServerStatus.DIRECTIVES.SHUTDOWN.getByte());
186 toChild.flush();
187 } catch (Exception e) {
188 LOG.warn("Exception asking child to shutdown", e);
189 }
190 //TODO: add a gracefully timed shutdown routine
191 try {
192 fromChild.close();
193 } catch (Exception e) {
194 LOG.warn("Problem shutting down reader from child", e);
195 }
196
197 try {
198 toChild.close();
199 } catch (Exception e) {
200 LOG.warn("Problem shutting down writer to child", e);
201 }
202 destroyChildForcibly(process);
203 }
204
205 private Process startProcess(String[] args) throws IOException {
206 ProcessBuilder builder = new ProcessBuilder();
207 builder.redirectError(ProcessBuilder.Redirect.INHERIT);
208 List<String> argList = new ArrayList<>();
209 List<String> jvmArgs = extractJVMArgs(args);
210 List<String> childArgs = extractArgs(args);
211 argList.add("java");
212 if (! jvmArgs.contains("-cp") && ! jvmArgs.contains("--classpath")) {
213 String cp = System.getProperty("java.class.path");
214 jvmArgs.add("-cp");
215 jvmArgs.add(cp);
216 }
217 argList.addAll(jvmArgs);
218 argList.add("org.apache.tika.server.TikaServerCli");
219 argList.addAll(childArgs);
220 argList.add("-child");
221
222 builder.command(argList);
223 Process process = builder.start();
224 if (SHUTDOWN_HOOK != null) {
225 Runtime.getRuntime().removeShutdownHook(SHUTDOWN_HOOK);
226 }
227 SHUTDOWN_HOOK = new Thread(() -> process.destroyForcibly());
228 Runtime.getRuntime().addShutdownHook(SHUTDOWN_HOOK);
229
230 return process;
231 }
232 }
233
234 private static synchronized void destroyChildForcibly(Process process) {
235 process = process.destroyForcibly();
236 try {
237 boolean destroyed = process.waitFor(60, TimeUnit.SECONDS);
238 if (! destroyed) {
239 LOG.error("Child process still alive after 60 seconds. " +
240 "Shutting down the parent.");
241 System.exit(1);
242 }
243
244 } catch (InterruptedException e) {
245 //swallow
246 }
247 }
248
249 }
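To make the -J convention concrete: a parent started with -spawnChild -JXmx1g relaunches itself roughly as java -Xmx1g -cp <parent classpath> org.apache.tika.server.TikaServerCli ... -child. A sketch of the argument split performed by the two helpers above (flags illustrative):

    String[] args = {"-spawnChild", "-JXmx1g", "-enableUnsecureFeatures"};
    List<String> jvmArgs = extractJVMArgs(args);  // ["-Xmx1g"] -- passed to the child JVM
    List<String> childArgs = extractArgs(args);   // ["-enableUnsecureFeatures"] -- passed on to TikaServerCli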
2828
2929 import org.apache.tika.io.TikaInputStream;
3030 import org.apache.tika.metadata.Metadata;
31 import org.apache.tika.metadata.TikaCoreProperties;
3132 import org.apache.tika.mime.MediaType;
33 import org.apache.tika.server.ServerStatus;
3234 import org.slf4j.Logger;
3335 import org.slf4j.LoggerFactory;
3436
3537 @Path("/detect")
3638 public class DetectorResource {
3739 private static final Logger LOG = LoggerFactory.getLogger(DetectorResource.class);
40 private final ServerStatus serverStatus;
3841
42 public DetectorResource(ServerStatus serverStatus) {
43 this.serverStatus = serverStatus;
44 }
3945 @PUT
4046 @Path("stream")
4147 @Consumes("*/*")
4854 .getRequestHeaders());
4955 LOG.info("Detecting media type for Filename: {}", filename);
5056 met.add(Metadata.RESOURCE_NAME_KEY, filename);
57 TikaResource.checkIsOperating();
58 long taskId = serverStatus.start(ServerStatus.TASK.DETECT, filename);
5159 try {
5260 return TikaResource.getConfig().getDetector().detect(tis, met).toString();
5361 } catch (IOException e) {
5462 LOG.warn("Unable to detect MIME type for file. Reason: {}", e.getMessage(), e);
5563 return MediaType.OCTET_STREAM.toString();
64 } catch (OutOfMemoryError e) {
65 serverStatus.setStatus(ServerStatus.STATUS.ERROR);
66 throw e;
67 } finally {
68 serverStatus.complete(taskId);
5669 }
5770 }
5871 }
3636 import org.apache.tika.parser.Parser;
3737 import org.apache.tika.parser.RecursiveParserWrapper;
3838 import org.apache.tika.sax.BasicContentHandlerFactory;
39 import org.apache.tika.sax.RecursiveParserWrapperHandler;
3940 import org.apache.tika.server.MetadataList;
4041 import org.slf4j.Logger;
4142 import org.slf4j.LoggerFactory;
127128 final ParseContext context = new ParseContext();
128129 Parser parser = TikaResource.createParser();
129130 // TODO: parameterize choice of max chars/max embedded attachments
130 BasicContentHandlerFactory.HANDLER_TYPE type =
131 BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE);
132 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser,
133 new BasicContentHandlerFactory(type, -1));
131 RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser);
132
133
134134 TikaResource.fillMetadata(parser, metadata, context, httpHeaders);
135135 // no need to add parser to parse recursively
136136 TikaResource.fillParseContext(context, httpHeaders, null);
137137 TikaResource.logRequest(LOG, info, metadata);
138 TikaResource.parse(wrapper, LOG, info.getPath(), is,
139 new LanguageHandler() {
138
139 BasicContentHandlerFactory.HANDLER_TYPE type =
140 BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE);
141 RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
142 new BasicContentHandlerFactory(type, -1), -1);
143 TikaResource.parse(wrapper, LOG, info.getPath(), is, handler, metadata, context);
144 /*
145 We used to have this non-functional bit of code...refactor to add it back and make it work?
146 new LanguageHandler() {
140147 public void endDocument() {
141148 metadata.set("language", getLanguage().getLanguage());
142149 }
143 }, metadata, context);
144 return new MetadataList(wrapper.getMetadata());
150 },
151 */
152 return new MetadataList(handler.getMetadataList());
145153 }
154
146155 }
2323 import java.util.List;
2424 import java.util.Map;
2525
26 import com.google.gson.Gson;
27 import com.google.gson.GsonBuilder;
2628 import org.apache.tika.detect.CompositeDetector;
2729 import org.apache.tika.detect.Detector;
2830 import org.apache.tika.server.HTMLHelper;
29 import org.eclipse.jetty.util.ajax.JSON;
3031
3132 /**
3233 * <p>Provides details of all the {@link Detector}s registered with
3435 */
3536 @Path("/detectors")
3637 public class TikaDetectors {
38 private static final Gson GSON = new GsonBuilder().disableHtmlEscaping().create();
39
40
3741 private HTMLHelper html;
3842
3943 public TikaDetectors() {
7579 public String getDetectorsJSON() {
7680 Map<String, Object> details = new HashMap<String, Object>();
7781 detectorAsMap(TikaResource.getConfig().getDetector(), details);
78 return JSON.toString(details);
82 return GSON.toJson(details);
7983 }
8084
8185 private void detectorAsMap(Detector d, Map<String, Object> details) {
2525 import java.util.SortedMap;
2626 import java.util.TreeMap;
2727
28 import com.google.gson.Gson;
29 import com.google.gson.GsonBuilder;
2830 import org.apache.tika.mime.MediaType;
2931 import org.apache.tika.mime.MediaTypeRegistry;
3032 import org.apache.tika.parser.CompositeParser;
3133 import org.apache.tika.parser.Parser;
3234 import org.apache.tika.server.HTMLHelper;
33 import org.eclipse.jetty.util.ajax.JSON;
3435
3536 /**
3637 * <p>Provides details of all the mimetypes known to Apache Tika,
3839 */
3940 @Path("/mime-types")
4041 public class TikaMimeTypes {
42
43 private static final Gson GSON = new GsonBuilder().disableHtmlEscaping().create();
44
4145 private HTMLHelper html;
4246
4347 public TikaMimeTypes() {
9599 for (MediaTypeDetails type : getMediaTypes()) {
96100 Map<String, Object> typeDets = new HashMap<String, Object>();
97101
98 typeDets.put("alias", type.aliases);
102 typeDets.put("alias", copyToStringArray(type.aliases));
99103 if (type.supertype != null) {
100 typeDets.put("supertype", type.supertype);
104 typeDets.put("supertype", type.supertype.toString());
101105 }
102106 if (type.parser != null) {
103107 typeDets.put("parser", type.parser);
106110 details.put(type.type.toString(), typeDets);
107111 }
108112
109 return JSON.toString(details);
113 return GSON.toJson(details);
114 }
115
116 private static String[] copyToStringArray(MediaType[] aliases) {
117 String[] strings = new String[aliases.length];
118 for (int i = 0; i < aliases.length; i++) {
119 strings[i] = aliases[i].toString();
120 }
121 return strings;
110122 }
111123
112124 @GET
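
copyToStringArray above (and the supertype.toString() call) is needed because Gson serializes objects by reflecting over their fields rather than calling toString(): handing it MediaType instances directly would expose the type's internals instead of the familiar "type/subtype" string. A small illustration of the difference (the exact shape of the first line depends on MediaType's private fields):

    import com.google.gson.Gson;

    import org.apache.tika.mime.MediaType;

    public class MediaTypeJsonSketch {
        public static void main(String[] args) {
            Gson gson = new Gson();
            MediaType bmp = MediaType.parse("image/x-bmp");
            //reflection-based serialization exposes MediaType's internal fields
            System.out.println(gson.toJson(bmp));
            //converting first yields the canonical form the endpoint has always returned
            System.out.println(gson.toJson(bmp.toString())); //"image/x-bmp"
        }
    }
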
2727 import java.util.Map;
2828 import java.util.Set;
2929
30 import com.google.gson.Gson;
31 import com.google.gson.GsonBuilder;
3032 import org.apache.tika.mime.MediaType;
3133 import org.apache.tika.parser.CompositeParser;
3234 import org.apache.tika.parser.ParseContext;
3335 import org.apache.tika.parser.Parser;
3436 import org.apache.tika.parser.ParserDecorator;
3537 import org.apache.tika.server.HTMLHelper;
36 import org.eclipse.jetty.util.ajax.JSON;
3738
3839 /**
3940 * <p>Provides details of all the {@link Parser}s registered with
4344 @Path("/parsers")
4445 public class TikaParsers {
4546 private static final ParseContext EMPTY_PC = new ParseContext();
47 private static final Gson GSON = new GsonBuilder().disableHtmlEscaping().create();
4648 private HTMLHelper html;
4749
4850 public TikaParsers() {
126128 protected String getParsersJSON(boolean withMimeTypes) {
127129 Map<String, Object> details = new HashMap<String, Object>();
128130 parserAsMap(new ParserDetails(TikaResource.getConfig().getParser()), withMimeTypes, details);
129 return JSON.toString(details);
131
132 return GSON.toJson(details);
130133 }
131134
132135 private void parserAsMap(ParserDetails p, boolean withMimeTypes, Map<String, Object> details) {
1616
1717 package org.apache.tika.server.resource;
1818
19 import static java.nio.charset.StandardCharsets.UTF_8;
20
21 import javax.mail.internet.ContentDisposition;
22 import javax.mail.internet.ParseException;
19 import org.apache.commons.lang.StringUtils;
20 import org.apache.cxf.attachment.ContentDisposition;
21 import org.apache.cxf.jaxrs.ext.multipart.Attachment;
22 import org.apache.poi.ooxml.extractor.ExtractorFactory;
23 import org.apache.tika.Tika;
24 import org.apache.tika.config.TikaConfig;
25 import org.apache.tika.detect.Detector;
26 import org.apache.tika.exception.EncryptedDocumentException;
27 import org.apache.tika.metadata.Metadata;
28 import org.apache.tika.metadata.TikaCoreProperties;
29 import org.apache.tika.metadata.TikaMetadataKeys;
30 import org.apache.tika.mime.MediaType;
31 import org.apache.tika.parser.AutoDetectParser;
32 import org.apache.tika.parser.DigestingParser;
33 import org.apache.tika.parser.ParseContext;
34 import org.apache.tika.parser.Parser;
35 import org.apache.tika.parser.ParserDecorator;
36 import org.apache.tika.parser.PasswordProvider;
37 import org.apache.tika.parser.html.BoilerpipeContentHandler;
38 import org.apache.tika.parser.html.HtmlParser;
39 import org.apache.tika.parser.ocr.TesseractOCRConfig;
40 import org.apache.tika.parser.pdf.PDFParserConfig;
41 import org.apache.tika.sax.BodyContentHandler;
42 import org.apache.tika.sax.ExpandedTitleContentHandler;
43 import org.apache.tika.sax.RichTextContentHandler;
44 import org.apache.tika.server.InputStreamFactory;
45 import org.apache.tika.server.ServerStatus;
46 import org.apache.tika.server.TikaServerParseException;
47 import org.slf4j.Logger;
48 import org.slf4j.LoggerFactory;
49 import org.xml.sax.ContentHandler;
50 import org.xml.sax.SAXException;
51
2352 import javax.ws.rs.Consumes;
2453 import javax.ws.rs.GET;
2554 import javax.ws.rs.POST;
5180 import java.util.regex.Matcher;
5281 import java.util.regex.Pattern;
5382
54 import org.apache.commons.lang.StringUtils;
55 import org.apache.cxf.jaxrs.ext.multipart.Attachment;
56 import org.apache.poi.extractor.ExtractorFactory;
57 import org.apache.tika.Tika;
58 import org.apache.tika.config.TikaConfig;
59 import org.apache.tika.detect.Detector;
60 import org.apache.tika.exception.EncryptedDocumentException;
61 import org.apache.tika.metadata.Metadata;
62 import org.apache.tika.metadata.TikaMetadataKeys;
63 import org.apache.tika.mime.MediaType;
64 import org.apache.tika.parser.AutoDetectParser;
65 import org.apache.tika.parser.DigestingParser;
66 import org.apache.tika.parser.ParseContext;
67 import org.apache.tika.parser.Parser;
68 import org.apache.tika.parser.ParserDecorator;
69 import org.apache.tika.parser.PasswordProvider;
70 import org.apache.tika.parser.html.BoilerpipeContentHandler;
71 import org.apache.tika.parser.html.HtmlParser;
72 import org.apache.tika.parser.ocr.TesseractOCRConfig;
73 import org.apache.tika.parser.pdf.PDFParserConfig;
74 import org.apache.tika.sax.BodyContentHandler;
75 import org.apache.tika.sax.ExpandedTitleContentHandler;
76 import org.apache.tika.sax.RichTextContentHandler;
77 import org.apache.tika.server.InputStreamFactory;
78 import org.apache.tika.server.TikaServerParseException;
79 import org.slf4j.Logger;
80 import org.slf4j.LoggerFactory;
81 import org.xml.sax.ContentHandler;
82 import org.xml.sax.SAXException;
83
84 import static java.nio.charset.StandardCharsets.UTF_8;
8385
8486 @Path("/tika")
8587 public class TikaResource {
9597 private static TikaConfig tikaConfig;
9698 private static DigestingParser.Digester digester = null;
9799 private static InputStreamFactory inputStreamFactory = null;
98
100 private static ServerStatus SERVER_STATUS = null;
99101 public static void init(TikaConfig config, DigestingParser.Digester digestr,
100 InputStreamFactory iSF) {
102 InputStreamFactory iSF, ServerStatus serverStatus) {
101103 tikaConfig = config;
102104 digester = digestr;
103105 inputStreamFactory = iSF;
106 SERVER_STATUS = serverStatus;
104107 }
105108
106109 static {
138141
139142 String disposition = httpHeaders.getFirst("Content-Disposition");
140143 if (disposition != null) {
141 try {
142 ContentDisposition c = new ContentDisposition(disposition);
143
144 // only support "attachment" dispositions
145 if ("attachment".equals(c.getDisposition())) {
146 String fn = c.getParameter("filename");
147 if (fn != null) {
148 return fn;
149 }
144 ContentDisposition c = new ContentDisposition(disposition);
145
146 // only support "attachment" dispositions
147 if ("attachment".equals(c.getType())) {
148 String fn = c.getParameter("filename");
149 if (fn != null) {
150 return fn;
150151 }
151 } catch (ParseException e) {
152 // not a valid content-disposition field
153 LOG.warn("Parse exception {} determining content disposition", e.getMessage(), e);
154152 }
155153 }
156154
160158
161159 public static void fillParseContext(ParseContext parseContext, MultivaluedMap<String, String> httpHeaders,
162160 Parser embeddedParser) {
163 TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
164 PDFParserConfig pdfParserConfig = new PDFParserConfig();
161 //lazily initialize configs
162 //if a header is submitted, any params set via --tika-config tika-config.xml
163 //at server startup will be ignored.
164 TesseractOCRConfig ocrConfig = null;
165 PDFParserConfig pdfParserConfig = null;
165166 for (String key : httpHeaders.keySet()) {
166167 if (StringUtils.startsWith(key, X_TIKA_OCR_HEADER_PREFIX)) {
168 ocrConfig = (ocrConfig == null) ? new TesseractOCRConfig() : ocrConfig;
167169 processHeaderConfig(httpHeaders, ocrConfig, key, X_TIKA_OCR_HEADER_PREFIX);
168170 } else if (StringUtils.startsWith(key, X_TIKA_PDF_HEADER_PREFIX)) {
171 pdfParserConfig = (pdfParserConfig == null) ? new PDFParserConfig() : pdfParserConfig;
169172 processHeaderConfig(httpHeaders, pdfParserConfig, key, X_TIKA_PDF_HEADER_PREFIX);
170173 }
171174 }
172 parseContext.set(TesseractOCRConfig.class, ocrConfig);
173 parseContext.set(PDFParserConfig.class, pdfParserConfig);
175 if (ocrConfig != null) {
176 parseContext.set(TesseractOCRConfig.class, ocrConfig);
177 }
178 if (pdfParserConfig != null) {
179 parseContext.set(PDFParserConfig.class, pdfParserConfig);
180 }
174181 if (embeddedParser != null) {
175182 parseContext.set(Parser.class, embeddedParser);
176183 }
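
With the lazy initialization above, a TesseractOCRConfig or PDFParserConfig lands in the ParseContext only when a matching X-Tika-OCR.../X-Tika-PDF... header actually arrives, so parser defaults configured through --tika-config tika-config.xml are no longer clobbered on header-less requests. A condensed sketch of the idea; the real code locates the setter by reflection from the header suffix, which is hardcoded here for brevity:

    import javax.ws.rs.core.MultivaluedHashMap;
    import javax.ws.rs.core.MultivaluedMap;

    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.parser.pdf.PDFParserConfig;

    public class LazyConfigSketch {
        public static void main(String[] args) {
            MultivaluedMap<String, String> httpHeaders = new MultivaluedHashMap<>();
            httpHeaders.putSingle("X-Tika-PDFsortByPosition", "false");

            PDFParserConfig pdfParserConfig = null;
            for (String key : httpHeaders.keySet()) {
                if (key.startsWith("X-Tika-PDF")) {
                    //create the override config only when a header asks for it
                    pdfParserConfig = (pdfParserConfig == null) ? new PDFParserConfig() : pdfParserConfig;
                    pdfParserConfig.setSortByPosition(Boolean.parseBoolean(httpHeaders.getFirst(key)));
                }
            }
            ParseContext context = new ParseContext();
            if (pdfParserConfig != null) {
                context.set(PDFParserConfig.class, pdfParserConfig);
            }
            //with no matching headers, the context stays empty and the
            //tika-config.xml defaults apply
        }
    }
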
385392 */
386393 public static void parse(Parser parser, Logger logger, String path, InputStream inputStream,
387394 ContentHandler handler, Metadata metadata, ParseContext parseContext) throws IOException {
395
396 checkIsOperating();
397
398 long taskId = SERVER_STATUS.start(ServerStatus.TASK.PARSE,
399 metadata.get(Metadata.RESOURCE_NAME_KEY));
388400 try {
389401 parser.parse(inputStream, handler, metadata, parseContext);
390402 } catch (SAXException e) {
395407 } catch (Exception e) {
396408 logger.warn("{}: Text extraction failed", path, e);
397409 throw new TikaServerParseException(e);
410 } catch (OutOfMemoryError e) {
411 SERVER_STATUS.setStatus(ServerStatus.STATUS.ERROR);
412 throw e;
398413 } finally {
414 SERVER_STATUS.complete(taskId);
399415 inputStream.close();
416 }
417 }
418
419 public static void checkIsOperating() {
420 //check that server is not in shutdown mode
421 if (! SERVER_STATUS.isOperating()) {
422 throw new WebApplicationException(Response.Status.SERVICE_UNAVAILABLE);
400423 }
401424 }
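
The bracket above is the heart of the TIKA-2725 robustness work: each parse registers a task with the shared ServerStatus so the watchdog can spot hung or runaway tasks, an OutOfMemoryError flips the status to ERROR so the parent can restart the child process, and checkIsOperating() rejects new work with 503 once the server is no longer healthy. A condensed sketch of the pattern, using the ServerStatus calls as they appear in this diff around a stand-in workload:

    import javax.ws.rs.WebApplicationException;
    import javax.ws.rs.core.Response;

    import org.apache.tika.server.ServerStatus;

    public class TaskBracketSketch {
        private final ServerStatus serverStatus = new ServerStatus();

        public String doWork(String fileName) {
            //refuse new work if the server is shutting down or has errored
            if (!serverStatus.isOperating()) {
                throw new WebApplicationException(Response.Status.SERVICE_UNAVAILABLE);
            }
            long taskId = serverStatus.start(ServerStatus.TASK.PARSE, fileName);
            try {
                return expensiveParse(fileName);
            } catch (OutOfMemoryError e) {
                serverStatus.setStatus(ServerStatus.STATUS.ERROR); //watchdog restarts the child
                throw e;
            } finally {
                serverStatus.complete(taskId); //always deregister, even on failure
            }
        }

        private String expensiveParse(String fileName) {
            return ""; //stand-in for the real parse
        }
    }
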
402425
411434 @GET
412435 @Produces("text/plain")
413436 public String getMessage() {
437 checkIsOperating();
414438 return GREETING;
415439 }
416440
3232 @GET
3333 @Produces("text/plain")
3434 public String getVersion() {
35 TikaResource.checkIsOperating();
3536 return tika.toString();
3637 }
3738 }
2323 import javax.ws.rs.PUT;
2424 import javax.ws.rs.Path;
2525 import javax.ws.rs.Produces;
26 import javax.ws.rs.WebApplicationException;
27 import javax.ws.rs.core.Response;
2628 import java.lang.annotation.Annotation;
2729 import java.lang.reflect.Method;
2830 import java.util.ArrayList;
134136 @GET
135137 @Produces("text/html")
136138 public String getWelcomeHTML() {
139 TikaResource.checkIsOperating();
140
137141 StringBuffer h = new StringBuffer();
138142 String tikaVersion = tika.toString();
139143
189193 @GET
190194 @Produces("text/plain")
191195 public String getWelcomePlain() {
196 TikaResource.checkIsOperating();
192197 StringBuffer text = new StringBuffer();
193198
194199 text.append(tika.toString());
2828 import javax.ws.rs.Path;
2929 import javax.ws.rs.PathParam;
3030 import javax.ws.rs.Produces;
31 import javax.ws.rs.WebApplicationException;
32 import javax.ws.rs.core.Response;
3133
3234 import org.apache.commons.io.IOUtils;
3335 import org.apache.tika.config.LoadErrorHandler;
3638 import org.apache.tika.langdetect.OptimaizeLangDetector;
3739 import org.apache.tika.language.detect.LanguageResult;
3840 import org.apache.tika.language.translate.Translator;
41 import org.apache.tika.server.ServerStatus;
3942 import org.slf4j.Logger;
4043 import org.slf4j.LoggerFactory;
4144
4750
4851 private static final Logger LOG = LoggerFactory.getLogger(TranslateResource.class);
4952
50 public TranslateResource() {
53 private final ServerStatus serverStatus;
54 public TranslateResource(ServerStatus serverStatus) {
5155 this.loader = new ServiceLoader(ServiceLoader.class.getClassLoader(),
5256 LoadErrorHandler.WARN);
5357 this.defaultTranslator = TikaResource.getConfig().getTranslator();
58 this.serverStatus = serverStatus;
5459 }
5560
5661 @PUT
9398 translate = this.defaultTranslator;
9499 LOG.info("Using default translator");
95100 }
96
97 return translate.translate(content, sLang, dLang);
101 TikaResource.checkIsOperating();
102 long taskId = serverStatus.start(ServerStatus.TASK.TRANSLATE, null);
103 try {
104 return translate.translate(content, sLang, dLang);
105 } catch (OutOfMemoryError e) {
106 serverStatus.setStatus(ServerStatus.STATUS.ERROR);
107 throw e;
108 } finally {
109 serverStatus.complete(taskId);
110 }
98111 }
99112
100113 private Translator byClassName(String className) {
3636 import java.util.Arrays;
3737 import java.util.HashMap;
3838 import java.util.Map;
39 import java.util.UUID;
3940
4041 import au.com.bytecode.opencsv.CSVWriter;
42 import org.apache.commons.io.FilenameUtils;
4143 import org.apache.commons.lang.mutable.MutableInt;
4244 import org.apache.poi.poifs.filesystem.DirectoryEntry;
4345 import org.apache.poi.poifs.filesystem.DocumentEntry;
219221 }
220222 }
221223
222 final String finalName = name;
224 final String finalName = getFinalName(name, zout);
223225
224226 if (data.length > 0) {
225227 zout.put(finalName, data);
240242 }
241243 }
242244 }
245 }
246
247 private String getFinalName(String name, Map<String, byte[]> zout) {
248 name = name.replaceAll("\u0000", " ");
249 String normalizedName = FilenameUtils.normalize(name);
250
251 if (normalizedName == null) {
252 normalizedName = FilenameUtils.getName(name);
253 }
254
255 if (normalizedName == null) {
256 normalizedName = count.toString();
257 }
258 //strip off initial C:/ or ~/ or /
259 int prefixLength = FilenameUtils.getPrefixLength(normalizedName);
260 if (prefixLength > -1) {
261 normalizedName = normalizedName.substring(prefixLength);
262 }
263 if (zout.containsKey(normalizedName)) {
264 return UUID.randomUUID().toString()+"-"+normalizedName;
265 }
266 return normalizedName;
243267 }
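
getFinalName above hardens the entry names that /unpack writes into the returned zip: NUL bytes are replaced, FilenameUtils.normalize collapses '.'/'..' segments and returns null for names that would escape the root (in which case only the base name is kept), any drive or root prefix is stripped, and collisions are disambiguated with a UUID. A small illustration of how those FilenameUtils calls behave, on illustrative paths:

    import org.apache.commons.io.FilenameUtils;

    public class FinalNameSketch {
        public static void main(String[] args) {
            //a name that climbs out of the archive root cannot be normalized
            System.out.println(FilenameUtils.normalize("../../etc/passwd", true)); //null
            //so the code falls back to the base name
            System.out.println(FilenameUtils.getName("../../etc/passwd"));         //passwd

            //'..' segments inside the tree are collapsed
            String normalized = FilenameUtils.normalize("/docs/tmp/../report.doc", true); // /docs/report.doc
            //the leading root is then stripped so the entry stays relative
            int prefixLength = FilenameUtils.getPrefixLength(normalized); //1 for "/"
            System.out.println(normalized.substring(prefixLength));       //docs/report.doc
        }
    }
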
244268
245269 protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir)
7979 }
8080
8181 @Before
82 public void setUp() {
83 this.tika = TikaConfig.getDefaultConfig();
82 public void setUp() throws Exception {
83 this.tika = new TikaConfig(getClass().getResourceAsStream("tika-config-for-server-tests.xml"));
8484 TikaResource.init(tika,
8585 new CommonsDigester(DIGESTER_READ_LIMIT, "md5,sha1:32"),
86 new DefaultInputStreamFactory());
86 new DefaultInputStreamFactory(), new ServerStatus());
8787 JAXRSServerFactoryBean sf = new JAXRSServerFactoryBean();
8888 setUpResources(sf);
8989 setUpProviders(sf);
4444 protected void setUpResources(JAXRSServerFactoryBean sf) {
4545 sf.setResourceClasses(DetectorResource.class);
4646 sf.setResourceProvider(DetectorResource.class,
47 new SingletonResourceProvider(new DetectorResource()));
47 new SingletonResourceProvider(new DetectorResource(new ServerStatus())));
4848
4949 }
5050
3737 import org.apache.tika.metadata.Metadata;
3838 import org.apache.tika.metadata.serialization.JsonMetadataList;
3939 import org.apache.tika.parser.RecursiveParserWrapper;
40 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
4041 import org.apache.tika.server.resource.RecursiveMetadataResource;
4142 import org.apache.tika.server.writer.MetadataListMessageBodyWriter;
4243 import org.junit.Test;
127128 Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
128129 List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
129130 assertEquals(12, metadataList.size());
130 String content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
131 String content = metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).trim();
131132 assertTrue(content.startsWith("<html xmlns=\"http://www.w3.org/1999/xhtml\">"));
132133
133134 //extra slash
139140 reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
140141 metadataList = JsonMetadataList.fromJson(reader);
141142 assertEquals(12, metadataList.size());
142 content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
143 content = metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).trim();
143144 assertTrue(content.startsWith("<html xmlns=\"http://www.w3.org/1999/xhtml\">"));
144145
145146 //unparseable
151152 reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
152153 metadataList = JsonMetadataList.fromJson(reader);
153154 assertEquals(12, metadataList.size());
154 content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
155 content = metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).trim();
155156 assertTrue(content.startsWith("<html xmlns=\"http://www.w3.org/1999/xhtml\">"));
156157
157158 //xml
163164 reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
164165 metadataList = JsonMetadataList.fromJson(reader);
165166 assertEquals(12, metadataList.size());
166 content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
167 content = metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).trim();
167168 assertTrue(content.startsWith("<html xmlns=\"http://www.w3.org/1999/xhtml\">"));
168169
169170 //text
175176 reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
176177 metadataList = JsonMetadataList.fromJson(reader);
177178 assertEquals(12, metadataList.size());
178 content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
179 content = metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).trim();
179180 assertTrue(content.startsWith("embed_3"));
180181
181182 //ignore
187188 reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
188189 metadataList = JsonMetadataList.fromJson(reader);
189190 assertEquals(12, metadataList.size());
190 assertNull(metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT));
191 assertNull(metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
191192
192193 }
193194
206207 Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
207208 List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
208209 assertEquals(12, metadataList.size());
209 String content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
210 String content = metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).trim();
210211 assertTrue(content.startsWith("<html xmlns=\"http://www.w3.org/1999/xhtml\">"));
211212
212213 //unparseable
222223 reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
223224 metadataList = JsonMetadataList.fromJson(reader);
224225 assertEquals(12, metadataList.size());
225 content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
226 content = metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).trim();
226227 assertTrue(content.startsWith("<html xmlns=\"http://www.w3.org/1999/xhtml\">"));
227228
228229 //xml
238239 reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
239240 metadataList = JsonMetadataList.fromJson(reader);
240241 assertEquals(12, metadataList.size());
241 content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
242 content = metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).trim();
242243 assertTrue(content.startsWith("<html xmlns=\"http://www.w3.org/1999/xhtml\">"));
243244
244245 //text
254255 reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
255256 metadataList = JsonMetadataList.fromJson(reader);
256257 assertEquals(12, metadataList.size());
257 content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
258 content = metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).trim();
258259 assertTrue(content.startsWith("embed_3"));
259260
260261 //ignore -- no content
270271 reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
271272 metadataList = JsonMetadataList.fromJson(reader);
272273 assertEquals(12, metadataList.size());
273 assertNull(metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT));
274 assertNull(metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
274275 }
275276
276277 }
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.server;
17
18 import org.junit.Test;
19
20 import java.util.Map;
21 import java.util.Random;
22 import java.util.concurrent.Callable;
23 import java.util.concurrent.ExecutorCompletionService;
24 import java.util.concurrent.ExecutorService;
25 import java.util.concurrent.Executors;
26 import java.util.concurrent.Future;
27
28 import static org.junit.Assert.assertEquals;
29 import static org.junit.Assert.assertNotNull;
30
31 public class ServerStatusTest {
32
33 @Test(expected = IllegalArgumentException.class)
34 public void testBadId() throws Exception {
35 ServerStatus status = new ServerStatus();
36 status.complete(2);
37 }
38
39 @Test(timeout = 60000)
40 public void testBasicMultiThreading() throws Exception {
41 //make sure that synchronization is basically working
42 int numThreads = 10;
43 int filesToProcess = 20;
44 ExecutorService service = Executors.newFixedThreadPool(numThreads);
45 ExecutorCompletionService<Integer> completionService = new ExecutorCompletionService<>(service);
46 ServerStatus serverStatus = new ServerStatus();
47 for (int i = 0; i < numThreads; i++) {
48 completionService.submit(new MockTask(serverStatus, filesToProcess));
49 }
50 int finished = 0;
51 int totalProcessed = 0;
52 while (finished < numThreads) {
53 Future<Integer> future = completionService.take();
54 if (future != null) {
55 finished++;
56 Integer completed = future.get();
57 totalProcessed += completed;
58 }
59 }
60 assertEquals(numThreads*filesToProcess, totalProcessed);
61 assertEquals(0, serverStatus.getTasks().size());
62 assertEquals(totalProcessed, serverStatus.getFilesProcessed());
63
64 }
65
66 private class MockTask implements Callable<Integer> {
67 Random r = new Random();
68 private final ServerStatus serverStatus;
69 private final int filesToProcess;
70 public MockTask(ServerStatus serverStatus, int filesToProcess) {
71 this.serverStatus = serverStatus;
72 this.filesToProcess = filesToProcess;
73 }
74
75 @Override
76 public Integer call() throws Exception {
77 int processed = 0;
78 for (int i = 0; i < filesToProcess; i++) {
79 sleepRandom(200);
80 long taskId = serverStatus.start(ServerStatus.TASK.PARSE, null);
81 sleepRandom(100);
82 serverStatus.complete(taskId);
83 processed++;
84 serverStatus.getStatus();
85 sleepRandom(10);
86 serverStatus.setStatus(ServerStatus.STATUS.OPERATING);
87 sleepRandom(20);
88 Map<Long, TaskStatus> tasks = serverStatus.getTasks();
89 assertNotNull(tasks);
90 }
91 return processed;
92 }
93
94 private void sleepRandom(int millis) throws InterruptedException {
95 int sleep = r.nextInt(millis);
96 Thread.sleep(sleep);
97 }
98 }
99 }
6464 List<ResourceProvider> rCoreProviders = new ArrayList<ResourceProvider>();
6565 rCoreProviders.add(new SingletonResourceProvider(new MetadataResource()));
6666 rCoreProviders.add(new SingletonResourceProvider(new RecursiveMetadataResource()));
67 rCoreProviders.add(new SingletonResourceProvider(new DetectorResource()));
67 rCoreProviders.add(new SingletonResourceProvider(new DetectorResource(new ServerStatus())));
6868 rCoreProviders.add(new SingletonResourceProvider(new TikaResource()));
6969 rCoreProviders.add(new SingletonResourceProvider(new UnpackerResource()));
7070 sf.setResourceProviders(rCoreProviders);
5858 List<ResourceProvider> rCoreProviders = new ArrayList<ResourceProvider>();
5959 rCoreProviders.add(new SingletonResourceProvider(new MetadataResource()));
6060 rCoreProviders.add(new SingletonResourceProvider(new RecursiveMetadataResource()));
61 rCoreProviders.add(new SingletonResourceProvider(new DetectorResource()));
61 rCoreProviders.add(new SingletonResourceProvider(new DetectorResource(new ServerStatus())));
6262 rCoreProviders.add(new SingletonResourceProvider(new TikaResource()));
6363 rCoreProviders.add(new SingletonResourceProvider(new UnpackerResource()));
6464 sf.setResourceProviders(rCoreProviders);
2222 import javax.ws.rs.core.Response;
2323
2424 import java.io.InputStream;
25 import java.util.HashMap;
26 import java.util.List;
2527 import java.util.Map;
2628
29 import com.google.gson.Gson;
30 import com.google.gson.GsonBuilder;
2731 import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
2832 import org.apache.cxf.jaxrs.client.WebClient;
2933 import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
3135 import org.apache.tika.parser.microsoft.POIFSContainerDetector;
3236 import org.apache.tika.parser.pkg.ZipContainerDetector;
3337 import org.apache.tika.server.resource.TikaDetectors;
34 import org.eclipse.jetty.util.ajax.JSON;
3538 import org.gagravarr.tika.OggDetector;
3639 import org.junit.Test;
3740
3841 public class TikaDetectorsTest extends CXFTestBase {
42
43 private static final Gson GSON = new GsonBuilder().create();
44
45
3946 private static final String DETECTORS_PATH = "/detectors";
4047
4148 @Override
99106 .get();
100107
101108 String jsonStr = getStringFromInputStream((InputStream) response.getEntity());
102 Map<String, Object> json = (Map<String, Object>) JSON.parse(jsonStr);
109 Map<String, Object> json = (Map<String, Object>) GSON.fromJson(jsonStr, Map.class);
103110
104111 // Should have a nested structure
105112 assertTrue(json.containsKey("name"));
109116 assertEquals(Boolean.TRUE, json.get("composite"));
110117
111118 // At least 4 child detectors, none of them composite
112 Object[] children = (Object[]) json.get("children");
113 assertTrue(children.length >= 4);
119 List<Object> children = (List) json.get("children");
120 assertTrue(children.size() >= 4);
114121 boolean hasOgg = false, hasPOIFS = false, hasZIP = false, hasMime = false;
115122 for (Object o : children) {
116123 Map<String, Object> d = (Map<String, Object>) o;
138145 assertTrue(hasZIP);
139146 assertTrue(hasMime);
140147 }
148
141149 }
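
A knock-on effect of the JSON-library swap shows up in the assertions above: Jetty's JSON.parse returned nested arrays as Object[], whereas GSON.fromJson(jsonStr, Map.class) yields java.util.List (and Map) values, so the test now walks List children. A minimal sketch of the shapes Gson hands back, with an illustrative JSON string:

    import java.util.List;
    import java.util.Map;

    import com.google.gson.Gson;

    public class GsonShapeSketch {
        public static void main(String[] args) {
            String jsonStr = "{\"name\":\"composite\",\"children\":[{\"name\":\"a\"},{\"name\":\"b\"}]}";
            Map<String, Object> json = (Map<String, Object>) new Gson().fromJson(jsonStr, Map.class);
            //JSON arrays deserialize as List, objects as Map, numbers as Double
            List<Object> children = (List) json.get("children");
            System.out.println(children.size()); //2
            System.out.println(((Map<String, Object>) children.get(0)).get("name")); //a
        }
    }
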
2222 import javax.ws.rs.core.Response;
2323
2424 import java.io.InputStream;
25 import java.util.List;
2526 import java.util.Map;
2627
28 import com.google.gson.Gson;
29 import com.google.gson.GsonBuilder;
2730 import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
2831 import org.apache.cxf.jaxrs.client.WebClient;
2932 import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
3033 import org.apache.tika.server.resource.TikaMimeTypes;
31 import org.eclipse.jetty.util.ajax.JSON;
3234 import org.junit.Test;
3335
3436 public class TikaMimeTypesTest extends CXFTestBase {
37
38 private static final Gson GSON = new GsonBuilder().create();
39
3540 private static final String MIMETYPES_PATH = "/mime-types";
3641
3742 @Override
96101 .get();
97102
98103 String jsonStr = getStringFromInputStream((InputStream) response.getEntity());
99 Map<String, Map<String, Object>> json = (Map<String, Map<String, Object>>) JSON.parse(jsonStr);
104 Map<String, Map<String, Object>> json = (Map<String, Map<String, Object>>)
105 GSON.fromJson(jsonStr, Map.class);
100106
101107 assertEquals(true, json.containsKey("text/plain"));
102108 assertEquals(true, json.containsKey("application/xml"));
105111
106112 Map<String, Object> bmp = json.get("image/bmp");
107113 assertEquals(true, bmp.containsKey("alias"));
108 Object[] aliases = (Object[]) bmp.get("alias");
109 assertEquals(2, aliases.length);
110 assertEquals("image/x-bmp", aliases[0]);
111 assertEquals("image/x-ms-bmp", aliases[1]);
114 List<Object> aliases = (List) bmp.get("alias");
115 assertEquals(2, aliases.size());
116
117 assertEquals("image/x-bmp", aliases.get(0));
118 assertEquals("image/x-ms-bmp", aliases.get(1));
112119
113120 String whichParser = bmp.get("parser").toString();
114121 assertTrue("Which parser", whichParser.equals("org.apache.tika.parser.ocr.TesseractOCRParser") ||
2222 import javax.ws.rs.core.Response;
2323
2424 import java.io.InputStream;
25 import java.util.List;
2526 import java.util.Map;
2627
28 import com.google.gson.Gson;
29 import com.google.gson.GsonBuilder;
2730 import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
2831 import org.apache.cxf.jaxrs.client.WebClient;
2932 import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
3134 import org.apache.tika.parser.pdf.PDFParser;
3235 import org.apache.tika.parser.pkg.PackageParser;
3336 import org.apache.tika.server.resource.TikaParsers;
34 import org.eclipse.jetty.util.ajax.JSON;
3537 import org.gagravarr.tika.OpusParser;
3638 import org.junit.Test;
3739
3840 public class TikaParsersTest extends CXFTestBase {
41
42 private static final Gson GSON = new GsonBuilder().create();
43
3944 private static final String PARSERS_SUMMARY_PATH = "/parsers";
4045 private static final String PARSERS_DETAILS_PATH = "/parsers/details";
4146
95100 .get();
96101
97102 String text = getStringFromInputStream((InputStream) response.getEntity());
98 assertContains("<h2>DefaultParser</h2>", text);
103 assertContains("<h3>DefaultParser</h3>", text);
99104 assertContains("Composite", text);
100105
101 assertContains("<h3>OpusParser", text);
102 assertContains("<h3>PackageParser", text);
103 assertContains("<h3>OOXMLParser", text);
106 assertContains("<h4>OpusParser", text);
107 assertContains("<h4>PackageParser", text);
108 assertContains("<h4>OOXMLParser", text);
104109
105110 assertContains(OpusParser.class.getName(), text);
106111 assertContains(PackageParser.class.getName(), text);
131136 .get();
132137
133138 String jsonStr = getStringFromInputStream((InputStream) response.getEntity());
134 Map<String, Map<String, Object>> json = (Map<String, Map<String, Object>>) JSON.parse(jsonStr);
139 Map<String, Map<String, Object>> json = (Map<String, Map<String, Object>>)
140 GSON.fromJson(jsonStr, Map.class);
135141
136142 // Should have a nested structure
137143 assertEquals(true, json.containsKey("name"));
138144 assertEquals(true, json.containsKey("composite"));
139145 assertEquals(true, json.containsKey("children"));
140 assertEquals("org.apache.tika.parser.DefaultParser", json.get("name"));
146 assertEquals("org.apache.tika.parser.CompositeParser", json.get("name"));
141147 assertEquals(Boolean.TRUE, json.get("composite"));
142148
143149 // At least 20 child parsers which aren't composite, except for CompositeExternalParser
144 Object[] children = (Object[]) (Object) json.get("children");
145 assertTrue(children.length >= 20);
146 boolean hasOpus = false, hasOOXML = false, hasPDF = false, hasZip = false;
150 List<Object> children = (List)json.get("children");
151 assertTrue(children.size() >= 2);
152 boolean hasOpus = false, hasOOXML = false, hasZip = false;
147153 int nonComposite = 0;
148154 int composite = 0;
149155 for (Object o : children) {
150 Map<String, Object> d = (Map<String, Object>) o;
151 assertEquals(true, d.containsKey("name"));
152 assertEquals(true, d.containsKey("composite"));
156 Map<String, Object> child = (Map<String, Object>) o;
157 assertEquals(true, child.containsKey("name"));
158 assertEquals(true, child.containsKey("composite"));
153159
154 if (d.get("composite") == Boolean.FALSE)
155 nonComposite++;
156 else
157 composite++;
158
159 // Will only have mime types if requested
160 if (d.get("composite") == Boolean.FALSE)
161 assertEquals(details, d.containsKey("supportedTypes"));
160 List<Object> grandChildrenArr = (List) child.get("children");
161 if (grandChildrenArr == null) {
162 continue;
163 }
164 assertTrue(grandChildrenArr.size() > 50);
165 for (Object grandChildO : grandChildrenArr) {
166 Map<String, Object> grandChildren = (Map<String, Object>) grandChildO;
162167
163 String name = (String) d.get("name");
164 if (OpusParser.class.getName().equals(name)) {
165 hasOpus = true;
166 }
167 if (OOXMLParser.class.getName().equals(name)) {
168 hasOOXML = true;
169 }
170 if (PDFParser.class.getName().equals(name)) {
171 hasPDF = true;
172 }
173 if (PackageParser.class.getName().equals(name)) {
174 hasZip = true;
168 if (grandChildren.get("composite") == Boolean.FALSE)
169 nonComposite++;
170 else
171 composite++;
172
173 // Will only have mime types if requested
174 if (grandChildren.get("composite") == Boolean.FALSE)
175 assertEquals(details, grandChildren.containsKey("supportedTypes"));
176
177 String name = (String) grandChildren.get("name");
178 if (OpusParser.class.getName().equals(name)) {
179 hasOpus = true;
180 }
181 if (OOXMLParser.class.getName().equals(name)) {
182 hasOOXML = true;
183 }
184 if (PackageParser.class.getName().equals(name)) {
185 hasZip = true;
186 }
175187 }
176188 }
177189 assertEquals(true, hasOpus);
178190 assertEquals(true, hasOOXML);
179 assertEquals(true, hasPDF);
180191 assertEquals(true, hasZip);
181192 assertTrue(nonComposite > 20);
182193 assertTrue(composite == 0 || composite == 1); // if CompositeExternalParser is available it will be 1
257257 assertEquals(500, response.getStatus());
258258 }
259259
260 //TIKA-2669
261 @Test
262 public void testPDFConfig() throws Exception {
263
264 Response response = WebClient.create(endPoint + TIKA_PATH)
265 .type("application/pdf")
266 .accept("text/plain")
267 .put(ClassLoader.getSystemResourceAsStream("testPDFTwoTextBoxes.pdf"));
268 String responseMsg = getStringFromInputStream((InputStream) response
269 .getEntity());
270 responseMsg = responseMsg.replaceAll("[\r\n ]+", " ").trim();
271 assertEquals("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2",
272 responseMsg);
273
274 response = WebClient.create(endPoint + TIKA_PATH)
275 .type("application/pdf")
276 .accept("text/plain")
277 .header(TikaResource.X_TIKA_PDF_HEADER_PREFIX+"sortByPosition", "false")
278 .put(ClassLoader.getSystemResourceAsStream("testPDFTwoTextBoxes.pdf"));
279 responseMsg = getStringFromInputStream((InputStream) response
280 .getEntity());
281 responseMsg = responseMsg.replaceAll("[\r\n ]+", " ").trim();
282 assertEquals("Left column line 1 Left column line 2 Right column line 1 Right column line 2", responseMsg);
283
284 //make sure that default reverts to initial config option
285 response = WebClient.create(endPoint + TIKA_PATH)
286 .type("application/pdf")
287 .accept("text/plain")
288 .put(ClassLoader.getSystemResourceAsStream("testPDFTwoTextBoxes.pdf"));
289 responseMsg = getStringFromInputStream((InputStream) response
290 .getEntity());
291 responseMsg = responseMsg.replaceAll("[\r\n ]+", " ").trim();
292 assertEquals("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2",
293 responseMsg);
294
295 }
296
297
260298 @Test
261299 public void testExtractTextAcceptPlainText() throws Exception {
262300 //TIKA-2384
286324 .accept("text/plain")
287325 .header(TikaResource.X_TIKA_OCR_HEADER_PREFIX +
288326 "tesseractPath",
289
290327 "C://tmp//hello.bat\u0000")
291328 .put(ClassLoader.getSystemResourceAsStream("testOCR.pdf"));
292 assertEquals(500, response.getStatus());
329 assertEquals(400, response.getStatus());
293330
294331 response = WebClient.create(endPoint + TIKA_PATH)
295332 .type("application/pdf")
310347 "trustedPageSeparator",
311348 "\u0010")
312349 .put(ClassLoader.getSystemResourceAsStream("testOCR.pdf"));
313 assertEquals(500, response.getStatus());
350 assertEquals(400, response.getStatus());
314351
315352 }
316353
0 /*
1 * Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 package org.apache.tika.server;
17
18 import org.apache.cxf.jaxrs.client.WebClient;
19 import org.apache.tika.TikaTest;
20 import org.apache.tika.io.IOUtils;
21 import org.apache.tika.metadata.Metadata;
22 import org.apache.tika.metadata.OfficeOpenXMLExtended;
23 import org.apache.tika.metadata.serialization.JsonMetadataList;
24 import org.junit.Test;
25
26 import javax.ws.rs.core.Response;
27 import java.io.InputStream;
28 import java.io.InputStreamReader;
29 import java.io.Reader;
30 import java.time.Duration;
31 import java.time.Instant;
32 import java.util.List;
33
34 import static java.nio.charset.StandardCharsets.UTF_8;
35 import static org.junit.Assert.assertEquals;
36
37 public class TikaServerIntegrationTest extends TikaTest {
38
39 private static final String TEST_RECURSIVE_DOC = "test_recursive_embedded.docx";
40 private static final String TEST_OOM = "mock/real_oom.xml";
41 private static final String TEST_SYSTEM_EXIT = "mock/system_exit.xml";
42 private static final String TEST_HEAVY_HANG = "mock/heavy_hang_30000.xml";
43 private static final String TEST_HEAVY_HANG_SHORT = "mock/heavy_hang_100.xml";
44 private static final String META_PATH = "/rmeta";
45
46 //running into conflicts on 9998 with the CXFTestBase tests
47 //TODO: figure out why?!
48 private static final String INTEGRATION_TEST_PORT = "9999";
49
50 protected static final String endPoint =
51 "http://localhost:" + INTEGRATION_TEST_PORT;
52
53 @Test
54 public void testBasic() throws Exception {
55
56 Thread serverThread = new Thread() {
57 @Override
58 public void run() {
59 TikaServerCli.main(
60 new String[]{
61 "-spawnChild",
62 "-p", INTEGRATION_TEST_PORT
63 });
64 }
65 };
66 serverThread.start();
67 awaitServerStartup();
68
69 Response response = WebClient
70 .create(endPoint + META_PATH)
71 .accept("application/json")
72 .put(ClassLoader
73 .getSystemResourceAsStream(TEST_RECURSIVE_DOC));
74 Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
75 List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
76 assertEquals(12, metadataList.size());
77 assertEquals("Microsoft Office Word", metadataList.get(0).get(OfficeOpenXMLExtended.APPLICATION));
78 assertContains("plundered our seas", metadataList.get(6).get("X-TIKA:content"));
79
80 //assertEquals("a38e6c7b38541af87148dee9634cb811", metadataList.get(10).get("X-TIKA:digest:MD5"));
81
82 serverThread.interrupt();
83
84
85 }
86
87 @Test
88 public void testOOM() throws Exception {
89
90 Thread serverThread = new Thread() {
91 @Override
92 public void run() {
93 TikaServerCli.main(
94 new String[]{
95 "-spawnChild", "-JXmx512m",
96 "-p", INTEGRATION_TEST_PORT
97 });
98 }
99 };
100 serverThread.start();
101 awaitServerStartup();
102 Response response = WebClient
103 .create(endPoint + META_PATH)
104 .accept("application/json")
105 .put(ClassLoader
106 .getSystemResourceAsStream(TEST_OOM));
107 //give some time for the server to crash/kill itself
108 Thread.sleep(2000);
109 awaitServerStartup();
110
111 response = WebClient
112 .create(endPoint + META_PATH)
113 .accept("application/json")
114 .put(ClassLoader
115 .getSystemResourceAsStream(TEST_RECURSIVE_DOC));
116 Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
117 List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
118 assertEquals(12, metadataList.size());
119 assertEquals("Microsoft Office Word", metadataList.get(0).get(OfficeOpenXMLExtended.APPLICATION));
120 assertContains("plundered our seas", metadataList.get(6).get("X-TIKA:content"));
121
122 serverThread.interrupt();
123 }
124
125 @Test
126 public void testSystemExit() throws Exception {
127
128 Thread serverThread = new Thread() {
129 @Override
130 public void run() {
131 TikaServerCli.main(
132 new String[]{
133 "-spawnChild",
134 "-p", INTEGRATION_TEST_PORT
135 });
136 }
137 };
138 serverThread.start();
139 awaitServerStartup();
140 Response response = null;
141 try {
142 response = WebClient
143 .create(endPoint + META_PATH)
144 .accept("application/json")
145 .put(ClassLoader
146 .getSystemResourceAsStream(TEST_SYSTEM_EXIT));
147 } catch (Exception e) {
148 //sys exit causes catchable problems for the client
149 }
150 //give some time for the server to crash/kill itself
151 Thread.sleep(2000);
152
153 awaitServerStartup();
154
155 response = WebClient
156 .create(endPoint + META_PATH)
157 .accept("application/json")
158 .put(ClassLoader
159 .getSystemResourceAsStream(TEST_RECURSIVE_DOC));
160
161 Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
162 List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
163 assertEquals(12, metadataList.size());
164 assertEquals("Microsoft Office Word", metadataList.get(0).get(OfficeOpenXMLExtended.APPLICATION));
165 assertContains("plundered our seas", metadataList.get(6).get("X-TIKA:content"));
166
167 serverThread.interrupt();
168
169
170 }
171
172 @Test
173 public void testTimeoutOk() throws Exception {
174 //test that there's enough time for this file.
175 Thread serverThread = new Thread() {
176 @Override
177 public void run() {
178 TikaServerCli.main(
179 new String[]{
180 "-spawnChild", "-p", INTEGRATION_TEST_PORT,
181 "-taskTimeoutMillis", "10000", "-taskPulseMillis", "500",
182 "-pingPulseMillis", "500"
183 });
184 }
185 };
186 serverThread.start();
187 awaitServerStartup();
188 Response response = WebClient
189 .create(endPoint + META_PATH)
190 .accept("application/json")
191 .put(ClassLoader
192 .getSystemResourceAsStream(TEST_HEAVY_HANG_SHORT));
193 awaitServerStartup();
194
195 response = WebClient
196 .create(endPoint + META_PATH)
197 .accept("application/json")
198 .put(ClassLoader
199 .getSystemResourceAsStream(TEST_RECURSIVE_DOC));
200 Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
201 List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
202 assertEquals(12, metadataList.size());
203 assertEquals("Microsoft Office Word", metadataList.get(0).get(OfficeOpenXMLExtended.APPLICATION));
204 assertContains("plundered our seas", metadataList.get(6).get("X-TIKA:content"));
205
206 serverThread.interrupt();
207
208
209 }
210
211 @Test
212 public void testTimeout() throws Exception {
213
214 Thread serverThread = new Thread() {
215 @Override
216 public void run() {
217 TikaServerCli.main(
218 new String[]{
219 "-spawnChild", "-p", INTEGRATION_TEST_PORT,
220 "-taskTimeoutMillis", "10000", "-taskPulseMillis", "500",
221 "-pingPulseMillis", "500"
222 });
223 }
224 };
225 serverThread.start();
226 awaitServerStartup();
227 Response response = null;
228 try {
229 response = WebClient
230 .create(endPoint + META_PATH)
231 .accept("application/json")
232 .put(ClassLoader
233 .getSystemResourceAsStream(TEST_HEAVY_HANG));
234 } catch (Exception e) {
235 //catchable exception when server shuts down.
236 }
237 awaitServerStartup();
238
239 response = WebClient
240 .create(endPoint + META_PATH)
241 .accept("application/json")
242 .put(ClassLoader
243 .getSystemResourceAsStream(TEST_RECURSIVE_DOC));
244 Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
245 List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
246 assertEquals(12, metadataList.size());
247 assertEquals("Microsoft Office Word", metadataList.get(0).get(OfficeOpenXMLExtended.APPLICATION));
248 assertContains("plundered our seas", metadataList.get(6).get("X-TIKA:content"));
249
250 serverThread.interrupt();
251
252
253 }
254
255 private void awaitServerStartup() throws Exception {
256
257 Instant started = Instant.now();
258 long elapsed = Duration.between(started, Instant.now()).toMillis();
259 while (elapsed < 30000) {
260 try {
261 Response response = WebClient
262 .create(endPoint + "/tika")
263 .accept("text/plain")
264 .get();
265 if (response.getStatus() == 200) {
266 return;
267 }
268 } catch (javax.ws.rs.ProcessingException e) {
269 }
270 Thread.sleep(1000);
271 elapsed = Duration.between(started, Instant.now()).toMillis();
272 }
273
274 }
275 }
4444 List<ResourceProvider> rpsCore =
4545 new ArrayList<ResourceProvider>();
4646 rpsCore.add(new SingletonResourceProvider(new TikaVersion()));
47 rpsCore.add(new SingletonResourceProvider(new DetectorResource()));
47 rpsCore.add(new SingletonResourceProvider(new DetectorResource(new ServerStatus())));
4848 rpsCore.add(new SingletonResourceProvider(new MetadataResource()));
4949 List<ResourceProvider> all = new ArrayList<ResourceProvider>(rpsCore);
5050 all.add(new SingletonResourceProvider(new TikaWelcome(rpsCore)));
4646 protected void setUpResources(JAXRSServerFactoryBean sf) {
4747 sf.setResourceClasses(TranslateResource.class);
4848 sf.setResourceProvider(TranslateResource.class,
49 new SingletonResourceProvider(new TranslateResource()));
49 new SingletonResourceProvider(new TranslateResource(new ServerStatus())));
5050
5151 }
5252
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.18</version>
27 <version>1.19</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030
134134 <plugin>
135135 <groupId>org.apache.maven.plugins</groupId>
136136 <artifactId>maven-jar-plugin</artifactId>
137 <configuration>
138 <archive>
139 <manifestEntries>
140 <Automatic-Module-Name>org.apache.tika.translate</Automatic-Module-Name>
141 </manifestEntries>
142 </archive>
143 </configuration>
137144 <executions>
138145 <execution>
139146 <goals>
2424 <parent>
2525 <groupId>org.apache.tika</groupId>
2626 <artifactId>tika-parent</artifactId>
27 <version>1.18</version>
27 <version>1.19</version>
2828 <relativePath>../tika-parent/pom.xml</relativePath>
2929 </parent>
3030
5858 <plugin>
5959 <groupId>org.apache.maven.plugins</groupId>
6060 <artifactId>maven-jar-plugin</artifactId>
61 <configuration>
62 <archive>
63 <manifestEntries>
64 <Automatic-Module-Name>org.apache.tika.xmp</Automatic-Module-Name>
65 </manifestEntries>
66 </archive>
67 </configuration>
6168 <executions>
6269 <execution>
6370 <goals>