List of usage examples for the `org.xml.sax.helpers.DefaultHandler` constructor (`new DefaultHandler()`).
From source file:org.apache.tika.server.MetadataEP.java
/** * Get a specific metadata field. If the input stream cannot be parsed, but a * value was found for the given metadata field, then the value of the field * is returned as part of a 200 OK response; otherwise a * {@link Status#BAD_REQUEST} is generated. If the stream was successfully * parsed but the specific metadata field was not found, then a * {@link Status#NOT_FOUND} is returned. * <p>//from w w w .j a v a 2 s . c o m * Note that this method handles multivalue fields and returns possibly more * metadata than requested. * * @param field * the tika metadata field name * @param is * the document stream * @return one of {@link Status#OK}, {@link Status#NOT_FOUND}, or * {@link Status#BAD_REQUEST} * @throws Exception */ @POST @Path("{field}") public Response getMetadataField(@PathParam("field") String field, InputStream is) throws Exception { // use BAD request to indicate that we may not have had enough data to // process the request Status defaultErrorResponse = Status.BAD_REQUEST; try { parser.parse(is, new DefaultHandler(), metadata); // once we've parsed the document successfully, we should use NOT_FOUND // if we did not see the field defaultErrorResponse = Status.NOT_FOUND; } catch (Exception e) { logger.info("Failed to process field " + field, e); } String[] values = metadata.getValues(field); if (values.length == 0) { return Response.status(defaultErrorResponse).entity("Failed to get metadata field " + field).build(); } // remove fields we don't care about for the response for (String name : metadata.names()) { if (!field.equals(name)) { metadata.remove(name); } } return Response.ok(metadata).build(); }
From source file:org.apache.tika.server.MetadataResource.java
@PUT @Produces("text/csv") public StreamingOutput getMetadata(InputStream is, @Context HttpHeaders httpHeaders, @Context UriInfo info) throws Exception { final Metadata metadata = new Metadata(); AutoDetectParser parser = TikaResource.createParser(); TikaResource.fillMetadata(parser, metadata, httpHeaders); TikaResource.logRequest(logger, info, metadata); parser.parse(is, new DefaultHandler(), metadata); return new StreamingOutput() { public void write(OutputStream outputStream) throws IOException, WebApplicationException { CSVWriter writer = new CSVWriter(new OutputStreamWriter(outputStream)); for (String name : metadata.names()) { String[] values = metadata.getValues(name); ArrayList<String> list = new ArrayList<String>(values.length + 1); list.add(name);//from ww w .j av a 2 s. co m list.addAll(Arrays.asList(values)); writer.writeNext(list.toArray(values)); } writer.close(); } }; }
From source file:org.apache.tika.server.RecursiveMetadataResource.java
private MetadataList parseMetadata(InputStream is, MultivaluedMap<String, String> httpHeaders, UriInfo info) throws Exception { final Metadata metadata = new Metadata(); final ParseContext context = new ParseContext(); AutoDetectParser parser = TikaResource.createParser(tikaConfig); //TODO: parameterize choice of handler and max chars? BasicContentHandlerFactory.HANDLER_TYPE type = BasicContentHandlerFactory.HANDLER_TYPE.TEXT; RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser, new BasicContentHandlerFactory(type, -1)); TikaResource.fillMetadata(parser, metadata, context, httpHeaders); TikaResource.fillParseContext(context, httpHeaders); TikaResource.logRequest(logger, info, metadata); wrapper.parse(is, new DefaultHandler(), metadata, context); return new MetadataList(wrapper.getMetadata()); }
From source file:org.apache.tika.server.resource.UnpackerResource.java
private Map<String, byte[]> process(InputStream is, @Context HttpHeaders httpHeaders, @Context UriInfo info, boolean saveAll) throws Exception { Metadata metadata = new Metadata(); ParseContext pc = new ParseContext(); Parser parser = TikaResource.createParser(); if (parser instanceof DigestingParser) { //no need to digest for unwrapping parser = ((DigestingParser) parser).getWrappedParser(); }//from w w w. j a v a 2s . c o m TikaResource.fillMetadata(parser, metadata, pc, httpHeaders.getRequestHeaders()); TikaResource.logRequest(LOG, info, metadata); ContentHandler ch; ByteArrayOutputStream text = new ByteArrayOutputStream(); if (saveAll) { ch = new BodyContentHandler(new RichTextContentHandler(new OutputStreamWriter(text, UTF_8))); } else { ch = new DefaultHandler(); } Map<String, byte[]> files = new HashMap<>(); MutableInt count = new MutableInt(); pc.set(EmbeddedDocumentExtractor.class, new MyEmbeddedDocumentExtractor(count, files)); TikaResource.parse(parser, LOG, info.getPath(), is, ch, metadata, pc); if (count.intValue() == 0 && !saveAll) { throw new WebApplicationException(Response.Status.NO_CONTENT); } if (saveAll) { files.put(TEXT_FILENAME, text.toByteArray()); ByteArrayOutputStream metaStream = new ByteArrayOutputStream(); metadataToCsv(metadata, metaStream); files.put(META_FILENAME, metaStream.toByteArray()); } return files; }
From source file:org.apache.tika.server.UnpackerResource.java
@PUT @Produces("application/zip") public StreamingOutput getText(InputStream is, @Context HttpHeaders httpHeaders, @Context UriInfo info) throws Exception { Metadata metadata = new Metadata(); AutoDetectParser parser = TikaResource.createParser(); TikaResource.fillMetadata(parser, metadata, httpHeaders); TikaResource.logRequest(logger, info, metadata); ContentHandler ch = new DefaultHandler(); ParseContext pc = new ParseContext(); ZipOutput zout = new ZipOutput(); MutableInt count = new MutableInt(); pc.set(EmbeddedDocumentExtractor.class, new MyEmbeddedDocumentExtractor(count, zout)); try {//from ww w . j a v a2 s. c om parser.parse(is, ch, metadata, pc); } catch (TikaException ex) { logger.warn(String.format("%s: Unpacker failed", info.getPath()), ex); } if (count.intValue() == 0) { throw new WebApplicationException(Response.Status.NO_CONTENT); } return zout; }
From source file:org.datacleaner.cli.MainTest.java
public void testWriteHtmlToFile() throws Throwable { String filename = "target/test_write_html_to_file.html"; Main.main(/* w ww . j a v a 2 s .c o m*/ ("-conf src/test/resources/cli-examples/conf.xml -job src/test/resources/cli-examples/employees_job.xml -of " + filename + " -ot HTML").split(" ")); File file = new File(filename); assertTrue(file.exists()); { String result = FileHelper.readFileAsString(file); String[] lines = result.split("\n"); assertEquals("<html>", lines[1]); } InputStream in = FileHelper.getInputStream(file); try { // parse it with validator.nu for HTML correctness final HtmlParser htmlParser = new HtmlParser(XmlViolationPolicy.FATAL); final AtomicInteger elementCounter = new AtomicInteger(); htmlParser.setContentHandler(new DefaultHandler() { @Override public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException { elementCounter.incrementAndGet(); } }); final List<Exception> warningsAndErrors = new ArrayList<Exception>(); htmlParser.setErrorHandler(new ErrorHandler() { @Override public void warning(SAXParseException exception) throws SAXException { System.err.println("Warning: " + exception.getMessage()); warningsAndErrors.add(exception); } @Override public void fatalError(SAXParseException exception) throws SAXException { System.out.println("Fatal error: " + exception.getMessage()); throw exception; } @Override public void error(SAXParseException exception) throws SAXException { System.err.println("Error: " + exception.getMessage()); warningsAndErrors.add(exception); } }); htmlParser.parse(new InputSource(in)); // the output has approx 3600 XML elements int elementCount = elementCounter.get(); assertTrue("Element count: " + elementCount, elementCount > 3000); assertTrue("Element count: " + elementCount, elementCount < 5000); if (!warningsAndErrors.isEmpty()) { for (Exception error : warningsAndErrors) { String message = error.getMessage(); if (message.startsWith("No explicit character encoding 
declaration has been seen yet") || message.startsWith("The character encoding of the document was not declared.")) { // ignore/accept this one continue; } error.printStackTrace(); fail("Got " + warningsAndErrors.size() + " warnings and errors, see log for details"); } } } finally { in.close(); } }
From source file:org.dita.dost.module.GenMapAndTopicListModule.java
/**
 * Initialize reusable filters.
 */
private void initFilters() {
    listFilter = new GenListModuleReader();
    listFilter.setLogger(logger);
    listFilter.setPrimaryDitamap(rootFile);
    listFilter.setJob(job);

    // Filter utilities are only needed when conditional profiling is on.
    if (profilingEnabled) {
        filterUtils = parseFilterFile();
    }

    exportAnchorsFilter = new ExportAnchorsFilter();
    exportAnchorsFilter.setInputFile(rootFile);

    keydefFilter = new KeydefFilter();
    keydefFilter.setLogger(logger);
    keydefFilter.setCurrentFile(rootFile);
    keydefFilter.setJob(job);

    // A no-op handler for pipeline stages that require a ContentHandler
    // but should not produce any output.
    nullHandler = new DefaultHandler();
}
From source file:org.dita.dost.module.reader.AbstractReaderModule.java
/** * Initialize reusable filters./*from w ww.j a v a2 s . c om*/ */ void initFilters() { tempFileNameScheme.setBaseDir(job.getInputDir()); listFilter = new GenListModuleReader(); listFilter.setLogger(logger); listFilter.setPrimaryDitamap(rootFile); listFilter.setJob(job); listFilter.setFormatFilter(formatFilter); if (profilingEnabled) { filterUtils = parseFilterFile(); } if (INDEX_TYPE_ECLIPSEHELP.equals(transtype)) { exportAnchorsFilter = new ExportAnchorsFilter(); exportAnchorsFilter.setInputFile(rootFile); } keydefFilter = new KeydefFilter(); keydefFilter.setLogger(logger); keydefFilter.setCurrentFile(rootFile); keydefFilter.setJob(job); nullHandler = new DefaultHandler(); ditaWriterFilter = new DitaWriterFilter(); ditaWriterFilter.setTempFileNameScheme(tempFileNameScheme); ditaWriterFilter.setLogger(logger); ditaWriterFilter.setJob(job); ditaWriterFilter.setEntityResolver(reader.getEntityResolver()); topicFragmentFilter = new TopicFragmentFilter(ATTRIBUTE_NAME_CONREF, ATTRIBUTE_NAME_CONREFEND); }
From source file:org.dita.dost.reader.TestGenListModuleReader.java
/**
 * Configures a GenListModuleReader for the given root map and drives it
 * with a freshly initialized XML parser.
 *
 * @param rootFile the root DITA map to parse
 * @throws Exception if parser setup or parsing fails
 */
private void run(final File rootFile) throws Exception {
    final File ditaDir = new File("src" + File.separator + "main").getAbsoluteFile();
    final boolean validate = false;

    reader = new GenListModuleReader();
    reader.setLogger(new TestUtils.TestLogger());
    reader.setCurrentFile(rootFile.toURI());
    reader.setPrimaryDitamap(rootFile.toURI());
    reader.setJob(new Job(tempDir));
    // The reader forwards events downstream; give it a no-op sink since
    // this test only inspects the reader's collected state.
    reader.setContentHandler(new DefaultHandler());

    final XMLReader parser = initXMLReader(ditaDir, validate,
            new File(rootFile.getPath()).getCanonicalFile());
    parser.setContentHandler(reader);
    parser.parse(rootFile.toURI().toString());
}
From source file:org.eclim.plugin.core.util.XmlUtils.java
/**
 * Gets an aggregate handler which delegates accordingly to the supplied
 * handlers.
 *
 * @param handler main DefaultHandler to delegate to (may be null, in which
 *     case a no-op DefaultHandler is substituted).
 * @param errorHandler DefaultHandler to delegate errors to (may be null).
 * @param entityResolver EntityResolver to delegate to (may be null).
 * @return an AggregateHandler wrapping the three delegates.
 */
private static DefaultHandler getHandler(DefaultHandler handler, DefaultHandler errorHandler,
        EntityResolver entityResolver) {
    DefaultHandler mainDelegate;
    if (handler == null) {
        mainDelegate = new DefaultHandler();
    } else {
        mainDelegate = handler;
    }
    return new AggregateHandler(mainDelegate, errorHandler, entityResolver);
}