Example usage for the org.xml.sax.helpers.DefaultHandler constructor DefaultHandler()

Introduction

On this page you can find example usages of the org.xml.sax.helpers.DefaultHandler constructor DefaultHandler().

Prototype

DefaultHandler

Source Link

Usage

From source file:com.shin1ogawa.appengine.marketplace.gdata.LicensingAPI.java

/**
 * Parses an XML feed and collects, for every {@code <entity>} element, the text
 * of its child elements into a map keyed by the child's qualified name.
 *
 * <p>Text is accumulated across {@code characters()} callbacks and stored when
 * the child element closes. A SAX parser may deliver the text of one element in
 * several chunks and reports whitespace between elements through the same
 * callback, so storing each chunk directly would keep only the last chunk or
 * replace a value with indentation.
 *
 * @param is stream containing the XML feed; it is read but not closed here
 * @return one map per {@code <entity>} element, in document order
 * @throws SAXException if the feed cannot be parsed
 * @throws IOException if the stream cannot be read
 * @throws ParserConfigurationException if no SAX parser can be created
 */
static List<Map<String, String>> parseLicenseFeed(InputStream is)
        throws SAXException, IOException, ParserConfigurationException {
    SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
    final List<Map<String, String>> list = new java.util.ArrayList<Map<String, String>>();
    parser.parse(is, new DefaultHandler() {

        /** True while the parser is inside an {@code <entity>} element. */
        boolean entity = false;

        /** Name of the open element whose text is being collected, or null. */
        String currentElement;

        /** Text collected so far for {@link #currentElement}. */
        final StringBuilder text = new StringBuilder();

        /** Map of the entity currently being read. */
        Map<String, String> map;

        @Override
        public void characters(char[] ch, int start, int length) {
            // Ignore text outside an entity and text that is not directly
            // inside a child element (typically indentation).
            if (entity && currentElement != null) {
                text.append(ch, start, length);
            }
        }

        @Override
        public void startElement(String uri, String localName, String qName, Attributes attributes) {
            if (!entity && "entity".equals(qName)) {
                entity = true;
                map = new java.util.HashMap<String, String>();
                list.add(map);
                // Forget any element seen before the entity so its name is
                // never used as a key.
                currentElement = null;
                text.setLength(0);
                return;
            }
            currentElement = qName;
            text.setLength(0);
        }

        @Override
        public void endElement(String uri, String localName, String qName) {
            if (!entity) {
                return;
            }
            if (currentElement != null) {
                // Closing the element whose text was being collected. Elements
                // without any text produce no entry, as before.
                if (text.length() > 0) {
                    map.put(currentElement, text.toString());
                }
                currentElement = null;
                text.setLength(0);
                return;
            }
            if ("entity".equals(qName)) {
                entity = false;
            }
        }
    });
    return list;
}

From source file:com.zegoggles.smssync.XOAuthConsumer.java

/**
 * Requests one entry of the default user's Google contacts feed and returns
 * the text found inside the feed's {@code email} element(s).
 *
 * @return the collected e-mail text (empty if the feed contains no
 *         {@code email} element), or {@code null} if signing the URL, the
 *         HTTP request or XML parsing fails
 */
protected String getUsernameFromContacts() {

    final HttpClient httpClient = new DefaultHttpClient();
    final String url = "https://www.google.com/m8/feeds/contacts/default/thin?max-results=1";
    final StringBuilder email = new StringBuilder();

    try {
        HttpGet get = new HttpGet(sign(url));
        HttpResponse resp = httpClient.execute(get);
        SAXParserFactory spf = SAXParserFactory.newInstance();
        SAXParser sp = spf.newSAXParser();
        XMLReader xr = sp.getXMLReader();
        xr.setContentHandler(new DefaultHandler() {
            /** True only between the start and end tag of an email element. */
            boolean inEmail;

            @Override
            public void startElement(String uri, String localName, String qName, Attributes atts) {
                inEmail = "email".equals(localName);
            }

            @Override
            public void endElement(String uri, String localName, String qName) {
                // Text that follows a closing tag (e.g. indentation before the
                // next element) must not be appended to the address.
                inEmail = false;
            }

            @Override
            public void characters(char[] c, int start, int length) {
                if (inEmail) {
                    email.append(c, start, length);
                }
            }
        });
        xr.parse(new InputSource(resp.getEntity().getContent()));
        return email.toString();

    } catch (oauth.signpost.exception.OAuthException e) {
        Log.e(TAG, "error", e);
        return null;
    } catch (org.xml.sax.SAXException e) {
        Log.e(TAG, "error", e);
        return null;
    } catch (java.io.IOException e) {
        Log.e(TAG, "error", e);
        return null;
    } catch (javax.xml.parsers.ParserConfigurationException e) {
        Log.e(TAG, "error", e);
        return null;
    } finally {
        // The client is created for this single request; release its connections.
        httpClient.getConnectionManager().shutdown();
    }
}

From source file:com.nidhinova.tika.server.TikaService.java

/**
 * Serves HTTP GET Returns metadata formatted as json or plain text content
 * of the file. File should be locally accessible for Tika Server using
 * pathkey JNDI/*w w  w  . j  a  v a2s . co  m*/
 * 
 * @param filename
 * @param pathkey
 *            (JNDI lookup key)
 * @param opkey
 *            (can be "text" or "metadata" or "fulldata")
 * @param httpHeaders
 * @return
 * @throws Exception
 */
@GET
@Produces({ MediaType.APPLICATION_JSON })
@Path("/{opkey}/{pathkey}/{resourceid: .*}")
public StreamingOutput getMetadata(@javax.ws.rs.core.Context javax.ws.rs.core.UriInfo uriInfo,
        @PathParam("opkey") final String opkey, @PathParam("pathkey") final String pathkey,
        @PathParam("resourceid") final String resourceId, @Context HttpHeaders httpHeaders) throws Exception {

    // get the resource segment, this may have query params
    // we are ok with it as long as we can get something at that location
    String[] segments = uriInfo.getRequestUri().toASCIIString().split("/" + opkey + "/" + pathkey + "/");
    final String filename = segments[segments.length - 1];
    logger.info("resource :" + segments[segments.length - 1]);

    final Detector detector = createDetector(httpHeaders);
    final AutoDetectParser parser = new AutoDetectParser(detector);
    final ParseContext context = new ParseContext();
    context.set(Parser.class, parser);
    final org.apache.tika.metadata.Metadata metadata = new org.apache.tika.metadata.Metadata();
    setMetadataFromHeader(parser, metadata, httpHeaders);

    URL url = null;
    try {
        if (pathkey != null && resourceId != null) {
            String filepath = getFilePath(pathkey) + filename;
            File file = new File(filepath);
            if (file.isFile()) {
                url = file.toURI().toURL();
            } else {
                url = new URL(filepath);
            }
        }
    } catch (MalformedURLException mex) {
        throw new WebApplicationException(Response.Status.NOT_FOUND);
    }

    final InputStream is = TikaInputStream.get(url, metadata);

    return new StreamingOutput() {
        public void write(OutputStream outputStream) throws IOException, WebApplicationException {

            StringWriter textBuffer = new StringWriter();
            ContentHandler handler = null;
            if (opkey.equalsIgnoreCase("metadata")) {
                handler = new DefaultHandler();
            } else if (opkey.equalsIgnoreCase("text") || opkey.equalsIgnoreCase("fulldata")) {
                handler = new BodyContentHandler(textBuffer);
            }
            try {

                parser.parse(is, handler, metadata, context);

                String contentEncoding = (metadata
                        .get(org.apache.tika.metadata.HttpHeaders.CONTENT_ENCODING) == null ? "UTF-8"
                                : metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_ENCODING));

                logger.info("Content encoding: "
                        + metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_ENCODING));

                Writer outWriter = getOutputWriter(outputStream, contentEncoding);

                //metadata is always gathered
                // munch tika metadata object it to make json
                String jsonMetadata = JSONHelper.metadataToJson(metadata);

                if (opkey.equalsIgnoreCase("metadata")) {
                    outWriter.write("{\"metadata\":" + jsonMetadata + "}");
                } else if (opkey.equalsIgnoreCase("text")) {
                    // write it out
                    outWriter.write("{ \"text\":" + JSONHelper.toJSON(textBuffer.toString()) + " }");
                } else if (opkey.equalsIgnoreCase("fulldata")) {
                    StringBuilder data = new StringBuilder();
                    data.append("{ \"metadata\":" + jsonMetadata).append(", ")
                            .append("\"text\":" + JSONHelper.toJSON(textBuffer.toString()) + " }");
                    outWriter.write(data.toString());
                }
                outWriter.flush();
            } catch (SAXException e) {
                throw new WebApplicationException(Response.Status.INTERNAL_SERVER_ERROR);
            } catch (TikaException e) {
                if (e.getCause() != null && e.getCause() instanceof WebApplicationException) {
                    throw (WebApplicationException) e.getCause();
                }

                if (e.getCause() != null && e.getCause() instanceof IllegalStateException) {
                    throw new WebApplicationException(Response.status(422).build());
                }

                if (e.getCause() != null && e.getCause() instanceof EncryptedDocumentException) {
                    throw new WebApplicationException(Response.status(422).build());
                }

                if (e.getCause() != null && e.getCause() instanceof OldWordFileFormatException) {
                    throw new WebApplicationException(Response.status(422).build());
                }

                logger.warn("Text extraction failed", e);

                throw new WebApplicationException(Response.Status.INTERNAL_SERVER_ERROR);
            }
        }
    };

}

From source file:TransformThread.java

/** Initialize the results (m_outResult) according
 * to RESULT_FLAVOR/*from ww w.  j  ava2  s.c  o  m*/
 */
private void initResult() {
    try {
        for (int i = 0; i < NUM_TRANSFORMATIONS; i++) {
            switch (RESULT_FLAVOR) {
            case STREAM:
                OutputStream outStream = new FileOutputStream(
                        FILE_OUT_BASE + "thread_" + m_thrdNum + "_transformation_" + i + FILE_OUT_EXT);

                m_outResult[i] = new StreamResult(outStream);
                break;

            case SAX:
                DefaultHandler defaultHandler = new DefaultHandler();
                m_outResult[i] = new SAXResult(defaultHandler);
                break;

            case DOM:
                m_outResult[i] = new DOMResult();
                break;
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
        System.exit(1);
    }
}

From source file:com.vmware.photon.controller.model.adapters.vsphere.ovf.OvfRetriever.java

/**
 * Opens the OVF descriptor at the given location (directly for {@code file}
 * URIs, via HTTP GET otherwise), runs it through a SAX parser to check that it
 * parses as XML, and returns the stream wrapper it was read through.
 *
 * @param ovfUri location of the descriptor
 * @return the {@code StoringInputStream} the descriptor was parsed through;
 *         the underlying stream has been closed by the time it is returned
 * @throws IOException if the descriptor cannot be opened, the server answers
 *         with a status other than 200, or the content is not valid XML
 */
private StoringInputStream toStream(URI ovfUri) throws IOException {
    SAXParser saxParser = newSaxParser();
    DefaultHandler handler = new DefaultHandler();

    InputStream is;
    HttpResponse response = null;
    HttpGet request = null;

    // Constant-first comparison: a URI without a scheme must not cause an NPE.
    if ("file".equals(ovfUri.getScheme())) {
        is = new FileInputStream(new File(ovfUri));
    } else {
        request = new HttpGet(ovfUri);
        response = this.client.execute(request);

        if (response.getStatusLine().getStatusCode() != 200) {
            // Drain the error body before giving up on the response.
            EntityUtils.consumeQuietly(response.getEntity());
            throw new IOException(
                    "Ovf descriptor not found at " + ovfUri + ". Error code " + response.getStatusLine());
        }

        is = response.getEntity().getContent();
    }

    StoringInputStream storingInputStream = new StoringInputStream(is);

    try {
        saxParser.parse(storingInputStream, handler);
        if (response != null) {
            EntityUtils.consumeQuietly(response.getEntity());
        }
    } catch (SAXException e) {
        // not a valid ovf - abort
        if (request != null) {
            request.abort();
        }
        // response is null for file URIs; without this guard an invalid local
        // file raised a NullPointerException instead of the IOException below.
        if (response != null) {
            EntityUtils.consumeQuietly(response.getEntity());
        }

        throw new IOException("Ovf not a valid xml: " + e.getMessage(), e);
    } finally {
        //close stream, could be file
        IOUtils.closeQuietly(is);
    }

    return storingInputStream;
}

From source file:MockFedoraIT.java

/**
 * Reads the file at {@code testPath} through the given module and feeds it to
 * a SAX parser with a no-op handler, so the call succeeds only if the content
 * parses as XML. The stream is closed afterwards in every case.
 *
 * @param module file system module used to open the file
 * @param testPath path of the file to read
 * @throws LowlevelStorageException declared for {@code module.read}
 * @throws RuntimeException if creating the parser or parsing fails
 */
private void parseIrodsFile(IrodsIFileSystem module, String testPath) throws LowlevelStorageException {
    InputStream is = module.read(new File(testPath));
    // initialize sax for this parse
    try {
        SAXParserFactory spf = SAXParserFactory.newInstance();
        // spf.setValidating(false);
        // spf.setNamespaceAware(true);
        SAXParser parser = spf.newSAXParser();
        parser.parse(is, new DefaultHandler());
    } catch (Exception e) {
        throw new RuntimeException("Error with SAX parser", e);
    } finally {
        if (is != null) {
            try {
                is.close();
            } catch (java.io.IOException ignored) {
                // Best-effort close: a failure here must not hide the parse outcome.
            }
        }
    }
}

From source file:at.molindo.webtools.crawler.CrawlerTask.java

/**
 * Parses a fetched page and queues the links found in the {@code href}
 * attributes of its {@code a} elements with the crawler.
 *
 * <p>When {@code _tidy} is set, the text is first converted to XHTML with
 * Tidy. Fragment parts ({@code #...}) are stripped from links; pure fragment
 * links are ignored. Absolute {@code http://} and {@code https://} links are
 * queued only when they start with the crawler's host; {@code javascript:},
 * {@code ftp:} and {@code mailto:} links are ignored; all other links are
 * resolved against the current URL.
 *
 * @param string page content to parse
 * @throws SAXException if the content cannot be parsed
 * @throws IOException if reading the content or a referenced entity fails
 */
protected void parseResult(final String string) throws SAXException, IOException {

    InputSource inputSource;
    if (_tidy) {
        final Tidy tidy = new Tidy();
        tidy.setXHTML(true);
        tidy.setErrfile("/dev/null");
        // NOTE(review): getBytes() uses the platform default charset - confirm
        // this matches the input encoding Tidy is configured for.
        final ByteArrayInputStream in = new ByteArrayInputStream(string.getBytes());
        final ByteArrayOutputStream out = new ByteArrayOutputStream();

        tidy.parse(in, out);

        inputSource = new InputSource(new ByteArrayInputStream(out.toByteArray()));
    } else {
        inputSource = new InputSource(new StringReader(string));
    }

    ((CrawlerThread) Thread.currentThread()).getParser().parse(inputSource, new DefaultHandler() {

        @Override
        public void startElement(final String uri, final String localName, final String name,
                final Attributes attributes) throws SAXException {

            if (!"a".equals(name)) {
                return;
            }
            String href = attributes.getValue("href");
            if (href == null) {
                return;
            }

            final int anchorIndex = href.lastIndexOf("#");
            if (anchorIndex == 0) {
                // anchor on same page: ignore
                return;
            }
            if (anchorIndex > 0) {
                href = href.substring(0, anchorIndex);
            }

            final CrawlerReferrer referrer = new CrawlerReferrer(_urlString, href);

            // https links are absolute too; treating them as relative paths
            // used to queue URLs such as ".../https://example.org".
            if (href.startsWith("http://") || href.startsWith("https://")) {
                if (href.startsWith(_crawler._host)) {
                    _crawler.queue(href, referrer);
                }
                return;
            }

            if (href.startsWith("/")) {
                // presumably _host ends with a slash, hence the leading one is dropped - verify
                _crawler.queue(_crawler._host + href.substring(1), referrer);
                return;
            }

            if (href.startsWith("javascript:") || href.startsWith("ftp:") || href.startsWith("mailto:")) {
                return;
            }

            // Relative link: resolve leading "./" and "../" segments against
            // the directory part of the current URL.
            String relativeTo = _urlString.substring(0, _urlString.lastIndexOf("/"));
            while (true) {
                if (href.startsWith("../")) {
                    href = href.substring(3);
                    relativeTo = relativeTo.substring(0, relativeTo.lastIndexOf("/"));
                } else if (href.startsWith("./")) {
                    href = href.substring(2);
                } else {
                    break;
                }
            }

            _crawler.queue(relativeTo + "/" + href, referrer);
        }

        @Override
        public InputSource resolveEntity(final String publicId, String systemId)
                throws IOException, SAXException {
            if ("http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd".equals(systemId)) {
                // Prefer the copy on the class path; keep the original id if it is missing.
                final java.net.URL bundledDtd = getClass().getClassLoader()
                        .getResource("xhtml1-transitional.dtd");
                if (bundledDtd != null) {
                    systemId = bundledDtd.toString();
                }
            }

            return _crawler.getDtdMemoryCache().resolveEntity(publicId, systemId);
        }
    });
}

From source file:crawler.configuration.rabbitmq.RabbitMQConfiguration.java

/**
 * Handles a queue message whose body is a URL: fetches the URL, detects its
 * media type, parses the response and saves the resulting page.
 *
 * <p>For a status other than 200 only a warning is logged. I/O, SAX and Tika
 * failures are printed and not rethrown; the HTTP response is closed in every
 * case.
 *
 * @param message message whose body holds the URL to process
 */
public void onMessage(Message message) {
    String url = new String(message.getBody());
    log.info("parse(" + url + ")..");

    CloseableHttpResponse closeableHttpResponse = null;
    try {
        closeableHttpResponse = socksSocketService.connect(url);

        if (closeableHttpResponse.getStatusLine().getStatusCode() == 200) {

            Header contentType = closeableHttpResponse.getEntity().getContentType();

            InputStream inputStream = closeableHttpResponse.getEntity().getContent();

            TikaInputStream tikaInputStream = TikaInputStream.get(inputStream);
            Metadata metadata = new Metadata();
            metadata.add(Metadata.RESOURCE_NAME_KEY, url);
            URL u = new URL(metadata.get(Metadata.RESOURCE_NAME_KEY));
            // The server may omit the Content-Type header; detection then
            // relies on the resource name and the stream content.
            if (contentType != null) {
                metadata.add(Metadata.CONTENT_TYPE, contentType.getValue());
            }

            Detector detector = tikaConfig.getDetector();
            MediaType mediaType = detector.detect(tikaInputStream, metadata);

            List<Page> pages = mongoPageRepository.findByUrl(u);
            // String text = bodyContentHandler.toString();
            Page page = null;

            if (pages.size() == 0) {
                // manually inserted page
                page = new Page();
                page.setInsertDate(new Date());
                page.setUrl(u);
                log.info(" .. manual(" + url + ")");
            } else {
                if (pages.size() > 1) {
                    log.warn("url(" + url + ") multiple times in db!");
                }
                // from html link extractor
                page = pages.get(0);
                page.setUpdateDate(new Date());
            }

            ParseContext parseContext = new ParseContext();
            LinkContentHandler linkContentHandler = new LinkContentHandler();
            BodyContentHandler bodyContentHandler = new BodyContentHandler(MAX_BODY_LENGTH);
            ToHTMLContentHandler toHTMLContentHandler = new ToHTMLContentHandler();
            HtmlParser htmlParser = new HtmlParser();
            DefaultHandler defaultHandler = new DefaultHandler();

            TeeContentHandler teeContentHandler = new TeeContentHandler(linkContentHandler, bodyContentHandler,
                    toHTMLContentHandler);

            AutoDetectParser autoDetectParser = new AutoDetectParser();
            autoDetectParser.parse(tikaInputStream, teeContentHandler, metadata);

            log.info("autodetect: defaultHandler(" + gson.toJson(defaultHandler) + "), metadata("
                    + gson.toJson(metadata) + "), mediaType("
                    + gson.toJson(autoDetectParser.getMediaTypeRegistry()) + "), parseContext("
                    + gson.toJson(parseContext) + "): content(" + gson.toJson(teeContentHandler) + ")");

            // NOTE(review): tikaInputStream was already handed to
            // autoDetectParser.parse above - confirm it still has content
            // when it is parsed again in the branches below.
            if ("application".equals(mediaType.getType())) {
                if ("xhtml+xml".equals(mediaType.getSubtype())) {
                    htmlParser.parse(tikaInputStream, teeContentHandler, metadata, parseContext);
                    String html = toHTMLContentHandler.toString();
                    page.setContent(html.toString().getBytes());
                } else {
                    log.warn("this contentType(" + mediaType.toString() + ") not supported!");
                }
            } else if ("text".equals(mediaType.getType())) {
                if ("html".equals(mediaType.getSubtype())) {
                    htmlParser.parse(tikaInputStream, teeContentHandler, metadata, parseContext);
                    String html = toHTMLContentHandler.toString();
                    page.setContent(html.toString().getBytes());
                } else {
                    log.warn("this contentType(" + mediaType.toString() + ") not supported!");
                }
            }

            // NOTE(review): this replaces any content set in the branches
            // above with whatever is left in the stream - confirm intended.
            page.setContent(IOUtils.toByteArray(tikaInputStream));
            page.setMetadata(metadata);
            mongoPageRepository.save(page);

        } else {
            log.warn("url(" + url + "): statuscode(" + closeableHttpResponse.getStatusLine().getStatusCode()
                    + ")");
        }
        EntityUtils.consume(closeableHttpResponse.getEntity());

    } catch (IOException e) {
        e.printStackTrace();
    } catch (SAXException e) {
        e.printStackTrace();
    } catch (TikaException e) {
        e.printStackTrace();
    } finally {
        if (closeableHttpResponse != null) {
            try {
                closeableHttpResponse.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

From source file:mj.ocraptor.extraction.tika.parser.odf.OpenDocumentParser.java

/**
 * Reads an OpenDocument package from the given stream entry by entry and
 * dispatches each entry by name: image entries are handed to the image helper
 * (when image OCR is enabled in the configuration), {@code mimetype} sets the
 * content type, {@code meta.xml} goes to the metadata parser, and entries
 * ending in {@code content.xml} or {@code styles.xml} go to the content
 * parser.
 *
 * <p>Any exception raised while walking the archive is logged and not
 * rethrown. The real end-of-document event is sent only after all entries
 * have been handled.
 */
public void parse(InputStream stream, ContentHandler baseHandler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {

    // TODO: reuse the already opened ZIPFile, if present, e.g.:
    //
    //   ZipFile zipFile;
    //   if (stream instanceof TikaInputStream) {
    //       TikaInputStream tis = (TikaInputStream) stream;
    //       Object container = ((TikaInputStream) stream).getOpenContainer();
    //       if (container instanceof ZipFile) { zipFile = (ZipFile) container; }
    //       else if (tis.hasFile()) { zipFile = new ZipFile(tis.getFile()); }
    //   }

    // TODO: if incoming IS is a TIS with a file associated, we should open
    // ZipFile so we can visit metadata, mimetype first; today we lose all the
    // metadata if meta.xml is hit after content.xml in the stream. Then we can
    // still read-once for the content.xml.

    final XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata);

    // Metadata and content may arrive in either order, so the endDocument
    // call is held back until every entry has been seen.
    final EndDocumentShieldingContentHandler handler = new EndDocumentShieldingContentHandler(xhtml);

    final TikaImageHelper helper = new TikaImageHelper(metadata);
    try {
        // Walk the archive one entry at a time.
        final ZipInputStream zip = new ZipInputStream(stream);
        for (ZipEntry entry = zip.getNextEntry(); entry != null; entry = zip.getNextEntry()) {
            final String entryName = entry.getName();

            // TODO: images
            String entryExtension = null;
            try {
                entryExtension = FilenameUtils.getExtension(new File(entryName).getName());
            } catch (Exception e) {
                e.printStackTrace();
            }

            final boolean ocrImage = entryExtension != null
                    && FileType.isValidImageFileExtension(entryExtension)
                    && Config.inst().getProp(ConfigBool.ENABLE_IMAGE_OCR);

            if (ocrImage) {
                File imageFile = null;
                try {
                    imageFile = TikaImageHelper.saveZipEntryToTemp(zip, entry);
                    helper.addImage(imageFile);
                } catch (Exception e) {
                    e.printStackTrace();
                } finally {
                    if (imageFile != null) {
                        imageFile.delete();
                    }
                }
            } else if (entryName.equals("mimetype")) {
                metadata.set(Metadata.CONTENT_TYPE, IOUtils.toString(zip, "UTF-8"));
            } else if (entryName.equals("meta.xml")) {
                meta.parse(zip, new DefaultHandler(), metadata, context);
            } else if (entryName.endsWith("content.xml") || entryName.endsWith("styles.xml")) {
                // Both kinds of entry are handled by the content parser.
                if (content instanceof OpenDocumentContentParser) {
                    ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
                } else {
                    // Foreign content parser was set:
                    content.parse(zip, handler, metadata, context);
                }
            }
        }
        helper.addTextToHandler(xhtml);
    } catch (Exception e) {
        LOG.info("Extract error", e);
    } finally {
        helper.close();
    }

    // Only now call the end document
    if (handler.getEndDocumentWasCalled()) {
        handler.reallyEndDocument();
    }
}