Example usage of the org.xml.sax.helpers.DefaultHandler constructor DefaultHandler()

List of usage examples for the org.xml.sax.helpers.DefaultHandler constructor DefaultHandler()

Introduction

On this page you can find example usages of the org.xml.sax.helpers.DefaultHandler constructor DefaultHandler().

Prototype

public DefaultHandler()

Source Link

Usage

From source file:com.shin1ogawa.appengine.marketplace.gdata.LicensingAPI.java

/**
 * Parses a license feed into one map per {@code entity} element.
 *
 * <p>Each map is keyed by the qualified name of an element found inside the
 * {@code entity} and holds that element's text content. Elements without any
 * text are not recorded.
 *
 * @param is the XML feed to read
 * @return the list of parsed entities, in document order (empty if the feed
 *         contains no {@code entity} element)
 * @throws SAXException if the feed is not well-formed XML
 * @throws IOException if the stream cannot be read
 * @throws ParserConfigurationException if no SAX parser can be created
 */
static List<Map<String, String>> parseLicenseFeed(InputStream is)
        throws SAXException, IOException, ParserConfigurationException {
    SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
    final List<Map<String, String>> list = Lists.newArrayList();
    parser.parse(is, new DefaultHandler() {

        /** True while the parser is inside an {@code entity} element. */
        boolean entity = false;

        /** Name of the element whose text is being collected, or null between elements. */
        String currentElement;

        Map<String, String> map;

        /** Text collected so far for {@code currentElement}. */
        final StringBuilder text = new StringBuilder();

        @Override
        public void characters(char[] ch, int start, int length) {
            // SAX may deliver one text node in several chunks, so accumulate
            // instead of overwriting the value on every callback.
            if (entity && currentElement != null) {
                text.append(ch, start, length);
            }
        }

        @Override
        public void startElement(String uri, String localName, String qName, Attributes attributes) {
            if (!entity && StringUtils.equals(qName, "entity")) {
                entity = true;
                map = Maps.newHashMap();
                list.add(map);
                return;
            }
            currentElement = qName;
            text.setLength(0);
        }

        @Override
        public void endElement(String uri, String localName, String qName) {
            if (entity && StringUtils.equals(qName, "entity")) {
                entity = false;
                currentElement = null;
                return;
            }
            if (entity && currentElement != null && text.length() > 0) {
                map.put(currentElement, text.toString());
            }
            // Forget the element so that whitespace following the closing tag
            // is not attributed to it.
            currentElement = null;
            text.setLength(0);
        }
    });
    return list;
}

From source file:com.zegoggles.smssync.XOAuthConsumer.java

/**
 * Requests a single entry of the signed Google contacts feed and returns the
 * text found inside its {@code email} elements.
 *
 * @return the collected e-mail text (empty if the feed has no {@code email}
 *         element), or {@code null} if signing, the HTTP request or the XML
 *         parsing failed
 */
protected String getUsernameFromContacts() {

    final HttpClient httpClient = new DefaultHttpClient();
    final String url = "https://www.google.com/m8/feeds/contacts/default/thin?max-results=1";
    final StringBuilder email = new StringBuilder();

    try {
        HttpGet get = new HttpGet(sign(url));
        HttpResponse resp = httpClient.execute(get);
        SAXParserFactory spf = SAXParserFactory.newInstance();
        SAXParser sp = spf.newSAXParser();
        XMLReader xr = sp.getXMLReader();
        xr.setContentHandler(new DefaultHandler() {
            boolean inEmail;

            @Override
            public void startElement(String uri, String localName, String qName, Attributes atts) {
                inEmail = "email".equals(localName);
            }

            @Override
            public void endElement(String uri, String localName, String qName) {
                // Stop collecting once any element closes; otherwise text that
                // follows </email> would be appended until the next start tag.
                inEmail = false;
            }

            @Override
            public void characters(char[] c, int start, int length) {
                if (inEmail) {
                    email.append(c, start, length);
                }
            }
        });
        xr.parse(new InputSource(resp.getEntity().getContent()));
        return email.toString();

    } catch (oauth.signpost.exception.OAuthException e) {
        Log.e(TAG, "error", e);
        return null;
    } catch (org.xml.sax.SAXException e) {
        Log.e(TAG, "error", e);
        return null;
    } catch (java.io.IOException e) {
        Log.e(TAG, "error", e);
        return null;
    } catch (javax.xml.parsers.ParserConfigurationException e) {
        Log.e(TAG, "error", e);
        return null;
    }
}

From source file:com.nidhinova.tika.server.TikaService.java

/**
 * Serves HTTP GET Returns metadata formatted as json or plain text content
 * of the file. File should be locally accessible for Tika Server using
 * pathkey JNDI/*w w  w  . j  a  v a2s . co  m*/
 * 
 * @param filename
 * @param pathkey
 *            (JNDI lookup key)
 * @param opkey
 *            (can be "text" or "metadata" or "fulldata")
 * @param httpHeaders
 * @return
 * @throws Exception
 */
@GET
@Produces({ MediaType.APPLICATION_JSON })
@Path("/{opkey}/{pathkey}/{resourceid: .*}")
public StreamingOutput getMetadata(@javax.ws.rs.core.Context javax.ws.rs.core.UriInfo uriInfo,
        @PathParam("opkey") final String opkey, @PathParam("pathkey") final String pathkey,
        @PathParam("resourceid") final String resourceId, @Context HttpHeaders httpHeaders) throws Exception {

    // get the resource segment, this may have query params
    // we are ok with it as long as we can get something at that location
    String[] segments = uriInfo.getRequestUri().toASCIIString().split("/" + opkey + "/" + pathkey + "/");
    final String filename = segments[segments.length - 1];
    logger.info("resource :" + segments[segments.length - 1]);

    final Detector detector = createDetector(httpHeaders);
    final AutoDetectParser parser = new AutoDetectParser(detector);
    final ParseContext context = new ParseContext();
    context.set(Parser.class, parser);
    final org.apache.tika.metadata.Metadata metadata = new org.apache.tika.metadata.Metadata();
    setMetadataFromHeader(parser, metadata, httpHeaders);

    URL url = null;
    try {
        if (pathkey != null && resourceId != null) {
            String filepath = getFilePath(pathkey) + filename;
            File file = new File(filepath);
            if (file.isFile()) {
                url = file.toURI().toURL();
            } else {
                url = new URL(filepath);
            }
        }
    } catch (MalformedURLException mex) {
        throw new WebApplicationException(Response.Status.NOT_FOUND);
    }

    final InputStream is = TikaInputStream.get(url, metadata);

    return new StreamingOutput() {
        public void write(OutputStream outputStream) throws IOException, WebApplicationException {

            StringWriter textBuffer = new StringWriter();
            ContentHandler handler = null;
            if (opkey.equalsIgnoreCase("metadata")) {
                handler = new DefaultHandler();
            } else if (opkey.equalsIgnoreCase("text") || opkey.equalsIgnoreCase("fulldata")) {
                handler = new BodyContentHandler(textBuffer);
            }
            try {

                parser.parse(is, handler, metadata, context);

                String contentEncoding = (metadata
                        .get(org.apache.tika.metadata.HttpHeaders.CONTENT_ENCODING) == null ? "UTF-8"
                                : metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_ENCODING));

                logger.info("Content encoding: "
                        + metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_ENCODING));

                Writer outWriter = getOutputWriter(outputStream, contentEncoding);

                //metadata is always gathered
                // munch tika metadata object it to make json
                String jsonMetadata = JSONHelper.metadataToJson(metadata);

                if (opkey.equalsIgnoreCase("metadata")) {
                    outWriter.write("{\"metadata\":" + jsonMetadata + "}");
                } else if (opkey.equalsIgnoreCase("text")) {
                    // write it out
                    outWriter.write("{ \"text\":" + JSONHelper.toJSON(textBuffer.toString()) + " }");
                } else if (opkey.equalsIgnoreCase("fulldata")) {
                    StringBuilder data = new StringBuilder();
                    data.append("{ \"metadata\":" + jsonMetadata).append(", ")
                            .append("\"text\":" + JSONHelper.toJSON(textBuffer.toString()) + " }");
                    outWriter.write(data.toString());
                }
                outWriter.flush();
            } catch (SAXException e) {
                throw new WebApplicationException(Response.Status.INTERNAL_SERVER_ERROR);
            } catch (TikaException e) {
                if (e.getCause() != null && e.getCause() instanceof WebApplicationException) {
                    throw (WebApplicationException) e.getCause();
                }

                if (e.getCause() != null && e.getCause() instanceof IllegalStateException) {
                    throw new WebApplicationException(Response.status(422).build());
                }

                if (e.getCause() != null && e.getCause() instanceof EncryptedDocumentException) {
                    throw new WebApplicationException(Response.status(422).build());
                }

                if (e.getCause() != null && e.getCause() instanceof OldWordFileFormatException) {
                    throw new WebApplicationException(Response.status(422).build());
                }

                logger.warn("Text extraction failed", e);

                throw new WebApplicationException(Response.Status.INTERNAL_SERVER_ERROR);
            }
        }
    };

}

From source file:TransformThread.java

/** Initialize the results (m_outResult) according
 * to RESULT_FLAVOR/*from ww w.  j  ava2  s.c  o  m*/
 */
private void initResult() {
    try {
        for (int i = 0; i < NUM_TRANSFORMATIONS; i++) {
            switch (RESULT_FLAVOR) {
            case STREAM:
                OutputStream outStream = new FileOutputStream(
                        FILE_OUT_BASE + "thread_" + m_thrdNum + "_transformation_" + i + FILE_OUT_EXT);

                m_outResult[i] = new StreamResult(outStream);
                break;

            case SAX:
                DefaultHandler defaultHandler = new DefaultHandler();
                m_outResult[i] = new SAXResult(defaultHandler);
                break;

            case DOM:
                m_outResult[i] = new DOMResult();
                break;
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
        System.exit(1);
    }
}

From source file:com.vmware.photon.controller.model.adapters.vsphere.ovf.OvfRetriever.java

/**
 * Reads the OVF descriptor at the given URI, checks that it is well-formed
 * XML and returns the stream wrapper that was fed to the parser.
 *
 * @param ovfUri location of the descriptor; the {@code file} scheme is read
 *               from disk, anything else is fetched through the HTTP client
 * @return the {@code StoringInputStream} that wrapped the descriptor while it was parsed
 * @throws IOException if the descriptor cannot be read, the server does not
 *                     answer with status 200, or the content is not valid XML
 */
private StoringInputStream toStream(URI ovfUri) throws IOException {
    SAXParser saxParser = newSaxParser();
    DefaultHandler handler = new DefaultHandler();

    InputStream is;
    HttpResponse response = null;
    HttpGet request = null;

    // constant-first comparison: a URI without a scheme must not cause a NullPointerException
    if ("file".equals(ovfUri.getScheme())) {
        is = new FileInputStream(new File(ovfUri));
    } else {
        request = new HttpGet(ovfUri);
        response = this.client.execute(request);

        if (response.getStatusLine().getStatusCode() != 200) {
            // release the connection before giving up on this response
            EntityUtils.consumeQuietly(response.getEntity());
            throw new IOException(
                    "Ovf descriptor not found at " + ovfUri + ". Error code " + response.getStatusLine());
        }

        is = response.getEntity().getContent();
    }

    StoringInputStream storingInputStream = new StoringInputStream(is);

    try {
        saxParser.parse(storingInputStream, handler);
        if (response != null) {
            EntityUtils.consumeQuietly(response.getEntity());
        }
    } catch (SAXException e) {
        // not a valid ovf - abort
        if (request != null) {
            request.abort();
        }
        // response is null for file URIs; without this guard a NullPointerException
        // would replace the IOException below
        if (response != null) {
            EntityUtils.consumeQuietly(response.getEntity());
        }

        throw new IOException("Ovf not a valid xml: " + e.getMessage(), e);
    } finally {
        //close stream, could be file
        IOUtils.closeQuietly(is);
    }

    return storingInputStream;
}

From source file:MockFedoraIT.java

/**
 * Reads the file at {@code testPath} through the given module and runs it
 * through a SAX parser with a no-op handler, i.e. checks that it parses as XML.
 *
 * @param module   file system module used to open the file
 * @param testPath path of the file to read
 * @throws LowlevelStorageException if the module cannot read the file
 * @throws RuntimeException if the parser cannot be created or parsing fails
 */
private void parseIrodsFile(IrodsIFileSystem module, String testPath) throws LowlevelStorageException {
    InputStream is = module.read(new File(testPath));
    try {
        // factory defaults are used: validation and namespace awareness are not enabled here
        SAXParserFactory spf = SAXParserFactory.newInstance();
        SAXParser parser = spf.newSAXParser();
        parser.parse(is, new DefaultHandler());
    } catch (Exception e) {
        throw new RuntimeException("Error with SAX parser", e);
    } finally {
        // the stream was opened here, so it has to be released here as well
        if (is != null) {
            try {
                is.close();
            } catch (java.io.IOException ignored) {
                // best effort: a failed close must not hide the parse result
            }
        }
    }
}

From source file:at.molindo.webtools.crawler.CrawlerTask.java

/**
 * Parses a fetched page and queues every link found in an {@code a} element
 * that points to the crawled host.
 *
 * <p>If tidying is enabled the page is first converted to XHTML. Fragment
 * parts ({@code #...}) are stripped from links, links consisting only of a
 * fragment are ignored, and relative links are resolved against the URL of
 * the current page.
 *
 * @param string the page content
 * @throws SAXException if the content cannot be parsed
 * @throws IOException if reading the content or a DTD fails
 */
protected void parseResult(final String string) throws SAXException, IOException {

    InputSource inputSource;
    if (_tidy) {
        final Tidy tidy = new Tidy();
        tidy.setXHTML(true);
        tidy.setErrfile("/dev/null");
        final ByteArrayInputStream in = new ByteArrayInputStream(string.getBytes());
        final ByteArrayOutputStream out = new ByteArrayOutputStream();

        tidy.parse(in, out);

        inputSource = new InputSource(new ByteArrayInputStream(out.toByteArray()));
    } else {
        inputSource = new InputSource(new StringReader(string));
    }

    ((CrawlerThread) Thread.currentThread()).getParser().parse(inputSource, new DefaultHandler() {

        @Override
        public void startElement(final String uri, final String localName, final String name,
                final Attributes attributes) throws SAXException {

            if (!"a".equals(name)) {
                return;
            }
            String href = attributes.getValue("href");
            if (href == null) {
                return;
            }

            final int anchorIndex = href.lastIndexOf("#");
            if (anchorIndex == 0) {
                // anchor on same page: ignore
                return;
            }
            if (anchorIndex > 0) {
                href = href.substring(0, anchorIndex);
            }

            final CrawlerReferrer referrer = new CrawlerReferrer(_urlString, href);

            // Absolute links: follow them only when they stay on the crawled host.
            // https links are absolute as well and must not be resolved as relative paths.
            if (href.startsWith("http://") || href.startsWith("https://")) {
                if (href.startsWith(_crawler._host)) {
                    _crawler.queue(href, referrer);
                }
                return;
            }

            if (href.startsWith("/")) {
                // host-relative link; the leading slash is dropped before appending to the host
                _crawler.queue(_crawler._host + href.substring(1), referrer);
                return;
            }

            if (href.startsWith("javascript:") || href.startsWith("ftp:") || href.startsWith("mailto:")) {
                return;
            }

            // Page-relative link: resolve leading "../" and "./" segments by hand.
            String relativeTo = _urlString.substring(0, _urlString.lastIndexOf("/"));
            while (true) {
                if (href.startsWith("../")) {
                    href = href.substring(3);
                    final int parentEnd = relativeTo.lastIndexOf("/");
                    // more "../" than path segments: stay at the top instead of failing
                    if (parentEnd >= 0) {
                        relativeTo = relativeTo.substring(0, parentEnd);
                    }
                } else if (href.startsWith("./")) {
                    href = href.substring(2);
                } else {
                    break;
                }
            }

            _crawler.queue(relativeTo + "/" + href, referrer);
        }

        @Override
        public InputSource resolveEntity(final String publicId, String systemId)
                throws IOException, SAXException {
            // serve the XHTML transitional DTD from the classpath instead of the network
            if ("http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd".equals(systemId)) {
                systemId = getClass().getClassLoader().getResource("xhtml1-transitional.dtd").toString();
            }

            return _crawler.getDtdMemoryCache().resolveEntity(publicId, systemId);
        }
    });
}

From source file:crawler.configuration.rabbitmq.RabbitMQConfiguration.java

/**
 * Handles a crawl message whose body is the URL to fetch.
 *
 * <p>The URL is requested through the socks socket service. For a 200
 * response the media type is detected, the matching page is looked up in the
 * repository (or created if it does not exist yet), the content is parsed
 * and the page is saved together with the collected metadata. Other status
 * codes are only logged. I/O, SAX and Tika failures are printed and not
 * propagated.
 *
 * @param message the message carrying the URL as its body
 */
public void onMessage(Message message) {
    String url = new String(message.getBody());
    log.info("parse(" + url + ")..");

    CloseableHttpResponse closeableHttpResponse = null;
    try {
        closeableHttpResponse = socksSocketService.connect(url);

        if (closeableHttpResponse.getStatusLine().getStatusCode() == 200) {

            Header contentType = closeableHttpResponse.getEntity().getContentType();

            InputStream inputStream = closeableHttpResponse.getEntity().getContent();

            TikaInputStream tikaInputStream = TikaInputStream.get(inputStream);
            Metadata metadata = new Metadata();
            metadata.add(Metadata.RESOURCE_NAME_KEY, url);
            URL u = new URL(metadata.get(Metadata.RESOURCE_NAME_KEY));
            // the entity has no Content-Type header when the server did not send one
            if (contentType != null) {
                metadata.add(Metadata.CONTENT_TYPE, contentType.getValue());
            }

            Detector detector = tikaConfig.getDetector();
            MediaType mediaType = detector.detect(tikaInputStream, metadata);

            List<Page> pages = mongoPageRepository.findByUrl(u);
            Page page = null;

            if (pages.size() == 0) {
                // manually inserted page
                page = new Page();
                page.setInsertDate(new Date());
                page.setUrl(u);
                log.info(" .. manual(" + url + ")");
            } else {
                if (pages.size() > 1) {
                    log.warn("url(" + url + ") multiple times in db!");
                }
                // from html link extractor
                page = pages.get(0);
                page.setUpdateDate(new Date());
            }

            ParseContext parseContext = new ParseContext();
            LinkContentHandler linkContentHandler = new LinkContentHandler();
            BodyContentHandler bodyContentHandler = new BodyContentHandler(MAX_BODY_LENGTH);
            ToHTMLContentHandler toHTMLContentHandler = new ToHTMLContentHandler();
            HtmlParser htmlParser = new HtmlParser();
            DefaultHandler defaultHandler = new DefaultHandler();

            TeeContentHandler teeContentHandler = new TeeContentHandler(linkContentHandler, bodyContentHandler,
                    toHTMLContentHandler);

            AutoDetectParser autoDetectParser = new AutoDetectParser();
            autoDetectParser.parse(tikaInputStream, teeContentHandler, metadata);

            log.info("autodetect: defaultHandler(" + gson.toJson(defaultHandler) + "), metadata("
                    + gson.toJson(metadata) + "), mediaType("
                    + gson.toJson(autoDetectParser.getMediaTypeRegistry()) + "), parseContext("
                    + gson.toJson(parseContext) + "): content(" + gson.toJson(teeContentHandler) + ")");

            // NOTE(review): the same stream was already handed to autoDetectParser above and is
            // parsed again here - presumably it is exhausted by then; confirm this is intended.
            if ("application".equals(mediaType.getType())) {
                if ("xhtml+xml".equals(mediaType.getSubtype())) {
                    htmlParser.parse(tikaInputStream, teeContentHandler, metadata, parseContext);
                    String html = toHTMLContentHandler.toString();
                    page.setContent(html.toString().getBytes());
                } else {
                    log.warn("this contentType(" + mediaType.toString() + ") not supported!");
                }
            } else if ("text".equals(mediaType.getType())) {
                if ("html".equals(mediaType.getSubtype())) {
                    htmlParser.parse(tikaInputStream, teeContentHandler, metadata, parseContext);
                    String html = toHTMLContentHandler.toString();
                    page.setContent(html.toString().getBytes());
                } else {
                    log.warn("this contentType(" + mediaType.toString() + ") not supported!");
                }
            }

            // NOTE(review): this replaces any content set in the branches above with whatever
            // is still readable from the stream - verify which of the two is wanted.
            page.setContent(IOUtils.toByteArray(tikaInputStream));
            page.setMetadata(metadata);
            mongoPageRepository.save(page);

        } else {
            log.warn("url(" + url + "): statuscode(" + closeableHttpResponse.getStatusLine().getStatusCode()
                    + ")");
        }
        EntityUtils.consume(closeableHttpResponse.getEntity());

    } catch (IOException e) {
        e.printStackTrace();
    } catch (SAXException e) {
        e.printStackTrace();
    } catch (TikaException e) {
        e.printStackTrace();
    } finally {
        if (closeableHttpResponse != null) {
            try {
                closeableHttpResponse.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

From source file:mj.ocraptor.extraction.tika.parser.odf.OpenDocumentParser.java

/**
 * Walks the entries of an OpenDocument zip stream and feeds them to the
 * matching sub-parser: image entries go to the image helper when image OCR is
 * enabled, {@code mimetype} sets the content type, {@code meta.xml} goes to
 * the metadata parser, and {@code content.xml} / {@code styles.xml} go to the
 * content parser. Errors during extraction are logged and not rethrown.
 *
 * @param stream      the document as a zip stream
 * @param baseHandler receiver of the generated XHTML events
 * @param metadata    metadata object to fill
 * @param context     parse context passed on to the sub-parsers
 */
public void parse(InputStream stream, ContentHandler baseHandler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {

    // TODO: reuse the already opened ZIPFile, if present
    // (check whether the stream is a TikaInputStream with an open ZipFile
    // container, or with a backing file a ZipFile can be opened on).

    // TODO: if incoming IS is a TIS with a file associated, we should open
    // ZipFile so we can visit metadata, mimetype first; today we lose all
    // the metadata if meta.xml is hit after content.xml in the stream.
    // Then we can still read-once for the content.xml.

    final XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata);

    // Metadata and content can arrive in either order, so the endDocument
    // call is held back until all entries have been processed.
    final EndDocumentShieldingContentHandler handler = new EndDocumentShieldingContentHandler(xhtml);

    final TikaImageHelper helper = new TikaImageHelper(metadata);
    try {
        // Process the entries in the order they appear in the archive
        final ZipInputStream zip = new ZipInputStream(stream);
        for (ZipEntry entry = zip.getNextEntry(); entry != null; entry = zip.getNextEntry()) {
            // TODO: images
            String entryExtension = null;
            try {
                entryExtension = FilenameUtils.getExtension(new File(entry.getName()).getName());
            } catch (Exception e) {
                e.printStackTrace();
            }

            final boolean ocrCandidate = entryExtension != null
                    && FileType.isValidImageFileExtension(entryExtension)
                    && Config.inst().getProp(ConfigBool.ENABLE_IMAGE_OCR);

            if (ocrCandidate) {
                File imageFile = null;
                try {
                    imageFile = TikaImageHelper.saveZipEntryToTemp(zip, entry);
                    helper.addImage(imageFile);
                } catch (Exception e) {
                    e.printStackTrace();
                } finally {
                    // the temporary copy is only needed while the helper reads it
                    if (imageFile != null) {
                        imageFile.delete();
                    }
                }
            } else if (entry.getName().equals("mimetype")) {
                metadata.set(Metadata.CONTENT_TYPE, IOUtils.toString(zip, "UTF-8"));
            } else if (entry.getName().equals("meta.xml")) {
                meta.parse(zip, new DefaultHandler(), metadata, context);
            } else if (entry.getName().endsWith("content.xml") || entry.getName().endsWith("styles.xml")) {
                // content and styles are handled by the same parser
                if (content instanceof OpenDocumentContentParser) {
                    ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
                } else {
                    // Foreign content parser was set:
                    content.parse(zip, handler, metadata, context);
                }
            }
        }
        helper.addTextToHandler(xhtml);
    } catch (Exception e) {
        LOG.info("Extract error", e);
    } finally {
        helper.close();
    }

    // Only now call the end document
    if (handler.getEndDocumentWasCalled()) {
        handler.reallyEndDocument();
    }
}