Example usage for org.xml.sax InputSource setEncoding

List of usage examples for org.xml.sax InputSource setEncoding

Introduction

In this page you can find the example usage for org.xml.sax InputSource setEncoding.

Prototype

public void setEncoding(String encoding) 

Source Link

Document

Set the character encoding, if known.

Usage

From source file:org.apache.marmotta.ucuenca.wk.provider.gs.GoogleScholarProvider.java

@Override
public List<String> parseResponse(String resource, String requestUrl, Model triples, InputStream input,
        String contentType) throws DataRetrievalException {
    // SLF4J uses "{}" placeholders; the original "{0}" (MessageFormat style)
    // would have been printed literally instead of substituting the URL.
    log.debug("Request Successful to {}", requestUrl);
    final GSXMLHandler gsXMLHandler = new GSXMLHandler();
    gsXMLHandler.clearGSresultList();
    try {
        // TagSoup tolerates the non-well-formed HTML returned by Google Scholar.
        XMLReader xr = XMLReaderFactory.createXMLReader("org.ccil.cowan.tagsoup.Parser");
        xr.setContentHandler(gsXMLHandler);
        InputSource gsxml = new InputSource(input);
        gsxml.setEncoding("iso-8859-1");
        xr.parse(gsxml);

        // Convert the scraped results to a JSON array, then map them onto the
        // caller-supplied triples model via the mapping schema.
        final Set<GSresult> gsresultlist = gsXMLHandler.getGSresultList();
        Gson gson = new Gson();
        JsonArray json = new JsonArray();
        for (GSresult d : gsresultlist) {
            json.add(gson.toJsonTree(d).getAsJsonObject());
        }
        JSONtoRDF parser = new JSONtoRDF(resource, MAPPINGSCHEMA, json, triples);
        try {
            parser.parse();
        } catch (Exception e) {
            throw new DataRetrievalException("I/O exception while retrieving resource: " + requestUrl, e);
        }

        // Results are paged 10 per page. Round up instead of unconditionally
        // adding one, so counts that are exact multiples of 10 no longer fetch
        // a trailing empty page (the old "(numResults / 10) + 1" did).
        int numPages = (gsXMLHandler.getNumResults() + 9) / 10;
        int pagesLoaded = 1;
        Model model = null;
        while (pagesLoaded < numPages) {
            String pagenumquery = Integer.toString(pagesLoaded * 10);
            String moreDataUrl = String.format(API, pagenumquery, stringSearch, authorSearch, advancedSearch);
            // NOTE(review): a fresh LDClient per page is wasteful and is never
            // shut down — confirm whether LDClient needs an explicit shutdown().
            ClientConfiguration conf = new ClientConfiguration();
            LDClient ldClient = new LDClient(conf);
            ClientResponse response = ldClient.retrieveResource(moreDataUrl);
            Model pageModel = response.getData();
            if (model == null) {
                model = pageModel;
            } else {
                model.addAll(pageModel);
            }
            pagesLoaded++;
        }
        // With a single page of results the loop above never runs and 'model'
        // stays null; the unconditional addAll(null) used to throw an NPE here.
        if (model != null) {
            triples.addAll(model);
        }

    } catch (SAXException | IOException e) {
        throw new DataRetrievalException("I/O exception while retrieving resource: " + requestUrl, e);
    }
    return Collections.emptyList();
}

From source file:org.apache.myfaces.config.impl.FacesConfigEntityResolver.java

/**
 * Resolves faces-config entities to local resources instead of hitting the
 * network: the two known DTD system ids map to bundled copies, "jar:" ids are
 * opened through a {@link JarURLConnection}, and anything else is looked up on
 * the classpath or via the external context.
 *
 * @param publicId public identifier of the entity, propagated to the result
 * @param systemId system identifier used to locate the entity
 * @return an ISO-8859-1 {@link InputSource}, or {@code null} if not found
 * @throws IOException if the JAR connection or entry stream cannot be opened
 */
public InputSource resolveEntity(String publicId, String systemId) throws IOException {
    InputStream in;
    if (systemId.equals(FACES_CONFIG_1_0_DTD_SYSTEM_ID)) {
        in = ClassUtils.getResourceAsStream(FACES_CONFIG_1_0_DTD_RESOURCE);
    } else if (systemId.equals(FACES_CONFIG_1_1_DTD_SYSTEM_ID)) {
        in = ClassUtils.getResourceAsStream(FACES_CONFIG_1_1_DTD_RESOURCE);
    } else if (systemId.startsWith("jar:")) {
        // Open the JAR entry named by the system id directly.
        JarURLConnection conn = (JarURLConnection) new URL(systemId).openConnection();
        JarEntry jarEntry = conn.getJarEntry();
        if (jarEntry == null) {
            // NOTE(review): getInputStream(null) below will still throw an NPE
            // right after this log line — behavior preserved from the original.
            log.fatal("JAR entry '" + systemId + "' not found.");
        }
        in = conn.getJarFile().getInputStream(jarEntry);
    } else if (_externalContext == null) {
        in = ClassUtils.getResourceAsStream(systemId);
    } else {
        if (systemId.startsWith("file:")) {
            // Strip the "file://" prefix; the reported system id is rewritten too.
            systemId = systemId.substring(7);
        }
        in = _externalContext.getResourceAsStream(systemId);
    }

    if (in == null) {
        return null;
    }

    InputSource source = new InputSource(in);
    source.setPublicId(publicId);
    source.setSystemId(systemId);
    source.setEncoding("ISO-8859-1");
    return source;
}

From source file:org.apache.nutch.parse.feed.FeedParser.java

/**
 * Parses the given feed and extracts out and parsers all linked items within
 * the feed, using the underlying ROME feed parsing library.
 * /*  www  . j  a  v  a  2 s  . co m*/
 * @param content
 *          A {@link Content} object representing the feed that is being
 *          parsed by this {@link Parser}.
 * 
 * @return A {@link ParseResult} containing all {@link Parse}d feeds that
 *         were present in the feed file that this {@link Parser} dealt with.
 * 
 */
public ParseResult getParse(Content content) {
    SyndFeed feed = null;
    ParseResult parseResult = new ParseResult(content.getUrl());

    EncodingDetector detector = new EncodingDetector(conf);
    detector.autoDetectClues(content, true);
    String encoding = detector.guessEncoding(content, defaultEncoding);
    try {
        InputSource input = new InputSource(new ByteArrayInputStream(content.getContent()));
        input.setEncoding(encoding);
        SyndFeedInput feedInput = new SyndFeedInput();
        feed = feedInput.build(input);
    } catch (Exception e) {
        // return empty parse
        LOG.warn(
                "Parse failed: url: " + content.getUrl() + ", exception: " + StringUtils.stringifyException(e));
        return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }

    List entries = feed.getEntries();
    String feedLink = feed.getLink();
    try {
        feedLink = normalizers.normalize(feedLink, URLNormalizers.SCOPE_OUTLINK);
        if (feedLink != null)
            feedLink = filters.filter(feedLink);
    } catch (Exception e) {
        feedLink = null;
    }

    for (Iterator i = entries.iterator(); i.hasNext();) {
        SyndEntry entry = (SyndEntry) i.next();
        addToMap(parseResult, feed, feedLink, entry, content);
    }

    String feedDesc = stripTags(feed.getDescriptionEx());
    String feedTitle = stripTags(feed.getTitleEx());

    parseResult.put(content.getUrl(), new ParseText(feedDesc), new ParseData(
            new ParseStatus(ParseStatus.SUCCESS), feedTitle, new Outlink[0], content.getMetadata()));

    return parseResult;
}

From source file:org.apache.nutch.parse.html.HtmlParser.java

/**
 * Parses an HTML page into a {@link ParseResult}: extracted text, title,
 * outlinks and metadata, honoring the page's meta directives
 * (noindex / nofollow / nocache / refresh).
 *
 * @param content the fetched page (raw bytes plus protocol metadata)
 * @return the filtered {@link ParseResult}; an empty parse result when the
 *         base URL is malformed or the DOM parse fails
 */
public ParseResult getParse(Content content) {
    HTMLMetaTags metaTags = new HTMLMetaTags();

    URL base;
    try {
        base = new URL(content.getBaseUrl());
    } catch (MalformedURLException e) {
        return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }

    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];
    Metadata metadata = new Metadata();

    // Parse the raw bytes into a DOM fragment, guessing the charset first.
    DocumentFragment root;
    try {
        byte[] contentInOctets = content.getContent();
        InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets));

        // Combine protocol clues with a content sniff to pick the encoding.
        EncodingDetector detector = new EncodingDetector(conf);
        detector.autoDetectClues(content, true);
        detector.addClue(sniffCharacterEncoding(contentInOctets), "sniffed");
        String encoding = detector.guessEncoding(content, defaultCharEncoding);

        // Record both the original and the conversion encoding for downstream use.
        metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding);
        metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, encoding);

        input.setEncoding(encoding);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Parsing...");
        }
        root = parse(input);
    } catch (IOException e) {
        return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    } catch (DOMException e) {
        return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    } catch (SAXException e) {
        return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    } catch (Exception e) {
        e.printStackTrace(LogUtil.getWarnStream(LOG));
        return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }

    // Collect the page's meta directives before deciding what to extract.
    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
    if (LOG.isTraceEnabled()) {
        LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
    }
    // Text and title are only extracted when indexing is allowed.
    if (!metaTags.getNoIndex()) { // okay to index
        StringBuffer sb = new StringBuffer();
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting text...");
        }
        utils.getText(sb, root); // extract text
        text = sb.toString();
        sb.setLength(0);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting title...");
        }
        utils.getTitle(sb, root); // extract title
        title = sb.toString().trim();
    }

    if (!metaTags.getNoFollow()) { // okay to follow links
        ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
        // A <base> tag inside the document overrides the protocol-level base URL.
        URL baseTag = utils.getBase(root);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting links...");
        }
        utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
        outlinks = l.toArray(new Outlink[l.size()]);
        if (LOG.isTraceEnabled()) {
            LOG.trace("found " + outlinks.length + " outlinks in " + content.getUrl());
        }
    }

    // A meta-refresh turns the successful parse into a redirect status.
    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
    if (metaTags.getRefresh()) {
        status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
        status.setArgs(new String[] { metaTags.getRefreshHref().toString(),
                Integer.toString(metaTags.getRefreshTime()) });
    }
    ParseData parseData = new ParseData(status, title, outlinks, content.getMetadata(), metadata);
    ParseResult parseResult = ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));

    // Run the configured HTML parse filters; tag results no-cache if required.
    ParseResult filteredParse = this.htmlParseFilters.filter(content, parseResult, metaTags, root);
    if (metaTags.getNoCache()) { // not okay to cache
        for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse)
            entry.getValue().getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
    }
    return filteredParse;
}

From source file:org.apache.nutch.store.readable.StoreReadable.java

/**
 * Parses a fetched {@link WebPage} into a {@link Parse}: extracted text,
 * title, outlinks and a "readable" JSON rendering stored on the page,
 * honoring the page's meta directives (noindex / nofollow / nocache / refresh).
 *
 * @param url  the page URL, used for logging and filter callbacks
 * @param page the fetched page, whose metadata is updated in place
 * @return the filtered {@link Parse}; an empty parse on any failure
 */
public Parse getParse(String url, WebPage page) {
    HTMLMetaTags metaTags = new HTMLMetaTags();
    // Use the logger instead of System.out so debug output respects the
    // logging configuration (the original printed straight to stdout).
    LOG.debug("[STORE-READABLE]getParse-------------------------------------------------------------");
    String baseUrl = TableUtil.toString(page.getBaseUrl());
    URL base;
    try {
        base = new URL(baseUrl);
    } catch (MalformedURLException e) {
        return ParseStatusUtils.getEmptyParse(e, getConf());
    }

    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];

    // Parse the raw bytes into a DOM fragment, guessing the charset first.
    DocumentFragment root;
    try {
        ByteBuffer contentInOctets = page.getContent();
        InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets.array(),
                contentInOctets.arrayOffset() + contentInOctets.position(), contentInOctets.remaining()));

        // Combine protocol clues with a content sniff to pick the encoding.
        EncodingDetector detector = new EncodingDetector(conf);
        detector.autoDetectClues(page, true);
        detector.addClue(sniffCharacterEncoding(contentInOctets), "sniffed");
        String encoding = detector.guessEncoding(page, defaultCharEncoding);

        // Record both the original and the conversion encoding on the page.
        page.getMetadata().put(new Utf8(Metadata.ORIGINAL_CHAR_ENCODING),
                ByteBuffer.wrap(Bytes.toBytes(encoding)));
        page.getMetadata().put(new Utf8(Metadata.CHAR_ENCODING_FOR_CONVERSION),
                ByteBuffer.wrap(Bytes.toBytes(encoding)));

        input.setEncoding(encoding);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Parsing...");
        }
        root = parse(input);
    } catch (IOException e) {
        LOG.error("Failed with the following IOException: ", e);
        return ParseStatusUtils.getEmptyParse(e, getConf());
    } catch (DOMException e) {
        LOG.error("Failed with the following DOMException: ", e);
        return ParseStatusUtils.getEmptyParse(e, getConf());
    } catch (SAXException e) {
        LOG.error("Failed with the following SAXException: ", e);
        return ParseStatusUtils.getEmptyParse(e, getConf());
    } catch (Exception e) {
        LOG.error("Failed with the following Exception: ", e);
        return ParseStatusUtils.getEmptyParse(e, getConf());
    }

    // Collect the page's meta directives before deciding what to extract.
    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
    if (LOG.isTraceEnabled()) {
        LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
    }
    // Text and title are only extracted when indexing is allowed.
    if (!metaTags.getNoIndex()) { // okay to index
        StringBuilder sb = new StringBuilder();
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting text...");
        }
        utils.getText(sb, root); // extract text
        text = sb.toString();
        sb.setLength(0);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting title...");
        }
        utils.getTitle(sb, root); // extract title
        title = sb.toString().trim();
    }

    if (!metaTags.getNoFollow()) { // okay to follow links
        ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
        // A <base> tag inside the document overrides the protocol-level base URL.
        URL baseTag = utils.getBase(root);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting links...");
        }
        utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
        outlinks = l.toArray(new Outlink[l.size()]);
        if (LOG.isTraceEnabled()) {
            LOG.trace("found " + outlinks.length + " outlinks in " + url);
        }
    }

    // A meta-refresh turns the successful parse into a redirect status.
    ParseStatus status = ParseStatus.newBuilder().build();
    status.setMajorCode((int) ParseStatusCodes.SUCCESS);
    if (metaTags.getRefresh()) {
        status.setMinorCode((int) ParseStatusCodes.SUCCESS_REDIRECT);
        status.getArgs().add(new Utf8(metaTags.getRefreshHref().toString()));
        status.getArgs().add(new Utf8(Integer.toString(metaTags.getRefreshTime())));
    }

    // Build the "readable" JSON view of the page and persist it on the row.
    String strJo = addJsonToPage(url, page);
    page.setReadable(new Utf8(strJo));

    Parse parse = new Parse(text, title, outlinks, status, strJo);
    parse = htmlParseFilters.filter(url, page, parse, metaTags, root);

    if (metaTags.getNoCache()) { // not okay to cache
        page.getMetadata().put(new Utf8(Nutch.CACHING_FORBIDDEN_KEY),
                ByteBuffer.wrap(Bytes.toBytes(cachingPolicy)));
    }
    parse.setJsonRead(strJo);

    return parse;
}

From source file:org.apache.nutchbase.parse.html.HtmlParserHbase.java

/**
 * Parses an HBase-backed page row into a {@link ParseHbase}: extracted text,
 * title and outlinks, honoring the page's meta directives
 * (noindex / nofollow / nocache / refresh).
 *
 * @param url the page URL, used for logging and filter callbacks
 * @param row the fetched row holding the raw content and metadata
 * @return the filtered {@link ParseHbase}; an empty parse on any failure
 */
public ParseHbase getParse(String url, RowPart row) {
    HTMLMetaTags metaTags = new HTMLMetaTags();

    String baseUrl = row.getBaseUrl();
    URL base;
    try {
        base = new URL(baseUrl);
    } catch (MalformedURLException e) {
        return new ParseStatus(e).getEmptyParseHbase(getConf());
    }

    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];
    Metadata metadata = new Metadata();

    // Parse the raw bytes into a DOM fragment, guessing the charset first.
    DocumentFragment root;
    try {
        byte[] contentInOctets = row.getContent();
        InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets));

        // Combine protocol clues with a content sniff to pick the encoding.
        EncodingDetector detector = new EncodingDetector(conf);
        detector.autoDetectClues(row, true);
        detector.addClue(sniffCharacterEncoding(contentInOctets), "sniffed");
        String encoding = detector.guessEncoding(row, defaultCharEncoding);

        // Record both the original and the conversion encoding for downstream use.
        metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding);
        metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, encoding);

        input.setEncoding(encoding);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Parsing...");
        }
        root = parse(input);
    } catch (IOException e) {
        return new ParseStatus(e).getEmptyParseHbase(getConf());
    } catch (DOMException e) {
        return new ParseStatus(e).getEmptyParseHbase(getConf());
    } catch (SAXException e) {
        return new ParseStatus(e).getEmptyParseHbase(getConf());
    } catch (Exception e) {
        e.printStackTrace(LogUtil.getWarnStream(LOG));
        return new ParseStatus(e).getEmptyParseHbase(getConf());
    }

    // Collect the page's meta directives before deciding what to extract.
    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
    if (LOG.isTraceEnabled()) {
        LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
    }
    // Text and title are only extracted when indexing is allowed.
    if (!metaTags.getNoIndex()) { // okay to index
        StringBuffer sb = new StringBuffer();
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting text...");
        }
        utils.getText(sb, root); // extract text
        text = sb.toString();
        sb.setLength(0);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting title...");
        }
        utils.getTitle(sb, root); // extract title
        title = sb.toString().trim();
    }

    // Note: unlike the plain HtmlParser, nofollow can be overridden by config.
    if (!metaTags.getNoFollow() || ignoreNoFollow) { // okay to follow links
        ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
        // A <base> tag inside the document overrides the protocol-level base URL.
        URL baseTag = utils.getBase(root);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting links...");
        }
        utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
        outlinks = l.toArray(new Outlink[l.size()]);
        if (LOG.isTraceEnabled()) {
            LOG.trace("found " + outlinks.length + " outlinks in " + url);
        }
    }

    // A meta-refresh turns the successful parse into a redirect status.
    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
    if (metaTags.getRefresh()) {
        status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
        status.setArgs(new String[] { metaTags.getRefreshHref().toString(),
                Integer.toString(metaTags.getRefreshTime()) });
    }

    ParseHbase parse = new ParseHbase(text, title, outlinks, status);
    parse = htmlParseFilters.filter(url, row, parse, metaTags, root);

    if (metaTags.getNoCache()) { // not okay to cache
        row.putMeta(Nutch.CACHING_FORBIDDEN_KEY, Bytes.toBytes(cachingPolicy));
    }

    return parse;
}

From source file:org.apache.solr.analysis.HyphenationCompoundWordTokenFilterFactory.java

/**
 * Loads the optional compound-word dictionary and the hyphenation grammar
 * from the given resource loader, initializing {@code dictionary} and
 * {@code hyphenator}.
 *
 * @param loader resource loader used to open the configured files
 */
public void inform(ResourceLoader loader) {
    InputStream hyphenationStream = null;
    try {
        // The dictionary is optional — no dictFile means an empty dictionary.
        if (dictFile != null) {
            dictionary = getWordSet(loader, dictFile, false);
        }
        // TODO: Broken, because we cannot resolve real system id
        // ResourceLoader should also supply method like ClassLoader to get resource URL
        hyphenationStream = loader.openResource(hypFile);
        final InputSource hyphenationSource = new InputSource(hyphenationStream);
        // A null encoding lets the XML parser auto-detect it.
        hyphenationSource.setEncoding(encoding);
        hyphenationSource.setSystemId(hypFile);
        hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(hyphenationSource);
    } catch (Exception e) { // TODO: getHyphenationTree really shouldn't throw "Exception"
        throw new RuntimeException(e);
    } finally {
        IOUtils.closeQuietly(hyphenationStream);
    }
}

From source file:org.apache.solr.handler.loader.XMLLoader.java

/**
 * Loads an XML update request into Solr. With a "tr" request parameter the
 * stream is first run through the named XSL transform into a DOM and then fed
 * to StAX; otherwise the stream is parsed by StAX directly.
 *
 * @param req       the Solr request carrying params (including optional TR)
 * @param rsp       the Solr response (unused here, passed for the interface)
 * @param stream    the content stream holding the XML body
 * @param processor the update processor chain receiving parsed commands
 * @throws Exception on transform or parse failures (wrapped as BAD_REQUEST)
 */
@Override
public void load(SolrQueryRequest req, SolrQueryResponse rsp, ContentStream stream,
        UpdateRequestProcessor processor) throws Exception {
    // Charset from the Content-Type header is only a hint; may be null.
    final String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());

    InputStream is = null;
    XMLStreamReader parser = null;

    String tr = req.getParams().get(CommonParams.TR, null);
    if (tr != null) {
        final Transformer t = getTransformer(tr, req);
        final DOMResult result = new DOMResult();

        // first step: read XML and build DOM using Transformer (this is no overhead, as XSL always produces
        // an internal result DOM tree, we just access it directly as input for StAX):
        try {
            is = stream.getStream();
            final InputSource isrc = new InputSource(is);
            isrc.setEncoding(charset);
            final XMLReader xmlr = saxFactory.newSAXParser().getXMLReader();
            xmlr.setErrorHandler(xmllog);
            // Refuse to resolve external entities (XXE hardening).
            xmlr.setEntityResolver(EmptyEntityResolver.SAX_INSTANCE);
            final SAXSource source = new SAXSource(xmlr, isrc);
            t.transform(source, result);
        } catch (TransformerException te) {
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, te.getMessage(), te);
        } finally {
            IOUtils.closeQuietly(is);
        }
        // second step: feed the intermediate DOM tree into StAX parser:
        try {
            parser = inputFactory.createXMLStreamReader(new DOMSource(result.getNode()));
            this.processUpdate(req, processor, parser);
        } catch (XMLStreamException e) {
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e.getMessage(), e);
        } finally {
            if (parser != null)
                parser.close();
        }
    }
    // Normal XML Loader
    else {
        try {
            is = stream.getStream();
            if (log.isTraceEnabled()) {
                // Buffer the body so it can be both logged and re-parsed.
                final byte[] body = IOUtils.toByteArray(is);
                // TODO: The charset may be wrong, as the real charset is later
                // determined by the XML parser, the content-type is only used as a hint!
                log.trace("body",
                        new String(body, (charset == null) ? ContentStreamBase.DEFAULT_CHARSET : charset));
                IOUtils.closeQuietly(is);
                is = new ByteArrayInputStream(body);
            }
            parser = (charset == null) ? inputFactory.createXMLStreamReader(is)
                    : inputFactory.createXMLStreamReader(is, charset);
            this.processUpdate(req, processor, parser);
        } catch (XMLStreamException e) {
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e.getMessage(), e);
        } finally {
            if (parser != null)
                parser.close();
            IOUtils.closeQuietly(is);
        }
    }
}

From source file:org.apache.solr.handler.XsltXMLLoader.java

/**
 * Loads an XSLT-transformed XML update request: the stream is transformed
 * into a DOM via the request's configured stylesheet, and the resulting tree
 * is then streamed through StAX into the update processor.
 *
 * @param req    the Solr request carrying the transform configuration
 * @param rsp    the Solr response (unused here, passed for the interface)
 * @param stream the content stream holding the XML body
 * @throws Exception on transform or parse failures (wrapped as BAD_REQUEST)
 */
@Override
public void load(SolrQueryRequest req, SolrQueryResponse rsp, ContentStream stream) throws Exception {
    final DOMResult domResult = new DOMResult();
    final Transformer transformer = getTransformer(req);

    // Step 1: run the XSL transform, which materializes a DOM tree anyway —
    // we simply grab that tree directly as the StAX input.
    InputStream in = null;
    try {
        in = stream.getStream();
        // Charset from the Content-Type header is only a hint; may be null,
        // in which case the XML parser auto-detects it.
        final String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
        final InputSource inputSource = new InputSource(in);
        inputSource.setEncoding(charset);
        transformer.transform(new SAXSource(inputSource), domResult);
    } catch (TransformerException te) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, te.getMessage(), te);
    } finally {
        IOUtils.closeQuietly(in);
    }

    // Step 2: feed the intermediate DOM tree into the StAX parser.
    XMLStreamReader staxReader = null;
    try {
        staxReader = inputFactory.createXMLStreamReader(new DOMSource(domResult.getNode()));
        this.processUpdate(processor, staxReader);
    } catch (XMLStreamException e) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e.getMessage(), e);
    } finally {
        if (staxReader != null) {
            staxReader.close();
        }
    }
}

From source file:org.codelibs.fess.transformer.FessXpathTransformer.java

/**
 * Parses the crawled response body into a DOM, evaluates the configured XPath
 * field rules against it, and serializes the resulting field map into
 * {@code resultData}. The body is spooled to a temp file so it can be read
 * once for parsing and once as the stored response body.
 *
 * @param responseData the crawl response whose body is parsed
 * @param resultData   receives the serialized field map and encoding
 */
@Override
protected void storeData(final ResponseData responseData, final ResultData resultData) {
    final File tempFile = ResponseDataUtil.createResponseBodyFile(responseData);
    try {
        final DOMParser parser = getDomParser();
        BufferedInputStream bis = null;
        try {
            bis = new BufferedInputStream(new FileInputStream(tempFile));
            // Skip a UTF-8 BOM if present; otherwise rewind to the start.
            final byte[] bomBytes = new byte[UTF8_BOM_SIZE];
            bis.mark(UTF8_BOM_SIZE);
            // Check the read count explicitly (the original ignored it): a
            // short read can never be a complete BOM, so we rewind.
            final int read = bis.read(bomBytes);
            if (read != UTF8_BOM_SIZE || !isUtf8BomBytes(bomBytes)) {
                bis.reset();
            }
            final InputSource is = new InputSource(bis);
            if (responseData.getCharSet() != null) {
                is.setEncoding(responseData.getCharSet());
            }
            parser.parse(is);
        } catch (final Exception e) {
            throw new RobotCrawlAccessException("Could not parse " + responseData.getUrl(), e);
        } finally {
            IOUtils.closeQuietly(bis);
        }

        final Document document = parser.getDocument();

        // Evaluate each configured field rule (name -> XPath) against the DOM.
        final Map<String, Object> dataMap = new HashMap<String, Object>();
        for (final Map.Entry<String, String> entry : fieldRuleMap.entrySet()) {
            final String path = entry.getValue();
            try {
                final XObject xObj = getXPathAPI().eval(document, path);
                final int type = xObj.getType();
                switch (type) {
                case XObject.CLASS_BOOLEAN:
                    final boolean b = xObj.bool();
                    putResultDataBody(dataMap, entry.getKey(), Boolean.toString(b));
                    break;
                case XObject.CLASS_NUMBER:
                    final double d = xObj.num();
                    putResultDataBody(dataMap, entry.getKey(), Double.toString(d));
                    break;
                case XObject.CLASS_STRING:
                    final String str = xObj.str();
                    putResultDataBody(dataMap, entry.getKey(), str);
                    break;
                case XObject.CLASS_NULL:
                case XObject.CLASS_UNKNOWN:
                case XObject.CLASS_NODESET:
                case XObject.CLASS_RTREEFRAG:
                case XObject.CLASS_UNRESOLVEDVARIABLE:
                default:
                    // Node-like results: take the text content of the first match.
                    final Node value = getXPathAPI().selectSingleNode(document, entry.getValue());
                    putResultDataBody(dataMap, entry.getKey(), value != null ? value.getTextContent() : null);
                    break;
                }
            } catch (final TransformerException e) {
                // Include the exception so the failing XPath is diagnosable
                // (the original dropped the stack trace).
                logger.warn("Could not parse a value of " + entry.getKey() + ":" + entry.getValue(), e);
            }
        }

        // Re-open the body for storage; on failure still collect the
        // additional data so partial results are preserved.
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(tempFile);
            responseData.setResponseBody(fis);
            putAdditionalData(dataMap, responseData, document);
        } catch (final FileNotFoundException e) {
            logger.warn(tempFile + " does not exist.", e);
            putAdditionalData(dataMap, responseData, document);
        } finally {
            IOUtils.closeQuietly(fis);
        }

        try {
            resultData.setData(SerializeUtil.fromObjectToBinary(dataMap));
        } catch (final Exception e) {
            throw new RobotCrawlAccessException("Could not serialize object: " + responseData.getUrl(), e);
        }
        resultData.setEncoding(charsetName);
    } finally {
        if (!tempFile.delete()) {
            logger.warn("Could not delete a temp file: " + tempFile);
        }
    }
}