Example usage for org.xml.sax ContentHandler toString

List of usage examples for org.xml.sax ContentHandler toString

Introduction

This page collects example usages of org.xml.sax ContentHandler toString, drawn from open-source projects.

Prototype

public String toString() 

Document

Returns a string representation of the object.
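
For context, every example below passes the handler to an Apache Tika parser: the parser fires SAX events into a BodyContentHandler, and calling toString() on the handler afterwards returns the plain text accumulated from those events. A minimal, self-contained sketch of that pattern (the class name ToStringExample is illustrative, not taken from the examples below):

import java.io.ByteArrayInputStream;
import java.io.InputStream;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;

public class ToStringExample {
    public static void main(String[] args) throws Exception {
        String html = "<html><body><p>Hello, world.</p></body></html>";
        InputStream in = new ByteArrayInputStream(html.getBytes("UTF-8"));

        // BodyContentHandler buffers the character events it receives;
        // toString() returns the buffered text
        ContentHandler handler = new BodyContentHandler();
        new HtmlParser().parse(in, handler, new Metadata(), new ParseContext());

        System.out.println(handler.toString()); // prints the extracted text: Hello, world.
    }
}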

Usage

From source file:fr.paris.lutece.plugins.document.modules.solr.indexer.SolrDocIndexer.java

/**
 * Builds a document which will be used by Solr during the indexing of the
 * pages of the site, with the following fields: summary, uid, url,
 * contents, title and description.
 *
 * @param document the document to index
 * @param strUrl the url of the document
 * @param strRole the Lutece role of the page associated with the document
 * @param strPortletDocumentId the document id concatenated with the portlet
 * id, separated by '&'
 * @return the built Document
 * @throws IOException The IO Exception
 * @throws InterruptedException The InterruptedException
 */
private SolrItem getDocument(Document document, String strUrl, String strRole, String strPortletDocumentId)
        throws IOException, InterruptedException {
    // make a new, empty document
    SolrItem item = new SolrItem();

    // Add the url as a field named "url".  Use an UnIndexed field, so
    // that the url is just stored with the document, but is not searchable.
    item.setUrl(strUrl);

    // Add the PortletDocumentId as a field named "document_portlet_id".  
    item.setDocPortletId(strPortletDocumentId);

    // Add the last modified date of the file as a field named "modified".
    // Use a field that is indexed (i.e. searchable), but don't tokenize
    // the field into words.
    item.setDate(document.getDateModification());

    // Add the uid as a field, so that index can be incrementally maintained.
    // This field is not stored with the document, it is indexed, but it is not
    // tokenized prior to indexing.
    String strIdDocument = String.valueOf(document.getId());
    item.setUid(getResourceUid(strIdDocument, DocumentIndexerUtils.CONSTANT_TYPE_RESOURCE));

    String strContentToIndex = getContentToIndex(document, item);
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();

    try {
        new org.apache.tika.parser.html.HtmlParser().parse(
                new ByteArrayInputStream(strContentToIndex.getBytes()), handler, metadata, new ParseContext());
    } catch (SAXException | TikaException e) {
        throw new AppException("Error during document parsing.");
    }

    // Add the tag-stripped contents as a Reader-valued Text field so it will
    // get tokenized and indexed.
    item.setContent(handler.toString());

    // Add the title as a separate Text field, so that it can be searched
    // separately.
    item.setTitle(document.getTitle());

    item.setType(document.getType());

    item.setRole(strRole);

    item.setSite(SolrIndexerService.getWebAppName());

    // return the document
    return item;
}

From source file:fr.paris.lutece.plugins.document.modules.solr.indexer.SolrDocIndexer.java

/**
 * Get item
 * @param portlet The portlet
 * @param document The document
 * @return The item
 * @throws IOException if an I/O error occurs
 */
private SolrItem getItem(Portlet portlet, Document document) throws IOException {
    // the item
    SolrItem item = new SolrItem();
    item.setUid(getResourceUid(Integer.valueOf(document.getId()).toString(),
            DocumentIndexerUtils.CONSTANT_TYPE_RESOURCE));
    item.setDate(document.getDateModification());
    item.setType(document.getType());
    item.setSummary(document.getSummary());
    item.setTitle(document.getTitle());
    item.setSite(SolrIndexerService.getWebAppName());
    item.setRole("none");

    if (portlet != null) {
        item.setDocPortletId(document.getId() + SolrConstants.CONSTANT_AND + portlet.getId());
    }

    item.setXmlContent(document.getXmlValidatedContent());

    // Build the url of the document
    UrlItem url = new UrlItem(SolrIndexerService.getBaseUrl());
    url.addParameter(PARAMETER_DOCUMENT_ID, document.getId());
    if (portlet != null) {
        // portlet may be null here (see the check above), so only add its id when present
        url.addParameter(PARAMETER_PORTLET_ID, portlet.getId());
    }
    item.setUrl(url.getUrl());

    // Date Hierarchy
    GregorianCalendar calendar = new GregorianCalendar();
    calendar.setTime(document.getDateModification());
    item.setHieDate(calendar.get(GregorianCalendar.YEAR) + "/" + (calendar.get(GregorianCalendar.MONTH) + 1)
            + "/" + calendar.get(GregorianCalendar.DAY_OF_MONTH) + "/");

    List<String> categorie = new ArrayList<String>();

    for (Category cat : document.getCategories()) {
        categorie.add(cat.getName());
    }

    item.setCategorie(categorie);

    // The content
    String strContentToIndex = getContentToIndex(document, item);
    ContentHandler handler = null;
    if (PARAMETER_DOCUMENT_MAX_CHARS != null) {
        handler = new BodyContentHandler(PARAMETER_DOCUMENT_MAX_CHARS);
    } else {
        handler = new BodyContentHandler();
    }

    Metadata metadata = new Metadata();

    try {
        new HtmlParser().parse(new ByteArrayInputStream(strContentToIndex.getBytes()), handler, metadata,
                new ParseContext());
    } catch (SAXException | TikaException e) {
        throw new AppException("Error during document parsing.");
    }

    item.setContent(handler.toString());

    return item;
}
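
Note the constructor choice above: BodyContentHandler(int) stops collecting text once the given number of characters has been written (Tika signals the overflow as a SAXException, which the code above maps to an AppException), the no-argument constructor applies a default limit of 100,000 characters, and several of the later examples pass -1 to disable the limit entirely.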

From source file:it.polito.tellmefirst.web.rest.clients.ClientEpub.java

private HashMap<String, String> parseEpub(File file) throws IOException, TMFVisibleException {

    LOG.debug("[parseEpub] - BEGIN");

    ZipFile fi = new ZipFile(file);

    for (Enumeration e = fi.entries(); e.hasMoreElements();) {
        ZipEntry entry = (ZipEntry) e.nextElement();
        if (entry.getName().endsWith("ncx")) {
            InputStream tocMaybeDirty = fi.getInputStream(entry);
            Scanner scanner = new Scanner(tocMaybeDirty, "UTF-8").useDelimiter("\\A");
            String theString = scanner.hasNext() ? scanner.next() : "";
            tocMaybeDirty.close();
            scanner.close();

            String res = theString.replaceAll(">[\\s]*?<", "><");

            InputStream toc = new ByteArrayInputStream(res.getBytes(StandardCharsets.UTF_8));

            try {
                DocumentBuilder dBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
                Document doc = dBuilder.parse(toc);
                toc.close();

                if (doc.hasChildNodes()) {
                    findNavMap(doc.getChildNodes());
                }
            } catch (Exception ex) {
                LOG.error("Unable to navigate the TOC");
            }

            removeEmptyTOC(epub);

            //search anchors in links and split
            Set set = epub.entrySet();
            Iterator i = set.iterator();
            while (i.hasNext()) {
                Map.Entry me = (Map.Entry) i.next();
                if (me.getValue().toString().contains("#")) {
                    String[] parts = me.getValue().toString().split("#");
                    String anchor = parts[1];
                    epub.put(me.getKey().toString(), anchor);
                }
            }
        }
        if (entry.getName().endsWith("opf")) { //manage files because order is important
            InputStream content = fi.getInputStream(entry);

            Scanner scanner = new Scanner(content, "UTF-8").useDelimiter("\\A");
            String contentString = scanner.hasNext() ? scanner.next() : "";
            content.close();
            scanner.close();

            String filenameRegex = "href=\"(.*.htm(|l))\".*media-type=\"application/xhtml";
            Pattern pattern = Pattern.compile(filenameRegex);
            Matcher matcher = pattern.matcher(contentString);

            Integer count = 0;
            while (matcher.find()) {
                files.put(count, matcher.group(1));
                count++;
            }
        }
        if (entry.getName().endsWith("html") || entry.getName().endsWith("htm")
                || entry.getName().endsWith("xhtml")) {
            InputStream htmlFile = fi.getInputStream(entry);

            Scanner scanner = new Scanner(htmlFile, "UTF-8").useDelimiter("\\A");
            String htmlString = scanner.hasNext() ? scanner.next() : "";

            String noHead = htmlString.replaceAll("^[^_]*?<body>", ""); //remove head
            String noTail = noHead.replaceAll("</body>.*$", ""); //remove tail
            String htmlCleaned = noTail.replaceAll("<a.*?/>", ""); //remove self-closing anchor tags

            String[] bits = entry.getName().split("/");
            String fileName = bits[bits.length - 1];

            htmls.put(fileName, htmlCleaned);
        }
    }
    fi.close();
    for (int i = 0; i < files.size(); i++) {
        stringBuilder.append("<p id=\"" + files.get(i) + "\"></p>"); // also "anchor" the head of each file
        stringBuilder.append(htmls.get(files.get(i)));
    }
    String htmlAll = stringBuilder.toString();

    /* We have all the needed files; start splitting.
       For each link, make a chunk.
       Start from the bottom. */
    Metadata metadata = new Metadata();
    Parser parser = new HtmlParser();
    ListIterator<Map.Entry<String, String>> iter = new ArrayList<>(epub.entrySet()).listIterator(epub.size());

    while (iter.hasPrevious()) {
        Map.Entry<String, String> me = iter.previous();
        try {
            ContentHandler contenthandler = new BodyContentHandler(10 * htmlAll.length());
            Scanner sc = new Scanner(htmlAll);
            sc.useDelimiter("id=\"" + me.getValue().toString() + "\">");
            htmlAll = sc.next();
            InputStream stream = new ByteArrayInputStream(sc.next().getBytes(StandardCharsets.UTF_8));
            parser.parse(stream, contenthandler, metadata, new ParseContext());
            String chapterText = contenthandler.toString().toLowerCase().replaceAll("\\d+.*", "");
            String chapterTextWithoutNo = chapterText.replaceAll("\\d+.*", "");
            // Remove the Project Gutenberg meta information from the text
            String chapterTextCleaned = chapterTextWithoutNo.split("end of the project gutenberg ebook")[0];
            epub.put(me.getKey().toString(), chapterTextCleaned);

        } catch (Exception ex) {
            LOG.error("Unable to parse content for index: " + me.getKey() + ", this chapter will be deleted");
            removeChapter(epub, me.getKey().toString());
        }
    }

    /* Remove the Project Gutenberg license chapter from the Map: it is useless
       for the classification, and it triggers a Lucene exception when the document
       language is Italian (the license text is always in English).

       You can use this method to remove any chapter that is useless for classifying
       your Epub document. */
    removeChapter(epub, "A Word from Project Gutenberg");
    removeEmptyItems(epub);

    // If the Epub file has a bad structure, fall back to Tika's basic Epub extractor.
    if (epub.size() == 0) {
        LOG.info("The Epub file has a bad structure. Try to use the Tika extractor");
        epub.put("All text", autoParseAll(file));
    }

    removeEmptyItems(epub);

    if (epub.size() == 0) {
        LOG.error("Unable to extract text from this Epub");
        throw new TMFVisibleException("Unable to extract any text from this Epub.");
    }

    removeDownloadedFile(TEMPORARY_PATH);

    LOG.debug("[parseEpub] - END");

    return epub;
}
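
In this example the write limit is sized relative to the input (10 * htmlAll.length()), so toString() can never hit the cap while extracting a chapter; each chapter's plain text then replaces the raw link target in the epub map.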

From source file:com.digitalpebble.storm.crawler.bolt.ParserBolt.java

@Override
public void execute(Tuple tuple) {
    eventCounter.scope("tuple_in").incrBy(1);

    byte[] content = tuple.getBinaryByField("content");

    String url = tuple.getStringByField("url");
    Metadata metadata = (Metadata) tuple.getValueByField("metadata");

    long start = System.currentTimeMillis();

    // rely on mime-type provided by server or guess?

    ByteArrayInputStream bais = new ByteArrayInputStream(content);
    org.apache.tika.metadata.Metadata md = new org.apache.tika.metadata.Metadata();

    LinkContentHandler linkHandler = new LinkContentHandler();
    ContentHandler textHandler = new BodyContentHandler(-1);
    TeeContentHandler teeHandler = new TeeContentHandler(linkHandler, textHandler);
    ParseContext parseContext = new ParseContext();

    try {
        parseContext.set(HtmlMapper.class, (HtmlMapper) HTMLMapperClass.newInstance());
    } catch (Exception e) {
        LOG.error("Exception while specifying HTMLMapper {}", url, e);
    }

    // build a DOM if required by the parseFilters
    DocumentFragment root = null;
    if (parseFilters.needsDOM()) {
        HTMLDocumentImpl doc = new HTMLDocumentImpl();
        doc.setErrorChecking(false);
        root = doc.createDocumentFragment();
        DOMBuilder domhandler = new DOMBuilder(doc, root);
        domhandler.setUpperCaseElementNames(upperCaseElementNames);
        domhandler.setDefaultNamespaceURI(XHTMLContentHandler.XHTML);
        teeHandler = new TeeContentHandler(linkHandler, textHandler, domhandler);
    }

    // parse
    String text;
    try {
        tika.getParser().parse(bais, teeHandler, md, parseContext);
        text = textHandler.toString();
    } catch (Exception e) {
        String errorMessage = "Exception while parsing " + url + ": " + e;
        LOG.error(errorMessage);
        // send to status stream in case another component wants to update
        // its status
        metadata.setValue(Constants.STATUS_ERROR_SOURCE, "content parsing");
        metadata.setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage);
        collector.emit(StatusStreamName, tuple, new Values(url, metadata, Status.ERROR));
        collector.ack(tuple);
        // Increment metric that is context specific
        eventCounter.scope("error_content_parsing_" + e.getClass().getSimpleName()).incrBy(1);
        // Increment general metric
        eventCounter.scope("parse exception").incrBy(1);
        return;
    } finally {
        try {
            bais.close();
        } catch (IOException e) {
            LOG.error("Exception while closing stream", e);
        }
    }

    // add parse md to metadata
    for (String k : md.names()) {
        String[] values = md.getValues(k);
        metadata.setValues("parse." + k, values);
    }

    long duration = System.currentTimeMillis() - start;

    LOG.info("Parsed {} in {} msec", url, duration);

    // filter and convert the outlinks
    List<Outlink> outlinks = toOutlinks(url, linkHandler.getLinks(), metadata);

    // apply the parse filters if any
    try {
        parseFilters.filter(url, content, root, metadata, outlinks);
    } catch (RuntimeException e) {
        String errorMessage = "Exception while running parse filters on " + url + ": " + e;
        LOG.error(errorMessage);
        metadata.setValue(Constants.STATUS_ERROR_SOURCE, "content filtering");
        metadata.setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage);
        collector.emit(StatusStreamName, tuple, new Values(url, metadata, Status.ERROR));
        collector.ack(tuple);
        // Increment metric that is context specific
        eventCounter.scope("error_content_filtering_" + e.getClass().getSimpleName()).incrBy(1);
        // Increment general metric
        eventCounter.scope("parse exception").incrBy(1);
        return;
    }

    if (emitOutlinks) {
        for (Outlink outlink : outlinks) {
            collector.emit(StatusStreamName, tuple,
                    new Values(outlink.getTargetURL(), outlink.getMetadata(), Status.DISCOVERED));
        }
    }

    collector.emit(tuple, new Values(url, content, metadata, text.trim()));
    collector.ack(tuple);
    eventCounter.scope("tuple_success").incrBy(1);
}
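
Here the ContentHandler handed to Tika is a TeeContentHandler, which replays every SAX event to each handler it wraps: one parse pass feeds the LinkContentHandler (outlinks), the BodyContentHandler whose toString() supplies the text field of the emitted tuple, and, when the parse filters need a DOM, a DOMBuilder.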

From source file:com.thetdgroup.TextExtractionAdapter.java

private ContentInformation processFile(File fileName) throws IOException {
    ContentInformation extractedContent = new ContentInformation();
    ContentHandler contenthandler = new BodyContentHandler();
    Metadata metadata = new Metadata();

    InputStream inputStream = null;
    BOMInputStream bomInputStream = null;

    try {
        inputStream = new FileInputStream(fileName);
        bomInputStream = new BOMInputStream(inputStream, false);

        contenthandler = new BodyContentHandler();
        metadata.set(Metadata.RESOURCE_NAME_KEY, fileName.getName());

        Parser parser = new AutoDetectParser();
        parser.parse(bomInputStream, contenthandler, metadata);
    } catch (Exception exception) {
        extractedContent.hasException();
        extractedContent.setException(exception.toString());
    } finally {
        if (bomInputStream != null) {
            bomInputStream.close();
        }

        if (inputStream != null) {
            inputStream.close();
        }
    }

    //
    //
    extractedContent.setImportedFileName(fileName.getName());

    if (contenthandler != null) {
        String content = contenthandler.toString().replace("\n", " ");
        extractedContent.setContentData(content);
    }

    if (metadata != null) {
        // CREATIVE COMMONS
        extractedContent.setLicenseLocation(metadata.get(Metadata.LICENSE_LOCATION));
        extractedContent.setLicenceURL(metadata.get(Metadata.LICENSE_URL));
        extractedContent.setWorkType(metadata.get(Metadata.WORK_TYPE));

        // DUBLIN CORE
        extractedContent.setContributor(metadata.get(Metadata.CONTRIBUTOR));
        extractedContent.setCoverage(metadata.get(Metadata.COVERAGE));
        extractedContent.setCreator(metadata.get(Metadata.CREATOR));
        extractedContent.setDate(metadata.get(Metadata.DATE));
        extractedContent.setDescription(metadata.get(Metadata.DESCRIPTION));
        extractedContent.setFormat(metadata.get(Metadata.FORMAT));
        extractedContent.setIdentifier(metadata.get(Metadata.IDENTIFIER));
        extractedContent.setLanguage(metadata.get(Metadata.LANGUAGE));
        extractedContent.setModified(metadata.get(Metadata.MODIFIED));
        extractedContent.setPublisher(metadata.get(Metadata.PUBLISHER));
        extractedContent.setRelation(metadata.get(Metadata.RELATION));
        extractedContent.setRights(metadata.get(Metadata.RIGHTS));
        extractedContent.setDublinSource(metadata.get(org.apache.tika.metadata.DublinCore.SOURCE));
        extractedContent.setSubject(metadata.get(Metadata.SUBJECT));
        extractedContent.setTitle(metadata.get(Metadata.TITLE));
        extractedContent.setType(metadata.get(Metadata.TYPE));

        // GEOGRAPHIC
        //extractedContent.setAltitude(metadata.get(Metadata.ALTITUDE));
        //extractedContent.setLatitude(metadata.get(Metadata.LATITUDE));
        //extractedContent.setLongitude(metadata.get(Metadata.LONGITUDE));

        // HTTP HEADERS
        extractedContent.setContentDisposition(metadata.get(Metadata.CONTENT_DISPOSITION));
        extractedContent.setContentEncoding(metadata.get(Metadata.CONTENT_ENCODING));
        extractedContent.setContentLanguage(metadata.get(Metadata.CONTENT_LANGUAGE));
        extractedContent.setContentLength(metadata.get(Metadata.CONTENT_LENGTH));
        extractedContent.setContentLocation(metadata.get(Metadata.CONTENT_LOCATION));
        extractedContent.setContentMD5(metadata.get(Metadata.CONTENT_MD5));
        extractedContent.setContentType(metadata.get(Metadata.CONTENT_TYPE));
        extractedContent.setLastModifier(metadata.get(Metadata.LAST_MODIFIED));
        extractedContent.setLocation(metadata.get(Metadata.LOCATION));

        // MESSAGE (EMAIL)
        //extractedContent.setMessageBCC(metadata.get(Metadata.MESSAGE_BCC));
        //extractedContent.setMessageCC(metadata.get(Metadata.MESSAGE_CC));
        //extractedContent.setMessageFrom(metadata.get(Metadata.MESSAGE_FROM));
        //extractedContent.setMessageRecipientAddress(metadata.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
        //extractedContent.setMessageTo(metadata.get(Metadata.MESSAGE_TO));

        // MS OFFICE
        extractedContent.setApplicationName(metadata.get(Metadata.APPLICATION_NAME));
        extractedContent.setApplicationVersion(metadata.get(Metadata.APPLICATION_VERSION));
        extractedContent.setAuthor(metadata.get(Metadata.AUTHOR));
        extractedContent.setCategory(metadata.get(Metadata.CATEGORY));
        extractedContent.setCharacterCount(metadata.get(Metadata.CHARACTER_COUNT));
        extractedContent.setCharacterCountWithSpace(metadata.get(Metadata.CHARACTER_COUNT_WITH_SPACES));
        extractedContent.setComments(metadata.get(Metadata.COMMENTS));
        extractedContent.setCompany(metadata.get(Metadata.COMPANY));
        extractedContent.setContentStatus(metadata.get(Metadata.CONTENT_STATUS));
        extractedContent.setCreationDate(metadata.get(Metadata.CREATION_DATE));
        extractedContent.setEditTime(metadata.get(Metadata.EDIT_TIME));
        extractedContent.setKeywords(metadata.get(Metadata.KEYWORDS));
        extractedContent.setLastAuthor(metadata.get(Metadata.LAST_AUTHOR));
        extractedContent.setLastPrinted(metadata.get(Metadata.LAST_PRINTED));
        extractedContent.setLastSaved(metadata.get(Metadata.LAST_SAVED));
        extractedContent.setLineCount(metadata.get(Metadata.LINE_COUNT));
        extractedContent.setManager(metadata.get(Metadata.MANAGER));
        extractedContent.setNotes(metadata.get(Metadata.NOTES));
        extractedContent.setPageCount(metadata.get(Metadata.PAGE_COUNT));
        extractedContent.setParagraphCount(metadata.get(Metadata.PARAGRAPH_COUNT));
        extractedContent.setPresentationFormat(metadata.get(Metadata.PRESENTATION_FORMAT));
        extractedContent.setRevisionNumber(metadata.get(Metadata.REVISION_NUMBER));
        extractedContent.setSecurity(metadata.get(Metadata.SECURITY));
        extractedContent.setSlideCount(metadata.get(Metadata.SLIDE_COUNT));
        extractedContent.setTemplate(metadata.get(Metadata.TEMPLATE));
        extractedContent.setTotalTime(metadata.get(Metadata.TOTAL_TIME));
        extractedContent.setVersion(metadata.get(Metadata.VERSION));
        extractedContent.setWordCount(metadata.get(Metadata.WORD_COUNT));

        // CLIMATEFORCAST
        //extractedContent.setClimateForcastAcknowledgement(metadata.get(org.apache.tika.metadata.ClimateForcast.ACKNOWLEDGEMENT));     
        //extractedContent.setClimateForcastCommandLine(metadata.get(org.apache.tika.metadata.ClimateForcast.COMMAND_LINE));     
        //extractedContent.setClimateForcastComment(metadata.get(org.apache.tika.metadata.ClimateForcast.COMMENT));     
        //extractedContent.setClimateForcastContact(metadata.get(org.apache.tika.metadata.ClimateForcast.CONTACT));     
        //extractedContent.setClimateForcastConvention(metadata.get(org.apache.tika.metadata.ClimateForcast.CONVENTIONS));     
        //extractedContent.setClimateForcastExperimentID(metadata.get(org.apache.tika.metadata.ClimateForcast.EXPERIMENT_ID));     
        //extractedContent.setClimateForcastHistory(metadata.get(org.apache.tika.metadata.ClimateForcast.HISTORY));     
        //extractedContent.setClimateForcastInstitution(metadata.get(org.apache.tika.metadata.ClimateForcast.INSTITUTION));     
        //extractedContent.setClimateForcastModelName(metadata.get(org.apache.tika.metadata.ClimateForcast.MODEL_NAME_ENGLISH));     
        //extractedContent.setClimateForcastProgramID(metadata.get(org.apache.tika.metadata.ClimateForcast.PROGRAM_ID));     
        //extractedContent.setClimateForcastProjectID(metadata.get(org.apache.tika.metadata.ClimateForcast.PROJECT_ID));     
        //extractedContent.setClimateForcastRealization(metadata.get(org.apache.tika.metadata.ClimateForcast.REALIZATION));     
        //extractedContent.setClimateForcastReferences(metadata.get(org.apache.tika.metadata.ClimateForcast.REFERENCES));     
        //extractedContent.setClimateForcastSource(metadata.get(org.apache.tika.metadata.ClimateForcast.SOURCE));     
        //extractedContent.setClimateForcastTableID(metadata.get(org.apache.tika.metadata.ClimateForcast.TABLE_ID));     

        // TIFF
        //extractedContent.setTIFFBitsPerSample(metadata.get(Metadata.BITS_PER_SAMPLE));
        //extractedContent.setTIFFEquipmentMake(metadata.get(Metadata.EQUIPMENT_MAKE));
        //extractedContent.setTIFFEquipmentModel(metadata.get(Metadata.EQUIPMENT_MODEL));
        //extractedContent.setTIFFExposureLimit(metadata.get(Metadata.EXPOSURE_TIME));
        //extractedContent.setTIFFFNumber(metadata.get(Metadata.F_NUMBER));
        //extractedContent.setTIFFFlashFired(metadata.get(Metadata.FLASH_FIRED));
        //extractedContent.setTIFFFocalLength(metadata.get(Metadata.FOCAL_LENGTH));
        //extractedContent.setTIFFImageLength(metadata.get(Metadata.IMAGE_LENGTH));
        //extractedContent.setTIFFImageWidth(metadata.get(Metadata.IMAGE_WIDTH));
        //extractedContent.setTIFFISOSpeedRating(metadata.get(Metadata.ISO_SPEED_RATINGS));
        //extractedContent.setTIFFOrientation(metadata.get(Metadata.ORIENTATION));
        //extractedContent.setTIFFOriginalDate(metadata.get(Metadata.ORIGINAL_DATE));
        //extractedContent.setTIFFResolutionHorizontal(metadata.get(Metadata.RESOLUTION_HORIZONTAL));
        //extractedContent.setTIFFResolutionUnit(metadata.get(Metadata.RESOLUTION_UNIT));
        //extractedContent.setTIFFResolutionVertical(metadata.get(Metadata.RESOLUTION_VERTICAL));
        //extractedContent.setTIFFSamplePerPixel(metadata.get(Metadata.SAMPLES_PER_PIXEL));
        //extractedContent.setTIFFSoftware(metadata.get(Metadata.SOFTWARE));

        // TIKA METADATA KEYS
        extractedContent.setResourceNameKey(metadata.get(Metadata.RESOURCE_NAME_KEY));

        // TIKA MIME KEYS
        extractedContent.setMimeTypeMagic(metadata.get(Metadata.MIME_TYPE_MAGIC));
        extractedContent.setTikaMimeType(metadata.get(Metadata.TIKA_MIME_FILE));
    }

    //
    return extractedContent;
}
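
In this adapter toString() supplies only the body text; everything else is read back from the Metadata object populated during the same parse. Many of the flat Metadata constants used here (Metadata.AUTHOR, Metadata.TITLE, and so on) were deprecated in later Tika 1.x releases in favor of TikaCoreProperties keys, so this listing reflects an older Tika API.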

From source file:com.digitalpebble.storm.crawler.tika.ParserBolt.java

@Override
public void execute(Tuple tuple) {
    eventCounter.scope("tuple_in").incrBy(1);

    byte[] content = tuple.getBinaryByField("content");

    String url = tuple.getStringByField("url");
    Metadata metadata = (Metadata) tuple.getValueByField("metadata");

    long start = System.currentTimeMillis();

    // rely on mime-type provided by server or guess?

    ByteArrayInputStream bais = new ByteArrayInputStream(content);
    org.apache.tika.metadata.Metadata md = new org.apache.tika.metadata.Metadata();

    LinkContentHandler linkHandler = new LinkContentHandler();
    ContentHandler textHandler = new BodyContentHandler(-1);
    TeeContentHandler teeHandler = new TeeContentHandler(linkHandler, textHandler);
    ParseContext parseContext = new ParseContext();

    try {
        parseContext.set(HtmlMapper.class, (HtmlMapper) HTMLMapperClass.newInstance());
    } catch (Exception e) {
        LOG.error("Exception while specifying HTMLMapper {}", url, e);
    }

    // build a DOM if required by the parseFilters
    DocumentFragment root = null;
    if (parseFilters.needsDOM()) {
        HTMLDocumentImpl doc = new HTMLDocumentImpl();
        doc.setErrorChecking(false);
        root = doc.createDocumentFragment();
        DOMBuilder domhandler = new DOMBuilder(doc, root);
        domhandler.setUpperCaseElementNames(upperCaseElementNames);
        domhandler.setDefaultNamespaceURI(XHTMLContentHandler.XHTML);
        teeHandler = new TeeContentHandler(linkHandler, textHandler, domhandler);
    }

    // parse
    String text;
    try {
        tika.getParser().parse(bais, teeHandler, md, parseContext);
        text = textHandler.toString();
    } catch (Exception e) {
        String errorMessage = "Exception while parsing " + url + ": " + e;
        LOG.error(errorMessage);
        // send to status stream in case another component wants to update
        // its status
        metadata.setValue(Constants.STATUS_ERROR_SOURCE, "content parsing");
        metadata.setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage);
        collector.emit(StatusStreamName, tuple, new Values(url, metadata, Status.ERROR));
        collector.ack(tuple);
        // Increment metric that is context specific
        eventCounter.scope("error_content_parsing_" + e.getClass().getSimpleName()).incrBy(1);
        // Increment general metric
        eventCounter.scope("parse exception").incrBy(1);
        return;
    } finally {
        try {
            bais.close();
        } catch (IOException e) {
            LOG.error("Exception while closing stream", e);
        }
    }

    // add parse md to metadata
    for (String k : md.names()) {
        String[] values = md.getValues(k);
        metadata.setValues("parse." + k, values);
    }

    long duration = System.currentTimeMillis() - start;

    LOG.info("Parsed {} in {} msec", url, duration);

    // filter and convert the outlinks
    List<Outlink> outlinks = toOutlinks(url, linkHandler.getLinks(), metadata);

    ParseResult parse = new ParseResult();
    parse.setOutlinks(outlinks);

    // parse data of the parent URL
    ParseData parseData = parse.get(url);
    parseData.setMetadata(metadata);
    parseData.setText(text);
    parseData.setContent(content);

    // apply the parse filters if any
    try {
        parseFilters.filter(url, content, root, parse);
    } catch (RuntimeException e) {
        String errorMessage = "Exception while running parse filters on " + url + ": " + e;
        LOG.error(errorMessage);
        metadata.setValue(Constants.STATUS_ERROR_SOURCE, "content filtering");
        metadata.setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage);
        collector.emit(StatusStreamName, tuple, new Values(url, metadata, Status.ERROR));
        collector.ack(tuple);
        // Increment metric that is context specific
        eventCounter.scope("error_content_filtering_" + e.getClass().getSimpleName()).incrBy(1);
        // Increment general metric
        eventCounter.scope("parse exception").incrBy(1);
        return;
    }

    if (emitOutlinks) {
        for (Outlink outlink : outlinks) {
            collector.emit(StatusStreamName, tuple,
                    new Values(outlink.getTargetURL(), outlink.getMetadata(), Status.DISCOVERED));
        }
    }

    // emit each document/subdocument in the ParseResult object
    // there should be at least one ParseData item for the "parent" URL

    for (Map.Entry<String, ParseData> doc : parse) {
        ParseData parseDoc = doc.getValue();

        collector.emit(tuple,
                new Values(doc.getKey(), parseDoc.getContent(), parseDoc.getMetadata(), parseDoc.getText()));
    }

    collector.ack(tuple);
    eventCounter.scope("tuple_success").incrBy(1);
}

From source file:com.digitalpebble.stormcrawler.tika.ParserBolt.java

@Override
public void execute(Tuple tuple) {
    eventCounter.scope("tuple_in").incrBy(1);

    byte[] content = tuple.getBinaryByField("content");

    String url = tuple.getStringByField("url");
    Metadata metadata = (Metadata) tuple.getValueByField("metadata");

    long start = System.currentTimeMillis();

    ByteArrayInputStream bais = new ByteArrayInputStream(content);
    org.apache.tika.metadata.Metadata md = new org.apache.tika.metadata.Metadata();

    // provide the mime-type as a clue for guessing
    String httpCT = metadata.getFirstValue(HttpHeaders.CONTENT_TYPE);
    if (StringUtils.isNotBlank(httpCT)) {
        // pass content type from server as a clue
        md.set(org.apache.tika.metadata.Metadata.CONTENT_TYPE, httpCT);
    }

    // as well as the filename
    try {
        URL _url = new URL(url);
        md.set(org.apache.tika.metadata.Metadata.RESOURCE_NAME_KEY, _url.getFile());
    } catch (MalformedURLException e1) {
        throw new IllegalStateException("Malformed URL", e1);
    }

    LinkContentHandler linkHandler = new LinkContentHandler();
    ContentHandler textHandler = new BodyContentHandler(-1);
    TeeContentHandler teeHandler = new TeeContentHandler(linkHandler, textHandler);
    ParseContext parseContext = new ParseContext();

    try {
        parseContext.set(HtmlMapper.class, (HtmlMapper) HTMLMapperClass.newInstance());
    } catch (Exception e) {
        LOG.error("Exception while specifying HTMLMapper {}", url, e);
    }

    // build a DOM if required by the parseFilters
    DocumentFragment root = null;
    if (parseFilters.needsDOM()) {
        HTMLDocumentImpl doc = new HTMLDocumentImpl();
        doc.setErrorChecking(false);
        root = doc.createDocumentFragment();
        DOMBuilder domhandler = new DOMBuilder(doc, root);
        domhandler.setUpperCaseElementNames(upperCaseElementNames);
        domhandler.setDefaultNamespaceURI(XHTMLContentHandler.XHTML);
        teeHandler = new TeeContentHandler(linkHandler, textHandler, domhandler);
    }

    // parse
    String text;
    try {
        tika.getParser().parse(bais, teeHandler, md, parseContext);
        text = textHandler.toString();
    } catch (Throwable e) {
        String errorMessage = "Exception while parsing " + url + ": " + e;
        LOG.error(errorMessage);
        // send to status stream in case another component wants to update
        // its status
        metadata.setValue(Constants.STATUS_ERROR_SOURCE, "content parsing");
        metadata.setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage);
        collector.emit(StatusStreamName, tuple, new Values(url, metadata, Status.ERROR));
        collector.ack(tuple);
        // Increment metric that is context specific
        eventCounter.scope("error_content_parsing_" + e.getClass().getSimpleName()).incrBy(1);
        // Increment general metric
        eventCounter.scope("parse exception").incrBy(1);
        return;
    } finally {
        try {
            bais.close();
        } catch (IOException e) {
            LOG.error("Exception while closing stream", e);
        }
    }

    // add parse md to metadata
    for (String k : md.names()) {
        String[] values = md.getValues(k);
        metadata.setValues("parse." + k, values);
    }

    long duration = System.currentTimeMillis() - start;

    LOG.info("Parsed {} in {} msec", url, duration);

    // filter and convert the outlinks
    List<Outlink> outlinks = toOutlinks(url, linkHandler.getLinks(), metadata);

    ParseResult parse = new ParseResult();
    parse.setOutlinks(outlinks);

    // parse data of the parent URL
    ParseData parseData = parse.get(url);
    parseData.setMetadata(metadata);
    parseData.setText(text);
    parseData.setContent(content);

    // apply the parse filters if any
    try {
        parseFilters.filter(url, content, root, parse);
    } catch (RuntimeException e) {
        String errorMessage = "Exception while running parse filters on " + url + ": " + e;
        LOG.error(errorMessage);
        metadata.setValue(Constants.STATUS_ERROR_SOURCE, "content filtering");
        metadata.setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage);
        collector.emit(StatusStreamName, tuple, new Values(url, metadata, Status.ERROR));
        collector.ack(tuple);
        // Increment metric that is context specific
        eventCounter.scope("error_content_filtering_" + e.getClass().getSimpleName()).incrBy(1);
        // Increment general metric
        eventCounter.scope("parse exception").incrBy(1);
        return;
    }

    if (emitOutlinks) {
        for (Outlink outlink : parse.getOutlinks()) {
            collector.emit(StatusStreamName, tuple,
                    new Values(outlink.getTargetURL(), outlink.getMetadata(), Status.DISCOVERED));
        }
    }

    // emit each document/subdocument in the ParseResult object
    // there should be at least one ParseData item for the "parent" URL

    for (Map.Entry<String, ParseData> doc : parse) {
        ParseData parseDoc = doc.getValue();

        collector.emit(tuple,
                new Values(doc.getKey(), parseDoc.getContent(), parseDoc.getMetadata(), parseDoc.getText()));
    }

    collector.ack(tuple);
    eventCounter.scope("tuple_success").incrBy(1);
}

From source file:edu.stanford.muse.datacache.Blob.java

public Pair<String, String> getContent(BlobStore store) {
    Metadata metadata = new Metadata();
    StringBuilder metadataBuffer = new StringBuilder();
    ContentHandler handler = new BodyContentHandler(-1); // no character limit
    InputStream stream = null;
    boolean failed = false;

    try {
        stream = store.getInputStream(this);

        try {
            // skip mp3 files, tika has trouble with it and hangs
            if (!Util.nullOrEmpty(this.filename) && !this.filename.toLowerCase().endsWith(".mp3"))
                parser.parse(stream, handler, metadata, context);

            String[] names = metadata.names();
            //Arrays.sort(names);
            for (String name : names) {
                // some metadata tags are problematic and result in large hex strings... ignore them. (caused memory problems with Henry's archive)
                // https://github.com/openplanets/SPRUCE/blob/master/TikaFileIdentifier/python/config.py
                // we've seen at least unknown tags: (0x8649) (0x935c) (0x02bc)... better to drop them all
                String lname = name.toLowerCase();
                if (lname.startsWith("unknown tag") || lname.startsWith("intel color profile")) {
                    log.info("Warning: dropping metadata tag: " + name + " for blob: " + this.getName());
                    continue;
                }
                metadataBuffer.append(": ");
                metadataBuffer.append(metadata.get(name));
                metadataBuffer.append("\n");
            }
        } catch (Exception e) {
            log.warn("Tika is unable to extract content of blob " + this + ":" + Util.stackTrace(e));
            // often happens for psd files, known tika issue: 
            // http://mail-archives.apache.org/mod_mbox/tika-dev/201210.mbox/%3Calpine.DEB.2.00.1210111525530.7309@urchin.earth.li%3E
            failed = true;
        } finally {
            try {
                stream.close();
            } catch (Exception e) {
                failed = true;
            }
        }

    } catch (IOException e) {
        log.warn("Unable to access content of blob " + filename + ":" + Util.stackTrace(e));
        failed = true;
    }

    if (failed) {
        processedSuccessfully = false;
        return null;
    } else {
        processedSuccessfully = true;
        return new Pair<String, String>(metadataBuffer.toString(), handler.toString());
    }

}

From source file:com.doculibre.constellio.feedprotocol.FeedProcessor.java

private ContentParse asContentParse(String url, List<FeedContent> feedContents,
        ConnectorInstance connectorInstance) {
    ConnectorInstanceServices connectorInstanceServices = ConstellioSpringUtils.getConnectorInstanceServices();
    ContentParse contentParse = new ContentParse();
    List<RecordMeta> metas = new ArrayList<RecordMeta>();
    contentParse.setMetas(metas);

    List<String> md5s = new ArrayList<String>();
    StringBuffer contentBuffer = new StringBuffer();
    MessageDigest md;
    try {
        md = MessageDigest.getInstance("MD5");
    } catch (NoSuchAlgorithmException e1) {
        throw new RuntimeException(e1);
    }
    for (FeedContent feedContent : feedContents) {
        InputStream input = null;
        try {
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler(-1);
            ParseContext parseContext = new ParseContext();

            if (feedContent.getEncoding() == FeedContent.ENCODING.BASE64BINARY) {
                input = new BufferedInputStream(new Base64InputStream(
                        new FileInputStream(feedContent.getValue()), false, 80, new byte[] { (byte) '\n' }));
            } else {
                input = new BufferedInputStream(new FileInputStream(feedContent.getValue()));
            }
            // MD5 on the fly
            DigestInputStream dis = new DigestInputStream(input, md);

            if (connectorInstance.getConnectorType().getName().equals("mailbox-connector")) {
                // FIXME: remove this and add a Detector that correctly
                // detects eml files
                // CompositeParser parser = new AutoDetectParser();
                // Map<String, Parser> parsers = parser.getParsers();
                // parsers.put("text/plain", new
                // EmlParser());//message/rfc822
                // parser.setParsers(parsers);
                // Another problem with the detection of eml files
                Parser parser = new EmlParser();
                parser.parse(dis, handler, metadata, parseContext);
            } else {
                // IOUtils.copy(input, new FileOutputStream(new
                // File("C:/tmp/test.pdf")));
                PARSER.parse(dis, handler, metadata, parseContext);
            }

            md5s.add(Base64.encodeBase64String(md.digest()));

            for (String name : metadata.names()) {
                for (String content : metadata.getValues(name)) {
                    if (!"null".equals(content)) {
                        RecordMeta meta = new RecordMeta();
                        ConnectorInstanceMeta connectorInstanceMeta = connectorInstance.getOrCreateMeta(name);
                        if (connectorInstanceMeta.getId() == null) {
                            connectorInstanceServices.makePersistent(connectorInstance);
                        }
                        meta.setConnectorInstanceMeta(connectorInstanceMeta);
                        meta.setContent(content);
                        metas.add(meta);
                    }
                }
            }

            String contentString = handler.toString();
            // collapse runs of duplicated whitespace
            contentBuffer.append(contentString.replaceAll("(\\s){2,}", "$1"));
        } catch (Throwable e) {
            LOG.warning("Could not parse document " + StringUtils.defaultString(url) + " for connector : "
                    + connectorInstance.getName() + " Message: " + e.getMessage());
        } finally {
            IOUtils.closeQuietly(input);
            if (feedContent != null) {
                FileUtils.deleteQuietly(feedContent.getValue());
            }
        }
    }
    contentParse.setContent(contentBuffer.toString());
    contentParse.setMd5(md5s);

    return contentParse;
}

From source file:org.berlin.crawl.parse.WebParser.java

public List<BotLink> parse(final BotLink origLink, final URIBuilder lastBuilder, final String document) {
    final SessionFactory sf = (SessionFactory) ctx.getBean("sessionFactory");
    final Session session = sf.openSession();
    try {
        final InputStream input = new ByteArrayInputStream(document.getBytes());
        final LinkContentHandler linkHandler = new LinkContentHandler();
        final ContentHandler textHandler = new BodyContentHandler();
        final ToHTMLContentHandler toHTMLHandler = new ToHTMLContentHandler();
        final TeeContentHandler teeHandler = new TeeContentHandler(linkHandler, textHandler, toHTMLHandler);
        final Metadata metadata = new Metadata();
        final ParseContext parseContext = new ParseContext();
        final HtmlParser parser = new HtmlParser();
        parser.parse(input, teeHandler, metadata, parseContext);

        final String titleOfPage = metadata.get("title");
        // For analytical data, ignore pages that don't have titles
        if (!NullRef.hasValue(titleOfPage)) {
            logger.warn("Warning, invalid title for page, EXITING logic, link=" + origLink);
            return null;
        }

        // Continue with parsing //
        final List<BotLink> linksForProcessing = new ArrayList<BotLink>();
        final Set<String> urls = new HashSet<String>();

        final int fullLinkCount = linkHandler.getLinks().size();
        int linkcount = 0;
        // Loop through the links on the page
        // And add a set number to the queue.
        final Random rchk = new Random(System.currentTimeMillis());
        final List<Link> linksFromPage = linkHandler.getLinks();
        Collections.shuffle(linksFromPage);
        for (final Link link : linksFromPage) {
            // Give each link roughly a 35% chance of being added
            final double rval = rchk.nextDouble();
            final boolean okToAdd = rval > 0.65;
            if (okToAdd && link.getUri() != null) {
                linkcount++;
                if (linkcount > MAX_LINKS_PAGE) {
                    // Only process a given number of links on a page //
                    break;
                } // End of if max reached
                final String fullURL = this.fullURL(link, lastBuilder, urls);
                if (fullURL != null) {
                    try {
                        this.processFullURL(linksForProcessing, link, fullURL);
                    } catch (final Throwable te) {
                        te.printStackTrace();
                    }
                }
            } // End of the if //             
        } // End of the for through the links //

        // Parse the available URLS //
        logger.info("In Web Parser for " + lastBuilder + " # availableNumberOfLinks=" + urls.size()
                + " fullLinkCount=" + fullLinkCount);

        // Persist the current link // 
        origLink.setNumberLinks(fullLinkCount);
        this.writeFileAndSave(origLink, session, metadata, document, textHandler.toString());

        processLinksForQueue(linksForProcessing);
        return linksForProcessing;

    } catch (final Throwable e) {
        e.printStackTrace();
    } finally {
        if (session != null) {
            session.close();
        }
    } // End of the try - catch //
    return null;
}