Example usage for org.xml.sax ContentHandler toString

Introduction

On this page you can find example usages of the org.xml.sax ContentHandler toString method.

Prototype

public String toString()

Source Link

Document

Returns a string representation of the object.

Usage

From source file:de.mpg.escidoc.services.extraction.ExtractionChain.java

/**
 * Extracts the text of a PDF file into a UTF-8 text file, trying several
 * extractors in turn: the external xPDF pdftotext program, the PDFBox command
 * line application, iText and finally Tika. The first extractor that succeeds
 * ends the chain.
 *
 * @param infileName
 *            path of the PDF file to read
 * @param outfileName
 *            path of the text file to write
 * @return ExtractionResult.OK as soon as one extractor succeeds,
 *         ExtractionResult.FAILURE if all of them fail
 */
public ExtractionResult doExtract(String infileName, String outfileName) {
    File outfile = new File(outfileName);
    boolean windows = System.getProperty("os.name").contains("Windows");

    Date stepStart = new Date();

    logger.info("Extracting PDF content ----------------------------------------");
    logger.info("Infile: " + infileName);
    logger.info("Outfile: " + outfileName);

    logger.info(stepStart + " -- started");

    // xPDF
    try {
        logger.info("Extracting with xPDF");

        // One array element per argument, so file names containing spaces are
        // passed to the program intact instead of being split into tokens.
        String[] command = { windows ? pdftotext : "/usr/bin/pdftotext", "-enc", "UTF-8", infileName,
                outfileName };

        if (runExternalExtractor(command, "xPDF") == 0) {
            logOutfileIfVerbose(outfile);
            logStepSuccess(stepStart);
            return ExtractionResult.OK;
        }
    } catch (Exception e) {
        logger.warn("Error extracting PDF with xPDF:", e);
    }
    logStepFailure(stepStart);

    // PDFBox
    try {
        logger.info("Extracting with PDFBox");
        stepStart = new Date();

        String[] command = { windows ? "java" : "/usr/bin/java", "-Dfile.encoding=UTF-8", "-jar", pdfboxAppJar,
                "ExtractText", infileName, outfileName };

        if (runExternalExtractor(command, "PDFBox") == 0) {
            logOutfileIfVerbose(outfile);
            logStepSuccess(stepStart);
            return ExtractionResult.OK;
        }
    } catch (Exception e) {
        logger.warn("Error extracting PDF with PDFBox:", e);
    }
    logStepFailure(stepStart);

    // iText
    try {
        logger.info("Extracting with iText");
        stepStart = new Date();

        PdfReader reader = new PdfReader(infileName);
        try {
            int numberOfPages = reader.getNumberOfPages();

            outputStreamWriter = new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8");
            try {
                // iText page numbers are passed 1-based here.
                for (int page = 1; page <= numberOfPages; page++) {
                    outputStreamWriter.write(PdfTextExtractor.getTextFromPage(reader, page));
                }
            } finally {
                // Closing flushes the buffered text; without it the file read
                // back below (and by the caller) may be incomplete.
                outputStreamWriter.close();
            }
        } finally {
            reader.close();
        }

        logOutfileIfVerbose(outfile);
        logStepSuccess(stepStart);
        return ExtractionResult.OK;

    } catch (Exception e) {
        logger.warn("Error extracting PDF with iText:", e);
    }
    logStepFailure(stepStart);

    // tika
    InputStream stream = null;
    try {
        logger.info("Extracting with Tika");
        stepStart = new Date();

        stream = TikaInputStream.get(new File(infileName));

        ContentHandler handler = new BodyContentHandler(TIKA_CONTENT_SIZE);

        new AutoDetectParser().parse(stream, handler, new Metadata(), new ParseContext());

        // Written as UTF-8 explicitly because the file is read back as UTF-8.
        FileUtils.writeStringToFile(outfile, handler.toString(), "UTF-8");

        logOutfileIfVerbose(outfile);
        logStepSuccess(stepStart);
        return ExtractionResult.OK;

    } catch (Exception e) {
        logger.warn("Error extracting Tika:", e);
    } finally {
        // stream is still null when TikaInputStream.get itself failed.
        if (stream != null) {
            try {
                stream.close();
            } catch (IOException e) {
                logger.warn("Error closing Tika input stream:", e);
            }
        }
    }

    Date current = new Date();
    logger.warn(current + " -- finished unsuccessfully");
    logger.info("Extraction attempt took " + (current.getTime() - stepStart.getTime()));

    logger.info("... giving up");

    return ExtractionResult.FAILURE;
}

/**
 * Starts an external program, hands its output streams to StreamGobbler
 * threads and waits for it to exit.
 *
 * @param command
 *            the program followed by its arguments, one element each
 * @param label
 *            the name passed to the StreamGobbler instances
 * @return the exit code of the process
 */
private int runExternalExtractor(String[] command, String label) throws IOException, InterruptedException {
    Process proc = Runtime.getRuntime().exec(command);

    // NOTE(review): StreamGobbler presumably drains the stream on its own
    // thread so the child cannot block on a full pipe - confirm.
    StreamGobbler inputGobbler = new StreamGobbler(proc.getInputStream(), label);
    StreamGobbler errorGobbler = new StreamGobbler(proc.getErrorStream(), label);

    inputGobbler.start();
    errorGobbler.start();

    return proc.waitFor();
}

/**
 * Logs every line of the extracted text file when verbose mode is on.
 */
private void logOutfileIfVerbose(File outfile) throws IOException {
    if (!verbose) {
        return;
    }
    BufferedReader bufferedReader = new BufferedReader(
            new InputStreamReader(new FileInputStream(outfile), "UTF-8"));
    try {
        String line;
        while ((line = bufferedReader.readLine()) != null) {
            logger.info(line);
        }
    } finally {
        bufferedReader.close();
    }
}

/**
 * Logs the successful end of an extraction step and its duration in milliseconds.
 */
private void logStepSuccess(Date stepStart) {
    Date current = new Date();
    logger.info(current + " -- finished successfully");
    logger.info("Extraction took " + (current.getTime() - stepStart.getTime()));
}

/**
 * Logs the unsuccessful end of an extraction step and its duration in milliseconds.
 */
private void logStepFailure(Date stepStart) {
    Date current = new Date();
    logger.info(current + " -- finished unsuccessfully");
    logger.info("Extraction attempt took " + (current.getTime() - stepStart.getTime()));
}

From source file:com.bah.applefox.main.plugins.fulltextindex.FTLoader.java

/**
 * This method is used to add all information parsed by tika into the
 * Accumulo table/*from  w ww.ja v  a 2 s .co m*/
 * 
 * @param url
 *            - the URL of the page that has been parsed
 * @param tikaParsed
 *            - all of the engrams from the page
 * @throws TikaException
 * @throws SAXException
 */
private static boolean addToDataBaseTable(String url) {
    try {
        // Connect to the data table
        BatchWriter writer = AccumuloUtils.connectBatchWrite(dTable);

        // Let the user know the url is being added
        System.out.println("Adding " + url + " with prefix " + longSuffix);

        // Get the input stream (in case it is not an html document
        InputStream urlInput = new URL(url).openStream();

        // Set the page contents (used for filtering if it is an html
        // document)
        String pageContents = getPageContents(new URL(url));

        // If the document is HTML
        if (exDivs.size() != 0 && pageContents.toLowerCase().contains("<html>")) {
            // Filter out some divs (especially generic headers/footers,
            // etc.)
            pageContents = DivsFilter.filterDivs(pageContents, exDivs);
            urlInput = new ByteArrayInputStream(pageContents.getBytes());
        }

        // Parse with tika
        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();
        ContentHandler handler = new BodyContentHandler();

        parser.parse(urlInput, handler, metadata, context);

        // Get the keywords of the page and its title
        String keywords = metadata.get("keywords");
        String title = metadata.get("title");
        if (title == null) {
            WebPageCrawl p;
            try {
                p = new WebPageCrawl(url, "", Collections.<String>emptySet());
            } catch (PageCrawlException e) {
                log.info(e);
                return false;
            }
            title = p.getTitle();
        }

        // If there are keywords, delimit the commas, otherwise make it a
        // blank screen (not null)
        if (keywords != null) {
            keywords = keywords.replaceAll(",", "[ ]");
        } else {
            keywords = "";
        }

        // Make everything lower case for ease of search
        String plainText = handler.toString().toLowerCase();

        // Split it into <Key,Value> pairs of NGrams, with the Value being
        // the count of the NGram on the page
        HashMap<String, Integer> tikaParsed = IngestUtils
                .collectTerms(IngestUtils.createNGrams(plainText, maxNGrams));

        // A counter for the final number of words
        Integer totalWords = 0;

        // A HashMap for the final NGrams
        HashMap<String, Integer> finalParsed = new HashMap<String, Integer>();

        for (String i : tikaParsed.keySet()) {
            int freq = tikaParsed.get(i);
            totalWords += freq;
            // erase stop words
            if (stopWords != null && !stopWords.contains(i)) {
                finalParsed.put(i, tikaParsed.get(i));
            } else if (stopWords == null) {
                finalParsed.put(i, tikaParsed.get(i));
            }
        }

        System.out.println("Tika Parsed: " + finalParsed.keySet().size());
        System.out.println("Starting");
        int counter = 0;

        String namedURL = url + "[ ]" + title + "[ ]" + keywords;

        for (String row : finalParsed.keySet()) {
            row = row + " " + longSuffix;
            for (String CQ : finalParsed.keySet()) {
                String groupedVal = new String();
                Integer wc = finalParsed.get(CQ);
                double freq = wc.doubleValue() / totalWords.doubleValue();
                groupedVal = wc + "," + freq;
                Value val = new Value(IngestUtils.serialize(groupedVal));

                Mutation m = new Mutation(row);
                m.put(namedURL, CQ, new Date().getTime(), val);
                writer.addMutation(m);
                counter++;
            }

        }

        System.out.println("Wrote " + counter + " Key-Value pairs to Accumulo.");

        writer.close();
        System.out.println("Stopped writing");
    } catch (AccumuloException e) {
        if (e.getMessage() != null) {
            log.error(e.getMessage());
        } else {
            log.error(e.getStackTrace());
        }
        return false;
    } catch (AccumuloSecurityException e) {
        if (e.getMessage() != null) {
            log.error(e.getMessage());
        } else {
            log.error(e.getStackTrace());
        }
        return false;
    } catch (TableNotFoundException e) {
        if (e.getMessage() != null) {
            log.error(e.getMessage());
        } else {
            log.error(e.getStackTrace());
        }
        return false;
    } catch (TableExistsException e) {
        if (e.getMessage() != null) {
            log.error(e.getMessage());
        } else {
            log.error(e.getStackTrace());
        }
        return false;
    } catch (MalformedURLException e) {
        if (e.getMessage() != null) {
            log.error(e.getMessage());
        } else {
            log.error(e.getStackTrace());
        }
    } catch (IOException e) {
        if (e.getMessage() != null) {
            log.error(e.getMessage());
        } else {
            log.error(e.getStackTrace());
        }
        return false;
    } catch (SAXException e) {
        if (e.getMessage() != null) {
            log.error(e.getMessage());
        } else {
            log.error(e.getStackTrace());
        }
        return false;
    } catch (TikaException e) {
        if (e.getMessage() != null) {
            log.error(e.getMessage());
        } else {
            log.error(e.getStackTrace());
        }
        return false;
    }
    return true;
}

From source file:de.u808.simpleinquest.indexer.impl.DefaultIndexer.java

/**
 * Builds a Lucene document for a file: its text content as extracted by Tika
 * plus path, id (MD5 hex of the path) and last-modified fields.
 *
 * @param file
 *            the file to index
 * @return the populated document; a document without fields if Tika reports
 *         that the file is encrypted; null if the file cannot be read
 * @throws IndexerException
 *             if reading or parsing the file fails for any other reason
 */
public Document indexFile(File file) throws FileNotFoundException, IndexerException {
    Document document = null;
    if (file.canRead()) {
        try {
            document = new Document();
            InputStream input = new FileInputStream(file);

            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            try {
                new AutoDetectParser().parse(input, handler, metadata);
            } finally {
                // parse() leaves the stream open, so it has to be closed here
                // or every indexed file leaks a file handle.
                input.close();
            }

            document.add(new Field(Indexer.PATH_FIELD_NAME, file.getPath(), Field.Store.YES,
                    Field.Index.UN_TOKENIZED));
            document.add(new Field(Indexer.ID_FIELD_NAME, MD5Util.getInstance().getMD5Hex(file.getPath()),
                    Field.Store.YES, Field.Index.UN_TOKENIZED));

            document.add(new Field(Indexer.MODIFIED_FIELD_NAME,
                    DateTools.timeToString(file.lastModified(), DateTools.Resolution.MINUTE), Field.Store.YES,
                    Field.Index.UN_TOKENIZED));

            document.add(new Field(Indexer.CONTENT_FIELD_NAME, new StringReader(handler.toString())));
        } catch (Exception e) {
            if (e instanceof TikaException) {
                Throwable t = e.getCause();
                if (t != null && t.getMessage() != null) {
                    // Encrypted documents are skipped instead of failing the run.
                    if (t.getMessage().startsWith("Error decrypting document")) {
                        log.debug("Cant index encrypted document.");
                        return document;
                    }
                }
            }
            throw new IndexerException(e.getMessage(), e);
        }
    } else {
        log.debug("Cant read file: " + file.getName());
    }
    return document;
}

From source file:uk.ac.kcl.itemProcessors.TikaDocumentItemProcessor.java

/**
 * Parses the binary content of the document and stores the result on it.
 *
 * The handler keeps markup (ToXMLContentHandler) when keepTags is set and
 * collects body text (BodyContentHandler) otherwise. If anything in the
 * parsing step throws, the exception message is stored via addField instead
 * of the parsed text.
 *
 * @param doc
 *            the document to process
 * @return the same document instance
 */
@Override
public Document process(final Document doc) throws Exception {
    final String processorName = this.getClass().getSimpleName();
    LOG.debug("starting " + processorName + " on doc " + doc.getDocName());
    final long startedAt = System.currentTimeMillis();

    final ContentHandler handler = keepTags ? new ToXMLContentHandler() : new BodyContentHandler();
    final Metadata metadata = new Metadata();

    // Reported in the timing log line; stays at this value when parsing fails
    // before the content type has been extracted.
    String contentType = "TL_CONTENT_TYPE_UNKNOWN";

    try (InputStream stream = new ByteArrayInputStream(doc.getBinaryContent())) {
        final ParseContext context = new ParseContext();
        context.set(TikaConfig.class, config);
        parser.parse(stream, handler, metadata, context);

        final Set<String> metaKeys = new HashSet<String>(Arrays.asList(metadata.names()));
        extractOCRMetadata(doc, metaKeys, metadata);
        contentType = extractContentTypeMetadata(doc, metaKeys, metadata);
        extractPageCountMetadata(doc, metaKeys, metadata);

        addField(doc, handler.toString());
    } catch (Exception ex) {
        addField(doc, ex.getMessage());
    }

    final long elapsedMillis = System.currentTimeMillis() - startedAt;
    LOG.debug("{};Content-Type:{};Time:{} ms", processorName, contentType, elapsedMillis);
    LOG.debug("finished " + processorName + " on doc " + doc.getDocName());
    return doc;
}

From source file:cx.fbn.nevernote.threads.IndexRunner.java

/**
 * Parses the data of a resource with Tika's RTFParser, splits the extracted
 * text on regex and adds every piece to the index under "RESOURCE".
 *
 * @param r
 *            the resource to index
 */
private void indexResourceRTF(Resource r) {

    Data d = r.getData();
    // Ask for the data again, up to 20 times, while it is reported as empty.
    for (int i = 0; i < 20 && d.getSize() == 0; i++) {
        d = r.getData();
    }
    if (d.getSize() == 0) {
        return;
    }

    QTemporaryFile f = writeResource(d);
    if (!keepRunning) {
        return;
    }

    InputStream input = null;
    try {
        input = new FileInputStream(new File(f.fileName()));
        // -1 is passed as the handler's write limit.
        ContentHandler textHandler = new BodyContentHandler(-1);
        Metadata metadata = new Metadata();
        RTFParser parser = new RTFParser();
        ParseContext context = new ParseContext();
        parser.parse(input, textHandler, metadata, context);
        String[] result = textHandler.toString().split(regex);
        for (int i = 0; i < result.length && keepRunning; i++) {
            // Same interrupt handling as the other resource indexers.
            if (interrupt) {
                processInterrupt();
            }
            addToIndex(r.getNoteGuid(), result[i], "RESOURCE");
        }
    } catch (java.lang.ClassCastException e) {
        logger.log(logger.LOW, "Cast exception: " + e.getMessage());
    } catch (FileNotFoundException e) {
        logger.log(logger.LOW, "FileNotFound  exception: " + e.getMessage());
    } catch (IOException e) {
        logger.log(logger.LOW, "IO  exception: " + e.getMessage());
    } catch (SAXException e) {
        logger.log(logger.LOW, "SAX  exception: " + e.getMessage());
    } catch (TikaException e) {
        logger.log(logger.LOW, "Tika  exception: " + e.getMessage());
    } catch (Exception e) {
        logger.log(logger.LOW, "Unknown  exception: " + e.getMessage());
    } catch (java.lang.NoSuchMethodError e) {
        logger.log(logger.LOW, "NoSuchMethod error: " + e.getMessage());
    } catch (Error e) {
        logger.log(logger.LOW, "Unknown error: " + e.getMessage());
    } finally {
        // Release the stream and the temporary file even when parsing failed.
        if (input != null) {
            try {
                input.close();
            } catch (IOException e) {
                logger.log(logger.LOW, "IO  exception: " + e.getMessage());
            }
        }
        if (f != null) {
            f.close();
        }
    }
}

From source file:cx.fbn.nevernote.threads.IndexRunner.java

/**
 * Parses the data of a resource with Tika's PDFParser, splits the extracted
 * text on regex and adds every piece to the index under "RESOURCE".
 *
 * @param r
 *            the resource to index
 */
private void indexResourcePDF(Resource r) {

    Data d = r.getData();
    // Ask for the data again, up to 20 times, while it is reported as empty.
    for (int i = 0; i < 20 && d.getSize() == 0; i++) {
        d = r.getData();
    }
    if (d.getSize() == 0) {
        return;
    }
    QTemporaryFile f = writeResource(d);
    if (!keepRunning) {
        return;
    }

    InputStream input = null;
    try {
        input = new FileInputStream(new File(f.fileName()));
        // -1 is passed as the handler's write limit.
        ContentHandler textHandler = new BodyContentHandler(-1);
        Metadata metadata = new Metadata();
        PDFParser parser = new PDFParser();
        ParseContext context = new ParseContext();
        parser.parse(input, textHandler, metadata, context);
        String[] result = textHandler.toString().split(regex);
        for (int i = 0; i < result.length && keepRunning; i++) {
            if (interrupt) {
                processInterrupt();
            }
            addToIndex(r.getNoteGuid(), result[i], "RESOURCE");
        }
    } catch (java.lang.ClassCastException e) {
        logger.log(logger.LOW, "Cast exception: " + e.getMessage());
    } catch (FileNotFoundException e) {
        logger.log(logger.LOW, "FileNotFound  exception: " + e.getMessage());
    } catch (IOException e) {
        logger.log(logger.LOW, "IO  exception: " + e.getMessage());
    } catch (SAXException e) {
        logger.log(logger.LOW, "SAX  exception: " + e.getMessage());
    } catch (TikaException e) {
        logger.log(logger.LOW, "Tika  exception: " + e.getMessage());
    } catch (Exception e) {
        logger.log(logger.LOW, "Unknown  exception: " + e.getMessage());
    } catch (java.lang.NoSuchMethodError e) {
        logger.log(logger.LOW, "NoSuchMethod error: " + e.getMessage());
    } catch (Error e) {
        logger.log(logger.LOW, "Unknown error: " + e.getMessage());
    } finally {
        // Release the stream and the temporary file even when parsing failed.
        if (input != null) {
            try {
                input.close();
            } catch (IOException e) {
                logger.log(logger.LOW, "IO  exception: " + e.getMessage());
            }
        }
        if (f != null) {
            f.close();
        }
    }
}

From source file:cx.fbn.nevernote.threads.IndexRunner.java

/**
 * Parses the data of a resource with Tika's OOXMLParser, splits the extracted
 * text on regex and adds every piece to the index under "RESOURCE".
 *
 * @param r
 *            the resource to index
 */
private void indexResourceOOXML(Resource r) {

    Data d = r.getData();
    // Ask for the data again, up to 20 times, while it is reported as empty.
    for (int i = 0; i < 20 && d.getSize() == 0; i++) {
        d = r.getData();
    }
    if (d.getSize() == 0) {
        return;
    }
    QTemporaryFile f = writeResource(d);
    if (!keepRunning) {
        return;
    }

    InputStream input = null;
    try {
        input = new FileInputStream(new File(f.fileName()));
        // -1 is passed as the handler's write limit.
        ContentHandler textHandler = new BodyContentHandler(-1);
        Metadata metadata = new Metadata();
        OOXMLParser parser = new OOXMLParser();
        ParseContext context = new ParseContext();
        parser.parse(input, textHandler, metadata, context);
        String[] result = textHandler.toString().split(regex);
        for (int i = 0; i < result.length && keepRunning; i++) {
            if (interrupt) {
                processInterrupt();
            }
            addToIndex(r.getNoteGuid(), result[i], "RESOURCE");
        }
    } catch (java.lang.ClassCastException e) {
        logger.log(logger.LOW, "Cast exception: " + e.getMessage());
    } catch (FileNotFoundException e) {
        logger.log(logger.LOW, "FileNotFound  exception: " + e.getMessage());
    } catch (IOException e) {
        logger.log(logger.LOW, "IO  exception: " + e.getMessage());
    } catch (SAXException e) {
        logger.log(logger.LOW, "SAX  exception: " + e.getMessage());
    } catch (TikaException e) {
        logger.log(logger.LOW, "Tika  exception: " + e.getMessage());
    } catch (Exception e) {
        logger.log(logger.LOW, "Unknown  exception: " + e.getMessage());
    } catch (java.lang.NoSuchMethodError e) {
        logger.log(logger.LOW, "NoSuchMethod error: " + e.getMessage());
    } catch (Error e) {
        logger.log(logger.LOW, "Unknown error: " + e.getMessage());
    } finally {
        // Release the stream and the temporary file even when parsing failed.
        if (input != null) {
            try {
                input.close();
            } catch (IOException e) {
                logger.log(logger.LOW, "IO  exception: " + e.getMessage());
            }
        }
        if (f != null) {
            f.close();
        }
    }
}

From source file:cx.fbn.nevernote.threads.IndexRunner.java

/**
 * Parses the data of a resource with Tika's OfficeParser, splits the
 * extracted text on regex and adds every piece to the index under "RESOURCE".
 *
 * @param r
 *            the resource to index
 */
private void indexResourceOffice(Resource r) {

    Data d = r.getData();
    // Ask for the data again, up to 20 times, while it is reported as empty.
    for (int i = 0; i < 20 && d.getSize() == 0; i++) {
        d = r.getData();
    }
    if (d.getSize() == 0) {
        return;
    }
    QTemporaryFile f = writeResource(d);
    if (!keepRunning) {
        return;
    }

    InputStream input = null;
    try {
        input = new FileInputStream(new File(f.fileName()));
        // -1 is passed as the handler's write limit.
        ContentHandler textHandler = new BodyContentHandler(-1);
        Metadata metadata = new Metadata();
        OfficeParser parser = new OfficeParser();
        ParseContext context = new ParseContext();
        parser.parse(input, textHandler, metadata, context);
        String[] result = textHandler.toString().split(regex);
        for (int i = 0; i < result.length && keepRunning; i++) {
            if (interrupt) {
                processInterrupt();
            }
            addToIndex(r.getNoteGuid(), result[i], "RESOURCE");
        }
    } catch (java.lang.ClassCastException e) {
        logger.log(logger.LOW, "Cast exception: " + e.getMessage());
    } catch (FileNotFoundException e) {
        logger.log(logger.LOW, "FileNotFound  exception: " + e.getMessage());
    } catch (IOException e) {
        logger.log(logger.LOW, "IO  exception: " + e.getMessage());
    } catch (SAXException e) {
        logger.log(logger.LOW, "SAX  exception: " + e.getMessage());
    } catch (TikaException e) {
        logger.log(logger.LOW, "Tika  exception: " + e.getMessage());
    } catch (Exception e) {
        logger.log(logger.LOW, "Unknown  exception: " + e.getMessage());
    } catch (java.lang.NoSuchMethodError e) {
        logger.log(logger.LOW, "NoSuchMethod error: " + e.getMessage());
    } catch (Error e) {
        logger.log(logger.LOW, "Unknown error: " + e.getMessage());
    } finally {
        // Release the stream and the temporary file even when parsing failed.
        if (input != null) {
            try {
                input.close();
            } catch (IOException e) {
                logger.log(logger.LOW, "IO  exception: " + e.getMessage());
            }
        }
        if (f != null) {
            f.close();
        }
    }
}

From source file:cx.fbn.nevernote.threads.IndexRunner.java

/**
 * Parses the data of a resource with Tika's OpenDocumentParser, splits the
 * extracted text on regex and adds every piece to the index under "RESOURCE".
 *
 * @param r
 *            the resource to index
 */
private void indexResourceODF(Resource r) {

    Data d = r.getData();
    // Ask for the data again, up to 20 times, while it is reported as empty.
    for (int i = 0; i < 20 && d.getSize() == 0; i++) {
        d = r.getData();
    }
    if (d.getSize() == 0) {
        return;
    }
    QTemporaryFile f = writeResource(d);
    if (!keepRunning) {
        return;
    }

    InputStream input = null;
    try {
        input = new FileInputStream(new File(f.fileName()));
        // -1 is passed as the handler's write limit.
        ContentHandler textHandler = new BodyContentHandler(-1);
        Metadata metadata = new Metadata();
        OpenDocumentParser parser = new OpenDocumentParser();
        ParseContext context = new ParseContext();
        parser.parse(input, textHandler, metadata, context);
        String[] result = textHandler.toString().split(regex);
        for (int i = 0; i < result.length && keepRunning; i++) {
            if (interrupt) {
                processInterrupt();
            }
            addToIndex(r.getNoteGuid(), result[i], "RESOURCE");
        }
    } catch (java.lang.ClassCastException e) {
        logger.log(logger.LOW, "Cast exception: " + e.getMessage());
    } catch (FileNotFoundException e) {
        logger.log(logger.LOW, "FileNotFound  exception: " + e.getMessage());
    } catch (IOException e) {
        logger.log(logger.LOW, "IO  exception: " + e.getMessage());
    } catch (SAXException e) {
        logger.log(logger.LOW, "SAX  exception: " + e.getMessage());
    } catch (TikaException e) {
        logger.log(logger.LOW, "Tika  exception: " + e.getMessage());
    } catch (Exception e) {
        logger.log(logger.LOW, "Unknown  exception: " + e.getMessage());
    } catch (java.lang.NoSuchMethodError e) {
        logger.log(logger.LOW, "NoSuchMethod error: " + e.getMessage());
    } catch (Error e) {
        logger.log(logger.LOW, "Unknown error: " + e.getMessage());
    } finally {
        // Release the stream and the temporary file even when parsing failed.
        if (input != null) {
            try {
                input.close();
            } catch (IOException e) {
                logger.log(logger.LOW, "IO  exception: " + e.getMessage());
            }
        }
        if (f != null) {
            f.close();
        }
    }
}

From source file:it.polito.tellmefirst.web.rest.clients.ClientEpub.java

/**
 * Extracts the text of an EPUB file with Tika's EpubParser and cleans it up:
 * whitespace between a ">" and the following "<" is removed, the text is
 * lower-cased, everything from the first digit to the end of its line is
 * dropped, and the text is cut at the Project Gutenberg end marker.
 *
 * @param file
 *            the EPUB file to parse
 * @return the cleaned text, or an empty string if reading or parsing failed
 */
private String autoParseAll(File file) {

    InputStream input = null;
    String textBody = "";
    try {
        input = new FileInputStream(file);
        // The text handler is created with a 10 * 1024 * 1024 write limit.
        ContentHandler text = new BodyContentHandler(10 * 1024 * 1024);
        LinkContentHandler links = new LinkContentHandler();
        ContentHandler handler = new TeeContentHandler(links, text);
        Metadata metadata = new Metadata();
        EpubParser parser2 = new EpubParser();
        ParseContext context = new ParseContext();
        parser2.parse(input, handler, metadata, context);
        textBody = text.toString().replaceAll(">[\\s]*?<", "><").toLowerCase().replaceAll("\\d+.*", "");
        // Remove the Project Gutenberg meta information from the text; the
        // text is already lower case, so the marker is matched in lower case.
        textBody = textBody.split("end of the project gutenberg ebook")[0];
        LOG.debug("Body: " + textBody); //all text in one
    } catch (Exception el) {
        LOG.error("Error parsing EPUB file " + file, el);
    } finally {
        // Close the stream that was actually opened; the old code closed a
        // variable that was never assigned.
        if (input != null) {
            IOUtils.closeQuietly(input);
        }
    }
    return textBody;
}