Example usage for org.xml.sax ContentHandler toString

List of usage examples for org.xml.sax ContentHandler toString

Introduction

On this page you can find example usages of the toString() method of org.xml.sax.ContentHandler.

Prototype

public String toString() 

Source Link

Document

Returns a string representation of the object.

Usage

From source file:de.mpg.escidoc.services.extraction.ExtractionChain.java

/**
 * Extracts the text of a PDF file into a text file, trying several extractors in turn.
 * <p>
 * The extractors are tried in this order: the external xPDF {@code pdftotext} tool, the PDFBox
 * command-line application, iText and finally Tika. The first extractor that succeeds ends the
 * method; a failing extractor is logged and the next one is tried.
 *
 * @param infileName  path of the PDF file to read
 * @param outfileName path of the text file to write
 * @return {@code ExtractionResult.OK} as soon as one extractor succeeds, otherwise
 *         {@code ExtractionResult.FAILURE}
 */
public ExtractionResult doExtract(String infileName, String outfileName) {
    File outfile = new File(outfileName);
    boolean windows = System.getProperty("os.name").contains("Windows");

    Date stepStart = new Date();
    Date current;

    logger.info("Extracting PDF content ----------------------------------------");
    logger.info("Infile: " + infileName);
    logger.info("Outfile: " + outfileName);

    logger.info(stepStart + " -- started");

    // xPDF
    try {
        logger.info("Extracting with xPDF");

        // Arguments are passed as an array so that file names containing spaces stay intact.
        // NOTE(review): assumes pdftotext holds only the executable path, no extra arguments.
        String[] command = { windows ? pdftotext : "/usr/bin/pdftotext", "-enc", "UTF-8", infileName,
                outfileName };

        if (runExtractorProcess(command, "xPDF") == 0) {
            logExtractedTextIfVerbose(outfile);

            current = new Date();
            logger.info(current + " -- finished successfully");
            logger.info("Extraction took " + (current.getTime() - stepStart.getTime()));

            return ExtractionResult.OK;
        }
    } catch (Exception e) {
        // Pass the exception itself; logging e.getStackTrace() only prints an array identity.
        logger.warn("Error extracting PDF with xPDF:", e);
    }

    current = new Date();
    logger.info(current + " -- finished unsuccessfully");
    logger.info("Extraction attempt took " + (current.getTime() - stepStart.getTime()));

    // PDFBox
    try {
        logger.info("Extracting with PDFBox");
        stepStart = new Date();

        // NOTE(review): assumes pdfboxAppJar holds only the jar path, no extra arguments.
        String[] command = { windows ? "java" : "/usr/bin/java", "-Dfile.encoding=UTF-8", "-jar",
                pdfboxAppJar, "ExtractText", infileName, outfileName };

        if (runExtractorProcess(command, "PDFBox") == 0) {
            logExtractedTextIfVerbose(outfile);

            current = new Date();
            logger.info(current + " -- finished successfully");
            logger.info("Extraction took " + (current.getTime() - stepStart.getTime()));

            return ExtractionResult.OK;
        }
    } catch (Exception e) {
        logger.warn("Error extracting PDF with PDFBox:", e);
    }

    current = new Date();
    logger.info(current + " -- finished unsuccessfully");
    logger.info("Extraction attempt took " + (current.getTime() - stepStart.getTime()));

    // iText
    try {
        logger.info("Extracting with iText");
        stepStart = new Date();

        PdfReader reader = new PdfReader(infileName);
        try {
            int numberOfPages = reader.getNumberOfPages();

            outputStreamWriter = new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8");
            try {
                // iText page numbers are 1-based.
                for (int page = 1; page <= numberOfPages; page++) {
                    outputStreamWriter.write(PdfTextExtractor.getTextFromPage(reader, page));
                }
            } finally {
                // Closing flushes the buffered text; without it the file may be incomplete
                // when it is read back below or by the caller.
                outputStreamWriter.close();
            }
        } finally {
            reader.close();
        }

        logExtractedTextIfVerbose(outfile);

        current = new Date();
        logger.info(current + " -- finished successfully");
        logger.info("Extraction took " + (current.getTime() - stepStart.getTime()));

        return ExtractionResult.OK;

    } catch (Exception e) {
        logger.warn("Error extracting PDF with iText:", e);
    }

    // tika
    InputStream stream = null;

    try {
        logger.info("Extracting with Tika");
        stepStart = new Date();

        stream = TikaInputStream.get(new File(infileName));

        ContentHandler handler = new BodyContentHandler(TIKA_CONTENT_SIZE);

        new AutoDetectParser().parse(stream, handler, new Metadata(), new ParseContext());

        String content = handler.toString();

        // Write explicitly as UTF-8 so the file matches the other extractors and the
        // UTF-8 reader used for verbose output.
        FileUtils.writeStringToFile(outfile, content, "UTF-8");

        logExtractedTextIfVerbose(outfile);

        current = new Date();
        logger.info(current + " -- finished successfully");
        logger.info("Extraction took " + (current.getTime() - stepStart.getTime()));

        return ExtractionResult.OK;

    } catch (Exception e) {
        logger.warn("Error extracting Tika:", e);
    } finally {
        // stream is still null when TikaInputStream.get itself failed.
        if (stream != null) {
            try {
                stream.close();
            } catch (IOException closeError) {
                logger.warn("Error closing Tika input stream:", closeError);
            }
        }
    }

    current = new Date();
    logger.warn(current + " -- finished unsuccessfully");
    logger.info("Extraction attempt took " + (current.getTime() - stepStart.getTime()));

    logger.info("... giving up");

    return ExtractionResult.FAILURE;
}

/**
 * Starts an external extractor, hands its stdout and stderr to stream gobblers and waits for
 * the process to end.
 *
 * @param command  the executable followed by its arguments, one array element each
 * @param toolName label passed to the stream gobblers
 * @return the exit code of the process
 * @throws Exception if the process cannot be started or waiting for it is interrupted
 */
private int runExtractorProcess(String[] command, String toolName) throws Exception {
    Process proc = Runtime.getRuntime().exec(command);

    StreamGobbler inputGobbler = new StreamGobbler(proc.getInputStream(), toolName);
    StreamGobbler errorGobbler = new StreamGobbler(proc.getErrorStream(), toolName);

    inputGobbler.start();
    errorGobbler.start();

    return proc.waitFor();
}

/**
 * Logs the extracted text line by line at info level when verbose mode is on.
 *
 * @param outfile the UTF-8 text file produced by an extractor
 * @throws IOException if the file cannot be read
 */
private void logExtractedTextIfVerbose(File outfile) throws IOException {
    if (!verbose) {
        return;
    }
    BufferedReader bufferedReader = new BufferedReader(
            new InputStreamReader(new FileInputStream(outfile), "UTF-8"));
    try {
        String line;
        while ((line = bufferedReader.readLine()) != null) {
            logger.info(line);
        }
    } finally {
        bufferedReader.close();
    }
}

From source file:com.bah.applefox.main.plugins.fulltextindex.FTLoader.java

/**
 * This method is used to add all information parsed by tika into the
 * Accumulo table/*from  w ww.ja v  a 2 s .co m*/
 * 
 * @param url
 *            - the URL of the page that has been parsed
 * @param tikaParsed
 *            - all of the engrams from the page
 * @throws TikaException
 * @throws SAXException
 */
private static boolean addToDataBaseTable(String url) {
    try {
        // Connect to the data table
        BatchWriter writer = AccumuloUtils.connectBatchWrite(dTable);

        // Let the user know the url is being added
        System.out.println("Adding " + url + " with prefix " + longSuffix);

        // Get the input stream (in case it is not an html document
        InputStream urlInput = new URL(url).openStream();

        // Set the page contents (used for filtering if it is an html
        // document)
        String pageContents = getPageContents(new URL(url));

        // If the document is HTML
        if (exDivs.size() != 0 && pageContents.toLowerCase().contains("<html>")) {
            // Filter out some divs (especially generic headers/footers,
            // etc.)
            pageContents = DivsFilter.filterDivs(pageContents, exDivs);
            urlInput = new ByteArrayInputStream(pageContents.getBytes());
        }

        // Parse with tika
        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();
        ContentHandler handler = new BodyContentHandler();

        parser.parse(urlInput, handler, metadata, context);

        // Get the keywords of the page and its title
        String keywords = metadata.get("keywords");
        String title = metadata.get("title");
        if (title == null) {
            WebPageCrawl p;
            try {
                p = new WebPageCrawl(url, "", Collections.<String>emptySet());
            } catch (PageCrawlException e) {
                log.info(e);
                return false;
            }
            title = p.getTitle();
        }

        // If there are keywords, delimit the commas, otherwise make it a
        // blank screen (not null)
        if (keywords != null) {
            keywords = keywords.replaceAll(",", "[ ]");
        } else {
            keywords = "";
        }

        // Make everything lower case for ease of search
        String plainText = handler.toString().toLowerCase();

        // Split it into <Key,Value> pairs of NGrams, with the Value being
        // the count of the NGram on the page
        HashMap<String, Integer> tikaParsed = IngestUtils
                .collectTerms(IngestUtils.createNGrams(plainText, maxNGrams));

        // A counter for the final number of words
        Integer totalWords = 0;

        // A HashMap for the final NGrams
        HashMap<String, Integer> finalParsed = new HashMap<String, Integer>();

        for (String i : tikaParsed.keySet()) {
            int freq = tikaParsed.get(i);
            totalWords += freq;
            // erase stop words
            if (stopWords != null && !stopWords.contains(i)) {
                finalParsed.put(i, tikaParsed.get(i));
            } else if (stopWords == null) {
                finalParsed.put(i, tikaParsed.get(i));
            }
        }

        System.out.println("Tika Parsed: " + finalParsed.keySet().size());
        System.out.println("Starting");
        int counter = 0;

        String namedURL = url + "[ ]" + title + "[ ]" + keywords;

        for (String row : finalParsed.keySet()) {
            row = row + " " + longSuffix;
            for (String CQ : finalParsed.keySet()) {
                String groupedVal = new String();
                Integer wc = finalParsed.get(CQ);
                double freq = wc.doubleValue() / totalWords.doubleValue();
                groupedVal = wc + "," + freq;
                Value val = new Value(IngestUtils.serialize(groupedVal));

                Mutation m = new Mutation(row);
                m.put(namedURL, CQ, new Date().getTime(), val);
                writer.addMutation(m);
                counter++;
            }

        }

        System.out.println("Wrote " + counter + " Key-Value pairs to Accumulo.");

        writer.close();
        System.out.println("Stopped writing");
    } catch (AccumuloException e) {
        if (e.getMessage() != null) {
            log.error(e.getMessage());
        } else {
            log.error(e.getStackTrace());
        }
        return false;
    } catch (AccumuloSecurityException e) {
        if (e.getMessage() != null) {
            log.error(e.getMessage());
        } else {
            log.error(e.getStackTrace());
        }
        return false;
    } catch (TableNotFoundException e) {
        if (e.getMessage() != null) {
            log.error(e.getMessage());
        } else {
            log.error(e.getStackTrace());
        }
        return false;
    } catch (TableExistsException e) {
        if (e.getMessage() != null) {
            log.error(e.getMessage());
        } else {
            log.error(e.getStackTrace());
        }
        return false;
    } catch (MalformedURLException e) {
        if (e.getMessage() != null) {
            log.error(e.getMessage());
        } else {
            log.error(e.getStackTrace());
        }
    } catch (IOException e) {
        if (e.getMessage() != null) {
            log.error(e.getMessage());
        } else {
            log.error(e.getStackTrace());
        }
        return false;
    } catch (SAXException e) {
        if (e.getMessage() != null) {
            log.error(e.getMessage());
        } else {
            log.error(e.getStackTrace());
        }
        return false;
    } catch (TikaException e) {
        if (e.getMessage() != null) {
            log.error(e.getMessage());
        } else {
            log.error(e.getStackTrace());
        }
        return false;
    }
    return true;
}

From source file:de.u808.simpleinquest.indexer.impl.DefaultIndexer.java

/**
 * Builds an index document for a file, using Tika to extract its text content.
 * <p>
 * The document gets the file path, the MD5 hex digest of the path as id, the last-modified
 * time (minute resolution) and the extracted text.
 *
 * @param file the file to index
 * @return the populated document; a document without fields if Tika reports that the file is
 *         encrypted; {@code null} if the file cannot be read
 * @throws FileNotFoundException declared for callers; in this method a failure to open the file
 *                               is caught by the general handler and reported as
 *                               {@link IndexerException}
 * @throws IndexerException      if parsing or building the document fails for any reason other
 *                               than an encrypted document
 */
public Document indexFile(File file) throws FileNotFoundException, IndexerException {
    Document document = null;
    if (file.canRead()) {
        InputStream input = null;
        try {
            document = new Document();
            input = new FileInputStream(file);

            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            new AutoDetectParser().parse(input, handler, metadata);

            document.add(new Field(Indexer.PATH_FIELD_NAME, file.getPath(), Field.Store.YES,
                    Field.Index.UN_TOKENIZED));
            document.add(new Field(Indexer.ID_FIELD_NAME, MD5Util.getInstance().getMD5Hex(file.getPath()),
                    Field.Store.YES, Field.Index.UN_TOKENIZED));

            document.add(new Field(Indexer.MODIFIED_FIELD_NAME,
                    DateTools.timeToString(file.lastModified(), DateTools.Resolution.MINUTE), Field.Store.YES,
                    Field.Index.UN_TOKENIZED));

            document.add(new Field(Indexer.CONTENT_FIELD_NAME, new StringReader(handler.toString())));
        } catch (Exception e) {
            if (e instanceof TikaException) {
                Throwable t = e.getCause();
                if (t != null && t.getMessage() != null) {
                    if (t.getMessage().startsWith("Error decrypting document")) {
                        // Encrypted files are skipped rather than treated as an error.
                        log.debug("Cant index encrypted document.");
                        return document;
                    }
                }
            }
            throw new IndexerException(e.getMessage(), e);
        } finally {
            // Always release the file handle, including on the early return and the rethrow.
            if (input != null) {
                try {
                    input.close();
                } catch (java.io.IOException closeError) {
                    log.debug("Cant close file: " + file.getName());
                }
            }
        }
    } else {
        log.debug("Cant read file: " + file.getName());
    }
    return document;
}

From source file:uk.ac.kcl.itemProcessors.TikaDocumentItemProcessor.java

/**
 * Runs the Tika parser over the binary content of the document and stores the result on it.
 * <p>
 * Depending on {@code keepTags} the text is collected as XML markup or as plain body text.
 * OCR, content-type and page-count metadata are extracted along the way. If anything fails,
 * the exception message is stored in place of the parsed text.
 *
 * @param doc the document whose binary content is parsed
 * @return the same document instance, after the field has been added
 * @throws Exception declared by the processor contract
 */
@Override
public Document process(final Document doc) throws Exception {
    final String processorName = this.getClass().getSimpleName();
    LOG.debug("starting " + processorName + " on doc " + doc.getDocName());

    final long startedAt = System.currentTimeMillis();
    final ContentHandler handler = keepTags ? new ToXMLContentHandler() : new BodyContentHandler();
    final Metadata metadata = new Metadata();

    // Reported in the timing log line below; stays at this value when parsing fails early.
    String contentType = "TL_CONTENT_TYPE_UNKNOWN";

    try (InputStream stream = new ByteArrayInputStream(doc.getBinaryContent())) {
        final ParseContext context = new ParseContext();
        context.set(TikaConfig.class, config);
        parser.parse(stream, handler, metadata, context);

        final Set<String> metaKeys = new HashSet<String>(Arrays.asList(metadata.names()));
        extractOCRMetadata(doc, metaKeys, metadata);
        contentType = extractContentTypeMetadata(doc, metaKeys, metadata);
        extractPageCountMetadata(doc, metaKeys, metadata);

        addField(doc, handler.toString());
    } catch (Exception ex) {
        // The failure message is stored as the field value instead of the parsed text.
        addField(doc, ex.getMessage());
    }

    final long elapsedMillis = System.currentTimeMillis() - startedAt;
    LOG.debug("{};Content-Type:{};Time:{} ms", processorName, contentType, elapsedMillis);
    LOG.debug("finished " + processorName + " on doc " + doc.getDocName());
    return doc;
}

From source file:cx.fbn.nevernote.threads.IndexRunner.java

/**
 * Indexes the text content of an RTF resource.
 * <p>
 * The resource data is written to a temporary file, parsed with {@code RTFParser}, split with
 * the {@code regex} field, and every resulting piece is passed to {@code addToIndex} under the
 * note GUID of the resource. Parsing problems are logged and otherwise ignored.
 *
 * @param r the resource whose content should be indexed
 */
private void indexResourceRTF(Resource r) {

    Data d = r.getData();
    // Re-read the data up to 20 times while it is still reported as empty.
    for (int i = 0; i < 20 && d.getSize() == 0; i++) {
        d = r.getData();
    }
    if (d.getSize() == 0) {
        return;
    }

    QTemporaryFile f = writeResource(d);
    if (!keepRunning) {
        return;
    }

    InputStream input = null;
    try {
        input = new FileInputStream(new File(f.fileName()));
        ContentHandler textHandler = new BodyContentHandler(-1);
        Metadata metadata = new Metadata();
        RTFParser parser = new RTFParser();
        ParseContext context = new ParseContext();
        parser.parse(input, textHandler, metadata, context);
        String[] result = textHandler.toString().split(regex);
        for (int i = 0; i < result.length && keepRunning; i++) {
            // Same interrupt handling as the other indexResource* methods of this class.
            if (interrupt) {
                processInterrupt();
            }
            addToIndex(r.getNoteGuid(), result[i], "RESOURCE");
        }
    } catch (java.lang.ClassCastException e) {
        logger.log(logger.LOW, "Cast exception: " + e.getMessage());
    } catch (FileNotFoundException e) {
        logger.log(logger.LOW, "FileNotFound  exception: " + e.getMessage());
    } catch (IOException e) {
        logger.log(logger.LOW, "IO  exception: " + e.getMessage());
    } catch (SAXException e) {
        logger.log(logger.LOW, "SAX  exception: " + e.getMessage());
    } catch (TikaException e) {
        logger.log(logger.LOW, "Tika  exception: " + e.getMessage());
    } catch (Exception e) {
        logger.log(logger.LOW, "Unknown  exception: " + e.getMessage());
    } catch (java.lang.NoSuchMethodError e) {
        logger.log(logger.LOW, "NoSuchMethod error: " + e.getMessage());
    } catch (Error e) {
        logger.log(logger.LOW, "Unknown error: " + e.getMessage());
    } finally {
        // Release the stream and the temporary file even when parsing or indexing failed.
        if (input != null) {
            try {
                input.close();
            } catch (IOException closeError) {
                logger.log(logger.LOW, "IO  exception: " + closeError.getMessage());
            }
        }
        if (f != null) {
            f.close();
        }
    }
}

From source file:cx.fbn.nevernote.threads.IndexRunner.java

/**
 * Indexes the text content of a PDF resource.
 * <p>
 * The resource data is written to a temporary file, parsed with {@code PDFParser}, split with
 * the {@code regex} field, and every resulting piece is passed to {@code addToIndex} under the
 * note GUID of the resource. Parsing problems are logged and otherwise ignored.
 *
 * @param r the resource whose content should be indexed
 */
private void indexResourcePDF(Resource r) {

    Data d = r.getData();
    // Re-read the data up to 20 times while it is still reported as empty.
    for (int i = 0; i < 20 && d.getSize() == 0; i++) {
        d = r.getData();
    }
    if (d.getSize() == 0) {
        return;
    }
    QTemporaryFile f = writeResource(d);
    if (!keepRunning) {
        return;
    }

    InputStream input = null;
    try {
        input = new FileInputStream(new File(f.fileName()));
        ContentHandler textHandler = new BodyContentHandler(-1);
        Metadata metadata = new Metadata();
        PDFParser parser = new PDFParser();
        ParseContext context = new ParseContext();
        parser.parse(input, textHandler, metadata, context);
        String[] result = textHandler.toString().split(regex);
        for (int i = 0; i < result.length && keepRunning; i++) {
            if (interrupt) {
                processInterrupt();
            }
            addToIndex(r.getNoteGuid(), result[i], "RESOURCE");
        }
    } catch (java.lang.ClassCastException e) {
        logger.log(logger.LOW, "Cast exception: " + e.getMessage());
    } catch (FileNotFoundException e) {
        logger.log(logger.LOW, "FileNotFound  exception: " + e.getMessage());
    } catch (IOException e) {
        logger.log(logger.LOW, "IO  exception: " + e.getMessage());
    } catch (SAXException e) {
        logger.log(logger.LOW, "SAX  exception: " + e.getMessage());
    } catch (TikaException e) {
        logger.log(logger.LOW, "Tika  exception: " + e.getMessage());
    } catch (Exception e) {
        logger.log(logger.LOW, "Unknown  exception: " + e.getMessage());
    } catch (java.lang.NoSuchMethodError e) {
        logger.log(logger.LOW, "NoSuchMethod error: " + e.getMessage());
    } catch (Error e) {
        logger.log(logger.LOW, "Unknown error: " + e.getMessage());
    } finally {
        // Release the stream and the temporary file even when parsing or indexing failed.
        if (input != null) {
            try {
                input.close();
            } catch (IOException closeError) {
                logger.log(logger.LOW, "IO  exception: " + closeError.getMessage());
            }
        }
        if (f != null) {
            f.close();
        }
    }
}

From source file:cx.fbn.nevernote.threads.IndexRunner.java

/**
 * Indexes the text content of an OOXML resource.
 * <p>
 * The resource data is written to a temporary file, parsed with {@code OOXMLParser}, split with
 * the {@code regex} field, and every resulting piece is passed to {@code addToIndex} under the
 * note GUID of the resource. Parsing problems are logged and otherwise ignored.
 *
 * @param r the resource whose content should be indexed
 */
private void indexResourceOOXML(Resource r) {

    Data d = r.getData();
    // Re-read the data up to 20 times while it is still reported as empty.
    for (int i = 0; i < 20 && d.getSize() == 0; i++) {
        d = r.getData();
    }
    if (d.getSize() == 0) {
        return;
    }
    QTemporaryFile f = writeResource(d);
    if (!keepRunning) {
        return;
    }

    InputStream input = null;
    try {
        input = new FileInputStream(new File(f.fileName()));
        ContentHandler textHandler = new BodyContentHandler(-1);
        Metadata metadata = new Metadata();
        OOXMLParser parser = new OOXMLParser();
        ParseContext context = new ParseContext();
        parser.parse(input, textHandler, metadata, context);
        String[] result = textHandler.toString().split(regex);
        for (int i = 0; i < result.length && keepRunning; i++) {
            if (interrupt) {
                processInterrupt();
            }
            addToIndex(r.getNoteGuid(), result[i], "RESOURCE");
        }
    } catch (java.lang.ClassCastException e) {
        logger.log(logger.LOW, "Cast exception: " + e.getMessage());
    } catch (FileNotFoundException e) {
        logger.log(logger.LOW, "FileNotFound  exception: " + e.getMessage());
    } catch (IOException e) {
        logger.log(logger.LOW, "IO  exception: " + e.getMessage());
    } catch (SAXException e) {
        logger.log(logger.LOW, "SAX  exception: " + e.getMessage());
    } catch (TikaException e) {
        logger.log(logger.LOW, "Tika  exception: " + e.getMessage());
    } catch (Exception e) {
        logger.log(logger.LOW, "Unknown  exception: " + e.getMessage());
    } catch (java.lang.NoSuchMethodError e) {
        logger.log(logger.LOW, "NoSuchMethod error: " + e.getMessage());
    } catch (Error e) {
        logger.log(logger.LOW, "Unknown error: " + e.getMessage());
    } finally {
        // Release the stream and the temporary file even when parsing or indexing failed.
        if (input != null) {
            try {
                input.close();
            } catch (IOException closeError) {
                logger.log(logger.LOW, "IO  exception: " + closeError.getMessage());
            }
        }
        if (f != null) {
            f.close();
        }
    }
}

From source file:cx.fbn.nevernote.threads.IndexRunner.java

/**
 * Indexes the text content of an Office resource.
 * <p>
 * The resource data is written to a temporary file, parsed with {@code OfficeParser}, split
 * with the {@code regex} field, and every resulting piece is passed to {@code addToIndex} under
 * the note GUID of the resource. Parsing problems are logged and otherwise ignored.
 *
 * @param r the resource whose content should be indexed
 */
private void indexResourceOffice(Resource r) {

    Data d = r.getData();
    // Re-read the data up to 20 times while it is still reported as empty.
    for (int i = 0; i < 20 && d.getSize() == 0; i++) {
        d = r.getData();
    }
    if (d.getSize() == 0) {
        return;
    }
    QTemporaryFile f = writeResource(d);
    if (!keepRunning) {
        return;
    }

    InputStream input = null;
    try {
        input = new FileInputStream(new File(f.fileName()));
        ContentHandler textHandler = new BodyContentHandler(-1);
        Metadata metadata = new Metadata();
        OfficeParser parser = new OfficeParser();
        ParseContext context = new ParseContext();
        parser.parse(input, textHandler, metadata, context);
        String[] result = textHandler.toString().split(regex);
        for (int i = 0; i < result.length && keepRunning; i++) {
            if (interrupt) {
                processInterrupt();
            }
            addToIndex(r.getNoteGuid(), result[i], "RESOURCE");
        }
    } catch (java.lang.ClassCastException e) {
        logger.log(logger.LOW, "Cast exception: " + e.getMessage());
    } catch (FileNotFoundException e) {
        logger.log(logger.LOW, "FileNotFound  exception: " + e.getMessage());
    } catch (IOException e) {
        logger.log(logger.LOW, "IO  exception: " + e.getMessage());
    } catch (SAXException e) {
        logger.log(logger.LOW, "SAX  exception: " + e.getMessage());
    } catch (TikaException e) {
        logger.log(logger.LOW, "Tika  exception: " + e.getMessage());
    } catch (Exception e) {
        logger.log(logger.LOW, "Unknown  exception: " + e.getMessage());
    } catch (java.lang.NoSuchMethodError e) {
        logger.log(logger.LOW, "NoSuchMethod error: " + e.getMessage());
    } catch (Error e) {
        logger.log(logger.LOW, "Unknown error: " + e.getMessage());
    } finally {
        // Release the stream and the temporary file even when parsing or indexing failed.
        if (input != null) {
            try {
                input.close();
            } catch (IOException closeError) {
                logger.log(logger.LOW, "IO  exception: " + closeError.getMessage());
            }
        }
        if (f != null) {
            f.close();
        }
    }
}

From source file:cx.fbn.nevernote.threads.IndexRunner.java

/**
 * Indexes the text content of an OpenDocument resource.
 * <p>
 * The resource data is written to a temporary file, parsed with {@code OpenDocumentParser},
 * split with the {@code regex} field, and every resulting piece is passed to
 * {@code addToIndex} under the note GUID of the resource. Parsing problems are logged and
 * otherwise ignored.
 *
 * @param r the resource whose content should be indexed
 */
private void indexResourceODF(Resource r) {

    Data d = r.getData();
    // Re-read the data up to 20 times while it is still reported as empty.
    for (int i = 0; i < 20 && d.getSize() == 0; i++) {
        d = r.getData();
    }
    if (d.getSize() == 0) {
        return;
    }
    QTemporaryFile f = writeResource(d);
    if (!keepRunning) {
        return;
    }

    InputStream input = null;
    try {
        input = new FileInputStream(new File(f.fileName()));
        ContentHandler textHandler = new BodyContentHandler(-1);
        Metadata metadata = new Metadata();
        OpenDocumentParser parser = new OpenDocumentParser();
        ParseContext context = new ParseContext();
        parser.parse(input, textHandler, metadata, context);
        String[] result = textHandler.toString().split(regex);
        for (int i = 0; i < result.length && keepRunning; i++) {
            if (interrupt) {
                processInterrupt();
            }
            addToIndex(r.getNoteGuid(), result[i], "RESOURCE");
        }
    } catch (java.lang.ClassCastException e) {
        logger.log(logger.LOW, "Cast exception: " + e.getMessage());
    } catch (FileNotFoundException e) {
        logger.log(logger.LOW, "FileNotFound  exception: " + e.getMessage());
    } catch (IOException e) {
        logger.log(logger.LOW, "IO  exception: " + e.getMessage());
    } catch (SAXException e) {
        logger.log(logger.LOW, "SAX  exception: " + e.getMessage());
    } catch (TikaException e) {
        logger.log(logger.LOW, "Tika  exception: " + e.getMessage());
    } catch (Exception e) {
        logger.log(logger.LOW, "Unknown  exception: " + e.getMessage());
    } catch (java.lang.NoSuchMethodError e) {
        logger.log(logger.LOW, "NoSuchMethod error: " + e.getMessage());
    } catch (Error e) {
        logger.log(logger.LOW, "Unknown error: " + e.getMessage());
    } finally {
        // Release the stream and the temporary file even when parsing or indexing failed.
        if (input != null) {
            try {
                input.close();
            } catch (IOException closeError) {
                logger.log(logger.LOW, "IO  exception: " + closeError.getMessage());
            }
        }
        if (f != null) {
            f.close();
        }
    }
}

From source file:it.polito.tellmefirst.web.rest.clients.ClientEpub.java

/**
 * Extracts the body text of an EPUB file as one lower-case string.
 * <p>
 * Whitespace between adjacent tags is removed, everything from the first digit to the end of
 * each line is dropped, and the text is cut at the Project Gutenberg end marker if present.
 *
 * @param file the EPUB file to parse
 * @return the cleaned, lower-case body text, or an empty string if parsing failed (the failure
 *         is logged)
 */
private String autoParseAll(File file) {

    InputStream input = null;
    String textBody = "";
    try {
        input = new FileInputStream(file);
        ContentHandler text = new BodyContentHandler(10 * 1024 * 1024);
        LinkContentHandler links = new LinkContentHandler();
        ContentHandler handler = new TeeContentHandler(links, text);
        Metadata metadata = new Metadata();
        EpubParser parser2 = new EpubParser();
        ParseContext context = new ParseContext();
        parser2.parse(input, handler, metadata, context);
        textBody = text.toString().replaceAll(">[\\s]*?<", "><").toLowerCase().replaceAll("\\d+.*", "");
        // Remove the Project Gutenberg meta information from the text.
        // The text is already lower case here, so the marker is matched in lower case.
        textBody = textBody.split("end of the project gutenberg ebook")[0];
        LOG.debug("Body: " + textBody); //all text in one
    } catch (Exception el) {
        LOG.error("Could not extract text from " + file, el);
    } finally {
        // Close the stream that was actually opened; previously a never-assigned variable
        // was closed and the file handle leaked.
        if (input != null) {
            IOUtils.closeQuietly(input);
        }
    }
    return textBody;
}