// Java tutorial
/* * Copyright (c) 2008, intarsys consulting GmbH * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Public License as published by the * Free Software Foundation; either version 3 of the License, * or (at your option) any later version. * <p/> * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * * by MasYes: ? ? - ? ? , ? ^^ */ package lisa; import java.io.*; import de.intarsys.pdf.parser.COSLoadException; import de.intarsys.pdf.pd.PDDocument; import de.intarsys.tools.locator.FileLocator; import de.intarsys.pdf.content.CSDeviceBasedInterpreter; import de.intarsys.pdf.content.CSException; import de.intarsys.pdf.content.text.CSTextExtractor; import de.intarsys.pdf.cos.COSVisitorException; import de.intarsys.pdf.pd.PDPage; import de.intarsys.pdf.pd.PDPageNode; import de.intarsys.pdf.pd.PDPageTree; import de.intarsys.pdf.tools.kernel.PDFGeometryTools; import java.awt.geom.AffineTransform; import java.util.Iterator; import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.xwpf.extractor.XWPFWordExtractor; import org.apache.poi.xwpf.usermodel.*; /** * Extract complete text from document. 
* <p>
* The input format is selected from the file name extension (compared
* case-insensitively): {@code .pdf}, {@code .txt}, {@code .doc} and
* {@code .docx} are handled; anything else is rejected with an
* {@code UnsupportedFormatException}.
*/
public class ExtractText {

    /** Largest page count a page tree may report before extraction is refused. */
    private static final int MAX_PAGES = 30;

    /**
     * Character a decoder substitutes for malformed input. Its presence in
     * UTF-8 decoded text is taken as a sign that the file is not UTF-8.
     */
    private static final String REPLACEMENT_CHARACTER = "\uFFFD";

    /** The PDF document currently opened by this instance, or {@code null}. */
    private PDDocument doc;

    /**
     * Extract the text of a document, choosing the parser from the file
     * extension.
     *
     * @param file
     *            The path name of the document.
     * @return The extracted text, or an empty string when the format specific
     *         parser failed (the failure is passed to
     *         {@code Common.createLog}).
     * @throws UnsupportedFormatException
     *             if the name has no extension or the extension is not one of
     *             the supported ones.
     */
    public static String parse(String file) {
        int dot = file.lastIndexOf('.');
        if (dot < 0) {
            // No extension at all: report it like any other unknown format
            // instead of failing inside substring(-1).
            throw new UnsupportedFormatException();
        }
        String extension = file.substring(dot).toLowerCase(java.util.Locale.ROOT);
        switch (extension) {
        case ".pdf":
            return parsePDF(file);
        case ".txt":
            return parseTXT(file);
        case ".doc":
            return parseDOC(file);
        case ".docx":
            return parseDOCX(file);
        default:
            throw new UnsupportedFormatException();
        }
    }

    /**
     * Extract the text of a binary Word ({@code .doc}) file.
     *
     * @param file
     *            The path name of the document.
     * @return The extracted text, or an empty string on any failure.
     */
    private static String parseDOC(String file) {
        // try-with-resources releases the file handle even when extraction fails.
        try (InputStream in = new BufferedInputStream(new FileInputStream(file))) {
            WordExtractor word = new WordExtractor(in);
            return word.getText();
        } catch (Exception e) {
            Common.createLog(e);
            return "";
        }
    }

    /**
     * Extract the text of an OOXML Word ({@code .docx}) file.
     *
     * @param file
     *            The path name of the document.
     * @return The extracted text, or an empty string on any failure.
     */
    private static String parseDOCX(String file) {
        try (InputStream in = new BufferedInputStream(new FileInputStream(file))) {
            XWPFWordExtractor word = new XWPFWordExtractor(new XWPFDocument(in));
            return word.getText();
        } catch (Exception e) {
            Common.createLog(e);
            return "";
        }
    }

    /**
     * Extract the text of a PDF file using a fresh {@code ExtractText}
     * instance.
     *
     * @param file
     *            The path name of the document.
     * @return The extracted text, or an empty string on any failure.
     */
    private static String parsePDF(String file) {
        ExtractText client = new ExtractText();
        try {
            return client.run(file);
        } catch (Exception e) {
            Common.createLog(e);
            return "";
        }
    }

    /**
     * Read a plain text file. The file is decoded as UTF-8 first; if the
     * result contains the replacement character the file is read again as
     * Windows-1251.
     *
     * @param file
     *            The path name of the document.
     * @return The file content with every line terminated by {@code "\n"}, or
     *         an empty string when reading failed.
     */
    private static String parseTXT(String file) {
        try {
            String utf8 = readLines(file, "utf-8");
            if (!utf8.contains(REPLACEMENT_CHARACTER)) {
                return utf8;
            }
            // Malformed UTF-8 sequences were replaced while decoding, so fall
            // back to the single-byte Cyrillic code page.
            return readLines(file, "Windows-1251");
        } catch (IOException e) {
            Common.createLog(e);
            return "";
        }
    }

    /**
     * Read a whole file line by line using the given charset.
     *
     * @param file
     *            The path name of the file.
     * @param charsetName
     *            The name of the charset used for decoding.
     * @return All lines, each followed by {@code "\n"}.
     * @throws IOException
     *             if the file cannot be opened or read, or the charset name is
     *             not supported.
     */
    private static String readLines(String file, String charsetName) throws IOException {
        StringBuilder text = new StringBuilder();
        // The stream is declared on its own so it is closed even if creating
        // the reader fails because of an unsupported charset name.
        try (InputStream in = new FileInputStream(file);
                BufferedReader reader = new BufferedReader(new InputStreamReader(in, charsetName))) {
            for (String line; (line = reader.readLine()) != null;) {
                text.append(line).append('\n');
            }
        }
        return text.toString();
    }

    /**
     * Append the text of every page below {@code pageTree} to {@code sb},
     * descending recursively into nested page trees. A page whose content
     * raises a {@code CSException} is logged and contributes no text.
     *
     * @param pageTree
     *            The page tree to walk.
     * @param sb
     *            The buffer receiving the extracted text.
     * @throws LargeFileException
     *             if the tree reports more than {@value #MAX_PAGES} pages.
     */
    protected void extractText(PDPageTree pageTree, StringBuilder sb) {
        if (pageTree.getCount() > MAX_PAGES) {
            throw new LargeFileException();
        }
        for (Iterator<?> it = pageTree.getKids().iterator(); it.hasNext();) {
            PDPageNode node = (PDPageNode) it.next();
            if (!node.isPage()) {
                extractText((PDPageTree) node, sb);
                continue;
            }
            try {
                CSTextExtractor extractor = new CSTextExtractor();
                PDPage page = (PDPage) node;
                AffineTransform pageTx = new AffineTransform();
                PDFGeometryTools.adjustTransform(pageTx, page);
                extractor.setDeviceTransform(pageTx);
                CSDeviceBasedInterpreter interpreter = new CSDeviceBasedInterpreter(null, extractor);
                interpreter.process(page.getContentStream(), page.getResources());
                sb.append(extractor.getContent());
            } catch (CSException e) {
                // Log through the same channel as the other parsers and go on
                // with the remaining pages.
                Common.createLog(e);
            }
        }
    }

    /**
     * Extract the text of the currently opened document.
     *
     * @param filename
     *            Not used; the text is taken from the document returned by
     *            {@link #getDoc()}, which must have been opened before.
     * @return The text of all pages.
     * @throws COSVisitorException
     * @throws IOException
     */
    protected String extractText(String filename) throws COSVisitorException, IOException {
        StringBuilder sb = new StringBuilder();
        extractText(getDoc().getPageTree(), sb);
        return sb.toString();
    }

    /**
     * Open {@code file}, extract its text and close the document again, also
     * when extraction fails.
     *
     * @param file
     *            The path name of the PDF document.
     * @return The text of all pages.
     * @throws Exception
     *             if opening, extracting or closing fails.
     */
    private String run(String file) throws Exception {
        try {
            open(file);
            return extractText(file);
        } finally {
            close();
        }
    }

    /**
     * Load a document from a path without making it the current document.
     *
     * @param pathname
     *            The path name to the document.
     * @return The loaded document.
     * @throws IOException
     * @throws COSLoadException
     */
    protected PDDocument basicOpen(String pathname) throws IOException, COSLoadException {
        FileLocator locator = new FileLocator(pathname);
        return PDDocument.createFromLocator(locator);
    }

    /**
     * Save a document to a path.
     *
     * @param doc
     *            The document to save.
     * @param outputFileName
     *            The destination path for the document.
     * @throws IOException
     */
    protected void basicSave(PDDocument doc, String outputFileName) throws IOException {
        FileLocator locator = new FileLocator(outputFileName);
        doc.save(locator, null);
    }

    /**
     * Close the current document, if there is one.
     *
     * @throws IOException
     */
    public void close() throws IOException {
        PDDocument current = getDoc();
        if (current != null) {
            current.close();
        }
    }

    /**
     * Create a new document and make it the current one.
     */
    public void create() {
        // First create a new document.
        setDoc(PDDocument.createNew());
        // You could add more information about the environment:
        getDoc().setAuthor("intarsys consulting GmbH"); //$NON-NLS-1$
        getDoc().setCreator("intarsys PDF API"); //$NON-NLS-1$
    }

    /**
     * The current document.
     *
     * @return The current document, or {@code null} if none has been set.
     */
    public PDDocument getDoc() {
        return doc;
    }

    /**
     * Open a document and make it the current one.
     *
     * @param pathname
     *            The path name to the document.
     * @throws COSLoadException
     * @throws IOException
     */
    public void open(String pathname) throws IOException, COSLoadException {
        setDoc(basicOpen(pathname));
    }

    /**
     * Save current document to path.
     *
     * @param outputFileName
     *            The destination path for the document.
     * @throws IOException
     */
    public void save(String outputFileName) throws IOException {
        basicSave(getDoc(), outputFileName);
    }

    /**
     * Set the current document.
     *
     * @param doc
     *            The new current document.
     */
    protected void setDoc(PDDocument doc) {
        this.doc = doc;
    }
}