List of usage examples for the org.apache.pdfbox.pdmodel.PDDocument constructor PDDocument(COSDocument)
public PDDocument(COSDocument doc)
From source file:eu.transkribus.languageresources.extractor.pdf.PDFExtractor.java
@Override public List<String> extractTextFromDocumentPagewise(String pathToFile) { List<String> pageWiseText = new LinkedList<>(); COSDocument cosDoc = null;//from w ww .j a v a2 s .c o m PDDocument pdDoc = null; try { PDFParser parser = new PDFParser(new FileInputStream(new File(pathToFile))); parser.parse(); cosDoc = parser.getDocument(); pdDoc = new PDDocument(cosDoc); for (int pageId = 0; pageId < pdDoc.getNumberOfPages(); pageId++) { pageWiseText.add(extractTextFromPage(pdDoc, pageId)); } } catch (IOException ex) { Logger.getLogger(PDFExtractor.class.getName()).log(Level.SEVERE, null, ex); } finally { if (pdDoc != null) { try { pdDoc.close(); } catch (IOException ex) { Logger.getLogger(PDFExtractor.class.getName()).log(Level.SEVERE, null, ex); } } if (cosDoc != null) { try { cosDoc.close(); } catch (IOException ex) { Logger.getLogger(PDFExtractor.class.getName()).log(Level.SEVERE, null, ex); } } } return pageWiseText; }
From source file:eu.transkribus.languageresources.extractor.pdf.PDFExtractor.java
/**
 * Extracts the text of a single page of a PDF file.
 *
 * <p>The file is parsed and the page is delegated to
 * {@code extractTextFromPage(PDDocument, int)}. An {@link IOException} raised while opening,
 * parsing, extracting or closing is logged at {@code SEVERE} level and not rethrown.
 *
 * @param pathToFile path of the PDF file to read
 * @param page page index, passed through unchanged to the document-based overload
 * @return the extracted text, or an empty string if extraction failed
 */
@Override
public String extractTextFromPage(String pathToFile, int page) {
    StringBuilder sb = new StringBuilder();
    COSDocument cosDoc = null;
    PDDocument pdDoc = null;
    // The method opens the stream, so the method owns it: try-with-resources releases the
    // file handle on every path, including a failure inside the parser.
    try (FileInputStream in = new FileInputStream(new File(pathToFile))) {
        PDFParser parser = new PDFParser(in);
        parser.parse();
        cosDoc = parser.getDocument();
        pdDoc = new PDDocument(cosDoc);
        sb.append(extractTextFromPage(pdDoc, page));
    } catch (IOException ex) {
        Logger.getLogger(PDFExtractor.class.getName()).log(Level.SEVERE, null, ex);
    } finally {
        if (pdDoc != null) {
            try {
                pdDoc.close();
            } catch (IOException ex) {
                Logger.getLogger(PDFExtractor.class.getName()).log(Level.SEVERE, null, ex);
            }
        }
        // Still needed when parsing succeeded but the PDDocument was never created.
        if (cosDoc != null) {
            try {
                cosDoc.close();
            } catch (IOException ex) {
                Logger.getLogger(PDFExtractor.class.getName()).log(Level.SEVERE, null, ex);
            }
        }
    }
    return sb.toString();
}
From source file:FileParser.Parser.java
public ArrayList<String> fileParser() throws IOException, SAXException { String path = file.getPath(); String[] getArray = fileChooser.getFileType(); String type = getArray[1];// w w w.j a v a2 s .com //System.out.println("Type: "+type); String fileName = getArray[0]; String fileContent = ""; file = fileChooser.getFile(); getParsedData = new ArrayList<>(); switch (type) { case "txt": try { FileReader contentReader = new FileReader(file.getPath()); bReader = new BufferedReader(contentReader); while ((fileContent = bReader.readLine()) != null) { getParsedData.add(fileContent); } } catch (FileNotFoundException ex) { Logger.getLogger(Parser.class.getName()).log(Level.SEVERE, null, ex); } break; case "html": // https://scholar.google.com.tr/scholar?hl=tr&q=ecir+u%C4%9Fur+k%C3%BC%C3%A7%C3%BCksille&btnG=&lr= String url = ""; try { if (url.isEmpty()) { Document doc = Jsoup.parse(file, null); fileContent = doc.text(); getParsedData.add(fileContent); } else { Document doc = Jsoup.connect(url).get(); Elements elements = doc.select("div.gs_r"); for (Element div : elements) { fileContent += div.text(); } getParsedData.add(fileContent); } } catch (Exception e) { e.printStackTrace(); } break; case "pdf": try { inputStream = new FileInputStream(file); parser = new PDFParser(inputStream); parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdfStripper.setStartPage(1); pdfStripper.setEndPage(2); fileContent = pdfStripper.getText(pdDoc); getParsedData.add(fileContent); } catch (Exception e) { e.printStackTrace(); } break; case "doc": try { FileInputStream fis = new FileInputStream(file.getAbsolutePath()); HWPFDocument document = new HWPFDocument(fis); WordExtractor extractor = new WordExtractor(document); fileContent = extractor.getText(); getParsedData.add(fileContent); } catch (Exception e) { e.printStackTrace(); } break; case "xml": /* parsing xml file path /home/burakcan/Desktop/eurofxref.xml */ try { 
DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance(); DocumentBuilder dBuilder = dbFactory.newDocumentBuilder(); Document doc = dBuilder.parse(file); doc.getDocumentElement().normalize(); Element firstCube = (Element) doc.getElementsByTagName("Cube").item(0); Element secondCube = (Element) firstCube.getElementsByTagName("Cube").item(0); NodeList nList = doc.getElementsByTagName("Cube"); for (int i = 0; i < nList.getLength(); i++) { Node nNode = nList.item(i); Element eElement = (Element) nNode; getParsedData.add(eElement.getAttribute("currency") + " " + eElement.getAttribute("rate")); } } catch (ParserConfigurationException ex) { Logger.getLogger(Parser.class.getName()).log(Level.SEVERE, null, ex); } break; default: JOptionPane.showMessageDialog(null, "This program can not parse your choice!", "Program Error", JOptionPane.ERROR_MESSAGE); System.exit(0); } return getParsedData; }
From source file:hrpod.tools.PDFTools.java
/**
 * Extracts the text of all pages of a PDF supplied as a stream.
 *
 * <p>The stream is handed to {@code getParser(InputStream)}. Any exception is logged and
 * results in a {@code null} return. The parsed document is always closed before returning;
 * the caller's stream is not closed here.
 *
 * @param inputStream the PDF content
 * @return the extracted text, or {@code null} if extraction failed
 */
public String getStringFromPDF(InputStream inputStream) {
    String text = null;
    COSDocument cosDoc = null;
    PDDocument pdDoc = null;
    try {
        cosDoc = getParser(inputStream).getDocument();
        PDFTextStripper pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        pdfStripper.setStartPage(1);
        pdfStripper.setEndPage(pdDoc.getNumberOfPages());
        text = pdfStripper.getText(pdDoc);
    } catch (Exception ex) {
        logger.error("ERROR", ex);
    } finally {
        try {
            // Closing the PDDocument also closes its COSDocument; fall back to the
            // COSDocument only when the PDDocument was never created.
            if (pdDoc != null) {
                pdDoc.close();
            } else if (cosDoc != null) {
                cosDoc.close();
            }
        } catch (Exception ex) {
            logger.error("ERROR", ex);
        }
    }
    return text;
}
From source file:hrpod.tools.PDFTools.java
/**
 * Extracts the text of a PDF supplied as a stream, one array element per page.
 *
 * <p>Element {@code i} holds the text of page {@code i + 1}. Any exception is logged; the
 * array is then returned as filled so far, or {@code null} if the failure happened before
 * the page count was known. The parsed document is always closed before returning; the
 * caller's stream is not closed here.
 *
 * @param inputStream the PDF content
 * @return the per-page texts, or {@code null} if the document could not be opened
 */
public String[] getPagesFromPDF(InputStream inputStream) {
    String[] pages = null;
    COSDocument cosDoc = null;
    PDDocument pdDoc = null;
    try {
        cosDoc = getParser(inputStream).getDocument();
        PDFTextStripper pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        int pagesCount = pdDoc.getNumberOfPages();
        pages = new String[pagesCount];
        // Stripper page numbers are 1-based and the end page is inclusive, so a single page
        // is selected with start == end, and the loop must include the last page.
        for (int p = 1; p <= pagesCount; p++) {
            pdfStripper.setStartPage(p);
            pdfStripper.setEndPage(p);
            pages[p - 1] = pdfStripper.getText(pdDoc);
        }
    } catch (IOException e) {
        logger.error("IO ERROR", e);
    } catch (Exception ex) {
        logger.error("ERROR", ex);
    } finally {
        try {
            // Closing the PDDocument also closes its COSDocument; fall back to the
            // COSDocument only when the PDDocument was never created.
            if (pdDoc != null) {
                pdDoc.close();
            } else if (cosDoc != null) {
                cosDoc.close();
            }
        } catch (IOException e) {
            logger.error("IO ERROR", e);
        }
    }
    return pages;
}
From source file:indexer.Indexer.java
public static String getPDF(String fileLoc, int pageNumber) { PDDocument pdf = null;//from w ww .ja v a 2 s . com String parsedText = null; COSDocument cosDoc = null; //BufferedWriter br = null; try { File inputPDF = new File(fileLoc); PDFParser parser = new PDFParser(new FileInputStream(inputPDF)); parser.parse(); cosDoc = parser.getDocument(); pdf = new PDDocument(cosDoc); PDFTextStripper stripper = new PDFTextStripper(); stripper.setStartPage(pageNumber); stripper.setEndPage(pageNumber); //br = new BufferedWriter( new OutputStreamWriter(null)); //stripper.writeText(pdf, br); parsedText = stripper.getText(pdf); pdf.close(); } catch (IOException ex) { ex.printStackTrace(); } return parsedText; }
From source file:insight.masters.policyanalytics.services.BranchingOriginStanfordKeywords.java
public static String readfrompdf(String datsetspath, String Document) { /**/* w w w .j ava 2s .c o m*/ * 1 POlicy text Policy Aspects */ PDFParser parser = null; PDDocument pdDoc = null; COSDocument cosDoc = null; PDFTextStripper pdfStripper; String parsedText; String policytext = null; File file = new File(datsetspath + Document); try { parser = new PDFParser(new FileInputStream(file)); parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); parsedText = pdfStripper.getText(pdDoc); parsedText.replaceAll("[^A-Za-z0-9. ]+", ""); policytext = parsedText; // System.out.println(policytext); } catch (Exception e) { e.printStackTrace(); try { if (cosDoc != null) cosDoc.close(); if (pdDoc != null) pdDoc.close(); } catch (Exception e1) { e.printStackTrace(); } } return policytext; }
From source file:it.polito.tellmefirst.parsing.PDFparser.java
License:Open Source License
public String pdfToText(File file) throws TMFVisibleException { LOG.debug("[pdfToText] - BEGIN"); String result;// w w w. j av a2s . c om if (!file.isFile()) { throw new TMFVisibleException("File in input is actually not a file."); } try { PDFParser parser = new PDFParser(new FileInputStream(file)); parser.parse(); COSDocument cosDoc = parser.getDocument(); PDFTextStripper pdfStripper = new PDFTextStripper(); PDDocument pdDoc = new PDDocument(cosDoc); //pdfStripper.setStartPage(1); //pdfStripper.setEndPage(5); // remove syllabification String parsedTextWithWrap = pdfStripper.getText(pdDoc); result = parsedTextWithWrap.replace("-\n", ""); if (cosDoc != null) cosDoc.close(); if (pdDoc != null) pdDoc.close(); } catch (Exception e) { LOG.error("[pdfToText] - EXCEPTION: ", e); throw new TMFVisibleException("Problem parsing file: the PDF document you uploaded seems malformed."); } LOG.debug("[pdfToText] - END"); return result; }
From source file:javaapplication1.PDFManager.java
public Map<String, String> ToText() throws IOException { this.pdfStripper = null; this.pdDoc = null; this.cosDoc = null; file = new File(filePath); parser = new PDFParser(new RandomAccessFile(file, "r")); // update for PDFBox V 2.0 parser.parse();//from w w w . j av a2s. co m cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdDoc.getNumberOfPages(); // if you want to get text from full pdf file use this code // pdfStripper.setEndPage(pdDoc.getNumberOfPages()); // if you want specific number of pages pdfStripper.setStartPage(1); pdfStripper.setEndPage(1); Text = pdfStripper.getText(pdDoc); System.out.println(Text); // spilt String[] result = Text.split("\n"); Map<String, String> map = new HashMap<String, String>(); try { for (int j = 0; j < result.length; j++) { if (result[j].contains("Type")) { String x = result[j].substring(5); map.put("Type", x); } else if (result[j].contains("Document Number")) { String x = result[j].substring(16); map.put("Document Number", x); } else if (result[j].contains("Date of Birth")) { String x = result[j].substring(14); map.put("Date of Birth", x); } else if (result[j].contains("Date of Expiry")) { String x = result[j].substring(15); map.put("Date of Expiry", x); } else if (result[j].contains("Issuer")) { String x = result[j].substring(7); map.put("Issuer", x); } else if (result[j].contains("Nationality")) { String x = result[j].substring(12); map.put("Nationality", x); } else if (result[j].contains("First Names")) { String x = result[j].substring(12); map.put("First Names", x); } else if (result[j].contains("Last Names")) { String x = result[j].substring(11); map.put("Last Names", x); } else if (result[j].contains("Discretionary 1")) { String x = result[j].substring(16); map.put("Discretionary 1", x); } // else if (result[j].contains("Discretionary 2")) // { // // String x = result[j].substring(16); // map.put("Discretionary 2", x); // // } else if (result[j].contains("Gender")) { 
String x = result[j].substring(7); map.put("Gender", x); } } } catch (Exception e) { JOptionPane.showMessageDialog(null, "please selecet OCR PDF", "worng pass", JOptionPane.ERROR_MESSAGE); } return map; }
From source file:javadocofflinesearch.htmlprocessing.PdfAttempter.java
public String pdftoText(InputStream is, boolean stats) throws IOException { PDDocument pdDoc = null;//from w w w .j av a 2s .com COSDocument cosDoc = null; try { PDFParser parser = new PDFParser(is); parser.parse(); cosDoc = parser.getDocument(); PDFTextStripper pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); String text = pdfStripper.getText(pdDoc); if (stats) { vc.addAll(text); } return text; } finally { if (cosDoc != null) { cosDoc.close(); } if (pdDoc != null) { pdDoc.close(); } } }