List of usage examples for the com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy constructor SimpleTextExtractionStrategy()
public SimpleTextExtractionStrategy()
From source file:bflows.FattureManagement.java
/**
 * Extracts the text of every page of the PDF read from {@code inputStream},
 * then scans the resulting lines to fill the invoice fields ({@code data},
 * {@code totale}, {@code contributi}, {@code prodotti}, {@code altri},
 * {@code iva}) and the per-line usage list {@code consumi}.
 *
 * <p>Amounts are parsed by removing "." and turning "," into "." before
 * {@link Double#parseDouble(String)}, i.e. the text is expected to use "."
 * as thousands separator and "," as decimal separator.
 *
 * <p>{@link IOException} and {@link NumberFormatException} are not propagated:
 * they are reported through {@code EService.logAndRecover}, {@code setResult}
 * and {@code setErrorMessage}.
 */
public void processPDF() {
    // NOTE(review): 'writer' is only referenced by debugging code that was commented out.
    BufferedWriter writer = null;
    consumi = new ArrayList<Consumo>();
    lines = new ArrayList<String>();
    try {
        // iText library: collect the text of every page, split on '\n', into 'lines'.
        PdfReader pdfReader = new PdfReader(inputStream);
        for (int page = 1; page <= pdfReader.getNumberOfPages(); page++) {
            SimpleTextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
            String currentText = PdfTextExtractor.getTextFromPage(pdfReader, page, strategy);
            String[] l = currentText.split("\n");
            for (int i = 0; i < l.length; i++)
                lines.add(l[i]);
        }
        // NOTE(review): not in a finally block, so the reader stays open if extraction throws.
        pdfReader.close();

        // Each header value is taken from the first matching line only; these flags record that.
        boolean startPointFound = false;
        boolean dateFound = false;
        boolean totaleFound = false;
        boolean contributiFound = false;
        boolean prodottiFound = false;
        boolean altriFound = false;
        boolean ivaFound = false;
        // Usage entry currently being filled; null until the first phone line is seen.
        Consumo consumo = null;
        for (String line : lines) {
            // Extract the invoice date: on the first line containing "Emessa", the last
            // token containing "/" is stored in 'data' with "/" replaced by "-".
            if (!dateFound) {
                if (line.contains("Emessa")) {
                    String cleanLine = line.replaceAll("\\s+", " ");
                    String[] splitted = cleanLine.split(" ");
                    for (String s : splitted) {
                        if (s.contains("/")) {
                            s = s.replace("/", "-");
                            data = s;
                        }
                    }
                    dateFound = true;
                }
            }
            // Extract the invoice total including VAT.
            if (!totaleFound) {
                if (line.contains("IMPORTO")) {
                    String cleanLine = line.replaceAll("\\s+", " ").replaceAll("_", "");
                    String importo = cleanLine.replace("IMPORTO: ", "").replace("Euro", "").trim();
                    totale = Double.parseDouble(importo.replace(".", "").replace(",", "."));
                    totaleFound = true;
                }
            }
            // Extract the fees and subscriptions amount.
            if (!contributiFound) {
                if (line.contains("CONTRIBUTI E ABBONAMENTI")) {
                    String cleanLine = line.replaceAll("\\s+", " ");
                    String importo = cleanLine.replace("CONTRIBUTI E ABBONAMENTI ", "");
                    contributi = Double.parseDouble(importo.replace(".", "").replace(",", "."));
                    contributiFound = true;
                }
            }
            // Extract the products (rentals) amount. Only for invoices from 2017 onwards:
            // the third "-"-separated component of 'data' is read as the year.
            if (data != null && Integer.parseInt(data.split("-")[2]) >= 2017 && !prodottiFound) {
                if (line.contains("PRODOTTI")) {
                    String cleanLine = line.replaceAll("\\s+", " ");
                    String importo = cleanLine.replace("PRODOTTI ", "");
                    prodotti = Double.parseDouble(importo.replace(".", "").replace(",", "."));
                    prodottiFound = true;
                }
            }
            // Extract the amount of other charges and credits.
            if (!altriFound) {
                if (line.contains("ALTRI ADDEBITI E ACCREDITI")) {
                    String cleanLine = line.replaceAll("\\s+", " ");
                    String importo = cleanLine.replace("ALTRI ADDEBITI E ACCREDITI ", "");
                    altri = Double.parseDouble(importo.replace(".", "").replace(",", "."));
                    altriFound = true;
                }
            }
            // Extract the VAT amount. Waiting for one of the previous sections avoids
            // matching earlier lines such as "partita iva".
            if ((contributiFound || altriFound) && !ivaFound)
            {
                if (line.contains("IVA")) {
                    String cleanLine = line.replaceAll("\\s+", " ");
                    String importo = cleanLine.replace("IVA ", "");
                    iva = Double.parseDouble(importo.replace(".", "").replace(",", "."));
                    ivaFound = true;
                }
            }
            // The first "RIEPILOGO PER UTENZA" marks the start of the usage table to analyse;
            // every line before it is skipped by the code below.
            if (!startPointFound && line.contains("RIEPILOGO PER UTENZA"))
                startPointFound = !startPointFound;
            if (!startPointFound)
                continue;
            // "SERVIZI OPZIONALI" marks the end of the table: store the entry in progress and stop.
            // NOTE(review): 'consumo' may still be null here, and if this marker never
            // appears the last entry is never added to 'consumi' — confirm intended.
            if (line.matches("SERVIZI OPZIONALI")) {
                consumi.add(consumo);
                return;
            }
            // NOTE(review): 'data' is not null-checked here; if no "Emessa" line preceded the
            // table and 'data' was not set elsewhere, this throws NullPointerException,
            // which is not caught below.
            if (Integer.parseInt(data.split("-")[2]) >= 2017) {
                // ------------------------------
                // INVOICES FROM 2017 ONWARDS
                // ------------------------------
                // presumably StringMatcher.matches returns the captured groups, empty when
                // the line does not match — verify against StringMatcher.
                ArrayList<String> splitted;
                splitted = StringMatcher.matches(line, "\\bLinea\\b\\s((\\d{10}))");
                if (!splitted.isEmpty()) {
                    // A new phone line starts: save the previous entry, if any.
                    if (consumo != null)
                    {
                        consumi.add(consumo);
                    }
                    // Create a new entry; the number is stored with "-" inserted after the third digit.
                    consumo = new Consumo();
                    StringBuilder str = new StringBuilder(splitted.get(0));
                    str.insert(3, "-");
                    consumo.Telefono = str.toString();
                }
                splitted = StringMatcher.matches(line, "((?:\\w+\\s|\\w+-\\w+\\s)+)(?:\\d{2}\\/\\d{2}\\/\\d{4}\\s)((?:\\D+\\s)+)(?:\\d{2}\\/\\d{2}-\\d{2}\\/\\d{2}\\s)((\\d+,\\d+))$");
                if (!splitted.isEmpty()) {
                    // Fees or subscriptions: accumulated onto the current entry.
                    if (consumo != null) {
                        // NOTE(review): 'lel' is never read.
                        String lel = splitted.get(1);
                        if (splitted.get(1).contains("Contributi"))
                            consumo.CRB += Double.parseDouble(splitted.get(2).replace(",", "."));
                        else if (splitted.get(1).contains("Abbonamenti"))
                            consumo.ABB += Double.parseDouble(splitted.get(2).replace(",", "."));
                    }
                }
                splitted = StringMatcher.matches(line, "\\bRicariche\\b(?:\\s\\w+)+(?:\\s\\d\\s)((\\d+,\\d+))$");
                if (!splitted.isEmpty())
                {
                    // Top-ups ("Ricariche"): accumulated into AAA.
                    if (consumo != null) {
                        consumo.AAA += Double.parseDouble(splitted.get(0).replace(",", "."));
                    }
                }
                splitted = StringMatcher.matches(line, "\\bTotale\\b\\s((\\d+,\\d+))$");
                if (!splitted.isEmpty()) {
                    // Total for the current entry.
                    if (consumo != null) {
                        consumo.Totale += Double.parseDouble(splitted.get(0).replace(",", "."));
                    }
                }
            } else {
                // ------------------------------
                // INVOICES BEFORE 2017
                // ------------------------------
                // Phone line plus amount:
                // (3 digits)(optional whitespaces)-(optional whitespaces)(7 digits)
                // (any number of whitespaces)(any number of words followed by whitespace)(1+ digits),(1+ digits)
                if (line.matches("(\\d{3}(\\s+)?-(\\s+)?\\d{7})((?:\\s+)(?:\\w+\\s+)+)(\\d+,\\d+)"))
                {
                    // Remove the spaces around the dash in the phone number.
                    line = line.replace(" - ", "-");
                    // Reaching this point means a usage row starts on this line.
                    // presumably splitConsumo1 returns [phone, description, amount] — verify against SplitLine.
                    ArrayList<String> splitted = SplitLine.splitConsumo1(line);
                    // An entry with the same number is already open, so this row continues it.
                    if (consumo != null && splitted.get(0).replaceAll("\\s+", "").equals(consumo.Telefono))
                    {
                        if (splitted.get(1).contains("Contributi"))
                            consumo.CRB = Double.parseDouble(splitted.get(2).replace(",", "."));
                        else if (splitted.get(1).contains("Altri"))
                            consumo.AAA = Double.parseDouble(splitted.get(2).replace(",", "."));
                        else if (splitted.get(1).contains("Abbonamenti"))
                            consumo.ABB = Double.parseDouble(splitted.get(2).replace(",", "."));
                    } else {
                        // No open entry has the number just read: save the previous one, if any.
                        if (consumo != null)
                        {
                            consumi.add(consumo);
                        }
                        // Create a new entry for this number.
                        consumo = new Consumo();
                        consumo.Telefono = splitted.get(0);
                        if (splitted.get(1).contains("Contributi"))
                            consumo.CRB = Double.parseDouble(splitted.get(2).replace(",", "."));
                        else if (splitted.get(1).contains("Altri"))
                            consumo.AAA = Double.parseDouble(splitted.get(2).replace(",", "."));
                        else if (splitted.get(1).contains("Abbonamenti"))
                            consumo.ABB = Double.parseDouble(splitted.get(2).replace(",", "."));
                    }
                }
                // Row without a phone number:
                // (any number of words followed by whitespaces)(1+ digits),(1+ digits)
                if (line.matches("((?:\\w+\\s+)+)(\\d+,\\d+)"))
                {
                    // Continues the entry that is currently open.
                    // presumably splitConsumo2 returns [description, amount] — verify against SplitLine.
                    ArrayList<String> splitted = SplitLine.splitConsumo2(line);
                    if (consumo != null) {
                        if (splitted.get(0).contains("Contributi"))
                            consumo.CRB = Double.parseDouble(splitted.get(1).replace(",", "."));
                        else if (splitted.get(0).contains("Altri"))
                            consumo.AAA = Double.parseDouble(splitted.get(1).replace(",", "."));
                        else if (splitted.get(0).contains("Abbonamenti"))
                            consumo.ABB = Double.parseDouble(splitted.get(1).replace(",", "."));
                        else if (splitted.get(0).contains("Totale"))
                            consumo.Totale = Double.parseDouble(splitted.get(1).replace(",", "."));
                    }
                }
            }
        }
    } catch (IOException ex) {
        EService.logAndRecover(ex);
        setResult(EService.UNRECOVERABLE_ERROR);
        setErrorMessage("FattureManagement.ProcessPDF(): " + ex.getMessage());
    } catch (NumberFormatException ex) {
        // NOTE(review): casting a NumberFormatException to FatalError looks like it can
        // fail with ClassCastException at runtime — confirm what FatalError is.
        EService.logAndRecover((FatalError) ex);
        setResult(EService.UNRECOVERABLE_ERROR);
        setErrorMessage("FattureManagement.ProcessPDF(): " + ex.getMessage());
    }
}
From source file:br.com.smarttaco.util.HelenaBarbosa.java
/** * pdf2txt/*ww w . j a v a 2 s. c o m*/ * * @param pdf * @param paginas se for <code>null</code> realiza leitura completa. * @param txt * @throws FileNotFoundException * @throws IOException */ private static void pdf2txt(final String pdf, List<Integer> paginas, final String txt) throws FileNotFoundException, IOException { PdfReader reader = new PdfReader(pdf); //System.out.println(reader.getInfo().toString()); if (paginas != null) { reader.selectPages(paginas); } PdfReaderContentParser parser = new PdfReaderContentParser(reader); PrintWriter out = new PrintWriter(txt, "UTF-8"); TextExtractionStrategy strategy; for (int i = 1; i <= reader.getNumberOfPages(); i++) { strategy = parser.processContent(i, new SimpleTextExtractionStrategy()); out.println(strategy.getResultantText()); } out.flush(); out.close(); reader.close(); }
From source file:com.cloudhub.util.PDFToText.java
License:Apache License
/**
 * Parses a PDF to a plain text file, writing the text of each page followed
 * by a line separator. The output uses the platform default charset.
 *
 * @param source      the original PDF
 * @param destination the resulting text
 * @throws IOException if the PDF cannot be read or the destination cannot be opened
 */
public static void parsePdf(String source, String destination) throws IOException {
    PdfReader reader = new PdfReader(source);
    try {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        PrintWriter out = new PrintWriter(new FileOutputStream(destination));
        try {
            for (int i = 1; i <= reader.getNumberOfPages(); i++) {
                TextExtractionStrategy strategy =
                        parser.processContent(i, new SimpleTextExtractionStrategy());
                out.println(strategy.getResultantText());
            }
            out.flush();
        } finally {
            // Close the writer even when a page fails to parse.
            out.close();
        }
    } finally {
        // The reader was previously never closed.
        reader.close();
    }
}
From source file:com.joanzapata.PDFViewActivity.java
License:Open Source License
public void parsePdf(String pdf2, String txt) throws IOException { // String//from w w w .j av a2 s . c o m // pdf1=Environment.getExternalStoragePublicDirectory(Environment.DIRECTORY_DOWNLOADS)+ // File.separator + "about.pdf"; PdfReader reader = new PdfReader(pdf2); PdfReaderContentParser parser = new PdfReaderContentParser(reader); File file = getFileStreamPath("test.txt"); if (!file.exists()) { file.createNewFile(); } FileOutputStream writer = openFileOutput(file.getName(), Context.MODE_PRIVATE); // PrintWriter out = new PrintWriter(new FileOutputStream(txt)); TextExtractionStrategy strategy; for (int i = 1; i <= pageNumber; i++) { strategy = parser.processContent(i, new SimpleTextExtractionStrategy()); writer.write(strategy.getResultantText().getBytes()); writer.flush(); } writer.close(); reader.close(); String totalString = readFromFile(); System.out.println(totalString); totalContent = totalString; String word = "Number of words " + wordCount(totalString) + "\n"; String averageWords = "Average length of words " + " " + averageWords(totalString) + "\n"; String sentenseCount = "Number of sentences " + sentenseCount(totalString) + "\n"; String averageSentenses = "Average length of sentences " + averageSentense(totalString) + "\n"; // String complexity= String readability = " Readability Index " + getReadability(totalString) + "\n"; String subject = "Subject Area " + etSubject.getText().toString() + "\n"; String article = "Type of the Article : " + etArticle.getText().toString() + "\n"; String pages = " Pages " + startPage + " to " + pageNumber + "\n"; String book = "book name " + etBookName.getText().toString() + "\n"; String readername = "reader name " + etReaderName.getText().toString() + "\n"; totalInfo = readername + book + subject + "\n" + article + pages + word + averageWords + sentenseCount + averageSentenses + readability; displayMessage(totalInfo); }
From source file:com.sustainalytics.crawlerfilter.PDFTitleGeneration.java
License:Apache License
public static String extractITextText(String pdf) { PdfReader reader = null;//from w w w. j a v a 2 s . c o m try { reader = new PdfReader(pdf); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } PdfReaderContentParser parser = new PdfReaderContentParser(reader); TextExtractionStrategy strategy; String text = ""; for (int i = 1; i <= reader.getNumberOfPages(); i++) { try { strategy = parser.processContent(i, new SimpleTextExtractionStrategy()); text += strategy.getResultantText(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } reader.close(); return text; }
From source file:com.sustainalytics.crawlerfilter.PDFtoText.java
License:Apache License
/**
 * Extracts the text of every page of a PDF with iText and returns it as one
 * string, pages concatenated without a separator.
 *
 * <p>Errors are logged and not propagated: if the PDF cannot be opened an
 * empty string is returned; a page that fails to parse is skipped.
 *
 * @param pdf path of the PDF to read
 * @return the extracted text, or {@code ""} if the PDF could not be opened
 */
public static String extractITextText(String pdf) {
    PdfReader reader;
    try {
        reader = new PdfReader(pdf);
    } catch (IOException e) {
        logger.info("Error in reading file with iText parser\n");
        // Previously execution continued with a null reader and failed with a
        // NullPointerException on the first reader access.
        return "";
    }
    try {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        StringBuilder text = new StringBuilder();
        for (int i = 1; i <= reader.getNumberOfPages(); i++) {
            try {
                TextExtractionStrategy strategy =
                        parser.processContent(i, new SimpleTextExtractionStrategy());
                text.append(strategy.getResultantText());
            } catch (IOException e) {
                // Best effort: keep the text of the pages that can be parsed.
                logger.info("Error in parsing with iText parser\n");
            }
            // NOTE(review): logged once per page, even when that page failed — kept as in the original.
            logger.info("PDF text extracted from " + pdf + "\n");
        }
        return text.toString();
    } finally {
        reader.close();
    }
}
From source file:conversorpdf.Conversor.Conversor.java
/** * Parses a PDF to a plain text file./*from w w w. j a va2 s. co m*/ * @param pdf the original PDF * @param txt the resulting text * @throws IOException */ public boolean parsePdf(String pdf, String txt, boolean removerAcento) throws IOException { PdfReader reader = new PdfReader(pdf); PdfReaderContentParser parser = new PdfReaderContentParser(reader); PrintWriter out = new PrintWriter(new FileOutputStream(txt)); TextExtractionStrategy strategy; for (int i = 1; i <= reader.getNumberOfPages(); i++) { strategy = parser.processContent(i, new SimpleTextExtractionStrategy()); if (removerAcento == false) { out.println(strategy.getResultantText()); } else { out.println(this.removeAcentos(strategy.getResultantText())); } } out.flush(); out.close(); return true; }
From source file:coviam.pdf.PdfParser.java
public void getText() { String pdf = "/home/amit/NetBeansProjects/ResumeParser/data/resumes/ejd1.pdf"; String text = "/home/amit/NetBeansProjects/ResumeParser/data/resumes/edj1.txt"; StringBuffer textBuffer = new StringBuffer(); String resultText = ""; PdfReader reader;//from ww w. j a v a 2 s . c o m try { reader = new PdfReader(pdf); PdfReaderContentParser contentParser = new PdfReaderContentParser(reader); PrintWriter printWriter = new PrintWriter(new FileOutputStream(text)); TextExtractionStrategy strategy; for (int i = 1; i <= reader.getNumberOfPages(); i++) { strategy = contentParser.processContent(i, new SimpleTextExtractionStrategy()); textBuffer.append(strategy.getResultantText()); } resultText = textBuffer.toString(); resultText = resultText.replaceAll("-\n", ""); System.out.println("-->" + resultText); StringTokenizer stringTokenizer = new StringTokenizer(resultText, "\n"); PrintWriter lineWriter = new PrintWriter( new FileOutputStream("/home/amit/NetBeansProjects/ResumeParser/data/resumes/edj1.txt")); while (stringTokenizer.hasMoreTokens()) { String curToken = stringTokenizer.nextToken(); lineWriter.println("line-->" + curToken); } lineWriter.flush(); lineWriter.close(); System.out.flush(); System.out.close(); } catch (IOException ioe) { } }
From source file:de.offis.health.icardea.cied.pdf.extractor.PDFiText5Extractor.java
License:GNU General Public License
public String getText(int pageNumber) throws IOException, Exception { String returnValue = null;/*from w ww .j a v a 2 s. c om*/ if (pdfReader != null) { int numberOfPages = getNumberOfPages(); if (pageNumber > 0 && pageNumber <= numberOfPages) { TextExtractionStrategy strategy; PdfReaderContentParser parser = new PdfReaderContentParser(pdfReader); strategy = parser.processContent(pageNumber, new SimpleTextExtractionStrategy()); if (strategy != null && strategy.getResultantText().trim().length() > 0) { returnValue = PAGE_START_MARKER + strategy.getResultantText(); } // end if } else { // TODO: Add own exception. throw new Exception("The given page number (" + pageNumber + ") " + "is not in the range of valid pages (1.." + numberOfPages + ")."); } // end if..else } else { // TODO: Add own exception. throw new Exception("There is no open PDF to work with."); } // end if..else return returnValue; }
From source file:helper.PdfText.java
License:Apache License
/**
 * Extracts the plain text of a PDF with iText, pages concatenated without a
 * separator.
 *
 * @param pdfFile this file will be extracted.
 * @return the plain text of the pdf
 * @throws HttpArchiveException with code 500 if the PDF cannot be read or parsed
 */
public String itext(File pdfFile) {
    PdfReader reader = null;
    try {
        reader = new PdfReader(pdfFile.getAbsolutePath());
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        StringBuilder buf = new StringBuilder();
        for (int i = 1; i <= reader.getNumberOfPages(); i++) {
            TextExtractionStrategy strategy =
                    parser.processContent(i, new SimpleTextExtractionStrategy());
            buf.append(strategy.getResultantText());
        }
        return buf.toString();
    } catch (IOException e) {
        throw new HttpArchiveException(500, e);
    } finally {
        // The reader was previously never closed.
        if (reader != null) {
            reader.close();
        }
    }
}