List of usage examples for com.itextpdf.text.pdf.parser.PdfTextExtractor#getTextFromPage
public static String getTextFromPage(PdfReader reader, int pageNumber, TextExtractionStrategy strategy) throws IOException
From source file:bflows.FattureManagement.java
public void processPDF() { // Document pdf = null; BufferedWriter writer = null; consumi = new ArrayList<Consumo>(); lines = new ArrayList<String>(); try {/*from w w w . j a v a2s. c o m*/ // Salvo file temporaneo per debugging //outputFile = new File("C:\\Users\\nklma\\Documents\\NetBeansProjects\\temp", "temp.txt"); //writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFile))); //FileOutputStream fileOutputStream = new FileOutputStream("extracted.txt"); // iText Library PdfReader pdfReader = new PdfReader(inputStream); for (int page = 1; page <= pdfReader.getNumberOfPages(); page++) { SimpleTextExtractionStrategy strategy = new SimpleTextExtractionStrategy(); String currentText = PdfTextExtractor.getTextFromPage(pdfReader, page, strategy); String[] l = currentText.split("\n"); for (int i = 0; i < l.length; i++) lines.add(l[i]); } pdfReader.close(); //for(String line : lines) //{ // writer.write(line); // writer.newLine(); //} //writer.close(); boolean startPointFound = false; boolean dateFound = false; boolean totaleFound = false; boolean contributiFound = false; boolean prodottiFound = false; boolean altriFound = false; boolean ivaFound = false; Consumo consumo = null; for (String line : lines) { //recupero data fattura if (!dateFound) { if (line.contains("Emessa")) { String cleanLine = line.replaceAll("\\s+", " "); String[] splitted = cleanLine.split(" "); for (String s : splitted) { if (s.contains("/")) { s = s.replace("/", "-"); data = s; } } dateFound = true; } } //recupero totale fattura con iva if (!totaleFound) { if (line.contains("IMPORTO")) { String cleanLine = line.replaceAll("\\s+", " ").replaceAll("_", ""); String importo = cleanLine.replace("IMPORTO: ", "").replace("Euro", "").trim(); totale = Double.parseDouble(importo.replace(".", "").replace(",", ".")); totaleFound = true; } } //recupero importo contributi e abbonamenti if (!contributiFound) { if (line.contains("CONTRIBUTI E ABBONAMENTI")) { String cleanLine = 
line.replaceAll("\\s+", " "); String importo = cleanLine.replace("CONTRIBUTI E ABBONAMENTI ", ""); contributi = Double.parseDouble(importo.replace(".", "").replace(",", ".")); contributiFound = true; } } //recupero importo prodotti (noleggi) // SOLO PER 2017+ if (data != null && Integer.parseInt(data.split("-")[2]) >= 2017 && !prodottiFound) { if (line.contains("PRODOTTI")) { String cleanLine = line.replaceAll("\\s+", " "); String importo = cleanLine.replace("PRODOTTI ", ""); prodotti = Double.parseDouble(importo.replace(".", "").replace(",", ".")); prodottiFound = true; } } //recupero importo altri addebiti e accrediti if (!altriFound) { if (line.contains("ALTRI ADDEBITI E ACCREDITI")) { String cleanLine = line.replaceAll("\\s+", " "); String importo = cleanLine.replace("ALTRI ADDEBITI E ACCREDITI ", ""); altri = Double.parseDouble(importo.replace(".", "").replace(",", ".")); altriFound = true; } } //recupero importo IVA if ((contributiFound || altriFound) && !ivaFound) // in questo modo si evitano match con "partita iva" ecc { if (line.contains("IVA")) { String cleanLine = line.replaceAll("\\s+", " "); String importo = cleanLine.replace("IVA ", ""); iva = Double.parseDouble(importo.replace(".", "").replace(",", ".")); ivaFound = true; } } //Il primo RIEPILOGO PER UTENZA segna l'inizio della tabella dei consumi da analizzare if (!startPointFound && line.contains("RIEPILOGO PER UTENZA")) startPointFound = !startPointFound; if (!startPointFound) continue; //SERVIZI OPZIONALI segna la fine della tabella if (line.matches("SERVIZI OPZIONALI")) { consumi.add(consumo); return; } if (Integer.parseInt(data.split("-")[2]) >= 2017) { // ------------------------------ // PER FATTURE SUCCESSIVE AL 2017 // ------------------------------ ArrayList<String> splitted; splitted = StringMatcher.matches(line, "\\bLinea\\b\\s((\\d{10}))"); //if(line.matches("(?:(?:Linea)\\s)(\\d{10})")) if (!splitted.isEmpty()) { // Nuovo consumo if (consumo != null) //salvo la precedente { 
consumi.add(consumo); } //creo un nuovo consumo consumo = new Consumo(); StringBuilder str = new StringBuilder(splitted.get(0)); str.insert(3, "-"); consumo.Telefono = str.toString(); } splitted = StringMatcher.matches(line, "((?:\\w+\\s|\\w+-\\w+\\s)+)(?:\\d{2}\\/\\d{2}\\/\\d{4}\\s)((?:\\D+\\s)+)(?:\\d{2}\\/\\d{2}-\\d{2}\\/\\d{2}\\s)((\\d+,\\d+))$"); //if(line.matches("(?:(?:(?:\w+\s)+\w+\-)?(?:\w+\s)+)(?:\d{2}\/\d{2}\/\d{4})\s((?:\w+\s)+)(?:\d{2}\/\d{2}\-\d{2}\/\d{2})\s(\d+,\d+)")) if (!splitted.isEmpty()) { // Contributi o abbonamenti if (consumo != null) { String lel = splitted.get(1); if (splitted.get(1).contains("Contributi")) consumo.CRB += Double.parseDouble(splitted.get(2).replace(",", ".")); else if (splitted.get(1).contains("Abbonamenti")) consumo.ABB += Double.parseDouble(splitted.get(2).replace(",", ".")); } } splitted = StringMatcher.matches(line, "\\bRicariche\\b(?:\\s\\w+)+(?:\\s\\d\\s)((\\d+,\\d+))$"); if (!splitted.isEmpty()) //if(line.matches("(Ricariche(?:\\s\\w+)+)(\\s\\d\\s)(\\d+,\\d+)")) { // Ricariche //splitted = SplitLine.splitNewRicarica(line); if (consumo != null) { consumo.AAA += Double.parseDouble(splitted.get(0).replace(",", ".")); } } splitted = StringMatcher.matches(line, "\\bTotale\\b\\s((\\d+,\\d+))$"); //if(line.matches("(Totale\\s+)(\\d+,\\d+)")) if (!splitted.isEmpty()) { // Totale //splitted = SplitLine.splitNewTotale(line); if (consumo != null) { consumo.Totale += Double.parseDouble(splitted.get(0).replace(",", ".")); } } } else { // ------------------------------ // PER FATTURE PRECEDENTI AL 2017 // ------------------------------ // Linea e consumo if (line.matches("(\\d{3}(\\s+)?-(\\s+)?\\d{7})((?:\\s+)(?:\\w+\\s+)+)(\\d+,\\d+)")) // (3 digits)(optional whitespaces)-(optional whitespaces)(7 digits) // (any number of whitespaces)(any number of words followed by whitespace)(1+ digits),(1+digits) { //elimino gli spazi nel numero di telefono line = line.replace(" - ", "-"); //se entro qui significa che inizia un consumo 
ArrayList<String> splitted = SplitLine.splitConsumo1(line); //esiste un consumo con lo stesso numero quindi i dati vanno aggiunti if (consumo != null && splitted.get(0).replaceAll("\\s+", "").equals(consumo.Telefono)) // il continuo del precedente { if (splitted.get(1).contains("Contributi")) consumo.CRB = Double.parseDouble(splitted.get(2).replace(",", ".")); else if (splitted.get(1).contains("Altri")) consumo.AAA = Double.parseDouble(splitted.get(2).replace(",", ".")); else if (splitted.get(1).contains("Abbonamenti")) consumo.ABB = Double.parseDouble(splitted.get(2).replace(",", ".")); } else { //non esiste un consumo con il numero letto if (consumo != null) //salvo la precedente { consumi.add(consumo); } //creo un nuovo consumo consumo = new Consumo(); consumo.Telefono = splitted.get(0); if (splitted.get(1).contains("Contributi")) consumo.CRB = Double.parseDouble(splitted.get(2).replace(",", ".")); else if (splitted.get(1).contains("Altri")) consumo.AAA = Double.parseDouble(splitted.get(2).replace(",", ".")); else if (splitted.get(1).contains("Abbonamenti")) consumo.ABB = Double.parseDouble(splitted.get(2).replace(",", ".")); } } if (line.matches("((?:\\w+\\s+)+)(\\d+,\\d+)"))//(any number of words followed by whitespaces)(1+ digits),(1+ digits) { //continua la fattura precedente ArrayList<String> splitted = SplitLine.splitConsumo2(line); if (consumo != null) { if (splitted.get(0).contains("Contributi")) consumo.CRB = Double.parseDouble(splitted.get(1).replace(",", ".")); else if (splitted.get(0).contains("Altri")) consumo.AAA = Double.parseDouble(splitted.get(1).replace(",", ".")); else if (splitted.get(0).contains("Abbonamenti")) consumo.ABB = Double.parseDouble(splitted.get(1).replace(",", ".")); else if (splitted.get(0).contains("Totale")) consumo.Totale = Double.parseDouble(splitted.get(1).replace(",", ".")); } } } } //outputFile.delete(); } catch (IOException ex) { EService.logAndRecover(ex); setResult(EService.UNRECOVERABLE_ERROR); 
setErrorMessage("FattureManagement.ProcessPDF(): " + ex.getMessage()); } catch (NumberFormatException ex) { EService.logAndRecover((FatalError) ex); setResult(EService.UNRECOVERABLE_ERROR); setErrorMessage("FattureManagement.ProcessPDF(): " + ex.getMessage()); } }
From source file:javaapplication1.JavaApplication1.java
/** * Parses a specific area of a PDF to a plain text file. * @param pdf the original PDF/*from w ww .j a va 2 s. c om*/ * @param txt the resulting text * @throws IOException */ public String mycheckline(PdfReader reader, int pheight, int pwidth, int lh, int page) throws IOException { // PrintWriter out = new PrintWriter(new FileOutputStream(txt)); Rectangle rect = new Rectangle(0, pheight - lh, pwidth / 3, pheight - lh - fontchecksize); RenderFilter filter = new RegionTextRenderFilter(rect); TextExtractionStrategy strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filter); return (PdfTextExtractor.getTextFromPage(reader, page, strategy)); }
From source file:me.Aron.Heinecke.fbot.lib.Converter.java
License:Apache License
@Deprecated public String ReadPdfFile(File file) throws IOException { StringBuilder text = new StringBuilder(); if (file.exists()) { PdfReader pdfReader = new PdfReader(file.getAbsolutePath()); for (int pageid = 1; pageid <= pdfReader.getNumberOfPages(); pageid++) { SimpleTextExtractionStrategy strategy = new SimpleTextExtractionStrategy(); String currentText = PdfTextExtractor.getTextFromPage(pdfReader, pageid, strategy); //currentText = Encoding.UTF8.GetString(ASCIIEncoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(currentText))); text.append(currentText);/*w w w .j ava 2 s .c o m*/ } pdfReader.close(); } return text.toString(); }
From source file:org.sinarproject.ecparser.ECRedelineation.java
private static void processECStructure(String mymeta) { // TODO: Restructure all the stuff from main into here .. int n = my_reader.getNumberOfPages(); int i;//www. j a va 2 s . c om // Maps init .. Key is <PAR_CODE>/<DUN_CODE>/<DM_CODE> // <Key> -> Name:Population // Errors -> Map<ErrKey, Original String>; ErrKey is Nxx final_mapped_data = new TreeMap<>(); error_while_parsing = new TreeMap<>(); // Loop through each page .. for (i = 1; i < n; i++) { try { String content; content = PdfTextExtractor.getTextFromPage(my_reader, i, new LocationTextExtractionStrategy()); // content = PdfTextExtractor.getTextFromPage(reader, i); describePage(content, i); } catch (IOException ex) { Logger.getLogger(ECRedelineation.class.getName()).log(Level.SEVERE, null, ex); } } // Dump out error hash .. if any if (error_while_parsing.size() > 0) { out.println("=============================="); out.println(" PARSING ERRORS --> " + error_while_parsing.size() + " errors!!!"); out.println("DUN: " + DUNerrors + " DM: " + DMerrors + " Fixed:" + fixedDMs); out.println("=============================="); // TODO: Shift to file output /* for (Map.Entry<String, String> single_report_entry : error_while_parsing.entrySet()) { out.println("CODE: " + single_report_entry.getKey()); out.println("UNMATCHED: " + single_report_entry.getValue()); } */ } else { out.println("========================"); out.println(" ALL OK!!! "); out.println("========================"); } out.println("========================"); out.println(" @@@@ Data!!! @@@@ "); out.println("========================"); out.println("Final DM count: " + countedDM); // Detect if there any population not being able to be detected; so action can be taken for (Map.Entry<String, String> single_data_entry : final_mapped_data.entrySet()) { // Output those that were not able to be auto-corrected .. 
if (single_data_entry.getValue().endsWith(":0")) { out.print("KEY:" + single_data_entry.getKey()); out.println(" ==> " + single_data_entry.getValue()); } ; } if (null != mymeta) // write down Output // No JSON output in this cut :( Leave it for interaction with golang // and shapefile mapping and manipulatons .. { switch (mymeta) { case "json": Utils.writeJSONMappedData(); break; case "csv": // go direct to CSV .. Utils.writeCSVFinalData(); break; } } out.println("xxxxxxxXXXXXXXXXXxxxxxxxxx"); }
From source file:org.sinarproject.ECRedelineation.java
/** * @param args the command line arguments *///from w w w.ja v a 2 s . c om public static void main(String[] args) { // TODO code application logic here out.println("Sinar Project's EC Parser .."); PdfReader reader = null; try { reader = new PdfReader(SOURCE); int n; n = reader.getNumberOfPages(); int i; // Maps init .. Key is <PAR_CODE>/<DUN_CODE>/<DM_CODE> // <Key> -> Name:Population // Errors -> Map<ErrKey, Original String>; ErrKey is Nxx final_mapped_data = new TreeMap<>(); error_while_parsing = new TreeMap<>(); // Loop through each page .. for (i = 1; i < n; i++) { try { String content; content = PdfTextExtractor.getTextFromPage(reader, i, new LocationTextExtractionStrategy()); // content = PdfTextExtractor.getTextFromPage(reader, i); describePage(content, i); } catch (IOException ex) { Logger.getLogger(ECRedelineation.class.getName()).log(Level.SEVERE, null, ex); } } // Dump out error hash .. if any if (error_while_parsing.size() > 0) { out.println("=============================="); out.println(" PARSING ERRORS --> " + error_while_parsing.size() + " errors!!!"); out.println("DUN: " + DUNerrors + " DM: " + DMerrors + " Fixed:" + fixedDMs); out.println("=============================="); // TODO: Shift to file output /* for (Map.Entry<String, String> single_report_entry : error_while_parsing.entrySet()) { out.println("CODE: " + single_report_entry.getKey()); out.println("UNMATCHED: " + single_report_entry.getValue()); } */ } else { out.println("========================"); out.println(" ALL OK!!! "); out.println("========================"); } out.println("========================"); out.println(" @@@@ Data!!! @@@@ "); out.println("========================"); out.println("Final DM count: " + countedDM); // Detect if there any population not being able to be detected; so action can be taken for (Map.Entry<String, String> single_data_entry : final_mapped_data.entrySet()) { // Output those that were not able to be auto-corrected .. 
if (single_data_entry.getValue().endsWith(":0")) { out.print("KEY:" + single_data_entry.getKey()); out.println(" ==> " + single_data_entry.getValue()); } ; } // write down Output // No JSON output in this cut :( Leave it for interaction with golang // and shapefile mapping and manipulatons .. Utils.writeJSONMappedData(); // go direct to CSV .. // Utils.writeCSVFinalData(); out.println("xxxxxxxXXXXXXXXXXxxxxxxxxx"); } catch (IOException ex) { Logger.getLogger(ECRedelineation.class.getName()).log(Level.SEVERE, null, ex); } }
From source file:Project.data.preparation.ExtractPageContentArea.java
public void parsePdf(String pdf, int pageNum, int upper_x, int upper_y, int lower_x, int lower_y) throws IOException { PdfReader reader = new PdfReader(pdf); // System.out.println("(" + upper_x + " , " + upper_y + ") to ( " + lower_x + " , " + lower_y + ")"); rect = new Rectangle(upper_x, upper_y, lower_x, lower_y); RenderFilter filter = new RegionTextRenderFilter(getRect()); TextExtractionStrategy strategy;//from w ww . j a v a 2 s .c om strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filter); TextCropped = PdfTextExtractor.getTextFromPage(reader, pageNum, strategy); setTextCropped(TextCropped); reader.close(); }