Usage examples for the com.itextpdf.text.pdf.parser.LocationTextExtractionStrategy constructor LocationTextExtractionStrategy()
public LocationTextExtractionStrategy()
From source file:com.erikHolz.vertretungsplan.Converter.java
License:Open Source License
public void parsePDF() throws IOException { PdfReader reader = new PdfReader(fileDest + ".pdf"); PdfReaderContentParser parser = new PdfReaderContentParser(reader); PrintWriter out = new PrintWriter(new FileOutputStream(fileDest + "__.txt")); TextExtractionStrategy strategy;/*from www. ja v a2 s.com*/ for (int intI = 1; intI <= reader.getNumberOfPages(); intI++) { strategy = parser.processContent(intI, new LocationTextExtractionStrategy()); out.println(strategy.getResultantText()); } out.flush(); out.close(); reader.close(); // lschen der ursprnglichen pdf File f = new File(fileDest + ".pdf"); if (f.exists()) f.delete(); }
From source file:javaapplication1.JavaApplication1.java
/** * Parses a specific area of a PDF to a plain text file. * @param pdf the original PDF/*ww w . ja v a2 s .com*/ * @param txt the resulting text * @throws IOException */ public String mycheckline(PdfReader reader, int pheight, int pwidth, int lh, int page) throws IOException { // PrintWriter out = new PrintWriter(new FileOutputStream(txt)); Rectangle rect = new Rectangle(0, pheight - lh, pwidth / 3, pheight - lh - fontchecksize); RenderFilter filter = new RegionTextRenderFilter(rect); TextExtractionStrategy strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filter); return (PdfTextExtractor.getTextFromPage(reader, page, strategy)); }
From source file:org.sinarproject.ecparser.ECRedelineation.java
private static void processECStructure(String mymeta) { // TODO: Restructure all the stuff from main into here .. int n = my_reader.getNumberOfPages(); int i;//from w ww .j ava 2 s. c om // Maps init .. Key is <PAR_CODE>/<DUN_CODE>/<DM_CODE> // <Key> -> Name:Population // Errors -> Map<ErrKey, Original String>; ErrKey is Nxx final_mapped_data = new TreeMap<>(); error_while_parsing = new TreeMap<>(); // Loop through each page .. for (i = 1; i < n; i++) { try { String content; content = PdfTextExtractor.getTextFromPage(my_reader, i, new LocationTextExtractionStrategy()); // content = PdfTextExtractor.getTextFromPage(reader, i); describePage(content, i); } catch (IOException ex) { Logger.getLogger(ECRedelineation.class.getName()).log(Level.SEVERE, null, ex); } } // Dump out error hash .. if any if (error_while_parsing.size() > 0) { out.println("=============================="); out.println(" PARSING ERRORS --> " + error_while_parsing.size() + " errors!!!"); out.println("DUN: " + DUNerrors + " DM: " + DMerrors + " Fixed:" + fixedDMs); out.println("=============================="); // TODO: Shift to file output /* for (Map.Entry<String, String> single_report_entry : error_while_parsing.entrySet()) { out.println("CODE: " + single_report_entry.getKey()); out.println("UNMATCHED: " + single_report_entry.getValue()); } */ } else { out.println("========================"); out.println(" ALL OK!!! "); out.println("========================"); } out.println("========================"); out.println(" @@@@ Data!!! @@@@ "); out.println("========================"); out.println("Final DM count: " + countedDM); // Detect if there any population not being able to be detected; so action can be taken for (Map.Entry<String, String> single_data_entry : final_mapped_data.entrySet()) { // Output those that were not able to be auto-corrected .. 
if (single_data_entry.getValue().endsWith(":0")) { out.print("KEY:" + single_data_entry.getKey()); out.println(" ==> " + single_data_entry.getValue()); } ; } if (null != mymeta) // write down Output // No JSON output in this cut :( Leave it for interaction with golang // and shapefile mapping and manipulatons .. { switch (mymeta) { case "json": Utils.writeJSONMappedData(); break; case "csv": // go direct to CSV .. Utils.writeCSVFinalData(); break; } } out.println("xxxxxxxXXXXXXXXXXxxxxxxxxx"); }
From source file:org.sinarproject.ECRedelineation.java
/**
 * Entry point: opens the PDF at {@code SOURCE}, passes the extracted text of each page
 * to {@code describePage}, prints a summary report to {@code out} and writes the mapped
 * data via {@code Utils.writeJSONMappedData()}.
 *
 * <p>A page whose extraction throws an {@code IOException} is logged and skipped. An
 * {@code IOException} raised outside the page loop is logged and ends processing. The
 * reader is closed in all cases.
 *
 * @param args the command line arguments (not used)
 */
public static void main(String[] args) {
    out.println("Sinar Project's EC Parser ..");
    PdfReader reader = null;
    try {
        reader = new PdfReader(SOURCE);
        int n = reader.getNumberOfPages();

        // Maps init .. Key is <PAR_CODE>/<DUN_CODE>/<DM_CODE>
        // <Key> -> Name:Population
        // Errors -> Map<ErrKey, Original String>; ErrKey is Nxx
        final_mapped_data = new TreeMap<>();
        error_while_parsing = new TreeMap<>();

        // Loop through each page. Page numbers are 1-based, so the last page is n itself;
        // the previous bound (i < n) never processed the final page.
        for (int i = 1; i <= n; i++) {
            try {
                String content = PdfTextExtractor.getTextFromPage(reader, i,
                        new LocationTextExtractionStrategy());
                describePage(content, i);
            } catch (IOException ex) {
                Logger.getLogger(ECRedelineation.class.getName())
                        .log(Level.SEVERE, "Failed to extract text from page " + i, ex);
            }
        }

        // Dump out error hash .. if any
        if (!error_while_parsing.isEmpty()) {
            out.println("==============================");
            out.println(" PARSING ERRORS --> " + error_while_parsing.size() + " errors!!!");
            out.println("DUN: " + DUNerrors + " DM: " + DMerrors + " Fixed:" + fixedDMs);
            out.println("==============================");
            // TODO: Shift to file output
            /*
            for (Map.Entry<String, String> single_report_entry : error_while_parsing.entrySet()) {
                out.println("CODE: " + single_report_entry.getKey());
                out.println("UNMATCHED: " + single_report_entry.getValue());
            }
            */
        } else {
            out.println("========================");
            out.println(" ALL OK!!! ");
            out.println("========================");
        }

        out.println("========================");
        out.println(" @@@@ Data!!! @@@@ ");
        out.println("========================");
        out.println("Final DM count: " + countedDM);

        // Detect if there any population not being able to be detected; so action can be taken
        for (Map.Entry<String, String> single_data_entry : final_mapped_data.entrySet()) {
            // Output those that were not able to be auto-corrected ..
            if (single_data_entry.getValue().endsWith(":0")) {
                out.print("KEY:" + single_data_entry.getKey());
                out.println(" ==> " + single_data_entry.getValue());
            }
        }

        // write down Output
        // No JSON output in this cut :( Leave it for interaction with golang
        // and shapefile mapping and manipulations ..
        Utils.writeJSONMappedData();
        // go direct to CSV ..
        // Utils.writeCSVFinalData();

        out.println("xxxxxxxXXXXXXXXXXxxxxxxxxx");
    } catch (IOException ex) {
        Logger.getLogger(ECRedelineation.class.getName()).log(Level.SEVERE, null, ex);
    } finally {
        // The reader was previously never closed; release it on every exit path.
        if (reader != null) {
            reader.close();
        }
    }
}
From source file:Project.data.preparation.ExtractPageContentArea.java
public void parsePdf(String pdf, int pageNum, int upper_x, int upper_y, int lower_x, int lower_y) throws IOException { PdfReader reader = new PdfReader(pdf); // System.out.println("(" + upper_x + " , " + upper_y + ") to ( " + lower_x + " , " + lower_y + ")"); rect = new Rectangle(upper_x, upper_y, lower_x, lower_y); RenderFilter filter = new RegionTextRenderFilter(getRect()); TextExtractionStrategy strategy;/*from ww w.ja va 2s . co m*/ strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filter); TextCropped = PdfTextExtractor.getTextFromPage(reader, pageNum, strategy); setTextCropped(TextCropped); reader.close(); }