Example usage for the com.itextpdf.text.pdf.parser.LocationTextExtractionStrategy constructor LocationTextExtractionStrategy()

List of usage examples for the com.itextpdf.text.pdf.parser.LocationTextExtractionStrategy constructor LocationTextExtractionStrategy()

Introduction

On this page you can find example usages of the com.itextpdf.text.pdf.parser.LocationTextExtractionStrategy constructor LocationTextExtractionStrategy().

Prototype

public LocationTextExtractionStrategy() 

Source Link

Document

Creates a new text extraction renderer.

Usage

From source file:com.erikHolz.vertretungsplan.Converter.java

License:Open Source License

/**
 * Extracts the text of every page of {@code fileDest + ".pdf"} into
 * {@code fileDest + "__.txt"} and afterwards deletes the source PDF.
 *
 * <p>Each page is processed with a fresh {@link LocationTextExtractionStrategy};
 * the resulting text of each page is written followed by a line break.
 * The source PDF is only deleted when extraction and writing succeeded.
 *
 * @throws IOException if the PDF cannot be read or the text file cannot be written
 */
public void parsePDF() throws IOException {
    PdfReader reader = new PdfReader(fileDest + ".pdf");
    try {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);

        // try-with-resources: the output file is closed even when extraction fails midway.
        try (PrintWriter out = new PrintWriter(new FileOutputStream(fileDest + "__.txt"))) {
            int pageCount = reader.getNumberOfPages();
            // Page numbers are 1-based, hence the inclusive upper bound.
            for (int page = 1; page <= pageCount; page++) {
                TextExtractionStrategy strategy = parser.processContent(page,
                        new LocationTextExtractionStrategy());
                out.println(strategy.getResultantText());
            }

            // PrintWriter never throws on write failures; checkError() flushes and
            // reports them, so a failed write does not lead to deleting the source PDF.
            if (out.checkError()) {
                throw new IOException("Failed to write extracted text to " + fileDest + "__.txt");
            }
        }
    } finally {
        // Release the reader even if parsing or writing threw.
        reader.close();
    }

    // Delete the original PDF now that its text has been extracted.
    File f = new File(fileDest + ".pdf");
    if (f.exists()) {
        f.delete();
    }
}

From source file:javaapplication1.JavaApplication1.java

/**
 * Parses a specific area of a PDF to a plain text file.
 * @param pdf the original PDF/*ww w  .  ja v  a2  s  .com*/
 * @param txt the resulting text
 * @throws IOException
 */
public String mycheckline(PdfReader reader, int pheight, int pwidth, int lh, int page) throws IOException {
    // PrintWriter out = new PrintWriter(new FileOutputStream(txt));
    Rectangle rect = new Rectangle(0, pheight - lh, pwidth / 3, pheight - lh - fontchecksize);
    RenderFilter filter = new RegionTextRenderFilter(rect);
    TextExtractionStrategy strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(),
            filter);
    return (PdfTextExtractor.getTextFromPage(reader, page, strategy));
}

From source file:org.sinarproject.ecparser.ECRedelineation.java

/**
 * Extracts the text of every page of {@code my_reader}, passes each page to
 * {@code describePage}, prints a parsing summary and optionally writes the
 * collected data.
 *
 * <p>Resets {@code final_mapped_data} and {@code error_while_parsing} to empty
 * maps before processing.
 *
 * @param mymeta output selector: {@code "json"} calls {@code Utils.writeJSONMappedData()},
 *               {@code "csv"} calls {@code Utils.writeCSVFinalData()}; {@code null} or any
 *               other value writes nothing
 */
private static void processECStructure(String mymeta) {
    // TODO: Restructure all the stuff from main into here ..
    int pageCount = my_reader.getNumberOfPages();
    // Maps init .. Key is <PAR_CODE>/<DUN_CODE>/<DM_CODE>
    // <Key> -> Name:Population
    // Errors -> Map<ErrKey, Original String>; ErrKey is Nxx
    final_mapped_data = new TreeMap<>();
    error_while_parsing = new TreeMap<>();

    // Loop through each page. Page numbers are 1-based, so the bound must be
    // inclusive; the previous "i < n" silently skipped the last page.
    for (int page = 1; page <= pageCount; page++) {
        try {
            String content = PdfTextExtractor.getTextFromPage(my_reader, page,
                    new LocationTextExtractionStrategy());
            describePage(content, page);
        } catch (IOException ex) {
            // Best effort: a page that cannot be read is logged and skipped.
            Logger.getLogger(ECRedelineation.class.getName()).log(Level.SEVERE,
                    "Failed to extract text from page " + page, ex);
        }
    }

    // Dump out error summary .. if any
    if (!error_while_parsing.isEmpty()) {
        out.println("==============================");
        out.println(" PARSING ERRORS --> " + error_while_parsing.size() + " errors!!!");
        out.println("DUN: " + DUNerrors + " DM: " + DMerrors + " Fixed:" + fixedDMs);
        out.println("==============================");
        // TODO: Shift to file output (one CODE/UNMATCHED pair per entry of error_while_parsing)
    } else {
        out.println("========================");
        out.println("      ALL OK!!!         ");
        out.println("========================");
    }

    out.println("========================");
    out.println("  @@@@ Data!!! @@@@     ");
    out.println("========================");
    out.println("Final DM count: " + countedDM);
    // Report entries whose population could not be detected, so action can be taken
    for (Map.Entry<String, String> single_data_entry : final_mapped_data.entrySet()) {
        // Output those that were not able to be auto-corrected ..
        if (single_data_entry.getValue().endsWith(":0")) {
            out.print("KEY:" + single_data_entry.getKey());
            out.println(" ==> " + single_data_entry.getValue());
        }
    }

    // Write down output. The null guard is required: switching on a null String throws.
    if (null != mymeta) {
        switch (mymeta) {
        case "json":
            Utils.writeJSONMappedData();
            break;
        case "csv":
            // go direct to CSV ..
            Utils.writeCSVFinalData();
            break;
        }
    }

    out.println("xxxxxxxXXXXXXXXXXxxxxxxxxx");
}

From source file:org.sinarproject.ECRedelineation.java

/**
 * Entry point: opens the PDF at {@code SOURCE}, passes the text of every page
 * to {@code describePage}, prints a parsing summary and writes the collected
 * data via {@code Utils.writeJSONMappedData()}.
 *
 * @param args the command line arguments (not used)
 */
public static void main(String[] args) {
    out.println("Sinar Project's EC Parser ..");
    PdfReader reader = null;
    try {
        reader = new PdfReader(SOURCE);
        int pageCount = reader.getNumberOfPages();
        // Maps init .. Key is <PAR_CODE>/<DUN_CODE>/<DM_CODE>
        // <Key> -> Name:Population
        // Errors -> Map<ErrKey, Original String>; ErrKey is Nxx
        final_mapped_data = new TreeMap<>();
        error_while_parsing = new TreeMap<>();

        // Loop through each page. Page numbers are 1-based, so the bound must be
        // inclusive; the previous "i < n" silently skipped the last page.
        for (int page = 1; page <= pageCount; page++) {
            try {
                String content = PdfTextExtractor.getTextFromPage(reader, page,
                        new LocationTextExtractionStrategy());
                describePage(content, page);
            } catch (IOException ex) {
                // Best effort: a page that cannot be read is logged and skipped.
                Logger.getLogger(ECRedelineation.class.getName()).log(Level.SEVERE,
                        "Failed to extract text from page " + page, ex);
            }
        }

        // Dump out error summary .. if any
        if (!error_while_parsing.isEmpty()) {
            out.println("==============================");
            out.println(" PARSING ERRORS --> " + error_while_parsing.size() + " errors!!!");
            out.println("DUN: " + DUNerrors + " DM: " + DMerrors + " Fixed:" + fixedDMs);
            out.println("==============================");
            // TODO: Shift to file output (one CODE/UNMATCHED pair per entry of error_while_parsing)
        } else {
            out.println("========================");
            out.println("      ALL OK!!!         ");
            out.println("========================");
        }

        out.println("========================");
        out.println("  @@@@ Data!!! @@@@     ");
        out.println("========================");
        out.println("Final DM count: " + countedDM);
        // Report entries whose population could not be detected, so action can be taken
        for (Map.Entry<String, String> single_data_entry : final_mapped_data.entrySet()) {
            // Output those that were not able to be auto-corrected ..
            if (single_data_entry.getValue().endsWith(":0")) {
                out.print("KEY:" + single_data_entry.getKey());
                out.println(" ==> " + single_data_entry.getValue());
            }
        }

        // Write down output.
        // No JSON output in this cut :( Leave it for interaction with golang
        //  and shapefile mapping and manipulations ..
        Utils.writeJSONMappedData();
        // CSV output (Utils.writeCSVFinalData) is intentionally not used here.
        out.println("xxxxxxxXXXXXXXXXXxxxxxxxxx");
    } catch (IOException ex) {
        Logger.getLogger(ECRedelineation.class.getName()).log(Level.SEVERE,
                "Failed to process " + SOURCE, ex);
    } finally {
        // The reader was previously never closed; release it on every path.
        if (reader != null) {
            reader.close();
        }
    }
}

From source file:Project.data.preparation.ExtractPageContentArea.java

/**
 * Extracts the text inside a rectangular region of one page of a PDF and
 * stores it via {@code setTextCropped}.
 *
 * <p>The four coordinates are passed to the {@code Rectangle} constructor in the
 * order {@code upper_x, upper_y, lower_x, lower_y}; the rectangle is also stored
 * in the {@code rect} field.
 *
 * @param pdf     location of the PDF handed to {@code PdfReader}
 * @param pageNum page number handed to the text extractor
 * @param upper_x first x coordinate of the region
 * @param upper_y first y coordinate of the region
 * @param lower_x second x coordinate of the region
 * @param lower_y second y coordinate of the region
 * @throws IOException if the PDF cannot be opened or the page cannot be read
 */
public void parsePdf(String pdf, int pageNum, int upper_x, int upper_y, int lower_x, int lower_y)
        throws IOException {
    PdfReader reader = new PdfReader(pdf);
    try {
        rect = new Rectangle(upper_x, upper_y, lower_x, lower_y);
        // Only text rendered inside the region reaches the location-based strategy.
        RenderFilter filter = new RegionTextRenderFilter(getRect());
        TextExtractionStrategy strategy =
                new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filter);
        TextCropped = PdfTextExtractor.getTextFromPage(reader, pageNum, strategy);
        setTextCropped(TextCropped);
    } finally {
        // Close the reader even when extraction throws, so the file handle is not leaked.
        reader.close();
    }
}