Example usage for com.itextpdf.text.pdf.parser PdfTextExtractor getTextFromPage

Introduction

On this page you can find example usages of com.itextpdf.text.pdf.parser PdfTextExtractor getTextFromPage.

Prototype

public static String getTextFromPage(PdfReader reader, int pageNumber, TextExtractionStrategy strategy)
        throws IOException

Source Link

Documentation

Extract text from a specified page using an extraction strategy.

Usage

From source file:bflows.FattureManagement.java

/**
 * Extracts the text of every page of the PDF read from {@code inputStream}, splits it
 * into lines, and scans those lines to populate the invoice-level fields
 * ({@code data}, {@code totale}, {@code contributi}, {@code prodotti}, {@code altri},
 * {@code iva}) and the per-phone-line {@code consumi} list.
 *
 * <p>Header values are each captured once, from the first line containing their keyword.
 * The usage table is parsed only after the first line containing "RIEPILOGO PER UTENZA"
 * and the method returns as soon as a line equal to "SERVIZI OPZIONALI" is reached.
 * Two table layouts are handled, selected by the invoice year (&gt;= 2017 or earlier).
 *
 * <p>{@link IOException} and {@link NumberFormatException} are caught, reported through
 * {@code EService.logAndRecover}, and recorded via {@code setResult}/{@code setErrorMessage}.
 */
public void processPDF() {
    //        Document pdf = null;
    BufferedWriter writer = null;
    consumi = new ArrayList<Consumo>();
    lines = new ArrayList<String>();

    try {
        // Save a temporary file for debugging
        //outputFile = new File("C:\\Users\\nklma\\Documents\\NetBeansProjects\\temp", "temp.txt"); 
        //writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFile)));            
        //FileOutputStream fileOutputStream = new FileOutputStream("extracted.txt");

        // iText library: extract each page (1-based) and collect its text line by line
        // NOTE(review): pdfReader is not closed if getTextFromPage throws — consider try/finally
        PdfReader pdfReader = new PdfReader(inputStream);
        for (int page = 1; page <= pdfReader.getNumberOfPages(); page++) {
            SimpleTextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
            String currentText = PdfTextExtractor.getTextFromPage(pdfReader, page, strategy);
            String[] l = currentText.split("\n");
            for (int i = 0; i < l.length; i++)
                lines.add(l[i]);
        }
        pdfReader.close();

        //for(String line : lines)
        //{
        //   writer.write(line);
        //   writer.newLine();
        //}

        //writer.close();    

        // One-shot flags: each header value is taken from the first matching line only
        boolean startPointFound = false;
        boolean dateFound = false;
        boolean totaleFound = false;
        boolean contributiFound = false;
        boolean prodottiFound = false;
        boolean altriFound = false;
        boolean ivaFound = false;

        // Usage entry currently being built; null until the first phone line is seen
        Consumo consumo = null;

        for (String line : lines) {
            // retrieve the invoice date
            if (!dateFound) {
                if (line.contains("Emessa")) {
                    String cleanLine = line.replaceAll("\\s+", " ");
                    String[] splitted = cleanLine.split(" ");
                    // the last token containing "/" wins; slashes are turned into dashes
                    for (String s : splitted) {
                        if (s.contains("/")) {
                            s = s.replace("/", "-");
                            data = s;
                        }
                    }
                    // flag is set even if no token contained "/", so data may stay unset
                    dateFound = true;
                }
            }

            // retrieve the invoice total (VAT included)
            if (!totaleFound) {
                if (line.contains("IMPORTO")) {
                    String cleanLine = line.replaceAll("\\s+", " ").replaceAll("_", "");
                    String importo = cleanLine.replace("IMPORTO: ", "").replace("Euro", "").trim();
                    // drop "." and turn "," into "." before parsing
                    // (presumably Italian number format: "." thousands, "," decimals)
                    totale = Double.parseDouble(importo.replace(".", "").replace(",", "."));
                    totaleFound = true;
                }
            }

            // retrieve the "contributi e abbonamenti" (fees and subscriptions) amount
            if (!contributiFound) {
                if (line.contains("CONTRIBUTI E ABBONAMENTI")) {
                    String cleanLine = line.replaceAll("\\s+", " ");
                    String importo = cleanLine.replace("CONTRIBUTI E ABBONAMENTI ", "");
                    contributi = Double.parseDouble(importo.replace(".", "").replace(",", "."));
                    contributiFound = true;
                }
            }

            // retrieve the products (rentals) amount 
            // ONLY FOR 2017+ (the third "-"-separated component of data is read as the year)
            if (data != null && Integer.parseInt(data.split("-")[2]) >= 2017 && !prodottiFound) {
                if (line.contains("PRODOTTI")) {
                    String cleanLine = line.replaceAll("\\s+", " ");
                    String importo = cleanLine.replace("PRODOTTI ", "");
                    prodotti = Double.parseDouble(importo.replace(".", "").replace(",", "."));
                    prodottiFound = true;
                }
            }

            // retrieve the "altri addebiti e accrediti" (other charges and credits) amount
            if (!altriFound) {
                if (line.contains("ALTRI ADDEBITI E ACCREDITI")) {
                    String cleanLine = line.replaceAll("\\s+", " ");
                    String importo = cleanLine.replace("ALTRI ADDEBITI E ACCREDITI ", "");
                    altri = Double.parseDouble(importo.replace(".", "").replace(",", "."));
                    altriFound = true;
                }
            }

            // retrieve the VAT amount
            if ((contributiFound || altriFound) && !ivaFound) // waiting for these avoids matching "partita iva" etc.
            {
                if (line.contains("IVA")) {
                    String cleanLine = line.replaceAll("\\s+", " ");
                    String importo = cleanLine.replace("IVA ", "");
                    iva = Double.parseDouble(importo.replace(".", "").replace(",", "."));
                    ivaFound = true;
                }
            }

            // The first RIEPILOGO PER UTENZA marks the start of the usage table to analyze
            if (!startPointFound && line.contains("RIEPILOGO PER UTENZA"))
                startPointFound = !startPointFound;

            if (!startPointFound)
                continue;

            // SERVIZI OPZIONALI marks the end of the table (the whole line must equal it)
            // NOTE(review): consumo may still be null here, which would add null to consumi;
            // also, if this marker never appears the last consumo is never added — confirm intended
            if (line.matches("SERVIZI OPZIONALI")) {
                consumi.add(consumo);
                return;
            }

            // NOTE(review): data is dereferenced here without the null check used above —
            // confirm a date is always found before the table starts
            if (Integer.parseInt(data.split("-")[2]) >= 2017) {
                // ------------------------------
                // FOR INVOICES FROM 2017 ONWARD
                // ------------------------------
                ArrayList<String> splitted;
                splitted = StringMatcher.matches(line, "\\bLinea\\b\\s((\\d{10}))");
                //if(line.matches("(?:(?:Linea)\\s)(\\d{10})"))
                if (!splitted.isEmpty()) {
                    // New usage entry
                    if (consumo != null) // save the previous one
                    {
                        consumi.add(consumo);
                    }
                    // create a new usage entry
                    consumo = new Consumo();
                    // insert "-" after the first three digits of the matched number
                    StringBuilder str = new StringBuilder(splitted.get(0));
                    str.insert(3, "-");
                    consumo.Telefono = str.toString();
                }

                splitted = StringMatcher.matches(line,
                        "((?:\\w+\\s|\\w+-\\w+\\s)+)(?:\\d{2}\\/\\d{2}\\/\\d{4}\\s)((?:\\D+\\s)+)(?:\\d{2}\\/\\d{2}-\\d{2}\\/\\d{2}\\s)((\\d+,\\d+))$");
                //if(line.matches("(?:(?:(?:\w+\s)+\w+\-)?(?:\w+\s)+)(?:\d{2}\/\d{2}\/\d{4})\s((?:\w+\s)+)(?:\d{2}\/\d{2}\-\d{2}\/\d{2})\s(\d+,\d+)"))
                if (!splitted.isEmpty()) {
                    // Fees ("Contributi") or subscriptions ("Abbonamenti"): amounts are accumulated

                    if (consumo != null) {
                        String lel = splitted.get(1);
                        if (splitted.get(1).contains("Contributi"))
                            consumo.CRB += Double.parseDouble(splitted.get(2).replace(",", "."));
                        else if (splitted.get(1).contains("Abbonamenti"))
                            consumo.ABB += Double.parseDouble(splitted.get(2).replace(",", "."));
                    }
                }

                splitted = StringMatcher.matches(line,
                        "\\bRicariche\\b(?:\\s\\w+)+(?:\\s\\d\\s)((\\d+,\\d+))$");
                if (!splitted.isEmpty())
                //if(line.matches("(Ricariche(?:\\s\\w+)+)(\\s\\d\\s)(\\d+,\\d+)"))
                {
                    // Top-ups ("Ricariche"): accumulated into AAA
                    //splitted = SplitLine.splitNewRicarica(line);

                    if (consumo != null) {
                        consumo.AAA += Double.parseDouble(splitted.get(0).replace(",", "."));
                    }
                }

                splitted = StringMatcher.matches(line, "\\bTotale\\b\\s((\\d+,\\d+))$");
                //if(line.matches("(Totale\\s+)(\\d+,\\d+)"))
                if (!splitted.isEmpty()) {
                    // Total
                    //splitted = SplitLine.splitNewTotale(line);

                    if (consumo != null) {
                        consumo.Totale += Double.parseDouble(splitted.get(0).replace(",", "."));
                    }
                }
            } else {
                // ------------------------------
                // FOR INVOICES BEFORE 2017
                // ------------------------------

                // Phone line and usage amount on the same row
                if (line.matches("(\\d{3}(\\s+)?-(\\s+)?\\d{7})((?:\\s+)(?:\\w+\\s+)+)(\\d+,\\d+)"))
                // (3 digits)(optional whitespaces)-(optional whitespaces)(7 digits)
                // (any number of whitespaces)(any number of words followed by whitespace)(1+ digits),(1+digits)
                {
                    // remove the spaces inside the phone number
                    line = line.replace(" - ", "-");

                    // reaching this point means a usage entry starts on this line
                    ArrayList<String> splitted = SplitLine.splitConsumo1(line);

                    // an entry with the same number already exists, so the data is added to it
                    if (consumo != null && splitted.get(0).replaceAll("\\s+", "").equals(consumo.Telefono)) // continuation of the previous entry
                    {
                        if (splitted.get(1).contains("Contributi"))
                            consumo.CRB = Double.parseDouble(splitted.get(2).replace(",", "."));
                        else if (splitted.get(1).contains("Altri"))
                            consumo.AAA = Double.parseDouble(splitted.get(2).replace(",", "."));
                        else if (splitted.get(1).contains("Abbonamenti"))
                            consumo.ABB = Double.parseDouble(splitted.get(2).replace(",", "."));
                    } else {
                        // no current entry has the number just read
                        if (consumo != null) // save the previous one
                        {
                            consumi.add(consumo);
                        }
                        // create a new usage entry
                        consumo = new Consumo();
                        consumo.Telefono = splitted.get(0);

                        if (splitted.get(1).contains("Contributi"))
                            consumo.CRB = Double.parseDouble(splitted.get(2).replace(",", "."));
                        else if (splitted.get(1).contains("Altri"))
                            consumo.AAA = Double.parseDouble(splitted.get(2).replace(",", "."));
                        else if (splitted.get(1).contains("Abbonamenti"))
                            consumo.ABB = Double.parseDouble(splitted.get(2).replace(",", "."));
                    }
                }

                if (line.matches("((?:\\w+\\s+)+)(\\d+,\\d+)"))//(any number of words followed by whitespaces)(1+ digits),(1+ digits)
                {
                    // row without a phone number: continues the current entry
                    ArrayList<String> splitted = SplitLine.splitConsumo2(line);

                    if (consumo != null) {
                        if (splitted.get(0).contains("Contributi"))
                            consumo.CRB = Double.parseDouble(splitted.get(1).replace(",", "."));
                        else if (splitted.get(0).contains("Altri"))
                            consumo.AAA = Double.parseDouble(splitted.get(1).replace(",", "."));
                        else if (splitted.get(0).contains("Abbonamenti"))
                            consumo.ABB = Double.parseDouble(splitted.get(1).replace(",", "."));
                        else if (splitted.get(0).contains("Totale"))
                            consumo.Totale = Double.parseDouble(splitted.get(1).replace(",", "."));
                    }
                }
            }

        }

        //outputFile.delete();
    } catch (IOException ex) {
        EService.logAndRecover(ex);
        setResult(EService.UNRECOVERABLE_ERROR);
        setErrorMessage("FattureManagement.ProcessPDF(): " + ex.getMessage());
    } catch (NumberFormatException ex) {
        // NOTE(review): this casts a NumberFormatException to FatalError — verify the cast
        // is valid for the project's FatalError type, otherwise it cannot succeed
        EService.logAndRecover((FatalError) ex);
        setResult(EService.UNRECOVERABLE_ERROR);
        setErrorMessage("FattureManagement.ProcessPDF(): " + ex.getMessage());
    }
}

From source file:javaapplication1.JavaApplication1.java

/**
 * Parses a specific area of a PDF to a plain text file.
 * @param pdf the original PDF/*from   w ww .j  a va  2 s.  c  om*/
 * @param txt the resulting text
 * @throws IOException
 */
public String mycheckline(PdfReader reader, int pheight, int pwidth, int lh, int page) throws IOException {
    // PrintWriter out = new PrintWriter(new FileOutputStream(txt));
    Rectangle rect = new Rectangle(0, pheight - lh, pwidth / 3, pheight - lh - fontchecksize);
    RenderFilter filter = new RegionTextRenderFilter(rect);
    TextExtractionStrategy strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(),
            filter);
    return (PdfTextExtractor.getTextFromPage(reader, page, strategy));
}

From source file:me.Aron.Heinecke.fbot.lib.Converter.java

License:Apache License

/**
 * Reads the text of every page of a PDF file and concatenates it.
 *
 * @param file the PDF file to read
 * @return the text of all pages appended in page order, or an empty string when
 *         {@code file} does not exist
 * @throws IOException if the PDF cannot be opened or a page cannot be extracted
 */
@Deprecated
public String ReadPdfFile(File file) throws IOException {
    StringBuilder text = new StringBuilder();

    if (file.exists()) {
        PdfReader pdfReader = new PdfReader(file.getAbsolutePath());
        try {
            // iText page numbers are 1-based and run through getNumberOfPages() inclusive.
            for (int pageid = 1; pageid <= pdfReader.getNumberOfPages(); pageid++) {
                SimpleTextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                text.append(PdfTextExtractor.getTextFromPage(pdfReader, pageid, strategy));
            }
        } finally {
            // Release the reader even when extraction of a page fails.
            pdfReader.close();
        }
    }
    return text.toString();
}

From source file:org.sinarproject.ecparser.ECRedelineation.java

/**
 * Extracts the text of every page of {@code my_reader}, passes each page to
 * {@code describePage}, prints a summary of parsing errors and of entries whose value
 * ends with ":0", and finally writes the collected data in the requested format.
 *
 * @param mymeta output format selector: {@code "json"} or {@code "csv"}; {@code null}
 *               or any other value writes nothing
 */
private static void processECStructure(String mymeta) {
    // TODO: Restructure all the stuff from main into here ..
    int n = my_reader.getNumberOfPages();
    int i;
    // Maps init .. Key is <PAR_CODE>/<DUN_CODE>/<DM_CODE>
    // <Key> -> Name:Population
    // Errors -> Map<ErrKey, Original String>; ErrKey is Nxx
    final_mapped_data = new TreeMap<>();
    error_while_parsing = new TreeMap<>();
    // Loop through each page .. iText pages are numbered 1..n inclusive, so the bound
    // must be <= n or the last page is silently skipped.
    for (i = 1; i <= n; i++) {
        try {
            String content;
            content = PdfTextExtractor.getTextFromPage(my_reader, i, new LocationTextExtractionStrategy());
            // content = PdfTextExtractor.getTextFromPage(reader, i);
            describePage(content, i);
        } catch (IOException ex) {
            // A page that fails to extract is logged and the remaining pages are still processed.
            Logger.getLogger(ECRedelineation.class.getName()).log(Level.SEVERE, null, ex);
        }
    }
    // Dump out error hash .. if any
    if (error_while_parsing.size() > 0) {
        out.println("==============================");
        out.println(" PARSING ERRORS --> " + error_while_parsing.size() + " errors!!!");
        out.println("DUN: " + DUNerrors + " DM: " + DMerrors + " Fixed:" + fixedDMs);
        out.println("==============================");
        // TODO: Shift to file output
        /*
        for (Map.Entry<String, String> single_report_entry : error_while_parsing.entrySet()) {
        out.println("CODE: " + single_report_entry.getKey());
        out.println("UNMATCHED: " + single_report_entry.getValue());
        }
        */
    } else {
        out.println("========================");
        out.println("      ALL OK!!!         ");
        out.println("========================");
    }

    out.println("========================");
    out.println("  @@@@ Data!!! @@@@     ");
    out.println("========================");
    out.println("Final DM count: " + countedDM);
    // Detect if there any population not being able to be detected; so action can be taken
    for (Map.Entry<String, String> single_data_entry : final_mapped_data.entrySet()) {
        // Output those that were not able to be auto-corrected ..
        if (single_data_entry.getValue().endsWith(":0")) {
            out.print("KEY:" + single_data_entry.getKey());
            out.println(" ==> " + single_data_entry.getValue());
        }
    }
    if (null != mymeta) // write down Output
    // No JSON output in this cut :( Leave it for interaction with golang
    //  and shapefile mapping and manipulatons ..
    {
        switch (mymeta) {
        case "json":
            Utils.writeJSONMappedData();
            break;
        case "csv":
            // go direct to CSV ..
            Utils.writeCSVFinalData();
            break;
        }
    }

    out.println("xxxxxxxXXXXXXXXXXxxxxxxxxx");
}

From source file:org.sinarproject.ECRedelineation.java

/**
 * Opens the PDF at {@code SOURCE}, passes the text of every page to
 * {@code describePage}, prints a summary of parsing errors and of entries whose value
 * ends with ":0", and writes the collected data as JSON.
 *
 * @param args the command line arguments (not used)
 */
public static void main(String[] args) {
    out.println("Sinar Project's EC Parser ..");
    PdfReader reader = null;
    try {
        reader = new PdfReader(SOURCE);
        int n;
        n = reader.getNumberOfPages();
        int i;
        // Maps init .. Key is <PAR_CODE>/<DUN_CODE>/<DM_CODE>
        // <Key> -> Name:Population
        // Errors -> Map<ErrKey, Original String>; ErrKey is Nxx
        final_mapped_data = new TreeMap<>();
        error_while_parsing = new TreeMap<>();
        // Loop through each page .. iText pages are numbered 1..n inclusive, so the bound
        // must be <= n or the last page is silently skipped.
        for (i = 1; i <= n; i++) {
            try {
                String content;
                content = PdfTextExtractor.getTextFromPage(reader, i, new LocationTextExtractionStrategy());
                // content = PdfTextExtractor.getTextFromPage(reader, i);
                describePage(content, i);
            } catch (IOException ex) {
                // A page that fails to extract is logged and the remaining pages are still processed.
                Logger.getLogger(ECRedelineation.class.getName()).log(Level.SEVERE, null, ex);
            }
        }
        // Dump out error hash .. if any
        if (error_while_parsing.size() > 0) {
            out.println("==============================");
            out.println(" PARSING ERRORS --> " + error_while_parsing.size() + " errors!!!");
            out.println("DUN: " + DUNerrors + " DM: " + DMerrors + " Fixed:" + fixedDMs);
            out.println("==============================");
            // TODO: Shift to file output
            /*
             for (Map.Entry<String, String> single_report_entry : error_while_parsing.entrySet()) {
             out.println("CODE: " + single_report_entry.getKey());
             out.println("UNMATCHED: " + single_report_entry.getValue());
             }
             */
        } else {
            out.println("========================");
            out.println("      ALL OK!!!         ");
            out.println("========================");
        }

        out.println("========================");
        out.println("  @@@@ Data!!! @@@@     ");
        out.println("========================");
        out.println("Final DM count: " + countedDM);
        // Detect if there any population not being able to be detected; so action can be taken
        for (Map.Entry<String, String> single_data_entry : final_mapped_data.entrySet()) {
            // Output those that were not able to be auto-corrected ..
            if (single_data_entry.getValue().endsWith(":0")) {
                out.print("KEY:" + single_data_entry.getKey());
                out.println(" ==> " + single_data_entry.getValue());
            }
        }
        // write down Output
        // No JSON output in this cut :( Leave it for interaction with golang
        //  and shapefile mapping and manipulatons ..
        Utils.writeJSONMappedData();
        // go direct to CSV ..
        // Utils.writeCSVFinalData();
        out.println("xxxxxxxXXXXXXXXXXxxxxxxxxx");
    } catch (IOException ex) {
        Logger.getLogger(ECRedelineation.class.getName()).log(Level.SEVERE, null, ex);
    } finally {
        // Release the reader whether or not processing succeeded.
        if (reader != null) {
            reader.close();
        }
    }
}

From source file:Project.data.preparation.ExtractPageContentArea.java

/**
 * Extracts the text inside a rectangular area of one page and stores it via
 * {@code setTextCropped}. The rectangle is also stored in the {@code rect} field.
 *
 * @param pdf     path of the PDF, as accepted by the {@code PdfReader} constructor
 * @param pageNum page number handed to the extractor
 * @param upper_x first argument passed to the {@code Rectangle} constructor
 * @param upper_y second argument passed to the {@code Rectangle} constructor
 * @param lower_x third argument passed to the {@code Rectangle} constructor
 * @param lower_y fourth argument passed to the {@code Rectangle} constructor
 * @throws IOException if the PDF cannot be opened or the page cannot be extracted
 */
public void parsePdf(String pdf, int pageNum, int upper_x, int upper_y, int lower_x, int lower_y)
        throws IOException {
    PdfReader reader = new PdfReader(pdf);
    try {
        rect = new Rectangle(upper_x, upper_y, lower_x, lower_y);
        RenderFilter filter = new RegionTextRenderFilter(getRect());
        // Only text rendered inside the rectangle reaches the location-based strategy.
        TextExtractionStrategy strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(),
                filter);
        TextCropped = PdfTextExtractor.getTextFromPage(reader, pageNum, strategy);
        setTextCropped(TextCropped);
    } finally {
        // Release the reader even when extraction fails.
        reader.close();
    }
}