Example usage for the com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy constructor

List of usage examples for the com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy constructor

Introduction

On this page you can find example usages of the com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy constructor, SimpleTextExtractionStrategy().

Prototype

public SimpleTextExtractionStrategy() 

Source Link

Document

Creates a new text extraction renderer.

Usage

From source file:bflows.FattureManagement.java

/**
 * Extracts the text of every page of the PDF read from {@code inputStream} and parses it,
 * line by line, as an invoice.
 *
 * <p>The method resets {@code consumi} and {@code lines}, fills {@code lines} with the text
 * lines of all pages, and then scans them once. During the scan it assigns the fields
 * {@code data} (the first "Emessa" line's token containing "/", with "/" replaced by "-"),
 * {@code totale}, {@code contributi}, {@code prodotti}, {@code altri} and {@code iva}, and,
 * once a line containing "RIEPILOGO PER UTENZA" has been seen, builds {@link Consumo}
 * entries that are appended to {@code consumi}. Two line layouts are handled, selected by
 * whether the third "-"-separated component of {@code data} is {@code >= 2017}.
 *
 * <p>Amounts are parsed by removing every "." and replacing "," with "." before calling
 * {@link Double#parseDouble(String)}.
 *
 * <p>{@link IOException} and {@link NumberFormatException} are caught; in both cases the
 * result is set to {@code EService.UNRECOVERABLE_ERROR} and an error message is stored.
 *
 * <p>NOTE(review): the method returns from inside the loop when a line equal to
 * "SERVIZI OPZIONALI" is found; if no such line exists the last {@code consumo} being built
 * does not appear to be added to {@code consumi} - confirm this is intended.
 */
public void processPDF() {
    //        Document pdf = null;
    BufferedWriter writer = null;
    consumi = new ArrayList<Consumo>();
    lines = new ArrayList<String>();

    try {
        // Save a temporary file for debugging
        //outputFile = new File("C:\\Users\\nklma\\Documents\\NetBeansProjects\\temp", "temp.txt"); 
        //writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFile)));            
        //FileOutputStream fileOutputStream = new FileOutputStream("extracted.txt");

        // iText library: extract the text of each page and collect it line by line
        PdfReader pdfReader = new PdfReader(inputStream);
        for (int page = 1; page <= pdfReader.getNumberOfPages(); page++) {
            SimpleTextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
            String currentText = PdfTextExtractor.getTextFromPage(pdfReader, page, strategy);
            String[] l = currentText.split("\n");
            for (int i = 0; i < l.length; i++)
                lines.add(l[i]);
        }
        // NOTE(review): not reached if extraction throws, so the reader stays open in that case
        pdfReader.close();

        //for(String line : lines)
        //{
        //   writer.write(line);
        //   writer.newLine();
        //}

        //writer.close();    

        // Each flag records that the corresponding header value has already been parsed,
        // so only the first matching line is used.
        boolean startPointFound = false;
        boolean dateFound = false;
        boolean totaleFound = false;
        boolean contributiFound = false;
        boolean prodottiFound = false;
        boolean altriFound = false;
        boolean ivaFound = false;

        // The usage record currently being built; null until the first phone line is matched
        Consumo consumo = null;

        for (String line : lines) {
            // extract the invoice date
            if (!dateFound) {
                if (line.contains("Emessa")) {
                    String cleanLine = line.replaceAll("\\s+", " ");
                    String[] splitted = cleanLine.split(" ");
                    for (String s : splitted) {
                        // the last token containing "/" wins; "/" becomes "-"
                        if (s.contains("/")) {
                            s = s.replace("/", "-");
                            data = s;
                        }
                    }
                    dateFound = true;
                }
            }

            // extract the invoice total including VAT
            if (!totaleFound) {
                if (line.contains("IMPORTO")) {
                    String cleanLine = line.replaceAll("\\s+", " ").replaceAll("_", "");
                    String importo = cleanLine.replace("IMPORTO: ", "").replace("Euro", "").trim();
                    totale = Double.parseDouble(importo.replace(".", "").replace(",", "."));
                    totaleFound = true;
                }
            }

            // extract the "contributi e abbonamenti" (fees and subscriptions) amount
            if (!contributiFound) {
                if (line.contains("CONTRIBUTI E ABBONAMENTI")) {
                    String cleanLine = line.replaceAll("\\s+", " ");
                    String importo = cleanLine.replace("CONTRIBUTI E ABBONAMENTI ", "");
                    contributi = Double.parseDouble(importo.replace(".", "").replace(",", "."));
                    contributiFound = true;
                }
            }

            // extract the products (rentals) amount
            // ONLY FOR 2017+ (third "-"-separated component of data is compared with 2017)
            if (data != null && Integer.parseInt(data.split("-")[2]) >= 2017 && !prodottiFound) {
                if (line.contains("PRODOTTI")) {
                    String cleanLine = line.replaceAll("\\s+", " ");
                    String importo = cleanLine.replace("PRODOTTI ", "");
                    prodotti = Double.parseDouble(importo.replace(".", "").replace(",", "."));
                    prodottiFound = true;
                }
            }

            // extract the "altri addebiti e accrediti" (other charges and credits) amount
            if (!altriFound) {
                if (line.contains("ALTRI ADDEBITI E ACCREDITI")) {
                    String cleanLine = line.replaceAll("\\s+", " ");
                    String importo = cleanLine.replace("ALTRI ADDEBITI E ACCREDITI ", "");
                    altri = Double.parseDouble(importo.replace(".", "").replace(",", "."));
                    altriFound = true;
                }
            }

            // extract the VAT (IVA) amount
            if ((contributiFound || altriFound) && !ivaFound) // this avoids matching "partita iva" (VAT number) etc.
            {
                if (line.contains("IVA")) {
                    String cleanLine = line.replaceAll("\\s+", " ");
                    String importo = cleanLine.replace("IVA ", "");
                    iva = Double.parseDouble(importo.replace(".", "").replace(",", "."));
                    ivaFound = true;
                }
            }

            // The first RIEPILOGO PER UTENZA marks the start of the usage table to analyse
            if (!startPointFound && line.contains("RIEPILOGO PER UTENZA"))
                startPointFound = !startPointFound;

            // Lines before the start marker only feed the header parsing above
            if (!startPointFound)
                continue;

            // SERVIZI OPZIONALI marks the end of the table (the whole line must match)
            if (line.matches("SERVIZI OPZIONALI")) {
                // NOTE(review): consumo may still be null here if no phone line was matched - confirm
                consumi.add(consumo);
                return;
            }

            // NOTE(review): data is dereferenced without a null check here; it is only set when an
            // "Emessa" line with a "/" token was seen earlier - confirm that is guaranteed
            if (Integer.parseInt(data.split("-")[2]) >= 2017) {
                // ------------------------------
                // FOR INVOICES FROM 2017 ONWARDS
                // ------------------------------
                ArrayList<String> splitted;
                splitted = StringMatcher.matches(line, "\\bLinea\\b\\s((\\d{10}))");
                //if(line.matches("(?:(?:Linea)\\s)(\\d{10})"))
                if (!splitted.isEmpty()) {
                    // New usage record
                    if (consumo != null) // save the previous one
                    {
                        consumi.add(consumo);
                    }
                    // create a new usage record
                    consumo = new Consumo();
                    // insert "-" after the first three characters of the matched number
                    StringBuilder str = new StringBuilder(splitted.get(0));
                    str.insert(3, "-");
                    consumo.Telefono = str.toString();
                }

                splitted = StringMatcher.matches(line,
                        "((?:\\w+\\s|\\w+-\\w+\\s)+)(?:\\d{2}\\/\\d{2}\\/\\d{4}\\s)((?:\\D+\\s)+)(?:\\d{2}\\/\\d{2}-\\d{2}\\/\\d{2}\\s)((\\d+,\\d+))$");
                //if(line.matches("(?:(?:(?:\w+\s)+\w+\-)?(?:\w+\s)+)(?:\d{2}\/\d{2}\/\d{4})\s((?:\w+\s)+)(?:\d{2}\/\d{2}\-\d{2}\/\d{2})\s(\d+,\d+)"))
                if (!splitted.isEmpty()) {
                    // Fees or subscriptions

                    if (consumo != null) {
                        String lel = splitted.get(1);
                        // amounts are accumulated (+=) in this layout
                        if (splitted.get(1).contains("Contributi"))
                            consumo.CRB += Double.parseDouble(splitted.get(2).replace(",", "."));
                        else if (splitted.get(1).contains("Abbonamenti"))
                            consumo.ABB += Double.parseDouble(splitted.get(2).replace(",", "."));
                    }
                }

                splitted = StringMatcher.matches(line,
                        "\\bRicariche\\b(?:\\s\\w+)+(?:\\s\\d\\s)((\\d+,\\d+))$");
                if (!splitted.isEmpty())
                //if(line.matches("(Ricariche(?:\\s\\w+)+)(\\s\\d\\s)(\\d+,\\d+)"))
                {
                    // Top-ups ("Ricariche")
                    //splitted = SplitLine.splitNewRicarica(line);

                    if (consumo != null) {
                        consumo.AAA += Double.parseDouble(splitted.get(0).replace(",", "."));
                    }
                }

                splitted = StringMatcher.matches(line, "\\bTotale\\b\\s((\\d+,\\d+))$");
                //if(line.matches("(Totale\\s+)(\\d+,\\d+)"))
                if (!splitted.isEmpty()) {
                    // Total
                    //splitted = SplitLine.splitNewTotale(line);

                    if (consumo != null) {
                        consumo.Totale += Double.parseDouble(splitted.get(0).replace(",", "."));
                    }
                }
            } else {
                // ------------------------------
                // FOR INVOICES BEFORE 2017
                // ------------------------------

                // Phone line and usage
                if (line.matches("(\\d{3}(\\s+)?-(\\s+)?\\d{7})((?:\\s+)(?:\\w+\\s+)+)(\\d+,\\d+)"))
                // (3 digits)(optional whitespaces)-(optional whitespaces)(7 digits)
                // (any number of whitespaces)(any number of words followed by whitespace)(1+ digits),(1+digits)
                {
                    // remove the spaces in the phone number
                    line = line.replace(" - ", "-");

                    // reaching this point means a usage row starts here
                    ArrayList<String> splitted = SplitLine.splitConsumo1(line);

                    // a usage record with the same number already exists, so the data is added to it
                    if (consumo != null && splitted.get(0).replaceAll("\\s+", "").equals(consumo.Telefono)) // continuation of the previous one
                    {
                        // amounts are assigned (=), not accumulated, in this layout
                        if (splitted.get(1).contains("Contributi"))
                            consumo.CRB = Double.parseDouble(splitted.get(2).replace(",", "."));
                        else if (splitted.get(1).contains("Altri"))
                            consumo.AAA = Double.parseDouble(splitted.get(2).replace(",", "."));
                        else if (splitted.get(1).contains("Abbonamenti"))
                            consumo.ABB = Double.parseDouble(splitted.get(2).replace(",", "."));
                    } else {
                        // no usage record exists for the number just read
                        if (consumo != null) // save the previous one
                        {
                            consumi.add(consumo);
                        }
                        // create a new usage record
                        consumo = new Consumo();
                        consumo.Telefono = splitted.get(0);

                        if (splitted.get(1).contains("Contributi"))
                            consumo.CRB = Double.parseDouble(splitted.get(2).replace(",", "."));
                        else if (splitted.get(1).contains("Altri"))
                            consumo.AAA = Double.parseDouble(splitted.get(2).replace(",", "."));
                        else if (splitted.get(1).contains("Abbonamenti"))
                            consumo.ABB = Double.parseDouble(splitted.get(2).replace(",", "."));
                    }
                }

                if (line.matches("((?:\\w+\\s+)+)(\\d+,\\d+)"))//(any number of words followed by whitespaces)(1+ digits),(1+ digits)
                {
                    // continuation row of the current usage record (no phone number on the line)
                    ArrayList<String> splitted = SplitLine.splitConsumo2(line);

                    if (consumo != null) {
                        if (splitted.get(0).contains("Contributi"))
                            consumo.CRB = Double.parseDouble(splitted.get(1).replace(",", "."));
                        else if (splitted.get(0).contains("Altri"))
                            consumo.AAA = Double.parseDouble(splitted.get(1).replace(",", "."));
                        else if (splitted.get(0).contains("Abbonamenti"))
                            consumo.ABB = Double.parseDouble(splitted.get(1).replace(",", "."));
                        else if (splitted.get(0).contains("Totale"))
                            consumo.Totale = Double.parseDouble(splitted.get(1).replace(",", "."));
                    }
                }
            }

        }

        //outputFile.delete();
    } catch (IOException ex) {
        EService.logAndRecover(ex);
        setResult(EService.UNRECOVERABLE_ERROR);
        setErrorMessage("FattureManagement.ProcessPDF(): " + ex.getMessage());
    } catch (NumberFormatException ex) {
        // NOTE(review): casts a NumberFormatException to FatalError; FatalError is not declared
        // in this view - confirm the cast is valid and cannot fail at runtime
        EService.logAndRecover((FatalError) ex);
        setResult(EService.UNRECOVERABLE_ERROR);
        setErrorMessage("FattureManagement.ProcessPDF(): " + ex.getMessage());
    }
}

From source file:br.com.smarttaco.util.HelenaBarbosa.java

/**
 * pdf2txt/*ww  w .  j a  v  a  2 s.  c  o m*/
 *
 * @param pdf
 * @param paginas se for <code>null</code> realiza leitura completa.
 * @param txt
 * @throws FileNotFoundException
 * @throws IOException
 */
private static void pdf2txt(final String pdf, List<Integer> paginas, final String txt)
        throws FileNotFoundException, IOException {
    PdfReader reader = new PdfReader(pdf);
    //System.out.println(reader.getInfo().toString());
    if (paginas != null) {
        reader.selectPages(paginas);
    }
    PdfReaderContentParser parser = new PdfReaderContentParser(reader);
    PrintWriter out = new PrintWriter(txt, "UTF-8");
    TextExtractionStrategy strategy;
    for (int i = 1; i <= reader.getNumberOfPages(); i++) {
        strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
        out.println(strategy.getResultantText());
    }
    out.flush();
    out.close();
    reader.close();
}

From source file:com.cloudhub.util.PDFToText.java

License:Apache License

/**
 * Parses a PDF to a plain text file, writing the text of each page followed by a
 * line separator.
 *
 * <p>The output is written with the platform default charset, as before. Both the
 * PDF reader and the writer are closed even if extraction fails.
 *
 * @param source the original PDF
 * @param destination the resulting text
 * @throws IOException if the PDF cannot be read or the destination cannot be opened
 */
public static void parsePdf(String source, String destination) throws IOException {
    PdfReader reader = new PdfReader(source);
    try {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        PrintWriter out = new PrintWriter(new FileOutputStream(destination));
        try {
            for (int i = 1; i <= reader.getNumberOfPages(); i++) {
                TextExtractionStrategy strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
                out.println(strategy.getResultantText());
            }
            out.flush();
        } finally {
            out.close();
        }
    } finally {
        // The original never closed the reader, leaking the underlying file handle
        reader.close();
    }
}

From source file:com.joanzapata.PDFViewActivity.java

License:Open Source License

/**
 * Extracts the text of pages 1..{@code pageNumber} of a PDF into the app-private file
 * "test.txt", reads that file back, and displays a summary of text statistics.
 *
 * <p>The summary is stored in {@code totalInfo} and the extracted text in
 * {@code totalContent}. The output stream and the PDF reader are closed even when
 * extraction or writing fails.
 *
 * @param pdf2 file name of the PDF handed to {@code PdfReader}
 * @param txt not used by this method
 * @throws IOException if the PDF cannot be read or the text file cannot be written
 */
public void parsePdf(String pdf2, String txt) throws IOException {
    PdfReader reader = new PdfReader(pdf2);
    try {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);

        File file = getFileStreamPath("test.txt");

        if (!file.exists()) {
            file.createNewFile();
        }

        FileOutputStream writer = openFileOutput(file.getName(), Context.MODE_PRIVATE);
        try {
            for (int i = 1; i <= pageNumber; i++) {
                TextExtractionStrategy strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
                writer.write(strategy.getResultantText().getBytes());
                writer.flush();
            }
        } finally {
            writer.close();
        }
    } finally {
        reader.close();
    }

    String totalString = readFromFile();
    System.out.println(totalString);
    totalContent = totalString;

    // Text statistics computed from the extracted content
    String word = "Number of words  " + wordCount(totalString) + "\n";
    String averageWords = "Average length of words  " + " " + averageWords(totalString) + "\n";
    String sentenseCount = "Number of sentences  " + sentenseCount(totalString) + "\n";
    String averageSentenses = "Average length of sentences  " + averageSentense(totalString) + "\n";
    String readability = " Readability Index  " + getReadability(totalString) + "\n";

    // Values taken from the input fields of the activity
    String subject = "Subject Area  " + etSubject.getText().toString() + "\n";
    String article = "Type of the Article : " + etArticle.getText().toString() + "\n";
    String pages = " Pages  " + startPage + " to " + pageNumber + "\n";
    String book = "book name  " + etBookName.getText().toString() + "\n";
    String readername = "reader name  " + etReaderName.getText().toString() + "\n";
    totalInfo = readername + book + subject + "\n" + article + pages + word + averageWords + sentenseCount
            + averageSentenses + readability;
    displayMessage(totalInfo);

}

From source file:com.sustainalytics.crawlerfilter.PDFTitleGeneration.java

License:Apache License

/**
 * Extracts the text of every page of a PDF with iText and returns it concatenated.
 *
 * <p>Extraction is best effort: a page that fails to parse is skipped after printing
 * the stack trace. If the PDF cannot be opened at all, the empty string is returned
 * (previously this case ended in a {@link NullPointerException} because the reader
 * was still {@code null} when first used).
 *
 * @param pdf file name of the PDF handed to {@code PdfReader}
 * @return the concatenated page texts, or {@code ""} if the PDF could not be opened
 */
public static String extractITextText(String pdf) {
    PdfReader reader;
    try {
        reader = new PdfReader(pdf);
    } catch (IOException e) {
        e.printStackTrace();
        return "";
    }
    StringBuilder text = new StringBuilder();
    try {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        for (int i = 1; i <= reader.getNumberOfPages(); i++) {
            try {
                TextExtractionStrategy strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
                text.append(strategy.getResultantText());
            } catch (IOException e) {
                // Skip the unreadable page and continue with the next one
                e.printStackTrace();
            }
        }
    } finally {
        reader.close();
    }

    return text.toString();
}

From source file:com.sustainalytics.crawlerfilter.PDFtoText.java

License:Apache License

/**
 * Extracts the text of every page of a PDF with iText and returns it concatenated.
 *
 * <p>Extraction is best effort: a page that fails to parse is logged and skipped. If
 * the PDF cannot be opened at all, the failure is logged and the empty string is
 * returned (previously this case ended in a {@link NullPointerException} because the
 * reader was still {@code null} when first used).
 *
 * @param pdf file name of the PDF handed to {@code PdfReader}
 * @return the concatenated page texts, or {@code ""} if the PDF could not be opened
 */
public static String extractITextText(String pdf) {
    PdfReader reader;
    try {
        reader = new PdfReader(pdf);
    } catch (IOException e) {
        logger.info("Error in reading file with iText parser\n");
        return "";
    }
    StringBuilder text = new StringBuilder();
    try {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        for (int i = 1; i <= reader.getNumberOfPages(); i++) {
            try {
                TextExtractionStrategy strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
                text.append(strategy.getResultantText());
            } catch (IOException e) {
                logger.info("Error in parsing with iText parser\n");
            }
            // Logged once per page, as in the original implementation
            logger.info("PDF text extracted from " + pdf + "\n");
        }
    } finally {
        reader.close();
    }

    return text.toString();
}

From source file:conversorpdf.Conversor.Conversor.java

/**
 * Parses a PDF to a plain text file./*from  w w  w.  j  a  va2  s. co  m*/
 * @param pdf the original PDF
 * @param txt the resulting text
 * @throws IOException
 */
public boolean parsePdf(String pdf, String txt, boolean removerAcento) throws IOException {
    PdfReader reader = new PdfReader(pdf);
    PdfReaderContentParser parser = new PdfReaderContentParser(reader);
    PrintWriter out = new PrintWriter(new FileOutputStream(txt));
    TextExtractionStrategy strategy;
    for (int i = 1; i <= reader.getNumberOfPages(); i++) {

        strategy = parser.processContent(i, new SimpleTextExtractionStrategy());

        if (removerAcento == false) {
            out.println(strategy.getResultantText());
        } else {
            out.println(this.removeAcentos(strategy.getResultantText()));
        }

    }
    out.flush();
    out.close();

    return true;
}

From source file:coviam.pdf.PdfParser.java

/**
 * Extracts the text of a fixed resume PDF, removes hyphenated line breaks ("-\n"),
 * prints the result to standard output and writes it to a fixed text file with every
 * line prefixed by "line-->".
 *
 * <p>Failures are reported on standard error instead of being silently dropped.
 * Standard output is flushed but left open, because closing {@code System.out} would
 * silence all later output of the process.
 */
public void getText() {

    String pdf = "/home/amit/NetBeansProjects/ResumeParser/data/resumes/ejd1.pdf";
    String text = "/home/amit/NetBeansProjects/ResumeParser/data/resumes/edj1.txt";
    PdfReader reader = null;
    try {
        reader = new PdfReader(pdf);
        PdfReaderContentParser contentParser = new PdfReaderContentParser(reader);
        StringBuilder textBuffer = new StringBuilder();
        for (int i = 1; i <= reader.getNumberOfPages(); i++) {
            TextExtractionStrategy strategy = contentParser.processContent(i, new SimpleTextExtractionStrategy());
            textBuffer.append(strategy.getResultantText());
        }
        // Re-join words that were hyphenated across a line break
        String resultText = textBuffer.toString().replaceAll("-\n", "");
        System.out.println("-->" + resultText);

        // Single writer on the output file; the original also opened a second, unused
        // writer on the same path and never closed it.
        PrintWriter lineWriter = new PrintWriter(new FileOutputStream(text));
        try {
            StringTokenizer stringTokenizer = new StringTokenizer(resultText, "\n");
            while (stringTokenizer.hasMoreTokens()) {
                String curToken = stringTokenizer.nextToken();
                lineWriter.println("line-->" + curToken);
            }
            lineWriter.flush();
        } finally {
            lineWriter.close();
        }
        System.out.flush();
    } catch (IOException ioe) {
        System.err.println("PdfParser.getText(): failed to extract text from " + pdf + ": " + ioe);
    } finally {
        if (reader != null) {
            reader.close();
        }
    }

}

From source file:de.offis.health.icardea.cied.pdf.extractor.PDFiText5Extractor.java

License:GNU General Public License

/**
 * Returns the text of a single page of the currently open PDF, prefixed with
 * {@code PAGE_START_MARKER}.
 *
 * @param pageNumber 1-based page number; must lie in 1..{@code getNumberOfPages()}
 * @return the marker followed by the page text, or {@code null} when the page yields
 *         no text other than whitespace
 * @throws IOException if the page content cannot be processed
 * @throws Exception if no PDF is open or the page number is out of range
 */
public String getText(int pageNumber) throws IOException, Exception {
    // Guard: a reader must be available before anything else is evaluated
    if (pdfReader == null) {
        // TODO: Add own exception.
        throw new Exception("There is no open PDF to work with.");
    }

    final int pageCount = getNumberOfPages();
    final boolean pageInRange = pageNumber > 0 && pageNumber <= pageCount;
    if (!pageInRange) {
        // TODO: Add own exception.
        throw new Exception("The given page number (" + pageNumber + ") "
                + "is not in the range of valid pages (1.." + pageCount + ").");
    }

    TextExtractionStrategy extracted = new PdfReaderContentParser(pdfReader)
            .processContent(pageNumber, new SimpleTextExtractionStrategy());

    // Pages without any non-whitespace text produce null rather than a bare marker
    return (extracted != null && extracted.getResultantText().trim().length() > 0)
            ? PAGE_START_MARKER + extracted.getResultantText()
            : null;
}

From source file:helper.PdfText.java

License:Apache License

/**
 * Extracts the plain text of all pages of a PDF file with iText.
 *
 * <p>The PDF reader is closed before returning, whether extraction succeeds or fails.
 *
 * @param pdfFile this file will be extracted.
 * @return the plain text of the pdf
 * @throws HttpArchiveException with status 500 if the file cannot be read or parsed
 */
public String itext(File pdfFile) {

    PdfReader reader = null;
    try {
        reader = new PdfReader(pdfFile.getAbsolutePath());
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        StringBuilder buf = new StringBuilder();
        for (int i = 1; i <= reader.getNumberOfPages(); i++) {
            TextExtractionStrategy strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
            buf.append(strategy.getResultantText());
        }

        return buf.toString();
    } catch (IOException e) {
        throw new HttpArchiveException(500, e);
    } finally {
        // The original never closed the reader, leaking the underlying file handle
        if (reader != null) {
            reader.close();
        }
    }

}