Example usage for org.apache.pdfbox.pdmodel PDDocument PDDocument

List of usage examples for org.apache.pdfbox.pdmodel PDDocument PDDocument

Introduction

In this page you can find the example usage for org.apache.pdfbox.pdmodel PDDocument PDDocument.

Prototype

public PDDocument(COSDocument doc) 

Source Link

Document

Constructor that uses an existing document.

Usage

From source file:com.sastix.cms.common.services.htmltopdf.PdfTest.java

License:Apache License

@Test
public void testPdfFromStringTo() throws Exception {

    // GIVEN an html template containing special characters that java stores in utf-16 internally
    Pdf pdf = pdfBuilder.build();//from w  ww.  j av a2  s  .co m
    pdf.addPage("<html><head><meta charset=\"utf-8\"></head><h1>Mller</h1></html>", PageType.htmlAsString);

    String tempFolder = temporaryFolder.newFolder().getPath();
    pdf.saveAs(tempFolder + "/output.pdf");

    // WHEN
    byte[] pdfBytes = pdf.getPDF();

    PDFParser parser = new PDFParser(
            new RandomAccessBufferedFileInputStream(new ByteArrayInputStream(pdfBytes)));

    // that is a valid PDF (otherwise an IOException occurs)
    parser.parse();
    PDFTextStripper pdfTextStripper = new PDFTextStripper();
    String pdfText = pdfTextStripper.getText(new PDDocument(parser.getDocument()));

    assertThat("document should contain the creditorName", pdfText, containsString("Mller"));
}

From source file:com.validation.manager.core.server.core.AttachmentServerTest.java

License:Apache License

/**
 * Test of addFile method, of class AttachmentServer.
 *///from w  ww.  ja  v a  2  s  .  c  o m
@Test
public void testAddRetrieveTextFile() {
    try {
        System.out.println("add text File");
        File f = new File("target/Test.txt");
        f.deleteOnExit();
        List<String> lines = Arrays.asList("The first line", "The second line");
        Path file = Paths.get(f.getAbsolutePath());
        Files.write(file, lines, Charset.forName("UTF-8"));
        AttachmentServer instance = new AttachmentServer();
        instance.addFile(f, f.getName());
        instance.write2DB();
        //Delete the file
        FileUtils.delete(f.getAbsolutePath());
        assertEquals(1, (int) instance.getAttachmentType().getId());//Text file
        System.out.println("retrieveFile");
        AttachmentServer temp = new AttachmentServer(instance.getAttachmentPK());
        File loadedFile = temp.getAttachedFile("target/loaded/");
        BufferedReader br = new BufferedReader(new FileReader(loadedFile));
        String line;
        int count = 0;
        while ((line = br.readLine()) != null) {
            assertEquals(lines.get(count), line);
            System.out.println(line);
            count++;
        }
        assertEquals(lines.size(), count);
        //Create pdf file
        System.out.println("add pdf File");
        File pdf = Tool.convertToPDF(loadedFile, "target/Text.pdf");
        pdf.deleteOnExit();
        instance = new AttachmentServer();
        instance.addFile(pdf, pdf.getName());
        instance.write2DB();
        //Delete the file
        FileUtils.delete(pdf.getAbsolutePath());
        assertEquals(2, (int) instance.getAttachmentType().getId());//PDF file
        System.out.println("retrieveFile");
        temp = new AttachmentServer(instance.getAttachmentPK());
        loadedFile = temp.getAttachedFile("target/loaded/");
        PDFTextStripper pdfStripper;
        PDDocument pdDoc = null;
        COSDocument cosDoc = null;
        try {
            PDFParser parser = new PDFParser(new RandomAccessBufferedFileInputStream(loadedFile));
            parser.parse();
            cosDoc = parser.getDocument();
            pdfStripper = new PDFTextStripper();
            pdDoc = new PDDocument(cosDoc);
            pdfStripper.setStartPage(1);
            pdfStripper.setEndPage(1);
            String parsedText = pdfStripper.getText(pdDoc);
            System.out.println(parsedText);
        } catch (IOException ex) {
            Exceptions.printStackTrace(ex);
            fail();
        } finally {
            if (cosDoc != null) {
                cosDoc.close();
            }
            if (pdDoc != null) {
                pdDoc.close();
            }
        }
    } catch (IOException | VMException ex) {
        Exceptions.printStackTrace(ex);
        fail();
    }
}

From source file:cz.mzk.editor.server.handler.GetOcrFromPdfHandler.java

License:Open Source License

private String pdftoText(String fileName) throws ActionException {

    File pdfFile = new File(fileName);

    if (!pdfFile.isFile()) {
        LOGGER.error("The file: " + fileName + " does not exist.");
        throw new ActionException("Unable to parse the pdf file.");
    }// w w  w  .  jav  a  2  s  .c  o m

    PDFParser parser = null;
    COSDocument cosDoc = null;
    PDFTextStripper pdfStripper;
    PDDocument pdDoc = null;
    String parsedText;
    try {
        parser = new PDFParser(new RandomAccessBufferedFileInputStream(new FileInputStream(pdfFile)));
    } catch (Exception e) {
        LOGGER.error("Unable to open PDF Parser.: " + e);
        e.printStackTrace();
        throw new ActionException("Unable to parse the pdf file.");
    }

    try {
        parser.parse();
        cosDoc = parser.getDocument();
        pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        parsedText = pdfStripper.getText(pdDoc);
    } catch (Exception e) {
        LOGGER.error("An exception occured in parsing the PDF Document.");
        e.printStackTrace();
        throw new ActionException("Unable to parse the pdf file. " + e);
    } finally {
        try {
            if (cosDoc != null)
                cosDoc.close();
            if (pdDoc != null)
                pdDoc.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    return parsedText;
}

From source file:data.PDFManager.java

/**
 * /*from  ww  w . java2 s . c  om*/
 * @return String do conteudo do pdf
 * @throws IOException 
 */
public String ToText() throws IOException {
    this.pdfStripper = null;
    this.pdDoc = null;
    this.cosDoc = null;

    file = new File(filePath);
    parser = new PDFParser(new RandomAccessFile(file, "r"));

    parser.parse();
    cosDoc = parser.getDocument();
    pdfStripper = new PDFTextStripper();
    pdDoc = new PDDocument(cosDoc);
    pdDoc.getNumberOfPages();
    pdfStripper.setStartPage(1);
    pdfStripper.setEndPage(pdDoc.getNumberOfPages());

    Text = pdfStripper.getText(pdDoc);
    return Text;
}

From source file:de.hsmannheim.ss15.alr.searchengine.PDFParser.java

public String getTextOfPDF(byte[] in) throws Exception {

    ByteArrayInputStream input = new ByteArrayInputStream(in);

    org.apache.pdfbox.pdfparser.PDFParser parser;
    String parsedText = null;/*from  www. java  2  s .  com*/
    ;
    PDFTextStripper pdfStripper = null;
    PDDocument pdDoc = null;
    COSDocument cosDoc = null;

    parser = new NonSequentialPDFParser(input);

    //parse PDF
    try {
        parser.parse();
        cosDoc = parser.getDocument();
        pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);

        parsedText = pdfStripper.getText(pdDoc);
    } catch (Exception e) {
        throw (e);

    } finally {
        if (cosDoc != null) {
            cosDoc.close();
        }
        if (pdDoc != null) {
            pdDoc.close();
        }

    }
    return parsedText;
}

From source file:de.maklerpoint.office.Lucene.Indexer.java

License:Open Source License

private void indexFileorDir(String fileName) throws IOException {
    listFiles(new File(fileName));

    for (File f : queue) {
        FileReader fr = null;//from   w  w w  .j  a v a  2  s.  co m
        try {
            if (f.getName().startsWith(".")) {
                //                    System.out.println("Versteckte datei: " + f.getName());
                // TODO add html, xml parsers
            } else if (f.getName().endsWith(".htm") || f.getName().endsWith(".html")
                    || f.getName().endsWith(".xml") || f.getName().endsWith(".txt")) {
                Document doc = new Document();

                //===================================================
                // add contents of file
                //===================================================
                fr = new FileReader(f);
                doc.add(new Field("contents", fr));

                //===================================================
                //adding second field which contains the path of the file
                //===================================================
                doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.ANALYZED));
                /**
                 * Adding Typ
                 */
                doc.add(new Field("type", String.valueOf(FileTypes.TXT), Field.Store.YES,
                        Field.Index.NOT_ANALYZED));

                doc.add(new Field("modified", df.format(f.lastModified()), Field.Store.YES,
                        Field.Index.NOT_ANALYZED));

                doc.add(new Field("filesize",
                        String.valueOf(FormatFileSize.formatSize(f.length(), FormatFileSize.KB)),
                        Field.Store.YES, Field.Index.NOT_ANALYZED));

                writer.addDocument(doc);
            } else if (f.getName().endsWith(".pdf")) {
                PDFParser parser = new PDFParser(new FileInputStream(f));
                parser.parse();
                COSDocument cd = parser.getDocument();
                PDFTextStripper stripper = new PDFTextStripper();

                String text = stripper.getText(new PDDocument(cd));

                Document doc = new Document();

                doc.add(new Field("contents", text, Field.Store.YES, Field.Index.ANALYZED));
                doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.ANALYZED));
                doc.add(new Field("type", String.valueOf(FileTypes.PDF), Field.Store.YES,
                        Field.Index.NOT_ANALYZED));

                doc.add(new Field("modified", df.format(f.lastModified()), Field.Store.YES,
                        Field.Index.NOT_ANALYZED));

                doc.add(new Field("filesize",
                        String.valueOf(FormatFileSize.formatSize(f.length(), FormatFileSize.KB)),
                        Field.Store.YES, Field.Index.NOT_ANALYZED));

                writer.addDocument(doc);
                cd.close();
            } else if (f.getName().endsWith(".doc") || f.getName().endsWith(".docx")) {

                POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(f));
                WordExtractor extractor = new WordExtractor(fs);
                String wordText = extractor.getText();

                Document doc = new Document();
                doc.add(new Field("contents", wordText, Field.Store.YES, Field.Index.ANALYZED));

                doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.ANALYZED));
                doc.add(new Field("type", String.valueOf(FileTypes.DOC), Field.Store.YES,
                        Field.Index.NOT_ANALYZED));

                doc.add(new Field("modified", df.format(f.lastModified()), Field.Store.YES,
                        Field.Index.NOT_ANALYZED));

                doc.add(new Field("filesize",
                        String.valueOf(FormatFileSize.formatSize(f.length(), FormatFileSize.KB)),
                        Field.Store.YES, Field.Index.NOT_ANALYZED));

                writer.addDocument(doc);
            } else if (f.getName().endsWith(".xls") || f.getName().endsWith(".xlsx")) {
                POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(f));
                ExcelExtractor extractor = new ExcelExtractor(fs);
                String excelText = extractor.getText();

                Document doc = new Document();
                doc.add(new Field("contents", excelText, Field.Store.YES, Field.Index.ANALYZED));

                doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.ANALYZED));
                doc.add(new Field("type", String.valueOf(FileTypes.XLS), Field.Store.YES,
                        Field.Index.NOT_ANALYZED));

                doc.add(new Field("modified", df.format(f.lastModified()), Field.Store.YES,
                        Field.Index.NOT_ANALYZED));

                doc.add(new Field("filesize",
                        String.valueOf(FormatFileSize.formatSize(f.length(), FormatFileSize.KB)),
                        Field.Store.YES, Field.Index.NOT_ANALYZED));

                writer.addDocument(doc);
            } else if (f.getName().endsWith(".ppt") || f.getName().endsWith(".pptx")) {
                POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(f));
                PowerPointExtractor extractor = new PowerPointExtractor(fs);
                String ppttext = extractor.getText();

                Document doc = new Document();
                doc.add(new Field("contents", ppttext, Field.Store.YES, Field.Index.ANALYZED));

                doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.ANALYZED));

                doc.add(new Field("modified", df.format(f.lastModified()), Field.Store.YES,
                        Field.Index.NOT_ANALYZED));

                doc.add(new Field("type", String.valueOf(FileTypes.PPT), Field.Store.YES,
                        Field.Index.NOT_ANALYZED));

                doc.add(new Field("filesize",
                        String.valueOf(FormatFileSize.formatSize(f.length(), FormatFileSize.KB)),
                        Field.Store.YES, Field.Index.NOT_ANALYZED));

                writer.addDocument(doc);
            }

            if (Log.logger.isDebugEnabled()) {
                Log.logger.debug("Lucene | Neue Datei indexiert: " + f);
            }
        } catch (Exception e) {
            if (Log.logger.isDebugEnabled()) {
                Log.logger.debug("Datei konnte nicht indexiert werden: " + f, e);
            }
            continue;
        } finally {
            //                fr.close();
        }
    }

    writer.optimize();
    queue.clear();

}

From source file:edu.esprit.filereader.PdfReader.java

public String ToText() throws IOException {
    this.pdfStripper = null;
    this.pdDoc = null;
    this.cosDoc = null;

    file = new File(filePath);
    parser = new PDFParser(new RandomAccessFile(file, "r"));

    parser.parse();/*  w ww. j a  va2  s.  c  o m*/
    cosDoc = parser.getDocument();
    pdfStripper = new PDFTextStripper();
    pdDoc = new PDDocument(cosDoc);
    pdDoc.getNumberOfPages();
    pdfStripper.setStartPage(1);
    pdfStripper.setEndPage(10);

    // reading text from page 1 to 10
    // if you want to get text from full pdf file use this code
    // pdfStripper.setEndPage(pdDoc.getNumberOfPages());
    Text = pdfStripper.getText(pdDoc);
    return Text;
}

From source file:edu.isi.bmkeg.lapdf.extraction.LAPDFTextStripper.java

License:Apache License

/**
 * @deprecated/*from  ww w.  j  av  a 2 s .  com*/
 * @see LAPDFTextStripper#getWordBlocks( PDDocument )
 * @param doc The document to extract the text from.
 * @return The document text.
 * @throws IOException If there is an error extracting the text.
 */
public String getText(COSDocument doc) throws IOException {
    return getWordBlocks(new PDDocument(doc));
}

From source file:edu.isi.bmkeg.lapdf.extraction.LAPDFTextStripper.java

License:Apache License

/**
 * @deprecated/*from   ww w .  java2 s .  c  o  m*/
 * @see LAPDFTextStripper#writeWordBlocks( PDDocument, Writer )
 * @param doc The document to extract the text.
 * @param outputStream The stream to write the text to.
 * @throws IOException If there is an error extracting the text.
 */
public void writeText(COSDocument doc, Writer outputStream) throws IOException {
    writeWordBlocks(new PDDocument(doc), outputStream);
}

From source file:eu.sisob.uma.extractors.adhoc.email.EmailExtractor.java

License:Open Source License

/**
 *
 * @param input_file//from  ww w.java2  s .co m
 * @param data_dir
 * @param output_file
 * @param norepeat_output_file
 * @param notfound_output_file
 * @param notfound_norepeat_output_file
 * @param filters
 * @param error_sw
 */
public static void extract_emails(File input_file, File data_dir, File output_file, File norepeat_output_file,
        File notfound_output_file, File notfound_norepeat_output_file, List<String> filters,
        StringWriter error_sw) {
    CSVReader reader = null;
    try {
        reader = new CSVReader(new FileReader(input_file), CSV_SEPARATOR);
    } catch (FileNotFoundException ex) {
        Logger.getRootLogger().error("Error reading " + input_file.getName() + " - " + ex.toString());
    }

    int idStaffIdentifier = -1;
    int idName = -1;
    int idFirstName = -1;
    int idLastName = -1;
    int idInitials = -1;
    int idUnitOfAssessment_Description = -1;
    int idInstitutionName = -1;
    int idWebAddress = -1;
    int idResearchGroupDescription = -1;
    int idResearcherWebAddress = -1;
    int idResearcherWebAddressType = -1;
    int idResearcherWebAddressExt = -1;
    int idScoreUrl = -1;

    String filter_literal = "(";
    for (String filter : filters) {
        filter_literal += filter + ",";
    }
    filter_literal += ")";

    String[] nextLine;
    try {
        if ((nextLine = reader.readNext()) != null) {
            //Locate indexes            
            //Locate indexes                        
            for (int i = 0; i < nextLine.length; i++) {
                String column_name = nextLine[i];
                if (column_name.equals(FileFormatConversor.CSV_COL_ID))
                    idStaffIdentifier = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_NAME))
                    idName = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_FIRSTNAME))
                    idFirstName = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_LASTNAME))
                    idLastName = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_INITIALS))
                    idInitials = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_SUBJECT))
                    idUnitOfAssessment_Description = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_INSTITUTION_NAME))
                    idInstitutionName = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_INSTITUTION_URL))
                    idWebAddress = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_URL))
                    idResearcherWebAddress = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_TYPE))
                    idResearcherWebAddressType = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_EXT))
                    idResearcherWebAddressExt = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_SCORE_URL))
                    idScoreUrl = i;
            }
        }
    } catch (Exception ex) {
        String error_msg = "Error reading headers of " + input_file.getName();
        Logger.getRootLogger().error(error_msg + " - " + ex.toString());
        if (error_sw != null)
            error_sw.append(error_msg + "\r\n");

        return;
    }

    if (idResearcherWebAddress != -1 && idStaffIdentifier != -1 && idLastName != -1 && idInitials != -1) {
        //if(!test_only_output)
        {
            try {
                String header = "";
                header += "\"" + FileFormatConversor.CSV_COL_ID + "\"" + CSV_SEPARATOR;
                header += "\"" + FileFormatConversor.CSV_COL_LASTNAME + "\"" + CSV_SEPARATOR;
                header += "\"" + FileFormatConversor.CSV_COL_INITIALS + "\"" + CSV_SEPARATOR;
                if (idFirstName != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_INITIALS + "\"" + CSV_SEPARATOR;
                if (idName != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_NAME + "\"" + CSV_SEPARATOR;
                header += "\"" + FileFormatConversor.CSV_COL_EMAIL + "\"" + CSV_SEPARATOR;
                if (idInstitutionName != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_INSTITUTION_NAME + "\"" + CSV_SEPARATOR;
                if (idWebAddress != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_INSTITUTION_URL + "\"" + CSV_SEPARATOR;
                header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_URL + "\"" + CSV_SEPARATOR;
                if (idResearcherWebAddressExt != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_EXT + "\"" + CSV_SEPARATOR;
                if (idResearcherWebAddressType != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_TYPE + "\"" + CSV_SEPARATOR;
                if (idScoreUrl != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_SCORE_URL + "\"" + CSV_SEPARATOR;
                header += "\"" + FileFormatConversor.CSV_COL_SCORE_EMAIL + "\"";
                header += "\r\n";
                FileUtils.write(output_file, header, "UTF-8", false);

                header = "";
                header += "\"" + FileFormatConversor.CSV_COL_ID + "\"" + CSV_SEPARATOR;
                header += "\"" + FileFormatConversor.CSV_COL_LASTNAME + "\"" + CSV_SEPARATOR;
                header += "\"" + FileFormatConversor.CSV_COL_INITIALS + "\"" + CSV_SEPARATOR;
                if (idFirstName != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_INITIALS + "\"" + CSV_SEPARATOR;
                if (idName != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_NAME + "\"" + CSV_SEPARATOR;
                if (idInstitutionName != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_INSTITUTION_NAME + "\"" + CSV_SEPARATOR;
                if (idWebAddress != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_INSTITUTION_URL + "\"" + CSV_SEPARATOR;
                header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_URL + "\"" + CSV_SEPARATOR;
                if (idResearcherWebAddressExt != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_EXT + "\"" + CSV_SEPARATOR;
                if (idResearcherWebAddressType != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_TYPE + "\"" + CSV_SEPARATOR;
                if (idScoreUrl != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_SCORE_URL + "\"";
                header += "\r\n";

                FileUtils.write(notfound_output_file, header, "UTF-8", false);

            } catch (IOException ex) {
                Logger.getLogger("root").error(ex.toString());
                error_sw.append("Error creating output files\r\n");
            }
        }

        try {
            //if(!test_only_output)
            {
                Pattern p1 = Pattern.compile("([a-zA-Z0-9#._-]+)+");

                while ((nextLine = reader.readNext()) != null) {
                    nextLine[idLastName] = nextLine[idLastName].replaceAll("[^a-zA-Z]", " ").toLowerCase();
                    nextLine[idInitials] = nextLine[idInitials].replaceAll("[^a-zA-Z]", " ").toLowerCase();
                    if (idFirstName != -1)
                        nextLine[idFirstName] = nextLine[idFirstName].replaceAll("[^a-zA-Z]", " ")
                                .toLowerCase();
                    if (idName != -1)
                        nextLine[idName] = nextLine[idName].replaceAll("[^a-zA-Z]", " ").toLowerCase();

                    String content = "";
                    String researcher_page_url = nextLine[idResearcherWebAddress];
                    Logger.getLogger("root").info("Go with " + researcher_page_url);
                    if (p1.matcher(researcher_page_url).matches()) {

                        File f = new File(data_dir, researcher_page_url);

                        if (researcher_page_url.endsWith(".doc") || researcher_page_url.endsWith(".docx")) {

                            Logger.getLogger("root")
                                    .error("The document " + researcher_page_url + " could not loaded");
                            error_sw.append("The document " + researcher_page_url + " could not loaded");

                        } else if (researcher_page_url.endsWith(".pdf")) {

                            PDFParser parser = null;
                            PDFTextStripper pdfStripper = null;
                            PDDocument pdDoc = null;
                            COSDocument cosDoc = null;

                            try {
                                parser = new PDFParser(new FileInputStream(f));
                            } catch (IOException e) {
                                Logger.getLogger("root").error(e.toString());
                                error_sw.append("Unable to open PDF called " + researcher_page_url);
                            }

                            if (parser != null) {
                                try {
                                    parser.parse();
                                    cosDoc = parser.getDocument();
                                    pdfStripper = new PDFTextStripper();
                                    pdDoc = new PDDocument(cosDoc);
                                    pdfStripper.setStartPage(1);
                                    pdfStripper.setEndPage(2);
                                    content = pdfStripper.getText(pdDoc);
                                } catch (Exception e) {
                                    Logger.getLogger("root").error(e.toString());
                                    error_sw.append("An exception occured in parsing the PDF Document.");
                                } finally {
                                    try {
                                        if (cosDoc != null)
                                            cosDoc.close();
                                        if (pdDoc != null)
                                            pdDoc.close();
                                    } catch (Exception e) {
                                        Logger.getLogger("root").error(e.toString());
                                    }
                                }
                            }
                        }

                    } else {

                        try {
                            Logger.getRootLogger().info("Reading " + researcher_page_url);

                            File temp;

                            temp = File.createTempFile("temp-file-name", ".tmp");
                            URL fetched_url = Downloader.fetchURL(researcher_page_url);
                            FileUtils.copyURLToFile(fetched_url, temp);
                            long sizeInBytes = temp.length();
                            long sizeInMb = sizeInBytes / (1024 * 1024);
                            if (sizeInMb > 100) {
                                content = "";
                            } else {
                                content = FileUtils.readFileToString(temp);
                                temp.delete();
                            }

                        } catch (Exception ex) {
                            Logger.getLogger("root").error("" + researcher_page_url + " could not loaded", ex);
                            error_sw.append("" + researcher_page_url + " could not loaded");
                            content = "";
                        } catch (java.lang.OutOfMemoryError ex2) {
                            Logger.getLogger("root").error(
                                    researcher_page_url + " could not loaded (Jsoup OutOfMemoryError)", ex2);
                            error_sw.append("" + researcher_page_url + " could not loaded");
                            content = "";
                        }

                    }

                    if (!content.equals("")) {

                        //final String RE_MAIL = "([\\w\\-]([\\.\\w])+[\\w]+@([\\w\\-]+\\.)+[A-Za-z]{2,4})";
                        final String RE_MAIL = "([\\w\\-]([\\.\\w]){1,16}[\\w]{1,16}@([\\w\\-]{1,16}\\.){1,16}[A-Za-z]{2,4})";
                        Pattern p = Pattern.compile(RE_MAIL);
                        Matcher m = p.matcher(content);
                        List<String> emails = new ArrayList<String>();
                        while (m.find()) {
                            String email = m.group(1);

                            if (!emails.contains(email)) {
                                // Apply filter
                                boolean pass = true;
                                if (filters.size() > 0) {
                                    pass = false;
                                    for (String filter : filters) {

                                        String filter2 = filter.replace("*", ".*?");
                                        Pattern pattern = Pattern.compile(filter2);
                                        if (pattern.matcher(email).matches()) {
                                            pass = true;
                                            break;
                                        } else {

                                        }
                                    }
                                }

                                if (pass) {
                                    Logger.getRootLogger().info(researcher_page_url + " => " + email
                                            + " PASS FILTER! " + filter_literal);
                                    emails.add(email);
                                } else {
                                    Logger.getRootLogger().info(researcher_page_url + " => " + email
                                            + " REFUSE BY FILTER! " + filter_literal);
                                }
                            }
                        }

                        if (emails.size() < MAX_MAIL_PER_PAGE) {
                            for (String email : emails) {

                                String score_email = "";
                                String lastname = nextLine[idLastName];
                                if (lastname.length() > 5)
                                    lastname = lastname.substring(0, 6);

                                if (email.toLowerCase().contains(lastname)) {
                                    score_email = "A";
                                } else {
                                    int temp_id = idFirstName;
                                    if (temp_id == -1)
                                        temp_id = idInitials;

                                    if (!nextLine[idInitials].trim().equals("")) {

                                        String firstname = nextLine[temp_id].split(" ")[0];
                                        if (firstname.length() > 5)
                                            firstname = firstname.substring(0, 5);
                                        if (firstname.length() > 1) {
                                            if (email.toLowerCase().contains(firstname)) {
                                                score_email = "A";
                                            }
                                        }
                                    }

                                    if (score_email.equals("")) {
                                        String initials = "";

                                        String[] arr = nextLine[temp_id].split(" ");
                                        for (int i = 0; i < arr.length; i++) {
                                            if (arr[i].length() > 0)
                                                initials += arr[i].charAt(0);
                                        }
                                        initials += nextLine[idLastName].charAt(0);

                                        if (email.toLowerCase().contains(initials)) {
                                            score_email = "B";
                                        } else {
                                            score_email = "Z";
                                        }
                                    }

                                }

                                String result = "";
                                result += "\"" + nextLine[idStaffIdentifier] + "\"" + CSV_SEPARATOR;
                                result += "\"" + nextLine[idLastName] + "\"" + CSV_SEPARATOR;
                                result += "\"" + nextLine[idInitials] + "\"" + CSV_SEPARATOR;
                                if (idFirstName != -1)
                                    result += "\"" + nextLine[idFirstName] + "\"" + CSV_SEPARATOR;
                                if (idName != -1)
                                    result += "\"" + nextLine[idName] + "\"" + CSV_SEPARATOR;
                                result += "\"" + email + "\"" + CSV_SEPARATOR;
                                if (idInstitutionName != -1)
                                    result += "\"" + nextLine[idInstitutionName] + "\"" + CSV_SEPARATOR;
                                if (idWebAddress != -1)
                                    result += "\"" + nextLine[idWebAddress] + "\"" + CSV_SEPARATOR;
                                result += "\"" + nextLine[idResearcherWebAddress] + "\"" + CSV_SEPARATOR;
                                if (idResearcherWebAddressExt != -1)
                                    result += "\"" + nextLine[idResearcherWebAddressExt] + "\"" + CSV_SEPARATOR;
                                if (idResearcherWebAddressType != -1)
                                    result += "\"" + nextLine[idResearcherWebAddressType] + "\""
                                            + CSV_SEPARATOR;
                                if (idScoreUrl != -1)
                                    result += "\"" + nextLine[idScoreUrl] + "\"" + CSV_SEPARATOR;
                                result += "\"" + score_email + "\"";

                                result += "\r\n";

                                try {
                                    FileUtils.write(output_file, result, "UTF-8", true);
                                } catch (IOException ex) {
                                    Logger.getLogger("root").error(ex.toString());
                                }
                            }
                        } else {
                            content = "";
                        }

                        if (emails.size() == 0)
                            content = "";
                    }

                    if (content == "") {

                        String result = "";
                        result += "\"" + nextLine[idStaffIdentifier] + "\"" + CSV_SEPARATOR;
                        result += "\"" + nextLine[idLastName] + "\"" + CSV_SEPARATOR;
                        result += "\"" + nextLine[idInitials] + "\"" + CSV_SEPARATOR;
                        if (idFirstName != -1)
                            result += "\"" + nextLine[idFirstName] + "\"" + CSV_SEPARATOR;
                        if (idName != -1)
                            result += "\"" + nextLine[idName] + "\"" + CSV_SEPARATOR;
                        if (idInstitutionName != -1)
                            result += "\"" + nextLine[idInstitutionName] + "\"" + CSV_SEPARATOR;
                        if (idWebAddress != -1)
                            result += "\"" + nextLine[idWebAddress] + "\"" + CSV_SEPARATOR;
                        result += "\"" + nextLine[idResearcherWebAddress] + "\"" + CSV_SEPARATOR;
                        if (idResearcherWebAddressExt != -1)
                            result += "\"" + nextLine[idResearcherWebAddressExt] + "\"" + CSV_SEPARATOR;
                        if (idResearcherWebAddressType != -1)
                            result += "\"" + nextLine[idResearcherWebAddressType] + "\"" + CSV_SEPARATOR;
                        if (idScoreUrl != -1)
                            result += "\"" + nextLine[idScoreUrl] + "\"";

                        result += "\r\n";

                        try {
                            FileUtils.write(notfound_output_file, result, "UTF-8", true);
                        } catch (IOException ex) {
                            Logger.getLogger("root").error(ex.toString());
                        }
                    }
                }

                reader.close();
            }

            Logger.getLogger("root").info("Applying deduplication algoritm - Counting duplications");

            boolean finish = false;
            String alternate_filename_1 = "file1";
            String alternate_filename_2 = "file2";

            File alternate_file_s = new File(output_file.getParentFile(), alternate_filename_1);
            File alternate_file_d = new File(output_file.getParentFile(), alternate_filename_2);

            FileUtils.copyFile(output_file, alternate_file_s);

            //FileUtils.write(output_file_wor_notfound, "", "UTF-8", false);
            FileUtils.write(norepeat_output_file, "", "UTF-8", false);

            while (!finish) {
                reader = null;
                try {
                    reader = new CSVReader(new FileReader(alternate_file_s), CSV_SEPARATOR);
                } catch (FileNotFoundException ex) {
                    Logger.getRootLogger()
                            .error("Error reading " + input_file.getName() + " - " + ex.toString());
                }

                HashMap<String, Integer> count_dictionary = new HashMap<String, Integer>();
                int idEmail = 3;
                if (idFirstName != -1)
                    idEmail++;
                if (idName != -1)
                    idEmail++;

                try {
                    FileUtils.write(alternate_file_d, "", "UTF-8", false);
                } catch (IOException ex) {
                    Logger.getLogger("root").error(ex.toString());
                }
                finish = true;
                while ((nextLine = reader.readNext()) != null) {
                    Integer count = 1;
                    if (count_dictionary.containsKey(nextLine[idEmail].toString()))
                        count = count_dictionary.get(nextLine[idEmail].toString());
                    else {
                        if (count_dictionary.size() < max_in_mem) {
                            count_dictionary.put(nextLine[idEmail].toString(), count + 1);
                        } else {
                            try {
                                for (int i = 0; i < nextLine.length; i++)
                                    nextLine[i] = "\"" + nextLine[i] + "\"";
                                FileUtils.write(alternate_file_d,
                                        StringUtil.join(Arrays.asList(nextLine), String.valueOf(CSV_SEPARATOR))
                                                + "\r\n",
                                        "UTF-8", true);
                                finish = false;
                            } catch (IOException ex) {
                                Logger.getLogger("root").error(ex.toString());
                            }
                        }
                    }
                }

                reader.close();

                Logger.getLogger("root").info("Applying deduplication algoritm - Removing duplications");

                reader = null;
                try {
                    reader = new CSVReader(new FileReader(alternate_file_s), CSV_SEPARATOR);
                } catch (FileNotFoundException ex) {
                    Logger.getRootLogger()
                            .error("Error reading " + input_file.getName() + " - " + ex.toString());
                }

                String previous_id = "%previous%";
                String previous_email = "%previous_email%";
                List<String[]> cache = new ArrayList<String[]>();

                while ((nextLine = reader.readNext()) != null) {
                    String id = nextLine[idStaffIdentifier].toString();

                    if (previous_id.equals(id)) {
                        cache.add(nextLine);
                        previous_id = id;
                    } else {
                        //Process
                        String[] winner_line = null;
                        String max_score = "Z";
                        for (String[] act_line : cache) {
                            String act_score = "Z";
                            try {
                                act_score = act_line[act_line.length - 1];
                            } catch (Exception ex) {
                            }

                            String email = act_line[idEmail].toString();

                            if (count_dictionary.containsKey(email) && count_dictionary.get(email) > 0) {
                                if (max_score.compareTo(act_score) > 0 && !act_score.equals("")) {
                                    winner_line = act_line;
                                    max_score = act_score;
                                }

                                count_dictionary.put(email, 0);
                            }
                        }

                        if (winner_line != null) {
                            try {
                                for (int i = 0; i < winner_line.length; i++)
                                    winner_line[i] = "\"" + winner_line[i] + "\"";
                                FileUtils.write(norepeat_output_file,
                                        StringUtil.join(Arrays.asList(winner_line),
                                                String.valueOf(CSV_SEPARATOR)) + "\r\n",
                                        "UTF-8", true);
                            } catch (IOException ex) {
                                Logger.getLogger("root").error(ex.toString());
                            }

                        } else {
                            //                            try {
                            //                                FileUtils.write(output_file_wor_notfound, StringUtil.join(Arrays.asList(winner_line), String.valueOf(CSV_SEPARATOR)) + "\r\n", "UTF-8", true);
                            //                            } catch (IOException ex) {
                            //                                Logger.getLogger("root").error(ex.toString());
                            //                            }
                        }

                        cache.clear();

                        cache.add(nextLine);

                        previous_id = id;
                    }

                }

                //Process
                if (cache.size() > 0) {
                    String[] winner_line = null;
                    String max_score = "Z";
                    for (String[] act_line : cache) {
                        String act_score = "Z";
                        try {
                            act_score = (act_line[act_line.length - 1]);
                        } catch (Exception ex) {
                        }
                        String email = act_line[idEmail];
                        if (count_dictionary.containsKey(email) && count_dictionary.get(email) > 0) {
                            if (max_score.compareTo(act_score) > 0 && !act_score.equals("")) {
                                winner_line = act_line;
                                max_score = act_score;
                            }

                            count_dictionary.put(email, 0);
                        }
                    }

                    if (winner_line != null) {

                        try {
                            for (int i = 0; i < winner_line.length; i++)
                                winner_line[i] = "\"" + winner_line[i] + "\"";
                            FileUtils.write(norepeat_output_file,
                                    StringUtil.join(Arrays.asList(winner_line), String.valueOf(CSV_SEPARATOR))
                                            + "\r\n",
                                    "UTF-8", true);
                        } catch (IOException ex) {
                            Logger.getLogger("root").error(ex.toString());
                        }
                    } else {
                        //                        try {
                        //                            FileUtils.write(output_file_wor_notfound, StringUtil.join(Arrays.asList(winner_line), String.valueOf(CSV_SEPARATOR)) + "\r\n", "UTF-8", true);
                        //                        } catch (IOException ex) {
                        //                            Logger.getLogger("root").error(ex.toString());
                        //                        }
                    }
                }

                reader.close();

                //
                if (!finish) {
                    FileUtils.copyFile(alternate_file_d, alternate_file_s);
                    alternate_file_s = new File(output_file.getParentFile(), alternate_filename_1);
                    alternate_file_d = new File(output_file.getParentFile(), alternate_filename_2);
                }
            }

            FileUtils.forceDelete(alternate_file_s);
            FileUtils.forceDelete(alternate_file_d);

            Logger.getLogger("root").info("Applying deduplication algoritm - Finish");

        } catch (Exception ex) {
            String error_msg = "Error extracting emails from extractor " + input_file.getName();
            Logger.getRootLogger().error(error_msg + " - " + ex.toString());
            if (error_sw != null)
                error_sw.append(error_msg + "\r\n");
            return;
        }
    }
}