Example usage for org.apache.pdfbox.pdmodel PDDocument close

List of usage examples for org.apache.pdfbox.pdmodel PDDocument close

Introduction

In this page you can find the example usage for org.apache.pdfbox.pdmodel PDDocument close.

Prototype

@Override
public void close() throws IOException 

Source Link

Document

This will close the underlying COSDocument object.

Usage

From source file:edu.uwm.jiaoduan.lab.ExtractTextByArea.java

License:Apache License

/**
 * This will print the documents text in a certain area.
 *
 * @param args The command line arguments.
 *
 * @throws Exception If there is an error parsing the document.
 *//* www.  j a v  a2s. com*/
public static void main(String[] args) throws Exception {
    args = new String[] { "test.pdf" };
    if (args.length != 1) {
        usage();
    } else {
        PDDocument document = null;
        try {
            document = PDDocument.load(args[0]);
            if (document.isEncrypted()) {
                try {
                    document.decrypt("");
                } catch (InvalidPasswordException e) {
                    System.err.println("Error: Document is encrypted with a password.");
                    System.exit(1);
                }
            }
            PDFTextStripperByArea stripper = new PDFTextStripperByArea();
            stripper.setSortByPosition(true);
            //Rectangle rect = new Rectangle( 99,219,80,15 );
            //convert xfdf coordinate to rectangle

            Rectangle2D.Double rect = new Rectangle2D.Double();

            List allPages = document.getDocumentCatalog().getAllPages();
            PDPage firstPage = (PDPage) allPages.get(0);

            double pageHeight = firstPage.getMediaBox().getHeight();

            //125.824906,672.39502,390.577109,694.679017
            double[] coords = new double[] { 58.50615, 500.847504, 302.919073, 552.419312 };
            //rect.height = 694.679017 - 672.39502;
            rect.height = coords[3] - coords[1];
            //rect.width = 390.577109 - 125.824906;
            rect.width = coords[2] - coords[0];
            ;

            //rect.x = 125.824906;
            rect.x = coords[0];
            //rect.y = pageHeight -672.39502 - rect.height; 
            rect.y = pageHeight - coords[1] - rect.height;
            System.out.println(rect);

            stripper.addRegion("class1", rect);
            stripper.extractRegions(firstPage);

            System.out.println("Text in the area:" + rect);
            System.out.println(stripper.getTextForRegion("class1"));

        } finally {
            if (document != null) {
                document.close();
            }
        }
    }
}

From source file:edworld.pdfreader4humans.PDFReader.java

License:Apache License

/**
 * Class responsible for reading PDF contents in the same order a human would read them.
 * //ww w .  j  a  v a 2s.  c om
 * @param url
 *            the PDF's location
 * @param componentLocator
 *            an instance of a PDFComponentLocator subclass such as MainPDFComponentLocator
 * @param boxDetector
 *            an instance of a BoxDetector subclass such as MainBoxDetector
 * @param marginDetector
 *            an instance of a MarginDetector subclass such as MainMarginDetector
 * @throws IOException
 */
public PDFReader(URL url, PDFComponentLocator componentLocator, BoxDetector boxDetector,
        MarginDetector marginDetector) throws IOException {
    this.url = url;
    PDDocument doc = PDDocument.load(url);
    try {
        readAllPages(doc, componentLocator, boxDetector, marginDetector);
    } finally {
        doc.close();
    }
}

From source file:edworld.pdfreader4humans.PDFReader.java

License:Apache License

private PDRectangle getPageCropBox(int pageIndex) throws IOException {
    PDDocument doc = PDDocument.load(url);
    try {/*  ww  w  .j a v  a2s  .c  om*/
        return ((PDPage) doc.getDocumentCatalog().getAllPages().get(pageIndex - 1)).findCropBox();
    } finally {
        doc.close();
    }
}

From source file:es.jscan.Pantallas.PantallaPrincipal.java

License:Apache License

public void importarDePdf(final File archivo) {
    final PantallaBarra pantbarra = new PantallaBarra(PantallaPrincipal.this, false);

    new Thread() {
        @Override//from   w  ww .  ja  v a  2 s.  co  m
        public void run() {
            org.apache.pdfbox.pdmodel.PDDocument documento = null;
            try {
                documento = org.apache.pdfbox.pdmodel.PDDocument.load(archivo);
            } catch (IOException ex) {
                Utilidades.escribeLog("Error -importarDePdf- al importar del archivo PDF "
                        + archivo.getAbsolutePath() + " - " + ex.getMessage());
                return;
            }
            int startpage = 1;
            int endpage = Integer.MAX_VALUE;
            List pages = documento.getDocumentCatalog().getAllPages();
            String titulo = pantbarra.getTitle();
            pantbarra.barra.setMinimum(startpage);
            pantbarra.barra.setMaximum(pages.size());
            pantbarra.barra.setValue(1);
            for (int i = startpage - 1; i < endpage && i < pages.size(); i++) {
                PDPage page = (PDPage) pages.get(i);
                java.awt.image.BufferedImage imagenpdf = null;
                try {
                    imagenpdf = page.convertToImage(BufferedImage.TYPE_BYTE_GRAY, 200);
                } catch (IOException ex) {
                    Utilidades.escribeLog("Error -importarDePdf- al convertir Imagen de PDF a Escala de Grises "
                            + " - " + ex.getMessage());
                }
                //  Utilidades.escribeLog("Numero bits por pixel :"+imagenpdf.getColorModel().getPixelSize());

                //   imagenpdf = pantutil.convertirAGris((java.awt.image.BufferedImage) imagenpdf);
                guardarImagen(imagenpdf, contimagen);
                cargarMiniaturas(contimagen);
                JScrollBar bar = panelVisorMini.getVerticalScrollBar();
                bar.setValue(bar.getMaximum());
                pantbarra.barra.setValue(i);
                pantbarra.setTitle(titulo + "     " + (i + 1) + " de " + pages.size());
                contimagen++;
                doLayout();
                if (pantbarra.PARAR) {

                    break;
                }
            }

            PDPage page = (PDPage) pages.get(pages.size() - 1);
            java.awt.image.BufferedImage imagenpdf = null;
            try {
                imagenpdf = page.convertToImage();
                imagenpdf = pantutil.convertirAGris((java.awt.image.BufferedImage) imagenpdf);
                documento.close();
            } catch (IOException ex) {
                Utilidades.escribeLog("Error -importarDePdf- al convertir Imagen de PDF a Escala de Grises "
                        + " - " + ex.getMessage());
            }
            JScrollBar bar = panelVisorMini.getVerticalScrollBar();
            if (pantbarra.PARAR) {
                pintarImagenPorIndice(0);
                ponerBordeBoton(0);
                bar.setValue(bar.getMinimum());
            } else {
                pintarImagen(imagenpdf);
                ponerBordeBoton(contimagen - 1);
                bar.setValue(bar.getMaximum());
            }
            pantbarra.dispose();
        }
    }.start();
    pantbarra.setModalityType(Dialog.ModalityType.APPLICATION_MODAL);
    pantbarra.setVisible(false);
    pantbarra.setVisible(true);
}

From source file:es.ucm.pdfmeta.Main.java

License:Open Source License

public static void main(String[] args) {
    PDDocument doc = null;
    try {//w  w w  . j a  v  a2  s  .  com
        if (args.length > 0) {
            UIManager.setLookAndFeel("com.sun.java.swing.plaf.nimbus.NimbusLookAndFeel");
            for (String arg : args) {
                File f = new File(arg);
                doc = PDDocument.load(f);
                MetadataModel<String> m = buildModelFromDocument(doc);
                Controller c = new Controller(m);

                MainDialog md = new MainDialog(arg, m, c);
                if (md.runDialog()) {
                    modifyDocFromModel(doc, m);
                    doc.save(arg);
                }
            }
        } else {
            System.err.println("error: no input file(s) specified");
        }
    } catch (IOException ex) {
        JOptionPane.showMessageDialog(null, ex.getMessage(), "Error", JOptionPane.ERROR_MESSAGE);
    } catch (ClassNotFoundException | InstantiationException | IllegalAccessException
            | UnsupportedLookAndFeelException ex) {
        Logger.getLogger(Main.class.getName()).log(Level.SEVERE, null, ex);
    } finally {
        if (doc != null)
            try {
                doc.close();
            } catch (IOException ex) {
            }
    }

}

From source file:es.udc.fic.medregatas.util.PDFUtils.java

public static void printPDF(String docName) throws IOException, COSVisitorException {

    createPdfDirectoryIfNotExists();/*from  w  w w.j a  va2 s  .  c  om*/

    // Create a document and add a page to it
    PDDocument document = new PDDocument();
    PDPage page = new PDPage();
    document.addPage(page);

    // Create a new font object selecting one of the PDF base fonts
    PDFont font = PDType1Font.HELVETICA_BOLD;

    // Start a new content stream which will "hold" the to be created content
    PDPageContentStream contentStream = new PDPageContentStream(document, page);

    // Define a text content stream using the selected font, moving the 
    // cursor and drawing the text "Hello World"
    contentStream.beginText();

    contentStream.setFont(font, 12);
    contentStream.moveTextPositionByAmount(100, 700);
    contentStream.drawString("Hello World");
    contentStream.endText();

    // Make sure that the content stream is closed:
    contentStream.close();

    // Save the results and ensure that the document is properly closed:
    document.save(PDFS_FOLDER + "/" + docName);
    document.close();
}

From source file:eu.europa.ejusticeportal.dss.controller.signature.PdfUtils.java

License:EUPL

private static void closeQuietly(PDDocument doc) {
    if (doc != null) {
        try {/*  ww  w .  ja v a  2  s  . c  o m*/
            doc.close();
        } catch (IOException e) {
            LOGGER.error("Error closing the document.", e);
        }
    }

}

From source file:eu.europa.esig.dss.pades.InfiniteLoopDSS621Test.java

License:Open Source License

/**
 * These signatures are invalid because of non ordered signed attributes
 *///from ww  w  . j  av a2s. co  m
@Test
public void manualTest() throws Exception {

    File pdfFile = new File(FILE_PATH);

    FileInputStream fis = new FileInputStream(pdfFile);
    byte[] pdfBytes = IOUtils.toByteArray(fis);

    PDDocument document = PDDocument.load(pdfFile);
    List<PDSignature> signatures = document.getSignatureDictionaries();
    assertEquals(6, signatures.size());

    int idx = 0;
    for (PDSignature pdSignature : signatures) {
        byte[] contents = pdSignature.getContents(pdfBytes);
        byte[] signedContent = pdSignature.getSignedContent(pdfBytes);

        logger.info("Byte range : " + Arrays.toString(pdSignature.getByteRange()));

        IOUtils.write(contents, new FileOutputStream("target/sig" + (idx++) + ".p7s"));

        ASN1InputStream asn1sInput = new ASN1InputStream(contents);
        ASN1Sequence asn1Seq = (ASN1Sequence) asn1sInput.readObject();

        logger.info("SEQ : " + asn1Seq.toString());

        ASN1ObjectIdentifier oid = ASN1ObjectIdentifier.getInstance(asn1Seq.getObjectAt(0));
        assertEquals(PKCSObjectIdentifiers.signedData, oid);

        SignedData signedData = SignedData
                .getInstance(DERTaggedObject.getInstance(asn1Seq.getObjectAt(1)).getObject());

        ASN1Set digestAlgorithmSet = signedData.getDigestAlgorithms();
        ASN1ObjectIdentifier oidDigestAlgo = ASN1ObjectIdentifier
                .getInstance(ASN1Sequence.getInstance(digestAlgorithmSet.getObjectAt(0)).getObjectAt(0));
        DigestAlgorithm digestAlgorithm = DigestAlgorithm.forOID(oidDigestAlgo.getId());
        logger.info("DIGEST ALGO : " + digestAlgorithm);

        ContentInfo encapContentInfo = signedData.getEncapContentInfo();
        ASN1ObjectIdentifier contentTypeOID = encapContentInfo.getContentType();
        logger.info("ENCAPSULATED CONTENT INFO TYPE : " + contentTypeOID);

        if (!PKCSObjectIdentifiers.id_ct_TSTInfo.equals(contentTypeOID)) { // If not timestamp
            assertEquals(PKCSObjectIdentifiers.data, contentTypeOID);

            ASN1Encodable content = encapContentInfo.getContent();
            logger.info("ENCAPSULATED CONTENT INFO CONTENT : " + content);
            assertNull(content);

            List<X509Certificate> certificates = extractCertificates(signedData);

            ASN1Set signerInfosAsn1 = signedData.getSignerInfos();
            logger.info("SIGNER INFO ASN1 : " + signerInfosAsn1.toString());
            SignerInfo signedInfo = SignerInfo
                    .getInstance(ASN1Sequence.getInstance(signerInfosAsn1.getObjectAt(0)));

            ASN1Set authenticatedAttributeSet = signedInfo.getAuthenticatedAttributes();
            logger.info("AUTHENTICATED ATTR : " + authenticatedAttributeSet);

            Attribute attributeDigest = null;
            for (int i = 0; i < authenticatedAttributeSet.size(); i++) {
                Attribute attribute = Attribute.getInstance(authenticatedAttributeSet.getObjectAt(i));
                if (PKCSObjectIdentifiers.pkcs_9_at_messageDigest.equals(attribute.getAttrType())) {
                    attributeDigest = attribute;
                    break;
                }
            }

            assertNotNull(attributeDigest);

            ASN1OctetString asn1ObjString = ASN1OctetString
                    .getInstance(attributeDigest.getAttrValues().getObjectAt(0));
            String embeddedDigest = Base64.encodeBase64String(asn1ObjString.getOctets());
            logger.info("MESSAGE DIGEST : " + embeddedDigest);

            byte[] digestSignedContent = DSSUtils.digest(digestAlgorithm, signedContent);
            String computedDigestSignedContentEncodeBase64 = Base64.encodeBase64String(digestSignedContent);
            logger.info("COMPUTED DIGEST SIGNED CONTENT BASE64 : " + computedDigestSignedContentEncodeBase64);
            assertEquals(embeddedDigest, computedDigestSignedContentEncodeBase64);

            SignerIdentifier sid = signedInfo.getSID();
            logger.info("SIGNER IDENTIFIER : " + sid.getId());

            IssuerAndSerialNumber issuerAndSerialNumber = IssuerAndSerialNumber
                    .getInstance(signedInfo.getSID());
            ASN1Integer signerSerialNumber = issuerAndSerialNumber.getSerialNumber();
            logger.info("ISSUER AND SN : " + issuerAndSerialNumber.getName() + " " + signerSerialNumber);

            BigInteger serial = issuerAndSerialNumber.getSerialNumber().getValue();
            X509Certificate signerCertificate = null;
            for (X509Certificate x509Certificate : certificates) {
                if (serial.equals(x509Certificate.getSerialNumber())) {
                    signerCertificate = x509Certificate;
                }
            }
            assertNotNull(signerCertificate);

            String algorithm = signerCertificate.getPublicKey().getAlgorithm();
            EncryptionAlgorithm encryptionAlgorithm = EncryptionAlgorithm.forName(algorithm);

            ASN1OctetString encryptedInfoOctedString = signedInfo.getEncryptedDigest();
            String signatureValue = Hex.toHexString(encryptedInfoOctedString.getOctets());

            logger.info("SIGNATURE VALUE : " + signatureValue);

            Cipher cipher = Cipher.getInstance(encryptionAlgorithm.getName());
            cipher.init(Cipher.DECRYPT_MODE, signerCertificate);
            byte[] decrypted = cipher.doFinal(encryptedInfoOctedString.getOctets());

            ASN1InputStream inputDecrypted = new ASN1InputStream(decrypted);

            ASN1Sequence seqDecrypt = (ASN1Sequence) inputDecrypted.readObject();
            logger.info("DECRYPTED : " + seqDecrypt);

            DigestInfo digestInfo = new DigestInfo(seqDecrypt);
            assertEquals(oidDigestAlgo, digestInfo.getAlgorithmId().getAlgorithm());

            String decryptedDigestEncodeBase64 = Base64.encodeBase64String(digestInfo.getDigest());
            logger.info("DECRYPTED BASE64 : " + decryptedDigestEncodeBase64);

            byte[] encoded = authenticatedAttributeSet.getEncoded();
            byte[] digest = DSSUtils.digest(digestAlgorithm, encoded);
            String computedDigestFromSignatureEncodeBase64 = Base64.encodeBase64String(digest);
            logger.info("COMPUTED DIGEST FROM SIGNATURE BASE64 : " + computedDigestFromSignatureEncodeBase64);

            assertEquals(decryptedDigestEncodeBase64, computedDigestFromSignatureEncodeBase64);

            IOUtils.closeQuietly(inputDecrypted);

        }

        IOUtils.closeQuietly(asn1sInput);
    }

    IOUtils.closeQuietly(fis);
    document.close();
}

From source file:eu.europa.esig.dss.pades.signature.PAdESLevelBTest.java

License:Open Source License

@Override
protected void onDocumentSigned(byte[] byteArray) {

    try {//from  w w w .  ja v  a 2 s .  c o  m
        InputStream inputStream = new ByteArrayInputStream(byteArray);

        PDDocument document = PDDocument.load(inputStream);
        List<PDSignature> signatures = document.getSignatureDictionaries();
        assertEquals(1, signatures.size());

        for (PDSignature pdSignature : signatures) {
            byte[] contents = pdSignature.getContents(byteArray);
            byte[] signedContent = pdSignature.getSignedContent(byteArray);

            logger.info("Byte range : " + Arrays.toString(pdSignature.getByteRange()));

            // IOUtils.write(contents, new FileOutputStream("sig.p7s"));

            ASN1InputStream asn1sInput = new ASN1InputStream(contents);
            ASN1Sequence asn1Seq = (ASN1Sequence) asn1sInput.readObject();

            logger.info("SEQ : " + asn1Seq.toString());

            ASN1ObjectIdentifier oid = ASN1ObjectIdentifier.getInstance(asn1Seq.getObjectAt(0));
            assertEquals(PKCSObjectIdentifiers.signedData, oid);

            SignedData signedData = SignedData
                    .getInstance(DERTaggedObject.getInstance(asn1Seq.getObjectAt(1)).getObject());

            ASN1Set digestAlgorithmSet = signedData.getDigestAlgorithms();
            ASN1ObjectIdentifier oidDigestAlgo = ASN1ObjectIdentifier
                    .getInstance(ASN1Sequence.getInstance(digestAlgorithmSet.getObjectAt(0)).getObjectAt(0));
            DigestAlgorithm digestAlgorithm = DigestAlgorithm.forOID(oidDigestAlgo.getId());
            logger.info("DIGEST ALGO : " + digestAlgorithm);

            ContentInfo encapContentInfo = signedData.getEncapContentInfo();
            ASN1ObjectIdentifier contentTypeOID = encapContentInfo.getContentType();
            logger.info("ENCAPSULATED CONTENT INFO TYPE : " + contentTypeOID);
            assertEquals(PKCSObjectIdentifiers.data, contentTypeOID);

            ASN1Encodable content = encapContentInfo.getContent();
            logger.info("ENCAPSULATED CONTENT INFO CONTENT : " + content);
            assertNull(content);

            List<X509Certificate> certificates = extractCertificates(signedData);

            ASN1Set signerInfosAsn1 = signedData.getSignerInfos();
            logger.info("SIGNER INFO ASN1 : " + signerInfosAsn1.toString());
            SignerInfo signedInfo = SignerInfo
                    .getInstance(ASN1Sequence.getInstance(signerInfosAsn1.getObjectAt(0)));

            ASN1Set authenticatedAttributeSet = signedInfo.getAuthenticatedAttributes();
            logger.info("AUTHENTICATED ATTR : " + authenticatedAttributeSet);

            List<ASN1ObjectIdentifier> attributeOids = new ArrayList<ASN1ObjectIdentifier>();
            int previousSize = 0;
            for (int i = 0; i < authenticatedAttributeSet.size(); i++) {
                Attribute attribute = Attribute.getInstance(authenticatedAttributeSet.getObjectAt(i));
                ASN1ObjectIdentifier attrTypeOid = attribute.getAttrType();
                attributeOids.add(attrTypeOid);
                int size = attrTypeOid.getEncoded().length + attribute.getEncoded().length;
                assertTrue(size >= previousSize);

                previousSize = size;
            }
            logger.info("List of OID for Auth Attrb : " + attributeOids);

            Attribute attributeDigest = Attribute.getInstance(authenticatedAttributeSet.getObjectAt(1));
            assertEquals(PKCSObjectIdentifiers.pkcs_9_at_messageDigest, attributeDigest.getAttrType());

            ASN1OctetString asn1ObjString = ASN1OctetString
                    .getInstance(attributeDigest.getAttrValues().getObjectAt(0));
            String embeddedDigest = Base64.encodeBase64String(asn1ObjString.getOctets());
            logger.info("MESSAGE DIGEST : " + embeddedDigest);

            byte[] digestSignedContent = DSSUtils.digest(digestAlgorithm, signedContent);
            String computedDigestSignedContentEncodeBase64 = Base64.encodeBase64String(digestSignedContent);
            logger.info("COMPUTED DIGEST SIGNED CONTENT BASE64 : " + computedDigestSignedContentEncodeBase64);
            assertEquals(embeddedDigest, computedDigestSignedContentEncodeBase64);

            SignerIdentifier sid = signedInfo.getSID();
            logger.info("SIGNER IDENTIFIER : " + sid.getId());

            IssuerAndSerialNumber issuerAndSerialNumber = IssuerAndSerialNumber
                    .getInstance(signedInfo.getSID());
            ASN1Integer signerSerialNumber = issuerAndSerialNumber.getSerialNumber();
            logger.info("ISSUER AND SN : " + issuerAndSerialNumber.getName() + " " + signerSerialNumber);

            BigInteger serial = issuerAndSerialNumber.getSerialNumber().getValue();
            X509Certificate signerCertificate = null;
            for (X509Certificate x509Certificate : certificates) {
                if (serial.equals(x509Certificate.getSerialNumber())) {
                    signerCertificate = x509Certificate;
                }
            }
            assertNotNull(signerCertificate);

            String algorithm = signerCertificate.getPublicKey().getAlgorithm();
            EncryptionAlgorithm encryptionAlgorithm = EncryptionAlgorithm.forName(algorithm);

            ASN1OctetString encryptedInfoOctedString = signedInfo.getEncryptedDigest();
            String signatureValue = Hex.toHexString(encryptedInfoOctedString.getOctets());

            logger.info("SIGNATURE VALUE : " + signatureValue);

            Cipher cipher = Cipher.getInstance(encryptionAlgorithm.getName());
            cipher.init(Cipher.DECRYPT_MODE, signerCertificate);
            byte[] decrypted = cipher.doFinal(encryptedInfoOctedString.getOctets());

            ASN1InputStream inputDecrypted = new ASN1InputStream(decrypted);

            ASN1Sequence seqDecrypt = (ASN1Sequence) inputDecrypted.readObject();
            logger.info("DECRYPTED : " + seqDecrypt);

            DigestInfo digestInfo = new DigestInfo(seqDecrypt);
            assertEquals(oidDigestAlgo, digestInfo.getAlgorithmId().getAlgorithm());

            String decryptedDigestEncodeBase64 = Base64.encodeBase64String(digestInfo.getDigest());
            logger.info("DECRYPTED BASE64 : " + decryptedDigestEncodeBase64);

            byte[] encoded = authenticatedAttributeSet.getEncoded();
            byte[] digest = DSSUtils.digest(digestAlgorithm, encoded);
            String computedDigestFromSignatureEncodeBase64 = Base64.encodeBase64String(digest);
            logger.info("COMPUTED DIGEST FROM SIGNATURE BASE64 : " + computedDigestFromSignatureEncodeBase64);

            assertEquals(decryptedDigestEncodeBase64, computedDigestFromSignatureEncodeBase64);

            IOUtils.closeQuietly(inputDecrypted);
            IOUtils.closeQuietly(asn1sInput);
        }

        IOUtils.closeQuietly(inputStream);
        document.close();
    } catch (Exception e) {
        logger.error(e.getMessage(), e);
        fail(e.getMessage());
    }
}

From source file:eu.sisob.uma.extractors.adhoc.email.EmailExtractor.java

License:Open Source License

/**
 *
 * @param input_file/*  w  w  w .  ja  v  a 2  s .co  m*/
 * @param data_dir
 * @param output_file
 * @param norepeat_output_file
 * @param notfound_output_file
 * @param notfound_norepeat_output_file
 * @param filters
 * @param error_sw
 */
public static void extract_emails(File input_file, File data_dir, File output_file, File norepeat_output_file,
        File notfound_output_file, File notfound_norepeat_output_file, List<String> filters,
        StringWriter error_sw) {
    CSVReader reader = null;
    try {
        reader = new CSVReader(new FileReader(input_file), CSV_SEPARATOR);
    } catch (FileNotFoundException ex) {
        Logger.getRootLogger().error("Error reading " + input_file.getName() + " - " + ex.toString());
    }

    int idStaffIdentifier = -1;
    int idName = -1;
    int idFirstName = -1;
    int idLastName = -1;
    int idInitials = -1;
    int idUnitOfAssessment_Description = -1;
    int idInstitutionName = -1;
    int idWebAddress = -1;
    int idResearchGroupDescription = -1;
    int idResearcherWebAddress = -1;
    int idResearcherWebAddressType = -1;
    int idResearcherWebAddressExt = -1;
    int idScoreUrl = -1;

    String filter_literal = "(";
    for (String filter : filters) {
        filter_literal += filter + ",";
    }
    filter_literal += ")";

    String[] nextLine;
    try {
        if ((nextLine = reader.readNext()) != null) {
            //Locate indexes            
            //Locate indexes                        
            for (int i = 0; i < nextLine.length; i++) {
                String column_name = nextLine[i];
                if (column_name.equals(FileFormatConversor.CSV_COL_ID))
                    idStaffIdentifier = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_NAME))
                    idName = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_FIRSTNAME))
                    idFirstName = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_LASTNAME))
                    idLastName = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_INITIALS))
                    idInitials = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_SUBJECT))
                    idUnitOfAssessment_Description = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_INSTITUTION_NAME))
                    idInstitutionName = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_INSTITUTION_URL))
                    idWebAddress = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_URL))
                    idResearcherWebAddress = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_TYPE))
                    idResearcherWebAddressType = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_EXT))
                    idResearcherWebAddressExt = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_SCORE_URL))
                    idScoreUrl = i;
            }
        }
    } catch (Exception ex) {
        String error_msg = "Error reading headers of " + input_file.getName();
        Logger.getRootLogger().error(error_msg + " - " + ex.toString());
        if (error_sw != null)
            error_sw.append(error_msg + "\r\n");

        return;
    }

    if (idResearcherWebAddress != -1 && idStaffIdentifier != -1 && idLastName != -1 && idInitials != -1) {
        //if(!test_only_output)
        {
            try {
                String header = "";
                header += "\"" + FileFormatConversor.CSV_COL_ID + "\"" + CSV_SEPARATOR;
                header += "\"" + FileFormatConversor.CSV_COL_LASTNAME + "\"" + CSV_SEPARATOR;
                header += "\"" + FileFormatConversor.CSV_COL_INITIALS + "\"" + CSV_SEPARATOR;
                if (idFirstName != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_INITIALS + "\"" + CSV_SEPARATOR;
                if (idName != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_NAME + "\"" + CSV_SEPARATOR;
                header += "\"" + FileFormatConversor.CSV_COL_EMAIL + "\"" + CSV_SEPARATOR;
                if (idInstitutionName != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_INSTITUTION_NAME + "\"" + CSV_SEPARATOR;
                if (idWebAddress != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_INSTITUTION_URL + "\"" + CSV_SEPARATOR;
                header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_URL + "\"" + CSV_SEPARATOR;
                if (idResearcherWebAddressExt != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_EXT + "\"" + CSV_SEPARATOR;
                if (idResearcherWebAddressType != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_TYPE + "\"" + CSV_SEPARATOR;
                if (idScoreUrl != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_SCORE_URL + "\"" + CSV_SEPARATOR;
                header += "\"" + FileFormatConversor.CSV_COL_SCORE_EMAIL + "\"";
                header += "\r\n";
                FileUtils.write(output_file, header, "UTF-8", false);

                header = "";
                header += "\"" + FileFormatConversor.CSV_COL_ID + "\"" + CSV_SEPARATOR;
                header += "\"" + FileFormatConversor.CSV_COL_LASTNAME + "\"" + CSV_SEPARATOR;
                header += "\"" + FileFormatConversor.CSV_COL_INITIALS + "\"" + CSV_SEPARATOR;
                if (idFirstName != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_INITIALS + "\"" + CSV_SEPARATOR;
                if (idName != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_NAME + "\"" + CSV_SEPARATOR;
                if (idInstitutionName != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_INSTITUTION_NAME + "\"" + CSV_SEPARATOR;
                if (idWebAddress != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_INSTITUTION_URL + "\"" + CSV_SEPARATOR;
                header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_URL + "\"" + CSV_SEPARATOR;
                if (idResearcherWebAddressExt != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_EXT + "\"" + CSV_SEPARATOR;
                if (idResearcherWebAddressType != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_TYPE + "\"" + CSV_SEPARATOR;
                if (idScoreUrl != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_SCORE_URL + "\"";
                header += "\r\n";

                FileUtils.write(notfound_output_file, header, "UTF-8", false);

            } catch (IOException ex) {
                Logger.getLogger("root").error(ex.toString());
                error_sw.append("Error creating output files\r\n");
            }
        }

        try {
            //if(!test_only_output)
            {
                Pattern p1 = Pattern.compile("([a-zA-Z0-9#._-]+)+");

                while ((nextLine = reader.readNext()) != null) {
                    nextLine[idLastName] = nextLine[idLastName].replaceAll("[^a-zA-Z]", " ").toLowerCase();
                    nextLine[idInitials] = nextLine[idInitials].replaceAll("[^a-zA-Z]", " ").toLowerCase();
                    if (idFirstName != -1)
                        nextLine[idFirstName] = nextLine[idFirstName].replaceAll("[^a-zA-Z]", " ")
                                .toLowerCase();
                    if (idName != -1)
                        nextLine[idName] = nextLine[idName].replaceAll("[^a-zA-Z]", " ").toLowerCase();

                    String content = "";
                    String researcher_page_url = nextLine[idResearcherWebAddress];
                    Logger.getLogger("root").info("Go with " + researcher_page_url);
                    if (p1.matcher(researcher_page_url).matches()) {

                        File f = new File(data_dir, researcher_page_url);

                        if (researcher_page_url.endsWith(".doc") || researcher_page_url.endsWith(".docx")) {

                            Logger.getLogger("root")
                                    .error("The document " + researcher_page_url + " could not loaded");
                            error_sw.append("The document " + researcher_page_url + " could not loaded");

                        } else if (researcher_page_url.endsWith(".pdf")) {

                            PDFParser parser = null;
                            PDFTextStripper pdfStripper = null;
                            PDDocument pdDoc = null;
                            COSDocument cosDoc = null;

                            try {
                                parser = new PDFParser(new FileInputStream(f));
                            } catch (IOException e) {
                                Logger.getLogger("root").error(e.toString());
                                error_sw.append("Unable to open PDF called " + researcher_page_url);
                            }

                            if (parser != null) {
                                try {
                                    parser.parse();
                                    cosDoc = parser.getDocument();
                                    pdfStripper = new PDFTextStripper();
                                    pdDoc = new PDDocument(cosDoc);
                                    pdfStripper.setStartPage(1);
                                    pdfStripper.setEndPage(2);
                                    content = pdfStripper.getText(pdDoc);
                                } catch (Exception e) {
                                    Logger.getLogger("root").error(e.toString());
                                    error_sw.append("An exception occured in parsing the PDF Document.");
                                } finally {
                                    try {
                                        if (cosDoc != null)
                                            cosDoc.close();
                                        if (pdDoc != null)
                                            pdDoc.close();
                                    } catch (Exception e) {
                                        Logger.getLogger("root").error(e.toString());
                                    }
                                }
                            }
                        }

                    } else {

                        try {
                            Logger.getRootLogger().info("Reading " + researcher_page_url);

                            File temp;

                            temp = File.createTempFile("temp-file-name", ".tmp");
                            URL fetched_url = Downloader.fetchURL(researcher_page_url);
                            FileUtils.copyURLToFile(fetched_url, temp);
                            long sizeInBytes = temp.length();
                            long sizeInMb = sizeInBytes / (1024 * 1024);
                            if (sizeInMb > 100) {
                                content = "";
                            } else {
                                content = FileUtils.readFileToString(temp);
                                temp.delete();
                            }

                        } catch (Exception ex) {
                            Logger.getLogger("root").error("" + researcher_page_url + " could not loaded", ex);
                            error_sw.append("" + researcher_page_url + " could not loaded");
                            content = "";
                        } catch (java.lang.OutOfMemoryError ex2) {
                            Logger.getLogger("root").error(
                                    researcher_page_url + " could not loaded (Jsoup OutOfMemoryError)", ex2);
                            error_sw.append("" + researcher_page_url + " could not loaded");
                            content = "";
                        }

                    }

                    if (!content.equals("")) {

                        //final String RE_MAIL = "([\\w\\-]([\\.\\w])+[\\w]+@([\\w\\-]+\\.)+[A-Za-z]{2,4})";
                        final String RE_MAIL = "([\\w\\-]([\\.\\w]){1,16}[\\w]{1,16}@([\\w\\-]{1,16}\\.){1,16}[A-Za-z]{2,4})";
                        Pattern p = Pattern.compile(RE_MAIL);
                        Matcher m = p.matcher(content);
                        List<String> emails = new ArrayList<String>();
                        while (m.find()) {
                            String email = m.group(1);

                            if (!emails.contains(email)) {
                                // Apply filter
                                boolean pass = true;
                                if (filters.size() > 0) {
                                    pass = false;
                                    for (String filter : filters) {

                                        String filter2 = filter.replace("*", ".*?");
                                        Pattern pattern = Pattern.compile(filter2);
                                        if (pattern.matcher(email).matches()) {
                                            pass = true;
                                            break;
                                        } else {

                                        }
                                    }
                                }

                                if (pass) {
                                    Logger.getRootLogger().info(researcher_page_url + " => " + email
                                            + " PASS FILTER! " + filter_literal);
                                    emails.add(email);
                                } else {
                                    Logger.getRootLogger().info(researcher_page_url + " => " + email
                                            + " REFUSE BY FILTER! " + filter_literal);
                                }
                            }
                        }

                        if (emails.size() < MAX_MAIL_PER_PAGE) {
                            for (String email : emails) {

                                String score_email = "";
                                String lastname = nextLine[idLastName];
                                if (lastname.length() > 5)
                                    lastname = lastname.substring(0, 6);

                                if (email.toLowerCase().contains(lastname)) {
                                    score_email = "A";
                                } else {
                                    int temp_id = idFirstName;
                                    if (temp_id == -1)
                                        temp_id = idInitials;

                                    if (!nextLine[idInitials].trim().equals("")) {

                                        String firstname = nextLine[temp_id].split(" ")[0];
                                        if (firstname.length() > 5)
                                            firstname = firstname.substring(0, 5);
                                        if (firstname.length() > 1) {
                                            if (email.toLowerCase().contains(firstname)) {
                                                score_email = "A";
                                            }
                                        }
                                    }

                                    if (score_email.equals("")) {
                                        String initials = "";

                                        String[] arr = nextLine[temp_id].split(" ");
                                        for (int i = 0; i < arr.length; i++) {
                                            if (arr[i].length() > 0)
                                                initials += arr[i].charAt(0);
                                        }
                                        initials += nextLine[idLastName].charAt(0);

                                        if (email.toLowerCase().contains(initials)) {
                                            score_email = "B";
                                        } else {
                                            score_email = "Z";
                                        }
                                    }

                                }

                                String result = "";
                                result += "\"" + nextLine[idStaffIdentifier] + "\"" + CSV_SEPARATOR;
                                result += "\"" + nextLine[idLastName] + "\"" + CSV_SEPARATOR;
                                result += "\"" + nextLine[idInitials] + "\"" + CSV_SEPARATOR;
                                if (idFirstName != -1)
                                    result += "\"" + nextLine[idFirstName] + "\"" + CSV_SEPARATOR;
                                if (idName != -1)
                                    result += "\"" + nextLine[idName] + "\"" + CSV_SEPARATOR;
                                result += "\"" + email + "\"" + CSV_SEPARATOR;
                                if (idInstitutionName != -1)
                                    result += "\"" + nextLine[idInstitutionName] + "\"" + CSV_SEPARATOR;
                                if (idWebAddress != -1)
                                    result += "\"" + nextLine[idWebAddress] + "\"" + CSV_SEPARATOR;
                                result += "\"" + nextLine[idResearcherWebAddress] + "\"" + CSV_SEPARATOR;
                                if (idResearcherWebAddressExt != -1)
                                    result += "\"" + nextLine[idResearcherWebAddressExt] + "\"" + CSV_SEPARATOR;
                                if (idResearcherWebAddressType != -1)
                                    result += "\"" + nextLine[idResearcherWebAddressType] + "\""
                                            + CSV_SEPARATOR;
                                if (idScoreUrl != -1)
                                    result += "\"" + nextLine[idScoreUrl] + "\"" + CSV_SEPARATOR;
                                result += "\"" + score_email + "\"";

                                result += "\r\n";

                                try {
                                    FileUtils.write(output_file, result, "UTF-8", true);
                                } catch (IOException ex) {
                                    Logger.getLogger("root").error(ex.toString());
                                }
                            }
                        } else {
                            content = "";
                        }

                        if (emails.size() == 0)
                            content = "";
                    }

                    if (content == "") {

                        String result = "";
                        result += "\"" + nextLine[idStaffIdentifier] + "\"" + CSV_SEPARATOR;
                        result += "\"" + nextLine[idLastName] + "\"" + CSV_SEPARATOR;
                        result += "\"" + nextLine[idInitials] + "\"" + CSV_SEPARATOR;
                        if (idFirstName != -1)
                            result += "\"" + nextLine[idFirstName] + "\"" + CSV_SEPARATOR;
                        if (idName != -1)
                            result += "\"" + nextLine[idName] + "\"" + CSV_SEPARATOR;
                        if (idInstitutionName != -1)
                            result += "\"" + nextLine[idInstitutionName] + "\"" + CSV_SEPARATOR;
                        if (idWebAddress != -1)
                            result += "\"" + nextLine[idWebAddress] + "\"" + CSV_SEPARATOR;
                        result += "\"" + nextLine[idResearcherWebAddress] + "\"" + CSV_SEPARATOR;
                        if (idResearcherWebAddressExt != -1)
                            result += "\"" + nextLine[idResearcherWebAddressExt] + "\"" + CSV_SEPARATOR;
                        if (idResearcherWebAddressType != -1)
                            result += "\"" + nextLine[idResearcherWebAddressType] + "\"" + CSV_SEPARATOR;
                        if (idScoreUrl != -1)
                            result += "\"" + nextLine[idScoreUrl] + "\"";

                        result += "\r\n";

                        try {
                            FileUtils.write(notfound_output_file, result, "UTF-8", true);
                        } catch (IOException ex) {
                            Logger.getLogger("root").error(ex.toString());
                        }
                    }
                }

                reader.close();
            }

            Logger.getLogger("root").info("Applying deduplication algoritm - Counting duplications");

            boolean finish = false;
            String alternate_filename_1 = "file1";
            String alternate_filename_2 = "file2";

            File alternate_file_s = new File(output_file.getParentFile(), alternate_filename_1);
            File alternate_file_d = new File(output_file.getParentFile(), alternate_filename_2);

            FileUtils.copyFile(output_file, alternate_file_s);

            //FileUtils.write(output_file_wor_notfound, "", "UTF-8", false);
            FileUtils.write(norepeat_output_file, "", "UTF-8", false);

            while (!finish) {
                reader = null;
                try {
                    reader = new CSVReader(new FileReader(alternate_file_s), CSV_SEPARATOR);
                } catch (FileNotFoundException ex) {
                    Logger.getRootLogger()
                            .error("Error reading " + input_file.getName() + " - " + ex.toString());
                }

                HashMap<String, Integer> count_dictionary = new HashMap<String, Integer>();
                int idEmail = 3;
                if (idFirstName != -1)
                    idEmail++;
                if (idName != -1)
                    idEmail++;

                try {
                    FileUtils.write(alternate_file_d, "", "UTF-8", false);
                } catch (IOException ex) {
                    Logger.getLogger("root").error(ex.toString());
                }
                finish = true;
                while ((nextLine = reader.readNext()) != null) {
                    Integer count = 1;
                    if (count_dictionary.containsKey(nextLine[idEmail].toString()))
                        count = count_dictionary.get(nextLine[idEmail].toString());
                    else {
                        if (count_dictionary.size() < max_in_mem) {
                            count_dictionary.put(nextLine[idEmail].toString(), count + 1);
                        } else {
                            try {
                                for (int i = 0; i < nextLine.length; i++)
                                    nextLine[i] = "\"" + nextLine[i] + "\"";
                                FileUtils.write(alternate_file_d,
                                        StringUtil.join(Arrays.asList(nextLine), String.valueOf(CSV_SEPARATOR))
                                                + "\r\n",
                                        "UTF-8", true);
                                finish = false;
                            } catch (IOException ex) {
                                Logger.getLogger("root").error(ex.toString());
                            }
                        }
                    }
                }

                reader.close();

                Logger.getLogger("root").info("Applying deduplication algoritm - Removing duplications");

                reader = null;
                try {
                    reader = new CSVReader(new FileReader(alternate_file_s), CSV_SEPARATOR);
                } catch (FileNotFoundException ex) {
                    Logger.getRootLogger()
                            .error("Error reading " + input_file.getName() + " - " + ex.toString());
                }

                String previous_id = "%previous%";
                String previous_email = "%previous_email%";
                List<String[]> cache = new ArrayList<String[]>();

                while ((nextLine = reader.readNext()) != null) {
                    String id = nextLine[idStaffIdentifier].toString();

                    if (previous_id.equals(id)) {
                        cache.add(nextLine);
                        previous_id = id;
                    } else {
                        //Process
                        String[] winner_line = null;
                        String max_score = "Z";
                        for (String[] act_line : cache) {
                            String act_score = "Z";
                            try {
                                act_score = act_line[act_line.length - 1];
                            } catch (Exception ex) {
                            }

                            String email = act_line[idEmail].toString();

                            if (count_dictionary.containsKey(email) && count_dictionary.get(email) > 0) {
                                if (max_score.compareTo(act_score) > 0 && !act_score.equals("")) {
                                    winner_line = act_line;
                                    max_score = act_score;
                                }

                                count_dictionary.put(email, 0);
                            }
                        }

                        if (winner_line != null) {
                            try {
                                for (int i = 0; i < winner_line.length; i++)
                                    winner_line[i] = "\"" + winner_line[i] + "\"";
                                FileUtils.write(norepeat_output_file,
                                        StringUtil.join(Arrays.asList(winner_line),
                                                String.valueOf(CSV_SEPARATOR)) + "\r\n",
                                        "UTF-8", true);
                            } catch (IOException ex) {
                                Logger.getLogger("root").error(ex.toString());
                            }

                        } else {
                            //                            try {
                            //                                FileUtils.write(output_file_wor_notfound, StringUtil.join(Arrays.asList(winner_line), String.valueOf(CSV_SEPARATOR)) + "\r\n", "UTF-8", true);
                            //                            } catch (IOException ex) {
                            //                                Logger.getLogger("root").error(ex.toString());
                            //                            }
                        }

                        cache.clear();

                        cache.add(nextLine);

                        previous_id = id;
                    }

                }

                //Process
                if (cache.size() > 0) {
                    String[] winner_line = null;
                    String max_score = "Z";
                    for (String[] act_line : cache) {
                        String act_score = "Z";
                        try {
                            act_score = (act_line[act_line.length - 1]);
                        } catch (Exception ex) {
                        }
                        String email = act_line[idEmail];
                        if (count_dictionary.containsKey(email) && count_dictionary.get(email) > 0) {
                            if (max_score.compareTo(act_score) > 0 && !act_score.equals("")) {
                                winner_line = act_line;
                                max_score = act_score;
                            }

                            count_dictionary.put(email, 0);
                        }
                    }

                    if (winner_line != null) {

                        try {
                            for (int i = 0; i < winner_line.length; i++)
                                winner_line[i] = "\"" + winner_line[i] + "\"";
                            FileUtils.write(norepeat_output_file,
                                    StringUtil.join(Arrays.asList(winner_line), String.valueOf(CSV_SEPARATOR))
                                            + "\r\n",
                                    "UTF-8", true);
                        } catch (IOException ex) {
                            Logger.getLogger("root").error(ex.toString());
                        }
                    } else {
                        //                        try {
                        //                            FileUtils.write(output_file_wor_notfound, StringUtil.join(Arrays.asList(winner_line), String.valueOf(CSV_SEPARATOR)) + "\r\n", "UTF-8", true);
                        //                        } catch (IOException ex) {
                        //                            Logger.getLogger("root").error(ex.toString());
                        //                        }
                    }
                }

                reader.close();

                //
                if (!finish) {
                    FileUtils.copyFile(alternate_file_d, alternate_file_s);
                    alternate_file_s = new File(output_file.getParentFile(), alternate_filename_1);
                    alternate_file_d = new File(output_file.getParentFile(), alternate_filename_2);
                }
            }

            FileUtils.forceDelete(alternate_file_s);
            FileUtils.forceDelete(alternate_file_d);

            Logger.getLogger("root").info("Applying deduplication algoritm - Finish");

        } catch (Exception ex) {
            String error_msg = "Error extracting emails from extractor " + input_file.getName();
            Logger.getRootLogger().error(error_msg + " - " + ex.toString());
            if (error_sw != null)
                error_sw.append(error_msg + "\r\n");
            return;
        }
    }
}