List of usage examples for org.apache.pdfbox.pdmodel.PDDocument.close()
@Override public void close() throws IOException
From source file:edu.uwm.jiaoduan.lab.ExtractTextByArea.java
License:Apache License
/** * This will print the documents text in a certain area. * * @param args The command line arguments. * * @throws Exception If there is an error parsing the document. *//* www. j a v a2s. com*/ public static void main(String[] args) throws Exception { args = new String[] { "test.pdf" }; if (args.length != 1) { usage(); } else { PDDocument document = null; try { document = PDDocument.load(args[0]); if (document.isEncrypted()) { try { document.decrypt(""); } catch (InvalidPasswordException e) { System.err.println("Error: Document is encrypted with a password."); System.exit(1); } } PDFTextStripperByArea stripper = new PDFTextStripperByArea(); stripper.setSortByPosition(true); //Rectangle rect = new Rectangle( 99,219,80,15 ); //convert xfdf coordinate to rectangle Rectangle2D.Double rect = new Rectangle2D.Double(); List allPages = document.getDocumentCatalog().getAllPages(); PDPage firstPage = (PDPage) allPages.get(0); double pageHeight = firstPage.getMediaBox().getHeight(); //125.824906,672.39502,390.577109,694.679017 double[] coords = new double[] { 58.50615, 500.847504, 302.919073, 552.419312 }; //rect.height = 694.679017 - 672.39502; rect.height = coords[3] - coords[1]; //rect.width = 390.577109 - 125.824906; rect.width = coords[2] - coords[0]; ; //rect.x = 125.824906; rect.x = coords[0]; //rect.y = pageHeight -672.39502 - rect.height; rect.y = pageHeight - coords[1] - rect.height; System.out.println(rect); stripper.addRegion("class1", rect); stripper.extractRegions(firstPage); System.out.println("Text in the area:" + rect); System.out.println(stripper.getTextForRegion("class1")); } finally { if (document != null) { document.close(); } } } }
From source file:edworld.pdfreader4humans.PDFReader.java
License:Apache License
/**
 * Creates a reader that reads PDF contents in the same order a human would read them.
 *
 * <p>The document at {@code url} is loaded, handed to {@code readAllPages} together with the
 * supplied collaborators, and closed again before the constructor returns, whether or not
 * reading succeeded.
 *
 * @param url
 *            the PDF's location
 * @param componentLocator
 *            an instance of a PDFComponentLocator subclass such as MainPDFComponentLocator
 * @param boxDetector
 *            an instance of a BoxDetector subclass such as MainBoxDetector
 * @param marginDetector
 *            an instance of a MarginDetector subclass such as MainMarginDetector
 * @throws IOException
 *             if loading, reading or closing the document fails
 */
public PDFReader(URL url, PDFComponentLocator componentLocator, BoxDetector boxDetector,
        MarginDetector marginDetector) throws IOException {
    this.url = url;
    final PDDocument document = PDDocument.load(url);
    try {
        readAllPages(document, componentLocator, boxDetector, marginDetector);
    } finally {
        // Always release the document, even when reading the pages fails.
        document.close();
    }
}
From source file:edworld.pdfreader4humans.PDFReader.java
License:Apache License
/**
 * Returns the crop box of one page of the PDF at {@code url}.
 *
 * <p>The document is loaded for this call only and closed before returning.
 *
 * @param pageIndex
 *            1-based page number; the page at list position {@code pageIndex - 1} is used
 * @return the result of {@code findCropBox()} for that page
 * @throws IOException
 *             if the document cannot be loaded or closed
 */
private PDRectangle getPageCropBox(int pageIndex) throws IOException {
    final PDDocument document = PDDocument.load(url);
    try {
        Object requestedPage = document.getDocumentCatalog().getAllPages().get(pageIndex - 1);
        return ((PDPage) requestedPage).findCropBox();
    } finally {
        document.close();
    }
}
From source file:es.jscan.Pantallas.PantallaPrincipal.java
License:Apache License
public void importarDePdf(final File archivo) { final PantallaBarra pantbarra = new PantallaBarra(PantallaPrincipal.this, false); new Thread() { @Override//from w ww . ja v a 2 s. co m public void run() { org.apache.pdfbox.pdmodel.PDDocument documento = null; try { documento = org.apache.pdfbox.pdmodel.PDDocument.load(archivo); } catch (IOException ex) { Utilidades.escribeLog("Error -importarDePdf- al importar del archivo PDF " + archivo.getAbsolutePath() + " - " + ex.getMessage()); return; } int startpage = 1; int endpage = Integer.MAX_VALUE; List pages = documento.getDocumentCatalog().getAllPages(); String titulo = pantbarra.getTitle(); pantbarra.barra.setMinimum(startpage); pantbarra.barra.setMaximum(pages.size()); pantbarra.barra.setValue(1); for (int i = startpage - 1; i < endpage && i < pages.size(); i++) { PDPage page = (PDPage) pages.get(i); java.awt.image.BufferedImage imagenpdf = null; try { imagenpdf = page.convertToImage(BufferedImage.TYPE_BYTE_GRAY, 200); } catch (IOException ex) { Utilidades.escribeLog("Error -importarDePdf- al convertir Imagen de PDF a Escala de Grises " + " - " + ex.getMessage()); } // Utilidades.escribeLog("Numero bits por pixel :"+imagenpdf.getColorModel().getPixelSize()); // imagenpdf = pantutil.convertirAGris((java.awt.image.BufferedImage) imagenpdf); guardarImagen(imagenpdf, contimagen); cargarMiniaturas(contimagen); JScrollBar bar = panelVisorMini.getVerticalScrollBar(); bar.setValue(bar.getMaximum()); pantbarra.barra.setValue(i); pantbarra.setTitle(titulo + " " + (i + 1) + " de " + pages.size()); contimagen++; doLayout(); if (pantbarra.PARAR) { break; } } PDPage page = (PDPage) pages.get(pages.size() - 1); java.awt.image.BufferedImage imagenpdf = null; try { imagenpdf = page.convertToImage(); imagenpdf = pantutil.convertirAGris((java.awt.image.BufferedImage) imagenpdf); documento.close(); } catch (IOException ex) { Utilidades.escribeLog("Error -importarDePdf- al convertir Imagen de PDF a Escala de Grises " + " - " + 
ex.getMessage()); } JScrollBar bar = panelVisorMini.getVerticalScrollBar(); if (pantbarra.PARAR) { pintarImagenPorIndice(0); ponerBordeBoton(0); bar.setValue(bar.getMinimum()); } else { pintarImagen(imagenpdf); ponerBordeBoton(contimagen - 1); bar.setValue(bar.getMaximum()); } pantbarra.dispose(); } }.start(); pantbarra.setModalityType(Dialog.ModalityType.APPLICATION_MODAL); pantbarra.setVisible(false); pantbarra.setVisible(true); }
From source file:es.ucm.pdfmeta.Main.java
License:Open Source License
public static void main(String[] args) { PDDocument doc = null; try {//w w w . j a v a2 s . com if (args.length > 0) { UIManager.setLookAndFeel("com.sun.java.swing.plaf.nimbus.NimbusLookAndFeel"); for (String arg : args) { File f = new File(arg); doc = PDDocument.load(f); MetadataModel<String> m = buildModelFromDocument(doc); Controller c = new Controller(m); MainDialog md = new MainDialog(arg, m, c); if (md.runDialog()) { modifyDocFromModel(doc, m); doc.save(arg); } } } else { System.err.println("error: no input file(s) specified"); } } catch (IOException ex) { JOptionPane.showMessageDialog(null, ex.getMessage(), "Error", JOptionPane.ERROR_MESSAGE); } catch (ClassNotFoundException | InstantiationException | IllegalAccessException | UnsupportedLookAndFeelException ex) { Logger.getLogger(Main.class.getName()).log(Level.SEVERE, null, ex); } finally { if (doc != null) try { doc.close(); } catch (IOException ex) { } } }
From source file:es.udc.fic.medregatas.util.PDFUtils.java
public static void printPDF(String docName) throws IOException, COSVisitorException { createPdfDirectoryIfNotExists();/*from w w w.j a va2 s . c om*/ // Create a document and add a page to it PDDocument document = new PDDocument(); PDPage page = new PDPage(); document.addPage(page); // Create a new font object selecting one of the PDF base fonts PDFont font = PDType1Font.HELVETICA_BOLD; // Start a new content stream which will "hold" the to be created content PDPageContentStream contentStream = new PDPageContentStream(document, page); // Define a text content stream using the selected font, moving the // cursor and drawing the text "Hello World" contentStream.beginText(); contentStream.setFont(font, 12); contentStream.moveTextPositionByAmount(100, 700); contentStream.drawString("Hello World"); contentStream.endText(); // Make sure that the content stream is closed: contentStream.close(); // Save the results and ensure that the document is properly closed: document.save(PDFS_FOLDER + "/" + docName); document.close(); }
From source file:eu.europa.ejusticeportal.dss.controller.signature.PdfUtils.java
License:EUPL
/**
 * Closes the given document without propagating a failure.
 *
 * <p>A {@code null} document is ignored. An {@link IOException} raised by {@code close()} is
 * logged through {@code LOGGER} and not rethrown.
 *
 * @param doc the document to close, may be {@code null}
 */
private static void closeQuietly(PDDocument doc) {
    if (doc == null) {
        return;
    }
    try {
        doc.close();
    } catch (IOException e) {
        LOGGER.error("Error closing the document.", e);
    }
}
From source file:eu.europa.esig.dss.pades.InfiniteLoopDSS621Test.java
License:Open Source License
/**
 * These signatures are invalid because of non ordered signed attributes.
 *
 * <p>Walks manually through every signature dictionary of the PDF at {@code FILE_PATH}
 * (six are expected) and, for each one:
 * <ol>
 * <li>writes the raw signature contents to {@code target/sigN.p7s};</li>
 * <li>parses the contents as an ASN.1 sequence and asserts that its first element is the
 * {@code signedData} OID;</li>
 * <li>reads the first digest algorithm and the encapsulated content type; entries whose
 * content type is {@code id_ct_TSTInfo} (timestamps) skip the remaining checks;</li>
 * <li>asserts the content type is {@code data} with no embedded content, looks up the
 * {@code messageDigest} authenticated attribute and compares it (Base64) with the digest
 * computed over the signed byte range;</li>
 * <li>selects the certificate whose serial number matches the signer identifier, decrypts the
 * encrypted digest with it, and compares the decrypted {@code DigestInfo} digest with the
 * digest computed over the encoded authenticated attributes.</li>
 * </ol>
 *
 * <p>NOTE(review): the {@code FileOutputStream} handed to {@code IOUtils.write} is not kept
 * in a variable and is not closed by any visible code here - confirm whether that
 * {@code IOUtils.write} closes it. {@code fis}, the ASN.1 streams and {@code document} are
 * closed only on the path where every assertion passes; a try/finally would release them on
 * failure too.
 *
 * @throws Exception if reading the file, parsing ASN.1 or the cipher operations fail
 */
//from ww w . j av a2s. co m @Test public void manualTest() throws Exception { File pdfFile = new File(FILE_PATH); FileInputStream fis = new FileInputStream(pdfFile); byte[] pdfBytes = IOUtils.toByteArray(fis); PDDocument document = PDDocument.load(pdfFile); List<PDSignature> signatures = document.getSignatureDictionaries(); assertEquals(6, signatures.size()); int idx = 0; for (PDSignature pdSignature : signatures) { byte[] contents = pdSignature.getContents(pdfBytes); byte[] signedContent = pdSignature.getSignedContent(pdfBytes); logger.info("Byte range : " + Arrays.toString(pdSignature.getByteRange())); IOUtils.write(contents, new FileOutputStream("target/sig" + (idx++) + ".p7s")); ASN1InputStream asn1sInput = new ASN1InputStream(contents); ASN1Sequence asn1Seq = (ASN1Sequence) asn1sInput.readObject(); logger.info("SEQ : " + asn1Seq.toString()); ASN1ObjectIdentifier oid = ASN1ObjectIdentifier.getInstance(asn1Seq.getObjectAt(0)); assertEquals(PKCSObjectIdentifiers.signedData, oid); SignedData signedData = SignedData .getInstance(DERTaggedObject.getInstance(asn1Seq.getObjectAt(1)).getObject()); ASN1Set digestAlgorithmSet = signedData.getDigestAlgorithms(); ASN1ObjectIdentifier oidDigestAlgo = ASN1ObjectIdentifier .getInstance(ASN1Sequence.getInstance(digestAlgorithmSet.getObjectAt(0)).getObjectAt(0)); DigestAlgorithm digestAlgorithm = DigestAlgorithm.forOID(oidDigestAlgo.getId()); logger.info("DIGEST ALGO : " + digestAlgorithm); ContentInfo encapContentInfo = signedData.getEncapContentInfo(); ASN1ObjectIdentifier contentTypeOID = encapContentInfo.getContentType(); logger.info("ENCAPSULATED CONTENT INFO TYPE : " + contentTypeOID); if (!PKCSObjectIdentifiers.id_ct_TSTInfo.equals(contentTypeOID)) { // If not timestamp assertEquals(PKCSObjectIdentifiers.data, contentTypeOID); ASN1Encodable content = encapContentInfo.getContent(); logger.info("ENCAPSULATED CONTENT INFO CONTENT : " + content); 
assertNull(content); List<X509Certificate> certificates = extractCertificates(signedData); ASN1Set signerInfosAsn1 = signedData.getSignerInfos(); logger.info("SIGNER INFO ASN1 : " + signerInfosAsn1.toString()); SignerInfo signedInfo = SignerInfo .getInstance(ASN1Sequence.getInstance(signerInfosAsn1.getObjectAt(0))); ASN1Set authenticatedAttributeSet = signedInfo.getAuthenticatedAttributes(); logger.info("AUTHENTICATED ATTR : " + authenticatedAttributeSet); Attribute attributeDigest = null; for (int i = 0; i < authenticatedAttributeSet.size(); i++) { Attribute attribute = Attribute.getInstance(authenticatedAttributeSet.getObjectAt(i)); if (PKCSObjectIdentifiers.pkcs_9_at_messageDigest.equals(attribute.getAttrType())) { attributeDigest = attribute; break; } } assertNotNull(attributeDigest); ASN1OctetString asn1ObjString = ASN1OctetString .getInstance(attributeDigest.getAttrValues().getObjectAt(0)); String embeddedDigest = Base64.encodeBase64String(asn1ObjString.getOctets()); logger.info("MESSAGE DIGEST : " + embeddedDigest); byte[] digestSignedContent = DSSUtils.digest(digestAlgorithm, signedContent); String computedDigestSignedContentEncodeBase64 = Base64.encodeBase64String(digestSignedContent); logger.info("COMPUTED DIGEST SIGNED CONTENT BASE64 : " + computedDigestSignedContentEncodeBase64); assertEquals(embeddedDigest, computedDigestSignedContentEncodeBase64); SignerIdentifier sid = signedInfo.getSID(); logger.info("SIGNER IDENTIFIER : " + sid.getId()); IssuerAndSerialNumber issuerAndSerialNumber = IssuerAndSerialNumber .getInstance(signedInfo.getSID()); ASN1Integer signerSerialNumber = issuerAndSerialNumber.getSerialNumber(); logger.info("ISSUER AND SN : " + issuerAndSerialNumber.getName() + " " + signerSerialNumber); BigInteger serial = issuerAndSerialNumber.getSerialNumber().getValue(); X509Certificate signerCertificate = null; for (X509Certificate x509Certificate : certificates) { if (serial.equals(x509Certificate.getSerialNumber())) { signerCertificate = 
x509Certificate; } } assertNotNull(signerCertificate); String algorithm = signerCertificate.getPublicKey().getAlgorithm(); EncryptionAlgorithm encryptionAlgorithm = EncryptionAlgorithm.forName(algorithm); ASN1OctetString encryptedInfoOctedString = signedInfo.getEncryptedDigest(); String signatureValue = Hex.toHexString(encryptedInfoOctedString.getOctets()); logger.info("SIGNATURE VALUE : " + signatureValue); Cipher cipher = Cipher.getInstance(encryptionAlgorithm.getName()); cipher.init(Cipher.DECRYPT_MODE, signerCertificate); byte[] decrypted = cipher.doFinal(encryptedInfoOctedString.getOctets()); ASN1InputStream inputDecrypted = new ASN1InputStream(decrypted); ASN1Sequence seqDecrypt = (ASN1Sequence) inputDecrypted.readObject(); logger.info("DECRYPTED : " + seqDecrypt); DigestInfo digestInfo = new DigestInfo(seqDecrypt); assertEquals(oidDigestAlgo, digestInfo.getAlgorithmId().getAlgorithm()); String decryptedDigestEncodeBase64 = Base64.encodeBase64String(digestInfo.getDigest()); logger.info("DECRYPTED BASE64 : " + decryptedDigestEncodeBase64); byte[] encoded = authenticatedAttributeSet.getEncoded(); byte[] digest = DSSUtils.digest(digestAlgorithm, encoded); String computedDigestFromSignatureEncodeBase64 = Base64.encodeBase64String(digest); logger.info("COMPUTED DIGEST FROM SIGNATURE BASE64 : " + computedDigestFromSignatureEncodeBase64); assertEquals(decryptedDigestEncodeBase64, computedDigestFromSignatureEncodeBase64); IOUtils.closeQuietly(inputDecrypted); } IOUtils.closeQuietly(asn1sInput); } IOUtils.closeQuietly(fis); document.close(); }
From source file:eu.europa.esig.dss.pades.signature.PAdESLevelBTest.java
License:Open Source License
/**
 * Verifies the structure of a freshly signed PDF supplied as a byte array.
 *
 * <p>The document is loaded from {@code byteArray}; exactly one signature dictionary is
 * expected. For that signature the method:
 * <ol>
 * <li>parses the signature contents as an ASN.1 sequence and asserts that its first element
 * is the {@code signedData} OID;</li>
 * <li>asserts the encapsulated content type is {@code data} with no embedded content;</li>
 * <li>iterates over the authenticated attributes and asserts that the sum of each attribute's
 * encoded OID length and encoded attribute length never decreases from one attribute to the
 * next;</li>
 * <li>asserts the attribute at index 1 is {@code messageDigest} and that its value equals
 * (Base64) the digest computed over the signed byte range;</li>
 * <li>selects the certificate whose serial number matches the signer identifier, decrypts the
 * encrypted digest with it, and compares the decrypted {@code DigestInfo} digest with the
 * digest computed over the encoded authenticated attributes.</li>
 * </ol>
 *
 * <p>Any exception is logged and turned into a test failure via {@code fail}.
 *
 * <p>NOTE(review): the streams and {@code document} are closed only when all assertions pass;
 * assertion errors are not {@code Exception}s, so they bypass both the closing calls and the
 * catch block - consider a finally block.
 *
 * @param byteArray the bytes of the signed PDF document
 */
@Override protected void onDocumentSigned(byte[] byteArray) { try {//from w w w . ja v a 2 s . c o m InputStream inputStream = new ByteArrayInputStream(byteArray); PDDocument document = PDDocument.load(inputStream); List<PDSignature> signatures = document.getSignatureDictionaries(); assertEquals(1, signatures.size()); for (PDSignature pdSignature : signatures) { byte[] contents = pdSignature.getContents(byteArray); byte[] signedContent = pdSignature.getSignedContent(byteArray); logger.info("Byte range : " + Arrays.toString(pdSignature.getByteRange())); // IOUtils.write(contents, new FileOutputStream("sig.p7s")); ASN1InputStream asn1sInput = new ASN1InputStream(contents); ASN1Sequence asn1Seq = (ASN1Sequence) asn1sInput.readObject(); logger.info("SEQ : " + asn1Seq.toString()); ASN1ObjectIdentifier oid = ASN1ObjectIdentifier.getInstance(asn1Seq.getObjectAt(0)); assertEquals(PKCSObjectIdentifiers.signedData, oid); SignedData signedData = SignedData .getInstance(DERTaggedObject.getInstance(asn1Seq.getObjectAt(1)).getObject()); ASN1Set digestAlgorithmSet = signedData.getDigestAlgorithms(); ASN1ObjectIdentifier oidDigestAlgo = ASN1ObjectIdentifier .getInstance(ASN1Sequence.getInstance(digestAlgorithmSet.getObjectAt(0)).getObjectAt(0)); DigestAlgorithm digestAlgorithm = DigestAlgorithm.forOID(oidDigestAlgo.getId()); logger.info("DIGEST ALGO : " + digestAlgorithm); ContentInfo encapContentInfo = signedData.getEncapContentInfo(); ASN1ObjectIdentifier contentTypeOID = encapContentInfo.getContentType(); logger.info("ENCAPSULATED CONTENT INFO TYPE : " + contentTypeOID); assertEquals(PKCSObjectIdentifiers.data, contentTypeOID); ASN1Encodable content = encapContentInfo.getContent(); logger.info("ENCAPSULATED CONTENT INFO CONTENT : " + content); assertNull(content); List<X509Certificate> certificates = extractCertificates(signedData); ASN1Set signerInfosAsn1 = signedData.getSignerInfos(); logger.info("SIGNER INFO ASN1 : " + signerInfosAsn1.toString()); SignerInfo signedInfo = 
SignerInfo .getInstance(ASN1Sequence.getInstance(signerInfosAsn1.getObjectAt(0))); ASN1Set authenticatedAttributeSet = signedInfo.getAuthenticatedAttributes(); logger.info("AUTHENTICATED ATTR : " + authenticatedAttributeSet); List<ASN1ObjectIdentifier> attributeOids = new ArrayList<ASN1ObjectIdentifier>(); int previousSize = 0; for (int i = 0; i < authenticatedAttributeSet.size(); i++) { Attribute attribute = Attribute.getInstance(authenticatedAttributeSet.getObjectAt(i)); ASN1ObjectIdentifier attrTypeOid = attribute.getAttrType(); attributeOids.add(attrTypeOid); int size = attrTypeOid.getEncoded().length + attribute.getEncoded().length; assertTrue(size >= previousSize); previousSize = size; } logger.info("List of OID for Auth Attrb : " + attributeOids); Attribute attributeDigest = Attribute.getInstance(authenticatedAttributeSet.getObjectAt(1)); assertEquals(PKCSObjectIdentifiers.pkcs_9_at_messageDigest, attributeDigest.getAttrType()); ASN1OctetString asn1ObjString = ASN1OctetString .getInstance(attributeDigest.getAttrValues().getObjectAt(0)); String embeddedDigest = Base64.encodeBase64String(asn1ObjString.getOctets()); logger.info("MESSAGE DIGEST : " + embeddedDigest); byte[] digestSignedContent = DSSUtils.digest(digestAlgorithm, signedContent); String computedDigestSignedContentEncodeBase64 = Base64.encodeBase64String(digestSignedContent); logger.info("COMPUTED DIGEST SIGNED CONTENT BASE64 : " + computedDigestSignedContentEncodeBase64); assertEquals(embeddedDigest, computedDigestSignedContentEncodeBase64); SignerIdentifier sid = signedInfo.getSID(); logger.info("SIGNER IDENTIFIER : " + sid.getId()); IssuerAndSerialNumber issuerAndSerialNumber = IssuerAndSerialNumber .getInstance(signedInfo.getSID()); ASN1Integer signerSerialNumber = issuerAndSerialNumber.getSerialNumber(); logger.info("ISSUER AND SN : " + issuerAndSerialNumber.getName() + " " + signerSerialNumber); BigInteger serial = issuerAndSerialNumber.getSerialNumber().getValue(); X509Certificate 
signerCertificate = null; for (X509Certificate x509Certificate : certificates) { if (serial.equals(x509Certificate.getSerialNumber())) { signerCertificate = x509Certificate; } } assertNotNull(signerCertificate); String algorithm = signerCertificate.getPublicKey().getAlgorithm(); EncryptionAlgorithm encryptionAlgorithm = EncryptionAlgorithm.forName(algorithm); ASN1OctetString encryptedInfoOctedString = signedInfo.getEncryptedDigest(); String signatureValue = Hex.toHexString(encryptedInfoOctedString.getOctets()); logger.info("SIGNATURE VALUE : " + signatureValue); Cipher cipher = Cipher.getInstance(encryptionAlgorithm.getName()); cipher.init(Cipher.DECRYPT_MODE, signerCertificate); byte[] decrypted = cipher.doFinal(encryptedInfoOctedString.getOctets()); ASN1InputStream inputDecrypted = new ASN1InputStream(decrypted); ASN1Sequence seqDecrypt = (ASN1Sequence) inputDecrypted.readObject(); logger.info("DECRYPTED : " + seqDecrypt); DigestInfo digestInfo = new DigestInfo(seqDecrypt); assertEquals(oidDigestAlgo, digestInfo.getAlgorithmId().getAlgorithm()); String decryptedDigestEncodeBase64 = Base64.encodeBase64String(digestInfo.getDigest()); logger.info("DECRYPTED BASE64 : " + decryptedDigestEncodeBase64); byte[] encoded = authenticatedAttributeSet.getEncoded(); byte[] digest = DSSUtils.digest(digestAlgorithm, encoded); String computedDigestFromSignatureEncodeBase64 = Base64.encodeBase64String(digest); logger.info("COMPUTED DIGEST FROM SIGNATURE BASE64 : " + computedDigestFromSignatureEncodeBase64); assertEquals(decryptedDigestEncodeBase64, computedDigestFromSignatureEncodeBase64); IOUtils.closeQuietly(inputDecrypted); IOUtils.closeQuietly(asn1sInput); } IOUtils.closeQuietly(inputStream); document.close(); } catch (Exception e) { logger.error(e.getMessage(), e); fail(e.getMessage()); } }
From source file:eu.sisob.uma.extractors.adhoc.email.EmailExtractor.java
License:Open Source License
/** * * @param input_file/* w w w . ja v a 2 s .co m*/ * @param data_dir * @param output_file * @param norepeat_output_file * @param notfound_output_file * @param notfound_norepeat_output_file * @param filters * @param error_sw */ public static void extract_emails(File input_file, File data_dir, File output_file, File norepeat_output_file, File notfound_output_file, File notfound_norepeat_output_file, List<String> filters, StringWriter error_sw) { CSVReader reader = null; try { reader = new CSVReader(new FileReader(input_file), CSV_SEPARATOR); } catch (FileNotFoundException ex) { Logger.getRootLogger().error("Error reading " + input_file.getName() + " - " + ex.toString()); } int idStaffIdentifier = -1; int idName = -1; int idFirstName = -1; int idLastName = -1; int idInitials = -1; int idUnitOfAssessment_Description = -1; int idInstitutionName = -1; int idWebAddress = -1; int idResearchGroupDescription = -1; int idResearcherWebAddress = -1; int idResearcherWebAddressType = -1; int idResearcherWebAddressExt = -1; int idScoreUrl = -1; String filter_literal = "("; for (String filter : filters) { filter_literal += filter + ","; } filter_literal += ")"; String[] nextLine; try { if ((nextLine = reader.readNext()) != null) { //Locate indexes //Locate indexes for (int i = 0; i < nextLine.length; i++) { String column_name = nextLine[i]; if (column_name.equals(FileFormatConversor.CSV_COL_ID)) idStaffIdentifier = i; else if (column_name.equals(FileFormatConversor.CSV_COL_NAME)) idName = i; else if (column_name.equals(FileFormatConversor.CSV_COL_FIRSTNAME)) idFirstName = i; else if (column_name.equals(FileFormatConversor.CSV_COL_LASTNAME)) idLastName = i; else if (column_name.equals(FileFormatConversor.CSV_COL_INITIALS)) idInitials = i; else if (column_name.equals(FileFormatConversor.CSV_COL_SUBJECT)) idUnitOfAssessment_Description = i; else if (column_name.equals(FileFormatConversor.CSV_COL_INSTITUTION_NAME)) idInstitutionName = i; else if 
(column_name.equals(FileFormatConversor.CSV_COL_INSTITUTION_URL)) idWebAddress = i; else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_URL)) idResearcherWebAddress = i; else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_TYPE)) idResearcherWebAddressType = i; else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_EXT)) idResearcherWebAddressExt = i; else if (column_name.equals(FileFormatConversor.CSV_COL_SCORE_URL)) idScoreUrl = i; } } } catch (Exception ex) { String error_msg = "Error reading headers of " + input_file.getName(); Logger.getRootLogger().error(error_msg + " - " + ex.toString()); if (error_sw != null) error_sw.append(error_msg + "\r\n"); return; } if (idResearcherWebAddress != -1 && idStaffIdentifier != -1 && idLastName != -1 && idInitials != -1) { //if(!test_only_output) { try { String header = ""; header += "\"" + FileFormatConversor.CSV_COL_ID + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_LASTNAME + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_INITIALS + "\"" + CSV_SEPARATOR; if (idFirstName != -1) header += "\"" + FileFormatConversor.CSV_COL_INITIALS + "\"" + CSV_SEPARATOR; if (idName != -1) header += "\"" + FileFormatConversor.CSV_COL_NAME + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_EMAIL + "\"" + CSV_SEPARATOR; if (idInstitutionName != -1) header += "\"" + FileFormatConversor.CSV_COL_INSTITUTION_NAME + "\"" + CSV_SEPARATOR; if (idWebAddress != -1) header += "\"" + FileFormatConversor.CSV_COL_INSTITUTION_URL + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_URL + "\"" + CSV_SEPARATOR; if (idResearcherWebAddressExt != -1) header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_EXT + "\"" + CSV_SEPARATOR; if (idResearcherWebAddressType != -1) header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_TYPE + "\"" + CSV_SEPARATOR; if (idScoreUrl != -1) header += "\"" + 
FileFormatConversor.CSV_COL_SCORE_URL + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_SCORE_EMAIL + "\""; header += "\r\n"; FileUtils.write(output_file, header, "UTF-8", false); header = ""; header += "\"" + FileFormatConversor.CSV_COL_ID + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_LASTNAME + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_INITIALS + "\"" + CSV_SEPARATOR; if (idFirstName != -1) header += "\"" + FileFormatConversor.CSV_COL_INITIALS + "\"" + CSV_SEPARATOR; if (idName != -1) header += "\"" + FileFormatConversor.CSV_COL_NAME + "\"" + CSV_SEPARATOR; if (idInstitutionName != -1) header += "\"" + FileFormatConversor.CSV_COL_INSTITUTION_NAME + "\"" + CSV_SEPARATOR; if (idWebAddress != -1) header += "\"" + FileFormatConversor.CSV_COL_INSTITUTION_URL + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_URL + "\"" + CSV_SEPARATOR; if (idResearcherWebAddressExt != -1) header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_EXT + "\"" + CSV_SEPARATOR; if (idResearcherWebAddressType != -1) header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_TYPE + "\"" + CSV_SEPARATOR; if (idScoreUrl != -1) header += "\"" + FileFormatConversor.CSV_COL_SCORE_URL + "\""; header += "\r\n"; FileUtils.write(notfound_output_file, header, "UTF-8", false); } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); error_sw.append("Error creating output files\r\n"); } } try { //if(!test_only_output) { Pattern p1 = Pattern.compile("([a-zA-Z0-9#._-]+)+"); while ((nextLine = reader.readNext()) != null) { nextLine[idLastName] = nextLine[idLastName].replaceAll("[^a-zA-Z]", " ").toLowerCase(); nextLine[idInitials] = nextLine[idInitials].replaceAll("[^a-zA-Z]", " ").toLowerCase(); if (idFirstName != -1) nextLine[idFirstName] = nextLine[idFirstName].replaceAll("[^a-zA-Z]", " ") .toLowerCase(); if (idName != -1) nextLine[idName] = nextLine[idName].replaceAll("[^a-zA-Z]", " 
").toLowerCase(); String content = ""; String researcher_page_url = nextLine[idResearcherWebAddress]; Logger.getLogger("root").info("Go with " + researcher_page_url); if (p1.matcher(researcher_page_url).matches()) { File f = new File(data_dir, researcher_page_url); if (researcher_page_url.endsWith(".doc") || researcher_page_url.endsWith(".docx")) { Logger.getLogger("root") .error("The document " + researcher_page_url + " could not loaded"); error_sw.append("The document " + researcher_page_url + " could not loaded"); } else if (researcher_page_url.endsWith(".pdf")) { PDFParser parser = null; PDFTextStripper pdfStripper = null; PDDocument pdDoc = null; COSDocument cosDoc = null; try { parser = new PDFParser(new FileInputStream(f)); } catch (IOException e) { Logger.getLogger("root").error(e.toString()); error_sw.append("Unable to open PDF called " + researcher_page_url); } if (parser != null) { try { parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdfStripper.setStartPage(1); pdfStripper.setEndPage(2); content = pdfStripper.getText(pdDoc); } catch (Exception e) { Logger.getLogger("root").error(e.toString()); error_sw.append("An exception occured in parsing the PDF Document."); } finally { try { if (cosDoc != null) cosDoc.close(); if (pdDoc != null) pdDoc.close(); } catch (Exception e) { Logger.getLogger("root").error(e.toString()); } } } } } else { try { Logger.getRootLogger().info("Reading " + researcher_page_url); File temp; temp = File.createTempFile("temp-file-name", ".tmp"); URL fetched_url = Downloader.fetchURL(researcher_page_url); FileUtils.copyURLToFile(fetched_url, temp); long sizeInBytes = temp.length(); long sizeInMb = sizeInBytes / (1024 * 1024); if (sizeInMb > 100) { content = ""; } else { content = FileUtils.readFileToString(temp); temp.delete(); } } catch (Exception ex) { Logger.getLogger("root").error("" + researcher_page_url + " could not loaded", ex); error_sw.append("" + 
researcher_page_url + " could not loaded"); content = ""; } catch (java.lang.OutOfMemoryError ex2) { Logger.getLogger("root").error( researcher_page_url + " could not loaded (Jsoup OutOfMemoryError)", ex2); error_sw.append("" + researcher_page_url + " could not loaded"); content = ""; } } if (!content.equals("")) { //final String RE_MAIL = "([\\w\\-]([\\.\\w])+[\\w]+@([\\w\\-]+\\.)+[A-Za-z]{2,4})"; final String RE_MAIL = "([\\w\\-]([\\.\\w]){1,16}[\\w]{1,16}@([\\w\\-]{1,16}\\.){1,16}[A-Za-z]{2,4})"; Pattern p = Pattern.compile(RE_MAIL); Matcher m = p.matcher(content); List<String> emails = new ArrayList<String>(); while (m.find()) { String email = m.group(1); if (!emails.contains(email)) { // Apply filter boolean pass = true; if (filters.size() > 0) { pass = false; for (String filter : filters) { String filter2 = filter.replace("*", ".*?"); Pattern pattern = Pattern.compile(filter2); if (pattern.matcher(email).matches()) { pass = true; break; } else { } } } if (pass) { Logger.getRootLogger().info(researcher_page_url + " => " + email + " PASS FILTER! " + filter_literal); emails.add(email); } else { Logger.getRootLogger().info(researcher_page_url + " => " + email + " REFUSE BY FILTER! 
" + filter_literal); } } } if (emails.size() < MAX_MAIL_PER_PAGE) { for (String email : emails) { String score_email = ""; String lastname = nextLine[idLastName]; if (lastname.length() > 5) lastname = lastname.substring(0, 6); if (email.toLowerCase().contains(lastname)) { score_email = "A"; } else { int temp_id = idFirstName; if (temp_id == -1) temp_id = idInitials; if (!nextLine[idInitials].trim().equals("")) { String firstname = nextLine[temp_id].split(" ")[0]; if (firstname.length() > 5) firstname = firstname.substring(0, 5); if (firstname.length() > 1) { if (email.toLowerCase().contains(firstname)) { score_email = "A"; } } } if (score_email.equals("")) { String initials = ""; String[] arr = nextLine[temp_id].split(" "); for (int i = 0; i < arr.length; i++) { if (arr[i].length() > 0) initials += arr[i].charAt(0); } initials += nextLine[idLastName].charAt(0); if (email.toLowerCase().contains(initials)) { score_email = "B"; } else { score_email = "Z"; } } } String result = ""; result += "\"" + nextLine[idStaffIdentifier] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idLastName] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idInitials] + "\"" + CSV_SEPARATOR; if (idFirstName != -1) result += "\"" + nextLine[idFirstName] + "\"" + CSV_SEPARATOR; if (idName != -1) result += "\"" + nextLine[idName] + "\"" + CSV_SEPARATOR; result += "\"" + email + "\"" + CSV_SEPARATOR; if (idInstitutionName != -1) result += "\"" + nextLine[idInstitutionName] + "\"" + CSV_SEPARATOR; if (idWebAddress != -1) result += "\"" + nextLine[idWebAddress] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idResearcherWebAddress] + "\"" + CSV_SEPARATOR; if (idResearcherWebAddressExt != -1) result += "\"" + nextLine[idResearcherWebAddressExt] + "\"" + CSV_SEPARATOR; if (idResearcherWebAddressType != -1) result += "\"" + nextLine[idResearcherWebAddressType] + "\"" + CSV_SEPARATOR; if (idScoreUrl != -1) result += "\"" + nextLine[idScoreUrl] + "\"" + CSV_SEPARATOR; result += "\"" + score_email + 
"\""; result += "\r\n"; try { FileUtils.write(output_file, result, "UTF-8", true); } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); } } } else { content = ""; } if (emails.size() == 0) content = ""; } if (content == "") { String result = ""; result += "\"" + nextLine[idStaffIdentifier] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idLastName] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idInitials] + "\"" + CSV_SEPARATOR; if (idFirstName != -1) result += "\"" + nextLine[idFirstName] + "\"" + CSV_SEPARATOR; if (idName != -1) result += "\"" + nextLine[idName] + "\"" + CSV_SEPARATOR; if (idInstitutionName != -1) result += "\"" + nextLine[idInstitutionName] + "\"" + CSV_SEPARATOR; if (idWebAddress != -1) result += "\"" + nextLine[idWebAddress] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idResearcherWebAddress] + "\"" + CSV_SEPARATOR; if (idResearcherWebAddressExt != -1) result += "\"" + nextLine[idResearcherWebAddressExt] + "\"" + CSV_SEPARATOR; if (idResearcherWebAddressType != -1) result += "\"" + nextLine[idResearcherWebAddressType] + "\"" + CSV_SEPARATOR; if (idScoreUrl != -1) result += "\"" + nextLine[idScoreUrl] + "\""; result += "\r\n"; try { FileUtils.write(notfound_output_file, result, "UTF-8", true); } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); } } } reader.close(); } Logger.getLogger("root").info("Applying deduplication algoritm - Counting duplications"); boolean finish = false; String alternate_filename_1 = "file1"; String alternate_filename_2 = "file2"; File alternate_file_s = new File(output_file.getParentFile(), alternate_filename_1); File alternate_file_d = new File(output_file.getParentFile(), alternate_filename_2); FileUtils.copyFile(output_file, alternate_file_s); //FileUtils.write(output_file_wor_notfound, "", "UTF-8", false); FileUtils.write(norepeat_output_file, "", "UTF-8", false); while (!finish) { reader = null; try { reader = new CSVReader(new FileReader(alternate_file_s), 
CSV_SEPARATOR); } catch (FileNotFoundException ex) { Logger.getRootLogger() .error("Error reading " + input_file.getName() + " - " + ex.toString()); } HashMap<String, Integer> count_dictionary = new HashMap<String, Integer>(); int idEmail = 3; if (idFirstName != -1) idEmail++; if (idName != -1) idEmail++; try { FileUtils.write(alternate_file_d, "", "UTF-8", false); } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); } finish = true; while ((nextLine = reader.readNext()) != null) { Integer count = 1; if (count_dictionary.containsKey(nextLine[idEmail].toString())) count = count_dictionary.get(nextLine[idEmail].toString()); else { if (count_dictionary.size() < max_in_mem) { count_dictionary.put(nextLine[idEmail].toString(), count + 1); } else { try { for (int i = 0; i < nextLine.length; i++) nextLine[i] = "\"" + nextLine[i] + "\""; FileUtils.write(alternate_file_d, StringUtil.join(Arrays.asList(nextLine), String.valueOf(CSV_SEPARATOR)) + "\r\n", "UTF-8", true); finish = false; } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); } } } } reader.close(); Logger.getLogger("root").info("Applying deduplication algoritm - Removing duplications"); reader = null; try { reader = new CSVReader(new FileReader(alternate_file_s), CSV_SEPARATOR); } catch (FileNotFoundException ex) { Logger.getRootLogger() .error("Error reading " + input_file.getName() + " - " + ex.toString()); } String previous_id = "%previous%"; String previous_email = "%previous_email%"; List<String[]> cache = new ArrayList<String[]>(); while ((nextLine = reader.readNext()) != null) { String id = nextLine[idStaffIdentifier].toString(); if (previous_id.equals(id)) { cache.add(nextLine); previous_id = id; } else { //Process String[] winner_line = null; String max_score = "Z"; for (String[] act_line : cache) { String act_score = "Z"; try { act_score = act_line[act_line.length - 1]; } catch (Exception ex) { } String email = act_line[idEmail].toString(); if 
(count_dictionary.containsKey(email) && count_dictionary.get(email) > 0) { if (max_score.compareTo(act_score) > 0 && !act_score.equals("")) { winner_line = act_line; max_score = act_score; } count_dictionary.put(email, 0); } } if (winner_line != null) { try { for (int i = 0; i < winner_line.length; i++) winner_line[i] = "\"" + winner_line[i] + "\""; FileUtils.write(norepeat_output_file, StringUtil.join(Arrays.asList(winner_line), String.valueOf(CSV_SEPARATOR)) + "\r\n", "UTF-8", true); } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); } } else { // try { // FileUtils.write(output_file_wor_notfound, StringUtil.join(Arrays.asList(winner_line), String.valueOf(CSV_SEPARATOR)) + "\r\n", "UTF-8", true); // } catch (IOException ex) { // Logger.getLogger("root").error(ex.toString()); // } } cache.clear(); cache.add(nextLine); previous_id = id; } } //Process if (cache.size() > 0) { String[] winner_line = null; String max_score = "Z"; for (String[] act_line : cache) { String act_score = "Z"; try { act_score = (act_line[act_line.length - 1]); } catch (Exception ex) { } String email = act_line[idEmail]; if (count_dictionary.containsKey(email) && count_dictionary.get(email) > 0) { if (max_score.compareTo(act_score) > 0 && !act_score.equals("")) { winner_line = act_line; max_score = act_score; } count_dictionary.put(email, 0); } } if (winner_line != null) { try { for (int i = 0; i < winner_line.length; i++) winner_line[i] = "\"" + winner_line[i] + "\""; FileUtils.write(norepeat_output_file, StringUtil.join(Arrays.asList(winner_line), String.valueOf(CSV_SEPARATOR)) + "\r\n", "UTF-8", true); } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); } } else { // try { // FileUtils.write(output_file_wor_notfound, StringUtil.join(Arrays.asList(winner_line), String.valueOf(CSV_SEPARATOR)) + "\r\n", "UTF-8", true); // } catch (IOException ex) { // Logger.getLogger("root").error(ex.toString()); // } } } reader.close(); // if (!finish) { 
FileUtils.copyFile(alternate_file_d, alternate_file_s); alternate_file_s = new File(output_file.getParentFile(), alternate_filename_1); alternate_file_d = new File(output_file.getParentFile(), alternate_filename_2); } } FileUtils.forceDelete(alternate_file_s); FileUtils.forceDelete(alternate_file_d); Logger.getLogger("root").info("Applying deduplication algoritm - Finish"); } catch (Exception ex) { String error_msg = "Error extracting emails from extractor " + input_file.getName(); Logger.getRootLogger().error(error_msg + " - " + ex.toString()); if (error_sw != null) error_sw.append(error_msg + "\r\n"); return; } } }