List of usage examples for the org.apache.pdfbox.pdmodel.PDDocument constructor PDDocument(COSDocument)
public PDDocument(COSDocument doc)
From source file:com.sastix.cms.common.services.htmltopdf.PdfTest.java
License:Apache License
@Test public void testPdfFromStringTo() throws Exception { // GIVEN an html template containing special characters that java stores in utf-16 internally Pdf pdf = pdfBuilder.build();//from w ww. j av a2 s .co m pdf.addPage("<html><head><meta charset=\"utf-8\"></head><h1>Mller</h1></html>", PageType.htmlAsString); String tempFolder = temporaryFolder.newFolder().getPath(); pdf.saveAs(tempFolder + "/output.pdf"); // WHEN byte[] pdfBytes = pdf.getPDF(); PDFParser parser = new PDFParser( new RandomAccessBufferedFileInputStream(new ByteArrayInputStream(pdfBytes))); // that is a valid PDF (otherwise an IOException occurs) parser.parse(); PDFTextStripper pdfTextStripper = new PDFTextStripper(); String pdfText = pdfTextStripper.getText(new PDDocument(parser.getDocument())); assertThat("document should contain the creditorName", pdfText, containsString("Mller")); }
From source file:com.validation.manager.core.server.core.AttachmentServerTest.java
License:Apache License
/** * Test of addFile method, of class AttachmentServer. *///from w ww. ja v a 2 s . c o m @Test public void testAddRetrieveTextFile() { try { System.out.println("add text File"); File f = new File("target/Test.txt"); f.deleteOnExit(); List<String> lines = Arrays.asList("The first line", "The second line"); Path file = Paths.get(f.getAbsolutePath()); Files.write(file, lines, Charset.forName("UTF-8")); AttachmentServer instance = new AttachmentServer(); instance.addFile(f, f.getName()); instance.write2DB(); //Delete the file FileUtils.delete(f.getAbsolutePath()); assertEquals(1, (int) instance.getAttachmentType().getId());//Text file System.out.println("retrieveFile"); AttachmentServer temp = new AttachmentServer(instance.getAttachmentPK()); File loadedFile = temp.getAttachedFile("target/loaded/"); BufferedReader br = new BufferedReader(new FileReader(loadedFile)); String line; int count = 0; while ((line = br.readLine()) != null) { assertEquals(lines.get(count), line); System.out.println(line); count++; } assertEquals(lines.size(), count); //Create pdf file System.out.println("add pdf File"); File pdf = Tool.convertToPDF(loadedFile, "target/Text.pdf"); pdf.deleteOnExit(); instance = new AttachmentServer(); instance.addFile(pdf, pdf.getName()); instance.write2DB(); //Delete the file FileUtils.delete(pdf.getAbsolutePath()); assertEquals(2, (int) instance.getAttachmentType().getId());//PDF file System.out.println("retrieveFile"); temp = new AttachmentServer(instance.getAttachmentPK()); loadedFile = temp.getAttachedFile("target/loaded/"); PDFTextStripper pdfStripper; PDDocument pdDoc = null; COSDocument cosDoc = null; try { PDFParser parser = new PDFParser(new RandomAccessBufferedFileInputStream(loadedFile)); parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdfStripper.setStartPage(1); pdfStripper.setEndPage(1); String parsedText = pdfStripper.getText(pdDoc); System.out.println(parsedText); } catch 
(IOException ex) { Exceptions.printStackTrace(ex); fail(); } finally { if (cosDoc != null) { cosDoc.close(); } if (pdDoc != null) { pdDoc.close(); } } } catch (IOException | VMException ex) { Exceptions.printStackTrace(ex); fail(); } }
From source file:cz.mzk.editor.server.handler.GetOcrFromPdfHandler.java
License:Open Source License
private String pdftoText(String fileName) throws ActionException { File pdfFile = new File(fileName); if (!pdfFile.isFile()) { LOGGER.error("The file: " + fileName + " does not exist."); throw new ActionException("Unable to parse the pdf file."); }// w w w . jav a 2 s .c o m PDFParser parser = null; COSDocument cosDoc = null; PDFTextStripper pdfStripper; PDDocument pdDoc = null; String parsedText; try { parser = new PDFParser(new RandomAccessBufferedFileInputStream(new FileInputStream(pdfFile))); } catch (Exception e) { LOGGER.error("Unable to open PDF Parser.: " + e); e.printStackTrace(); throw new ActionException("Unable to parse the pdf file."); } try { parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); parsedText = pdfStripper.getText(pdDoc); } catch (Exception e) { LOGGER.error("An exception occured in parsing the PDF Document."); e.printStackTrace(); throw new ActionException("Unable to parse the pdf file. " + e); } finally { try { if (cosDoc != null) cosDoc.close(); if (pdDoc != null) pdDoc.close(); } catch (Exception e) { e.printStackTrace(); } } return parsedText; }
From source file:data.PDFManager.java
/** * /*from ww w . java2 s . c om*/ * @return String do conteudo do pdf * @throws IOException */ public String ToText() throws IOException { this.pdfStripper = null; this.pdDoc = null; this.cosDoc = null; file = new File(filePath); parser = new PDFParser(new RandomAccessFile(file, "r")); parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdDoc.getNumberOfPages(); pdfStripper.setStartPage(1); pdfStripper.setEndPage(pdDoc.getNumberOfPages()); Text = pdfStripper.getText(pdDoc); return Text; }
From source file:de.hsmannheim.ss15.alr.searchengine.PDFParser.java
public String getTextOfPDF(byte[] in) throws Exception { ByteArrayInputStream input = new ByteArrayInputStream(in); org.apache.pdfbox.pdfparser.PDFParser parser; String parsedText = null;/*from www. java 2 s . com*/ ; PDFTextStripper pdfStripper = null; PDDocument pdDoc = null; COSDocument cosDoc = null; parser = new NonSequentialPDFParser(input); //parse PDF try { parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); parsedText = pdfStripper.getText(pdDoc); } catch (Exception e) { throw (e); } finally { if (cosDoc != null) { cosDoc.close(); } if (pdDoc != null) { pdDoc.close(); } } return parsedText; }
From source file:de.maklerpoint.office.Lucene.Indexer.java
License:Open Source License
private void indexFileorDir(String fileName) throws IOException { listFiles(new File(fileName)); for (File f : queue) { FileReader fr = null;//from w w w .j a v a 2 s. co m try { if (f.getName().startsWith(".")) { // System.out.println("Versteckte datei: " + f.getName()); // TODO add html, xml parsers } else if (f.getName().endsWith(".htm") || f.getName().endsWith(".html") || f.getName().endsWith(".xml") || f.getName().endsWith(".txt")) { Document doc = new Document(); //=================================================== // add contents of file //=================================================== fr = new FileReader(f); doc.add(new Field("contents", fr)); //=================================================== //adding second field which contains the path of the file //=================================================== doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.ANALYZED)); /** * Adding Typ */ doc.add(new Field("type", String.valueOf(FileTypes.TXT), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("modified", df.format(f.lastModified()), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("filesize", String.valueOf(FormatFileSize.formatSize(f.length(), FormatFileSize.KB)), Field.Store.YES, Field.Index.NOT_ANALYZED)); writer.addDocument(doc); } else if (f.getName().endsWith(".pdf")) { PDFParser parser = new PDFParser(new FileInputStream(f)); parser.parse(); COSDocument cd = parser.getDocument(); PDFTextStripper stripper = new PDFTextStripper(); String text = stripper.getText(new PDDocument(cd)); Document doc = new Document(); doc.add(new Field("contents", text, Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("type", String.valueOf(FileTypes.PDF), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("modified", df.format(f.lastModified()), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("filesize", 
String.valueOf(FormatFileSize.formatSize(f.length(), FormatFileSize.KB)), Field.Store.YES, Field.Index.NOT_ANALYZED)); writer.addDocument(doc); cd.close(); } else if (f.getName().endsWith(".doc") || f.getName().endsWith(".docx")) { POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(f)); WordExtractor extractor = new WordExtractor(fs); String wordText = extractor.getText(); Document doc = new Document(); doc.add(new Field("contents", wordText, Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("type", String.valueOf(FileTypes.DOC), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("modified", df.format(f.lastModified()), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("filesize", String.valueOf(FormatFileSize.formatSize(f.length(), FormatFileSize.KB)), Field.Store.YES, Field.Index.NOT_ANALYZED)); writer.addDocument(doc); } else if (f.getName().endsWith(".xls") || f.getName().endsWith(".xlsx")) { POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(f)); ExcelExtractor extractor = new ExcelExtractor(fs); String excelText = extractor.getText(); Document doc = new Document(); doc.add(new Field("contents", excelText, Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("type", String.valueOf(FileTypes.XLS), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("modified", df.format(f.lastModified()), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("filesize", String.valueOf(FormatFileSize.formatSize(f.length(), FormatFileSize.KB)), Field.Store.YES, Field.Index.NOT_ANALYZED)); writer.addDocument(doc); } else if (f.getName().endsWith(".ppt") || f.getName().endsWith(".pptx")) { POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(f)); PowerPointExtractor extractor = new PowerPointExtractor(fs); String ppttext = 
extractor.getText(); Document doc = new Document(); doc.add(new Field("contents", ppttext, Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("modified", df.format(f.lastModified()), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("type", String.valueOf(FileTypes.PPT), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("filesize", String.valueOf(FormatFileSize.formatSize(f.length(), FormatFileSize.KB)), Field.Store.YES, Field.Index.NOT_ANALYZED)); writer.addDocument(doc); } if (Log.logger.isDebugEnabled()) { Log.logger.debug("Lucene | Neue Datei indexiert: " + f); } } catch (Exception e) { if (Log.logger.isDebugEnabled()) { Log.logger.debug("Datei konnte nicht indexiert werden: " + f, e); } continue; } finally { // fr.close(); } } writer.optimize(); queue.clear(); }
From source file:edu.esprit.filereader.PdfReader.java
public String ToText() throws IOException { this.pdfStripper = null; this.pdDoc = null; this.cosDoc = null; file = new File(filePath); parser = new PDFParser(new RandomAccessFile(file, "r")); parser.parse();/* w ww. j a va2 s. c o m*/ cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdDoc.getNumberOfPages(); pdfStripper.setStartPage(1); pdfStripper.setEndPage(10); // reading text from page 1 to 10 // if you want to get text from full pdf file use this code // pdfStripper.setEndPage(pdDoc.getNumberOfPages()); Text = pdfStripper.getText(pdDoc); return Text; }
From source file:edu.isi.bmkeg.lapdf.extraction.LAPDFTextStripper.java
License:Apache License
/** * @deprecated/*from ww w. j av a 2 s . com*/ * @see LAPDFTextStripper#getWordBlocks( PDDocument ) * @param doc The document to extract the text from. * @return The document text. * @throws IOException If there is an error extracting the text. */ public String getText(COSDocument doc) throws IOException { return getWordBlocks(new PDDocument(doc)); }
From source file:edu.isi.bmkeg.lapdf.extraction.LAPDFTextStripper.java
License:Apache License
/** * @deprecated/*from ww w . java2 s . c o m*/ * @see LAPDFTextStripper#writeWordBlocks( PDDocument, Writer ) * @param doc The document to extract the text. * @param outputStream The stream to write the text to. * @throws IOException If there is an error extracting the text. */ public void writeText(COSDocument doc, Writer outputStream) throws IOException { writeWordBlocks(new PDDocument(doc), outputStream); }
From source file:eu.sisob.uma.extractors.adhoc.email.EmailExtractor.java
License:Open Source License
/** * * @param input_file//from ww w.java2 s .co m * @param data_dir * @param output_file * @param norepeat_output_file * @param notfound_output_file * @param notfound_norepeat_output_file * @param filters * @param error_sw */ public static void extract_emails(File input_file, File data_dir, File output_file, File norepeat_output_file, File notfound_output_file, File notfound_norepeat_output_file, List<String> filters, StringWriter error_sw) { CSVReader reader = null; try { reader = new CSVReader(new FileReader(input_file), CSV_SEPARATOR); } catch (FileNotFoundException ex) { Logger.getRootLogger().error("Error reading " + input_file.getName() + " - " + ex.toString()); } int idStaffIdentifier = -1; int idName = -1; int idFirstName = -1; int idLastName = -1; int idInitials = -1; int idUnitOfAssessment_Description = -1; int idInstitutionName = -1; int idWebAddress = -1; int idResearchGroupDescription = -1; int idResearcherWebAddress = -1; int idResearcherWebAddressType = -1; int idResearcherWebAddressExt = -1; int idScoreUrl = -1; String filter_literal = "("; for (String filter : filters) { filter_literal += filter + ","; } filter_literal += ")"; String[] nextLine; try { if ((nextLine = reader.readNext()) != null) { //Locate indexes //Locate indexes for (int i = 0; i < nextLine.length; i++) { String column_name = nextLine[i]; if (column_name.equals(FileFormatConversor.CSV_COL_ID)) idStaffIdentifier = i; else if (column_name.equals(FileFormatConversor.CSV_COL_NAME)) idName = i; else if (column_name.equals(FileFormatConversor.CSV_COL_FIRSTNAME)) idFirstName = i; else if (column_name.equals(FileFormatConversor.CSV_COL_LASTNAME)) idLastName = i; else if (column_name.equals(FileFormatConversor.CSV_COL_INITIALS)) idInitials = i; else if (column_name.equals(FileFormatConversor.CSV_COL_SUBJECT)) idUnitOfAssessment_Description = i; else if (column_name.equals(FileFormatConversor.CSV_COL_INSTITUTION_NAME)) idInstitutionName = i; else if 
(column_name.equals(FileFormatConversor.CSV_COL_INSTITUTION_URL)) idWebAddress = i; else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_URL)) idResearcherWebAddress = i; else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_TYPE)) idResearcherWebAddressType = i; else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_EXT)) idResearcherWebAddressExt = i; else if (column_name.equals(FileFormatConversor.CSV_COL_SCORE_URL)) idScoreUrl = i; } } } catch (Exception ex) { String error_msg = "Error reading headers of " + input_file.getName(); Logger.getRootLogger().error(error_msg + " - " + ex.toString()); if (error_sw != null) error_sw.append(error_msg + "\r\n"); return; } if (idResearcherWebAddress != -1 && idStaffIdentifier != -1 && idLastName != -1 && idInitials != -1) { //if(!test_only_output) { try { String header = ""; header += "\"" + FileFormatConversor.CSV_COL_ID + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_LASTNAME + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_INITIALS + "\"" + CSV_SEPARATOR; if (idFirstName != -1) header += "\"" + FileFormatConversor.CSV_COL_INITIALS + "\"" + CSV_SEPARATOR; if (idName != -1) header += "\"" + FileFormatConversor.CSV_COL_NAME + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_EMAIL + "\"" + CSV_SEPARATOR; if (idInstitutionName != -1) header += "\"" + FileFormatConversor.CSV_COL_INSTITUTION_NAME + "\"" + CSV_SEPARATOR; if (idWebAddress != -1) header += "\"" + FileFormatConversor.CSV_COL_INSTITUTION_URL + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_URL + "\"" + CSV_SEPARATOR; if (idResearcherWebAddressExt != -1) header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_EXT + "\"" + CSV_SEPARATOR; if (idResearcherWebAddressType != -1) header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_TYPE + "\"" + CSV_SEPARATOR; if (idScoreUrl != -1) header += "\"" + 
FileFormatConversor.CSV_COL_SCORE_URL + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_SCORE_EMAIL + "\""; header += "\r\n"; FileUtils.write(output_file, header, "UTF-8", false); header = ""; header += "\"" + FileFormatConversor.CSV_COL_ID + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_LASTNAME + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_INITIALS + "\"" + CSV_SEPARATOR; if (idFirstName != -1) header += "\"" + FileFormatConversor.CSV_COL_INITIALS + "\"" + CSV_SEPARATOR; if (idName != -1) header += "\"" + FileFormatConversor.CSV_COL_NAME + "\"" + CSV_SEPARATOR; if (idInstitutionName != -1) header += "\"" + FileFormatConversor.CSV_COL_INSTITUTION_NAME + "\"" + CSV_SEPARATOR; if (idWebAddress != -1) header += "\"" + FileFormatConversor.CSV_COL_INSTITUTION_URL + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_URL + "\"" + CSV_SEPARATOR; if (idResearcherWebAddressExt != -1) header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_EXT + "\"" + CSV_SEPARATOR; if (idResearcherWebAddressType != -1) header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_TYPE + "\"" + CSV_SEPARATOR; if (idScoreUrl != -1) header += "\"" + FileFormatConversor.CSV_COL_SCORE_URL + "\""; header += "\r\n"; FileUtils.write(notfound_output_file, header, "UTF-8", false); } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); error_sw.append("Error creating output files\r\n"); } } try { //if(!test_only_output) { Pattern p1 = Pattern.compile("([a-zA-Z0-9#._-]+)+"); while ((nextLine = reader.readNext()) != null) { nextLine[idLastName] = nextLine[idLastName].replaceAll("[^a-zA-Z]", " ").toLowerCase(); nextLine[idInitials] = nextLine[idInitials].replaceAll("[^a-zA-Z]", " ").toLowerCase(); if (idFirstName != -1) nextLine[idFirstName] = nextLine[idFirstName].replaceAll("[^a-zA-Z]", " ") .toLowerCase(); if (idName != -1) nextLine[idName] = nextLine[idName].replaceAll("[^a-zA-Z]", " 
").toLowerCase(); String content = ""; String researcher_page_url = nextLine[idResearcherWebAddress]; Logger.getLogger("root").info("Go with " + researcher_page_url); if (p1.matcher(researcher_page_url).matches()) { File f = new File(data_dir, researcher_page_url); if (researcher_page_url.endsWith(".doc") || researcher_page_url.endsWith(".docx")) { Logger.getLogger("root") .error("The document " + researcher_page_url + " could not loaded"); error_sw.append("The document " + researcher_page_url + " could not loaded"); } else if (researcher_page_url.endsWith(".pdf")) { PDFParser parser = null; PDFTextStripper pdfStripper = null; PDDocument pdDoc = null; COSDocument cosDoc = null; try { parser = new PDFParser(new FileInputStream(f)); } catch (IOException e) { Logger.getLogger("root").error(e.toString()); error_sw.append("Unable to open PDF called " + researcher_page_url); } if (parser != null) { try { parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdfStripper.setStartPage(1); pdfStripper.setEndPage(2); content = pdfStripper.getText(pdDoc); } catch (Exception e) { Logger.getLogger("root").error(e.toString()); error_sw.append("An exception occured in parsing the PDF Document."); } finally { try { if (cosDoc != null) cosDoc.close(); if (pdDoc != null) pdDoc.close(); } catch (Exception e) { Logger.getLogger("root").error(e.toString()); } } } } } else { try { Logger.getRootLogger().info("Reading " + researcher_page_url); File temp; temp = File.createTempFile("temp-file-name", ".tmp"); URL fetched_url = Downloader.fetchURL(researcher_page_url); FileUtils.copyURLToFile(fetched_url, temp); long sizeInBytes = temp.length(); long sizeInMb = sizeInBytes / (1024 * 1024); if (sizeInMb > 100) { content = ""; } else { content = FileUtils.readFileToString(temp); temp.delete(); } } catch (Exception ex) { Logger.getLogger("root").error("" + researcher_page_url + " could not loaded", ex); error_sw.append("" + 
researcher_page_url + " could not loaded"); content = ""; } catch (java.lang.OutOfMemoryError ex2) { Logger.getLogger("root").error( researcher_page_url + " could not loaded (Jsoup OutOfMemoryError)", ex2); error_sw.append("" + researcher_page_url + " could not loaded"); content = ""; } } if (!content.equals("")) { //final String RE_MAIL = "([\\w\\-]([\\.\\w])+[\\w]+@([\\w\\-]+\\.)+[A-Za-z]{2,4})"; final String RE_MAIL = "([\\w\\-]([\\.\\w]){1,16}[\\w]{1,16}@([\\w\\-]{1,16}\\.){1,16}[A-Za-z]{2,4})"; Pattern p = Pattern.compile(RE_MAIL); Matcher m = p.matcher(content); List<String> emails = new ArrayList<String>(); while (m.find()) { String email = m.group(1); if (!emails.contains(email)) { // Apply filter boolean pass = true; if (filters.size() > 0) { pass = false; for (String filter : filters) { String filter2 = filter.replace("*", ".*?"); Pattern pattern = Pattern.compile(filter2); if (pattern.matcher(email).matches()) { pass = true; break; } else { } } } if (pass) { Logger.getRootLogger().info(researcher_page_url + " => " + email + " PASS FILTER! " + filter_literal); emails.add(email); } else { Logger.getRootLogger().info(researcher_page_url + " => " + email + " REFUSE BY FILTER! 
" + filter_literal); } } } if (emails.size() < MAX_MAIL_PER_PAGE) { for (String email : emails) { String score_email = ""; String lastname = nextLine[idLastName]; if (lastname.length() > 5) lastname = lastname.substring(0, 6); if (email.toLowerCase().contains(lastname)) { score_email = "A"; } else { int temp_id = idFirstName; if (temp_id == -1) temp_id = idInitials; if (!nextLine[idInitials].trim().equals("")) { String firstname = nextLine[temp_id].split(" ")[0]; if (firstname.length() > 5) firstname = firstname.substring(0, 5); if (firstname.length() > 1) { if (email.toLowerCase().contains(firstname)) { score_email = "A"; } } } if (score_email.equals("")) { String initials = ""; String[] arr = nextLine[temp_id].split(" "); for (int i = 0; i < arr.length; i++) { if (arr[i].length() > 0) initials += arr[i].charAt(0); } initials += nextLine[idLastName].charAt(0); if (email.toLowerCase().contains(initials)) { score_email = "B"; } else { score_email = "Z"; } } } String result = ""; result += "\"" + nextLine[idStaffIdentifier] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idLastName] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idInitials] + "\"" + CSV_SEPARATOR; if (idFirstName != -1) result += "\"" + nextLine[idFirstName] + "\"" + CSV_SEPARATOR; if (idName != -1) result += "\"" + nextLine[idName] + "\"" + CSV_SEPARATOR; result += "\"" + email + "\"" + CSV_SEPARATOR; if (idInstitutionName != -1) result += "\"" + nextLine[idInstitutionName] + "\"" + CSV_SEPARATOR; if (idWebAddress != -1) result += "\"" + nextLine[idWebAddress] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idResearcherWebAddress] + "\"" + CSV_SEPARATOR; if (idResearcherWebAddressExt != -1) result += "\"" + nextLine[idResearcherWebAddressExt] + "\"" + CSV_SEPARATOR; if (idResearcherWebAddressType != -1) result += "\"" + nextLine[idResearcherWebAddressType] + "\"" + CSV_SEPARATOR; if (idScoreUrl != -1) result += "\"" + nextLine[idScoreUrl] + "\"" + CSV_SEPARATOR; result += "\"" + score_email + 
"\""; result += "\r\n"; try { FileUtils.write(output_file, result, "UTF-8", true); } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); } } } else { content = ""; } if (emails.size() == 0) content = ""; } if (content == "") { String result = ""; result += "\"" + nextLine[idStaffIdentifier] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idLastName] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idInitials] + "\"" + CSV_SEPARATOR; if (idFirstName != -1) result += "\"" + nextLine[idFirstName] + "\"" + CSV_SEPARATOR; if (idName != -1) result += "\"" + nextLine[idName] + "\"" + CSV_SEPARATOR; if (idInstitutionName != -1) result += "\"" + nextLine[idInstitutionName] + "\"" + CSV_SEPARATOR; if (idWebAddress != -1) result += "\"" + nextLine[idWebAddress] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idResearcherWebAddress] + "\"" + CSV_SEPARATOR; if (idResearcherWebAddressExt != -1) result += "\"" + nextLine[idResearcherWebAddressExt] + "\"" + CSV_SEPARATOR; if (idResearcherWebAddressType != -1) result += "\"" + nextLine[idResearcherWebAddressType] + "\"" + CSV_SEPARATOR; if (idScoreUrl != -1) result += "\"" + nextLine[idScoreUrl] + "\""; result += "\r\n"; try { FileUtils.write(notfound_output_file, result, "UTF-8", true); } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); } } } reader.close(); } Logger.getLogger("root").info("Applying deduplication algoritm - Counting duplications"); boolean finish = false; String alternate_filename_1 = "file1"; String alternate_filename_2 = "file2"; File alternate_file_s = new File(output_file.getParentFile(), alternate_filename_1); File alternate_file_d = new File(output_file.getParentFile(), alternate_filename_2); FileUtils.copyFile(output_file, alternate_file_s); //FileUtils.write(output_file_wor_notfound, "", "UTF-8", false); FileUtils.write(norepeat_output_file, "", "UTF-8", false); while (!finish) { reader = null; try { reader = new CSVReader(new FileReader(alternate_file_s), 
CSV_SEPARATOR); } catch (FileNotFoundException ex) { Logger.getRootLogger() .error("Error reading " + input_file.getName() + " - " + ex.toString()); } HashMap<String, Integer> count_dictionary = new HashMap<String, Integer>(); int idEmail = 3; if (idFirstName != -1) idEmail++; if (idName != -1) idEmail++; try { FileUtils.write(alternate_file_d, "", "UTF-8", false); } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); } finish = true; while ((nextLine = reader.readNext()) != null) { Integer count = 1; if (count_dictionary.containsKey(nextLine[idEmail].toString())) count = count_dictionary.get(nextLine[idEmail].toString()); else { if (count_dictionary.size() < max_in_mem) { count_dictionary.put(nextLine[idEmail].toString(), count + 1); } else { try { for (int i = 0; i < nextLine.length; i++) nextLine[i] = "\"" + nextLine[i] + "\""; FileUtils.write(alternate_file_d, StringUtil.join(Arrays.asList(nextLine), String.valueOf(CSV_SEPARATOR)) + "\r\n", "UTF-8", true); finish = false; } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); } } } } reader.close(); Logger.getLogger("root").info("Applying deduplication algoritm - Removing duplications"); reader = null; try { reader = new CSVReader(new FileReader(alternate_file_s), CSV_SEPARATOR); } catch (FileNotFoundException ex) { Logger.getRootLogger() .error("Error reading " + input_file.getName() + " - " + ex.toString()); } String previous_id = "%previous%"; String previous_email = "%previous_email%"; List<String[]> cache = new ArrayList<String[]>(); while ((nextLine = reader.readNext()) != null) { String id = nextLine[idStaffIdentifier].toString(); if (previous_id.equals(id)) { cache.add(nextLine); previous_id = id; } else { //Process String[] winner_line = null; String max_score = "Z"; for (String[] act_line : cache) { String act_score = "Z"; try { act_score = act_line[act_line.length - 1]; } catch (Exception ex) { } String email = act_line[idEmail].toString(); if 
(count_dictionary.containsKey(email) && count_dictionary.get(email) > 0) { if (max_score.compareTo(act_score) > 0 && !act_score.equals("")) { winner_line = act_line; max_score = act_score; } count_dictionary.put(email, 0); } } if (winner_line != null) { try { for (int i = 0; i < winner_line.length; i++) winner_line[i] = "\"" + winner_line[i] + "\""; FileUtils.write(norepeat_output_file, StringUtil.join(Arrays.asList(winner_line), String.valueOf(CSV_SEPARATOR)) + "\r\n", "UTF-8", true); } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); } } else { // try { // FileUtils.write(output_file_wor_notfound, StringUtil.join(Arrays.asList(winner_line), String.valueOf(CSV_SEPARATOR)) + "\r\n", "UTF-8", true); // } catch (IOException ex) { // Logger.getLogger("root").error(ex.toString()); // } } cache.clear(); cache.add(nextLine); previous_id = id; } } //Process if (cache.size() > 0) { String[] winner_line = null; String max_score = "Z"; for (String[] act_line : cache) { String act_score = "Z"; try { act_score = (act_line[act_line.length - 1]); } catch (Exception ex) { } String email = act_line[idEmail]; if (count_dictionary.containsKey(email) && count_dictionary.get(email) > 0) { if (max_score.compareTo(act_score) > 0 && !act_score.equals("")) { winner_line = act_line; max_score = act_score; } count_dictionary.put(email, 0); } } if (winner_line != null) { try { for (int i = 0; i < winner_line.length; i++) winner_line[i] = "\"" + winner_line[i] + "\""; FileUtils.write(norepeat_output_file, StringUtil.join(Arrays.asList(winner_line), String.valueOf(CSV_SEPARATOR)) + "\r\n", "UTF-8", true); } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); } } else { // try { // FileUtils.write(output_file_wor_notfound, StringUtil.join(Arrays.asList(winner_line), String.valueOf(CSV_SEPARATOR)) + "\r\n", "UTF-8", true); // } catch (IOException ex) { // Logger.getLogger("root").error(ex.toString()); // } } } reader.close(); // if (!finish) { 
FileUtils.copyFile(alternate_file_d, alternate_file_s); alternate_file_s = new File(output_file.getParentFile(), alternate_filename_1); alternate_file_d = new File(output_file.getParentFile(), alternate_filename_2); } } FileUtils.forceDelete(alternate_file_s); FileUtils.forceDelete(alternate_file_d); Logger.getLogger("root").info("Applying deduplication algoritm - Finish"); } catch (Exception ex) { String error_msg = "Error extracting emails from extractor " + input_file.getName(); Logger.getRootLogger().error(error_msg + " - " + ex.toString()); if (error_sw != null) error_sw.append(error_msg + "\r\n"); return; } } }