List of usage examples for org.jsoup.nodes.Document.body()
public Element body()
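Document.body() returns the document's <body> element as an org.jsoup.nodes.Element; from there the usual Element API (select(), text(), html(), attr(), ...) applies. Before the source-file examples below, here is a minimal, self-contained sketch of the basic pattern (the HTML string and class name are made up for illustration):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class BodyBasics {
    public static void main(String[] args) {
        // Made-up HTML; any markup string works the same way.
        String html = "<html><head><title>Demo</title></head>"
                + "<body><p>Hello, <a href='/x'>world</a></p></body></html>";
        Document doc = Jsoup.parse(html);
        Element body = doc.body();                          // the <body> element
        System.out.println(body.text());                    // Hello, world
        System.out.println(body.select("a").attr("href"));  // /x
    }
}

With the default HTML parser, Jsoup.parse() synthesizes <html>, <head> and <body> even for bare fragments, so body() is normally safe to call directly.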
From source file:com.crawler.app.run.CrawlSite.java
@Override public void visit(Page page) { String url = page.getWebURL().getURL(); // logger.info("URL: ", url); if (ReadXmlConfig() && readXmlConfigDatabase()) { status_read_xml = true;//from ww w . j av a 2 s .c o m } else { return; } System.out.println("\n URL visit: " + url); if (page.getParseData() instanceof HtmlParseData) { HtmlParseData htmlParseData = (HtmlParseData) page.getParseData(); String text = htmlParseData.getText(); String html = htmlParseData.getHtml(); String title = htmlParseData.getTitle(); Document doc = Jsoup.parse(html, "UTF-8"); Element body = doc.body(); Elements listDetail = body.select(bodySelect); Integer i = 0; Integer siteID = siteIDXML; Integer provinceID = 1; MysqlCrawler.createConn(host, port, dbName, dbUser, dbPwd); String jobImage, jobUrl, aJobName, cJobLocation = null, cLocationNear = "", bJobCompany = "", dJobCareer, eJobSalary, gJobDescription, gJobDetailShort, gJobDetail, jobDetailImage, jobDetailImageName, hJobExpire = null; for (Element detail : listDetail) { i++; try { jobImage = ""; /* job img */ if (!jobImgQuery.isEmpty()) { if (jobImagePosition > -1) { if (jobImagePosition < detail.select(jobImgQuery).size()) { if (!detail.select(jobImgQuery).get(jobImagePosition).attr(jobImageFormatAttr) .isEmpty()) { if (!jobImgUrl.isEmpty()) { if (JobImageSelectPosition.isEmpty()) { jobImage = jobImgUrl + detail.select(jobImgQuery).get(jobImagePosition) .attr(jobImageFormatAttr); } else { jobImage = jobImgUrl + detail.select(jobImgQuery).get(jobImagePosition) .select(JobImageSelectPosition).attr(jobImageFormatAttr); } } else { if (JobImageSelectPosition.isEmpty()) { jobImage = detail.select(jobImgQuery).get(jobImagePosition) .attr(jobImageFormatAttr); } else { jobImage = detail.select(jobImgQuery).get(jobImagePosition) .select(JobImageSelectPosition).attr(jobImageFormatAttr); } } } } } else { if (!detail.select(jobImgQuery).attr(jobImageFormatAttr).isEmpty()) { if (!jobImgUrl.isEmpty()) { jobImage = jobImgUrl + detail.select(jobImgQuery).first().attr(jobImageFormatAttr); } else { jobImage = detail.select(jobImgQuery).first().attr(jobImageFormatAttr); } } } } /* job url */ jobUrl = ""; if (!jobUrlQuery.isEmpty()) { if (jobUrlPosition > -1) { if (jobUrlPosition < detail.select(jobUrlQuery).size()) { if (!joburl_url.isEmpty()) { if (JobUrlSelectPosition.isEmpty()) { jobUrl = joburl_url + detail.select(jobUrlQuery).get(jobUrlPosition) .attr(jobUrlFormatAttr); } else { jobUrl = joburl_url + detail.select(jobUrlQuery).get(jobUrlPosition) .select(JobUrlSelectPosition).attr(jobUrlFormatAttr); } } else { if (JobUrlSelectPosition.isEmpty()) { jobUrl = detail.select(jobUrlQuery).get(jobUrlPosition) .attr(jobUrlFormatAttr); } else { jobUrl = detail.select(jobUrlQuery).get(jobUrlPosition) .select(JobUrlSelectPosition).attr(jobUrlFormatAttr); } } } } else { if (!joburl_url.isEmpty()) { jobUrl = joburl_url + detail.select(jobUrlQuery).first().attr(jobUrlFormatAttr); } else { jobUrl = detail.select(jobUrlQuery).first().attr(jobUrlFormatAttr); } } } // change org.jsoup.nodes.Element detailJobUrl = convertUrlToDocument(jobUrl); //System.out.print(detailJobUrl); //System.exit(1); /* job location */ if (!jobLocationQuery.isEmpty()) { if (jobLocationFormatData.toUpperCase().equals("TEXT")) { cJobLocation = detailJobUrl.select(jobLocationQuery).text(); } else if (jobLocationFormatData.toUpperCase().equals("HTML")) { cJobLocation = detailJobUrl.select(jobLocationQuery).html(); } } /* job name */ aJobName = ""; if (jobNameFormatData.toUpperCase().equals("TEXT")) { aJobName = 
detailJobUrl.select(jobNameQuery).text(); } else if (jobNameFormatData.toUpperCase().equals("HTML")) { aJobName = detailJobUrl.select(jobNameQuery).html(); } /* job description */ gJobDescription = ""; if (!JobDescriptionQuery.isEmpty()) { if (jobDescriptionFormatData.toUpperCase().equals("TEXT")) { gJobDescription = detailJobUrl.select(JobDescriptionQuery).text(); } else if (jobDescriptionFormatData.toUpperCase().equals("HTML")) { gJobDescription = detailJobUrl.select(JobDescriptionQuery).html(); } } /* job detail short */ gJobDetailShort = ""; if (!JobDetailShortQuery.isEmpty()) { if (jobDetailShortFormatData.toUpperCase().equals("TEXT")) { gJobDetailShort = detailJobUrl.select(JobDetailShortQuery).text(); } else if (jobDetailShortFormatData.toUpperCase().equals("HTML")) { gJobDetailShort = detailJobUrl.select(JobDetailShortQuery).html(); } } /* job detail */ gJobDetail = ""; if (!JobDetailQuery.isEmpty()) { if (jobDetailFormatData.toUpperCase().equals("TEXT")) { gJobDetail = detailJobUrl.select(JobDetailQuery).text(); } else if (jobDetailFormatData.toUpperCase().equals("HTML")) { gJobDetail = detailJobUrl.select(JobDetailQuery).html(); } } /* job detail img*/ jobDetailImage = ""; jobDetailImageName = ""; if (!jobDetailImgQuery.isEmpty()) { if (jobDetailImagePosition > -1) { if (jobDetailImagePosition < detailJobUrl.select(jobDetailImgQuery).size()) { if (!detailJobUrl.select(jobDetailImgQuery).get(jobDetailImagePosition) .attr(jobDetailImageFormatAttr).isEmpty()) { if (!jobDetailImgUrl.isEmpty()) { if (JobDetailImageSelectPosition.isEmpty()) { jobDetailImage = jobDetailImgUrl + detailJobUrl .select(jobDetailImgQuery).get(jobDetailImagePosition) .attr(jobDetailImageFormatAttr); } else { jobDetailImage = jobDetailImgUrl + detailJobUrl .select(jobDetailImgQuery).get(jobDetailImagePosition) .select(JobDetailImageSelectPosition) .attr(jobDetailImageFormatAttr); } } else { if (JobDetailImageSelectPosition.isEmpty()) { jobDetailImage = detailJobUrl.select(jobDetailImgQuery) .get(jobDetailImagePosition).attr(jobDetailImageFormatAttr); } else { jobDetailImage = detailJobUrl.select(jobDetailImgQuery) .get(jobDetailImagePosition) .select(JobDetailImageSelectPosition) .attr(jobDetailImageFormatAttr); } } } } } else { if (!detailJobUrl.select(jobDetailImgQuery).attr(jobDetailImageFormatAttr).isEmpty()) { if (!jobDetailImgUrl.isEmpty()) { jobDetailImage = jobDetailImgUrl + detailJobUrl.select(jobDetailImgQuery) .first().attr(jobDetailImageFormatAttr); } else { jobDetailImage = detailJobUrl.select(jobDetailImgQuery).first() .attr(jobDetailImageFormatAttr); } } } if (!jobDetailImage.isEmpty()) { jobDetailImageName = DownloadImage.downloadImage(jobDetailImage, "D:\\/Java\\/storage"); } } /* job location near */ cLocationNear = ""; if (!locationNearQuery.isEmpty()) { if (locationNearFormatData.toUpperCase().equals("TEXT")) { cLocationNear = detailJobUrl.select(locationNearQuery).text(); } else if (locationNearFormatData.toUpperCase().equals("HTML")) { cLocationNear = detailJobUrl.select(locationNearQuery).html(); } } /* job salary */ eJobSalary = ""; if (!JobSalaryQuery.isEmpty()) { if (jobSalaryFormatData.toUpperCase().equals("TEXT")) { eJobSalary = detailJobUrl.select(JobSalaryQuery).text(); } else if (jobSalaryFormatData.toUpperCase().equals("HTML")) { eJobSalary = detailJobUrl.select(JobSalaryQuery).html(); } } /* job expire */ hJobExpire = ""; if (!JobExpireQuery.isEmpty()) { if (jobExpireFormatData.toUpperCase().equals("TEXT")) { hJobExpire = detailJobUrl.select(JobExpireQuery).text(); } else if 
(jobExpireFormatData.toUpperCase().equals("HTML")) { hJobExpire = detailJobUrl.select(JobExpireQuery).html(); } } /* job company */ bJobCompany = ""; if (!JobCompanyQuery.isEmpty()) { if (jobCompanyFormatData.toUpperCase().equals("TEXT")) { bJobCompany = detailJobUrl.select(JobCompanyQuery).text(); } else if (jobCompanyFormatData.toUpperCase().equals("HTML")) { bJobCompany = detailJobUrl.select(JobCompanyQuery).html(); } } /* job type */ String fJobType = ""; if (!JobTypeQuery.isEmpty()) { if (jobTypeFormatData.toUpperCase().equals("TEXT")) { fJobType = detailJobUrl.select(JobTypeQuery).text(); } else if (jobTypeFormatData.toUpperCase().equals("HTML")) { fJobType = detailJobUrl.select(JobTypeQuery).html(); } } /* job address */ String jobAddress = ""; if (!JobAddressQuery.isEmpty()) { if (jobAddressFormatData.toUpperCase().equals("TEXT")) { jobAddress = detailJobUrl.select(JobAddressQuery).text(); } else if (jobAddressFormatData.toUpperCase().equals("HTML")) { jobAddress = detailJobUrl.select(JobAddressQuery).html(); } } dJobCareer = ""; if (!JobCareerQuery.isEmpty()) { if (jobCareerFormatData.toUpperCase().equals("TEXT")) { dJobCareer = detailJobUrl.select(JobCareerQuery).text(); } else if (jobCareerFormatData.toUpperCase().equals("HTML")) { dJobCareer = detailJobUrl.select(JobCareerQuery).html(); } } System.out.println("\n Url : " + jobUrl); System.out.println("\n Image : " + jobImage); System.out.println("\n Title : " + aJobName); System.out.println("\n Title SEO : " + StringUtils.removeAccent(aJobName)); //System.out.println("\n Location : " + cJobLocation + "\n" // + cLocationNear); System.out.println("\n jobDetailImageName : " + jobDetailImageName); // System.out.println("\n Detail : " + gJobDetail); // System.out.println("\n Salary : " + eJobSalary); // System.out.println("\n expire Date : " + hJobExpire); // System.out.println("\n Company : " + bJobCompany); // System.out.println("\n JobType : " + fJobType); // System.out.println("\n Full I : " + i); String news_title = aJobName; String news_title_seo = StringUtils.removeAccent(aJobName); String news_meta = aJobName; String news_description = gJobDescription; String news_tag = aJobName.replace(" ", ", "); String news_pic = jobDetailImageName; String pic_note = aJobName; String news_subcontent = "<p>" + gJobDescription + "</p>"; String news_content = gJobDetailShort + "<p><img src='http://" + jobDetailImageName + "'></p>" + gJobDetail; int type = 4; int status = 0; int kind = 0; String source = "Theo http://monngonmoingay.com"; String author = null; int user_posted = 0; int user_activated = 0; int cate_id = 43; String list_productid_relation = "13,28,30"; if (!MysqlCrawler.getInstance().checkNewsUrl(news_title_seo)) { MysqlCrawler.getInstance().insertNewsContent(news_title, news_title_seo, news_meta, news_description, news_tag, news_pic, pic_note, news_subcontent, news_content, type, status, kind, source, author, user_posted, user_activated, cate_id, list_productid_relation); } // System.exit(1); } catch (Exception ex) { System.out.println("\n Fail I : " + i); System.out.println("\n Ex : " + ex); } } } /* * Header[] responseHeaders = page.getFetchResponseHeaders(); if * (responseHeaders != null) { logger.debug("Response headers:"); for * (Header header : responseHeaders) { logger.debug("\t{}: {}", * header.getName(), header.getValue()); } } */ logger.debug("============="); }
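Distilled, the visit() method above follows one pattern: parse the fetched HTML, take doc.body(), select a list of result blocks with a configured CSS query, and read attributes or text off each block (optionally re-fetching each detail page). One point worth noting: Jsoup.parse(String, String) takes a base URI as its second argument, not a charset, so the "UTF-8" passed above only acts as a (bogus) base URI. Below is a reduced sketch of the core loop; the markup, selectors and base URI are placeholders standing in for the crawler's XML-configured queries:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class ListingScrapeSketch {
    public static void main(String[] args) {
        // Stand-in for the fetched page; the real selectors come from the crawler's XML config.
        String html = "<body><div class='job'><a href='/jobs/1'>Java developer</a>"
                + "<img src='/img/1.png'></div></body>";
        // A real base URI (second argument) lets abs:href / abs:src resolve relative links.
        Document doc = Jsoup.parse(html, "http://example.com/");
        Elements listDetail = doc.body().select("div.job"); // placeholder for bodySelect
        for (Element detail : listDetail) {
            Element link = detail.select("a").first();
            Element img = detail.select("img").first();
            String jobName = link != null ? link.text() : "";
            String jobUrl = link != null ? link.attr("abs:href") : "";
            String jobImage = img != null ? img.attr("abs:src") : "";
            System.out.println(jobName + " | " + jobUrl + " | " + jobImage);
        }
    }
}

Passing a real base URI (or using Jsoup.parse(InputStream, charset, url)) is what makes the abs:href / abs:src pseudo-attributes resolve relative links to absolute ones.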
From source file:com.spd.ukraine.lucenewebsearch1.web.IndexingController.java
/** * Method used to perform recursive creation indexing for a given web page * in search database./*from www . j a va 2s . c o m*/ * * @param webPage webPage.url is entered url * webPage.title is set * @param html Jsoup.Document of entered url * @param recursionNumber used to stop recursion at exceeding * MAX_RECURSION_SEARCH_NUMBER */ private void indexElements(WebPage webPage, Document html, final int recursionNumber) throws IOException, ParseException { String title = html.title(); if (referencedTitles.contains(title.trim())) { return; } referencedTitles.add(title.trim()); webPage.setTitle(title); if (containsPage(webPage)) { System.out.println(webPage.getUrl() + " is already indexed"); return; } Element prevElement = null; Elements elements = html.body().getAllElements(); //.getElementsByTag("a"); addDoc(webPage, html.text()); // for (Element element : elements) { //// System.out.println(element.nodeName() + " element.text() " //// + element.text() + " url " //// + element.absUrl("href")); // if (element.nodeName().equalsIgnoreCase("body")) { // addDoc(webPage, element.text()); // break; //// continue; // } // if (null == prevElement) { // prevElement = element; //// } else if (prevElementContainsElementText(prevElement, element)) { //// continue; // } //// if (null !== webPagesService.findWebPage(element.absUrl("href"))) // if (element.text().trim().isEmpty()) { // continue; // } //// StringTokenizer str = new StringTokenizer(element.text()); //// str. // addDoc(webPage, element.text()); // } if (recursionNumber > MAX_RECURSION_SEARCH_NUMBER || referencedSites.size() > MAX_NUMBER_SITES_INDEXED) { // System.out.println(recursionNumber + " " // + referencedSites.contains(webPage.getUrl())); return; } elements.parallelStream() .filter((Element e) -> e.nodeName().equalsIgnoreCase("a") && null != e.absUrl(HREF) && !e.absUrl(HREF).trim().isEmpty() && !referencedSites.contains(e.absUrl(HREF)) && !referencedSites.contains(removeSharpEtc(e.absUrl(HREF)))) .forEach((Element element) -> { WebPage webPage1 = new WebPage(element.absUrl(HREF)); String url1 = webPage1.getUrl(); // System.out.println(recursionNumber + " recursion for '" // + url1 + "'"); try { Document htmlR = Jsoup.connect(url1).get(); indexElements(webPage1, htmlR, recursionNumber + 1); } catch (IOException | ParseException e) { System.out.println("Exception " + e.getMessage()); } referencedSites.add(url1); }); // for (Element element : elements) { // if (!element.nodeName().equalsIgnoreCase("a")) { // continue; // } // WebPage webPage1 = new WebPage(element.absUrl("href")); // if (null == webPage1.getUrl() // || webPage1.getUrl().isEmpty() // || referencedSites.contains(webPage1.getUrl())) { // continue; // } // System.out.println(recursionNumber + "recursion for " // + element.absUrl("href")); // try { // Document htmlR = Jsoup.connect(webPage1.getUrl()).get(); // webPage1.setTitle(htmlR.title()); // indexElements(webPage1, htmlR, recursionNumber + 1); // } catch (IOException e) { // System.out.println("IOException " + e.getMessage()); // } // referencedSites.add(webPage1.getUrl()); // } }
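For indexing, the recipe above is: record the page text once (html.text()), then walk the elements under html.body() looking for <a> tags and recurse on their absolute URLs; absUrl("href") returns an empty string when the link cannot be resolved. A minimal sketch of that walk, without the Lucene, recursion-depth and visited-set bookkeeping (the URL and class name are placeholders):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class LinkWalkSketch {
    public static void main(String[] args) throws Exception {
        // Placeholder URL; the real code receives the Document from the controller.
        Document html = Jsoup.connect("https://example.com/").get();
        System.out.println("title: " + html.title());
        System.out.println("text to index, length: " + html.text().length()); // what addDoc(...) receives
        Elements elements = html.body().getAllElements();
        for (Element e : elements) {
            if (!e.nodeName().equalsIgnoreCase("a")) {
                continue;
            }
            String target = e.absUrl("href"); // empty string if the href cannot be resolved
            if (!target.trim().isEmpty()) {
                System.out.println("would recurse into: " + target);
            }
        }
    }
}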
From source file:lucene.IndexFiles.java
/** Indexes a single document */ static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException { try (InputStream stream = Files.newInputStream(file)) { // make a new, empty document System.out.println("Test 3.1"); BufferedReader reader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)); String line = null;//www . j av a 2 s . c om StringBuilder stringBuilder = new StringBuilder(); String ls = System.getProperty("line.separator"); try { while ((line = reader.readLine()) != null) { stringBuilder.append(line); stringBuilder.append(ls); } } finally { reader.close(); } //index file name Field fileNameField = new StringField("name", file.getFileName().toString(), Field.Store.YES); // Add the path of the file as a field named "path". Use a // field that is indexed (i.e. searchable), but don't tokenize // the field into separate words and don't index term frequency // or positional information: Field pathField = new StringField("path", file.toString(), Field.Store.YES); // Add the last modified date of the file a field named "modified". // Use a LongPoint that is indexed (i.e. efficiently filterable with // PointRangeQuery). This indexes to milli-second resolution, which // is often too fine. You could instead create a number based on // year/month/day/hour/minutes/seconds, down the resolution you require. // For example the long value 2011021714 would mean // February 17, 2011, 2-3 PM. // Add the contents of the file to a field named "contents". Specify a Reader, // so that the text of the file is tokenized and indexed, but not stored. // Note that FileReader expects the file to be in UTF-8 encoding. // If that's not the case searching for special characters will fail. String file_content = stringBuilder.toString(); //System.out.println(file_content); //String[] passages = file_content.split("<P|<p"); //String[] passages = file_content.split("<P"); //String[] passages = file_content.split("<P>|<H1>|<H2>|<H3>|<H4>|<H5>|<H6>|<BR>|<HR>|<TABLE>|<TD>|<TH>|<TR>|<OL>|<UL>|<p>|<br>|<hr>");//|<p|<h1|<h2|<h3|<h4|<h5|<h6|<br|<hr|<table|<td|<th|<tr|<ol|<ul"); String[] passages = file_content.split( "(?i)<P|(?i)<H1|(?i)<H2|(?i)<H3|(?i)<H4|(?i)<H5|(?i)<H6|(?i)<BR|(?i)<HR|(?i)<TABLE|(?i)<TD|(?i)<TH|(?i)<TR|(?i)<OL|(?i)<UL");//|<p|<h1|<h2|<h3|<h4|<h5|<h6|<br|<hr|<table|<td|<th|<tr|<ol|<ul"); //String[] passages = StringUtils.substringsBetween(file_content, "<P", "<P"); //String[] title = StringUtils.substringsBetween(file_content, "<body>", "</"); //System.out.println("path"); //String title = passages[0]; String title; Document dochtml;// = Jsoup.parse(title); String ptitle = ""; //= dochtml.body().text(); //System.out.println("Title is" + ptitle); //Field titleField = new StringField("title", ptitle, Field.Store.YES); ///////------FORMATING TEXT--------- StandardTokenizer stdToken = new StandardTokenizer(); //Tokenizer stdToken = new WhitespaceTokenizer(); EnglishMinimalStemmer stemmer = new EnglishMinimalStemmer(); //stdToken.setReader(new StringReader("Some stuff that is in need of analysis. 
stuff patients PATIENT d > 0.5 Dnn>Bnn D.N.A diseases heart attacks at cl-fo")); //You're code starts here final List<String> stopWords = new ArrayList<>(); String f = "E:/stopwords_en.txt"; try (BufferedReader br = new BufferedReader(new FileReader(f))) { String topic; //int qid = 200;//cntr=0; while ((topic = br.readLine()) != null) { stopWords.add(topic.trim()); } } final CharArraySet stopSet = new CharArraySet(stopWords, false); //////------FORMATING TEXT--------- if (passages != null) { int j = 0; if (passages.length > 1) { title = passages[1].split("</P|</H1|</H2|</H3|</H4|</H5|</H6|</p")[0]; dochtml = Jsoup.parse(title); ptitle = dochtml.body().text().toLowerCase(); System.out.println("Title is" + ptitle); } for (int i = 0; i < passages.length; i++) { //System.out.println(i); //cnames = cname.split(":"); //cname = cnames[0]; String[] passage_contents = passages[i].split("</P|</p"); //String[] passage_contents = passages[i].split("</P"); String passage_content = passage_contents[0]; //if(passage_content.trim().isEmpty()){ // System.out.println("abc"); //continue; //} dochtml = Jsoup.parse(passage_content); String plainStr = dochtml.body().text(); String[] validpas = plainStr.split(" "); if (validpas.length > 9) { j++; Field passageId = new StringField("id", file.getFileName().toString() + "." + i, Field.Store.YES); org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document(); doc.add(fileNameField); doc.add(pathField); doc.add(passageId); //doc.add(titleField); doc.add(new StringField("offset", file_content.indexOf(passage_content) + "", Field.Store.YES)); doc.add(new StringField("length", passage_content.length() + "", Field.Store.YES)); doc.add(new LongPoint("modified", lastModified)); ((org.apache.lucene.document.Document) doc).add(new TextField("title", ptitle, Store.YES)); //System.out.println(passage_content); //InputStream is = new ByteArrayInputStream(passage_content.getBytes()); //String strippedText = passage_content.replaceAll("(?s)<[^>]*>(\\s*<[^>]*>)*", " "); //--------TEXT PROCESSING------------ TokenStream tokenStream; //String nplainstr = plainStr.replaceAll("-", ".zz"); //stdToken.setReader(new StringReader(nplainstr)); stdToken.setReader(new StringReader(plainStr)); tokenStream = new StopFilter( new ASCIIFoldingFilter(new ClassicFilter(new LowerCaseFilter(stdToken))), stopSet); //tokenStream = new PorterStemFilter(tokenStream); tokenStream.reset(); //int l=0; String term = ""; StringBuilder sb = new StringBuilder(); //OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class); CharTermAttribute charTermAttr = tokenStream.getAttribute(CharTermAttribute.class); try { //int l; while (tokenStream.incrementToken()) { if (sb.length() > 0) { sb.append(" "); } term = charTermAttr.toString(); /*if(term.contains(".zz")){ term = term.replaceAll(".zz", "-"); String[] terms=term.split("-"); String at=""; for(String t : terms){ //l = stemmer.stem(t.toCharArray(), t.length()); //t = t.substring(0, l); //sb.append(t.toString(),0,l); sb.append(t + " "); at = at+t; } sb.append(at + " "); }*/ if (term.contains(".") && !term.matches(".*\\d+.*")) {//&& StringUtils.isAlpha(term)){ term = term.replaceAll("\\.", ""); //sb.append(term); } //int l = stemmer.stem(charTermAttr.toString().toCharArray(), charTermAttr.toString().length()); int l; l = stemmer.stem(term.toCharArray(), term.length()); //sb.append(charTermAttr.toString(),0,l); sb.append(term, 0, l); //sb.append(term); /*if(term.contains("-")){ String[] terms=term.split("-"); String at=""; 
for(String t : terms){ sb.append(" " + t); at = at+t; } sb.append(" " + at); }*/ /*sb.append(charTermAttr.toString()); String[] hl = charTermAttr.toString().split("-"); if (hl.length > 1){ for(int j=0; j<hl.length; j++){ sb.append(" " + hl[j]); } //sb.append(" " + charTermAttr.toString().split("-")[1]); //sb.append(charTermAttr.toString()); }*/ } } catch (IOException e) { System.out.println(e.getMessage()); } //System.out.println(sb.toString()); tokenStream.close(); ///----------END OF TExt processin---------- ((org.apache.lucene.document.Document) doc) .add(new TextField("contents", sb.toString(), Store.YES));//new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8)))); //doc.add(new StringField("contents", passage_content, Field.Store.YES)); //System.out.println(plainStr); //writer.addDocument(doc); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { n++; // New index, so we just add the document (no old document can be there): System.out.println( ".......adding " + file.getFileName().toString() + " passage " + j + "--" + n); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: System.out.println("updating " + file); writer.updateDocument(new Term("path", file.toString()), doc); } } } } } }
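Within all of the indexing code above, body() does one simple job: strip markup from an HTML passage so that only plain text is indexed. Because Jsoup.parse() wraps a bare fragment in <html><head><body>, dochtml.body().text() returns the tag-free, entity-decoded content. A sketch of just that step, with a made-up passage:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class StripTagsSketch {
    public static void main(String[] args) {
        // Stand-in for one passage split out of the source file on <P>, <H1>, <TABLE>, ... tags.
        String passageContent = "Patients with <B>heart</B> disease &amp; diabetes were studied.";
        Document dochtml = Jsoup.parse(passageContent);
        String plainStr = dochtml.body().text(); // tags stripped, entities decoded
        System.out.println(plainStr);            // Patients with heart disease & diabetes were studied.
        System.out.println("tokens: " + plainStr.split(" ").length);
    }
}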
From source file:feedzilla.Feed.java
@Override
public void run() {
    try {
        Thread.sleep((new Random()).nextInt(60 * 1000));
    } catch (InterruptedException ex) {
        Log.warn("Could not sleep Thread", ex);
    }
    Document doc = null;
    boolean get = true;
    int trysCount = 0;
    do {
        get = true;
        try {
            doc = Jsoup.connect(this.link).timeout(60 * 1000)
                    .userAgent("Mozilla/5.0 (Windows; U; WindowsNT 5.1; en-US; rv1.8.1.6) Gecko/20070725 Firefox/2.0.0.6")
                    .referrer("http://www.google.com").get();
        } catch (IOException ex) {
            Logger.getLogger(Feed.class.getName()).log(Level.SEVERE, null, ex);
            Log.warn("News " + this.category + "/" + this.subcategory + "/" + newsXMLFile.getName()
                    + " - Could not get Feed page from FeedZilla", ex);
            get = false;
            if (++trysCount > 5) {
                Log.fatal("News " + this.category + "/" + this.subcategory + "/" + newsXMLFile.getName()
                        + " - " + "Five attempts and has not yet been possible to "
                        + "retrieve the page from filezilla. Ignoring this news.");
                return;
            }
        }
    } while (!get);

    Elements elements = doc.body().select("iframe");
    for (Element element : elements) {
        try {
            this.link = URLDecoder.decode(element.attr("src"), "UTF-8");
        } catch (UnsupportedEncodingException ex) {
            Logger.getLogger(Feed.class.getName()).log(Level.SEVERE, null, ex);
            Log.fatal("News " + this.category + "/" + this.subcategory + "/" + newsXMLFile.getName()
                    + " - " + "Could not get the news link from FeedZilla pages");
            return;
        }
    }
    this.link = getUrlInParams(this.link);

    try {
        this.news = (new NewsCrawler(this.link)).getNews();
    } catch (Exception ex) {
        Log.fatal("News " + this.category + "/" + this.subcategory + "/" + newsXMLFile.getName()
                + " - " + "Could not retrieve news from link " + this.link, ex);
        return;
    }

    newsXMLFile.getParentFile().mkdirs();
    try {
        FileUtils.writeStringToFile(newsXMLFile, this.toXML());
        Log.info("News " + this.category + "/" + this.subcategory + "/" + newsXMLFile.getName()
                + " - Successfuly saved!");
        System.out.println("News " + this.category + "/" + this.subcategory + "/" + newsXMLFile.getName()
                + " - Successfuly saved!");
    } catch (IOException ex) {
        Log.error("News " + this.category + "/" + this.subcategory + "/" + newsXMLFile.getName()
                + " - Could not save news into file", ex);
    }
}
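The jsoup-specific part of run() is small: fetch the page with a configured Connection, then read the src attribute of every <iframe> under the body. A sketch of just that part, with a placeholder URL and without the retry loop and logging:

import java.net.URLDecoder;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class IframeSrcSketch {
    public static void main(String[] args) throws Exception {
        // Placeholder URL standing in for this.link.
        Document doc = Jsoup.connect("https://example.com/feed-item")
                .timeout(60 * 1000)
                .userAgent("Mozilla/5.0")
                .referrer("http://www.google.com")
                .get();
        for (Element element : doc.body().select("iframe")) {
            String link = URLDecoder.decode(element.attr("src"), "UTF-8");
            System.out.println("embedded link: " + link);
        }
    }
}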
From source file:codeu.chat.client.commandline.Chat.java
private boolean parseScript(String link, String phrase, boolean springfield) {
    String[] script;
    try {
        Document doc = Jsoup.connect(link).get();
        /* If the script was retrieved from the Springfield website, the lines must be
           split up using the <br> tag instead of new line characters */
        if (springfield) {
            String temp = Jsoup.parse(doc.html().replaceAll("(?i)<br[^>]*>", "br2n")).text();
            script = mergeScriptSentences(temp.split("br2n"));
        } else {
            script = mergeScriptSentences(doc.body().text().split("\n"));
        }
        /* Search for a line containing the phrase. Once one is found, determine the best
           response and return accordingly. In some cases, this will mean continuing to
           search for a later match */
        for (int lineNum = 0; lineNum < script.length; lineNum++) {
            script[lineNum] = script[lineNum].trim().toLowerCase();
            for (String sentence : script[lineNum].split("(?<=[!\\?\\.])")) {
                if (sentence.contains(phrase)
                        || StringUtils.getLevenshteinDistance(sentence, phrase) <= phrase.length() / 3.0) {
                    if (findNextScriptResponse(lineNum, phrase, script)) {
                        return true;
                    }
                }
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return false; // Return false if no line containing the phrase was found
}
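The springfield branch above exists because body().text() normalizes whitespace, so line breaks expressed as <br> tags are lost; replacing <br> with a marker string before re-parsing preserves them. A small sketch of the difference, using made-up markup:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class ScriptLinesSketch {
    public static void main(String[] args) {
        // Made-up script page.
        String html = "<body>Line one<br>Line two<br>Line three</body>";
        Document doc = Jsoup.parse(html);
        // body().text() normalizes whitespace, so the <br> line boundaries are gone:
        System.out.println(doc.body().text());
        // Replacing <br> with a marker before re-parsing keeps them:
        String temp = Jsoup.parse(doc.html().replaceAll("(?i)<br[^>]*>", "br2n")).text();
        for (String line : temp.split("br2n")) {
            System.out.println("line: " + line.trim());
        }
    }
}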
From source file:edu.stanford.muse.email.EmailFetcherStats.java
/** * this method returns the text content of the message as a list of strings * // each element of the list could be the content of a multipart message * // m is the top level subject//from www .jav a2s .com * // p is the specific part that we are processing (p could be == m) * also sets up names of attachments (though it will not download the * attachment unless downloadAttachments is true) */ private List<String> processMessagePart(int messageNum, Message m, Part p, List<Blob> attachmentsList) throws MessagingException, IOException { List<String> list = new ArrayList<String>(); // return list if (p == null) { dataErrors.add("part is null: " + folder_name() + " idx " + messageNum); return list; } if (p == m && p.isMimeType("text/html")) { /* String s = "top level part is html! message:" + m.getSubject() + " " + m.getDescription(); dataErrors.add(s); */ // we don't normally expect the top-level part to have content-type text/html // but we saw this happen on some sample archives pst -> emailchemy. so allow it and handle it by parsing the html String html = (String) p.getContent(); String text = Util.unescapeHTML(html); org.jsoup.nodes.Document doc = Jsoup.parse(text); StringBuilder sb = new StringBuilder(); HTMLUtils.extractTextFromHTML(doc.body(), sb); list.add(sb.toString()); return list; } if (p.isMimeType("text/plain")) { //make sure, p is not wrongly labelled as plain text. Enumeration headers = p.getAllHeaders(); boolean dirty = false; if (headers != null) while (headers.hasMoreElements()) { Header h = (Header) headers.nextElement(); String name = h.getName(); String value = h.getValue(); if (name != null && value != null) { if (name.equals("Content-transfer-encoding") && value.equals("base64")) { dirty = true; break; } } } String fname = p.getFileName(); if (fname != null) { int idx = fname.lastIndexOf('.'); if ((idx < fname.length()) && (idx >= 0)) { String extension = fname.substring(idx); //anything extension other than .txt is suspicious. if (!extension.equals(".txt")) dirty = true; } } if (dirty) { dataErrors.add("Dirty message part, has conflicting message part headers." + folder_name() + " Message# " + messageNum); return list; } log.debug("Message part with content type text/plain"); String content; String type = p.getContentType(); // new InputStreamReader(p.getInputStream(), "UTF-8"); try { // if forced encoding is set, we read the string with that encoding, otherwise we just use whatever p.getContent gives us if (FORCED_ENCODING != null) { byte b[] = Util.getBytesFromStream(p.getInputStream()); content = new String(b, FORCED_ENCODING); } else content = (String) p.getContent(); } catch (UnsupportedEncodingException uee) { dataErrors.add("Unsupported encoding: " + folder_name() + " Message #" + messageNum + " type " + type + ", using brute force conversion"); // a particularly nasty issue:javamail can't handle utf-7 encoding which is common with hotmail and exchange servers. // we're using the workaround suggested on this page: http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4304013 // though it may be better to consider official support for utf-7 or other encodings. // TOFIX: I get an exception for utfutf8-encoding which has a base64 encoding embedded on it. // Unsupported encoding: gmail-sent Message #10477 type text/plain; charset=x-utf8utf8; name="newyorker.txt", // the hack below doesn't work for it. 
ByteArrayOutputStream bao = new ByteArrayOutputStream(); p.writeTo(bao); content = bao.toString(); } list.add(content); } else if (p.isMimeType("multipart/*") || p.isMimeType("message/rfc822")) { // rfc822 mime type is for embedded mbox format or some such (appears for things like // forwarded messages). the content appears to be just a multipart. Object o = p.getContent(); if (o instanceof Multipart) { Multipart allParts = (Multipart) o; if (p.isMimeType("multipart/alternative")) { // this is an alternative mime type. v common case to have text and html alternatives // so just process the text part if there is one, and avoid fetching the alternatives. // useful esp. because many ordinary messages are alternative: text and html and we don't want to fetch the html. // revisit in future we want to retain the html alternative for display purposes Part[] parts = new Part[allParts.getCount()]; for (int i = 0; i < parts.length; i++) parts[i] = allParts.getBodyPart(i); for (int i = 0; i < parts.length; i++) { Part thisPart = parts[i]; if (thisPart.isMimeType("text/plain")) { // common case, return quickly list.add((String) thisPart.getContent()); log.debug("Multipart/alternative with content type text/plain"); return list; } } // no text part, let's look for an html part. this happens for html parts. for (int i = 0; i < allParts.getCount(); i++) { Part thisPart = parts[i]; if (thisPart.isMimeType("text/html")) { // common case, return quickly String html = (String) thisPart.getContent(); String text = Util.unescapeHTML(html); org.jsoup.nodes.Document doc = Jsoup.parse(text); StringBuilder sb = new StringBuilder(); HTMLUtils.extractTextFromHTML(doc.body(), sb); list.add(sb.toString()); log.debug("Multipart/alternative with content type text/html"); return list; } } // no text or html part. hmmm... blindly process the first part only if (allParts.getCount() >= 1) list.addAll(processMessagePart(messageNum, m, allParts.getBodyPart(0), attachmentsList)); } else { // process it like a regular multipart for (int i = 0; i < allParts.getCount(); i++) { BodyPart bp = allParts.getBodyPart(i); list.addAll(processMessagePart(messageNum, m, bp, attachmentsList)); } } } else if (o instanceof Part) list.addAll(processMessagePart(messageNum, m, (Part) o, attachmentsList)); else dataErrors.add("Unhandled part content, " + folder_name() + " Message #" + messageNum + "Java type: " + o.getClass() + " Content-Type: " + p.getContentType()); } else { try { // do attachments only if downloadAttachments is set. // some apps do not need attachments, so this saves some time. // however, it seems like a lot of time is taken in imap prefetch, which gets attachments too? if (fetchConfig.downloadAttachments) handleAttachments(messageNum, m, p, list, attachmentsList); } catch (Exception e) { dataErrors.add("Ignoring attachment for " + folder_name() + " Message #" + messageNum + ": " + Util.stackTrace(e)); } } return list; }
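Both places above that handle a text/html part use the same recipe: unescape the HTML, parse it with jsoup, and extract plain text starting from doc.body(). HTMLUtils.extractTextFromHTML is a project-specific helper (edu.stanford.muse), not part of jsoup; with jsoup alone, body().text() gives a comparable, whitespace-normalized result. A minimal sketch of that conversion step, with an illustrative HTML string:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class HtmlPartToTextSketch {
    public static void main(String[] args) {
        // Stand-in for the String returned by p.getContent() on a text/html part.
        String html = "<html><body><p>Dear all,</p><p>Meeting at 10 &amp; lunch after.</p></body></html>";
        Document doc = Jsoup.parse(html);
        // The original hands doc.body() to HTMLUtils.extractTextFromHTML(doc.body(), sb);
        // plain jsoup gets close with:
        String text = doc.body().text();
        System.out.println(text); // Dear all, Meeting at 10 & lunch after.
    }
}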
From source file:edu.stanford.muse.email.EmailFetcherStats.java
/** * recursively processes attachments, fetching and saving it if needed * parses the given part p, and adds it to hte attachmentsList. * in some cases, like a text/html type without a filename, we instead append it to the textlist * @throws MessagingException// w w w . jav a2 s .com */ private void handleAttachments(int idx, Message m, Part p, List<String> textList, List<Blob> attachmentsList) throws MessagingException { String ct = null; if (!(m instanceof MimeMessage)) { Exception e = new IllegalArgumentException("Not a MIME message!"); e.fillInStackTrace(); log.warn(Util.stackTrace(e)); return; } String filename = null; try { filename = p.getFileName(); } catch (Exception e) { // seen this happen with: // Folders__gmail-sent Message #12185 Expected ';', got "Message" // javax.mail.internet.ParseException: Expected ';', got "Message" dataErrors.add("Unable to read attachment name: " + folder_name() + " Message# " + idx); return; } String sanitizedFName = Util.sanitizeFolderName(emailStore.getAccountID() + "." + folder_name()); if (filename == null) { String tempFname = sanitizedFName + "." + idx; dataErrors.add("attachment filename is null for " + sanitizedFName + " Message#" + idx + " assigning it the name: " + tempFname); if (p.isMimeType("text/html")) { try { log.info("Turning message " + sanitizedFName + " Message#" + idx + " into text although it is an attachment"); String html = (String) p.getContent(); String text = Util.unescapeHTML(html); org.jsoup.nodes.Document doc = Jsoup.parse(text); StringBuilder sb = new StringBuilder(); HTMLUtils.extractTextFromHTML(doc.body(), sb); textList.add(sb.toString()); return; } catch (Exception e) { Util.print_exception("Error reading contents of text/html multipart without a filename!", e, log); return; } } filename = tempFname; } // Replacing any of the disallowed filename characters (\/:*?"<>|&) to _ // (note: & causes problems with URLs for serveAttachment etc, so it's also replaced) String newFilename = Util.sanitizeFileName(filename); // Updating filename if it's changed after sanitizing. if (!newFilename.equals(filename)) { log.info("Filename changed from " + filename + " to " + newFilename); filename = newFilename; } try { ct = p.getContentType(); if (filename.indexOf(".") < 0) // no ext in filename... let's fix it if possible { // Using startsWith instead of equals because sometimes the ct has crud beyond the image/jpeg;...crud.... 
// Below are the most common file types, more type can be added if needed // Most common APPLICATION TYPE if (ct.startsWith("application/pdf")) filename = filename + ".pdf"; if (ct.startsWith("application/zip")) filename = filename + ",zip"; // Most common IMAGE TYPE if (ct.startsWith("image/jpeg")) filename = filename + ".jpg"; if (ct.startsWith("image/gif")) filename = filename + ".gif"; if (ct.startsWith("image/png")) filename = filename + ".png"; // Most Common VIDEO TYPE if (ct.startsWith("video/x-ms-wmv")) filename = filename + ".wmv"; // Most Common AUDIO TYPE if (ct.startsWith("audio/mpeg")) filename = filename + ".mp3"; if (ct.startsWith("audio/mp4")) filename = filename + ".mp4"; // Most Common TEXT TYPE if (ct.startsWith("text/html")) filename = filename + ".html"; // Windows Office if (ct.startsWith("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) //Word filename = filename + ".docx"; if (ct.startsWith("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")) //Excel filename = filename + ".xlsx"; if (ct.startsWith("application/vnd.openxmlformats-officedocument.presentationml.presentation")) //PowerPoint filename = filename + ".pptx"; } // retain only up to first semi-colon; often ct is something like text/plain; name="filename"' we don't want to log the filename int x = ct.indexOf(";"); if (x >= 0) ct = ct.substring(0, x); log.info("Attachment content type: " + ct + " filename = " + Util.blurKeepingExtension(filename)); } catch (Exception pex) { dataErrors.add("Can't read CONTENT-TYPE: " + ct + " filename:" + filename + " size = " + p.getSize() + " subject: " + m.getSubject() + " Date : " + m.getSentDate().toString() + "\n Exception: " + pex + "\n" + Util.stackTrace(pex)); return; } // if (filename == null && !p.isMimeType("text/html") && !p.isMimeType("message/partial")) // expected not to have a filename with mime type text/html // log.warn ("Attachment filename is null: " + Util.stackTrace()); boolean success = true; // the size passed in here is the part size, which is not really the binary blob size. // when we read the stream below in blobStore.add(), we'll set it again to the binary blob size Blob b = new EmailAttachmentBlob(filename, p.getSize(), (MimeMessage) m, p); if (fetchConfig.downloadAttachments) { // this containment check is only on the basis of file name and size currently, // not on the actual hash if (archive.getBlobStore().contains(b)) { log.debug("Cache hit! " + b); } else { try { if (filename.endsWith(".tif")) log.info("Fetching attachment..." + Util.blurKeepingExtension(filename)); // performance critical! use large buffer! 
currently 256KB // stream will be closed by callee long start = System.currentTimeMillis(); long nBytes = archive.getBlobStore().add(b, new BufferedInputStream(p.getInputStream(), 256 * 1024)); long end = System.currentTimeMillis(); if (nBytes != -1) { long diff = end - start; String s = "attachment size " + nBytes + " bytes, fetched in " + diff + " millis"; if (diff > 0) s += " (" + (nBytes / diff) + " KB/s)"; log.info(s); } Util.ASSERT(archive.getBlobStore().contains(b)); } catch (IOException ioe) { success = false; dataErrors.add("WARNING: Unable to fetch attachment: filename: " + filename + " size = " + p.getSize() + " subject: " + m.getSubject() + " Date : " + m.getSentDate().toString() + "\nException: " + ioe); ioe.printStackTrace(System.out); } } if (success) { attachmentsList.add(b); /// generate thumbnail only if not already cached try { archive.getBlobStore().generate_thumbnail(b); // supplement } catch (IOException ioe) { log.warn("failed to create thumbnail, filename: " + filename + " size = " + p.getSize() + " subject: " + m.getSubject() + " Date : " + m.getSentDate().toString() + "\nException: " + ioe); ioe.printStackTrace(System.out); } } } }
From source file:com.aurel.track.exchange.docx.exporter.PreprocessImage.java
/**
 * Gets the image captions in a map keyed by itemID_attachmentID
 * The key is saved also in the <img> tag's "alt" attribute for later use from word
 * @param doc
 * @param personID
 * @param imageCaptionsMap
 * @return
 */
private String getImageCaptions(Document doc, Integer personID,
        Map<String, ImageOrTableCaption> imageCaptionsMap) {
    Elements imgElements = doc.select("img");
    if (imgElements != null) {
        for (Iterator<Element> iterator = imgElements.iterator(); iterator.hasNext();) {
            Element imageElement = iterator.next();
            String sourceAttribute = imageElement.attr("src");
            String style = imageElement.attr("style");
            //remove the width and height attributes from html img to avoid java.lang.OutOfMemoryError: Java heap space
            imageElement.removeAttr("width");
            imageElement.removeAttr("height");
            ALIGN align = null;
            if (style != null) {
                if (style.contains("float:left")) {
                    align = ALIGN.LEFT;
                } else {
                    if (style.contains("float:right")) {
                        align = ALIGN.RIGHT;
                    }
                }
            }
            String altAttribute = imageElement.attr("alt");
            Map<String, String> map = getTemporaryFilePathMap(sourceAttribute, personID);
            if (map != null) {
                imageElement.attr("src", map.get("temporaryFilePath"));
                //save imageCaption into the map and now use the "alt" attribute for storing the merged key
                //which will be transformed in nonvisualdrawingprops.getDescr() by XHTMLImporterImpl to set the caption on the ms word side
                String imageCaption = null;
                if (altAttribute != null && !"".equals(altAttribute)) {
                    //probably from previously removed figcaption but it may also be explicitly set
                    imageCaption = altAttribute;
                } else {
                    imageCaption = map.get("description");
                }
                globalCounter++;
                counterWithinChapter++;
                imageElement.attr("alt", String.valueOf(globalCounter));
                if (imageCaption == null) {
                    //add anyway to the map even as empty string because this marks the image to be added to the List of figures
                    imageCaption = "";
                }
                imageCaptionsMap.put(String.valueOf(globalCounter),
                        new ImageOrTableCaption(chapterNo, counterWithinChapter, imageCaption, align));
            }
        }
    }
    return doc.body().html();
}
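The detail worth highlighting is that a jsoup Document is mutable: the loop rewrites each <img> in place via attr() and removeAttr(), and doc.body().html() then serializes the inner HTML of the body with those edits applied. A reduced sketch with illustrative attribute values and a placeholder temp path:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class RewriteImagesSketch {
    public static void main(String[] args) {
        // Illustrative markup; the real input comes from the item's HTML description.
        String html = "<body><p>Intro</p>"
                + "<img src=\"downloadAttachment?id=7\" width=\"800\" height=\"600\" alt=\"diagram\"></body>";
        Document doc = Jsoup.parse(html);
        int counter = 0;
        for (Element img : doc.select("img")) {
            img.removeAttr("width");   // drop size attributes, as the exporter does
            img.removeAttr("height");
            counter++;
            img.attr("src", "/tmp/img-" + counter + ".png"); // placeholder temporary file path
            img.attr("alt", String.valueOf(counter));        // key used later to look up the caption
        }
        // Inner HTML of <body> with the in-place edits applied:
        System.out.println(doc.body().html());
    }
}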
From source file:com.digitalpebble.storm.crawler.bolt.JSoupParserBolt.java
@Override public void execute(Tuple tuple) { byte[] content = tuple.getBinaryByField("content"); String url = tuple.getStringByField("url"); Metadata metadata = (Metadata) tuple.getValueByField("metadata"); // check that its content type is HTML // look at value found in HTTP headers boolean CT_OK = false; String httpCT = metadata.getFirstValue(HttpHeaders.CONTENT_TYPE); if (StringUtils.isNotBlank(httpCT)) { if (httpCT.toLowerCase().contains("html")) { CT_OK = true;/* w ww. j a v a2 s.c om*/ } } // simply ignore cases where the content type has not been set // TODO sniff content with Tika? else { CT_OK = true; } if (!CT_OK) { String errorMessage = "Exception content-type " + httpCT + " for " + url; RuntimeException e = new RuntimeException(errorMessage); handleException(url, e, metadata, tuple, "content-type checking", errorMessage); return; } LOG.info("Parsing : starting {}", url); long start = System.currentTimeMillis(); String charset = getContentCharset(content, metadata); // get the robots tags from the fetch metadata RobotsTags robotsTags = new RobotsTags(metadata); Map<String, List<String>> slinks; String text; DocumentFragment fragment; try (ByteArrayInputStream bais = new ByteArrayInputStream(content)) { org.jsoup.nodes.Document jsoupDoc = Jsoup.parse(bais, charset, url); fragment = JSoupDOMBuilder.jsoup2HTML(jsoupDoc); // extracts the robots directives from the meta tags robotsTags.extractMetaTags(fragment); // store a normalised representation in metadata // so that the indexer is aware of it robotsTags.normaliseToMetadata(metadata); // do not extract the links if no follow has been set // and we are in strict mode if (robotsTags.isNoFollow() && robots_noFollow_strict) { slinks = new HashMap<String, List<String>>(0); } else { Elements links = jsoupDoc.select("a[href]"); slinks = new HashMap<String, List<String>>(links.size()); for (Element link : links) { // abs:href tells jsoup to return fully qualified domains // for // relative urls. // e.g.: /foo will resolve to http://shopstyle.com/foo String targetURL = link.attr("abs:href"); // nofollow boolean noFollow = "nofollow".equalsIgnoreCase(link.attr("rel")); // remove altogether if (noFollow && robots_noFollow_strict) { continue; } // link not specifically marked as no follow // but whole page is if (!noFollow && robotsTags.isNoFollow()) { noFollow = true; } String anchor = link.text(); if (StringUtils.isNotBlank(targetURL)) { // any existing anchors for the same target? 
List<String> anchors = slinks.get(targetURL); if (anchors == null) { anchors = new LinkedList<String>(); slinks.put(targetURL, anchors); } // track the anchors only if no follow is false if (!noFollow && StringUtils.isNotBlank(anchor)) { anchors.add(anchor); } } } } text = jsoupDoc.body().text(); } catch (Throwable e) { String errorMessage = "Exception while parsing " + url + ": " + e; handleException(url, e, metadata, tuple, "content parsing", errorMessage); return; } // store identified charset in md metadata.setValue("parse.Content-Encoding", charset); long duration = System.currentTimeMillis() - start; LOG.info("Parsed {} in {} msec", url, duration); List<Outlink> outlinks = toOutlinks(url, metadata, slinks); ParseResult parse = new ParseResult(); parse.setOutlinks(outlinks); // parse data of the parent URL ParseData parseData = parse.get(url); parseData.setMetadata(metadata); parseData.setText(text); parseData.setContent(content); // apply the parse filters if any try { parseFilters.filter(url, content, fragment, parse); } catch (RuntimeException e) { String errorMessage = "Exception while running parse filters on " + url + ": " + e; handleException(url, e, metadata, tuple, "content filtering", errorMessage); return; } if (emitOutlinks) { for (Outlink outlink : outlinks) { collector.emit(StatusStreamName, tuple, new Values(outlink.getTargetURL(), outlink.getMetadata(), Status.DISCOVERED)); } } // emit each document/subdocument in the ParseResult object // there should be at least one ParseData item for the "parent" URL for (Map.Entry<String, ParseData> doc : parse) { ParseData parseDoc = doc.getValue(); collector.emit(tuple, new Values(doc.getKey(), parseDoc.getContent(), parseDoc.getMetadata(), parseDoc.getText())); } collector.ack(tuple); eventCounter.scope("tuple_success").incr(); }
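Reduced to its jsoup essentials, the bolt parses the fetched bytes with Jsoup.parse(InputStream, charset, baseUri), collects outlinks from a[href] using the abs:href pseudo-attribute, and takes the page text from jsoupDoc.body().text(). A self-contained sketch of those steps, with made-up content bytes standing in for the tuple fields:

import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class ParseBytesSketch {
    public static void main(String[] args) throws Exception {
        // Made-up stand-ins for the tuple's content bytes, url and detected charset.
        byte[] content = "<body><a href=\"/foo\" rel=\"nofollow\">Foo</a><p>Some page text</p></body>"
                .getBytes(StandardCharsets.UTF_8);
        String url = "http://shopstyle.com/";
        String charset = "UTF-8";
        try (ByteArrayInputStream bais = new ByteArrayInputStream(content)) {
            Document jsoupDoc = Jsoup.parse(bais, charset, url);
            for (Element link : jsoupDoc.select("a[href]")) {
                String targetURL = link.attr("abs:href"); // resolved against the base url: http://shopstyle.com/foo
                boolean noFollow = "nofollow".equalsIgnoreCase(link.attr("rel"));
                System.out.println(targetURL + " noFollow=" + noFollow + " anchor=" + link.text());
            }
            String text = jsoupDoc.body().text(); // the plain text stored on the ParseData
            System.out.println(text);
        }
    }
}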
From source file:com.digitalpebble.stormcrawler.bolt.JSoupParserBolt.java
@Override public void execute(Tuple tuple) { byte[] content = tuple.getBinaryByField("content"); String url = tuple.getStringByField("url"); Metadata metadata = (Metadata) tuple.getValueByField("metadata"); LOG.info("Parsing : starting {}", url); // check that its content type is HTML // look at value found in HTTP headers boolean CT_OK = false; String mimeType = metadata.getFirstValue(HttpHeaders.CONTENT_TYPE); if (detectMimeType) { mimeType = guessMimeType(url, mimeType, content); // store identified type in md metadata.setValue("parse.Content-Type", mimeType); }// w w w . jav a 2 s . c om if (StringUtils.isNotBlank(mimeType)) { if (mimeType.toLowerCase().contains("html")) { CT_OK = true; } } // go ahead even if no mimetype is available else { CT_OK = true; } if (!CT_OK) { if (this.treat_non_html_as_error) { String errorMessage = "Exception content-type " + mimeType + " for " + url; RuntimeException e = new RuntimeException(errorMessage); handleException(url, e, metadata, tuple, "content-type checking", errorMessage); } else { LOG.info("Incorrect mimetype - passing on : {}", url); collector.emit(tuple, new Values(url, content, metadata, "")); collector.ack(tuple); } return; } long start = System.currentTimeMillis(); String charset = getContentCharset(content, metadata); // get the robots tags from the fetch metadata RobotsTags robotsTags = new RobotsTags(metadata); Map<String, List<String>> slinks; String text = ""; DocumentFragment fragment; try (ByteArrayInputStream bais = new ByteArrayInputStream(content)) { org.jsoup.nodes.Document jsoupDoc = Jsoup.parse(bais, charset, url); fragment = JSoupDOMBuilder.jsoup2HTML(jsoupDoc); // extracts the robots directives from the meta tags robotsTags.extractMetaTags(fragment); // store a normalised representation in metadata // so that the indexer is aware of it robotsTags.normaliseToMetadata(metadata); // do not extract the links if no follow has been set // and we are in strict mode if (robotsTags.isNoFollow() && robots_noFollow_strict) { slinks = new HashMap<>(0); } else { Elements links = jsoupDoc.select("a[href]"); slinks = new HashMap<>(links.size()); for (Element link : links) { // abs:href tells jsoup to return fully qualified domains // for // relative urls. // e.g.: /foo will resolve to http://shopstyle.com/foo String targetURL = link.attr("abs:href"); // nofollow boolean noFollow = "nofollow".equalsIgnoreCase(link.attr("rel")); // remove altogether if (noFollow && robots_noFollow_strict) { continue; } // link not specifically marked as no follow // but whole page is if (!noFollow && robotsTags.isNoFollow()) { noFollow = true; } String anchor = link.text(); if (StringUtils.isNotBlank(targetURL)) { // any existing anchors for the same target? 
List<String> anchors = slinks.get(targetURL); if (anchors == null) { anchors = new LinkedList<>(); slinks.put(targetURL, anchors); } // track the anchors only if no follow is false if (!noFollow && StringUtils.isNotBlank(anchor)) { anchors.add(anchor); } } } } Element body = jsoupDoc.body(); if (body != null) { text = body.text(); } } catch (Throwable e) { String errorMessage = "Exception while parsing " + url + ": " + e; handleException(url, e, metadata, tuple, "content parsing", errorMessage); return; } // store identified charset in md metadata.setValue("parse.Content-Encoding", charset); long duration = System.currentTimeMillis() - start; LOG.info("Parsed {} in {} msec", url, duration); List<Outlink> outlinks = toOutlinks(url, metadata, slinks); ParseResult parse = new ParseResult(); parse.setOutlinks(outlinks); // parse data of the parent URL ParseData parseData = parse.get(url); parseData.setMetadata(metadata); parseData.setText(text); parseData.setContent(content); // apply the parse filters if any try { parseFilters.filter(url, content, fragment, parse); } catch (RuntimeException e) { String errorMessage = "Exception while running parse filters on " + url + ": " + e; handleException(url, e, metadata, tuple, "content filtering", errorMessage); return; } if (emitOutlinks) { for (Outlink outlink : parse.getOutlinks()) { collector.emit(StatusStreamName, tuple, new Values(outlink.getTargetURL(), outlink.getMetadata(), Status.DISCOVERED)); } } // emit each document/subdocument in the ParseResult object // there should be at least one ParseData item for the "parent" URL for (Map.Entry<String, ParseData> doc : parse) { ParseData parseDoc = doc.getValue(); collector.emit(tuple, new Values(doc.getKey(), parseDoc.getContent(), parseDoc.getMetadata(), parseDoc.getText())); } collector.ack(tuple); eventCounter.scope("tuple_success").incr(); }
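The newer StormCrawler version of the bolt differs from the one above in a detail that matters for body(): it checks for a null body before calling text(). With the default HTML parser a <body> is always synthesized, but body() can return null when the content was parsed without one (for example via jsoup's XML parser), so the guard avoids a NullPointerException. A sketch of that defensive pattern:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.parser.Parser;

public class BodyNullCheckSketch {
    public static void main(String[] args) {
        // The default HTML parser always synthesizes a <body>, but other inputs may not
        // have one; an XML payload parsed with the XML parser is one such case.
        Document doc = Jsoup.parse("<feed><entry>hi</entry></feed>", "", Parser.xmlParser());
        String text = "";
        Element body = doc.body();
        if (body != null) { // the defensive check used by the newer bolt
            text = body.text();
        }
        System.out.println("extracted text: '" + text + "'");
    }
}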