Example usage for org.jsoup.nodes Document body

Introduction

On this page you can find example usages of org.jsoup.nodes.Document.body().

Prototype

public Element body() 

Source Link

Document

Accessor to the document's body element.
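
A minimal, self-contained sketch of the accessor, using a made-up HTML snippet for illustration:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class BodyExample {
    public static void main(String[] args) {
        // Parse a small hypothetical HTML snippet.
        Document doc = Jsoup.parse(
                "<html><head><title>t</title></head><body><p>Hello, body!</p></body></html>");
        // body() returns the document's <body> element.
        Element body = doc.body();
        System.out.println(body.text()); // prints: Hello, body!
    }
}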

Usage

From source file:com.crawler.app.run.CrawlSite.java

@Override
public void visit(Page page) {
    String url = page.getWebURL().getURL();
    // logger.info("URL: ", url);
    if (ReadXmlConfig() && readXmlConfigDatabase()) {
        status_read_xml = true;
    } else {
        return;
    }

    System.out.println("\n URL visit: " + url);

    if (page.getParseData() instanceof HtmlParseData) {

        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String text = htmlParseData.getText();
        String html = htmlParseData.getHtml();
        String title = htmlParseData.getTitle();
        Document doc = Jsoup.parse(html, url); // second argument is the base URI, not a charset
        Element body = doc.body();
        Elements listDetail = body.select(bodySelect);
        Integer i = 0;
        Integer siteID = siteIDXML;
        Integer provinceID = 1;
        MysqlCrawler.createConn(host, port, dbName, dbUser, dbPwd);
        String jobImage, jobUrl, aJobName, cJobLocation = null, cLocationNear = "", bJobCompany = "",
                dJobCareer, eJobSalary, gJobDescription, gJobDetailShort, gJobDetail, jobDetailImage,
                jobDetailImageName, hJobExpire = null;
        for (Element detail : listDetail) {
            i++;
            try {
                jobImage = "";
                /* job img */

                if (!jobImgQuery.isEmpty()) {
                    if (jobImagePosition > -1) {
                        if (jobImagePosition < detail.select(jobImgQuery).size()) {
                            if (!detail.select(jobImgQuery).get(jobImagePosition).attr(jobImageFormatAttr)
                                    .isEmpty()) {
                                if (!jobImgUrl.isEmpty()) {
                                    if (JobImageSelectPosition.isEmpty()) {
                                        jobImage = jobImgUrl + detail.select(jobImgQuery).get(jobImagePosition)
                                                .attr(jobImageFormatAttr);
                                    } else {
                                        jobImage = jobImgUrl + detail.select(jobImgQuery).get(jobImagePosition)
                                                .select(JobImageSelectPosition).attr(jobImageFormatAttr);
                                    }
                                } else {
                                    if (JobImageSelectPosition.isEmpty()) {
                                        jobImage = detail.select(jobImgQuery).get(jobImagePosition)
                                                .attr(jobImageFormatAttr);
                                    } else {
                                        jobImage = detail.select(jobImgQuery).get(jobImagePosition)
                                                .select(JobImageSelectPosition).attr(jobImageFormatAttr);
                                    }
                                }
                            }
                        }
                    } else {
                        if (!detail.select(jobImgQuery).attr(jobImageFormatAttr).isEmpty()) {
                            if (!jobImgUrl.isEmpty()) {
                                jobImage = jobImgUrl
                                        + detail.select(jobImgQuery).first().attr(jobImageFormatAttr);
                            } else {
                                jobImage = detail.select(jobImgQuery).first().attr(jobImageFormatAttr);
                            }
                        }
                    }
                }
                /* job url */
                jobUrl = "";
                if (!jobUrlQuery.isEmpty()) {
                    if (jobUrlPosition > -1) {
                        if (jobUrlPosition < detail.select(jobUrlQuery).size()) {
                            if (!joburl_url.isEmpty()) {
                                if (JobUrlSelectPosition.isEmpty()) {
                                    jobUrl = joburl_url + detail.select(jobUrlQuery).get(jobUrlPosition)
                                            .attr(jobUrlFormatAttr);
                                } else {
                                    jobUrl = joburl_url + detail.select(jobUrlQuery).get(jobUrlPosition)
                                            .select(JobUrlSelectPosition).attr(jobUrlFormatAttr);
                                }
                            } else {
                                if (JobUrlSelectPosition.isEmpty()) {
                                    jobUrl = detail.select(jobUrlQuery).get(jobUrlPosition)
                                            .attr(jobUrlFormatAttr);
                                } else {
                                    jobUrl = detail.select(jobUrlQuery).get(jobUrlPosition)
                                            .select(JobUrlSelectPosition).attr(jobUrlFormatAttr);
                                }
                            }
                        }
                    } else {
                        if (!joburl_url.isEmpty()) {
                            jobUrl = joburl_url + detail.select(jobUrlQuery).first().attr(jobUrlFormatAttr);
                        } else {
                            jobUrl = detail.select(jobUrlQuery).first().attr(jobUrlFormatAttr);
                        }
                    }
                }

                // change
                org.jsoup.nodes.Element detailJobUrl = convertUrlToDocument(jobUrl);
                //System.out.print(detailJobUrl);
                //System.exit(1);
                /* job location */
                if (!jobLocationQuery.isEmpty()) {
                    if (jobLocationFormatData.toUpperCase().equals("TEXT")) {
                        cJobLocation = detailJobUrl.select(jobLocationQuery).text();
                    } else if (jobLocationFormatData.toUpperCase().equals("HTML")) {
                        cJobLocation = detailJobUrl.select(jobLocationQuery).html();
                    }
                }

                /* job name */
                aJobName = "";
                if (jobNameFormatData.toUpperCase().equals("TEXT")) {
                    aJobName = detailJobUrl.select(jobNameQuery).text();
                } else if (jobNameFormatData.toUpperCase().equals("HTML")) {
                    aJobName = detailJobUrl.select(jobNameQuery).html();
                }

                /* job description */
                gJobDescription = "";
                if (!JobDescriptionQuery.isEmpty()) {
                    if (jobDescriptionFormatData.toUpperCase().equals("TEXT")) {
                        gJobDescription = detailJobUrl.select(JobDescriptionQuery).text();
                    } else if (jobDescriptionFormatData.toUpperCase().equals("HTML")) {
                        gJobDescription = detailJobUrl.select(JobDescriptionQuery).html();
                    }
                }
                /* job detail short */
                gJobDetailShort = "";
                if (!JobDetailShortQuery.isEmpty()) {
                    if (jobDetailShortFormatData.toUpperCase().equals("TEXT")) {
                        gJobDetailShort = detailJobUrl.select(JobDetailShortQuery).text();
                    } else if (jobDetailShortFormatData.toUpperCase().equals("HTML")) {
                        gJobDetailShort = detailJobUrl.select(JobDetailShortQuery).html();
                    }
                }
                /* job detail */
                gJobDetail = "";
                if (!JobDetailQuery.isEmpty()) {
                    if (jobDetailFormatData.toUpperCase().equals("TEXT")) {
                        gJobDetail = detailJobUrl.select(JobDetailQuery).text();
                    } else if (jobDetailFormatData.toUpperCase().equals("HTML")) {
                        gJobDetail = detailJobUrl.select(JobDetailQuery).html();
                    }
                }
                /* job detail img*/
                jobDetailImage = "";
                jobDetailImageName = "";
                if (!jobDetailImgQuery.isEmpty()) {
                    if (jobDetailImagePosition > -1) {
                        if (jobDetailImagePosition < detailJobUrl.select(jobDetailImgQuery).size()) {
                            if (!detailJobUrl.select(jobDetailImgQuery).get(jobDetailImagePosition)
                                    .attr(jobDetailImageFormatAttr).isEmpty()) {
                                if (!jobDetailImgUrl.isEmpty()) {
                                    if (JobDetailImageSelectPosition.isEmpty()) {
                                        jobDetailImage = jobDetailImgUrl + detailJobUrl
                                                .select(jobDetailImgQuery).get(jobDetailImagePosition)
                                                .attr(jobDetailImageFormatAttr);
                                    } else {
                                        jobDetailImage = jobDetailImgUrl + detailJobUrl
                                                .select(jobDetailImgQuery).get(jobDetailImagePosition)
                                                .select(JobDetailImageSelectPosition)
                                                .attr(jobDetailImageFormatAttr);
                                    }
                                } else {
                                    if (JobDetailImageSelectPosition.isEmpty()) {
                                        jobDetailImage = detailJobUrl.select(jobDetailImgQuery)
                                                .get(jobDetailImagePosition).attr(jobDetailImageFormatAttr);
                                    } else {
                                        jobDetailImage = detailJobUrl.select(jobDetailImgQuery)
                                                .get(jobDetailImagePosition)
                                                .select(JobDetailImageSelectPosition)
                                                .attr(jobDetailImageFormatAttr);
                                    }
                                }
                            }
                        }
                    } else {
                        if (!detailJobUrl.select(jobDetailImgQuery).attr(jobDetailImageFormatAttr).isEmpty()) {
                            if (!jobDetailImgUrl.isEmpty()) {
                                jobDetailImage = jobDetailImgUrl + detailJobUrl.select(jobDetailImgQuery)
                                        .first().attr(jobDetailImageFormatAttr);
                            } else {
                                jobDetailImage = detailJobUrl.select(jobDetailImgQuery).first()
                                        .attr(jobDetailImageFormatAttr);
                            }
                        }
                    }
                    if (!jobDetailImage.isEmpty()) {
                        jobDetailImageName = DownloadImage.downloadImage(jobDetailImage, "D:/Java/storage");
                    }
                }
                /* job location near */
                cLocationNear = "";
                if (!locationNearQuery.isEmpty()) {
                    if (locationNearFormatData.toUpperCase().equals("TEXT")) {
                        cLocationNear = detailJobUrl.select(locationNearQuery).text();
                    } else if (locationNearFormatData.toUpperCase().equals("HTML")) {
                        cLocationNear = detailJobUrl.select(locationNearQuery).html();
                    }
                }
                /* job salary */
                eJobSalary = "";
                if (!JobSalaryQuery.isEmpty()) {
                    if (jobSalaryFormatData.toUpperCase().equals("TEXT")) {
                        eJobSalary = detailJobUrl.select(JobSalaryQuery).text();
                    } else if (jobSalaryFormatData.toUpperCase().equals("HTML")) {
                        eJobSalary = detailJobUrl.select(JobSalaryQuery).html();
                    }
                }

                /* job expire */
                hJobExpire = "";
                if (!JobExpireQuery.isEmpty()) {
                    if (jobExpireFormatData.toUpperCase().equals("TEXT")) {
                        hJobExpire = detailJobUrl.select(JobExpireQuery).text();
                    } else if (jobExpireFormatData.toUpperCase().equals("HTML")) {
                        hJobExpire = detailJobUrl.select(JobExpireQuery).html();
                    }
                }
                /* job company */
                bJobCompany = "";
                if (!JobCompanyQuery.isEmpty()) {
                    if (jobCompanyFormatData.toUpperCase().equals("TEXT")) {
                        bJobCompany = detailJobUrl.select(JobCompanyQuery).text();
                    } else if (jobCompanyFormatData.toUpperCase().equals("HTML")) {
                        bJobCompany = detailJobUrl.select(JobCompanyQuery).html();
                    }
                }
                /* job type */
                String fJobType = "";
                if (!JobTypeQuery.isEmpty()) {
                    if (jobTypeFormatData.toUpperCase().equals("TEXT")) {
                        fJobType = detailJobUrl.select(JobTypeQuery).text();
                    } else if (jobTypeFormatData.toUpperCase().equals("HTML")) {
                        fJobType = detailJobUrl.select(JobTypeQuery).html();
                    }
                }
                /* job address */
                String jobAddress = "";
                if (!JobAddressQuery.isEmpty()) {
                    if (jobAddressFormatData.toUpperCase().equals("TEXT")) {
                        jobAddress = detailJobUrl.select(JobAddressQuery).text();
                    } else if (jobAddressFormatData.toUpperCase().equals("HTML")) {
                        jobAddress = detailJobUrl.select(JobAddressQuery).html();
                    }
                }
                dJobCareer = "";
                if (!JobCareerQuery.isEmpty()) {
                    if (jobCareerFormatData.toUpperCase().equals("TEXT")) {
                        dJobCareer = detailJobUrl.select(JobCareerQuery).text();
                    } else if (jobCareerFormatData.toUpperCase().equals("HTML")) {
                        dJobCareer = detailJobUrl.select(JobCareerQuery).html();
                    }
                }

                System.out.println("\n Url : " + jobUrl);
                System.out.println("\n Image : " + jobImage);
                System.out.println("\n Title : " + aJobName);
                System.out.println("\n Title SEO : " + StringUtils.removeAccent(aJobName));
                //System.out.println("\n Location : " + cJobLocation + "\n"
                // + cLocationNear);
                System.out.println("\n jobDetailImageName : " + jobDetailImageName);
                // System.out.println("\n Detail : " + gJobDetail);
                // System.out.println("\n Salary : " + eJobSalary);
                // System.out.println("\n expire Date : " + hJobExpire);
                // System.out.println("\n Company : " + bJobCompany);
                // System.out.println("\n JobType : " + fJobType);
                //
                System.out.println("\n Full I : " + i);
                String news_title = aJobName;
                String news_title_seo = StringUtils.removeAccent(aJobName);
                String news_meta = aJobName;
                String news_description = gJobDescription;
                String news_tag = aJobName.replace(" ", ", ");
                String news_pic = jobDetailImageName;
                String pic_note = aJobName;
                String news_subcontent = "<p>" + gJobDescription + "</p>";
                String news_content = gJobDetailShort + "<p><img src='http://" + jobDetailImageName + "'></p>"
                        + gJobDetail;
                int type = 4;
                int status = 0;
                int kind = 0;
                String source = "Theo http://monngonmoingay.com";
                String author = null;
                int user_posted = 0;
                int user_activated = 0;
                int cate_id = 43;
                String list_productid_relation = "13,28,30";

                if (!MysqlCrawler.getInstance().checkNewsUrl(news_title_seo)) {
                    MysqlCrawler.getInstance().insertNewsContent(news_title, news_title_seo, news_meta,
                            news_description, news_tag, news_pic, pic_note, news_subcontent, news_content, type,
                            status, kind, source, author, user_posted, user_activated, cate_id,
                            list_productid_relation);
                }

                // System.exit(1);
            } catch (Exception ex) {
                System.out.println("\n Fail I : " + i);
                System.out.println("\n Ex : " + ex);
            }
        }

    }

    /*
     * Header[] responseHeaders = page.getFetchResponseHeaders(); if
     * (responseHeaders != null) { logger.debug("Response headers:"); for
     * (Header header : responseHeaders) { logger.debug("\t{}: {}",
     * header.getName(), header.getValue()); } }
     */
    logger.debug("=============");
}
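
The visit method above repeats the same TEXT/HTML branch for every field. A helper along the following lines could replace each branch; extractField and its parameters are hypothetical names, not part of the original crawler:

/** Returns the matched elements' text() or html() under root,
 *  depending on the configured format ("TEXT" or "HTML"). */
private static String extractField(org.jsoup.nodes.Element root, String query, String format) {
    if (query == null || query.isEmpty()) {
        return "";
    }
    if ("TEXT".equalsIgnoreCase(format)) {
        return root.select(query).text();
    }
    if ("HTML".equalsIgnoreCase(format)) {
        return root.select(query).html();
    }
    return "";
}

Each block such as the job-location extraction would then collapse to a single call, e.g. cJobLocation = extractField(detailJobUrl, jobLocationQuery, jobLocationFormatData);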

From source file:com.spd.ukraine.lucenewebsearch1.web.IndexingController.java

/**
 * Method used to perform recursive indexing of a given web page
 * in the search database.
 *
 * @param webPage webPage.url is entered url
 * webPage.title is set
 * @param html Jsoup.Document of entered url
 * @param recursionNumber used to stop recursion at exceeding 
 * MAX_RECURSION_SEARCH_NUMBER
 */
private void indexElements(WebPage webPage, Document html, final int recursionNumber)
        throws IOException, ParseException {
    String title = html.title();
    if (referencedTitles.contains(title.trim())) {
        return;
    }
    referencedTitles.add(title.trim());
    webPage.setTitle(title);
    if (containsPage(webPage)) {
        System.out.println(webPage.getUrl() + " is already indexed");
        return;
    }
    Element prevElement = null;
    Elements elements = html.body().getAllElements(); //.getElementsByTag("a");
    addDoc(webPage, html.text());
    //        for (Element element : elements) {
    ////                System.out.println(element.nodeName() + " element.text() " 
    ////                        + element.text() + " url " 
    ////                        + element.absUrl("href"));
    //            if (element.nodeName().equalsIgnoreCase("body")) {
    //                addDoc(webPage, element.text());
    //                break;
    ////                continue;
    //            }
    //            if (null == prevElement) {
    //                prevElement = element;
    ////            } else if (prevElementContainsElementText(prevElement, element)) {
    ////                continue;
    //            }
    ////            if (null !== webPagesService.findWebPage(element.absUrl("href")))
    //            if (element.text().trim().isEmpty()) {
    //                continue;
    //            }
    ////            StringTokenizer str = new StringTokenizer(element.text());
    ////            str.
    //            addDoc(webPage, element.text());
    //        }
    if (recursionNumber > MAX_RECURSION_SEARCH_NUMBER || referencedSites.size() > MAX_NUMBER_SITES_INDEXED) {
        //            System.out.println(recursionNumber + " " 
        //                    + referencedSites.contains(webPage.getUrl()));
        return;
    }
    elements.parallelStream()
            .filter((Element e) -> e.nodeName().equalsIgnoreCase("a") && null != e.absUrl(HREF)
                    && !e.absUrl(HREF).trim().isEmpty() && !referencedSites.contains(e.absUrl(HREF))
                    && !referencedSites.contains(removeSharpEtc(e.absUrl(HREF))))
            .forEach((Element element) -> {
                WebPage webPage1 = new WebPage(element.absUrl(HREF));
                String url1 = webPage1.getUrl();
                //                    System.out.println(recursionNumber + " recursion for '" 
                //                            + url1 + "'");
                try {
                    Document htmlR = Jsoup.connect(url1).get();
                    indexElements(webPage1, htmlR, recursionNumber + 1);
                } catch (IOException | ParseException e) {
                    System.out.println("Exception " + e.getMessage());
                }
                referencedSites.add(url1);
            });
    //        for (Element element : elements) {
    //            if (!element.nodeName().equalsIgnoreCase("a")) {
    //                continue;
    //            }
    //            WebPage webPage1 = new WebPage(element.absUrl("href"));
    //            if (null == webPage1.getUrl() 
    //                    || webPage1.getUrl().isEmpty()
    //                    || referencedSites.contains(webPage1.getUrl())) {
    //                continue;
    //            }
    //            System.out.println(recursionNumber + "recursion for " 
    //                    + element.absUrl("href"));
    //            try {
    //                Document htmlR = Jsoup.connect(webPage1.getUrl()).get();
    //                webPage1.setTitle(htmlR.title());
    //                indexElements(webPage1, htmlR, recursionNumber + 1);
    //            } catch (IOException e) {
    //                System.out.println("IOException " + e.getMessage());
    //            }
    //            referencedSites.add(webPage1.getUrl());
    //        }
}
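
Filtering body().getAllElements() for anchor tags works, but jsoup can also select anchors directly. A reduced sketch of the same link extraction, with an illustrative URL:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class LinkExtractor {
    public static void main(String[] args) throws java.io.IOException {
        Document html = Jsoup.connect("https://example.com/").get();
        // select("a[href]") finds anchors under the body, and absUrl()
        // resolves relative hrefs against the document's base URI.
        for (Element a : html.body().select("a[href]")) {
            String href = a.absUrl("href");
            if (!href.trim().isEmpty()) {
                System.out.println(href);
            }
        }
    }
}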

From source file:lucene.IndexFiles.java

/** Indexes a single document */
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {

    try (InputStream stream = Files.newInputStream(file)) {

        // make a new, empty document
        System.out.println("Test 3.1");
        BufferedReader reader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));
        String line = null;
        StringBuilder stringBuilder = new StringBuilder();
        String ls = System.getProperty("line.separator");

        try {
            while ((line = reader.readLine()) != null) {
                stringBuilder.append(line);
                stringBuilder.append(ls);
            }

        } finally {
            reader.close();
        }

        //index file name
        Field fileNameField = new StringField("name", file.getFileName().toString(), Field.Store.YES);

        // Add the path of the file as a field named "path".  Use a
        // field that is indexed (i.e. searchable), but don't tokenize 
        // the field into separate words and don't index term frequency
        // or positional information:
        Field pathField = new StringField("path", file.toString(), Field.Store.YES);

        // Add the last modified date of the file a field named "modified".
        // Use a LongPoint that is indexed (i.e. efficiently filterable with
        // PointRangeQuery).  This indexes to milli-second resolution, which
        // is often too fine.  You could instead create a number based on
        // year/month/day/hour/minutes/seconds, down the resolution you require.
        // For example the long value 2011021714 would mean
        // February 17, 2011, 2-3 PM.

        // Add the contents of the file to a field named "contents".  Specify a Reader,
        // so that the text of the file is tokenized and indexed, but not stored.
        // Note that FileReader expects the file to be in UTF-8 encoding.
        // If that's not the case searching for special characters will fail.

        String file_content = stringBuilder.toString();
        //System.out.println(file_content);
        //String[] passages = file_content.split("<P|<p");
        //String[] passages = file_content.split("<P");
        //String[] passages = file_content.split("<P>|<H1>|<H2>|<H3>|<H4>|<H5>|<H6>|<BR>|<HR>|<TABLE>|<TD>|<TH>|<TR>|<OL>|<UL>|<p>|<br>|<hr>");//|<p|<h1|<h2|<h3|<h4|<h5|<h6|<br|<hr|<table|<td|<th|<tr|<ol|<ul");
        String[] passages = file_content.split(
                "(?i)<P|(?i)<H1|(?i)<H2|(?i)<H3|(?i)<H4|(?i)<H5|(?i)<H6|(?i)<BR|(?i)<HR|(?i)<TABLE|(?i)<TD|(?i)<TH|(?i)<TR|(?i)<OL|(?i)<UL");//|<p|<h1|<h2|<h3|<h4|<h5|<h6|<br|<hr|<table|<td|<th|<tr|<ol|<ul");

        //String[] passages = StringUtils.substringsBetween(file_content, "<P", "<P");
        //String[] title = StringUtils.substringsBetween(file_content, "<body>", "</");
        //System.out.println("path");
        //String title = passages[0];
        String title;
        Document dochtml;// = Jsoup.parse(title);
        String ptitle = ""; //= dochtml.body().text();
        //System.out.println("Title is" + ptitle);
        //Field titleField = new StringField("title", ptitle, Field.Store.YES);

        ///////------FORMATING TEXT---------
        StandardTokenizer stdToken = new StandardTokenizer();
        //Tokenizer stdToken = new WhitespaceTokenizer();
        EnglishMinimalStemmer stemmer = new EnglishMinimalStemmer();

        //stdToken.setReader(new StringReader("Some stuff that is in need of analysis. stuff patients PATIENT d > 0.5 Dnn>Bnn D.N.A diseases heart attacks at cl-fo"));

        // Your code starts here
        final List<String> stopWords = new ArrayList<>();
        String f = "E:/stopwords_en.txt";

        try (BufferedReader br = new BufferedReader(new FileReader(f))) {

            String topic;
            //int qid = 200;//cntr=0;
            while ((topic = br.readLine()) != null) {
                stopWords.add(topic.trim());
            }
        }
        final CharArraySet stopSet = new CharArraySet(stopWords, false);

        //////------FORMATING TEXT---------
        if (passages != null) {
            int j = 0;
            if (passages.length > 1) {
                title = passages[1].split("</P|</H1|</H2|</H3|</H4|</H5|</H6|</p")[0];
                dochtml = Jsoup.parse(title);
                ptitle = dochtml.body().text().toLowerCase();
                System.out.println("Title is" + ptitle);
            }
            for (int i = 0; i < passages.length; i++) {

                //System.out.println(i);
                //cnames = cname.split(":");
                //cname =  cnames[0];
                String[] passage_contents = passages[i].split("</P|</p");
                //String[] passage_contents = passages[i].split("</P");
                String passage_content = passage_contents[0];
                //if(passage_content.trim().isEmpty()){
                //  System.out.println("abc");
                //continue;
                //}
                dochtml = Jsoup.parse(passage_content);
                String plainStr = dochtml.body().text();
                String[] validpas = plainStr.split(" ");

                if (validpas.length > 9) {
                    j++;
                    Field passageId = new StringField("id", file.getFileName().toString() + "." + i,
                            Field.Store.YES);

                    org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();
                    doc.add(fileNameField);
                    doc.add(pathField);
                    doc.add(passageId);
                    //doc.add(titleField);
                    doc.add(new StringField("offset", file_content.indexOf(passage_content) + "",
                            Field.Store.YES));
                    doc.add(new StringField("length", passage_content.length() + "", Field.Store.YES));
                    doc.add(new LongPoint("modified", lastModified));
                    ((org.apache.lucene.document.Document) doc).add(new TextField("title", ptitle, Store.YES));
                    //System.out.println(passage_content);
                    //InputStream is = new ByteArrayInputStream(passage_content.getBytes());

                    //String strippedText = passage_content.replaceAll("(?s)<[^>]*>(\\s*<[^>]*>)*", " ");

                    //--------TEXT PROCESSING------------
                    TokenStream tokenStream;
                    //String nplainstr = plainStr.replaceAll("-", ".zz");
                    //stdToken.setReader(new StringReader(nplainstr));
                    stdToken.setReader(new StringReader(plainStr));

                    tokenStream = new StopFilter(
                            new ASCIIFoldingFilter(new ClassicFilter(new LowerCaseFilter(stdToken))), stopSet);

                    //tokenStream = new PorterStemFilter(tokenStream);
                    tokenStream.reset();
                    //int l=0;
                    String term = "";
                    StringBuilder sb = new StringBuilder();
                    //OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
                    CharTermAttribute charTermAttr = tokenStream.getAttribute(CharTermAttribute.class);
                    try {
                        //int l;
                        while (tokenStream.incrementToken()) {
                            if (sb.length() > 0) {
                                sb.append(" ");
                            }
                            term = charTermAttr.toString();
                            /*if(term.contains(".zz")){
                               term = term.replaceAll(".zz", "-");
                               String[] terms=term.split("-");
                               String at="";
                               for(String t : terms){
                                  //l = stemmer.stem(t.toCharArray(), t.length());
                                  //t = t.substring(0, l); 
                                  //sb.append(t.toString(),0,l);
                                   sb.append(t + " ");
                                   at = at+t;
                               }
                                       
                               sb.append(at + " ");
                            }*/

                            if (term.contains(".") && !term.matches(".*\\d+.*")) {//&& StringUtils.isAlpha(term)){
                                term = term.replaceAll("\\.", "");
                                //sb.append(term);
                            }
                            //int l = stemmer.stem(charTermAttr.toString().toCharArray(), charTermAttr.toString().length());
                            int l;
                            l = stemmer.stem(term.toCharArray(), term.length());
                            //sb.append(charTermAttr.toString(),0,l);
                            sb.append(term, 0, l);
                            //sb.append(term);

                            /*if(term.contains("-")){
                               String[] terms=term.split("-");
                               String at="";
                               for(String t : terms){
                                   sb.append(" " + t);
                                   at = at+t;
                               }
                                       
                               sb.append(" " + at);
                            }*/
                            /*sb.append(charTermAttr.toString());
                            String[] hl = charTermAttr.toString().split("-");
                            if (hl.length > 1){
                               for(int j=0; j<hl.length; j++){
                                  sb.append(" " + hl[j]);
                               }
                                       
                               //sb.append(" " + charTermAttr.toString().split("-")[1]);
                               //sb.append(charTermAttr.toString());
                            }*/

                        }
                    } catch (IOException e) {
                        System.out.println(e.getMessage());
                    }
                    //System.out.println(sb.toString());
                    tokenStream.close();

                    ///----------END OF TExt processin----------

                    ((org.apache.lucene.document.Document) doc)
                            .add(new TextField("contents", sb.toString(), Store.YES));//new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))));
                    //doc.add(new StringField("contents", passage_content, Field.Store.YES));
                    //System.out.println(plainStr);
                    //writer.addDocument(doc);

                    if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                        n++;
                        // New index, so we just add the document (no old document can be there):
                        System.out.println(
                                ".......adding " + file.getFileName().toString() + " passage " + j + "--" + n);
                        writer.addDocument(doc);
                    } else {
                        // Existing index (an old copy of this document may have been indexed) so 
                        // we use updateDocument instead to replace the old one matching the exact 
                        // path, if present:
                        System.out.println("updating " + file);
                        writer.updateDocument(new Term("path", file.toString()), doc);
                    }

                }
            }
        }

    }

}
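
The core jsoup step in this indexer is tag stripping: each passage is parsed as an HTML fragment and read back as plain text through body(). Isolated as a sketch:

import org.jsoup.Jsoup;

public class StripTags {
    public static void main(String[] args) {
        String passage = "<P>Some <b>bold</b> passage text.</P>";
        // Jsoup wraps a bare fragment in html/head/body, so body().text()
        // yields the visible text with all tags removed.
        String plain = Jsoup.parse(passage).body().text();
        System.out.println(plain); // Some bold passage text.
    }
}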

From source file:feedzilla.Feed.java

@Override
public void run() {
    try {
        Thread.sleep((new Random()).nextInt(60 * 1000));
    } catch (InterruptedException ex) {
        Log.warn("Could not sleep Thread", ex);
    }

    Document doc = null;
    boolean get = true;
    int trysCount = 0;
    do {
        get = true;
        try {
            doc = Jsoup.connect(this.link).timeout(60 * 1000).userAgent(
                    "Mozilla/5.0 (Windows; U; WindowsNT 5.1; en-US; rv1.8.1.6) Gecko/20070725 Firefox/2.0.0.6")
                    .referrer("http://www.google.com").get();
        } catch (IOException ex) {
            Logger.getLogger(Feed.class.getName()).log(Level.SEVERE, null, ex);
            Log.warn("News " + this.category + "/" + this.subcategory + "/" + newsXMLFile.getName()
                    + " - Could not get Feed page from FeedZilla", ex);
            get = false;
            if (++trysCount > 5) {
                Log.fatal("News " + this.category + "/" + this.subcategory + "/" + newsXMLFile.getName() + " - "
                        + "Five attempts and has not yet been possible to "
                        + "retrieve the page from filezilla. Ignoring this news.");
                return;
            }
        }
    } while (!get);

    Elements elements = doc.body().select("iframe");
    for (Element element : elements) {
        try {
            this.link = URLDecoder.decode(element.attr("src"), "UTF-8");
        } catch (UnsupportedEncodingException ex) {
            Logger.getLogger(Feed.class.getName()).log(Level.SEVERE, null, ex);
            Log.fatal("News " + this.category + "/" + this.subcategory + "/" + newsXMLFile.getName() + " - "
                    + "Could not get the news link from FeedZilla pages");
            return;
        }
    }
    this.link = getUrlInParams(this.link);
    try {
        this.news = (new NewsCrawler(this.link)).getNews();
    } catch (Exception ex) {
        Log.fatal("News " + this.category + "/" + this.subcategory + "/" + newsXMLFile.getName() + " - "
                + "Could not retrieve news from link " + this.link, ex);
        return;
    }

    newsXMLFile.getParentFile().mkdirs();
    try {
        FileUtils.writeStringToFile(newsXMLFile, this.toXML());
        Log.info("News " + this.category + "/" + this.subcategory + "/" + newsXMLFile.getName()
                + " - Successfuly saved!");
        System.out.println("News " + this.category + "/" + this.subcategory + "/" + newsXMLFile.getName()
                + " - Successfuly saved!");
    } catch (IOException ex) {
        Log.error("News " + this.category + "/" + this.subcategory + "/" + newsXMLFile.getName()
                + " - Could not save news into file", ex);
    }
}
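
The body() call above narrows the iframe search to the document body. The same step in isolation, with a made-up, URL-encoded src value:

import java.net.URLDecoder;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class IframeSrc {
    public static void main(String[] args) throws Exception {
        Document doc = Jsoup.parse(
                "<body><iframe src='http%3A%2F%2Fexample.com%2Fnews'></iframe></body>");
        // Collect and decode the src attribute of each iframe in the body.
        for (Element iframe : doc.body().select("iframe")) {
            System.out.println(URLDecoder.decode(iframe.attr("src"), "UTF-8"));
        }
    }
}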

From source file:codeu.chat.client.commandline.Chat.java

private boolean parseScript(String link, String phrase, boolean springfield) {
    String[] script;

    try {
        Document doc = Jsoup.connect(link).get();

        /* If the script was retrieved from the Springfield website, the lines
           must be split up using the <br> tag instead of new line characters */
        if (springfield) {
            String temp = Jsoup.parse(doc.html().replaceAll("(?i)<br[^>]*>", "br2n")).text();
            script = mergeScriptSentences(temp.split("br2n"));
        } else {
            script = mergeScriptSentences(doc.body().text().split("\n"));
        }

        /* Search for a line containing the phrase. Once one is found,
           determine the best response and return accordingly. In some
           cases, this will mean continuing to search for a later match */
        for (int lineNum = 0; lineNum < script.length; lineNum++) {
            script[lineNum] = script[lineNum].trim().toLowerCase();
            for (String sentence : script[lineNum].split("(?<=[!\\?\\.])")) {
                if (sentence.contains(phrase)
                        || StringUtils.getLevenshteinDistance(sentence, phrase) <= phrase.length() / 3.0) {
                    if (findNextScriptResponse(lineNum, phrase, script)) {
                        return true;
                    }
                }
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return false; // Return false if no line containing the phrase was found
}
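
The springfield branch relies on a common jsoup trick: text() collapses <br> tags into spaces, so each <br> is first rewritten to a sentinel token that survives parsing and is split on afterwards. In isolation:

import org.jsoup.Jsoup;

public class BrToNewline {
    public static void main(String[] args) {
        String html = "<p>line one<br>line two<br/>line three</p>";
        // Replace every <br> variant with a sentinel before parsing,
        // then split the extracted text on that sentinel.
        String temp = Jsoup.parse(html.replaceAll("(?i)<br[^>]*>", "br2n")).text();
        for (String line : temp.split("br2n")) {
            System.out.println(line.trim());
        }
    }
}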

From source file:edu.stanford.muse.email.EmailFetcherStats.java

/**
 * this method returns the text content of the message as a list of strings
 * // each element of the list could be the content of a multipart message
 * // m is the top level message
 * // p is the specific part that we are processing (p could be == m)
 * also sets up names of attachments (though it will not download the
 * attachment unless downloadAttachments is true)
 */
private List<String> processMessagePart(int messageNum, Message m, Part p, List<Blob> attachmentsList)
        throws MessagingException, IOException {
    List<String> list = new ArrayList<String>(); // return list
    if (p == null) {
        dataErrors.add("part is null: " + folder_name() + " idx " + messageNum);
        return list;
    }

    if (p == m && p.isMimeType("text/html")) {
        /*
        String s = "top level part is html! message:" + m.getSubject() + " " + m.getDescription();
        dataErrors.add(s);
        */
        // we don't normally expect the top-level part to have content-type text/html
        // but we saw this happen on some sample archives pst -> emailchemy. so allow it and handle it by parsing the html
        String html = (String) p.getContent();
        String text = Util.unescapeHTML(html);
        org.jsoup.nodes.Document doc = Jsoup.parse(text);

        StringBuilder sb = new StringBuilder();
        HTMLUtils.extractTextFromHTML(doc.body(), sb);
        list.add(sb.toString());
        return list;
    }

    if (p.isMimeType("text/plain")) {
        //make sure, p is not wrongly labelled as plain text.
        Enumeration headers = p.getAllHeaders();
        boolean dirty = false;
        if (headers != null)
            while (headers.hasMoreElements()) {
                Header h = (Header) headers.nextElement();
                String name = h.getName();
                String value = h.getValue();
                if (name != null && value != null) {
                    if (name.equals("Content-transfer-encoding") && value.equals("base64")) {
                        dirty = true;
                        break;
                    }
                }
            }
        String fname = p.getFileName();
        if (fname != null) {
            int idx = fname.lastIndexOf('.');
            if ((idx < fname.length()) && (idx >= 0)) {
                String extension = fname.substring(idx);
                //anything extension other than .txt is suspicious.
                if (!extension.equals(".txt"))
                    dirty = true;
            }
        }
        if (dirty) {
            dataErrors.add("Dirty message part, has conflicting message part headers." + folder_name()
                    + " Message# " + messageNum);
            return list;
        }

        log.debug("Message part with content type text/plain");
        String content;
        String type = p.getContentType(); // new InputStreamReader(p.getInputStream(), "UTF-8");
        try {
            // if forced encoding is set, we read the string with that encoding, otherwise we just use whatever p.getContent gives us
            if (FORCED_ENCODING != null) {
                byte b[] = Util.getBytesFromStream(p.getInputStream());
                content = new String(b, FORCED_ENCODING);
            } else
                content = (String) p.getContent();
        } catch (UnsupportedEncodingException uee) {
            dataErrors.add("Unsupported encoding: " + folder_name() + " Message #" + messageNum + " type "
                    + type + ", using brute force conversion");
            // a particularly nasty issue:javamail can't handle utf-7 encoding which is common with hotmail and exchange servers.
            // we're using the workaround suggested on this page: http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4304013
            // though it may be better to consider official support for utf-7 or other encodings.

            // TOFIX: I get an exception for utfutf8-encoding which has a base64 encoding embedded on it.
            // Unsupported encoding: gmail-sent Message #10477 type text/plain; charset=x-utf8utf8; name="newyorker.txt",
            // the hack below doesn't work for it.
            ByteArrayOutputStream bao = new ByteArrayOutputStream();
            p.writeTo(bao);
            content = bao.toString();
        }
        list.add(content);
    } else if (p.isMimeType("multipart/*") || p.isMimeType("message/rfc822")) {
        // rfc822 mime type is for embedded mbox format or some such (appears for things like
        // forwarded messages). the content appears to be just a multipart.
        Object o = p.getContent();
        if (o instanceof Multipart) {
            Multipart allParts = (Multipart) o;
            if (p.isMimeType("multipart/alternative")) {
                // this is an alternative mime type. v common case to have text and html alternatives
                // so just process the text part if there is one, and avoid fetching the alternatives.
                // useful esp. because many ordinary messages are alternative: text and html and we don't want to fetch the html.
                // revisit in future we want to retain the html alternative for display purposes
                Part[] parts = new Part[allParts.getCount()];
                for (int i = 0; i < parts.length; i++)
                    parts[i] = allParts.getBodyPart(i);

                for (int i = 0; i < parts.length; i++) {
                    Part thisPart = parts[i];
                    if (thisPart.isMimeType("text/plain")) {
                        // common case, return quickly
                        list.add((String) thisPart.getContent());
                        log.debug("Multipart/alternative with content type text/plain");
                        return list;
                    }
                }

                // no text part, let's look for an html part. this happens for html parts.
                for (int i = 0; i < allParts.getCount(); i++) {
                    Part thisPart = parts[i];
                    if (thisPart.isMimeType("text/html")) {
                        // common case, return quickly
                        String html = (String) thisPart.getContent();
                        String text = Util.unescapeHTML(html);
                        org.jsoup.nodes.Document doc = Jsoup.parse(text);

                        StringBuilder sb = new StringBuilder();
                        HTMLUtils.extractTextFromHTML(doc.body(), sb);
                        list.add(sb.toString());

                        log.debug("Multipart/alternative with content type text/html");
                        return list;
                    }
                }

                // no text or html part. hmmm... blindly process the first part only
                if (allParts.getCount() >= 1)
                    list.addAll(processMessagePart(messageNum, m, allParts.getBodyPart(0), attachmentsList));
            } else {
                // process it like a regular multipart
                for (int i = 0; i < allParts.getCount(); i++) {
                    BodyPart bp = allParts.getBodyPart(i);
                    list.addAll(processMessagePart(messageNum, m, bp, attachmentsList));
                }
            }
        } else if (o instanceof Part)
            list.addAll(processMessagePart(messageNum, m, (Part) o, attachmentsList));
        else
            dataErrors.add("Unhandled part content, " + folder_name() + " Message #" + messageNum
                    + "Java type: " + o.getClass() + " Content-Type: " + p.getContentType());
    } else {
        try {
            // do attachments only if downloadAttachments is set.
            // some apps do not need attachments, so this saves some time.
            // however, it seems like a lot of time is taken in imap prefetch, which gets attachments too?
            if (fetchConfig.downloadAttachments)
                handleAttachments(messageNum, m, p, list, attachmentsList);
        } catch (Exception e) {
            dataErrors.add("Ignoring attachment for " + folder_name() + " Message #" + messageNum + ": "
                    + Util.stackTrace(e));
        }
    }

    return list;
}
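
HTMLUtils.extractTextFromHTML above is a helper from this project, not part of jsoup. With jsoup alone, the equivalent step for a text/html message part could look like this sketch:

import org.jsoup.Jsoup;

public class HtmlPartToText {
    /** Converts the HTML content of a message part to plain text. */
    static String htmlToText(String html) {
        // parse() tolerates the malformed HTML common in mail bodies;
        // body().text() returns the visible text of the <body> element.
        return Jsoup.parse(html).body().text();
    }

    public static void main(String[] args) {
        System.out.println(htmlToText("<html><body><p>Hi <i>there</i></p></body></html>"));
    }
}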

From source file:edu.stanford.muse.email.EmailFetcherStats.java

/**
 * recursively processes attachments, fetching and saving it if needed
 * parses the given part p, and adds it to the attachmentsList.
 * in some cases, like a text/html type without a filename, we instead append it to the textlist
 * @throws MessagingException
 */
private void handleAttachments(int idx, Message m, Part p, List<String> textList, List<Blob> attachmentsList)
        throws MessagingException {
    String ct = null;
    if (!(m instanceof MimeMessage)) {
        Exception e = new IllegalArgumentException("Not a MIME message!");
        e.fillInStackTrace();
        log.warn(Util.stackTrace(e));
        return;
    }

    String filename = null;
    try {
        filename = p.getFileName();
    } catch (Exception e) {
        // seen this happen with:
        // Folders__gmail-sent Message #12185 Expected ';', got "Message"
        // javax.mail.internet.ParseException: Expected ';', got "Message"

        dataErrors.add("Unable to read attachment name: " + folder_name() + " Message# " + idx);
        return;
    }

    String sanitizedFName = Util.sanitizeFolderName(emailStore.getAccountID() + "." + folder_name());
    if (filename == null) {
        String tempFname = sanitizedFName + "." + idx;
        dataErrors.add("attachment filename is null for " + sanitizedFName + " Message#" + idx
                + " assigning it the name: " + tempFname);
        if (p.isMimeType("text/html")) {
            try {
                log.info("Turning message " + sanitizedFName + " Message#" + idx
                        + " into text although it is an attachment");
                String html = (String) p.getContent();
                String text = Util.unescapeHTML(html);
                org.jsoup.nodes.Document doc = Jsoup.parse(text);

                StringBuilder sb = new StringBuilder();
                HTMLUtils.extractTextFromHTML(doc.body(), sb);
                textList.add(sb.toString());
                return;
            } catch (Exception e) {
                Util.print_exception("Error reading contents of text/html multipart without a filename!", e,
                        log);
                return;
            }
        }
        filename = tempFname;
    }

    // Replacing any of the disallowed filename characters (\/:*?"<>|&) to _
    // (note: & causes problems with URLs for serveAttachment etc, so it's also replaced)
    String newFilename = Util.sanitizeFileName(filename);

    // Updating filename if it's changed after sanitizing.
    if (!newFilename.equals(filename)) {
        log.info("Filename changed from " + filename + " to " + newFilename);
        filename = newFilename;
    }

    try {
        ct = p.getContentType();
        if (filename.indexOf(".") < 0) // no ext in filename... let's fix it if possible
        {
            // Using startsWith instead of equals because sometimes the ct has crud beyond the image/jpeg;...crud....
            // Below are the most common file types, more type can be added if needed

            // Most common APPLICATION TYPE
            if (ct.startsWith("application/pdf"))
                filename = filename + ".pdf";
            if (ct.startsWith("application/zip"))
                filename = filename + ",zip";
            // Most common IMAGE TYPE
            if (ct.startsWith("image/jpeg"))
                filename = filename + ".jpg";
            if (ct.startsWith("image/gif"))
                filename = filename + ".gif";
            if (ct.startsWith("image/png"))
                filename = filename + ".png";
            // Most Common VIDEO TYPE
            if (ct.startsWith("video/x-ms-wmv"))
                filename = filename + ".wmv";
            // Most Common AUDIO TYPE
            if (ct.startsWith("audio/mpeg"))
                filename = filename + ".mp3";
            if (ct.startsWith("audio/mp4"))
                filename = filename + ".mp4";
            // Most Common TEXT TYPE
            if (ct.startsWith("text/html"))
                filename = filename + ".html";
            // Windows Office
            if (ct.startsWith("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) //Word
                filename = filename + ".docx";
            if (ct.startsWith("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")) //Excel
                filename = filename + ".xlsx";
            if (ct.startsWith("application/vnd.openxmlformats-officedocument.presentationml.presentation")) //PowerPoint
                filename = filename + ".pptx";
        }
        // retain only up to the first semi-colon; ct is often something like text/plain; name="filename", and we don't want to log the filename
        int x = ct.indexOf(";");
        if (x >= 0)
            ct = ct.substring(0, x);
        log.info("Attachment content type: " + ct + " filename = " + Util.blurKeepingExtension(filename));
    } catch (Exception pex) {
        dataErrors.add("Can't read CONTENT-TYPE: " + ct + " filename:" + filename + " size = " + p.getSize()
                + " subject: " + m.getSubject() + " Date : " + m.getSentDate().toString() + "\n Exception: "
                + pex + "\n" + Util.stackTrace(pex));
        return;
    }

    //       if (filename == null && !p.isMimeType("text/html") && !p.isMimeType("message/partial")) // expected not to have a filename with mime type text/html
    //          log.warn ("Attachment filename is null: " + Util.stackTrace());

    boolean success = true;
    // the size passed in here is the part size, which is not really the binary blob size.
    // when we read the stream below in blobStore.add(), we'll set it again to the binary blob size
    Blob b = new EmailAttachmentBlob(filename, p.getSize(), (MimeMessage) m, p);

    if (fetchConfig.downloadAttachments) {
        // this containment check is only on the basis of file name and size currently,
        // not on the actual hash
        if (archive.getBlobStore().contains(b)) {
            log.debug("Cache hit! " + b);
        } else {
            try {
                if (filename.endsWith(".tif"))
                    log.info("Fetching attachment..." + Util.blurKeepingExtension(filename));

                // performance critical! use large buffer! currently 256KB
                // stream will be closed by callee

                long start = System.currentTimeMillis();
                long nBytes = archive.getBlobStore().add(b,
                        new BufferedInputStream(p.getInputStream(), 256 * 1024));
                long end = System.currentTimeMillis();
                if (nBytes != -1) {
                    long diff = end - start;
                    String s = "attachment size " + nBytes + " bytes, fetched in " + diff + " millis";
                    if (diff > 0)
                        s += " (" + (nBytes / diff) + " KB/s)";
                    log.info(s);
                }

                Util.ASSERT(archive.getBlobStore().contains(b));

            } catch (IOException ioe) {
                success = false;
                dataErrors.add("WARNING: Unable to fetch attachment: filename: " + filename + " size = "
                        + p.getSize() + " subject: " + m.getSubject() + " Date : " + m.getSentDate().toString()
                        + "\nException: " + ioe);
                ioe.printStackTrace(System.out);
            }
        }

        if (success) {
            attachmentsList.add(b);

            /// generate thumbnail only if not already cached
            try {
                archive.getBlobStore().generate_thumbnail(b); // supplement
            } catch (IOException ioe) {
                log.warn("failed to create thumbnail, filename: " + filename + " size = " + p.getSize()
                        + " subject: " + m.getSubject() + " Date : " + m.getSentDate().toString()
                        + "\nException: " + ioe);
                ioe.printStackTrace(System.out);
            }
        }
    }
}

From source file:com.aurel.track.exchange.docx.exporter.PreprocessImage.java

/**
 * Gets the image captions in a map keyed by itemID_attachmentID
 * The key is also saved in the <img> tag's "alt" attribute for later use from Word
 * @param doc
 * @param personID
 * @param imageCaptionsMap
 * @return
 */
private String getImageCaptions(Document doc, Integer personID,
        Map<String, ImageOrTableCaption> imageCaptionsMap) {
    Elements imgElements = doc.select("img");
    if (imgElements != null) {
        for (Iterator<Element> iterator = imgElements.iterator(); iterator.hasNext();) {
            Element imageElement = iterator.next();
            String sourceAttribute = imageElement.attr("src");
            String style = imageElement.attr("style");
            //remove the width and height attributes from html img to avoid java.lang.OutOfMemoryError: Java heap space
            imageElement.removeAttr("width");
            imageElement.removeAttr("height");
            ALIGN align = null;
            if (style != null) {
                if (style.contains("float:left")) {
                    align = ALIGN.LEFT;
                } else {
                    if (style.contains("float:right")) {
                        align = ALIGN.RIGHT;
                    }
                }
            }
            String altAttribute = imageElement.attr("alt");
            Map<String, String> map = getTemporaryFilePathMap(sourceAttribute, personID);
            if (map != null) {
                imageElement.attr("src", map.get("temporaryFilePath"));
                //save imageCaption into the map and now use the "alt" attribute for storing the merged key
                //which will be transformed into nonvisualdrawingprops.getDescr() by XHTMLImporterImpl to set the caption on the MS Word side
                String imageCaption = null;
                if (altAttribute != null && !"".equals(altAttribute)) {
                    //probably from previously removed figcaption but it may also be explicitly set
                    imageCaption = altAttribute;
                } else {
                    imageCaption = map.get("description");
                }
                globalCounter++;
                counterWithinChapter++;
                imageElement.attr("alt", String.valueOf(globalCounter));
                if (imageCaption == null) {
                    //add the entry even as an empty string, since its presence marks the image for inclusion in the list of figures
                    imageCaption = "";
                }
                imageCaptionsMap.put(String.valueOf(globalCounter),
                        new ImageOrTableCaption(chapterNo, counterWithinChapter, imageCaption, align));
            }
        }
    }
    return doc.body().html();
}

From source file:com.digitalpebble.storm.crawler.bolt.JSoupParserBolt.java

@Override
public void execute(Tuple tuple) {

    byte[] content = tuple.getBinaryByField("content");
    String url = tuple.getStringByField("url");
    Metadata metadata = (Metadata) tuple.getValueByField("metadata");

    // check that its content type is HTML
    // look at value found in HTTP headers
    boolean CT_OK = false;
    String httpCT = metadata.getFirstValue(HttpHeaders.CONTENT_TYPE);
    if (StringUtils.isNotBlank(httpCT)) {
        if (httpCT.toLowerCase().contains("html")) {
            CT_OK = true;
        }
    }
    // simply ignore cases where the content type has not been set
    // TODO sniff content with Tika?
    else {
        CT_OK = true;
    }

    if (!CT_OK) {
        String errorMessage = "Exception content-type " + httpCT + " for " + url;
        RuntimeException e = new RuntimeException(errorMessage);
        handleException(url, e, metadata, tuple, "content-type checking", errorMessage);
        return;
    }

    LOG.info("Parsing : starting {}", url);

    long start = System.currentTimeMillis();

    String charset = getContentCharset(content, metadata);

    // get the robots tags from the fetch metadata
    RobotsTags robotsTags = new RobotsTags(metadata);

    Map<String, List<String>> slinks;
    String text;
    DocumentFragment fragment;
    try (ByteArrayInputStream bais = new ByteArrayInputStream(content)) {
        org.jsoup.nodes.Document jsoupDoc = Jsoup.parse(bais, charset, url);

        fragment = JSoupDOMBuilder.jsoup2HTML(jsoupDoc);

        // extracts the robots directives from the meta tags
        robotsTags.extractMetaTags(fragment);

        // store a normalised representation in metadata
        // so that the indexer is aware of it
        robotsTags.normaliseToMetadata(metadata);

        // do not extract the links if no follow has been set
        // and we are in strict mode
        if (robotsTags.isNoFollow() && robots_noFollow_strict) {
            slinks = new HashMap<String, List<String>>(0);
        } else {
            Elements links = jsoupDoc.select("a[href]");
            slinks = new HashMap<String, List<String>>(links.size());
            for (Element link : links) {
                // abs:href tells jsoup to return fully qualified URLs
                // for relative links,
                // e.g. /foo will resolve to http://shopstyle.com/foo
                String targetURL = link.attr("abs:href");

                // nofollow
                boolean noFollow = "nofollow".equalsIgnoreCase(link.attr("rel"));
                // remove altogether
                if (noFollow && robots_noFollow_strict) {
                    continue;
                }

                // link not specifically marked as no follow
                // but whole page is
                if (!noFollow && robotsTags.isNoFollow()) {
                    noFollow = true;
                }

                String anchor = link.text();
                if (StringUtils.isNotBlank(targetURL)) {
                    // any existing anchors for the same target?
                    List<String> anchors = slinks.get(targetURL);
                    if (anchors == null) {
                        anchors = new LinkedList<String>();
                        slinks.put(targetURL, anchors);
                    }
                    // track the anchors only when the link is followable (noFollow is false)
                    if (!noFollow && StringUtils.isNotBlank(anchor)) {
                        anchors.add(anchor);
                    }
                }
            }
        }

        text = jsoupDoc.body().text();

    } catch (Throwable e) {
        String errorMessage = "Exception while parsing " + url + ": " + e;
        handleException(url, e, metadata, tuple, "content parsing", errorMessage);
        return;
    }

    // store identified charset in md
    metadata.setValue("parse.Content-Encoding", charset);

    long duration = System.currentTimeMillis() - start;

    LOG.info("Parsed {} in {} msec", url, duration);

    List<Outlink> outlinks = toOutlinks(url, metadata, slinks);

    ParseResult parse = new ParseResult();
    parse.setOutlinks(outlinks);

    // parse data of the parent URL
    ParseData parseData = parse.get(url);
    parseData.setMetadata(metadata);
    parseData.setText(text);
    parseData.setContent(content);

    // apply the parse filters if any
    try {
        parseFilters.filter(url, content, fragment, parse);
    } catch (RuntimeException e) {

        String errorMessage = "Exception while running parse filters on " + url + ": " + e;
        handleException(url, e, metadata, tuple, "content filtering", errorMessage);
        return;
    }

    if (emitOutlinks) {
        for (Outlink outlink : outlinks) {
            collector.emit(StatusStreamName, tuple,
                    new Values(outlink.getTargetURL(), outlink.getMetadata(), Status.DISCOVERED));
        }
    }

    // emit each document/subdocument in the ParseResult object
    // there should be at least one ParseData item for the "parent" URL

    for (Map.Entry<String, ParseData> doc : parse) {
        ParseData parseDoc = doc.getValue();

        collector.emit(tuple,
                new Values(doc.getKey(), parseDoc.getContent(), parseDoc.getMetadata(), parseDoc.getText()));
    }

    collector.ack(tuple);
    eventCounter.scope("tuple_success").incr();
}

From source file:com.digitalpebble.stormcrawler.bolt.JSoupParserBolt.java

@Override
public void execute(Tuple tuple) {

    byte[] content = tuple.getBinaryByField("content");
    String url = tuple.getStringByField("url");
    Metadata metadata = (Metadata) tuple.getValueByField("metadata");

    LOG.info("Parsing : starting {}", url);

    // check that its content type is HTML
    // look at value found in HTTP headers
    boolean CT_OK = false;

    String mimeType = metadata.getFirstValue(HttpHeaders.CONTENT_TYPE);

    if (detectMimeType) {
        mimeType = guessMimeType(url, mimeType, content);
        // store identified type in md
        metadata.setValue("parse.Content-Type", mimeType);
    }

    if (StringUtils.isNotBlank(mimeType)) {
        if (mimeType.toLowerCase().contains("html")) {
            CT_OK = true;
        }
    }
    // go ahead even if no mimetype is available
    else {
        CT_OK = true;
    }

    if (!CT_OK) {
        if (this.treat_non_html_as_error) {
            String errorMessage = "Exception content-type " + mimeType + " for " + url;
            RuntimeException e = new RuntimeException(errorMessage);
            handleException(url, e, metadata, tuple, "content-type checking", errorMessage);
        } else {
            LOG.info("Incorrect mimetype - passing on : {}", url);
            collector.emit(tuple, new Values(url, content, metadata, ""));
            collector.ack(tuple);
        }
        return;
    }

    long start = System.currentTimeMillis();

    String charset = getContentCharset(content, metadata);

    // get the robots tags from the fetch metadata
    RobotsTags robotsTags = new RobotsTags(metadata);

    Map<String, List<String>> slinks;
    String text = "";
    DocumentFragment fragment;
    try (ByteArrayInputStream bais = new ByteArrayInputStream(content)) {
        org.jsoup.nodes.Document jsoupDoc = Jsoup.parse(bais, charset, url);

        fragment = JSoupDOMBuilder.jsoup2HTML(jsoupDoc);

        // extracts the robots directives from the meta tags
        robotsTags.extractMetaTags(fragment);

        // store a normalised representation in metadata
        // so that the indexer is aware of it
        robotsTags.normaliseToMetadata(metadata);

        // do not extract the links if no follow has been set
        // and we are in strict mode
        if (robotsTags.isNoFollow() && robots_noFollow_strict) {
            slinks = new HashMap<>(0);
        } else {
            Elements links = jsoupDoc.select("a[href]");
            slinks = new HashMap<>(links.size());
            for (Element link : links) {
                // abs:href tells jsoup to return fully qualified URLs
                // for relative links,
                // e.g. /foo will resolve to http://shopstyle.com/foo
                String targetURL = link.attr("abs:href");

                // nofollow
                boolean noFollow = "nofollow".equalsIgnoreCase(link.attr("rel"));
                // remove altogether
                if (noFollow && robots_noFollow_strict) {
                    continue;
                }

                // link not specifically marked as no follow
                // but whole page is
                if (!noFollow && robotsTags.isNoFollow()) {
                    noFollow = true;
                }

                String anchor = link.text();
                if (StringUtils.isNotBlank(targetURL)) {
                    // any existing anchors for the same target?
                    List<String> anchors = slinks.get(targetURL);
                    if (anchors == null) {
                        anchors = new LinkedList<>();
                        slinks.put(targetURL, anchors);
                    }
                    // track the anchors only when the link is followable (noFollow is false)
                    if (!noFollow && StringUtils.isNotBlank(anchor)) {
                        anchors.add(anchor);
                    }
                }
            }
        }

        Element body = jsoupDoc.body();
        if (body != null) {
            text = body.text();
        }

    } catch (Throwable e) {
        String errorMessage = "Exception while parsing " + url + ": " + e;
        handleException(url, e, metadata, tuple, "content parsing", errorMessage);
        return;
    }

    // store identified charset in md
    metadata.setValue("parse.Content-Encoding", charset);

    long duration = System.currentTimeMillis() - start;

    LOG.info("Parsed {} in {} msec", url, duration);

    List<Outlink> outlinks = toOutlinks(url, metadata, slinks);

    ParseResult parse = new ParseResult();
    parse.setOutlinks(outlinks);

    // parse data of the parent URL
    ParseData parseData = parse.get(url);
    parseData.setMetadata(metadata);
    parseData.setText(text);
    parseData.setContent(content);

    // apply the parse filters if any
    try {
        parseFilters.filter(url, content, fragment, parse);
    } catch (RuntimeException e) {

        String errorMessage = "Exception while running parse filters on " + url + ": " + e;
        handleException(url, e, metadata, tuple, "content filtering", errorMessage);
        return;
    }

    if (emitOutlinks) {
        for (Outlink outlink : parse.getOutlinks()) {
            collector.emit(StatusStreamName, tuple,
                    new Values(outlink.getTargetURL(), outlink.getMetadata(), Status.DISCOVERED));
        }
    }

    // emit each document/subdocument in the ParseResult object
    // there should be at least one ParseData item for the "parent" URL

    for (Map.Entry<String, ParseData> doc : parse) {
        ParseData parseDoc = doc.getValue();

        collector.emit(tuple,
                new Values(doc.getKey(), parseDoc.getContent(), parseDoc.getMetadata(), parseDoc.getText()));
    }

    collector.ack(tuple);
    eventCounter.scope("tuple_success").incr();
}