Example usage for org.jsoup.nodes Document text

List of usage examples for org.jsoup.nodes Document text

Introduction

On this page you can find example usages of org.jsoup.nodes.Document#text().

Prototype

public String text() 

Source Link

Document

Gets the combined text of this element and all its children.

Usage

From source file:explore.ArgminCorpusReader.java

/**
 * Reads the next JSON corpus document, strips its HTML markup with Jsoup and
 * populates the given CAS with the plain text, document metadata and the
 * configured annotator's argument units (one Paragraph and one NamedEntity
 * annotation per unit).
 *
 * @param aJcas the CAS to populate
 * @throws CollectionException wrapping any UIMA failure during processing
 */
@Override
public void getNext(JCas aJcas) throws CollectionException {
    try {
        Map<String, Object> jsonData = this.documentsIterator.next();

        // Plain text only; the argument-unit token indices below refer to this text.
        String htmlText = (String) jsonData.get(JsonCorpusUtil.TEXT);
        org.jsoup.nodes.Document cleanedText = Jsoup.parse(htmlText);
        String rawDocumentText = cleanedText.text();

        String file = (String) jsonData.get(JsonCorpusUtil.FILE);
        String documentId = file.replace(".json", "");
        String url = (String) jsonData.get(JsonCorpusUtil.URL);

        // original HTML version not required for TC experiment
        //            JCas view = jCas.createView(JsonCorpusUtil.VIEW_ORIGINAL_HTML);
        //            view.setDocumentText(htmlText);

        aJcas.setDocumentText(rawDocumentText);
        aJcas.setDocumentLanguage(this.language);

        DocumentMetaData metaData = DocumentMetaData.create(aJcas);
        metaData.setDocumentBaseUri("");
        metaData.setDocumentUri("/" + documentId);
        metaData.setDocumentTitle(url);
        metaData.setDocumentId(documentId);

        // Token index (position in the tokenized text) -> Token annotation.
        Map<Integer, Token> idxToTokenMapping = this.createIndexToTokenMapping(rawDocumentText);

        @SuppressWarnings("unchecked")
        List<Map<String, Object>> userAnnotations = (List<Map<String, Object>>) jsonData
                .get(JsonCorpusUtil.USER_ANNOTATIONS);

        // Only annotations of the configured annotator are imported.
        for (Map<String, Object> userAnnotation : userAnnotations) {

            String annotator = (String) userAnnotation.get(JsonCorpusUtil.ANNOTATOR);
            if (annotator.equals(this.annotator)) {

                @SuppressWarnings("unchecked")
                List<String> argUnits = (List<String>) userAnnotation.get(JsonCorpusUtil.ARGUMENTATION_UNITS);

                for (String argUnit : argUnits) {
                    // Whitespace is stripped before matching the recognition pattern.
                    String cleanedArgUnit = argUnit.replaceAll("\\s+", "");
                    Matcher matcher = JsonCorpusUtil.getRecognitionPattern().matcher(cleanedArgUnit);
                    if (!matcher.matches()) {
                        this.getLogger()
                                .warn(String.format("argument unit %s does not match the expected pattern %s",
                                        cleanedArgUnit, JsonCorpusUtil.getRecognitionPattern().pattern()));
                    } else {
                        // **************************************************
                        // coordinates of an argument unit:
                        // group(1) = label, group(3) = comma-separated token indices
                        // (a leading comma is stripped before parsing).
                        String label = matcher.group(1);
                        String stringIndices = matcher.group(3).replaceAll("^,", "");
                        List<Integer> indices = CollectionUtils.parseIntList(stringIndices, ",");

                        int firstIndex = Collections.min(indices);
                        Token firstToken = idxToTokenMapping.get(firstIndex);

                        int lastIndex = Collections.max(indices);
                        Token lastToken = idxToTokenMapping.get(lastIndex);
                        // *****************************************************

                        // Read argument unit as Paragraph annotation
                        Paragraph para = new Paragraph(aJcas, firstToken.getBegin(), lastToken.getEnd());
                        para.addToIndexes();

                        // print some counts:
                        System.out.println("annotator: " + annotator);
                        counter++;
                        System.out
                                .println("AU " + counter + " -- argument unit text: " + para.getCoveredText());
                        System.out.println("label: " + label);
                        // Anything that is not a claim is counted as a premise.
                        if (label.contains("claim")) {
                            claims++;
                        } else {
                            premises++;
                        }
                        System.out.println("premises " + premises + "\t claims " + claims);

                        // The label itself is carried by a NamedEntity over the same span.
                        NamedEntity outcome = new NamedEntity(aJcas, firstToken.getBegin(), lastToken.getEnd());
                        outcome.setValue(label);
                        outcome.addToIndexes();

                    } // matching was ok
                } // for argUnit : argUnits
                ++this.nextDocumentIdx;

            } // if annotator.equals(this.annotator)
        }
    } catch (final CASException e) {
        throw new CollectionException(e);
    } catch (final ResourceInitializationException e) {
        throw new CollectionException(e);
    } catch (final UIMAException e) {
        throw new CollectionException(e);
    }
}

From source file:eu.sisob.uma.extractors.adhoc.websearchers.WebSearchersExtractor.java

/**
 *
 * @param nextLine/*from  w w  w  .  j  av a2s . c o  m*/
 * @param idStaffIdentifier
 * @param idName
 * @param idFirstName
 * @param idLastName
 * @param idInitials
 * @param idSubject
 * @param idInstitutionName
 * @param idWebAddress
 * @param expression
 * @param params
 * @return
 */
@Override
protected String get_result(String[] nextLine, int idStaffIdentifier, int idName, int idFirstName,
        int idLastName, int idInitials, int idSubject, int idInstitutionName, int idWebAddress,
        String expression, Object[] params) {

    String keywords = " (PROFILE OR PHD OR RESEARCHER OR FACULTY OR PROFESSOR OR RESEARCH) AND ";
    keywords = "";

    String domain = clean_site(nextLine[idWebAddress]);
    String subject = nextLine[idSubject];
    String and_institution_name = (idInstitutionName != -1 ? " AND " + nextLine[idInstitutionName] : "");
    String expression_subject = expression + " AND " + subject;
    String expression_site = expression + " site: " + domain;
    String expression_inst_name = expression + and_institution_name;
    String expression_inst_name_and_subject = expression + and_institution_name + " AND " + subject;

    String url = "";

    switch (search_patterns) {
    case P1:
        url = "https://duckduckgo.com/html/?q=" + keywords + expression;
        break;
    case P2:
        url = "https://duckduckgo.com/html/?q=" + keywords + expression_subject;
        break;
    case P3:
        url = "https://duckduckgo.com/html/?q=" + keywords + expression_site;
        break;
    case P4:
        url = "https://duckduckgo.com/html/?q=" + keywords + expression_inst_name;
        break;
    case P5:
        url = "https://duckduckgo.com/html/?q=" + keywords + expression_inst_name_and_subject;
        break;
    default:
        url = "https://duckduckgo.com/html/?q=" + keywords + expression_subject;
        break;
    }
    Logger.getRootLogger().info("Go with " + url);
    boolean again = false;
    Document doc = null;
    do {
        doc = getDocumentFromPage(url, 10, 1000, 5000);

        if (doc != null && doc.text().contains("If this error persists, please let us know")) {
            try {
                Thread.sleep(30000);
            } catch (InterruptedException ex) {
            }
            again = true;
        } else {
            again = false;
        }
    } while (again);

    String final_result = "";
    if (doc != null && doc.select("div[class*=links_main] > a").size() > 0) {

        /* Write resercher founded */
        Elements elements = doc.select("div[class*=links_main] > a");

        /* We will take the first html page and the first pdf */

        HashMap<String, String> results = new HashMap<String, String>();

        int max_results = 2;
        int i_result = 0;
        for (Element e : elements) {
            if ((e.text().startsWith("[")
            //&& !e.text().startsWith("[PDF]")
            ) || e.absUrl("href").contains("duckduckgo.com/y.js") || e.absUrl("href").contains("wikipedia.")
                    || e.absUrl("href").contains("facebook.com") || e.absUrl("href").contains("microsoft.com")
                    || e.absUrl("href").contains("google.com") || e.absUrl("href").contains("linkedin")
                    || e.absUrl("href").contains("www.biography.com")
                    || e.absUrl("href").contains("biomedexperts.com")
                    || e.absUrl("href").contains("www.experts.scival.com")
                    || e.absUrl("href").contains("ratemyprofessors.com")
                    || e.absUrl("href").contains("flickr.com") || e.absUrl("href").endsWith(".txt")
                    || e.absUrl("href").endsWith(".csv") || e.absUrl("href").endsWith(".xml")
                    || e.absUrl("href").endsWith(".doc") || e.absUrl("href").endsWith(".docx")
                    || e.absUrl("href").endsWith(".xls") || e.absUrl("href").endsWith(".xlxs")
                    || e.absUrl("href").contains("www.amazon")) {
                max_results++;
                continue;
            }

            boolean add = false;
            String score = "";
            String ext = "";
            if (!results.containsKey("HTML") && !e.text().startsWith("[")) {
                //results.put("html", )

                File temp;
                try {
                    temp = File.createTempFile("temp-file-name", ".tmp");
                    URL fetched_url = Downloader.fetchURL(e.absUrl("href"));
                    FileUtils.copyURLToFile(fetched_url, temp);
                    long sizeInBytes = temp.length();
                    long sizeInMb = sizeInBytes / (1024 * 1024);
                    if (sizeInMb > 100) {
                        score = "B";
                    } else {
                        String content = FileUtils.readFileToString(temp);
                        if (content.contains(nextLine[idLastName])) {
                            score = "A";
                        } else {
                            score = "B";
                        }
                    }
                } catch (IOException ex) {
                    score = "B";
                }

                ext = "HTML";
                add = true;
            }

            //if(!results.containsKey("PDF") && e.text().startsWith("[PDF]")){                                                        
            //    score = "A";
            //    ext = "PDF";
            //    add = true;
            //}                          

            if (add) {
                String result = "";
                result += "\"" + nextLine[idStaffIdentifier] + "\";";
                result += "\"" + nextLine[idLastName] + "\";";
                result += "\"" + nextLine[idInitials] + "\";";
                if (idFirstName != -1)
                    result += "\"" + nextLine[idFirstName] + "\";";
                if (idName != -1)
                    result += "\"" + nextLine[idName] + "\";";
                result += "\"" + e.absUrl("href") + "\";";
                result += "\"" + ext + "\";";
                result += "\"" + "CV" + "\";";
                result += "\"" + score + "\"";
                result += "\r\n";
                results.put(ext, result);

                Logger.getRootLogger().info("Select " + e.absUrl("href") + " - " + e.text());
            }

            //                if(results.containsKey("PDF") && results.containsKey("HTML")){
            //                    break;
            //                }

            i_result++;
            if (max_results <= i_result) {
                break;
            }
        }

        //            if(results.containsKey("PDF"))
        //                final_result = results.get("PDF");
        //            else 
        if (results.containsKey("HTML"))
            final_result = results.get("HTML");
        else
            final_result = "";
    }

    return final_result;
}

From source file:eu.sisob.uma.extractors.adhoc.websearchers_cv.WebSearchersCVExtractor.java

/**
 *
 * @param nextLine/* w  w w .  ja va  2  s  .com*/
 * @param idStaffIdentifier
 * @param idName
 * @param idFirstName
 * @param idLastName
 * @param idInitials
 * @param idSubject
 * @param idInstitutionName
 * @param idWebAddress
 * @param expression
 * @param params
 * @return
 */
@Override
protected String get_result(String[] nextLine, int idStaffIdentifier, int idName, int idFirstName,
        int idLastName, int idInitials, int idSubject, int idInstitutionName, int idWebAddress,
        String expression, Object[] params) {

    String domain = clean_site(nextLine[idWebAddress]);
    String subject = nextLine[idSubject];
    String expression_subject = expression + " " + subject + " " + files + " " + cv_keywords_in_query;
    expression_subject = expression_subject.replaceAll("\t", " ");
    expression_subject = expression_subject.replaceAll("  ", " ");

    String url = "https://duckduckgo.com/html/?q=" + expression_subject;
    Logger.getRootLogger().info("Go with " + url);
    boolean again = false;
    Document doc = null;
    do {
        doc = getDocumentFromPage(url, 10, 2000, 5000);

        if (doc != null && doc.text().contains("If this error persists, please let us know")) {
            try {
                Thread.sleep(30000);
            } catch (InterruptedException ex) {
            }
            again = true;
        } else {
            again = false;
        }
    } while (again);

    //if(doc.select("div[class*=links_main] > a[href*=" + domain + "]").size() > 0){
    String final_result = "";
    if (doc != null && doc.select("div[class*=links_main] > a").size() > 0) {

        /* Write resercher founded */
        Elements elements = doc.select("div[class*=links_main] > a");

        /* We will take the first html page and the first pdf */

        List<String[]> results = new ArrayList<String[]>();
        final int EXT_I = 0;
        final int SCORE_INT_I = 1;
        final int SCORE_LETTER_I = 2;
        final int RESULT_I = 3;
        final int WORST_SCORE = 67;

        //int max_results = elements.size();
        //int i_result = 0; 
        for (Element e : elements) {
            if ((e.text().startsWith("[") && !e.text().startsWith("[PDF]"))
                    || e.absUrl("href").contains("duckduckgo.com/y.js")
                    || e.absUrl("href").contains("wikipedia.") || e.absUrl("href").contains("microsoft.com")
                    || e.absUrl("href").contains("google.com") || e.absUrl("href").contains("linkedin")
                    || e.absUrl("href").contains("www.biography.com")
                    || e.absUrl("href").contains("biomedexperts.com")
                    || e.absUrl("href").contains("www.experts.scival.com")
                    || e.absUrl("href").contains("ratemyprofessors.com")
                    || e.absUrl("href").contains("flickr.com") || e.absUrl("href").endsWith(".txt")
                    || e.absUrl("href").endsWith(".csv") || e.absUrl("href").endsWith(".xml")
                    || e.absUrl("href").endsWith(".doc") || e.absUrl("href").endsWith(".docx")
                    || e.absUrl("href").endsWith(".xls") || e.absUrl("href").endsWith(".xlxs")
                    || e.absUrl("href").contains("www.amazon")) {
                continue;
            }

            boolean add = false;
            int score_int = WORST_SCORE;
            String score = "";
            String ext = "";

            if (e.text().startsWith("[PDF]") || e.text().startsWith("[DOCX]") || e.text().startsWith("[DOC]")
                    || e.text().startsWith("[RTF]")) {

                String clean_name_1 = e.text().replaceAll("[^\\w\\s]", "").toLowerCase();
                int i = e.absUrl("href").lastIndexOf("/");
                int f = e.absUrl("href").lastIndexOf(".");
                String clean_name_2 = "";
                if (i != -1 && f != -1)
                    clean_name_2 = e.absUrl("href").substring(i, f).toLowerCase();
                boolean b = false;
                for (String k : cv_keywords_in_name_list) {
                    if (clean_name_1.contains(k) || clean_name_2.contains(k)) {
                        b = true;
                        break;
                    }
                }
                if (b) {
                    score_int--;
                }

                if (clean_name_1.contains(nextLine[idLastName])
                        || clean_name_2.contains(nextLine[idLastName])) {
                    score_int--;
                }

                score = Character.toChars(score_int)[0] + "";
                add = true;
                ext = "PDF";
            }

            //if(!results.containsKey("HTML") && !e.text().startsWith("[")){
            //}                                                 

            if (add) {
                String result = "";
                result += "\"" + nextLine[idStaffIdentifier] + "\";";
                result += "\"" + nextLine[idLastName] + "\";";
                result += "\"" + nextLine[idInitials] + "\";";
                if (idFirstName != -1)
                    result += "\"" + nextLine[idFirstName] + "\";";
                if (idName != -1)
                    result += "\"" + nextLine[idName] + "\";";
                result += "\"" + e.absUrl("href") + "\";";
                result += "\"" + ext + "\";";
                result += "\"" + "CV" + "\";";
                result += "\"" + score + "\"";
                result += "\r\n";
                results.add(new String[] { ext, score_int + "", score, result });

                Logger.getRootLogger().info("Select " + e.absUrl("href") + " - " + score + " - " + e.text());
            }
        }

        final_result = "";
        int best_score = WORST_SCORE;
        for (String[] result : results) {

            if (result[EXT_I].equals("PDF")) {
                int act_score = Integer.parseInt(result[SCORE_INT_I]);

                if (act_score < best_score) {
                    best_score = act_score;
                    final_result = result[RESULT_I];
                }

            }
        }
    }

    return final_result;
}

From source file:index.IndexManager.java

/**
 * Converts a crawled Jsoup Document into a Solr input document plus the sets
 * of outgoing link URLs and media URLs found on the page.
 *
 * @param document parsed page to index
 * @return triple of (Solr document, absolute link URLs, absolute media URLs)
 */
public static Triple<SolrInputDocument, Collection<String>, Collection<String>> index(Document document) {
    final SolrInputDocument index = new SolrInputDocument();
    index.setField("id", document.location());
    index.setField("time", String.valueOf(System.currentTimeMillis()));
    index.setField("title", document.title());

    // Absolute URLs of all anchors and of every element with a src attribute.
    final Set<String> links = document.select("a[href]").stream().map(e -> e.attr("abs:href"))
            .collect(Collectors.toSet());
    final Set<String> media = document.select("[src]").stream().map(e -> e.attr("abs:src"))
            .collect(Collectors.toSet());

    links.forEach(link -> index.addField("link", link));
    media.forEach(link -> index.addField("media", link));

    // One Solr field per heading/emphasis tag; the field name equals the tag name.
    for (final String tag : new String[] { "h1", "h2", "h3", "strong", "em", "b", "u", "i" }) {
        formatText(document.getElementsByTag(tag).stream()).forEach(e -> index.addField(tag, e));
    }

    // Body text is split into bounded chunks stored as "1_text", "2_text", ...
    int i = 0;
    Collection<String> text = chunkToLength(document.text());
    for (String chunk : text) {
        index.addField(++i + "_text", chunk);
    }

    return Triple.of(index, links, media);
}

From source file:me.vertretungsplan.parser.TurboVertretungParser.java

/**
 * Parses one "Turbo-Vertretung" day page and adds the resulting
 * SubstitutionScheduleDay (date, last-change timestamp, messages and one
 * Substitution per table row) to the given schedule.
 *
 * @param v   schedule the parsed day is appended to
 * @param doc HTML page for a single day
 */
private void parseTurboVertretungDay(SubstitutionSchedule v, Document doc) {
    SubstitutionScheduleDay day = new SubstitutionScheduleDay();

    // NOTE(review): "( fr)?" looks like a mangled "( für)?" from text extraction -
    // verify against live pages before changing the literal.
    String date = doc.select(".Titel").text().replaceFirst("Vertretungsplan( fr)? ", "");
    day.setDate(DateTimeFormat.forPattern("EEEE, d. MMMM yyyy").withLocale(Locale.GERMAN).parseLocalDate(date));

    String lastChange = doc.select(".Stand").text().replace("Stand: ", "");
    day.setLastChange(DateTimeFormat.forPattern("dd.MM.yyyy HH:mm:ss").withLocale(Locale.GERMAN)
            .parseLocalDateTime(lastChange));

    // "Kein Vertretungsplan" = no substitutions today; add the (empty) day and stop.
    if (doc.text().contains("Kein Vertretungsplan")) {
        v.addDay(day);
        return;
    }

    // Optional notice sections become day-level messages ("label\ncontent").
    if (doc.select(".LehrerFrueher").size() > 0) {
        day.addMessage(doc.select(".LehrerFrueherLabel").text() + "\n" + doc.select(".LehrerFrueher").text());
    }
    if (doc.select(".LehrerVerplant").size() > 0) {
        day.addMessage(doc.select(".LehrerVerplantLabel").text() + "\n" + doc.select(".LehrerVerplant").text());
    }
    if (doc.select(".Abwesenheiten-Klassen").size() > 0) {
        day.addMessage(doc.select(".Abwesenheiten-KlassenLabel").text() + "\n"
                + doc.select(".Abwesenheiten-Klassen").text());
    }

    // One Substitution per data row of the first table on the page.
    Element table = doc.select("table").first();
    for (Element row : table.select("tr:has(td)")) {
        Substitution substitution = new Substitution();
        substitution.setLesson(row.select(query("Stunde")).text());
        substitution.setPreviousTeacher(row.select(query("Lehrer")).text());
        substitution.setTeacher(row.select(query("Vertretung")).text());
        substitution.setClasses(new HashSet<>(Arrays.asList(row.select(query("Klasse")).text().split(" "))));
        substitution.setSubject(row.select(query("Fach")).text());
        substitution.setDesc(row.select(query("Anmerkung")).text());
        substitution.setRoom(row.select(query("Raum")).text());

        // The remark column determines the substitution type; default "Vertretung".
        String type = recognizeType(row.select(query("Anmerkung")).text());
        if (type == null)
            type = "Vertretung";
        substitution.setType(type);
        substitution.setColor(colorProvider.getColor(type));

        day.addSubstitution(substitution);
    }

    v.addDay(day);
}

From source file:com.spd.ukraine.lucenewebsearch1.web.IndexingController.java

/**
 * Method used to create the record of given web page in search database.
 *
 * @param webPage webPage.url is entered url
 * @return webPage for success or null for fail
 */
private WebPage createWebPageRecord(WebPage webPage) {
    try {
        // Fetch and parse the page, then index it (and, recursively, its links).
        Document html = Jsoup.connect(webPage.getUrl()).get();
        referencedSites.clear();
        indexElements(webPage, html, 0);
        System.out.println(html.text());
        System.out.println("number of indexed fields is " + indexWriter.numDocs());
        //            indexWriter.commit();
        // NOTE(review): close() is skipped when an exception is thrown above, and a
        // second successful call through here would close an already-closed writer -
        // verify the intended indexWriter lifecycle.
        indexWriter.close();
        return webPage;
    } catch (Exception ex) {
        // Broad catch: any failure is reported on stdout and signalled by null.
        System.out.println("createWebPageRecord " + ex.getMessage());
        //            ex.printStackTrace();
        return null;
    }
}

From source file:me.vertretungsplan.parser.UntisMonitorParser.java

/**
 * Loads a substitution-schedule page (or the cached login response), follows
 * frames and meta-refresh redirects up to MAX_RECURSION_DEPTH, and appends
 * every schedule document found to {@code docs}.
 *
 * @param url            page URL, or VALUE_URL_LOGIN_RESPONSE for the cached login response
 * @param encoding       charset used to decode the HTTP response
 * @param following      whether meta-refresh redirects should be followed
 * @param docs           accumulator for the parsed schedule documents
 * @param startUrl       URL the redirect chain started at, used to detect loops
 * @param recursionDepth current frame/redirect recursion depth
 * @throws IOException                if the first page cannot be loaded or holds no schedule
 * @throws CredentialInvalidException if the page indicates an unregistered account
 */
private void loadUrl(String url, String encoding, boolean following, List<Document> docs, String startUrl,
        int recursionDepth) throws IOException, CredentialInvalidException {
    String html;
    if (url.equals(VALUE_URL_LOGIN_RESPONSE)) {
        html = loginResponse;
    } else {
        try {
            html = httpGet(url, encoding).replace("&nbsp;", "");
        } catch (HttpResponseException e) {
            if (docs.size() == 0) {
                throw e;
            } else {
                return; // ignore if first page was loaded and redirect didn't work
            }
        }
    }
    Document doc = Jsoup.parse(html);
    doc.setBaseUri(url);

    if (doc.select(".mon_title").size() == 0) {
        // We have a problem - there seems to be no substitution schedule. Maybe it is hiding
        // inside a frame?
        // NOTE(review): "frame[name" is missing its closing "]" - confirm jsoup
        // parses this selector as intended.
        if (doc.select("frameset frame[name").size() > 0) {
            for (Element frame : doc.select("frameset frame")) {
                // Only descend into the numbered schedule frames (subst_NNN.htm(l)).
                if (frame.attr("src").matches(".*subst_\\d\\d\\d.html?")
                        && recursionDepth < MAX_RECURSION_DEPTH) {
                    String frameUrl = frame.absUrl("src");
                    loadUrl(frame.absUrl("src"), encoding, following, docs, frameUrl, recursionDepth + 1);
                }
            }
        } else if (doc.text().contains("registriert")) {
            throw new CredentialInvalidException();
        } else {
            if (docs.size() == 0) {
                // ignore if first page was loaded and redirect didn't work
                throw new IOException(
                        "Could not find .mon-title, seems like there is no Untis " + "schedule here");
            }
        }
    } else {
        findSubDocs(docs, html, doc);

        // Follow <meta http-equiv="refresh"> to the next page of the schedule,
        // stopping when the chain would loop back to where it started.
        if (following && doc.select("meta[http-equiv=refresh]").size() > 0) {
            Element meta = doc.select("meta[http-equiv=refresh]").first();
            String attr = meta.attr("content").toLowerCase();
            String redirectUrl = url.substring(0, url.lastIndexOf("/") + 1)
                    + attr.substring(attr.indexOf("url=") + 4);
            if (!redirectUrl.equals(startUrl) && recursionDepth < MAX_RECURSION_DEPTH) {
                loadUrl(redirectUrl, encoding, true, docs, startUrl, recursionDepth + 1);
            }
        }
    }
}

From source file:de.geeksfactory.opacclient.apis.IOpac.java

/**
 * Renews a lent medium. Two code paths exist: media IDs prefixed with "NEW"
 * use the newer iOPAC renewal API (mode=42); all others scrape the legacy
 * HTML form and replay it (mode=8).
 *
 * @param media      medium ID, optionally prefixed with "NEW"
 * @param account    library account used for the renewal
 * @param useraction unused here
 * @param Selection  unused here
 * @return OK on success, otherwise ERROR (with a message when one is available)
 * @throws IOException on network problems
 */
@Override
public ProlongResult prolong(String media, Account account, int useraction, String Selection)
        throws IOException {
    // internal convention: We add "NEW" to the media ID to show that we have the new iOPAC
    // version
    if (media.startsWith("NEW")) {
        String mediaNr = media.substring(3);
        String html = httpGet(
                opac_url + "/cgi-bin/di.exe?mode=42&MedNrVerlAll=" + URLEncoder.encode(mediaNr, "UTF-8"),
                getDefaultEncoding());

        // Success message is matched on its prefix only ("1 Medium wurde verl...").
        Document doc = Jsoup.parse(html);
        if (doc.text().contains("1 Medium wurde verl")) {
            return new ProlongResult(MultiStepResult.Status.OK);
        } else {
            return new ProlongResult(MultiStepResult.Status.ERROR, doc.text());
        }
    } else {
        String html = httpGet(opac_url + "/" + media, getDefaultEncoding());
        Document doc = Jsoup.parse(html);
        if (doc.select("table th").size() > 0) {
            // A "Hinweis" headline signals an error text in the first table header cell.
            if (doc.select("h1").size() > 0) {
                if (doc.select("h1").first().text().contains("Hinweis")) {
                    return new ProlongResult(MultiStepResult.Status.ERROR,
                            doc.select("table th").first().text());
                }
            }
            try {
                // Replay the renewal form using the session ID and media number it carries.
                Element form = doc.select("form[name=form1]").first();
                String sessionid = form.select("input[name=sessionid]").attr("value");
                String mednr = form.select("input[name=mednr]").attr("value");
                httpGet(opac_url + "/cgi-bin/di.exe?mode=8&kndnr=" + account.getName() + "&mednr=" + mednr
                        + "&sessionid=" + sessionid + "&psh100=Verl%C3%A4ngern", getDefaultEncoding());
                return new ProlongResult(MultiStepResult.Status.OK);
            } catch (Throwable e) {
                e.printStackTrace();
                return new ProlongResult(MultiStepResult.Status.ERROR);
            }
        }
        return new ProlongResult(MultiStepResult.Status.ERROR);
    }
}

From source file:com.spd.ukraine.lucenewebsearch1.web.IndexingController.java

/**
 * Method used to perform recursive creation indexing for a given web page
 * in search database.
 *
 * @param webPage webPage.url is entered url
 * webPage.title is set
 * @param html Jsoup.Document of entered url
 * @param recursionNumber used to stop recursion at exceeding
 * MAX_RECURSION_SEARCH_NUMBER
 */
private void indexElements(WebPage webPage, Document html, final int recursionNumber)
        throws IOException, ParseException {
    // Skip pages whose title was already seen (cheap duplicate detection).
    String title = html.title();
    if (referencedTitles.contains(title.trim())) {
        return;
    }
    referencedTitles.add(title.trim());
    webPage.setTitle(title);
    if (containsPage(webPage)) {
        System.out.println(webPage.getUrl() + " is already indexed");
        return;
    }
    // prevElement is a leftover of the commented-out element-wise indexing below.
    Element prevElement = null;
    Elements elements = html.body().getAllElements(); //.getElementsByTag("a");
    // Index the combined text of the whole page in one document.
    addDoc(webPage, html.text());
    //        for (Element element : elements) {
    ////                System.out.println(element.nodeName() + " element.text() " 
    ////                        + element.text() + " url " 
    ////                        + element.absUrl("href"));
    //            if (element.nodeName().equalsIgnoreCase("body")) {
    //                addDoc(webPage, element.text());
    //                break;
    ////                continue;
    //            }
    //            if (null == prevElement) {
    //                prevElement = element;
    ////            } else if (prevElementContainsElementText(prevElement, element)) {
    ////                continue;
    //            }
    ////            if (null !== webPagesService.findWebPage(element.absUrl("href")))
    //            if (element.text().trim().isEmpty()) {
    //                continue;
    //            }
    ////            StringTokenizer str = new StringTokenizer(element.text());
    ////            str.
    //            addDoc(webPage, element.text());
    //        }
    // Stop recursing once depth or total-site limits are exceeded.
    if (recursionNumber > MAX_RECURSION_SEARCH_NUMBER || referencedSites.size() > MAX_NUMBER_SITES_INDEXED) {
        //            System.out.println(recursionNumber + " " 
        //                    + referencedSites.contains(webPage.getUrl()));
        return;
    }
    // Follow every not-yet-visited anchor and index it one level deeper.
    // NOTE(review): parallelStream + recursive fetching mutates shared state
    // (referencedSites/referencedTitles) from multiple threads - verify those
    // collections are thread-safe.
    elements.parallelStream()
            .filter((Element e) -> e.nodeName().equalsIgnoreCase("a") && null != e.absUrl(HREF)
                    && !e.absUrl(HREF).trim().isEmpty() && !referencedSites.contains(e.absUrl(HREF))
                    && !referencedSites.contains(removeSharpEtc(e.absUrl(HREF))))
            .forEach((Element element) -> {
                WebPage webPage1 = new WebPage(element.absUrl(HREF));
                String url1 = webPage1.getUrl();
                //                    System.out.println(recursionNumber + " recursion for '" 
                //                            + url1 + "'");
                try {
                    Document htmlR = Jsoup.connect(url1).get();
                    indexElements(webPage1, htmlR, recursionNumber + 1);
                } catch (IOException | ParseException e) {
                    System.out.println("Exception " + e.getMessage());
                }
                referencedSites.add(url1);
            });
    //        for (Element element : elements) {
    //            if (!element.nodeName().equalsIgnoreCase("a")) {
    //                continue;
    //            }
    //            WebPage webPage1 = new WebPage(element.absUrl("href"));
    //            if (null == webPage1.getUrl() 
    //                    || webPage1.getUrl().isEmpty()
    //                    || referencedSites.contains(webPage1.getUrl())) {
    //                continue;
    //            }
    //            System.out.println(recursionNumber + "recursion for " 
    //                    + element.absUrl("href"));
    //            try {
    //                Document htmlR = Jsoup.connect(webPage1.getUrl()).get();
    //                webPage1.setTitle(htmlR.title());
    //                indexElements(webPage1, htmlR, recursionNumber + 1);
    //            } catch (IOException e) {
    //                System.out.println("IOException " + e.getMessage());
    //            }
    //            referencedSites.add(webPage1.getUrl());
    //        }
}

From source file:de.geeksfactory.opacclient.apis.IOpac.java

/**
 * Renews all lent media at once via the newer iOPAC API (mode=42) and reports
 * the library's summary message(s). Older iOPAC versions without the
 * "renew all" button are reported as unsupported.
 *
 * @param account    library account whose media are renewed
 * @param useraction unused here
 * @param selection  unused here
 * @return OK with summary lines, or ERROR with the page text / unsupported message
 * @throws IOException on network problems
 */
@Override
public ProlongAllResult prolongAll(Account account, int useraction, String selection) throws IOException {
    Document accountPage = getAccountPage(account);
    // Feature detection: only newer iOPAC versions render the "renew all" button.
    if (accountPage.select("button.verlallbutton").size() == 0) {
        return new ProlongAllResult(MultiStepResult.Status.ERROR,
                stringProvider.getString(StringProvider.UNSUPPORTED_IN_LIBRARY));
    }

    // Collect all media numbers the account page offers for renewal.
    List<NameValuePair> formValues = new ArrayList<>();
    formValues.add(new BasicNameValuePair("mode", "42"));
    for (Element checkbox : accountPage.select("input.VerlAllCheckboxOK")) {
        formValues.add(new BasicNameValuePair("MedNrVerlAll", checkbox.val()));
    }
    String responseHtml = httpGet(opac_url + "/cgi-bin/di.exe?" + URLEncodedUtils.format(formValues, "UTF-8"),
            getDefaultEncoding());
    Document response = Jsoup.parse(responseHtml);

    // Group 1: "... wurden verlängert"; optional group 2: "... wurden nicht verlängert".
    Matcher matcher = Pattern
            .compile("(\\d+ Medi(?:en|um) wurden? verl.ngert)\\s*(\\d+ Medi(?:en|um) wurden? nicht verl.ngert)?")
            .matcher(response.text());
    if (!matcher.find()) {
        return new ProlongAllResult(MultiStepResult.Status.ERROR, response.text());
    }

    // TODO: We are abusing the ProlongAllResult.KEY_LINE_ ... keys here because we
    // do not get information about all the media
    List<Map<String, String>> summaryLines = new ArrayList<>();
    Map<String, String> renewedLine = new HashMap<>();
    renewedLine.put(ProlongAllResult.KEY_LINE_TITLE, matcher.group(1));
    summaryLines.add(renewedLine);

    String notRenewed = matcher.group(2);
    if (notRenewed != null && !notRenewed.equals("")) {
        Map<String, String> notRenewedLine = new HashMap<>();
        notRenewedLine.put(ProlongAllResult.KEY_LINE_TITLE, notRenewed);
        summaryLines.add(notRenewedLine);
    }
    return new ProlongAllResult(MultiStepResult.Status.OK, summaryLines);
}