Example usage for org.jsoup.nodes Document text

List of usage examples for org.jsoup.nodes.Document.text()

Introduction

On this page you can find example usages of org.jsoup.nodes.Document.text().

Prototype

public String text() 

Source Link

Document

Gets the combined text of this element and all its children.

Usage

From source file:Project.FILER.java

/**
 * Builds a short HTML snippet of a stored page around the first occurrence of the query.
 *
 * <p>The page text is lower-cased and split into words on runs of non-letters. The snippet
 * covers the words from 10 before to 20 after the first occurrence of the first query term
 * (or the first words of the page when the term does not occur). Words that equal one of
 * the query terms are wrapped as {@code "<b> word</b>"}; all other words are appended as
 * {@code " word"}.
 *
 * <p>Surrounding double quotes (a phrase query) are accepted but do not change the result:
 * phrase and non-phrase queries are highlighted the same way.
 *
 * @param query  the user query; may be quoted and may be in any letter case
 * @param Doc_id id of the document; the page is read from {@code <Doc_id>.html} in the
 *               hard-coded HTML directory
 * @return the snippet, or an empty string when the page contains no words
 * @throws FileNotFoundException if the page file does not exist
 * @throws IOException           if the page file cannot be read
 */
public static String getDescription(String query, long Doc_id) throws FileNotFoundException, IOException {
    File f = new File("C:\\Users\\user\\workspace\\Ph2\\html\\" + Doc_id + ".html");
    org.jsoup.nodes.Document doc = Jsoup.parse(f, "UTF-8");

    // The leading space keeps an empty token at words[0], so word positions stay the same
    // as in earlier versions of this method.
    String content = (" " + doc.text()).toLowerCase();
    String[] words = content.split("\\P{Alpha}+");

    // Lower-case the query like the page text, and drop the empty token split() produces
    // when the query starts with a non-letter (for example an opening quote).
    java.util.Set<String> queryTerms = new java.util.LinkedHashSet<String>();
    for (String term : query.toLowerCase().split("\\P{Alpha}+")) {
        if (!term.isEmpty()) {
            queryTerms.add(term);
        }
    }

    int index = queryTerms.isEmpty() ? -1 : ArrayUtils.indexOf(words, queryTerms.iterator().next());
    System.out.println("index " + index);

    // Clamp the window to the word array (not to the character length of the text).
    int start = Math.max(0, index - 10);
    int end = Math.min(words.length, index + 20);

    StringBuilder description = new StringBuilder();
    for (int i = start; i < end; i++) {
        String word = words[i];
        if (word.isEmpty()) {
            continue; // artefact of the leading separator, nothing to show
        }
        if (queryTerms.contains(word)) {
            description.append("<b> ").append(word).append("</b>");
        } else {
            description.append(" ").append(word);
        }
    }

    System.out.println("description  " + description);
    return description.toString();
}

From source file:Project.FILER.java

/**
 * Extracts the important strings of an HTML file and refreshes the shared {@code Text} field.
 *
 * <p>{@code Text} is reset and then set to the document text followed by every non-empty
 * image {@code alt} value, each preceded by a single space.
 *
 * @param f the HTML file to parse (read as UTF-8)
 * @return a three-element array: [0] the page title, [1] the text of all heading elements
 *         ordered by level and each followed by a space, [2] all non-empty image
 *         {@code alt} values, each preceded by a space
 * @throws IOException if the file cannot be read
 */
public static String[] Dealing_Files(File f) throws IOException {
    Text = "";
    String[] Importants = { "", "", "" };
    org.jsoup.nodes.Document doc = Jsoup.parse(f, "UTF-8");
    Importants[0] = doc.title();

    // Visit every heading level instead of stopping at the first missing one: a page that
    // has <h2> but no <h1> must still contribute its headings. The upper bound is the one
    // this method has always used.
    StringBuilder allHeaders = new StringBuilder();
    for (int level = 1; level < 20; level++) {
        for (Element header : doc.getElementsByTag("h" + level)) {
            allHeaders.append(header.text()).append(" ");
        }
    }
    Importants[1] = allHeaders.toString();

    StringBuilder pageText = new StringBuilder(" ").append(doc.text());
    StringBuilder altText = new StringBuilder();
    for (Element image : doc.getElementsByTag("img")) {
        String alt = image.attr("alt"); // empty string when the attribute is absent
        if (!alt.isEmpty()) {
            pageText.append(" ").append(alt);
            altText.append(" ").append(alt);
        }
    }
    Text = pageText.toString();
    Importants[2] = altText.toString();
    return Importants;
}

From source file:projectapt.Parser.java

/**
 * Strips non-ASCII characters from the page, then counts every word of the page in
 * {@code WordsInfo} and tags it with an importance marker.
 *
 * <p>Sections are processed in this order, and each section is removed from the parsed
 * document once counted so its words are not counted again as body text:
 * <ol>
 *   <li>{@code <title>} - marker {@code 'T'}, always overrides an existing marker</li>
 *   <li>{@code <header>} and {@code <h1>}-{@code <h6>} - marker {@code 'H'}, never replaces {@code 'T'}</li>
 *   <li>{@code alt} text of every image - marker {@code 'I'}, never replaces {@code 'T'} or {@code 'H'}</li>
 *   <li>remaining text, after dropping script, style, {@code .hidden} and label elements -
 *       marker {@code 'X'}, never replaces {@code 'T'} or {@code 'H'}</li>
 * </ol>
 *
 * @throws IOException declared for callers; nothing in this method performs file or network I/O
 */
void removeTagsAndSetImportance() throws IOException {
    webPageString = webPageString.replaceAll("[^\\x00-\\x7F]", "");
    webPageString_Temp = webPageString;

    Document doc = Jsoup.parse(webPageString);

    countSection(doc, "title", 'T', "");
    countSection(doc, "header", 'H', "T");
    countSection(doc, "h1, h2, h3, h4, h5, h6", 'H', "T");

    // Elements.attr("alt") would only return the value of the first image that has one;
    // eachAttr visits every image so none of the alt texts is lost.
    for (String alt : doc.select("img").eachAttr("alt")) {
        String trimmedAlt = alt.trim();
        if (!trimmedAlt.equals("")) {
            countWords(Jsoup.parse(trimmedAlt).text(), 'I', "TH");
        }
    }
    doc.select("img").remove();

    doc.select("script, style, .hidden, label").remove();
    countWords(doc.text(), 'X', "TH");
}

/** Counts the words inside the elements matching {@code selector}, then removes those elements. */
private void countSection(Document doc, String selector, char importance, String keepLevels) {
    String html = doc.select(selector).html().trim();
    if (html.equals("")) {
        return;
    }
    // Re-parse the inner HTML so nested markup is reduced to plain text.
    countWords(Jsoup.parse(html).text(), importance, keepLevels);
    doc.select(selector).remove();
}

/**
 * Adds every space-separated word of {@code text} to {@code WordsInfo}; a known word only
 * takes the new marker when its current marker is not listed in {@code keepLevels}.
 */
private void countWords(String text, char importance, String keepLevels) {
    for (String token : text.split(" ")) {
        String s = token.trim();
        if (s.isEmpty()) {
            continue; // an empty text splits to [""], which is not a word
        }
        if (WordsInfo.containsKey(s)) {
            Word W = (Word) WordsInfo.get(s);
            W.Count++;
            if (keepLevels.indexOf(W.Importance) < 0) {
                W.Importance = importance;
            }
        } else {
            Word W = new Word();
            W.Importance = importance;
            W.Count++;
            WordsInfo.put(s, W);
        }
    }
}

From source file:reader.ArgumentUnitTCReader.java

/**
 * Loads the JSON corpus from {@code inputFile} and fills {@code texts} and {@code labels}
 * with one entry per argument unit annotated by the configured annotator.
 *
 * <p>For each document, the HTML text is reduced to plain text with jsoup, and every
 * argument unit that matches the recognition pattern is resolved to the character span
 * from its first to its last token. Units that do not match the pattern are logged as a
 * warning and skipped.
 *
 * @param context the UIMA context, passed on to the superclass
 * @throws ResourceInitializationException if the file cannot be read or parsed, or a UIMA
 *                                         error occurs while building the covered text
 */
@Override
public void initialize(UimaContext context) throws ResourceInitializationException {
    super.initialize(context);

    labels = new ArrayList<String>();
    texts = new ArrayList<String>();

    try {
        String inputString = FileUtils.readFileToString(this.inputFile);
        JSONParser jsonParser = new JSONParser();

        @SuppressWarnings("unchecked")
        ArrayList<Map<String, Object>> jsonTexts = new ArrayList<Map<String, Object>>(
                (List<Map<String, Object>>) jsonParser.parse(inputString));

        for (Map<String, Object> jsonData : jsonTexts) {

            @SuppressWarnings("unchecked")
            List<Map<String, Object>> userAnnotations = (List<Map<String, Object>>) jsonData
                    .get(JsonCorpusUtil.USER_ANNOTATIONS);

            for (Map<String, Object> userAnnotation : userAnnotations) {
                String annotationAuthor = (String) userAnnotation.get(JsonCorpusUtil.ANNOTATOR);
                if (!annotationAuthor.equals(this.annotator)) {
                    continue; // annotation belongs to a different annotator
                }

                String htmlText = (String) jsonData.get(JsonCorpusUtil.TEXT);
                String rawDocumentText = Jsoup.parse(htmlText).text();
                Map<Integer, Token> idxToTokenMapping = this.createIndexToTokenMapping(rawDocumentText);

                @SuppressWarnings("unchecked")
                List<String> argUnits = (List<String>) userAnnotation
                        .get(JsonCorpusUtil.ARGUMENTATION_UNITS);

                for (String argUnit : argUnits) {
                    String cleanedArgUnit = argUnit.replaceAll("\\s+", "");
                    Matcher matcher = JsonCorpusUtil.getRecognitionPattern().matcher(cleanedArgUnit);
                    if (!matcher.matches()) {
                        this.getLogger()
                                .warn(String.format(
                                        "argument unit %s does not match the expected pattern %s",
                                        cleanedArgUnit, JsonCorpusUtil.getRecognitionPattern().pattern()));
                        continue;
                    }

                    // Group 1 is the label, group 3 the comma-separated token indices.
                    String label = matcher.group(1);
                    String stringIndices = matcher.group(3).replaceAll("^,", "");
                    List<Integer> indices = CollectionUtils.parseIntList(stringIndices, ",");

                    Token firstToken = idxToTokenMapping.get(Collections.min(indices));
                    Token lastToken = idxToTokenMapping.get(Collections.max(indices));

                    String generalizedLabel = getGeneralizedLabel(label);

                    // A throw-away CAS with a Paragraph annotation is used only to cut the
                    // unit's text out of the document.
                    JCas dummyJCas = JCasFactory.createJCas();
                    dummyJCas.setDocumentText(rawDocumentText);
                    Paragraph para = new Paragraph(dummyJCas, firstToken.getBegin(),
                            lastToken.getEnd());

                    texts.add(para.getCoveredText());
                    labels.add(generalizedLabel);

                    System.out.println("label: " + label + " general label: " + generalizedLabel);
                }
            }
        }
    } catch (final IOException e) {
        throw new ResourceInitializationException(e);
    } catch (final ParseException e) {
        throw new ResourceInitializationException(e);
    } catch (UIMAException e) {
        throw new ResourceInitializationException(e);
    }
    offset = 0;
    System.out.println("number of AUs: " + texts.size());
}

From source file:scrapper.TextBrowser.java

/**
 * Fetches a page, prints and writes its text as a one-cell table headed by the URL, then
 * recursively processes every link on the page that has not been seen before.
 *
 * <p>URLs without an {@code http} prefix get {@code http://} prepended; other URLs that
 * neither start with {@code mainURL} nor contain {@code www} get {@code mainURL} prepended.
 * Link targets that do not contain {@code mainURL} are prefixed with it before being
 * followed. Fetch failures and failures while following a link are logged and skipped.
 *
 * @param eachURL the URL to process; {@code null} or empty is ignored
 */
public static void processEachURL(String eachURL) {
    if (eachURL == null || StringUtils.isEmpty(eachURL)) {
        return;
    }
    try {
        // startsWith("https") implies startsWith("http"), so one prefix test is enough.
        if (!eachURL.startsWith("http")) {
            eachURL = "http://" + eachURL;
        } else if (!eachURL.startsWith(mainURL) && !eachURL.contains("www")) {
            eachURL = mainURL + eachURL;
        }

        Document page = Jsoup.connect(eachURL).get();

        // Re-flow the page text: a line break is inserted once a line exceeds 80 characters.
        StringBuilder wrapped = new StringBuilder();
        int lineLength = 0;
        for (String word : page.text().split(" ")) {
            wrapped.append(word.trim()).append(" ");
            lineLength += word.length() + 1;
            if (lineLength > 80) {
                wrapped.append(System.getProperty("line.separator"));
                lineLength = 0;
            }
        }
        List<String> lines = new ArrayList<>();
        lines.add(wrapped.toString());

        String[] header = new String[] { eachURL };
        String[][] body = new String[][] { lines.toArray(new String[0]) };
        String whatToWrite = FlipTable.of(header, body);
        System.out.println(whatToWrite);

        writer.println(whatToWrite);

        for (Element anchor : page.select("a")) {
            try {
                String target = anchor.attr("href");
                // contains() already covers the startsWith() case.
                if (!target.contains(mainURL)) {
                    target = mainURL + target;
                }
                boolean firstVisit = parsedURL.add(target);
                if (firstVisit) {
                    processEachURL(target);
                }
            } catch (Throwable linkFailure) {
                LOGGER.error(linkFailure.getMessage(), linkFailure);
            }
        }

    } catch (IOException fetchFailure) {
        LOGGER.error(fetchFailure.getMessage(), fetchFailure);
    }
}

From source file:webcralwerproject1.Webcrawler.java

/**
 * Extracts the main text of every crawled page of the current crawl and appends it to one
 * output file.
 *
 * <p>For each regular file in {@code DirectoryName/crawlcount} the page is parsed, page
 * furniture (navigation, head, scripts, forms, footers, ...) is removed, links that are
 * very short or are the only child of their parent are dropped, and the remaining text is
 * written. A {@code <br>} is written after every directory entry. Failures are reported on
 * standard output and end processing; the output file is closed in all cases.
 *
 * @return the path of the output file, or {@code null} when the crawl directory does not exist
 */
public String contentprocessor() {
    File folder = new File(DirectoryName + "/" + crawlcount);
    if (!folder.exists()) {
        return null;
    }
    String contentprocessfile = "./crawler" + crawlcount + "content.html";

    // Applied one after another, in this order: earlier removals can leave a <div> empty,
    // which "div:empty" then picks up.
    String[] boilerplateSelectors = { "nav", "head", "link", "style", "meta", "script", "figure", "img",
            "footer", "input[type = search]", "form", "button", "video", "div:empty", "div#footer",
            "div#id", "div#nav", "div#navigation", "div.footer", "div.header", "li > a[href]" };

    // try-with-resources closes the writer even when parsing or writing fails.
    try (FileWriter f_write = new FileWriter(contentprocessfile, true)) {
        File[] listOfFiles = folder.listFiles();
        if (listOfFiles == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            System.out.println("Inside Contentprocessor" + " could not list " + folder);
            listOfFiles = new File[0];
        }

        for (File file : listOfFiles) {
            if (file.isFile()) {
                Document doc = Jsoup.parse(file, "UTF-8");
                for (String selector : boilerplateSelectors) {
                    doc.select(selector).remove();
                }

                for (Element link : doc.select("body a[href]")) {
                    // Links with almost no content, or that are the only child of their
                    // parent, are treated as navigation rather than page content.
                    boolean tooShort = link.html().length() <= 4;
                    if (tooShort || link.parentNode().childNodeSize() == 1) {
                        link.remove();
                    }
                }
                f_write.write(doc.text());
            }
            f_write.write("<br>");
        }
    } catch (Exception e) {
        System.out.println("Inside Contentprocessor" + e);
    }

    return contentprocessfile;
}

From source file:webindex.integration.DevServerIT.java

/**
 * Smoke test against the dev server: the landing page shows the domain prompt, and the
 * index reports four pages for stackoverflow.com with the expected first entry.
 */
@Test
public void basic() throws Exception {
    String landingPageText = Jsoup.connect("http://localhost:24567/").get().text();
    Assert.assertTrue(landingPageText.contains("Enter a domain to view known webpages in that domain"));

    Pages pages = devServer.getIndexClient().getPages("stackoverflow.com", "", 0);
    Assert.assertEquals(4, pages.getTotal().intValue());

    Pages.PageScore firstPage = pages.getPages().get(0);
    Assert.assertEquals("http://blog.stackoverflow.com/2009/06/attribution-required/", firstPage.getUrl());
    Assert.assertEquals(4, firstPage.getScore().intValue());
}

From source file:won.bot.framework.component.needproducer.impl.MailFileNeedProducer.java

/**
 * Reads a mail file and turns it into a need model.
 *
 * <p>The mail subject becomes the title. The description is the plain-text body when the
 * mail has one, otherwise the text extracted from the HTML body; it is left unset when the
 * mail has neither.
 *
 * @param file the mail file to read
 * @return the built model, or {@code null} when the mail could not be parsed
 * @throws IOException if the file cannot be opened or the stream cannot be closed
 */
@Override
public synchronized Model readNeedFromFile(final File file) throws IOException {
    logger.debug("processing as mail file: {} ", file);
    // try-with-resources closes the stream on every path, including a failure while the
    // builder is being created; a failure to close still propagates to the caller.
    try (FileInputStream fis = new FileInputStream(file)) {
        NeedModelBuilder needModelBuilder = new NeedModelBuilder();
        try {
            MimeMessage emailMessage = new MimeMessage(null, fis);
            MimeMessageParser parser = new MimeMessageParser(emailMessage);
            parser.parse();
            needModelBuilder.setTitle(parser.getSubject());
            String content = null;
            if (parser.hasPlainContent()) {
                content = parser.getPlainContent();
            } else if (parser.hasHtmlContent()) {
                // Reduce the HTML body to its visible text.
                Document doc = Jsoup.parse(parser.getHtmlContent());
                content = doc.text();
            }
            if (content != null) {
                needModelBuilder.setDescription(content);
            }
            logger.debug("mail subject          : {}", parser.getSubject());
            logger.debug("mail has plain content: {}", parser.hasPlainContent());
            logger.debug("mail has html content : {}", parser.hasHtmlContent());
            logger.debug("mail has attachments  : {}", parser.hasAttachments());
            logger.debug("mail plain content    : {}", StringUtils.abbreviate(parser.getPlainContent(), 200));
            logger.debug("mail html content     : {}", StringUtils.abbreviate(parser.getHtmlContent(), 200));
            needModelBuilder.setUri("no:uri");
            return needModelBuilder.build();
        } catch (Exception e) {
            // Unparseable mails are skipped on purpose; the caller receives null.
            logger.debug("could not parse email from file {} ", file, e);
        }
    }
    return null;
}