List of usage examples for the org.jsoup.nodes.Document#text() method
public String text()
From source file:Project.FILER.java
/**
 * Builds a short HTML snippet ("description") for a search hit: a window of
 * words around the first occurrence of the query's first word in the crawled
 * document, with words that occur in the query wrapped in {@code <b>} tags.
 *
 * @param query  search query; may be wrapped in double quotes for a phrase query
 * @param Doc_id id of the crawled document ({@code <Doc_id>.html} in the repo dir)
 * @return snippet with matched words bolded (empty when the document has no words)
 * @throws IOException if the HTML file cannot be read
 */
public static String getDescription(String query, long Doc_id) throws FileNotFoundException, IOException {
    File f = new File("C:\\Users\\user\\workspace\\Ph2\\html\\" + Doc_id + ".html");
    org.jsoup.nodes.Document doc = Jsoup.parse(f, "UTF-8");
    String content = (" " + doc.text()).toLowerCase();

    String[] queryWords = query.split("\\P{Alpha}+");
    String[] words = content.split("\\P{Alpha}+");

    int index = ArrayUtils.indexOf(words, queryWords[0]);
    System.out.println("index " + index);
    if (index < 0) {
        // Query word not found: fall back to the start of the document instead
        // of computing a window around -1.
        index = 0;
    }

    // Window of up to 10 words before and 20 words after the match. BUG FIX:
    // the original clamped `end` against content.length() (character count)
    // while indexing words[] (word count), which could run past the end of
    // the words array; clamp against words.length instead.
    int start = Math.max(0, index - 10);
    int end = Math.min(words.length, index + 20);

    // The original had three duplicated branches (quoted phrase, single word,
    // unquoted multi-word); they differ only in the match test, so they are
    // merged here: single-word queries use exact equality, multi-word queries
    // use containment in the query string.
    boolean singleWord = queryWords.length == 1;
    StringBuilder description = new StringBuilder();
    for (int i = start; i < end; i++) {
        boolean match = singleWord ? words[i].equals(query) : query.indexOf(words[i]) != -1;
        if (match) {
            description.append("<b> ").append(words[i]).append("</b>");
        } else {
            description.append(" ").append(words[i]);
        }
    }
    System.out.println("description " + description);
    return description.toString();
}
From source file:Project.FILER.java
public static String[] Dealing_Files(File f) throws IOException //return array of important strings in the file { Text = ""; String[] Importants = { "", "", "" }; //first element is the title,second is all headers,third is img alt org.jsoup.nodes.Document doc = Jsoup.parse(f, "UTF-8"); Importants[0] = doc.title(); //get the title of the file //Text=Text+" "+doc.title(); String tag = "h"; String All_Headers = ""; Elements Header;/*from w ww . j av a2 s . co m*/ for (int i = 1; i < 20; i++) //loop to get text with headers tag of the file { tag = "h" + String.valueOf(i); Header = doc.select(tag); if (Header.size() > 0) { Header = doc.getElementsByTag(tag); String pConcatenated = ""; for (Element x : Header) { pConcatenated += x.text() + " "; } All_Headers = All_Headers + pConcatenated; } else break; } Importants[1] = All_Headers; Text = Text + " " + doc.text(); //get the text of the document Elements img = doc.getElementsByTag("img"); //get the text with img tag for (Element element : img) { if (element.attr("alt") != null && !(element.attr("alt").equals(""))) { Text = Text + " " + element.attr("alt"); Importants[2] = Importants[2] + " " + element.attr("alt"); } } return Importants; }
From source file:projectapt.Parser.java
/**
 * Tallies word frequencies from the page into {@code WordsInfo}, tagging each
 * word with an importance rank based on where it first carried the most
 * important occurrence: 'T' (title) > 'H' (header/h1-h6) > 'I' (img alt),
 * 'X' (plain body text). Each pass removes its elements from the DOM so later
 * passes do not double-count them.
 *
 * @throws IOException kept for interface compatibility with callers
 */
void removeTagsAndSetImportance() throws IOException {
    // Strip non-ASCII characters; keep a pristine copy for later use.
    webPageString = webPageString.replaceAll("[^\\x00-\\x7F]", "");
    webPageString_Temp = webPageString;
    Document doc = Jsoup.parse(webPageString);

    // Title words always get rank 'T' (no protected ranks).
    tallyHtml(doc.select("title").html(), 'T', "");
    doc.select("title").remove();

    // Header words get 'H' unless already 'T'.
    tallyHtml(doc.select("header").html(), 'H', "T");
    doc.select("header").remove();

    tallyHtml(doc.select("h1, h2, h3, h4, h5, h6").html(), 'H', "T");
    doc.select("h1, h2, h3, h4, h5, h6").remove();

    // BUG FIX: the original used doc.select("img").attr("alt"), which returns
    // only the FIRST matched element's attribute — every image after the first
    // was ignored. Tally each image's alt text individually instead.
    for (Element img : doc.select("img")) {
        tallyHtml(img.attr("alt"), 'I', "TH");
    }
    doc.select("img").remove();

    // Remaining visible body text (scripts, styles, hidden nodes and labels
    // stripped first). NOTE: body words demote 'I' to 'X' because only
    // 'T'/'H' are protected — kept from the original logic.
    doc.select("script, style, .hidden, label").remove();
    for (String s : doc.text().split(" ")) {
        // Body tokens are deliberately NOT trimmed, matching the original.
        tallyWord(s, 'X', "TH");
    }
}

/**
 * Parses an HTML fragment to plain text and tallies each space-separated,
 * trimmed word with the given importance rank. Empty fragments are skipped.
 */
private void tallyHtml(String html, char importance, String protectedRanks) {
    String trimmed = html.trim();
    if (trimmed.equals("")) {
        return;
    }
    String text = Jsoup.parse(trimmed).text();
    for (String s : text.split(" ")) {
        tallyWord(s.trim(), importance, protectedRanks);
    }
}

/**
 * Increments the count for word {@code s} in WordsInfo, upgrading its
 * importance to {@code importance} unless its current rank appears in
 * {@code protectedRanks} (e.g. "TH" protects title/header ranks).
 */
private void tallyWord(String s, char importance, String protectedRanks) {
    if (WordsInfo.containsKey(s)) {
        Word w = (Word) WordsInfo.get(s);
        w.Count++;
        if (protectedRanks.indexOf(w.Importance) < 0) {
            w.Importance = importance;
        }
    } else {
        Word w = new Word();
        w.Importance = importance;
        w.Count++;
        WordsInfo.put(s, w);
    }
}
From source file:reader.ArgumentUnitTCReader.java
/**
 * Reads the JSON corpus file ({@code inputFile}) and fills {@code texts} and
 * {@code labels} with one entry per argument unit annotated by the configured
 * annotator. For each annotation, the HTML document text is flattened with
 * jsoup, token offsets are recovered via createIndexToTokenMapping, and the
 * covered text of the argument unit is extracted through a throwaway JCas.
 *
 * @throws ResourceInitializationException wrapping any I/O, JSON-parse or
 *         UIMA failure
 */
@Override
public void initialize(UimaContext context) throws ResourceInitializationException {
    super.initialize(context);
    // read input file with texts (= argument units) and labels
    labels = new ArrayList<String>();
    texts = new ArrayList<String>();
    Iterator<Map<String, Object>> documentsIterator;
    try {
        String inputString = FileUtils.readFileToString(this.inputFile);
        JSONParser jsonParser = new JSONParser();
        @SuppressWarnings("unchecked")
        ArrayList<Map<String, Object>> jsonTexts = new ArrayList<Map<String, Object>>(
                (List<Map<String, Object>>) jsonParser.parse(inputString));
        documentsIterator = jsonTexts.iterator();
        while (documentsIterator.hasNext()) {
            Map<String, Object> jsonData = documentsIterator.next();
            @SuppressWarnings("unchecked")
            List<Map<String, Object>> userAnnotations = (List<Map<String, Object>>) jsonData
                    .get(JsonCorpusUtil.USER_ANNOTATIONS);
            for (Map<String, Object> userAnnotation : userAnnotations) {
                String annotator = (String) userAnnotation.get(JsonCorpusUtil.ANNOTATOR);
                // Only annotations from the configured annotator are read.
                if (annotator.equals(this.annotator)) {
                    String htmlText = (String) jsonData.get(JsonCorpusUtil.TEXT);
                    // Flatten the HTML to plain text; AU indices refer to this
                    // cleaned text.
                    org.jsoup.nodes.Document cleanedText = Jsoup.parse(htmlText);
                    String rawDocumentText = cleanedText.text();
                    Map<Integer, Token> idxToTokenMapping = this.createIndexToTokenMapping(rawDocumentText);
                    @SuppressWarnings("unchecked")
                    List<String> argUnits = (List<String>) userAnnotation
                            .get(JsonCorpusUtil.ARGUMENTATION_UNITS);
                    for (String argUnit : argUnits) {
                        // Whitespace is stripped before matching the
                        // recognition pattern.
                        String cleanedArgUnit = argUnit.replaceAll("\\s+", "");
                        Matcher matcher = JsonCorpusUtil.getRecognitionPattern().matcher(cleanedArgUnit);
                        if (!matcher.matches()) {
                            this.getLogger()
                                    .warn(String.format(
                                            "argument unit %s does not match the expected pattern %s",
                                            cleanedArgUnit, JsonCorpusUtil.getRecognitionPattern().pattern()));
                        } else {
                            // Coordinates of an argument unit: group(1) is the
                            // label, group(3) the comma-separated token indices
                            // (a leading comma is dropped).
                            String label = matcher.group(1);
                            String stringIndices = matcher.group(3).replaceAll("^,", "");
                            List<Integer> indices = CollectionUtils.parseIntList(stringIndices, ",");
                            int firstIndex = Collections.min(indices);
                            Token firstToken = idxToTokenMapping.get(firstIndex);
                            int lastIndex = Collections.max(indices);
                            Token lastToken = idxToTokenMapping.get(lastIndex);
                            String generalizedLabel = getGeneralizedLabel(label);
                            // Read the argument unit as a dummy Paragraph
                            // annotation to get its covered text.
                            JCas dummyJCas = JCasFactory.createJCas();
                            dummyJCas.setDocumentText(rawDocumentText);
                            Paragraph para = new Paragraph(dummyJCas, firstToken.getBegin(),
                                    lastToken.getEnd());
                            texts.add(para.getCoveredText());
                            labels.add(generalizedLabel);
                            System.out.println("label: " + label + " general label: " + generalizedLabel);
                        } // matching was ok
                    } // for argUnit : argUnits
                } // if annotator.equals(this.annotator)
            } // for user annotation
        } // while hasNext
    } catch (final IOException e) {
        throw new ResourceInitializationException(e);
    } catch (final ParseException e) {
        throw new ResourceInitializationException(e);
    } catch (UIMAException e) {
        throw new ResourceInitializationException(e);
    }
    offset = 0;
    System.out.println("number of AUs: " + texts.size());
}
From source file:scrapper.TextBrowser.java
public static void processEachURL(String eachURL) { if (eachURL == null || StringUtils.isEmpty(eachURL)) { return;/*www . j av a 2 s . co m*/ } try { if (!eachURL.startsWith("http") && !eachURL.startsWith("https")) { eachURL = "http://" + eachURL; } else if (!eachURL.startsWith(mainURL) && !eachURL.contains("www")) { eachURL = mainURL + eachURL; } Document doc = Jsoup.connect(eachURL).get(); String docText = doc.text(); String[] allWords = docText.split(" "); List<String> lines = new ArrayList<>(); StringBuilder eachLine = new StringBuilder(); int length = 0; for (String eachWord : allWords) { eachLine.append(eachWord.trim()); eachLine.append(" "); length += eachWord.length() + 1; if (length > 80) { eachLine.append(System.getProperty("line.separator")); length = 0; } } lines.add(eachLine.toString()); String whatToWrite = FlipTable.of(new String[] { eachURL }, new String[][] { lines.toArray(new String[0]) }); System.out.println(whatToWrite); writer.println(whatToWrite); Elements elts = doc.select("a"); for (Element each : elts) { try { String url = each.attr("href"); if (!url.startsWith(mainURL) && !url.contains(mainURL)) { url = mainURL + url; } if (parsedURL.add(url)) { processEachURL(url); } } catch (Throwable ignore) { LOGGER.error(ignore.getMessage(), ignore); } } } catch (IOException ignore) { LOGGER.error(ignore.getMessage(), ignore); // System.err.println(ignore.getMessage()); } return; }
From source file:webcralwerproject1.Webcrawler.java
public String contentprocessor() { File folder = new File(DirectoryName + "/" + crawlcount); FileWriter f_write = null;/* w ww . j a va 2s .c om*/ Elements p, c = null; String contentprocessfile = "./crawler" + crawlcount + "content.html"; if (!folder.exists()) { } else { try { File[] listOfFiles = folder.listFiles(); f_write = new FileWriter(contentprocessfile, true); //Open repo directory and loop through all files for (File file : listOfFiles) { if (file.isFile()) { File input = new File(file.getAbsolutePath()); Document doc = Jsoup.parse(input, "UTF-8"); String title = doc.select("title").toString(); Elements n = doc.select("nav").remove(); // String d =doc.select("div.id"); doc.select("head").remove(); doc.select("link").remove(); doc.select("style").remove(); doc.select("meta").remove(); doc.select("script").remove(); doc.select("figure").remove(); doc.select("img").remove(); doc.select("footer").remove(); doc.select("input[type = search]").remove(); doc.select("form").remove(); doc.select("button").remove(); doc.select("video").remove(); doc.select("div:empty").remove(); doc.select("div#footer").remove(); doc.select("div#id").remove(); doc.select("div#nav").remove(); doc.select("div#navigation").remove(); doc.select("div.footer").remove(); doc.select("div.header").remove(); doc.select("li > a[href]").remove(); Elements linksOnPage = doc.select("body a[href]"); for (Element link : linksOnPage) { if (link.html() == null) { link.remove();//<a></a> } else if (link.html().length() <= 4) {// does not contains title of the page link.remove(); } else { int child = link.parentNode().childNodeSize(); if (child == 1) {//only element remove link.remove(); } } } f_write.write(doc.text()); } f_write.write("<br>"); } f_write.close(); } catch (Exception e) { System.out.println("Inside Contentprocessor" + e); } return contentprocessfile; } return null; }
From source file:webindex.integration.DevServerIT.java
/**
 * Smoke test: the dev server serves the search landing page and the index
 * client returns the expected page scores for stackoverflow.com.
 */
@Test
public void basic() throws Exception {
    // The landing page should render the search prompt.
    String landingPageText = Jsoup.connect("http://localhost:24567/").get().text();
    Assert.assertTrue(landingPageText.contains("Enter a domain to view known webpages in that domain"));

    // Four pages are indexed for the domain.
    IndexClient client = devServer.getIndexClient();
    Pages pages = client.getPages("stackoverflow.com", "", 0);
    Assert.assertEquals(4, pages.getTotal().intValue());

    // The top-scored result is the attribution blog post with score 4.
    Pages.PageScore topResult = pages.getPages().get(0);
    Assert.assertEquals("http://blog.stackoverflow.com/2009/06/attribution-required/", topResult.getUrl());
    Assert.assertEquals(4, topResult.getScore().intValue());
}
From source file:won.bot.framework.component.needproducer.impl.MailFileNeedProducer.java
@Override public synchronized Model readNeedFromFile(final File file) throws IOException { logger.debug("processing as mail file: {} ", file); FileInputStream fis = new FileInputStream(file); NeedModelBuilder needModelBuilder = new NeedModelBuilder(); try {//from w w w. j a v a 2 s . com MimeMessage emailMessage = new MimeMessage(null, fis); MimeMessageParser parser = new MimeMessageParser(emailMessage); parser.parse(); needModelBuilder.setTitle(parser.getSubject()); String content = null; if (parser.hasPlainContent()) { content = parser.getPlainContent(); } else if (parser.hasHtmlContent()) { Document doc = Jsoup.parse(parser.getHtmlContent()); content = doc.text(); } if (content != null) { needModelBuilder.setDescription(content); } logger.debug("mail subject : {}", parser.getSubject()); logger.debug("mail has plain content: {}", parser.hasPlainContent()); logger.debug("mail has html content : {}", parser.hasHtmlContent()); logger.debug("mail has attachments : {}", parser.hasAttachments()); logger.debug("mail plain content : {}", StringUtils.abbreviate(parser.getPlainContent(), 200)); logger.debug("mail html content : {}", StringUtils.abbreviate(parser.getHtmlContent(), 200)); needModelBuilder.setUri("no:uri"); return needModelBuilder.build(); } catch (Exception e) { logger.debug("could not parse email from file {} ", file, e); } finally { if (fis != null) fis.close(); } return null; }