List of usage examples for org.jsoup.nodes.Document.toString()
public String toString()
From source file:ExtractorContentTest.java
private PCMStatistic computeStatistic(String wikiPageName) throws Exception { WikiPageContentExtractor wikipediaExtractor = new WikiPageContentExtractor(); String content = wikipediaExtractor.getContent(wikiPageName); assertNotNull(content);/*from ww w .jav a 2 s . c o m*/ FileUtils.writeStringToFile(new File("output/" + wikiPageName + ".wikipedia"), content); //System.err.println("content = " + content); WikiTabularExtractor wikiTabExtractor = new WikiTabularExtractor(); //content = "'''Video converters''' are [[computer program]]s" ; String htmlContent = wikiTabExtractor.run(content, "" + wikiPageName); assertNotNull(htmlContent); //Document doc = Jsoup.connect("http://en.wikipedia.org/w/index.php?title=" + wikiPageName).get(); Document doc = Jsoup.parse(htmlContent); FileUtils.writeStringToFile(new File("output/" + wikiPageName + ".html"), doc.toString()); //Element docContentEntryPoint = doc ; // doc.getElementsByClass("article-content").first(); //Elements sections = docContentEntryPoint.getElementsByClass("section") ; // FIXME what about no section ? 
//treatSection(doc.body()); Elements tabs = doc.select("table"); List<Catalog> catalogs = new ArrayList<Catalog>(); for (Element section : tabs) { treatTable(section, catalogs); } Collection<CatalogStat> catalogStats = new ArrayList<CatalogStat>(); for (Catalog catalog : catalogs) { int nHeaders = catalog.getHeaders().size(); int nProduct = catalog.size(); CatalogStat catalogStat = new CatalogStat(); catalogStat.setNHeaders(nHeaders); catalogStat.setNProduct(nProduct); // analyze each product and all values int nUncertain = 0; int nBoolean = 0; int nEmpty = 0; int nMulti = 0; int nSingleV = 0; int nUnknowns = 0; int nConstrained = 0; for (Product product : catalog) { Collection<String> values = product.getAllValues(); for (String val : values) { if (VariabilityPatternsUtils.isUncertain(val)) { nUncertain++; } else if (VariabilityPatternsUtils.isYes(val) || VariabilityPatternsUtils.isNot(val)) { // pattern #1 nBoolean++; } else if (VariabilityPatternsUtils.isBlanked(val)) { // pattern #6 nEmpty++; } else if (VariabilityPatternsUtils.isMultiValues(val)) { // pattern #4 nMulti++; } else if (VariabilityPatternsUtils.isUnknowns(val)) { // pattern #5 nUnknowns++; } else if (VariabilityPatternsUtils.isConstrained(val)) { // pattern #2 nConstrained++; } else { // pattern #3 nSingleV++; } } } catalogStat.setnConstrained(nConstrained); catalogStat.setnUnknowns(nUnknowns); catalogStat.setnSingleV(nSingleV); catalogStat.setnMultiValues(nMulti); catalogStat.setnEmpty(nEmpty); catalogStat.setnBooleans(nBoolean); catalogStat.setnUncertains(nUncertain); catalogStats.add(catalogStat); } int nTable = catalogs.size(); return new PCMStatistic(nTable, catalogStats); }
From source file:ExtractorContentTest.java
private FeatureModelVariable executeWikipediaToFML(String wikiPageName, String[] excludeColumnNames, String[] excludeProductNames, String[] excludeSectionNames, Map<String, String> renamings) throws Exception { WikiPageContentExtractor wikipediaExtractor = new WikiPageContentExtractor(); String content = wikipediaExtractor.getContent(wikiPageName); assertNotNull(content);/*from www . jav a2 s .c o m*/ //System.err.println("content = " + content); WikiTabularExtractor wikiTabExtractor = new WikiTabularExtractor(); //content = "'''Video converters''' are [[computer program]]s" ; String htmlContent = wikiTabExtractor.run(content, "video"); assertNotNull(htmlContent); //Document doc = Jsoup.connect("http://en.wikipedia.org/w/index.php?title=" + wikiPageName).get(); Document doc = Jsoup.parse(htmlContent); FileUtils.writeStringToFile(new File("output/" + wikiPageName + ".html"), doc.toString()); //Element docContentEntryPoint = doc ; // doc.getElementsByClass("article-content").first(); //Elements sections = docContentEntryPoint.getElementsByClass("section") ; // FIXME what about no section ? 
//treatSection(doc.body()); Elements tabs = doc.select("table"); List<Catalog> catalogs = new ArrayList<Catalog>(); for (Element section : tabs) { treatTable(section, catalogs); } /*for (Element section : sections) { treatSection (section, catalogs); }*/ // set the "ID" / names // clean up // FIXME here it is specific for (Catalog catalog : catalogs) { for (String columnName : excludeColumnNames) { if (!catalog.hasHeader(columnName)) continue; if (!catalog.removeColumn(columnName)) { System.err.println("Unable to remove the column " + columnName); } } } Set<String> excludeProductIDs = new HashSet<String>(Arrays.asList(excludeProductNames)); Set<String> excludeSections = new HashSet<String>(Arrays.asList(excludeSectionNames)); List<FeatureModelVariable> fmvs = new ArrayList<FeatureModelVariable>(); for (Catalog catalog : catalogs) { String catalogName = catalog.getName(); if (excludeSections.contains(catalogName)) continue; System.err.println("***" + catalogName + "****"); /* if (!catalog.getName().equals("General information")) continue ; */ for (Product product : catalog) { FeatureModelVariable fmv = product.toFeatureDiagram(); /* * POST */ // renaming Set<String> oFts = renamings.keySet(); // features to rename for (String oFt : oFts) { fmv.renameFeature(oFt, renamings.get(oFt)); } String id = fmv.getIdentifier(); if (!excludeProductIDs.contains(id)) fmvs.add(fmv); } //System.err.println("\n\nfmvs=" + fmvs); } List<FeatureModelVariable> fmvsToMerge = new ArrayList<FeatureModelVariable>(); if (catalogs.size() == 1) { fmvsToMerge = fmvs; } // aggregate feature models with same identifiers when there are numerous catalogs (dimensions) else { Set<String> idsDone = new HashSet<String>(); for (FeatureModelVariable fmv : fmvs) { String id1 = fmv.getIdentifier(); if (idsDone.contains(id1)) continue; //System.err.println("Aggregating..." 
+ id1) ; // + " = " + fmv); List<FeatureModelVariable> toAggreagte = new ArrayList<FeatureModelVariable>(); for (FeatureModelVariable fmv2 : fmvs) { String id2 = fmv2.getIdentifier(); if (id1.equals(id2)) { toAggreagte.add(fmv2); } } if (!toAggreagte.isEmpty()) { fmvsToMerge.add(new AggregatorFM().build(toAggreagte, new HashSet<Expression<String>>(), _interop(wikiPageName))); } else { System.err.println("Didn't find another for " + id1); continue; } idsDone.add(id1); } } // serialize product by product (for debug) StringBuffer sb = new StringBuffer(); int i = 0; for (FeatureModelVariable fmv : fmvsToMerge) { sb.append("fmProduct" + i++ + " = FM (" + fmv + "\n)\n\n"); } File f = new File(OUTPUT_DIRECTORY + wikiPageName + "_FMLMergingScript" + ".fml"); FileUtils.writeStringToFile(f, sb.toString()); FMLMergerBDD fmlMerger = new FMLMergerBDD(fmvsToMerge, _builder); // FeatureModelVariable fmMerged = null; _shell.setVerbose(true); boolean _SAT_EVALUATION = false; if (_SAT_EVALUATION) { fmMerged = new FMLMergerDisjunctiveSAT(fmvsToMerge).union(); fmMerged.setIdentifier(wikiPageName); return fmMerged; } boolean _SAT_EVALUATION_2 = false; if (_SAT_EVALUATION_2) { Collection<Expression<String>> exprs = new TseitinTransformationDisjunctive( fmvsToMerge.toArray(new FeatureModelVariable[] {})).compute(); //new TseitinTransformation(_z3, b12).compute(); //System.err.println("exprs:" + exprs); // SMT bridges System.err.println("" + new FeatureModelVariableSATFormula("", new SATFMLFormula(ExpressionUtility.mkConjunction(exprs))).computeImplicationGraph()); return null; } //Formula<String> flaMerged = fmlMerger.calculateFormula(Mode.StrictUnion); //System.err.println("#fla=" + flaMerged.getDomain().size()); fmMerged = fmlMerger.union(new KSynthesisConfiguration() { @Override public boolean isAddingCrossTreeConstraints() { return false; //false; } @Override public boolean hasOrGroupSupport() { return false; } }); // post-process: mandatory status for for (Catalog catalog : 
catalogs) { String catalogName = catalog.getName(); if (excludeSections.contains(catalogName)) continue; if (fmMerged.features().names().contains(catalogName)) { fmMerged.setMandatory(fmMerged.getFeature(catalogName)); // fmMerged.addConstraint(new Expression<String>(catalogName)); fmMerged.getFormula() .andWith(new Formula<String>(_builder.mkExpression(new Expression<String>(catalogName)), Arrays.asList(catalogName), _builder)); } } fmMerged.setIdentifier(wikiPageName); return fmMerged; }
From source file:com.pagecrumb.proxy.util.filter.HtmlProxyTransformParser.java
public HtmlProxyTransformParser(String html, final String targetServer) throws ParserException { log.debug("Creating Html Parser Object."); // TODO Using targetServer directly is dangerous // because there might be URL passed which is not absolute URL. // its either the URL is decoded using some utilities to get // absolute domain this.targetServer = targetServer; log.info(this.getClass().toString() + " " + "Requested URL: " + this.targetServer); NodeVisitor linkvisitor = new NodeVisitor() { @Override/*from w w w. ja v a 2 s.co m*/ public void visitTag(Tag tag) { String name = tag.getTagName(); if ("link".equalsIgnoreCase(name)) { String hrefValue = tag.getAttribute("href"); if (hrefValue != null && !hrefValue.startsWith("/proxy")) { if (hrefValue.startsWith("http://") || hrefValue.startsWith("https://")) { // add more protocols later log.info("Rewriting with targetServer: " + hrefValue); hrefValue = hostServlet + hrefValue; } if (!hrefValue.startsWith("http://") || !hrefValue.startsWith("https://") && !hrefValue.startsWith("/proxy")) { // add more protocols later if (!hrefValue.startsWith("/")) { hrefValue = "/" + hrefValue; } if (hrefValue.startsWith("/") && !hrefValue.startsWith("/proxy")) { log.info("Rewriting with targetServer: " + hrefValue); hrefValue = hostServlet + targetServer + hrefValue; } hrefValue = hrefValue.replaceAll("&", "&"); tag.setAttribute("href", hrefValue); log.debug("hrefValue=" + hrefValue); } } } /** * Anchor */ if ("a".equalsIgnoreCase(name)) { String hrefValue = tag.getAttribute("href"); if (hrefValue != null && !hrefValue.startsWith("/proxy")) // Prevent over re-writing the proxy strings { log.debug("hrefValue=" + hrefValue); if (hrefValue.startsWith("//")) { hrefValue = "http:" + hrefValue; } if (hrefValue.startsWith("http://") || hrefValue.startsWith("https://")) { // add more protocols later log.info("Rewriting with targetServer: " + hrefValue); hrefValue = hostServlet + hrefValue; } // TODO Check if the href value is just a 
filename e.g "home.html" /** * Mail Protocol */ else if (hrefValue.startsWith("mailto:")) { } /** * HTTP Protocol */ else if (!hrefValue.startsWith("http://") || !hrefValue.startsWith("https://") && !hrefValue.startsWith("/proxy")) { // add more protocols later // TODO Must run hrefValue in malformed URL fix, to fix problems with the URL // i.e. "double slash" http://127.0.0.1:8888/proxy?http://localhost:8080/docs//introduction.html // reason could be that target server end with "/" if (!hrefValue.startsWith("/")) { hrefValue = "/" + hrefValue; hrefValue = hostServlet + targetServer + hrefValue; } else if (hrefValue.startsWith("/") && !hrefValue.startsWith("/proxy")) { log.info("Rewriting with targetServer: " + hrefValue); hrefValue = hostServlet + targetServer + hrefValue; } } hrefValue = hrefValue.replaceAll("&", "&"); tag.setAttribute("href", hrefValue); log.debug("hrefValue=" + hrefValue); } } // TODO hostServletNoFilter is intended to be used for // non page documents, like .js or .css // this way it will not run through the filter URL rewriting if ("script".equalsIgnoreCase(name)) { String srcValue = tag.getAttribute("src"); if (srcValue != null && !srcValue.startsWith("/")) { srcValue = "/" + srcValue; srcValue = /*hostServer +*/ hostServlet + targetServer + srcValue; } if (srcValue != null && srcValue.startsWith("//")) { // special case (see YouTube) //srcValue = /*hostServer +*/ hostServletNoFilter + targetServer + srcValue; srcValue = "http:" + srcValue; srcValue = hostServlet + srcValue; } if (srcValue != null) { tag.setAttribute("src", srcValue); } log.debug("srcValue=" + srcValue); } if ("form".equalsIgnoreCase(name)) { String actionValue = tag.getAttribute("action"); if (actionValue != null && !actionValue.startsWith("/")) { actionValue = "/" + actionValue; actionValue = hostServlet + targetServer + actionValue; } if (actionValue != null && actionValue.startsWith("/")) { actionValue = hostServlet + targetServer + actionValue; } if (actionValue != 
null) { tag.setAttribute("action", actionValue); } log.debug("actionValue=" + actionValue); } /** * Get javascripts */ if ("script".equalsIgnoreCase(name)) { ScriptTag script = (ScriptTag) tag; if (script != null) { //String text = script.getStringText(); //final AstNode astRoot = new org.mozilla.javascript.Parser().parse(text, "", 1); //log.info("Script_from_parser="+astRoot.toSource()); //log.info("script="+text); // Parse the script? based on the documented activities. } } if ("img".equalsIgnoreCase(name)) { String srcValue = tag.getAttribute("src"); if (srcValue != null && !srcValue.startsWith("/proxy")) { if (srcValue.startsWith("http://") || srcValue.startsWith("https://")) { // add more protocols later log.info("Rewriting with targetServer: " + srcValue); srcValue = hostServlet + srcValue; } if (!srcValue.startsWith("http://") || !srcValue.startsWith("https://") && !srcValue.startsWith("/proxy")) { // add more protocols later if (!srcValue.startsWith("/")) { srcValue = "/" + srcValue; srcValue = hostServlet + targetServer + srcValue; } if (srcValue.startsWith("/") && !srcValue.startsWith("/proxy")) { log.info("Rewriting with targetServer: " + srcValue); srcValue = hostServlet + targetServer + srcValue; } srcValue = srcValue.replaceAll("&", "&"); tag.setAttribute("src", srcValue); log.debug("srcValue=" + srcValue); } } } } }; Parser parser = new Parser(html, null); NodeList nl = parser.parse(null); nl.visitAllNodesWith(linkvisitor); this.html = nl.toHtml(); Document doc = Jsoup.parse(this.html); //Element bScriptElement = new Element(org.jsoup.parser.Tag.valueOf("script"), ""); //bScriptElement.attr("src", "/browz.js"); //bScriptElement.attr("type", "text/javascript"); //bScriptElement.attr("language", "javascript"); Element jqEl = new Element(org.jsoup.parser.Tag.valueOf("script"), ""); jqEl.attr("src", "/jquery.min.js"); jqEl.attr("type", "text/javascript"); jqEl.attr("language", "javascript"); Element bzEl = new 
Element(org.jsoup.parser.Tag.valueOf("script"), ""); bzEl.attr("src", "/browz.js"); bzEl.attr("type", "text/javascript"); bzEl.attr("language", "javascript"); //doc.select("head").first().children().first().before("<script type=\"text/javascript\" language=\"javascript\">" // + readFileAsString("browz.js") + "</script>"); // Important! Removed to satisfy error, must be reviewed // doc.select("head").first().children().first().before(bzEl); // doc.select("head").first().children().first().before(jqEl); /* for (Element el : doc.getElementsByTag("html")) { jqEl = el.appendElement("script"); jqEl.attr("src", "/jquery.min.js"); jqEl.attr("type", "text/javascript"); jqEl.attr("language", "javascript"); bzEl = el.appendElement("script"); bzEl.attr("src", "/browz.js"); bzEl.attr("type", "text/javascript"); bzEl.attr("language", "javascript"); } */ this.html = doc.toString(); }
From source file:tr.edu.gsu.nerwip.retrieval.reader.wikipedia.WikipediaReader.java
/** * Loads the html source code from the cached file, * or fetches it from the web server if needed. * //from w ww . j av a2 s .com * @param name * Name of the concerned article. * @param url * URL of the concerned article. * @return * The DOM representation of the original page. * * @throws IOException * Problem while accessing the cache or web page. */ private Document retrieveSourceCode(String name, URL url) throws IOException { Document result = null; logger.increaseOffset(); logger.log("Retrieve HTML source code"); // check if the cache can/must be used String folderPath = FileNames.FO_OUTPUT + File.separator + name; File originalFile = new File(folderPath + File.separator + FileNames.FI_ORIGINAL_PAGE); if (cache && originalFile.exists()) { logger.log("Cache enabled and HTML already retrieved >> we use the cached file (" + originalFile.getName() + ")"); String sourceCode = FileTools.readTextFile(originalFile); result = Jsoup.parse(sourceCode); } // otherwise, load and cache the html file else { logger.log("Cache disabled or HTML never retrieved before>> we get it from the web server"); // use custom page loader // String sourceCode = manuallyReadUrl(url); // System.out.println(sourceCode.toString()); // result = new Source(sourceCode); // use jericho page loader int timeOut = 5000; result = Jsoup.parse(url, timeOut); String sourceCode = result.toString(); // cache html source code FileTools.writeTextFile(originalFile, sourceCode); } //System.out.println(source.toString()); logger.decreaseOffset(); return result; }
From source file:net.acesinc.convergentui.ConvergentUIResponseFilter.java
@Override public Object run() { String origBody = contentManager.getDownstreamResponse(); if (origBody == null || origBody.isEmpty()) { return null; }/* w ww . j a v a 2 s . co m*/ String composedBody = null; log.trace("Response from downstream server: " + origBody); Document doc = Jsoup.parse(origBody); if (hasReplaceableElements(doc)) { log.debug("We have replaceable elements. Let's get em!"); Elements elementsToUpdate = doc.select("div[data-loc]"); for (Element e : elementsToUpdate) { StringBuilder content = new StringBuilder(); String location = e.dataset().get("loc"); String fragmentName = e.dataset().get("fragment-name"); String cacheName = e.dataset().get("cache-name"); boolean useCaching = !Boolean.valueOf(e.dataset().get("disable-caching")); boolean failQuietly = Boolean.valueOf(e.dataset().get("fail-quietly")); URL url = null; try { url = new URL(location); String protocol = url.getProtocol(); String service = url.getHost(); log.debug("Fetching content at location [ " + location + " ] with cacheName = [ " + cacheName + " ]"); try { RequestContext context = RequestContext.getCurrentContext(); ContentResponse response = contentManager.getContentFromService(location, cacheName, useCaching, context); log.trace(response.toString()); if (!response.isError()) { Object resp = response.getContent(); if (String.class.isAssignableFrom(resp.getClass())) { String subContentResponse = (String) resp; //TODO You better trust the source of your downstream HTML! // String cleanedContent = Jsoup.clean(subContentResponse, Whitelist.basic()); //this totally stripped the html out... 
Document subDocument = Jsoup.parse(subContentResponse); if (fragmentName != null) { Elements fragments = subDocument .select("div[data-fragment-name=\"" + fragmentName + "\"]"); if (fragments != null && fragments.size() > 0) { if (fragments.size() == 1) { Element frag = fragments.first(); //need to see if there are images that we need to replace the urls on Elements images = frag.select("img"); for (Element i : images) { String src = i.attr("src"); if (src.startsWith("/") && !src.startsWith("//")) { i.attr("src", "/cui-req://" + protocol + "://" + service + src); } //else what do we do about relative urls? } content.append(frag.toString()); } else { for (Element frag : fragments) { content.append(frag.toString()).append("\n\n"); } } } else { log.debug("Found no matching fragments for [ " + fragmentName + " ]"); if (failQuietly) { content.append("<div class='cui-error'></div>"); } else { content.append( "<span class='cui-error'>Failed getting content from remote service. Possible reason in reponse below</span>"); content.append(subDocument.toString()); } } } else { //take the whole thing and cram it in there! content.append(subDocument.toString()); } } else { //not text... if (!failQuietly) { content.append( "<span class='cui-error'>Failed getting content from remote service. Reason: content was not text</span>"); } else { content.append("<div class='cui-error'></div>"); } } } else { if (!failQuietly) { content.append( "<span class='cui-error'>Failed getting content from remote service. Reason: " + response.getMessage() + "</span>"); } else { content.append("<div class='cui-error'></div>"); } } //now append it to the page if (!content.toString().isEmpty()) { e.html(content.toString()); } } catch (Throwable t) { if (!failQuietly) { e.html("<span class='cui-error'>Failed getting content from remote service. 
Reason: " + t.getMessage() + "</span>"); } log.warn("Failed replacing content", t); } } catch (MalformedURLException ex) { log.warn("location was invalid: [ " + location + " ]", ex); if (!failQuietly) { content.append( "<span class='cui-error'>Failed getting content from remote service. Reason: data-loc was an invalid location.</span>"); } else { content.append("<div class='cui-error'></div>"); } } } composedBody = doc.toString(); } else { log.debug("Document has no replaeable elements. Skipping"); } try { addResponseHeaders(); if (composedBody != null && !composedBody.isEmpty()) { writeResponse(composedBody, getMimeType(RequestContext.getCurrentContext())); } else { writeResponse(origBody, getMimeType(RequestContext.getCurrentContext())); } } catch (Exception ex) { log.error("Error sending response", ex); } return null; }
From source file:tr.edu.gsu.nerwip.retrieval.reader.wikipedia.WikipediaReader.java
/** * Pulls a text from a Wikipedia URL without images, tags, etc. * /* w w w . jav a 2 s . c o m*/ * @param url * Address of the targetted text. * @return * An Article object representing the retrieved object. * * @throws ReaderException * Problem while retrieving the text. */ @Override public Article read(URL url) throws ReaderException { Article result = null; String name = getName(url); try { // get the page String address = url.toString(); logger.log("Retrieving page " + address); long startTime = System.currentTimeMillis(); Document document = retrieveSourceCode(name, url); // get its title Element firstHeadingElt = document.getElementsByAttributeValue(XmlNames.ATT_ID, ID_TITLE).get(0); String title = firstHeadingElt.text(); logger.log("Get title: " + title); // get raw and linked texts logger.log("Get raw and linked texts."); StringBuilder rawStr = new StringBuilder(); StringBuilder linkedStr = new StringBuilder(); Element bodyContentElt = document.getElementsByAttributeValue(XmlNames.ATT_ID, ID_CONTENT).get(0); // processing each element in the content part boolean ignoringSection = false; boolean first = true; for (Element element : bodyContentElt.children()) { String eltName = element.tag().getName(); String eltClass = element.attr(XmlNames.ATT_CLASS); // section headers if (eltName.equals(XmlNames.ELT_H2)) { first = false; // get section name StringBuilder fakeRaw = new StringBuilder(); StringBuilder fakeLinked = new StringBuilder(); processParagraphElement(element, fakeRaw, fakeLinked); String str = fakeRaw.toString().trim().toLowerCase(Locale.ENGLISH); // check section name if (IGNORED_SECTIONS.contains(str)) ignoringSection = true; else { ignoringSection = false; rawStr.append("\n-----"); linkedStr.append("\n-----"); processParagraphElement(element, rawStr, linkedStr); } } else if (!ignoringSection) { // lower sections if (eltName.equals(XmlNames.ELT_H3) || eltName.equals(XmlNames.ELT_H4) || eltName.equals(XmlNames.ELT_H5) || 
eltName.equals(XmlNames.ELT_H6)) { first = false; processParagraphElement(element, rawStr, linkedStr); } // paragraph else if (eltName.equals(XmlNames.ELT_P)) { String str = element.text(); // ignore possible initial disambiguation link if (!first || !str.startsWith(PARAGRAPH_FORTHE)) { first = false; processParagraphElement(element, rawStr, linkedStr); } } // list else if (eltName.equals(XmlNames.ELT_UL)) { first = false; processListElement(element, rawStr, linkedStr, false); } else if (eltName.equals(XmlNames.ELT_OL)) { first = false; processListElement(element, rawStr, linkedStr, true); } else if (eltName.equals(XmlNames.ELT_DL)) { first = false; processDescriptionListElement(element, rawStr, linkedStr); } // tables else if (eltName.equals(XmlNames.ELT_TABLE)) { first = !processTableElement(element, rawStr, linkedStr); } // divisions else if (eltName.equals(XmlNames.ELT_DIV)) { // ignore possible initial picture if (!first || eltClass == null || !eltClass.contains(CLASS_THUMB)) first = !processDivisionElement(element, rawStr, linkedStr); } // we ignore certain types of span (phonetic trancription, WP buttons...) 
else if (eltName.equals(XmlNames.ELT_SPAN)) { first = !processSpanElement(element, rawStr, linkedStr); } // hyperlinks must be included in the linked string, provided they are not external else if (eltName.equals(XmlNames.ELT_A)) { first = !processHyperlinkElement(element, rawStr, linkedStr); } // quotes are just processed recursively else if (eltName.equals(XmlNames.ELT_BLOCKQUOTE)) { first = !processQuoteElement(element, rawStr, linkedStr); } // other tags are ignored } } // create article object result = new Article(name); result.setTitle(title); result.setUrl(url); result.initDate(); // clean text String rawText = rawStr.toString(); rawText = cleanText(rawText); // rawText = ArticleCleaning.replaceChars(rawText); result.setRawText(rawText); logger.log("Length of the raw text: " + rawText.length() + " chars."); String linkedText = linkedStr.toString(); linkedText = cleanText(linkedText); // linkedText = ArticleCleaning.replaceChars(linkedText); result.setLinkedText(linkedText); logger.log("Length of the linked text: " + linkedText.length() + " chars."); // get original html source code logger.log("Get original HTML source code."); String originalPage = document.toString(); result.setOriginalPage(originalPage); logger.log("Length of the original page: " + originalPage.length() + " chars."); // get the categories of the article List<ArticleCategory> categories = getArticleCategories(result); result.setCategories(categories); long endTime = System.currentTimeMillis(); logger.log("Total duration: " + (endTime - startTime) + " ms."); } catch (ClientProtocolException e) { e.printStackTrace(); } catch (ParseException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } catch (org.json.simple.parser.ParseException e) { e.printStackTrace(); } return result; }
From source file:moose.com.ac.ArticleViewActivity.java
private void filterImg(String str) { Document mDocument = Jsoup.parse(str); Elements imgs = mDocument.select("img"); for (int imgIndex = 0; imgIndex < imgs.size(); imgIndex++) { Element img = imgs.get(imgIndex); String src = img.attr("src").trim(); if (TextUtils.isEmpty(src)) continue; Uri parsedUri = Uri.parse(src);//from w ww .jav a2 s. co m if ("file".equals(parsedUri.getScheme())) continue; if (parsedUri.getPath() == null) continue; if (!"http".equals(parsedUri.getScheme())) { parsedUri = parsedUri.buildUpon().scheme("http").authority("www.acfun.tv").build(); } // url may have encoded path parsedUri = parsedUri.buildUpon().path(parsedUri.getPath()).build(); src = parsedUri.toString(); Log.i(TAG, "image src:" + src); img.attr("org", src); if (CommonUtil.getMode() == 1 && !CommonUtil.isWifiConnected(mContext)) {// Log.i(TAG, "[?]"); img.after("<div style=\"width: 100%;text-align: center;\"><br><p>[]</p></div>"); } else { Log.i(TAG, "[?]"); StringBuilder builder = new StringBuilder(); builder.append("<div style='text-align: center;'><br>") .append("<img src='file:///android_asset/loading.gif'").append("name = '").append(src) .append("'\n;onclick = window.JsBridge.showImage('").append(src).append("')") .append(" alt=' '/>\n").append("</div>"); img.after(builder.toString()); Log.i(TAG, "image:table:-" + builder.toString()); } /*if (CommonUtil.getMode() == 1 && !CommonUtil.isWifiConnected(mContext)) { img.after("<p >[]</p>"); } else if (!src.contains(Config.AC_EMOTION)) { StringBuilder builder = new StringBuilder(); builder.append("<div style=\"width: 100%;text-align: center;\"><br><img src=\"") .append(src) .append("\" width=: 100%;height:auto\"") .append(" alt=\" \"/>\n") .append("</div>"); Log.i(TAG, "index image:" + builder.toString()); img.after(builder.toString()); } else { img.after("<img src=\"" + src + "\" alt=\" \"/>\n"); }*/ img.remove(); //img.removeAttr("style"); HtmlBody = mDocument.toString(); Log.i(TAG, "??html:" + HtmlBody); } }
From source file:org.craftercms.social.migration.controllers.MainController.java
protected void getHtml(final FileWriter writer) throws TransformerException, IOException { final URL in = getClass().getResource( MigrationTool.systemProperties.getString("crafter" + ".migration" + "" + ".loggerTemplate")); if (in == null) { log.error("Unable to find {} " + MigrationTool.systemProperties.getString("crafter" + ".migration" + "" + ".loggerTemplate")); }/*from ww w .j a va 2 s .c om*/ final Document loggingDoc = Jsoup.parse(IOUtils.toString(in)); final Element logs = loggingDoc.getElementById("logs"); for (Object o : logTable.getItems()) { if (o instanceof UserLogEntry) { UserLogEntry userLogEntry = (UserLogEntry) o; String dateFormat = new SimpleDateFormat("yyyy MM dd hh:mm:ss zzz").format(userLogEntry.getDate()); final Element tr = loggingDoc.createElement("tr"); tr.attr("class", userLogEntry.getLevel().getCssClass()); final Element tmigrator = loggingDoc.createElement("td"); final Element tdate = loggingDoc.createElement("td"); final Element tmessage = loggingDoc.createElement("td"); tmessage.attr("class", "text-center"); tmessage.text(userLogEntry.getMessage()); tdate.text(dateFormat); tmigrator.text(userLogEntry.getSource()); tr.appendChild(tmigrator); tr.appendChild(tdate); tr.appendChild(tmessage); logs.appendChild(tr); } } IOUtils.write(loggingDoc.toString(), writer); // Transformer transformer = TransformerFactory.newInstance().newTransformer(); // transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no"); // transformer.setOutputProperty(OutputKeys.METHOD, "xml"); // transformer.setOutputProperty(OutputKeys.INDENT, "yes"); // transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8"); // transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4"); // transformer.transform(new DOMSource(loggingDoc), new StreamResult(writer)); writer.flush(); writer.close(); }
From source file:org.jsweet.input.typescriptdef.visitor.DocFiller.java
/**
 * Manual check: prints the result of {@code removeTags} on a sample sentence, then the
 * jsoup rendering of the content returned by {@code getTypeContent} for the "mdn"
 * documentation of {@code Array}.
 *
 * @param args unused
 */
public static void main(String[] args) {
    System.out.println(removeTags("Ceci est un test <a href=\"tutu\">slurp</a> hop <code>arlgs</code>.",
            new String[] { "a", "body" }));
    String content = getTypeContent(null, "mdn", JSweetDefTranslatorConfig.LANG_PACKAGE, "Array");
    // The second parameter of Jsoup.parse(String, String) is the base URI, not a charset,
    // so "UTF-8" was being used as base URI; content is already a String and needs none.
    Document doc = Jsoup.parse(content);
    System.out.println(doc.toString());
}
From source file:org.tinymediamanager.scraper.ofdb.OfdbMetadataProvider.java
@Override public List<MediaTrailer> getTrailers(MediaScrapeOptions options) throws Exception { LOGGER.debug("getTrailers() " + options.toString()); List<MediaTrailer> trailers = new ArrayList<>(); if (!MetadataUtil.isValidImdbId(options.getImdbId())) { LOGGER.debug("IMDB id not found"); return trailers; }//w w w. j a v a 2 s . com /* * function getTrailerData(ci) { switch (ci) { case 'http://de.clip-1.filmtrailer.com/9507_31566_a_1.flv?log_var=72|491100001 -1|-' : return * '<b>Trailer 1</b><br><i>(small)</i><br><br>» 160px<br><br>Download:<br>» <a href= * "http://de.clip-1.filmtrailer.com/9507_31566_a_1.wmv?log_var=72|491100001-1|-" >wmv</a><br>'; case * 'http://de.clip-1.filmtrailer.com/9507_31566_a_2.flv?log_var=72|491100001 -1|-' : return '<b>Trailer 1</b><br><i>(medium)</i><br><br>» * 240px<br><br>Download:<br>» <a href= "http://de.clip-1.filmtrailer.com/9507_31566_a_2.wmv?log_var=72|491100001-1|-" >wmv</a><br>'; case * 'http://de.clip-1.filmtrailer.com/9507_31566_a_3.flv?log_var=72|491100001 -1|-' : return '<b>Trailer 1</b><br><i>(large)</i><br><br>» * 320px<br><br>Download:<br>» <a href= "http://de.clip-1.filmtrailer.com/9507_31566_a_3.wmv?log_var=72|491100001-1|-" >wmv</a><br>» * <a href= "http://de.clip-1.filmtrailer.com/9507_31566_a_3.mp4?log_var=72|491100001-1|-" >mp4</a><br>» <a href= * "http://de.clip-1.filmtrailer.com/9507_31566_a_3.webm?log_var=72|491100001-1|-" >webm</a><br>'; case * 'http://de.clip-1.filmtrailer.com/9507_31566_a_4.flv?log_var=72|491100001 -1|-' : return '<b>Trailer 1</b><br><i>(xlarge)</i><br><br>» * 400px<br><br>Download:<br>» <a href= "http://de.clip-1.filmtrailer.com/9507_31566_a_4.wmv?log_var=72|491100001-1|-" >wmv</a><br>» * <a href= "http://de.clip-1.filmtrailer.com/9507_31566_a_4.mp4?log_var=72|491100001-1|-" >mp4</a><br>» <a href= * "http://de.clip-1.filmtrailer.com/9507_31566_a_4.webm?log_var=72|491100001-1|-" >webm</a><br>'; case * 'http://de.clip-1.filmtrailer.com/9507_31566_a_5.flv?log_var=72|491100001 -1|-' : return 
'<b>Trailer 1</b><br><i>(xxlarge)</i><br><br>» * 640px<br><br>Download:<br>» <a href= "http://de.clip-1.filmtrailer.com/9507_31566_a_5.wmv?log_var=72|491100001-1|-" >wmv</a><br>» * <a href= "http://de.clip-1.filmtrailer.com/9507_31566_a_5.mp4?log_var=72|491100001-1|-" >mp4</a><br>» <a href= * "http://de.clip-1.filmtrailer.com/9507_31566_a_5.webm?log_var=72|491100001-1|-" >webm</a><br>'; case * 'http://de.clip-1.filmtrailer.com/9507_39003_a_1.flv?log_var=72|491100001 -1|-' : return '<b>Trailer 2</b><br><i>(small)</i><br><br>» * 160px<br><br>Download:<br>» <a href= "http://de.clip-1.filmtrailer.com/9507_39003_a_1.wmv?log_var=72|491100001-1|-" >wmv</a><br>'; case * 'http://de.clip-1.filmtrailer.com/9507_39003_a_2.flv?log_var=72|491100001 -1|-' : return '<b>Trailer 2</b><br><i>(medium)</i><br><br>» * 240px<br><br>Download:<br>» <a href= "http://de.clip-1.filmtrailer.com/9507_39003_a_2.wmv?log_var=72|491100001-1|-" >wmv</a><br>'; case * 'http://de.clip-1.filmtrailer.com/9507_39003_a_3.flv?log_var=72|491100001 -1|-' : return '<b>Trailer 2</b><br><i>(large)</i><br><br>» * 320px<br><br>Download:<br>» <a href= "http://de.clip-1.filmtrailer.com/9507_39003_a_3.wmv?log_var=72|491100001-1|-" >wmv</a><br>» * <a href= "http://de.clip-1.filmtrailer.com/9507_39003_a_3.mp4?log_var=72|491100001-1|-" >mp4</a><br>» <a href= * "http://de.clip-1.filmtrailer.com/9507_39003_a_3.webm?log_var=72|491100001-1|-" >webm</a><br>'; case * 'http://de.clip-1.filmtrailer.com/9507_39003_a_4.flv?log_var=72|491100001 -1|-' : return '<b>Trailer 2</b><br><i>(xlarge)</i><br><br>» * 400px<br><br>Download:<br>» <a href= "http://de.clip-1.filmtrailer.com/9507_39003_a_4.wmv?log_var=72|491100001-1|-" >wmv</a><br>» * <a href= "http://de.clip-1.filmtrailer.com/9507_39003_a_4.mp4?log_var=72|491100001-1|-" >mp4</a><br>» <a href= * "http://de.clip-1.filmtrailer.com/9507_39003_a_4.webm?log_var=72|491100001-1|-" >webm</a><br>'; case * 'http://de.clip-1.filmtrailer.com/9507_39003_a_5.flv?log_var=72|491100001 -1|-' : 
return '<b>Trailer 2</b><br><i>(xxlarge)</i><br><br>» * 640px<br><br>Download:<br>» <a href= "http://de.clip-1.filmtrailer.com/9507_39003_a_5.wmv?log_var=72|491100001-1|-" >wmv</a><br>» * <a href= "http://de.clip-1.filmtrailer.com/9507_39003_a_5.mp4?log_var=72|491100001-1|-" >mp4</a><br>» <a href= * "http://de.clip-1.filmtrailer.com/9507_39003_a_5.webm?log_var=72|491100001-1|-" >webm</a><br>'; } } */ Url url = null; String searchString = BASE_URL + "/view.php?page=suchergebnis&Kat=IMDb&SText=" + options.getImdbId(); try { // search with IMDB url = new Url(searchString); InputStream in = url.getInputStream(); Document doc = Jsoup.parse(in, "UTF-8", ""); in.close(); Elements filme = doc.getElementsByAttributeValueMatching("href", "film\\/\\d+,"); if (filme == null || filme.isEmpty()) { LOGGER.debug("found no search results"); return trailers; } LOGGER.debug("found " + filme.size() + " search results"); // hopefully // only one LOGGER.debug("get (trailer) details page"); url = new Url(BASE_URL + "/" + StrgUtils.substr(filme.first().toString(), "href=\\\"(.*?)\\\"")); in = url.getInputStream(); doc = Jsoup.parse(in, "UTF-8", ""); in.close(); // OLD STYLE // <b>Trailer 1</b><br><i>(xxlarge)</i><br><br>» 640px<br><br>Download:<br>» <a href= // "http://de.clip-1.filmtrailer.com/9507_31566_a_5.wmv?log_var=72|491100001-1|-" >wmv</a><br>» <a href= // "http://de.clip-1.filmtrailer.com/9507_31566_a_5.mp4?log_var=72|491100001-1|-" >mp4</a><br>» <a href= // "http://de.clip-1.filmtrailer.com/9507_31566_a_5.webm?log_var=72|491100001-1|-" >webm</a><br> Pattern regex = Pattern.compile("return '(.*?)';"); Matcher m = regex.matcher(doc.toString()); while (m.find()) { String s = m.group(1); String tname = StrgUtils.substr(s, "<b>(.*?)</b>"); String tpix = StrgUtils.substr(s, "raquo; (.*?)x<br>"); // String tqual = StrgUtils.substr(s, "<i>\\((.*?)\\)</i>"); // url + format Pattern lr = Pattern.compile("<a href=\"(.*?)\">(.*?)</a>"); Matcher lm = lr.matcher(s); while (lm.find()) { String 
turl = lm.group(1); // String tformat = lm.group(2); MediaTrailer trailer = new MediaTrailer(); trailer.setName(tname); // trailer.setQuality(tpix + " (" + tformat + ")"); trailer.setQuality(tpix); trailer.setProvider("filmtrailer"); trailer.setUrl(turl); LOGGER.debug(trailer.toString()); trailers.add(trailer); } } // NEW STYLE (additional!) // <div class="clips" id="clips2" style="display: none;"> // <img src="images/flag_de.gif" align="left" vspace="3" width="18" height="12"> // <img src="images/trailer_6.gif" align="top" vspace="1" width="16" height="16" alt="freigegeben ab 6 Jahren"> // <i>Trailer 1:</i> // <a href="http://de.clip-1.filmtrailer.com/2845_6584_a_1.flv?log_var=67|491100001-1|-"> small </a> // <a href="http://de.clip-1.filmtrailer.com/2845_6584_a_2.flv?log_var=67|491100001-1|-"> medium </a> // <a href="http://de.clip-1.filmtrailer.com/2845_6584_a_3.flv?log_var=67|491100001-1|-"> large </a> // <a href="http://de.clip-1.filmtrailer.com/2845_6584_a_4.flv?log_var=67|491100001-1|-"> xlarge </a> // <a href="http://de.clip-1.filmtrailer.com/2845_6584_a_5.flv?log_var=67|491100001-1|-"> xxlarge </a> // <br> // <img src="images/flag_de.gif" align="left" vspace="3" width="18" height="12"> // <img src="images/trailer_6.gif" align="top" vspace="1" width="16" height="16" alt="freigegeben ab 6 Jahren"> // <i>Trailer 2:</i> // <a href="http://de.clip-1.filmtrailer.com/2845_8244_a_1.flv?log_var=67|491100001-1|-"> small </a> // <a href="http://de.clip-1.filmtrailer.com/2845_8244_a_2.flv?log_var=67|491100001-1|-"> medium </a> // <a href="http://de.clip-1.filmtrailer.com/2845_8244_a_3.flv?log_var=67|491100001-1|-"> large </a> // <a href="http://de.clip-1.filmtrailer.com/2845_8244_a_4.flv?log_var=67|491100001-1|-"> xlarge </a> // <a href="http://de.clip-1.filmtrailer.com/2845_8244_a_5.flv?log_var=67|491100001-1|-"> xxlarge </a> // <br> // <img src="images/flag_de.gif" align="left" vspace="3" width="18" height="12"> // <img src="images/trailer_6.gif" align="top" 
vspace="1" width="16" height="16" alt="freigegeben ab 6 Jahren"> // <i>Trailer 3:</i> // <a href="http://de.clip-1.filmtrailer.com/2845_14749_a_1.flv?log_var=67|491100001-1|-"> small </a> // <a href="http://de.clip-1.filmtrailer.com/2845_14749_a_2.flv?log_var=67|491100001-1|-"> medium </a> // <a href="http://de.clip-1.filmtrailer.com/2845_14749_a_3.flv?log_var=67|491100001-1|-"> large </a> // <a href="http://de.clip-1.filmtrailer.com/2845_14749_a_4.flv?log_var=67|491100001-1|-"> xlarge </a> // <a href="http://de.clip-1.filmtrailer.com/2845_14749_a_5.flv?log_var=67|491100001-1|-"> xxlarge </a> // <br> // <br> // </div> // new style size // 1 = 160 x 90 = small // 2 = 240 x 136 = medium // 3 = 320 x 180 = large // 4 = 400 x 226 = xlarge // 5 = 640 x 360 = xxlarge ; regex = Pattern.compile("<i>(.*?)</i>(.*?)<br>", Pattern.DOTALL); // get them as single trailer line m = regex.matcher(doc.getElementsByClass("clips").html()); while (m.find()) { // LOGGER.info(doc.getElementsByClass("clips").html()); // parse each line with 5 qualities String tname = m.group(1).trim(); tname = tname.replaceFirst(":$", ""); // replace ending colon String urls = m.group(2); // url + format Pattern lr = Pattern.compile("<a href=\"(.*?)\">(.*?)</a>"); Matcher lm = lr.matcher(urls); while (lm.find()) { String turl = lm.group(1); String tpix = ""; String tformat = lm.group(2).replaceAll(" ", "").trim(); switch (tformat) { case "small": tpix = "90p"; break; case "medium": tpix = "136p"; break; case "large": tpix = "180p"; break; case "xlarge": tpix = "226p"; break; case "xxlarge": tpix = "360p"; break; default: break; } MediaTrailer trailer = new MediaTrailer(); trailer.setName(tname); // trailer.setQuality(tpix + " (" + tformat + ")"); trailer.setQuality(tpix); trailer.setProvider("filmtrailer"); trailer.setUrl(turl); LOGGER.debug(trailer.toString()); trailers.add(trailer); } } } catch (Exception e) { if (url != null) { LOGGER.error("Error parsing {}", url.toString()); } else { 
LOGGER.error("Error parsing {}", searchString); } throw e; } return trailers; }