List of usage examples for org.jsoup.nodes.Document.getElementsByTag
public Elements getElementsByTag(String tagName)
From source file:us.colloquy.index.IndexHandler.java
public void getURIForAllLetters(Set<DocumentPointer> uriList, String letterDirectory, boolean useOnlyNumber) { ///Documents/Tolstoy/diaries Path pathToLetters = FileSystems.getDefault().getPath(letterDirectory); List<Path> results = new ArrayList<>(); int maxDepth = 6; try (Stream<Path> stream = Files.find(pathToLetters, maxDepth, (path, attr) -> { return String.valueOf(path).endsWith(".ncx"); })) {/*from w w w. j a v a 2 s. c o m*/ stream.forEach(results::add); // String joined = stream // .sorted() // .map(String::valueOf) // .collect(Collectors.joining("; ")); // // System.out.println("\nFound: " + joined); } catch (IOException e) { e.printStackTrace(); } System.out.println("files: " + results.size()); try { for (Path res : results) { Path parent = res.getParent(); // System.out.println("---------------------------------------------"); // System.out.println(parent.toString()); //use jsoup to list all files that contain something useful Document doc = Jsoup.parse(res.toFile(), "UTF-8"); String title = ""; for (Element element : doc.getElementsByTag("docTitle")) { //Letter letter = new Letter(); // StringBuilder content = new StringBuilder(); for (Element child : element.children()) { title = child.text(); // System.out.println("Title: " + title); } } for (Element element : doc.getElementsByTag("navPoint")) { //Letter letter = new Letter(); // StringBuilder content = new StringBuilder(); for (Element child : element.children()) { String label = child.text(); if (StringUtils.isNotEmpty(label)) { if (label.matches("?")) { System.out.println("------------------"); } String url = child.getElementsByTag("content").attr("src"); if (label.matches(".*\\d{1,3}.*[?--?]+.*") && StringUtils.isNotEmpty(url)) { DocumentPointer documentPointer = new DocumentPointer( parent.toString() + File.separator + url.replaceAll("#.*", ""), title); uriList.add(documentPointer); // System.out.println("nav point: " + label + " src " + parent.toString() // + System.lineSeparator() + 
url.replaceAll("#.*","")); } else if (label.matches(".*\\d{1,3}.*") && StringUtils.isNotEmpty(url) && useOnlyNumber) { DocumentPointer documentPointer = new DocumentPointer( parent.toString() + File.separator + url.replaceAll("#.*", ""), title); uriList.add(documentPointer); // System.out.println("nav point: " + label + " src " + parent.toString() // + System.lineSeparator() + url.replaceAll("#.*","")); } else { // System.out.println("nav point: " + label + " src " + child.getElementsByTag("content").attr("src")); } } } } } } catch (Exception e) { e.printStackTrace(); } // System.out.println("Size: " + uriList.size()); // for (DocumentPointer pointer : uriList) // { // //parse and // System.out.println(pointer.getSourse() + "\t" + pointer.getUri()); // } }
From source file:us.colloquy.index.IndexHandler.java
public void getURIForAllDiaries(List<DocumentPointer> documentPointers, Path pathToLetters) { List<Path> results = new ArrayList<>(); int maxDepth = 6; try (Stream<Path> stream = Files.find(pathToLetters, maxDepth, (path, attr) -> { return String.valueOf(path).endsWith(".ncx"); })) {/*from ww w. j a v a2 s .c o m*/ stream.forEach(results::add); } catch (IOException e) { e.printStackTrace(); } System.out.println("files: " + results.size()); try { for (Path res : results) { Path parent = res.getParent(); // System.out.println("---------------------------------------------"); // System.out.println(parent.toString()); //use jsoup to list all files that contain something useful Document doc = Jsoup.parse(res.toFile(), "UTF-8"); String title = ""; for (Element element : doc.getElementsByTag("docTitle")) { //Letter letter = new Letter(); // StringBuilder content = new StringBuilder(); for (Element child : element.children()) { title = child.text(); // System.out.println("Title: " + title); } } // System.out.println("========================== " + res.toString() + " =========================="); boolean startPrinting = false; boolean newFile = true; for (Element element : doc.getElementsByTag("navPoint")) { //get nav label and content Element navLabelElement = element.select("navLabel").first(); Element srsElement = element.select("content").first(); String navLabel = ""; String srs = ""; if (navLabelElement != null) { navLabel = navLabelElement.text().replaceAll("\\*", "").trim(); } if (srsElement != null) { srs = srsElement.attr("src"); } if ("??".matches(navLabel)) { startPrinting = false; // System.out.println("----------------- end of file pointer ---------------"); } if (StringUtils.isNotEmpty(navLabel) && navLabel.matches("??.*|?? ?.*") && newFile) { newFile = false; startPrinting = true; } if (startPrinting && !navLabel .matches("(|??? 
??)")) { // System.out.println("----------------- file pointer ---------------"); // System.out.println(navLabel + "\t" + srs); DocumentPointer documentPointer = new DocumentPointer( parent.toString() + File.separator + srs.replaceAll("#.*", ""), title); documentPointers.add(documentPointer); } } // System.out.println("========================== END OF FILE =========================="); } } catch (Exception e) { e.printStackTrace(); } System.out.println("Size: " + documentPointers.size()); // for (DocumentPointer pointer : documentPointers) // { //parse and // System.out.println(pointer.getSourse() + "\t" + pointer.getUri()); }
From source file:com.zacwolf.commons.email.Email.java
private void prepareImgs(final org.jsoup.nodes.Document doc, final Multipart htmlmultipart) { final Map<String, EmailAttachment> attachments = getAttachments(); final org.jsoup.select.Elements imgs = doc.getElementsByTag("img"); for (org.jsoup.nodes.Element img : imgs) { final String src = img.attr("src"); final String cid = !src.startsWith("cid:") ? null : src.substring(4); try {/*from w ww . ja v a 2 s .c om*/ EmailAttachment attachment; ByteArrayOutputStream baos; if (cid != null) { attachment = attachments.get(cid); img.attr("alt", attachment.getDescription()); if (!img.attr("style").contains("display:"))//all inline images need the display:block; added for GMail compatability img.attr("style", img.attr("style") + (!img.attr("style").endsWith(";") ? ";" : "") + "display:block;"); if (cid.toLowerCase().contains("_banner") && doc.select("#banner").attr("style").contains("-radius")) { BufferedImage image = makeRoundedBanner( ImageIO.read(new ByteArrayInputStream(attachment.data)), 20); doc.select("#contenttable").attr("style", "width:" + image.getWidth() + "px;" + doc.select("#contenttable").attr("style")); baos = new ByteArrayOutputStream(); try { ImageIO.write(image, EmailAttachment.CONTENT_MIMETYPES.get(attachment.contenttype), baos); } finally { baos.flush(); } attachment = new EmailAttachment(attachment.filename, attachment.contenttype, baos.toByteArray(), cid, "Rounded banner image"); if (htmlmultipart == null) dataurlEncode(img, attachment); if (doc.select("#footer").size() == 1 && doc.select("#footer").first().attr("style").contains("-radius")) { Color bgcolor = Color.WHITE; Color border = null; String newstyle = ""; String[] styles = doc.select("#footer").first().attr("style").split(";"); for (String style : styles) { if (style.startsWith("border")) border = getColorFromStyle(style, null); else if (style.startsWith("background-color:")) bgcolor = getColorFromStyle(style, Color.WHITE); else newstyle += style + ";"; } baos = new ByteArrayOutputStream(); try 
{ ImageIO.write(makeRoundedFooter(image.getWidth(), 20, bgcolor, border), "png", baos); } finally { baos.flush(); } doc.select("#footer").first().parent() .html("<td style=\"margin:0px;padding:0px;\" valign=\"top\" style=\"" + newstyle + "\"><img id=\"footer\" alt=\"rounded footer image\" src=\"cid:" + getREFID() + "_rounded_footer\" style=\"display:block;\" /></td>"); } if (htmlmultipart == null) dataurlEncode(doc.select("#footer").first(), new EmailAttachment("footer.png", "image/png", baos.toByteArray(), getREFID() + "_rounded_footer", "Rounded footer image")); else htmlmultipart.addBodyPart(new EmailAttachment("footer.png", "image/png", baos.toByteArray(), getREFID() + "_rounded_footer", "Rounded footer image")); } else if (htmlmultipart == null) { dataurlEncode(img, attachment); } if (htmlmultipart != null) htmlmultipart.addBodyPart(attachment); } } catch (Exception e) { throw new NullPointerException( "Problem with embedding images into content.\nContact the content owner.\n\nERROR:" + e); } } }
From source file:jp.mau.twappremover.MainActivity.java
private void getApps() { _apps.clear();//from w w w . ja v a 2 s . c o m HttpGet request = new HttpGet(APP_PAGE); request.addHeader("User-Agent", USER_AGENT); request.addHeader("Cookie", "_twitter_sess=" + _session_id + "; auth_token=" + _cookie_auth); try { String result = _client.execute(request, new ResponseHandler<String>() { @Override public String handleResponse(HttpResponse response) throws ClientProtocolException, IOException { switch (response.getStatusLine().getStatusCode()) { case HttpStatus.SC_OK: return EntityUtils.toString(response.getEntity(), "UTF-8"); case HttpStatus.SC_NOT_FOUND: throw new RuntimeException("not found"); default: throw new RuntimeException("error"); } } }); Document doc = null; doc = Jsoup.parse(result); // parse top page and get authenticity token Elements forms = doc.getElementsByTag("form"); for (Element e : forms) { Elements auths = e.getElementsByAttributeValue("name", "authenticity_token"); if (auths.size() > 0) { _auth_token = auths.get(0).attr("value"); break; } } Elements apps = doc.getElementsByClass("app"); for (Element e : apps) { LinkedApp app = new LinkedApp(); if (e.getElementsByTag("strong").size() > 0) app.name = e.getElementsByTag("strong").get(0).text(); if (e.getElementsByClass("creator").size() > 0) app.creator = e.getElementsByClass("creator").get(0).text(); if (e.getElementsByClass("description").size() > 0) app.desc = e.getElementsByClass("description").get(0).text(); if (e.getElementsByClass("app-img").size() > 0) app.imgUrl = e.getElementsByClass("app-img").get(0).attr("src"); if (e.getElementsByClass("revoke").size() > 0) { String tmp = e.getElementsByClass("revoke").get(0).attr("id"); app.revokeId = tmp.replaceAll(KEY_HEADER_REVOKE, ""); } else { // revoke id ????(facebook????????) continue; } _apps.add(app); } _handler.post(new Runnable() { @Override public void run() { _appadapter.notifyDataSetChanged(); } }); } catch (Exception ex) { ex.printStackTrace(); } }
From source file:ie.nuim.cs.dri.metadata.WebSearch.java
/** * * @param title the title of the ROS//from w ww.j a va 2s . c om */ public void searchGoogle(String title) { String searchTitle = buildGoogleSearchTitle(title); boolean found = false; String publication = ""; String publicationType = ""; int citationCount = -1; String url = "http://scholar.google.com/scholar?" + searchTitle; Document doc = Jsoup.parse(getGS()); Elements aElement = doc.getElementsByTag("h3"); System.out.println("=====searching google======="); for (Element e : aElement) { Elements bElement = e.getElementsByTag("a"); for (Element f : bElement) { System.out.println(f.text() + "\t" + title); if (title.equalsIgnoreCase(f.text())) { found = true; break; } } // System.out.println(e); } if (found == true) { Elements pElement = doc.getElementsByTag("div"); for (Element p : pElement) { Elements pubElement = p.getElementsByClass("gs_a"); for (Element pub : pubElement) { System.out.println(pub); } } for (Element p : pElement) { Elements pubElement = p.getElementsByClass("gs_fl"); for (Element pub : pubElement) { System.out.println(pub); } } } }
From source file:com.salsaberries.narchiver.Trawler.java
/** * Extracts links from html, and returns a set of Pages with their parent * page already defined./*w ww .j a v a2 s.com*/ * * @param html * @return A list of pages to follow. */ private ArrayList<Page> extractPages(Page extractPage) { String html = extractPage.getHtml(); ArrayList<Page> pages = new ArrayList<>(); // Are we at a stop at page? for (String e : stopAt) { if (extractPage.getTagURL().contains(e)) { return pages; } } // Parse the html Document doc = Jsoup.parse(html); Elements links = doc.getElementsByTag("a"); for (Element link : links) { String tagURL = ""; String linkText = ""; boolean alreadyFollowed; boolean validURL = false; // First format the link if (link.attr("href").startsWith(baseURL)) { tagURL = link.attr("href").replace(baseURL, ""); linkText = link.html(); validURL = true; } else if (link.attr("href").startsWith("/")) { tagURL = link.attr("href"); linkText = link.html(); validURL = true; } else if (link.attr("href").startsWith("./")) { tagURL = link.attr("href").substring(1); linkText = link.html(); validURL = true; } //else if (!link.attr("href").startsWith("/") && !link.attr("href").startsWith("http")) { // tagURL = "/" + link.attr("href"); // linkText = link.html(); // validURL = true; //} // Has it already been followed? alreadyFollowed = trawledPages.contains(tagURL); // Does it violate the exclusion rules? boolean excluded = false; for (String e : exclude) { if (tagURL.contains(e)) { excluded = true; } } // Does it violate the exclusion equal rule? 
for (String e : excludeIfEqual) { if (tagURL.equals(e)) { excluded = true; } } if (!alreadyFollowed && validURL && !excluded) { logger.debug("Creating new page at URL " + tagURL); Page page = new Page(tagURL, extractPage, linkText); trawledPages.add(tagURL); pages.add(page); } if (alreadyFollowed) { logger.debug("Skipping duplicate at URL " + tagURL); } if (!validURL) { logger.debug("Invalid URL at " + link.attr("href")); } if (excluded) { logger.debug("Exclusion at " + link.attr("href")); } } return pages; }
From source file:com.jimplush.goose.ContentExtractor.java
/** * returns a list of nodes we want to search on like paragraphs and tables * * @return/* w w w . j ava 2 s . c o m*/ */ private ArrayList<Element> getNodesToCheck(Document doc) { ArrayList<Element> nodesToCheck = new ArrayList<Element>(); nodesToCheck.addAll(doc.getElementsByTag("p")); nodesToCheck.addAll(doc.getElementsByTag("pre")); nodesToCheck.addAll(doc.getElementsByTag("td")); return nodesToCheck; }
From source file:com.fluidops.iwb.provider.HTMLProvider.java
@Override public void gather(List<Statement> res) throws Exception { String url = config.url;/* ww w .j a v a2 s .co m*/ Document doc = Jsoup.connect(url).get(); Elements links = doc.select("a[href]"); Elements media = doc.select("[src]"); Elements imports = doc.select("link[href]"); // Elements article = // doc.select("div.wrapper").select("div.box-shadow").select("div#content.cols").select("div.cl").select("div.crm").select("article").select("section.article").select("div.textblock").select("table"); Elements article = doc.getElementsByTag("tbody").select("tr"); Elements tableElem; URI nameURI = null; URI roadsURI = null; URI sideURI = null; URI totalURI = null; File file = new File("HTMLdata.txt"); PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(file))); out.println("Media"); print("\nMedia: (%d)", media.size()); for (Element el : media) { if (el.tagName().equals("img")) { print(" * %s: <%s> %sx%s (%s)", el.tagName(), el.attr("abs:src"), el.attr("width"), el.attr("height"), trim(el.attr("alt"), 20)); out.printf(" \n * %s: <%s> %sx%s (%s)", el.tagName(), el.attr("abs:src"), el.attr("width"), el.attr("height"), trim(el.attr("alt"), 20)); out.println(); } else { print(" * %s: <%s>", el.tagName(), el.attr("abs:src")); out.printf(" \n * %s: <%s>", el.tagName(), el.attr("abs:src")); out.println(); } } out.println("Imports"); print("\nImports: (%d)", imports.size()); for (Element link : imports) { print(" * %s <%s> (%s)", link.tagName(), link.attr("abs:href"), link.attr("rel")); out.printf(" * %s <%s> (%s)", link.tagName(), link.attr("abs:href"), link.attr("rel")); out.println(); } out.println("Links"); print("\nLinks: (%d)", links.size()); for (Element link : links) { print(" * a: <%s> (%s)", link.attr("abs:href"), trim(link.text(), 35)); out.printf(" * a: <%s> (%s)", link.attr("abs:href"), link.text()); out.println(); } /* * out.println("Custom text"); print("\nCustom: (%d)",customArt.size()); * for (Element custom:customArt){ * out.printf(" * a 
(%s): (%s)",custom.tagName(),custom.text()); * out.println(); } */ out.println("Article"); print("\nArticle: (%d)", article.size()); for (int i = 3; i < article.size() - 2; i++) { tableElem = article.get(i).select("td"); out.println(); if (i == 3) { nameURI = ProviderUtils.objectToUri(tableElem.get(0).text()); roadsURI = ProviderUtils.objectToUri(tableElem.get(1).text()); sideURI = ProviderUtils.objectToUri(tableElem.get(2).text()); totalURI = ProviderUtils.objectToUri(tableElem.get(3).text()); } else { res.add(ProviderUtils.createStatement(ProviderUtils.objectToUri(tableElem.get(0).text()), RDF.TYPE, nameURI)); res.add(ProviderUtils.createLiteralStatement(ProviderUtils.objectToUri(tableElem.get(0).text()), RDFS.LABEL, tableElem.get(0).text())); res.add(ProviderUtils.createLiteralStatement(ProviderUtils.objectToUri(tableElem.get(0).text()), roadsURI, tableElem.get(1).text())); res.add(ProviderUtils.createLiteralStatement(ProviderUtils.objectToUri(tableElem.get(0).text()), sideURI, tableElem.get(2).text())); res.add(ProviderUtils.createLiteralStatement(ProviderUtils.objectToUri(tableElem.get(0).text()), totalURI, tableElem.get(3).text())); for (Element el : tableElem) { out.printf("\n * (%s): (%s)", el.tagName(), el.text()); out.println(); } } out.println(); out.printf("\n * a (%s) (%d): (%s)", article.get(i).tagName(), tableElem.size(), article.get(i).text()); out.println(); } out.close(); }
From source file:com.jimplush.goose.ContentExtractor.java
/** * attemps to grab titles from the html pages, lots of sites use different delimiters * for titles so we'll try and do our best guess. * * * @param doc/*from ww w .j a v a2 s . c om*/ * @return */ private String getTitle(Document doc) { String title = string.empty; try { Elements titleElem = doc.getElementsByTag("title"); if (titleElem == null || titleElem.isEmpty()) return string.empty; String titleText = titleElem.first().text(); if (string.isNullOrEmpty(titleText)) return string.empty; boolean usedDelimeter = false; if (titleText.contains("|")) { titleText = doTitleSplits(titleText, PIPE_SPLITTER); usedDelimeter = true; } if (!usedDelimeter && titleText.contains("-")) { titleText = doTitleSplits(titleText, DASH_SPLITTER); usedDelimeter = true; } if (!usedDelimeter && titleText.contains("")) { titleText = doTitleSplits(titleText, ARROWS_SPLITTER); usedDelimeter = true; } if (!usedDelimeter && titleText.contains(":")) { titleText = doTitleSplits(titleText, COLON_SPLITTER); } // encode unicode charz title = StringEscapeUtils.escapeHtml(titleText); // todo this is a hack until I can fix this.. weird motely crue error with // http://money.cnn.com/2010/10/25/news/companies/motley_crue_bp.fortune/index.htm?section=money_latest title = MOTLEY_REPLACEMENT.replaceAll(title); if (logger.isDebugEnabled()) { logger.debug("Page title is: " + title); } } catch (NullPointerException e) { logger.error(e.toString()); } return title; }
From source file:de.ipbhalle.metfusion.main.SubstructureSearch.java
private List<ResultSubstructure> queryDatabase(String substrucPresent) { List<ResultSubstructure> candidates = new ArrayList<ResultSubstructure>(); // convert input SMILES to MOL format for ChemSpider service SmilesParser sp = new SmilesParser(DefaultChemObjectBuilder.getInstance()); // sp.setPreservingAromaticity(false); // String mol = ""; // String s = ""; // try { // IMolecule temp = sp.parseSmiles(substrucPresent); // System.out.println("aromatic Hueckel? -> " + CDKHueckelAromaticityDetector.detectAromaticity(temp)); // System.out.println("aromatic double bond? -> " + DoubleBondAcceptingAromaticityDetector.detectAromaticity(temp)); // // create coordinates // StructureDiagramGenerator sdg = new StructureDiagramGenerator(); // sdg.setMolecule(temp); // sdg.generateCoordinates(); // IMolecule layedOutMol = sdg.getMolecule(); // // // // byte[] b = null; // ByteArrayOutputStream bos = new ByteArrayOutputStream(); // MDLV2000Writer writer = new MDLV2000Writer(bos); // IOSetting[] ios = writer.getIOSettings(); // for (int i = 0; i < ios.length; i++) { // System.out.println(ios[i].getName() + "\t" + ios[i].getSetting()); // }/*from ww w . j a va 2 s . c o m*/ // Properties customSettings = new Properties(); // customSettings.setProperty("ForceWriteAs2DCoordinates", "true"); // customSettings.setProperty("WriteAromaticBondTypes", "true"); // PropertiesListener listener = new PropertiesListener(customSettings); // writer.addChemObjectIOListener(listener); // // writer.write(layedOutMol); // writer.close(); // b = bos.toByteArray(); // mol = new String(b, "UTF-8"); // System.out.println(mol); // MassBankUtilities mbu = new MassBankUtilities(); // IAtomContainer test2 = mbu.getContainer(mol); // //IAtomContainer test2 = mbu.getContainerUnmodified("c1cccc2nnnc12", "/home/mgerlich/projects/metfusion_tp/BTs/"); // System.out.println("aromatic Hueckel? -> " + CDKHueckelAromaticityDetector.detectAromaticity(test2)); // System.out.println("aromatic? 
-> " + DoubleBondAcceptingAromaticityDetector.detectAromaticity(test2)); // SmilesGenerator sg = new SmilesGenerator(true); // s = sg.createSMILES(layedOutMol); // System.out.println("old smiles -> " + substrucPresent); // System.out.println("smiles -> " + s); // } catch (InvalidSmilesException e2) { // // TODO Auto-generated catch block // e2.printStackTrace(); // } catch (CDKException e) { // // TODO Auto-generated catch block // e.printStackTrace(); // } catch (IOException e) { // // TODO Auto-generated catch block // e.printStackTrace(); // } OpenBabelLocator obl = new OpenBabelLocator(); String obmol = ""; // try { // OpenBabelSoap obsoap = obl.getOpenBabelSoap(); // obmol = obsoap.convert(substrucPresent, "smi", "mol"); // System.out.println("obmol\n" + obmol); // } catch (ServiceException e2) { // // TODO Auto-generated catch block // e2.printStackTrace(); // } catch (RemoteException e) { // // TODO Auto-generated catch block // e.printStackTrace(); // } MassSpecAPISoapProxy chemSpiderProxy = new MassSpecAPISoapProxy(); SearchSoapProxy ssp = new SearchSoapProxy(); SubstructureSearchOptions sso = new SubstructureSearchOptions(substrucPresent, false); //sso.setMatchTautomers(false); //sso.setMolecule(substrucPresent); CommonSearchOptions cso = new CommonSearchOptions(EComplexity.Single, EIsotopic.NotLabeled, false, false); //cso.setComplexity(EComplexity.Single); //cso.setIsotopic(EIsotopic.NotLabeled); // NotLabeled when using Formula search // cso.setComplexity(EComplexity.Any); // cso.setIsotopic(EIsotopic.Any); //cso.setHasSpectra(false); //cso.setHasPatents(false); String transactionID = ""; ERequestStatus ers = null; try { transactionID = ssp.substructureSearch(sso, cso, token); System.out.println("transaction id -> " + transactionID); ers = ssp.getAsyncSearchStatus(transactionID, token); while (ers.equals(ERequestStatus.Processing)) { Thread.sleep(2000); ers = ssp.getAsyncSearchStatus(transactionID, token); } } catch (RemoteException e1) { 
e1.printStackTrace(); return candidates; } catch (InterruptedException e) { e.printStackTrace(); return candidates; } if (ers.equals(ERequestStatus.Failed)) { System.out.println("failed"); return candidates; } if (ers.equals(ERequestStatus.ResultReady)) { int[] CSIDs = null; System.out.println("woohoo"); try { CSIDs = ssp.getAsyncSearchResult(transactionID, token); } catch (RemoteException e) { System.err.println("Error retrieving information and parsing results."); String resultURL = "http://www.chemspider.com/Search.asmx/GetAsyncSearchResult?rid=%s&token=%s"; String format = String.format(resultURL, transactionID, token); try { URL u = new URL(format); URLConnection con = u.openConnection(); InputStream is = con.getInputStream(); String ids = IOUtils.toString(is); is.close(); Document doc = Jsoup.parse(ids); Elements elem = doc.getElementsByTag("int"); CSIDs = new int[elem.size()]; for (int i = 0; i < CSIDs.length; i++) { CSIDs[i] = Integer.parseInt(elem.get(i).text().trim()); } } catch (MalformedURLException e1) { System.err.println("Wrong URL for retrieving results!\n" + format); } catch (IOException e1) { System.err.println("Error parsing results!"); } } if (CSIDs == null || CSIDs.length == 0) return candidates; System.out.println("#CSIDs -> " + CSIDs.length); int arrLength = CSIDs.length; int splitLength = 1000; // if(CSIDs.length > splitLength) // CSIDs = Arrays.copyOf(CSIDs, splitLength); int[] temp = new int[1]; int numSplits = arrLength / splitLength; int remaining = arrLength % splitLength; if (numSplits == 0) { try { chemspiderInfo = chemSpiderProxy.getExtendedCompoundInfoArray(CSIDs, token); } catch (RemoteException e) { System.err.println("Error retrieving information and parsing results."); return candidates; } } else { int pos = 0; int current = 0; List<ExtendedCompoundInfo> eci = new ArrayList<ExtendedCompoundInfo>(); for (int i = 0; i < numSplits; i++) { System.out.println("split [" + i + "] from " + numSplits); temp = Arrays.copyOfRange(CSIDs, 
pos, pos + splitLength); ExtendedCompoundInfo[] part; try { part = chemSpiderProxy.getExtendedCompoundInfoArray(temp, token); } catch (RemoteException e1) { System.err .println("Error retrieving information and parsing results for split [" + i + "]."); pos = pos + splitLength; continue; } for (int j = 0; j < part.length; j++) { eci.add(part[j]); //chemspiderInfo[current] = part[j]; current++; } pos = pos + splitLength; try { Thread.sleep(5000); } catch (InterruptedException e) { System.err.println("Error while thread sleep!"); } } // add remaining stuff if (remaining > 0) { temp = Arrays.copyOfRange(CSIDs, pos, pos + remaining); ExtendedCompoundInfo[] part; try { part = chemSpiderProxy.getExtendedCompoundInfoArray(temp, token); } catch (RemoteException e) { System.err.println("Error retrieving information and parsing results."); return candidates; } for (int j = 0; j < part.length; j++) { eci.add(part[j]); //chemspiderInfo[current] = part[j]; current++; } } // copy list into array chemspiderInfo = new ExtendedCompoundInfo[eci.size()]; for (int i = 0; i < chemspiderInfo.length; i++) { chemspiderInfo[i] = eci.get(i); } } // chemspiderInfo = chemSpiderProxy.getExtendedCompoundInfoArray(CSIDs, token); // chemspiderInfo = new ExtendedCompoundInfo[CSIDs.length]; // for (int i = 0; i < chemspiderInfo.length; i++) { // chemspiderInfo[i] = chemSpiderProxy.getExtendedCompoundInfo(CSIDs[i], token); // } boolean writeSDF = true; String filename = batchFileHandler.getBatchFile().getName(); int idx = filename.lastIndexOf("."); String ending = ".sdf"; filename = filename.substring(0, idx) + "_original" + ending; File originalSDF = new File(batchFileHandler.getBatchFile().getParent(), filename); MDLV2000Writer writer = null; try { writer = new MDLV2000Writer(new FileOutputStream(originalSDF)); } catch (FileNotFoundException e1) { System.err .println("File [" + originalSDF.getAbsolutePath() + "] not found for original SDF writer!"); writeSDF = false; } if (writeSDF) { } 
System.out.println("# matches -> " + chemspiderInfo.length); for (int i = 0; i < chemspiderInfo.length; i++) { System.out.println(chemspiderInfo[i].getCSID() + "\t" + chemspiderInfo[i].getSMILES()); IAtomContainer ac = null; boolean used = false; try { // TODO check for kekule on new CDK SmilesParser to retain all candidates ac = sp.parseSmiles(chemspiderInfo[i].getSMILES()); used = true; } catch (InvalidSmilesException ise) { ac = null; used = false; System.err.println("skipping " + chemspiderInfo[i].getCSID()); } candidates.add(new ResultSubstructure(chemspiderInfo[i], ac, used)); if (used && writeSDF) { try { Map<Object, Object> props = ac.getProperties(); props.put("CSID", chemspiderInfo[i].getCSID()); props.put("SMILES", chemspiderInfo[i].getSMILES()); props.put("name", chemspiderInfo[i].getCommonName()); props.put("ALogP", chemspiderInfo[i].getALogP()); props.put("XLogP", chemspiderInfo[i].getXLogP()); props.put("InChI", chemspiderInfo[i].getInChI()); props.put("InChIKey", chemspiderInfo[i].getInChIKey()); props.put("MF", chemspiderInfo[i].getMF()); ac.setProperties(props); writer.write(ac); } catch (CDKException e) { System.err.println("Error writing " + chemspiderInfo[i].getCSID() + " to file [" + originalSDF.getAbsolutePath() + "]!"); } } } try { writer.close(); } catch (IOException e) { System.err.println("Error finalizing original SDF output file!"); } } return candidates; }