Example usage for org.jsoup.nodes Document getElementsByTag

List of usage examples for org.jsoup.nodes Document getElementsByTag

Introduction

On this page you can find example usages of the org.jsoup.nodes.Document method getElementsByTag.

Prototype

public Elements getElementsByTag(String tagName) 

Source Link

Document

Finds elements, including and recursively under this element, with the specified tag name.

Usage

From source file:us.colloquy.index.IndexHandler.java

/**
 * Finds every {@code .ncx} file under {@code letterDirectory} (searching at most 6 levels deep),
 * parses each one with jsoup as UTF-8 and adds a {@link DocumentPointer} to {@code uriList} for
 * the navigation entries whose label passes the filters below.
 *
 * <p>For each parsed file the title is the text of the last child seen under any
 * {@code docTitle} element. For every child of every {@code navPoint} element that has
 * non-empty text, a pointer is added when the {@code src} attribute read from the child's
 * {@code content} elements is non-empty and either the label matches the first (stricter)
 * pattern, or the label contains a 1-3 digit number and {@code useOnlyNumber} is {@code true}.
 * Each pointer is built from the {@code .ncx} file's parent directory, {@code File.separator}
 * and the {@code src} value with any {@code #...} suffix removed, together with the title.
 *
 * <p>Errors are printed with {@code printStackTrace()} and not rethrown: an {@link IOException}
 * during the file search leaves {@code results} with whatever was collected so far, and any
 * exception during parsing ends the whole loop.
 *
 * <p>NOTE(review): the {@code '?'} characters inside the two regex literals
 * ({@code "?"} and {@code "[?--?]"}) look like non-ASCII characters lost to an encoding
 * problem. As written, {@code "?"} is not a valid regular expression, so confirm the intended
 * patterns against the original source before relying on this method.
 *
 * @param uriList         set that receives the discovered pointers
 * @param letterDirectory root directory to search for {@code .ncx} files
 * @param useOnlyNumber   when {@code true}, labels that merely contain a 1-3 digit number are
 *                        accepted as well
 */
public void getURIForAllLetters(Set<DocumentPointer> uriList, String letterDirectory, boolean useOnlyNumber) {
    // example directory: /Documents/Tolstoy/diaries

    Path pathToLetters = FileSystems.getDefault().getPath(letterDirectory);

    List<Path> results = new ArrayList<>();

    int maxDepth = 6;

    // collect every path ending in ".ncx"; the stream is closed by try-with-resources
    try (Stream<Path> stream = Files.find(pathToLetters, maxDepth, (path, attr) -> {
        return String.valueOf(path).endsWith(".ncx");
    })) {

        stream.forEach(results::add);

    } catch (IOException e) {
        e.printStackTrace();
    }

    System.out.println("files: " + results.size());

    try {

        for (Path res : results) {
            // src values are joined onto the directory that holds the .ncx file
            Path parent = res.getParent();

            // use jsoup to list all files that contain something useful
            Document doc = Jsoup.parse(res.toFile(), "UTF-8");

            String title = "";

            // the last child visited under any docTitle element wins
            for (Element element : doc.getElementsByTag("docTitle")) {

                for (Element child : element.children()) {
                    title = child.text();
                }
            }

            for (Element element : doc.getElementsByTag("navPoint")) {

                for (Element child : element.children()) {
                    String label = child.text();

                    if (StringUtils.isNotEmpty(label)) {
                        // NOTE(review): "?" is presumably a mis-encoded character; as a regex it is invalid
                        if (label.matches("?")) {
                            System.out.println("------------------");
                        }

                        String url = child.getElementsByTag("content").attr("src");

                        // NOTE(review): the character class "[?--?]" is presumably a mis-encoded letter range
                        if (label.matches(".*\\d{1,3}.*[?--?]+.*") && StringUtils.isNotEmpty(url)) {
                            // drop the "#..." suffix so the pointer refers to the file itself
                            DocumentPointer documentPointer = new DocumentPointer(
                                    parent.toString() + File.separator + url.replaceAll("#.*", ""), title);

                            uriList.add(documentPointer);

                        } else if (label.matches(".*\\d{1,3}.*") && StringUtils.isNotEmpty(url)
                                && useOnlyNumber) {
                            // relaxed rule: a label containing a number is enough when useOnlyNumber is set
                            DocumentPointer documentPointer = new DocumentPointer(
                                    parent.toString() + File.separator + url.replaceAll("#.*", ""), title);

                            uriList.add(documentPointer);

                        } else {
                            // entry does not pass either filter; nothing is added
                        }

                    }
                }
            }

        }
    } catch (Exception e) {
        e.printStackTrace();
    }

}

From source file:us.colloquy.index.IndexHandler.java

/**
 * Finds every {@code .ncx} file under {@code pathToLetters} (searching at most 6 levels deep),
 * parses each one with jsoup as UTF-8 and appends a {@link DocumentPointer} to
 * {@code documentPointers} for the navigation entries selected by the label rules below.
 *
 * <p>Per file, two flags drive the selection while iterating the {@code navPoint} elements in
 * document order: {@code newFile} starts {@code true} and is cleared the first time a label
 * matches the "start" pattern, which also switches {@code startPrinting} on;
 * {@code startPrinting} is switched off by the "end" check. While {@code startPrinting} is on,
 * every entry whose label does not match the exclusion pattern produces a pointer built from
 * the {@code .ncx} file's parent directory, {@code File.separator} and the entry's {@code src}
 * value with any {@code #...} suffix removed, together with the document title.
 *
 * <p>Errors are printed with {@code printStackTrace()} and not rethrown; any exception during
 * parsing ends the whole loop.
 *
 * <p>NOTE(review): the {@code '?'} characters in the string literals below look like non-ASCII
 * characters lost to an encoding problem; confirm the intended text against the original
 * source. Also note that the "end" check calls {@code matches} on the literal with
 * {@code navLabel} as the regular expression, which may be the reverse of what was intended.
 *
 * @param documentPointers list that receives the discovered pointers
 * @param pathToLetters    root directory to search for {@code .ncx} files
 */
public void getURIForAllDiaries(List<DocumentPointer> documentPointers, Path pathToLetters) {
    List<Path> results = new ArrayList<>();

    int maxDepth = 6;

    // collect every path ending in ".ncx"; the stream is closed by try-with-resources
    try (Stream<Path> stream = Files.find(pathToLetters, maxDepth, (path, attr) -> {
        return String.valueOf(path).endsWith(".ncx");
    })) {

        stream.forEach(results::add);

    } catch (IOException e) {
        e.printStackTrace();
    }

    System.out.println("files: " + results.size());

    try {

        for (Path res : results) {
            // src values are joined onto the directory that holds the .ncx file
            Path parent = res.getParent();

            // use jsoup to list all files that contain something useful
            Document doc = Jsoup.parse(res.toFile(), "UTF-8");

            String title = "";

            // the last child visited under any docTitle element wins
            for (Element element : doc.getElementsByTag("docTitle")) {

                for (Element child : element.children()) {
                    title = child.text();
                }
            }

            // whether entries are currently being collected
            boolean startPrinting = false;

            // true until the first "start" label has been seen in this file
            boolean newFile = true;

            for (Element element : doc.getElementsByTag("navPoint")) {

                // get nav label and content

                Element navLabelElement = element.select("navLabel").first();
                Element srsElement = element.select("content").first();

                String navLabel = "";
                String srs = "";

                if (navLabelElement != null) {
                    // asterisks are stripped from the label before it is compared
                    navLabel = navLabelElement.text().replaceAll("\\*", "").trim();
                }

                if (srsElement != null) {
                    srs = srsElement.attr("src");
                }

                // "end" check; NOTE(review): navLabel is used as the regex here, not as the input
                if ("??".matches(navLabel))

                {
                    startPrinting = false;

                }

                // "start" check: only the first matching label in a file turns collection on
                if (StringUtils.isNotEmpty(navLabel)
                        && navLabel.matches("??.*|?? ?.*") && newFile) {
                    newFile = false;
                    startPrinting = true;
                }

                // the exclusion pattern also matches the empty label (first alternative is empty)
                if (startPrinting && !navLabel
                        .matches("(|??? ??)")) {

                    // srs stays "" when the entry has no content element; the pointer is still added
                    DocumentPointer documentPointer = new DocumentPointer(
                            parent.toString() + File.separator + srs.replaceAll("#.*", ""), title);

                    documentPointers.add(documentPointer);
                }

            }

        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    System.out.println("Size: " + documentPointers.size());

}

From source file:com.zacwolf.commons.email.Email.java

/**
 * Embeds every {@code cid:} image referenced by {@code doc}.
 *
 * <p>For each {@code <img>} whose {@code src} starts with {@code cid:} the matching attachment
 * is looked up, its description is copied into the {@code alt} attribute and
 * {@code display:block;} is appended to the inline style when no {@code display:} rule is
 * present. The image is then either added to {@code htmlmultipart} as a body part or, when
 * {@code htmlmultipart} is {@code null}, inlined through {@code dataurlEncode}.
 *
 * <p>When the cid contains {@code _banner} and the {@code #banner} element's style contains
 * {@code -radius}, the banner is replaced by a rounded version and {@code #contenttable} gets
 * the banner width. If, in addition, exactly one {@code #footer} element exists and its style
 * contains {@code -radius}, a rounded footer image is generated, the footer's parent is
 * rewritten to show it, and that image is embedded as well. The footer image is only embedded
 * when it was actually generated.
 *
 * @param doc           the HTML document whose images are processed (modified in place)
 * @param htmlmultipart the multipart receiving image parts, or {@code null} to inline images
 * @throws NullPointerException if anything goes wrong while embedding an image (including a
 *                              cid with no registered attachment); the original exception is
 *                              attached as the cause
 */
private void prepareImgs(final org.jsoup.nodes.Document doc, final Multipart htmlmultipart) {
    final Map<String, EmailAttachment> attachments = getAttachments();
    final org.jsoup.select.Elements imgs = doc.getElementsByTag("img");
    for (org.jsoup.nodes.Element img : imgs) {
        final String src = img.attr("src");
        if (!src.startsWith("cid:")) {
            continue; // only cid: references are embedded
        }
        final String cid = src.substring(4);
        try {
            EmailAttachment attachment = attachments.get(cid);
            if (attachment == null) {
                // fail with a message that names the culprit instead of an anonymous NPE
                throw new IllegalStateException("No attachment registered for cid \"" + cid + "\"");
            }
            img.attr("alt", attachment.getDescription());

            // all inline images need display:block; added for GMail compatibility
            final String imgStyle = img.attr("style");
            if (!imgStyle.contains("display:")) {
                img.attr("style", imgStyle + (!imgStyle.endsWith(";") ? ";" : "") + "display:block;");
            }

            if (cid.toLowerCase().contains("_banner")
                    && doc.select("#banner").attr("style").contains("-radius")) {
                final BufferedImage image = makeRoundedBanner(
                        ImageIO.read(new ByteArrayInputStream(attachment.data)), 20);
                doc.select("#contenttable").attr("style",
                        "width:" + image.getWidth() + "px;" + doc.select("#contenttable").attr("style"));

                final ByteArrayOutputStream bannerBytes = new ByteArrayOutputStream();
                ImageIO.write(image, EmailAttachment.CONTENT_MIMETYPES.get(attachment.contenttype),
                        bannerBytes);
                attachment = new EmailAttachment(attachment.filename, attachment.contenttype,
                        bannerBytes.toByteArray(), cid, "Rounded banner image");
                if (htmlmultipart == null) {
                    dataurlEncode(img, attachment);
                }

                if (doc.select("#footer").size() == 1
                        && doc.select("#footer").first().attr("style").contains("-radius")) {
                    Color bgcolor = Color.WHITE;
                    Color border = null;
                    String newstyle = "";
                    // border and background colour go into the generated image; everything
                    // else is carried over to the replacement cell
                    final String[] styles = doc.select("#footer").first().attr("style").split(";");
                    for (String style : styles) {
                        if (style.startsWith("border")) {
                            border = getColorFromStyle(style, null);
                        } else if (style.startsWith("background-color:")) {
                            bgcolor = getColorFromStyle(style, Color.WHITE);
                        } else {
                            newstyle += style + ";";
                        }
                    }

                    final ByteArrayOutputStream footerBytes = new ByteArrayOutputStream();
                    ImageIO.write(makeRoundedFooter(image.getWidth(), 20, bgcolor, border), "png",
                            footerBytes);

                    // NOTE(review): the generated <td> carries two style attributes; HTML parsers
                    // generally keep only the first, so newstyle is probably ignored - confirm.
                    doc.select("#footer").first().parent()
                            .html("<td style=\"margin:0px;padding:0px;\" valign=\"top\" style=\"" + newstyle
                                    + "\"><img id=\"footer\" alt=\"rounded footer image\" src=\"cid:"
                                    + getREFID() + "_rounded_footer\" style=\"display:block;\" /></td>");

                    // Embed the footer only when it was generated. Previously this ran for every
                    // rounded banner and re-used the banner bytes as "footer.png".
                    final EmailAttachment footerAttachment = new EmailAttachment("footer.png", "image/png",
                            footerBytes.toByteArray(), getREFID() + "_rounded_footer",
                            "Rounded footer image");
                    if (htmlmultipart == null) {
                        // #footer now resolves to the freshly inserted <img>
                        dataurlEncode(doc.select("#footer").first(), footerAttachment);
                    } else {
                        htmlmultipart.addBodyPart(footerAttachment);
                    }
                }
            } else if (htmlmultipart == null) {
                dataurlEncode(img, attachment);
            }

            if (htmlmultipart != null) {
                htmlmultipart.addBodyPart(attachment);
            }
        } catch (Exception e) {
            // keep the exception type callers may already catch, but do not lose the cause
            final NullPointerException wrapped = new NullPointerException(
                    "Problem with embedding images into content.\nContact the content owner.\n\nERROR:" + e);
            wrapped.initCause(e);
            throw wrapped;
        }
    }
}

From source file:jp.mau.twappremover.MainActivity.java

/**
 * Reloads the list of linked applications.
 *
 * <p>Clears {@code _apps}, requests {@code APP_PAGE} with the session cookies, stores the first
 * {@code authenticity_token} form value found in {@code _auth_token}, turns every element with
 * class {@code app} that has a {@code revoke} element into a {@link LinkedApp}, and finally
 * posts a {@code notifyDataSetChanged()} call to {@code _handler}. Any exception is printed
 * and swallowed.
 */
private void getApps() {
    _apps.clear();

    HttpGet request = new HttpGet(APP_PAGE);
    request.addHeader("User-Agent", USER_AGENT);
    request.addHeader("Cookie", "_twitter_sess=" + _session_id + "; auth_token=" + _cookie_auth);

    try {
        String body = _client.execute(request, new ResponseHandler<String>() {
            @Override
            public String handleResponse(HttpResponse response) throws ClientProtocolException, IOException {
                final int status = response.getStatusLine().getStatusCode();
                if (status == HttpStatus.SC_OK) {
                    return EntityUtils.toString(response.getEntity(), "UTF-8");
                }
                if (status == HttpStatus.SC_NOT_FOUND) {
                    throw new RuntimeException("not found");
                }
                throw new RuntimeException("error");
            }
        });

        Document page = Jsoup.parse(body);

        // the first form that carries an authenticity_token input supplies the token
        for (Element form : page.getElementsByTag("form")) {
            Elements tokenInputs = form.getElementsByAttributeValue("name", "authenticity_token");
            if (tokenInputs.size() > 0) {
                _auth_token = tokenInputs.get(0).attr("value");
                break;
            }
        }

        for (Element entry : page.getElementsByClass("app")) {
            LinkedApp app = new LinkedApp();

            Elements names = entry.getElementsByTag("strong");
            if (names.size() > 0) {
                app.name = names.get(0).text();
            }

            Elements creators = entry.getElementsByClass("creator");
            if (creators.size() > 0) {
                app.creator = creators.get(0).text();
            }

            Elements descriptions = entry.getElementsByClass("description");
            if (descriptions.size() > 0) {
                app.desc = descriptions.get(0).text();
            }

            Elements images = entry.getElementsByClass("app-img");
            if (images.size() > 0) {
                app.imgUrl = images.get(0).attr("src");
            }

            Elements revokes = entry.getElementsByClass("revoke");
            if (revokes.size() == 0) {
                // no revoke id on this entry (the original comment, garbled in this copy,
                // mentions facebook) - such entries are not listed
                continue;
            }
            String revokeElementId = revokes.get(0).attr("id");
            app.revokeId = revokeElementId.replaceAll(KEY_HEADER_REVOKE, "");

            _apps.add(app);
        }

        // the adapter is notified through the handler rather than directly
        _handler.post(new Runnable() {
            @Override
            public void run() {
                _appadapter.notifyDataSetChanged();
            }
        });
    } catch (Exception ex) {
        ex.printStackTrace();
    }
}

From source file:ie.nuim.cs.dri.metadata.WebSearch.java

/**
 * Looks for {@code title} among the result headings of the page returned by {@code getGS()}
 * and, when a heading link matches it (ignoring case), prints every {@code gs_a} element and
 * then every {@code gs_fl} element found under the page's {@code div} elements.
 *
 * <p>All output goes to {@code System.out}; nothing is returned or stored.
 *
 * @param title the title to search for
 */
public void searchGoogle(String title) {
    // NOTE(review): the query URL is assembled but never used; the page comes from getGS()
    String searchTitle = buildGoogleSearchTitle(title);
    String url = "http://scholar.google.com/scholar?" + searchTitle;

    Document doc = Jsoup.parse(getGS());
    Elements headings = doc.getElementsByTag("h3");
    System.out.println("=====searching google=======");

    boolean found = false;
    for (Element heading : headings) {
        for (Element anchor : heading.getElementsByTag("a")) {
            String anchorText = anchor.text();
            System.out.println(anchorText + "\t" + title);

            if (title.equalsIgnoreCase(anchorText)) {
                found = true;
                break; // leaves only the inner loop; later headings are still printed
            }
        }
    }

    if (!found) {
        return;
    }

    // first pass prints the gs_a elements, second pass the gs_fl elements
    Elements divs = doc.getElementsByTag("div");
    for (String cssClass : new String[] { "gs_a", "gs_fl" }) {
        for (Element div : divs) {
            for (Element hit : div.getElementsByClass(cssClass)) {
                System.out.println(hit);
            }
        }
    }
}

From source file:com.salsaberries.narchiver.Trawler.java

/**
 * Extracts links from html, and returns a set of Pages with their parent
 * page already defined./*w ww  .j  a  v  a2 s.com*/
 *
 * @param html
 * @return A list of pages to follow.
 */
private ArrayList<Page> extractPages(Page extractPage) {

    String html = extractPage.getHtml();

    ArrayList<Page> pages = new ArrayList<>();

    // Are we at a stop at page?
    for (String e : stopAt) {
        if (extractPage.getTagURL().contains(e)) {
            return pages;
        }
    }

    // Parse the html
    Document doc = Jsoup.parse(html);
    Elements links = doc.getElementsByTag("a");

    for (Element link : links) {

        String tagURL = "";
        String linkText = "";
        boolean alreadyFollowed;
        boolean validURL = false;

        // First format the link
        if (link.attr("href").startsWith(baseURL)) {
            tagURL = link.attr("href").replace(baseURL, "");
            linkText = link.html();
            validURL = true;
        } else if (link.attr("href").startsWith("/")) {
            tagURL = link.attr("href");
            linkText = link.html();
            validURL = true;
        } else if (link.attr("href").startsWith("./")) {
            tagURL = link.attr("href").substring(1);
            linkText = link.html();
            validURL = true;
        }

        //else if (!link.attr("href").startsWith("/") && !link.attr("href").startsWith("http")) {
        //    tagURL = "/" + link.attr("href");
        //    linkText = link.html();
        //    validURL = true;
        //}
        // Has it already been followed?
        alreadyFollowed = trawledPages.contains(tagURL);

        // Does it violate the exclusion rules?
        boolean excluded = false;
        for (String e : exclude) {
            if (tagURL.contains(e)) {
                excluded = true;
            }
        }

        // Does it violate the exclusion equal rule?
        for (String e : excludeIfEqual) {
            if (tagURL.equals(e)) {
                excluded = true;
            }
        }

        if (!alreadyFollowed && validURL && !excluded) {
            logger.debug("Creating new page at URL " + tagURL);
            Page page = new Page(tagURL, extractPage, linkText);
            trawledPages.add(tagURL);
            pages.add(page);
        }

        if (alreadyFollowed) {
            logger.debug("Skipping duplicate at URL " + tagURL);
        }
        if (!validURL) {
            logger.debug("Invalid URL at " + link.attr("href"));
        }
        if (excluded) {
            logger.debug("Exclusion at " + link.attr("href"));
        }
    }
    return pages;
}

From source file:com.jimplush.goose.ContentExtractor.java

/**
 * returns a list of nodes we want to search on like paragraphs and tables
 *
 * @return/*  w  w w .  j  ava 2  s .  c o m*/
 */
private ArrayList<Element> getNodesToCheck(Document doc) {
    ArrayList<Element> nodesToCheck = new ArrayList<Element>();

    nodesToCheck.addAll(doc.getElementsByTag("p"));
    nodesToCheck.addAll(doc.getElementsByTag("pre"));
    nodesToCheck.addAll(doc.getElementsByTag("td"));
    return nodesToCheck;

}

From source file:com.fluidops.iwb.provider.HTMLProvider.java

/**
 * Fetches the page at {@code config.url}, writes a plain-text dump of its media, imports,
 * links and table rows to {@code HTMLdata.txt}, and converts the table rows into statements.
 *
 * <p>Rows are all {@code tr} elements under any {@code tbody}. The first three and the last
 * two rows are skipped. The row at index 3 is treated as the header: its first four cells
 * become the type URI and the three predicate URIs. Every later row yields one type
 * statement, one label statement and three literal statements, all about the subject derived
 * from the row's first cell.
 *
 * <p>The output file is closed even when processing fails part-way through.
 *
 * @param res list that receives the generated statements
 * @throws Exception if the page cannot be fetched, the file cannot be opened, or a row has
 *                   fewer than four cells
 */
@Override
public void gather(List<Statement> res) throws Exception {

    String url = config.url;
    Document doc = Jsoup.connect(url).get();
    Elements links = doc.select("a[href]");
    Elements media = doc.select("[src]");
    Elements imports = doc.select("link[href]");
    Elements article = doc.getElementsByTag("tbody").select("tr");
    Elements tableElem;
    URI nameURI = null;
    URI roadsURI = null;
    URI sideURI = null;
    URI totalURI = null;

    File file = new File("HTMLdata.txt");

    // try-with-resources: the writer used to leak whenever anything below threw
    try (PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(file)))) {

        out.println("Media");
        print("\nMedia: (%d)", media.size());
        for (Element el : media) {
            if (el.tagName().equals("img")) {
                print(" * %s: <%s> %sx%s (%s)", el.tagName(), el.attr("abs:src"), el.attr("width"),
                        el.attr("height"), trim(el.attr("alt"), 20));
                out.printf(" \n * %s: <%s> %sx%s (%s)", el.tagName(), el.attr("abs:src"), el.attr("width"),
                        el.attr("height"), trim(el.attr("alt"), 20));
                out.println();
            } else {
                print(" * %s: <%s>", el.tagName(), el.attr("abs:src"));
                out.printf(" \n * %s: <%s>", el.tagName(), el.attr("abs:src"));
                out.println();
            }
        }

        out.println("Imports");
        print("\nImports: (%d)", imports.size());
        for (Element link : imports) {
            print(" * %s <%s> (%s)", link.tagName(), link.attr("abs:href"), link.attr("rel"));
            out.printf(" * %s <%s> (%s)", link.tagName(), link.attr("abs:href"), link.attr("rel"));
            out.println();
        }

        out.println("Links");
        print("\nLinks: (%d)", links.size());
        for (Element link : links) {
            // the console shows a trimmed link text, the file gets the full text
            print(" * a: <%s> (%s)", link.attr("abs:href"), trim(link.text(), 35));
            out.printf(" * a: <%s> (%s)", link.attr("abs:href"), link.text());
            out.println();
        }

        out.println("Article");
        print("\nArticle: (%d)", article.size());

        // rows 0-2 and the last two rows are skipped; row 3 is the header row
        for (int i = 3; i < article.size() - 2; i++) {
            tableElem = article.get(i).select("td");
            out.println();

            if (i == 3) {
                nameURI = ProviderUtils.objectToUri(tableElem.get(0).text());
                roadsURI = ProviderUtils.objectToUri(tableElem.get(1).text());
                sideURI = ProviderUtils.objectToUri(tableElem.get(2).text());
                totalURI = ProviderUtils.objectToUri(tableElem.get(3).text());

            } else {
                // the first cell names the subject of every statement for this row
                final String name = tableElem.get(0).text();

                res.add(ProviderUtils.createStatement(ProviderUtils.objectToUri(name), RDF.TYPE, nameURI));
                res.add(ProviderUtils.createLiteralStatement(ProviderUtils.objectToUri(name), RDFS.LABEL,
                        name));
                res.add(ProviderUtils.createLiteralStatement(ProviderUtils.objectToUri(name), roadsURI,
                        tableElem.get(1).text()));
                res.add(ProviderUtils.createLiteralStatement(ProviderUtils.objectToUri(name), sideURI,
                        tableElem.get(2).text()));
                res.add(ProviderUtils.createLiteralStatement(ProviderUtils.objectToUri(name), totalURI,
                        tableElem.get(3).text()));

                for (Element el : tableElem) {
                    out.printf("\n * (%s): (%s)", el.tagName(), el.text());
                    out.println();
                }
            }
            out.println();
            out.printf("\n * a (%s) (%d): (%s)", article.get(i).tagName(), tableElem.size(),
                    article.get(i).text());
            out.println();
        }
    }
}

From source file:com.jimplush.goose.ContentExtractor.java

/**
 * attemps to grab titles from the html pages, lots of sites use different delimiters
 * for titles so we'll try and do our best guess.
 *
 *
 * @param doc/*from  ww w  .j a v  a2  s .  c om*/
 * @return
 */
private String getTitle(Document doc) {
    String title = string.empty;

    try {

        Elements titleElem = doc.getElementsByTag("title");
        if (titleElem == null || titleElem.isEmpty())
            return string.empty;

        String titleText = titleElem.first().text();

        if (string.isNullOrEmpty(titleText))
            return string.empty;

        boolean usedDelimeter = false;

        if (titleText.contains("|")) {
            titleText = doTitleSplits(titleText, PIPE_SPLITTER);
            usedDelimeter = true;
        }

        if (!usedDelimeter && titleText.contains("-")) {
            titleText = doTitleSplits(titleText, DASH_SPLITTER);
            usedDelimeter = true;
        }
        if (!usedDelimeter && titleText.contains("")) {
            titleText = doTitleSplits(titleText, ARROWS_SPLITTER);
            usedDelimeter = true;
        }

        if (!usedDelimeter && titleText.contains(":")) {
            titleText = doTitleSplits(titleText, COLON_SPLITTER);
        }

        // encode unicode charz
        title = StringEscapeUtils.escapeHtml(titleText);

        // todo this is a hack until I can fix this.. weird motely crue error with
        // http://money.cnn.com/2010/10/25/news/companies/motley_crue_bp.fortune/index.htm?section=money_latest
        title = MOTLEY_REPLACEMENT.replaceAll(title);

        if (logger.isDebugEnabled()) {
            logger.debug("Page title is: " + title);
        }

    } catch (NullPointerException e) {
        logger.error(e.toString());
    }
    return title;

}

From source file:de.ipbhalle.metfusion.main.SubstructureSearch.java

/**
 * Runs an asynchronous ChemSpider substructure search and collects the matching compounds.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Submit the search and poll its status every 2 seconds while it is still processing.</li>
 *   <li>Fetch the result ids via SOAP; if that call fails with a {@code RemoteException},
 *       fall back to the plain HTTP endpoint.</li>
 *   <li>Fetch the extended compound info for the ids (in batches of 1000, pausing 5 seconds
 *       after each full batch) and store it in the {@code chemspiderInfo} field.</li>
 *   <li>Parse every compound's SMILES; compounds that parse are also written to
 *       {@code <batch file name>_original.sdf} next to the batch file.</li>
 * </ol>
 *
 * @param substrucPresent the structure query handed to {@code SubstructureSearchOptions}
 *                        (a SMILES string according to the original comments - confirm with caller)
 * @return one {@code ResultSubstructure} per retrieved compound, flagged as used when its SMILES
 *         could be parsed; an empty list when the search fails, is interrupted, or yields no ids
 */
private List<ResultSubstructure> queryDatabase(String substrucPresent) {
    List<ResultSubstructure> candidates = new ArrayList<ResultSubstructure>();

    SmilesParser sp = new SmilesParser(DefaultChemObjectBuilder.getInstance());

    MassSpecAPISoapProxy chemSpiderProxy = new MassSpecAPISoapProxy();
    SearchSoapProxy ssp = new SearchSoapProxy();
    SubstructureSearchOptions sso = new SubstructureSearchOptions(substrucPresent, false);
    CommonSearchOptions cso = new CommonSearchOptions(EComplexity.Single, EIsotopic.NotLabeled, false, false);

    String transactionID = "";
    ERequestStatus ers = null;

    try {
        transactionID = ssp.substructureSearch(sso, cso, token);
        System.out.println("transaction id -> " + transactionID);
        ers = ssp.getAsyncSearchStatus(transactionID, token);
        // Constant-first comparison: a null status from the service ends the polling
        // instead of raising a NullPointerException.
        while (ERequestStatus.Processing.equals(ers)) {
            Thread.sleep(2000);
            ers = ssp.getAsyncSearchStatus(transactionID, token);
        }
    } catch (RemoteException e1) {
        e1.printStackTrace();
        return candidates;
    } catch (InterruptedException e) {
        // Restore the interrupt flag so callers can still observe the interruption.
        Thread.currentThread().interrupt();
        e.printStackTrace();
        return candidates;
    }

    if (ERequestStatus.Failed.equals(ers)) {
        System.out.println("failed");
        return candidates;
    }

    if (!ERequestStatus.ResultReady.equals(ers)) {
        return candidates;
    }

    int[] CSIDs = null;
    System.out.println("woohoo");
    try {
        CSIDs = ssp.getAsyncSearchResult(transactionID, token);
    } catch (RemoteException e) {
        System.err.println("Error retrieving information and parsing results.");
        CSIDs = fetchSearchResultOverHttp(transactionID);
    }

    if (CSIDs == null || CSIDs.length == 0)
        return candidates;

    System.out.println("#CSIDs -> " + CSIDs.length);
    int arrLength = CSIDs.length;
    int splitLength = 1000;
    int numSplits = arrLength / splitLength;
    int remaining = arrLength % splitLength;
    if (numSplits == 0) {
        // Fewer ids than one batch: a single request is enough.
        try {
            chemspiderInfo = chemSpiderProxy.getExtendedCompoundInfoArray(CSIDs, token);
        } catch (RemoteException e) {
            System.err.println("Error retrieving information and parsing results.");
            return candidates;
        }
    } else {
        int pos = 0;
        List<ExtendedCompoundInfo> eci = new ArrayList<ExtendedCompoundInfo>();
        for (int i = 0; i < numSplits; i++) {
            System.out.println("split [" + i + "] from " + numSplits);
            int[] temp = Arrays.copyOfRange(CSIDs, pos, pos + splitLength);
            // Advance up front; the position moves on whether or not this batch succeeds.
            pos = pos + splitLength;
            ExtendedCompoundInfo[] part;
            try {
                part = chemSpiderProxy.getExtendedCompoundInfoArray(temp, token);
            } catch (RemoteException e1) {
                // A failed full batch is skipped; the remaining batches are still attempted.
                System.err
                        .println("Error retrieving information and parsing results for split [" + i + "].");
                continue;
            }
            eci.addAll(Arrays.asList(part));
            try {
                Thread.sleep(5000);
            } catch (InterruptedException e) {
                // Restore the interrupt flag rather than silently discarding it.
                Thread.currentThread().interrupt();
                System.err.println("Error while thread sleep!");
            }
        }
        // add remaining stuff
        if (remaining > 0) {
            int[] temp = Arrays.copyOfRange(CSIDs, pos, pos + remaining);
            ExtendedCompoundInfo[] part;
            try {
                part = chemSpiderProxy.getExtendedCompoundInfoArray(temp, token);
            } catch (RemoteException e) {
                System.err.println("Error retrieving information and parsing results.");
                return candidates;
            }
            eci.addAll(Arrays.asList(part));
        }

        // copy list into array
        chemspiderInfo = eci.toArray(new ExtendedCompoundInfo[eci.size()]);
    }

    String filename = batchFileHandler.getBatchFile().getName();
    int idx = filename.lastIndexOf(".");
    String ending = ".sdf";
    // A batch file name without an extension is used as-is; substring(0, -1) would throw.
    String baseName = idx >= 0 ? filename.substring(0, idx) : filename;
    filename = baseName + "_original" + ending;
    File originalSDF = new File(batchFileHandler.getBatchFile().getParent(), filename);
    // writer stays null when the output file cannot be opened; SDF output is then skipped.
    MDLV2000Writer writer = null;
    try {
        writer = new MDLV2000Writer(new FileOutputStream(originalSDF));
    } catch (FileNotFoundException e1) {
        System.err
                .println("File [" + originalSDF.getAbsolutePath() + "] not found for original SDF writer!");
    }

    try {
        System.out.println("# matches -> " + chemspiderInfo.length);
        for (int i = 0; i < chemspiderInfo.length; i++) {
            ExtendedCompoundInfo info = chemspiderInfo[i];
            System.out.println(info.getCSID() + "\t" + info.getSMILES());
            IAtomContainer ac = null;
            boolean used = false;
            try {
                // TODO check for kekule on new CDK SmilesParser to retain all candidates
                ac = sp.parseSmiles(info.getSMILES());
                used = true;
            } catch (InvalidSmilesException ise) {
                System.err.println("skipping " + info.getCSID());
            }

            // Unparseable compounds are still reported, with a null container and used == false.
            candidates.add(new ResultSubstructure(info, ac, used));
            if (used && writer != null) {
                try {
                    Map<Object, Object> props = ac.getProperties();
                    props.put("CSID", info.getCSID());
                    props.put("SMILES", info.getSMILES());
                    props.put("name", info.getCommonName());
                    props.put("ALogP", info.getALogP());
                    props.put("XLogP", info.getXLogP());
                    props.put("InChI", info.getInChI());
                    props.put("InChIKey", info.getInChIKey());
                    props.put("MF", info.getMF());

                    ac.setProperties(props);
                    writer.write(ac);
                } catch (CDKException e) {
                    System.err.println("Error writing " + info.getCSID() + " to file ["
                            + originalSDF.getAbsolutePath() + "]!");
                }
            }
        }
    } finally {
        // Close only a writer that was actually opened, and do so even if the loop fails.
        if (writer != null) {
            try {
                writer.close();
            } catch (IOException e) {
                System.err.println("Error finalizing original SDF output file!");
            }
        }
    }

    return candidates;
}

/**
 * Fallback for a failed SOAP result call: downloads the result document from the ChemSpider
 * HTTP endpoint and parses the text of every {@code int} element as a compound id.
 *
 * @param transactionID the id of the finished asynchronous search
 * @return the parsed ids, or {@code null} when the URL is malformed, the download fails,
 *         or an id is not a valid integer
 */
private int[] fetchSearchResultOverHttp(String transactionID) {
    String resultURL = "http://www.chemspider.com/Search.asmx/GetAsyncSearchResult?rid=%s&token=%s";
    String format = String.format(resultURL, transactionID, token);
    try {
        URL u = new URL(format);
        URLConnection con = u.openConnection();
        InputStream is = con.getInputStream();
        String ids;
        try {
            ids = IOUtils.toString(is);
        } finally {
            // Release the stream even when reading fails.
            is.close();
        }

        Document doc = Jsoup.parse(ids);
        Elements elem = doc.getElementsByTag("int");
        // Parse into a local array so a bad entry never leaves a half-filled result behind.
        int[] parsed = new int[elem.size()];
        for (int i = 0; i < parsed.length; i++) {
            parsed[i] = Integer.parseInt(elem.get(i).text().trim());
        }
        return parsed;
    } catch (MalformedURLException e1) {
        System.err.println("Wrong URL for retrieving results!\n" + format);
    } catch (IOException e1) {
        System.err.println("Error parsing results!");
    } catch (NumberFormatException e1) {
        System.err.println("Error parsing results!");
    }
    return null;
}