Java tutorial
/* * Copyright 2015 Themistoklis Mavridis <themis.mavridis@issel.ee.auth.gr>. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.thesmartweb.swebrank; import java.io.File; import java.io.IOException; import java.nio.charset.IllegalCharsetNameException; import java.util.ArrayList; import java.util.List; import java.util.logging.Level; import java.util.logging.Logger; import org.apache.commons.io.FileUtils; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; /** * Class related to the parsing procedures of HTML files by our mechanism * @author Themis Mavridis */ public class WebParser { /** * The number of embeded videos */ public int number_embeded_videos; /** * The number of embeded videos that are internal to the domain links */ public int number_embeded_videos_internal; /** * The number of scripts */ public int scripts_number; /** * The number of scripts that are internal */ public int scripts_internal; /** * The number of frames */ public int frames_number; /** * The number of internal frames */ public int frames_internal; /** * The number of links */ public int links_number; /** * The number of internal links */ public int links_internal; /** * The number of schema.org usages */ public int nschem; /** * The number of hcards */ public int hcardsn; /** * hcalendars */ public int hcalen; /** * hreviews */ public int hrevn; /** * hevents */ public int hevenn; /** *hadresses */ public int haddrn; /** * hgeo */ public int hgeon; /** *rel tags */ public int hreln; /** *total microformats */ public int total_micron; /** *microformats-1 */ public int micron1; /** *microformats-2 */ public int micron2; /** *microdata */ public int microd; /** *number of foaf */ public int foaf; /** * Get the text content of a url cleaned from stopwords and symbols and lemmatized * @param html_string the url to parse * @return the content in a string */ public String Parse(String html_string) { String content; content = cleanhtml(html_string); if (content != null) { DataManipulation txtpro = new DataManipulation(); Stopwords st = new Stopwords(); content = txtpro.removeChars(content); content = st.stop(content); content = txtpro.removeChars(content); //List<String> contentList = Arrays.asList(content.split(" ")); //StemmerSnow snowballstemmer = new StemmerSnow(); //contentList=snowballstemmer.stem(contentList); //for(String contentListItem : contentList){ // content=content+" "+contentListItem; //} Lemmatizer lemmatizer = new Lemmatizer(); List<String> contentList = lemmatizer.lemmatize(content); content = ""; for (String contentListItem : contentList) { content = content + " " + contentListItem; } } return content; } /** * Parse the url and get all the content * @param link_html the url to parse * @return The content parsed */ public String cleanhtml(String link_html) { try { Document doc = Jsoup.connect(link_html).timeout(10 * 1000).get(); String title = doc.title(); String mainbody = doc.body().text(); Elements links = doc.select("a[href]"); Elements media = doc.select("[src]"); //fix link html to remove https:// or http:// and simple / if (link_html.substring(link_html.length() - 1, link_html.length()).equalsIgnoreCase("/")) { link_html = link_html.substring(0, link_html.length() - 1); } if (link_html.substring(0, 5).equalsIgnoreCase("https")) { link_html = link_html.substring(8); } else if (link_html.substring(0, 4).equalsIgnoreCase("http")) { link_html = link_html.substring(7); } String anchortext = ""; String alttext = ""; //-----get the anchor text of internal links for (Element link : links) { String str_check = link.attr("abs:href").toString(); if (link.attr("abs:href").contains(link_html) && link.text().length() > 1) { anchortext = anchortext + link.text() + " "; } } //-------get alt text to internal images links for (Element medi : media) { if (medi.getElementsByTag("img").attr("src").toString().contains(link_html)) { alttext = alttext + " " + medi.getElementsByTag("img").attr("alt").toString(); } if (medi.getElementsByTag("img").attr("src").toString().startsWith("/")) { alttext = alttext + " " + medi.getElementsByTag("img").attr("alt").toString(); } } String content = mainbody + title + anchortext + alttext; return content; } catch (IOException ex) { Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } catch (NullPointerException ex) { Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } catch (Exception ex) { Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } } /** * Method to get the number of links (total, internal) * @param link_html the url to parse * @return the number of links */ public int[] getnlinks(String link_html) { int[] nlinks = new int[2]; nlinks[0] = 0;//total number of links nlinks[1] = 0;//number of internal links try { Document doc = Jsoup.connect(link_html).timeout(10 * 1000).get(); Elements links = doc.select("a[href]"); nlinks[0] = links.size(); //----we check if a link is internal or not (abs is used to get the whole link (abs stands for abs) for (Element link : links) { if (link.attr("abs:href").contains(link_html)) { nlinks[1]++; } } return nlinks; } catch (Exception ex) { Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex); return nlinks; } } /** * Method to get the various html stats * @param link_html the url to analyze * @return flag if we got all the stats */ public boolean gethtmlstats(String link_html) { try { Document doc = Jsoup.connect(link_html).timeout(10 * 1000).get(); Elements schemas = doc.getElementsByAttributeValueContaining("itemtype", "schema.org"); Elements microdata = doc.getElementsByAttribute("itemtype"); Elements microformats_vcard = doc.getElementsByAttributeValueContaining("class", "vcard"); Elements microformats_hreview = doc.getElementsByAttributeValueContaining("class", "hreview"); Elements microformats_vevent = doc.getElementsByAttributeValueContaining("class", "vevent"); Elements microformats_vcalendar = doc.getElementsByAttributeValueContaining("class", "vcalendar"); Elements microformats_vgeo = doc.getElementsByAttributeValueContaining("class", "geo"); Elements microformats_vadrn = doc.getElementsByAttributeValueContaining("class", "ardn"); Elements microformats_acquaintance = doc.getElementsByAttributeValueContaining("rel", "link_html"); Elements microformats_alternate = doc.getElementsByAttributeValueContaining("rel", "alternate"); Elements microformats_appendix = doc.getElementsByAttributeValueContaining("rel", "appendix"); Elements microformats_bookmark = doc.getElementsByAttributeValueContaining("rel", "bookmark"); Elements microformats_chapter = doc.getElementsByAttributeValueContaining("rel", "chapter"); Elements microformats_child = doc.getElementsByAttributeValueContaining("rel", "child"); Elements microformats_coll = doc.getElementsByAttributeValueContaining("rel", "colleague"); Elements microformats_contact = doc.getElementsByAttributeValueContaining("rel", "contact"); Elements microformats_contents = doc.getElementsByAttributeValueContaining("rel", "contents"); Elements microformats_copyright = doc.getElementsByAttributeValueContaining("rel", "copyright"); Elements microformats_coresident = doc.getElementsByAttributeValueContaining("rel", "co-resident"); Elements microformats_coworker = doc.getElementsByAttributeValueContaining("rel", "co-worker"); Elements microformats_crush = doc.getElementsByAttributeValueContaining("rel", "crush"); Elements microformats_date = doc.getElementsByAttributeValueContaining("rel", "date"); Elements microformats_friend = doc.getElementsByAttributeValueContaining("rel", "friend"); Elements microformats_glossary = doc.getElementsByAttributeValueContaining("rel", "glossary"); Elements microformats_help = doc.getElementsByAttributeValueContaining("rel", "help"); Elements microformats_itsrules = doc.getElementsByAttributeValueContaining("rel", "its-rules"); Elements microformats_kin = doc.getElementsByAttributeValueContaining("rel", "kin"); Elements microformats_license = doc.getElementsByAttributeValueContaining("rel", "license"); Elements microformats_me = doc.getElementsByAttributeValueContaining("rel", "me"); Elements microformats_met = doc.getElementsByAttributeValueContaining("rel", "met"); Elements microformats_muse = doc.getElementsByAttributeValueContaining("rel", "muse"); Elements microformats_neighbor = doc.getElementsByAttributeValueContaining("rel", "neighbor"); Elements microformats_next = doc.getElementsByAttributeValueContaining("rel", "next"); Elements microformats_nofollow = doc.getElementsByAttributeValueContaining("rel", "nofollow"); Elements microformats_parent = doc.getElementsByAttributeValueContaining("rel", "parent"); Elements microformats_prev = doc.getElementsByAttributeValueContaining("rel", "prev"); Elements microformats_previous = doc.getElementsByAttributeValueContaining("rel", "previous"); Elements microformats_section = doc.getElementsByAttributeValueContaining("rel", "section"); Elements microformats_sibling = doc.getElementsByAttributeValueContaining("rel", "sibling"); Elements microformats_spouse = doc.getElementsByAttributeValueContaining("rel", "spouse"); Elements microformats_start = doc.getElementsByAttributeValueContaining("rel", "start"); Elements microformats_stylesheet = doc.getElementsByAttributeValueContaining("rel", "stylesheet"); Elements microformats_subsection = doc.getElementsByAttributeValueContaining("rel", "subsection"); Elements microformats_sweetheart = doc.getElementsByAttributeValueContaining("rel", "sweetheart"); Elements microformats_tag = doc.getElementsByAttributeValueContaining("rel", "tag"); Elements microformats_toc = doc.getElementsByAttributeValueContaining("rel", "toc"); Elements microformats_transformation = doc.getElementsByAttributeValueContaining("rel", "transformation"); Elements microformats_appleti = doc.getElementsByAttributeValueContaining("rel", "apple-touch-icon"); Elements microformats_appletip = doc.getElementsByAttributeValueContaining("rel", "apple-touch-icon-precomposed"); Elements microformats_appletsi = doc.getElementsByAttributeValueContaining("rel", "apple-touch-startup-image"); Elements microformats_attachment = doc.getElementsByAttributeValueContaining("rel", "attachment"); Elements microformats_can = doc.getElementsByAttributeValueContaining("rel", "canonical"); Elements microformats_categ = doc.getElementsByAttributeValueContaining("rel", "category"); Elements microformats_compon = doc.getElementsByAttributeValueContaining("rel", "component"); Elements microformats_chromewebi = doc.getElementsByAttributeValueContaining("rel", "chrome-webstore-item"); Elements microformats_disclosure = doc.getElementsByAttributeValueContaining("rel", "disclosure"); Elements microformats_discussion = doc.getElementsByAttributeValueContaining("rel", "discussion"); Elements microformats_dns = doc.getElementsByAttributeValueContaining("rel", "dns-prefetch"); Elements microformats_edit = doc.getElementsByAttributeValueContaining("rel", "edit"); Elements microformats_edituri = doc.getElementsByAttributeValueContaining("rel", "EditURI"); Elements microformats_entrycon = doc.getElementsByAttributeValueContaining("rel", "entry-content"); Elements microformats_external = doc.getElementsByAttributeValueContaining("rel", "external"); Elements microformats_home = doc.getElementsByAttributeValueContaining("rel", "home"); Elements microformats_hub = doc.getElementsByAttributeValueContaining("rel", "hub"); Elements microformats_inreplyto = doc.getElementsByAttributeValueContaining("rel", "in-reply-to"); Elements microformats_index = doc.getElementsByAttributeValueContaining("rel", "index"); Elements microformats_indieauth = doc.getElementsByAttributeValueContaining("rel", "indieauth"); Elements microformats_issues = doc.getElementsByAttributeValueContaining("rel", "issues"); Elements microformats_lightbox = doc.getElementsByAttributeValueContaining("rel", "lightbox"); Elements microformats_meta = doc.getElementsByAttributeValueContaining("rel", "meta"); Elements microformats_openid = doc.getElementsByAttributeValueContaining("rel", "opendid"); Elements microformats_p3pv1 = doc.getElementsByAttributeValueContaining("rel", "p3pv1"); Elements microformats_pgpkey = doc.getElementsByAttributeValueContaining("rel", "pgpkey"); Elements microformats_pingback = doc.getElementsByAttributeValueContaining("rel", "pingback"); Elements microformats_prerender = doc.getElementsByAttributeValueContaining("rel", "prerender"); Elements microformats_profile = doc.getElementsByAttributeValueContaining("rel", "profile"); Elements microformats_rendition = doc.getElementsByAttributeValueContaining("rel", "rendition"); Elements microformats_service = doc.getElementsByAttributeValueContaining("rel", "service"); Elements microformats_shortlink = doc.getElementsByAttributeValueContaining("rel", "shortlink"); Elements microformats_sidebar = doc.getElementsByAttributeValueContaining("rel", "sidebar"); Elements microformats_sitemap = doc.getElementsByAttributeValueContaining("rel", "sitemap"); Elements microformats_subresource = doc.getElementsByAttributeValueContaining("rel", "subresource"); Elements microformats_syndication = doc.getElementsByAttributeValueContaining("rel", "syndication"); Elements microformats_timesheet = doc.getElementsByAttributeValueContaining("rel", "timesheet"); Elements microformats_webmention = doc.getElementsByAttributeValueContaining("rel", "webmention"); Elements microformats_widget = doc.getElementsByAttributeValueContaining("rel", "widget"); Elements microformats_wlwmanifest = doc.getElementsByAttributeValueContaining("rel", "wlwmanifest"); Elements microformats_imgsrc = doc.getElementsByAttributeValueContaining("rel", "image_src"); Elements microformats_cmisacl = doc.getElementsByAttributeValueContaining("rel", "http://docs.oasis-open.org/ns/cmis/link/200908/acl"); Elements microformats_stylesheetless = doc.getElementsByAttributeValueContaining("rel", "stylesheet/less"); Elements microformats_accessibility = doc.getElementsByAttributeValueContaining("rel", "accessibility"); Elements microformats_biblio = doc.getElementsByAttributeValueContaining("rel", "bibliography"); Elements microformats_cite = doc.getElementsByAttributeValueContaining("rel", "cite"); Elements microformats_group = doc.getElementsByAttributeValueContaining("rel", "group"); Elements microformats_jslicence = doc.getElementsByAttributeValueContaining("rel", "jslicense"); Elements microformats_longdesc = doc.getElementsByAttributeValueContaining("rel", "longdesc"); Elements microformats_map = doc.getElementsByAttributeValueContaining("rel", "map"); Elements microformats_member = doc.getElementsByAttributeValueContaining("rel", "member"); Elements microformats_source = doc.getElementsByAttributeValueContaining("rel", "source"); Elements microformats_status = doc.getElementsByAttributeValueContaining("rel", "status"); Elements microformats_archive = doc.getElementsByAttributeValueContaining("rel", "archive"); Elements microformats_archives = doc.getElementsByAttributeValueContaining("rel", "archives"); Elements microformats_comment = doc.getElementsByAttributeValueContaining("rel", "comment"); Elements microformats_contribution = doc.getElementsByAttributeValueContaining("rel", "contribution"); Elements microformats_endorsed = doc.getElementsByAttributeValueContaining("rel", "endorsed"); Elements microformats_fan = doc.getElementsByAttributeValueContaining("rel", "fan"); Elements microformats_feed = doc.getElementsByAttributeValueContaining("rel", "feed"); Elements microformats_footnote = doc.getElementsByAttributeValueContaining("rel", "footnote"); Elements microformats_icon = doc.getElementsByAttributeValueContaining("rel", "icon"); Elements microformats_kinstyle = doc.getElementsByAttributeValueContaining("rel", "kinetic-stylesheet"); Elements microformats_prettyphoto = doc.getElementsByAttributeValueContaining("rel", "prettyPhoto"); Elements microformats_clearbox = doc.getElementsByAttributeValueContaining("rel", "clearbox"); Elements microformats_made = doc.getElementsByAttributeValueContaining("rel", "made"); Elements microformats_microsummary = doc.getElementsByAttributeValueContaining("rel", "microsummary"); Elements microformats_noreferrer = doc.getElementsByAttributeValueContaining("rel", "noreferrer"); Elements microformats_permalink = doc.getElementsByAttributeValueContaining("rel", "permalink"); Elements microformats_popover = doc.getElementsByAttributeValueContaining("rel", "popover"); Elements microformats_prefetch = doc.getElementsByAttributeValueContaining("rel", "prefetch"); Elements microformats_publickey = doc.getElementsByAttributeValueContaining("rel", "publickey"); Elements microformats_publisher = doc.getElementsByAttributeValueContaining("rel", "publisher"); Elements microformats_referral = doc.getElementsByAttributeValueContaining("rel", "referral"); Elements microformats_related = doc.getElementsByAttributeValueContaining("rel", "related"); Elements microformats_replies = doc.getElementsByAttributeValueContaining("rel", "replies"); Elements microformats_resource = doc.getElementsByAttributeValueContaining("rel", "resource"); Elements microformats_search = doc.getElementsByAttributeValueContaining("rel", "search"); Elements microformats_sponsor = doc.getElementsByAttributeValueContaining("rel", "sponsor"); Elements microformats_tooltip = doc.getElementsByAttributeValueContaining("rel", "tooltip"); Elements microformats_trackback = doc.getElementsByAttributeValueContaining("rel", "trackback"); Elements microformats_unendorsed = doc.getElementsByAttributeValueContaining("rel", "unendorsed"); Elements microformats_user = doc.getElementsByAttributeValueContaining("rel", "user"); Elements microformats_wlw = doc.getElementsByAttributeValueContaining("rel", "wlwmanifest"); //-----microformats2 Elements microformats2_hadr = doc.getElementsByAttributeValueContaining("class", "h-adr"); Elements microformats2_hcard = doc.getElementsByAttributeValueContaining("class", "h-card"); Elements microformats2_hentry = doc.getElementsByAttributeValueContaining("class", "h-entry"); Elements microformats2_hevent = doc.getElementsByAttributeValueContaining("class", "h-event"); Elements microformats2_hgeo = doc.getElementsByAttributeValueContaining("class", "h-geo"); Elements microformats2_hitem = doc.getElementsByAttributeValueContaining("class", "h-item"); Elements microformats2_hproduct = doc.getElementsByAttributeValueContaining("class", "h-product"); Elements microformats2_hrecipe = doc.getElementsByAttributeValueContaining("class", "h-recipe"); Elements microformats2_hresume = doc.getElementsByAttributeValueContaining("class", "h-resume"); Elements microformats2_hreview = doc.getElementsByAttributeValueContaining("class", "h-review"); Elements microformats2_hreviewagg = doc.getElementsByAttributeValueContaining("class", "h-review-aggregate"); Elements foaf_autodiscoveries = doc.getElementsByAttributeValueContaining("href", "foaf"); Elements foaf_types = doc.getElementsByAttributeValueContaining("type", "foaf"); Elements media = doc.select("embed"); Elements iframes = doc.select("iframe"); Elements script_el = doc.select("script"); Elements reltags = doc.select("link[rel]"); Elements reltags_a = doc.select("a[rel]"); number_embeded_videos = media.size(); scripts_number = script_el.size(); frames_number = iframes.size(); nschem = schemas.size(); hreln = reltags.size() + reltags_a.size(); foaf = foaf_autodiscoveries.size() + foaf_types.size(); micron1 = microformats_cmisacl.size() + microformats_vcard.size() + microformats_vevent.size() + microformats_hreview.size() + microformats_vgeo.size() + microformats_vcalendar.size() + microformats_vadrn.size() + microformats_acquaintance.size() + microformats_alternate.size() + microformats_appendix.size() + microformats_biblio.size() + microformats_bookmark.size() + microformats_chapter.size() + microformats_child.size() + microformats_coll.size() + microformats_contact.size() + microformats_contents.size() + microformats_copyright.size() + microformats_coresident.size() + microformats_coworker.size() + microformats_crush.size() + microformats_date.size() + microformats_friend.size() + microformats_glossary.size() + microformats_help.size() + microformats_itsrules.size() + microformats_kin.size() + microformats_license.size() + microformats_me.size() + microformats_met.size() + microformats_muse.size() + microformats_neighbor.size() + microformats_next.size() + microformats_nofollow.size() + microformats_parent.size() + microformats_prev.size() + microformats_previous.size() + microformats_section.size() + microformats_sibling.size() + microformats_spouse.size() + microformats_start.size() + microformats_stylesheet.size() + microformats_subsection.size() + microformats_sweetheart.size() + microformats_tag.size() + microformats_toc.size() + microformats_transformation.size() + microformats_appleti.size() + microformats_appletip.size() + microformats_appletsi.size() + microformats_attachment.size() + microformats_can.size() + microformats_categ.size() + microformats_compon.size() + microformats_chromewebi.size() + microformats_disclosure.size() + microformats_discussion.size() + microformats_dns.size() + microformats_edit.size() + microformats_edituri.size() + microformats_entrycon.size() + microformats_external.size() + microformats_home.size() + microformats_hub.size() + microformats_inreplyto.size() + microformats_index.size() + microformats_indieauth.size() + microformats_issues.size() + microformats_lightbox.size() + microformats_meta.size() + microformats_openid.size() + microformats_p3pv1.size() + microformats_pgpkey.size() + microformats_pingback.size() + microformats_prerender.size() + microformats_profile.size() + microformats_rendition.size() + microformats_service.size() + microformats_shortlink.size() + microformats_sidebar.size() + microformats_sitemap.size() + microformats_subresource.size() + microformats_syndication.size() + microformats_timesheet.size() + microformats_webmention.size() + microformats_widget.size() + microformats_wlwmanifest.size() + microformats_imgsrc.size() + microformats_imgsrc.size() + microformats_stylesheetless.size() + microformats_accessibility.size() + microformats_accessibility.size() + microformats_cite.size() + microformats_group.size() + microformats_jslicence.size() + microformats_longdesc.size() + microformats_map.size() + microformats_member.size() + microformats_source.size() + microformats_status.size() + microformats_archive.size() + microformats_archives.size() + microformats_comment.size() + microformats_contribution.size() + microformats_endorsed.size() + microformats_fan.size() + microformats_feed.size() + microformats_footnote.size() + microformats_icon.size() + microformats_kinstyle.size() + microformats_prettyphoto.size() + microformats_clearbox.size() + microformats_made.size() + microformats_microsummary.size() + microformats_noreferrer.size() + microformats_permalink.size() + microformats_popover.size() + microformats_prefetch.size() + microformats_publickey.size() + microformats_publisher.size() + microformats_referral.size() + microformats_related.size() + microformats_replies.size() + microformats_resource.size() + microformats_search.size() + microformats_sponsor.size() + microformats_tooltip.size() + microformats_trackback.size() + microformats_unendorsed.size() + microformats_user.size() + microformats_wlw.size() + foaf; micron2 = microformats2_hadr.size() + microformats2_hcard.size() + microformats2_hentry.size() + microformats2_hevent.size() + microformats2_hgeo.size() + microformats2_hitem.size() + microformats2_hproduct.size() + microformats2_hrecipe.size() + microformats2_hresume.size() + microformats2_hreview.size() + microformats2_hreviewagg.size(); total_micron = micron1 + micron2; microd = microdata.size(); return true; } catch (IOException | IllegalCharsetNameException ex) { Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex); return false; } } /** * Method to get all the elements with a specific html feature (not used in SWebRank's current version) * @param link_html the url to check * @param dir the directory to save the file * @return a list with the text of all the elements */ public List<String> getbold(String link_html, String dir) { List<String> SEwords = new ArrayList<String>(); try { //link_html="http://www.themismavridis.com/"; Document doc = Jsoup.connect(link_html).get(); //---------to select the rest of the terms Elements bold = doc.select("em"); //bold=bold.select("b"); for (Element btext : bold) { String stringtosplit = btext.text().toString().toString(); if (!(stringtosplit == null) && (!(stringtosplit.equalsIgnoreCase("")))) { stringtosplit = stringtosplit.replaceAll("[\\W&&[^\\s]]", ""); if (!(stringtosplit == null) && (!(stringtosplit.equalsIgnoreCase("")))) { String[] tokenizedTerms = stringtosplit.split("\\W+"); for (int j = 0; j < tokenizedTerms.length; j++) { if (!(tokenizedTerms[j] == null) && (!(tokenizedTerms[j].equalsIgnoreCase("")))) { SEwords.add(tokenizedTerms[j]); } } } } } File file_thelist = new File(dir + "Javawords.txt"); FileUtils.writeLines(file_thelist, SEwords); return SEwords; } catch (IOException ex) { Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex); System.out.print("can not create the content file for SEwords"); return SEwords; } } /** * Method to check if we can connect with JSOUP to a specific url * @param link_html the url to connect * @return true/false */ public boolean checkconn(String link_html) { try { Connection.Response response = Jsoup.connect(link_html).timeout(10 * 1000).execute(); return response.statusCode() == 200; } catch (Exception ex) { Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex); System.out.print("can not connect to:" + link_html); return false; } } }