List of usage examples for org.apache.commons.lang StringUtils substringBetween
public static String substringBetween(String str, String open, String close)
Gets the String that is nested in between two Strings. Only the first match is returned, and the method returns null when the input is null or either delimiter cannot be found.
From source file:news.tna_youtube.Youtube.java
public boolean getLinkYT(String yturl) { try {/*from ww w . j a va 2 s . co m*/ URL oracle = new URL(yturl); textReader = new BufferedReader(new InputStreamReader(oracle.openStream())); String line; String line0 = ""; String line1 = ""; while ((line = textReader.readLine()) != null) { if (line.matches(".*(\"adaptive_fmts\":|\"url_encoded_fmt_stream_map\":).*")) { // behind that two strings are the comma separated video URLs we use rc = true; HashMap<String, String> sourceCodeVideoUrls = new HashMap<String, String>(); line = line.replaceAll(" ", ""); line = line.replace("%25", "%"); line = line.replace("\\u0026", "&"); if (line.contains("\"url_encoded_fmt_stream_map\":\"")) { line1 = StringUtils.substringBetween(line, "\"url_encoded_fmt_stream_map\":\"", "\""); } line0 = ""; if (line.contains("\"adaptive_fmts\":\"")) { line0 = StringUtils.substringBetween(line, "\"adaptive_fmts\":\"", "\""); } if (line0 == null) { line0 = ""; } if (line1 == null) { line1 = ""; } line = line0 + "," + line1; System.err .println(String.format("length sline0 sline1: %d %d", line0.length(), line1.length())); System.err.println(String.format("sline0 (adaptive fmt) %s \n sline1 (url_encoded fmt): %s", line0, line1)); String[] sourceCodeYoutubeUrls = line.split(","); System.err.println( "ssourcecodeuturls.length: ".concat(Integer.toString(sourceCodeYoutubeUrls.length))); for (String url : sourceCodeYoutubeUrls) { // assuming rtmpe is used for all resolutions, if found once - end download if (url.matches(".*conn=rtmpe.*")) { System.err.println("RTMPE found. cannot download this one!"); break; } String[] fmtUrlPair = url.split("url=http(s)?", 2); fmtUrlPair[1] = "url=http" + fmtUrlPair[1] + "&" + fmtUrlPair[0]; // grep itag=xz out and use xy as hash key // 2013-02 itag now has up to 3 digits fmtUrlPair[0] = fmtUrlPair[1].substring(fmtUrlPair[1].indexOf("itag=") + 5, fmtUrlPair[1].indexOf("itag=") + 5 + 1 + (fmtUrlPair[1].matches(".*itag=[0-9]{2}.*") ? 
1 : 0) + (fmtUrlPair[1].matches(".*itag=[0-9]{3}.*") ? 1 : 0)); if (yturl.startsWith("https")) { fmtUrlPair[1] = fmtUrlPair[1].replaceFirst("url=http%3A%2F%2F", "https://"); // webpage source code only provides http urls if accessed via wget or ytd2 - the browser does something unknown so google sends back httpS urls within source code! } else { fmtUrlPair[1] = fmtUrlPair[1].replaceFirst("url=http%3A%2F%2F", "http://"); } fmtUrlPair[1] = fmtUrlPair[1].replaceAll("%3F", "?").replaceAll("%2F", "/") .replaceAll("%3B", ";")/*.replaceAll("%2C",",")*/.replaceAll("%3D", "=") .replaceAll("%26", "&").replaceAll("%252C", "%2C").replaceAll("sig=", "signature=") .replaceAll("&s=", "&signature=").replaceAll("\\?s=", "?signature="); // remove duplicate parts between & String sortedUrl = this.sortStringAt(fmtUrlPair[1], "&"); System.err.println(String.format("video tag: %s url: %s", fmtUrlPair[0], sortedUrl)); // fmtUrlPair[1] -> ssortedURL fmtUrlPair[1] = sortedUrl; try { sourceCodeVideoUrls.put(fmtUrlPair[0], fmtUrlPair[1]); // save that URL //debugoutput(String.format( "video url saved with key %s: %s",fmtUrlPair[0],ssourcecodevideourls.get(fmtUrlPair[0]) )); resolutions = resolutions.concat(fmtUrlPair[0].equals("138") ? "2304p mpeg, " : // 4k HD type=video/mp4;+codecs="avc1.640033" & size=4096x2304 fmtUrlPair[0].equals("264") ? "1440p mpeg, " : // <4k HD type=video/mp4;+codecs="avc1.640032" & size=2560x1440 fmtUrlPair[0].equals("266") ? "2160p mpeg, " : // <4k HD type=video/mp4;+codecs=%22avc1.640033" & size=3840x2160& fmtUrlPair[0].equals("271") ? "1440p webm, " : // <4k HD type=video/webm;+codecs=%22vp9" & size=2560x1440 fmtUrlPair[0].equals("272") ? "2160p webm, " : // <4k HD type=video/webm;+codecs="vp9" & size=3840x2160 fmtUrlPair[0].equals("248") ? "1080p mpeg, " : // HD type=video/webm;+codecs="vp9" & size=1920x1080 fmtUrlPair[0].equals("37") ? "1080p mpeg, " : // HD type=video/mp4;+codecs="avc1.64001F,+mp4a.40.2" fmtUrlPair[0].equals("22") ? 
"720p mpeg, " : // HD type=video/mp4;+codecs="avc1.64001F,+mp4a.40.2" fmtUrlPair[0].equals( "247") ? "1080p mpeg, " : // HD type=video/webm;+codecs="vp9" & size=1280x720 fmtUrlPair[0] .equals("84") ? "1080p 3d mpeg, " : // HD 3D type=video/mp4;+codecs="avc1.64001F,+mp4a.40.2" fmtUrlPair[0] .equals("18") ? "360p mpeg, " : // SD type=video/mp4;+codecs="avc1.42001E,+mp4a.40.2" fmtUrlPair[0] .equals("35") ? "480p flv, " : // SD type=video/x-flv fmtUrlPair[0] .equals("34") ? "360p flv, " : // SD type=video/x-flv fmtUrlPair[0] .equals("82") ? "360p 3d mpeg, " : // SD 3D type=video/mp4;+codecs="avc1.42001E,+mp4a.40.2" fmtUrlPair[0] .equals("36") ? "240p mpeg 3gpp, " : // LD type=video/3gpp;+codecs="mp4v.20.3,+mp4a.40.2" fmtUrlPair[0] .equals("17") ? "114p mpeg 3gpp, " : // LD type=video/3gpp;+codecs="mp4v.20.3,+mp4a.40.2" fmtUrlPair[0] .equals("46") ? "1080p webm, " : // HD type=video/webm;+codecs="vp8.0,+vorbis" fmtUrlPair[0] .equals("45") ? "720p webm, " : // HD type=video/webm;+codecs="vp8.0,+vorbis" fmtUrlPair[0] .equals("100") ? "1080p 3d webm, " : // HD 3D type=video/webm;+codecs="vp8.0,+vorbis" fmtUrlPair[0] .equals("44") ? "480p webm, " : // SD type=video/webm;+codecs="vp8.0,+vorbis" fmtUrlPair[0] .equals("43") ? "360p webm, " : // SD type=video/webm;+codecs="vp8.0,+vorbis" fmtUrlPair[0] .equals("102") ? "360p 3d webm, " : // SD 3D type=video/webm;+codecs="vp8.0,+vorbis" fmtUrlPair[0] .equals("244") ? "480p webm, " : // SD type=video/webm;+codecs="vp9" & size=854x480 fmtUrlPair[0] .equals("5") ? "240p flv, " : // LD type=video/x-flv fmtUrlPair[0] .equals("137") ? "1080p mpeg, " : // HD type=video/mp4;+codecs="avc1.640028" & size=1920x1080 fmtUrlPair[0] .equals("136") ? "720p mpeg, " : // HD type=video/mp4;+codecs="avc1.4d401f" & size=1280x720 fmtUrlPair[0] .equals("135") ? "480p mpeg, " : // SD type=video/mp4;+codecs="avc1.4d401f" & size=854x480 fmtUrlPair[0] .equals("134") ? 
"360p mpeg, " : // SD type=video/mp4;+codecs="avc1.4d401e" & size=640x360 fmtUrlPair[0] .equals("133") ? "240p mpeg, " : // LD type=video/mp4;+codecs="avc1.4d4015" & size=426x240 fmtUrlPair[0] .equals("160") ? "144p mpeg, " : // LD type=video/mp4;+codecs="avc1.42c00c" & size=256x144 fmtUrlPair[0] .equals("243") ? "360p webm, " : // LD type=video/webm;+codecs="vp9" fmtUrlPair[0] .equals("242") ? "240p webm, " : // LD type=video/webm;+codecs="vp9" fmtUrlPair[0] .equals("140") ? "mpeg audio only, " : // ?? type=audio/mp4;+codecs="mp4a.40.2 & bitrate=127949 fmtUrlPair[0] .equals("171") ? "ogg vorbis audio only, " : // ?? audio/webm;+codecs="vorbis" & bitrate=127949 "unknown resolution! (" .concat(fmtUrlPair[0]) .concat(") ")); } catch (java.lang.ArrayIndexOutOfBoundsException aioobe) { //TODO catch must not be empty } } } } textReader.close(); } catch (MalformedURLException ex) { Logger.getLogger(Youtube.class.getName()).log(Level.SEVERE, null, ex); } catch (IOException ex) { Logger.getLogger(Youtube.class.getName()).log(Level.SEVERE, null, ex); } return false; }
From source file:nl.ucan.navigate.NestedPath.java
/**
 * Replaces the text nested between the resolver's index markers in
 * {@code next} with the given numeric {@code position}.
 * When no marker pair is present, substringBetween yields null and
 * StringUtils.replace then returns {@code next} unchanged.
 */
private static String namedToPositioned(String next, int position) {
    String openMarker = "" + ResolverImpl.getIndexedStart();
    String closeMarker = "" + ResolverImpl.getIndexedEnd();
    String indexToken = StringUtils.substringBetween(next, openMarker, closeMarker);
    return StringUtils.replace(next, indexToken, "" + position);
}
From source file:no.imr.stox.functions.utils.RUtils.java
public static String callR(String rFolder, String fileName, String cmd) { try {// w w w .j a va 2s.c om String triggerFile = getTmpDir() + fileName; try (PrintWriter pw = new PrintWriter(triggerFile)) { pw.println(cmd); } Process proc = callR(rFolder, triggerFile, false); if (proc != null) { java.io.InputStream is = proc.getInputStream(); java.util.Scanner s = new java.util.Scanner(is).useDelimiter(";"); while (s.hasNext()) { String str = s.next(); return StringUtils.substringBetween(str, "\"", "\""); } } } catch (IOException ex) { } return ""; }
From source file:opennlp.tools.apps.relevanceVocabs.PhraseProcessor.java
public List<String> extractNounPhraseProductNameCandidate(String sentence) { List<String> queryArrayStr = new ArrayList<String>(); if (sentence.split(" ").length == 1) { // this is a word, return empty //queryArrayStr.add( sentence); return queryArrayStr; }/*ww w. j av a 2 s. c om*/ String quoted1 = StringUtils.substringBetween(sentence, "\"", "\""); String quoted2 = StringUtils.substringBetween(sentence, "\'", "\'"); List<List<ParseTreeChunk>> groupedChunks = nlProc.formGroupedPhrasesFromChunksForPara(sentence); if (groupedChunks.size() < 1) return queryArrayStr; List<ParseTreeChunk> nPhrases = groupedChunks.get(0); for (ParseTreeChunk ch : nPhrases) { String query = ""; int size = ch.getLemmas().size(); boolean phraseBeingFormed = false; for (int i = 0; i < size; i++) { if ((ch.getPOSs().get(i).startsWith("N") || ch.getPOSs().get(i).startsWith("J") || ch.getPOSs().get(i).startsWith("CD"))) // && StringUtils.isAlpha(ch.getLemmas().get(i))) { query += ch.getLemmas().get(i) + " "; phraseBeingFormed = true; } else if ((ch.getPOSs().get(i).startsWith("PR") || ch.getPOSs().get(i).startsWith("IN") || ch.getPOSs().get(i).startsWith("TO")) && phraseBeingFormed) break; else if (ch.getPOSs().get(i).startsWith("DT") || ch.getPOSs().get(i).startsWith("CC")) continue; } query = query.trim(); int len = query.split(" ").length; if (len > 5 || len < 2) // too long or too short continue; /* if (len < 4 && len>1) { // every word should start with capital String[] qs = query.split(" "); boolean bAccept = true; for (String w : qs) { if (w.toLowerCase().equals(w)) // idf only two words then // has to be person name, // title or geo // location bAccept = false; } if (!bAccept) continue; } */ // individual word, possibly a frequent word // if len==1 do nothing query = query.trim(); queryArrayStr.add(query); } /* if (queryArrayStr.size() < 1) { // release constraints on NP down to 2 // keywords for (ParseTreeChunk ch : nPhrases) { String query = ""; int size = ch.getLemmas().size(); for 
(int i = 0; i < size; i++) { if (ch.getPOSs().get(i).startsWith("N") || ch.getPOSs().get(i).startsWith("J")) { query += ch.getLemmas().get(i) + " "; } } query = query.trim(); int len = query.split(" ").length; if (len < 2) continue; query = TextProcessor.fastTokenize(query.toLowerCase(), false) .toString().replace('[', ' ').replace(']', ' ').trim(); if (query.length() > 6) queryArrayStr.add(query); } } //queryArrayStr = Utils // .removeDuplicatesFromQueries(queryArrayStr); if (quoted1 != null && ((quoted1.length() > 5 && !stopList.isCommonWord(quoted1)) || quoted1 .length() > 10)) queryArrayStr.add(quoted1); if (quoted2 != null && ((quoted2.length() > 5 && !stopList.isCommonWord(quoted2)) || quoted2 .length() > 10)) queryArrayStr.add(quoted2); */ return queryArrayStr; }
From source file:opennlp.tools.doc_classifier.DocClassifierTrainingSetMultilingualExtender.java
public void processDirectory(String fileName) throws IOException { List<String[]> report = new ArrayList<String[]>(); report.add(new String[] { "filename", "category", "confirmed?", }); addFiles(new File(fileName)); // FileUtils.deleteDirectory(new File(destinationDir)); // FileUtils.forceMkdir(new File(destinationDir)); for (File f : queue) { String content = null;/*from w w w . ja v a 2 s .com*/ try {// should be wiki page //if (f.getName().toString().toLowerCase().indexOf(" wiki")<0 && // if ( f.getAbsolutePath().indexOf("wiki-new")<0) // continue; // should not be a page already derived by a link if (f.getName().toString().toLowerCase().indexOf(".html_") > -1) continue; System.out.println("processing " + f.getName()); content = FileUtils.readFileToString(f, "utf-8"); int langIndex = 0; for (String[] begEnd : multilingualTokens) { String urlDirty = StringUtils.substringBetween(content, begEnd[0], begEnd[1]); String url = StringUtils.substringBefore(urlDirty, "\""); if (url != null) { if (!url.startsWith("http:")) url = "https:" + url; String[] parts = url.split("/"); String multilingualName = parts[parts.length - 1]; String destFileName = f.getAbsolutePath().replace(sourceDir, destinationDir) .replace(" - Wikipedia, the free encyclopedia.html", "-wiki") + "." + langs[langIndex] + "." + "_" + multilingualName + ".html"; if (!new File(destFileName).exists()) { saveDocFromTheWeb(url, destFileName); System.out.println(f.getName() + " => " + destFileName); } } else { System.out.println("Unable to extract multilingual urls for'" + langs[langIndex] + "' from file " + f.getCanonicalPath()); } langIndex++; } } catch (Exception ee) { ee.printStackTrace(); } } queue.clear(); }
From source file:opennlp.tools.jsmlearning.FeatureSpaceCoverageProcessor.java
public Map<String, String> computeIntersection(String[] line1, String[] line2) { Map<String, String> attr_value = new HashMap<String, String>(); for (String attr : attributes) { int attrIndex = getIdForAttributeName(attr); String v1 = line1[attrIndex].toLowerCase().replace("\"", "").replace(", ", ", ").replace(", ", ","); ;/*from w w w .j a va2s. c o m*/ String v2 = line2[attrIndex].toLowerCase().replace("\"", "").replace(", ", ", ").replace(", ", ","); ; String valArr1Str = StringUtils.substringBetween(v1, "{", "}"); String valArr2Str = StringUtils.substringBetween(v2, "{", "}"); if (valArr1Str == null || valArr2Str == null) { // we assume single value, not an array of values if (v1.equals(v2)) { attr_value.put(attr, v1); } } else { valArr1Str = valArr1Str.replaceAll(", ", ","); valArr2Str = valArr2Str.replaceAll(", ", ","); String[] valArr1 = valArr1Str.split(","); String[] valArr2 = valArr2Str.split(","); List<String> valList1 = new ArrayList<String>(Arrays.asList(valArr1)); List<String> valList2 = new ArrayList<String>(Arrays.asList(valArr2)); valList1.retainAll(valList2); /* verification of coverage valList1.retainAll(valList2); List<String> vl1 = new ArrayList<String>(Arrays.asList(valArr1)); valList1.retainAll(vl1); */ if (!valList1.isEmpty()) { v1 = "{" + valList1.toString().replace("[", " ").replace("]", " ").trim() + "}"; attr_value.put(attr, v1); } } } return attr_value; }
From source file:opennlp.tools.jsmlearning.FeatureSpaceCoverageProcessor.java
public boolean ruleCoversCase(Map<String, String> attr_value, String[] line) { boolean soFarCovers = true; for (String attr : attributes) { int attrIndex = getIdForAttributeName(attr); String rule = attr_value.get(attr); if (rule == null) continue; // no constraint rule = rule.toLowerCase().replace("\"", "").replace(", ", ",").replace(", ", ","); String vCase = line[attrIndex].toLowerCase().replace("\"", "").replace(", ", ",").replace(", ", ","); if (vCase == null) {// rule for this attribute exists but case has no value soFarCovers = false;// w w w . j a v a2s . c o m return false; } String valArrCaseStr = StringUtils.substringBetween(vCase, "{", "}"); String valArrRuleStr = StringUtils.substringBetween(rule, "{", "}"); if (valArrCaseStr == null || valArrRuleStr == null) { // we assume single value, not an array of values if (!vCase.equals(rule)) { soFarCovers = false; return false; } } else { String[] valArrCase = valArrCaseStr.split(","); String[] valArrRule = valArrRuleStr.split(","); List<String> valListCase = new ArrayList<String>(Arrays.asList(valArrCase)); List<String> valListRule = new ArrayList<String>(Arrays.asList(valArrRule)); int ruleSize = valListRule.size(); //System.out.println(valListRule); //System.out.println(valListCase); // rule members are subset of case valListRule.retainAll(valListCase); //System.out.println(valListRule); if (ruleSize != valListRule.size()) { soFarCovers = false; return false; } } } return soFarCovers; }
From source file:opennlp.tools.jsmlearning.FeatureSpaceCoverageProcessor.java
public boolean ruleCoversRule(Map<String, String> attr_value, Map<String, String> line) { boolean soFarCovers = true; for (String attr : attributes) { int attrIndex = getIdForAttributeName(attr); String rule = attr_value.get(attr); if (rule == null) continue; // no constraint String vRuleBeingCovered = line.get(attr); if (vRuleBeingCovered == null) {// rule for this attribute exists but RuleBeingCovered has no value soFarCovers = false;/*from w w w. ja v a2 s .c om*/ return false; } String valArrRuleBeingCoveredStr = StringUtils.substringBetween(vRuleBeingCovered, "{", "}"); String valArrRuleStr = StringUtils.substringBetween(rule, "{", "}"); if (valArrRuleBeingCoveredStr == null || valArrRuleStr == null) { // we assume single value, not an array of values if (!vRuleBeingCovered.equals(rule)) { soFarCovers = false; return false; } } else { String[] valArrRuleBeingCovered = valArrRuleBeingCoveredStr.split(","); String[] valArrRule = valArrRuleStr.split(","); List<String> valListRuleBeingCovered = new ArrayList<String>(Arrays.asList(valArrRuleBeingCovered)); List<String> valListRule = new ArrayList<String>(Arrays.asList(valArrRule)); for (String r : valListRule) { if (!strListContainsMember(valListRuleBeingCovered, r)) { soFarCovers = false; return false; } } } } return soFarCovers; }
From source file:opennlp.tools.jsmlearning.FeatureSpaceCoverageProcessor.java
public Map<String, String> computeIntersection(Map<String, String> rule1, Map<String, String> rule2) { Map<String, String> attr_value = new HashMap<String, String>(); for (String attr : attributes) { int attrIndex = getIdForAttributeName(attr); String v1 = rule1.get(attr); String v2 = rule2.get(attr); if (v1 == null || v2 == null) continue; String valArr1Str = StringUtils.substringBetween(v1, "{", "}"); String valArr2Str = StringUtils.substringBetween(v2, "{", "}"); if (valArr1Str == null || valArr2Str == null) { // we assume single value, not an array of values if (v1.equals(v2)) { attr_value.put(attr, v1); }//from w ww . ja va 2 s . co m } else { valArr1Str = valArr1Str.replaceAll(", ", ","); valArr2Str = valArr2Str.replaceAll(", ", ","); String[] valArr1 = valArr1Str.split(","); String[] valArr2 = valArr2Str.split(","); List<String> valList1 = new ArrayList<String>(Arrays.asList(valArr1)); List<String> valList2 = new ArrayList<String>(Arrays.asList(valArr2)); valList1.retainAll(valList2); if (!valList1.isEmpty()) { v1 = "{" + valList1.toString().replace("[", " ").replace("]", " ").trim() + "}"; attr_value.put(attr, v1); } } } return attr_value; }
From source file:opennlp.tools.parse_thicket.apps.WebPageExtractor.java
/**
 * Fetches a web page and produces three string arrays: candidate page titles
 * (from the &lt;title&gt; tag and h2/h3 headers), the most frequent words on
 * the page, and the longest sentences found in the page text.
 *
 * @param url page to analyze
 * @return list of [titles, frequentWords, longestSentences], or null when
 *         the page could not be downloaded or is too short
 */
public List<String[]> extractSentencesWithPotentialProductKeywords(String url) {
    int maxSentsFromPage = 20;
    List<String[]> results = new ArrayList<String[]>();
    String downloadedPage = pageFetcher.fetchPage(url, 20000);
    if (downloadedPage == null || downloadedPage.length() < 100) {
        return null;
    }
    String pageOrigHTML = pageFetcher.fetchOrigHTML(url);
    String pageTitle = StringUtils.substringBetween(pageOrigHTML, "<title>", "</title>");
    List<String> pageTitles = new ArrayList<String>();
    if (pageTitle != null) { // fix: pages without a <title> tag used to NPE here
        pageTitle = pageTitle.replace(" ", ". ").replace("..", ".").replace(". . .", " ").replace(": ", ". ")
                .replace("- ", ". ").replace(" |", ". ").replace(". .", ".").trim();
        pageTitles.addAll(TextProcessor.splitToSentences(pageTitle));
        // fix: split(".") takes a regex — an unescaped dot matches every char
        // and always produced an empty array; escape it to split on periods
        pageTitles.addAll(Arrays.asList(pageTitle.split("\\.")));
    }
    String[] headerSections = pageOrigHTML.split("<h2");
    if (headerSections.length < 2)
        headerSections = pageOrigHTML.split("<h3");
    for (String section : headerSections) {
        String header = StringUtils.substringBetween(section, ">", "<");
        if (header != null && header.length() > 20)
            pageTitles.add(header);
    }
    // NOTE(review): replacing single spaces with '&' splits on every word;
    // the original source likely collapsed a run of whitespace that the
    // scrape normalized to one space — confirm against upstream
    downloadedPage = downloadedPage.replace(" ", "&");
    downloadedPage = downloadedPage.replaceAll("(?:&)+", "#");
    String[] sents = downloadedPage.split("#");
    List<TextChunk> sentsList = new ArrayList<TextChunk>();
    for (String s : sents) {
        s = s.trim().replace(" ", ". ").replace("..", ".").replace(". . .", " ").replace(": ", ". ")
                .replace("- ", ". ").replace(". .", ".").trim();
        sentsList.add(new TextChunk(s, s.length()));
    }
    Collections.sort(sentsList, new TextChunkComparable());
    // fix: guard pages with fewer than maxSentsFromPage sentences — the
    // original start index went negative and threw IndexOutOfBoundsException,
    // and the fixed-size array kept trailing nulls
    int count = Math.min(maxSentsFromPage, sentsList.size());
    String[] longestSents = new String[count];
    int j = 0;
    for (int i = sentsList.size() - count; i < sentsList.size(); i++) {
        longestSents[j] = sentsList.get(i).text;
        j++;
    }
    sents = cleanListOfSents(longestSents);
    List<String> mosFrequentWordsListFromPage = mostFrequentWordsFromPageGetter
            .getMostFrequentWordsInTextArr(sents);
    results.add(pageTitles.toArray(new String[0]));
    results.add(mosFrequentWordsListFromPage.toArray(new String[0]));
    results.add(sents);
    return results;
}