List of usage examples for org.apache.commons.lang.text.StrTokenizer.reset
public StrTokenizer reset(char[] input)
From source file:com.savy3.util.DBConfiguration.java
/** * Converts a String back to connection parameters. * @param input String from configuration * @return JDBC connection parameters/*ww w . j a v a 2 s. c o m*/ */ protected static Properties propertiesFromString(String input) { if (input != null && !input.isEmpty()) { Properties result = new Properties(); StrTokenizer propertyTokenizer = StrTokenizer.getCSVInstance(input); StrTokenizer valueTokenizer = StrTokenizer.getCSVInstance(); valueTokenizer.setDelimiterChar('='); while (propertyTokenizer.hasNext()) { valueTokenizer.reset(propertyTokenizer.nextToken()); String[] values = valueTokenizer.getTokenArray(); if (values.length == 2) { result.put(values[0], values[1]); } } return result; } else { return null; } }
From source file:it.drwolf.ridire.session.async.Mapper.java
/**
 * Counts the lines of a PoS-tagged resource file that carry a valid PoS tag.
 *
 * <p>Every line is split with a TSV tokenizer. A line is counted when it yields exactly three
 * tokens and the trimmed second token is accepted by {@code Mapper.isValidPos}.
 *
 * @param posTagResourceFileName path of the file to read
 * @return number of counted lines
 * @throws IOException if the file cannot be read
 */
@SuppressWarnings("unchecked")
public static Integer countWordsFromPoSTagResource(String posTagResourceFileName) throws IOException {
    List<String> rows = FileUtils.readLines(new File(posTagResourceFileName));
    StrTokenizer tsvTokenizer = StrTokenizer.getTSVInstance();
    int validWords = 0;
    for (String row : rows) {
        String[] columns = tsvTokenizer.reset(row).getTokenArray();
        if (columns.length != 3) {
            continue;
        }
        if (Mapper.isValidPos(columns[1].trim())) {
            validWords++;
        }
    }
    return validWords;
}
From source file:it.drwolf.ridire.session.async.WordCounter.java
public Integer countWordsFromPoSTagResource(File posTagResourceFile) throws IOException { List<String> lines = FileUtils.readLines(posTagResourceFile); Integer count = 0;//from w w w . j ava2 s . c o m StrTokenizer tokenizer = StrTokenizer.getTSVInstance(); for (String l : lines) { tokenizer.reset(l); String[] tokens = tokenizer.getTokenArray(); if (tokens.length == 3) { if (this.isValidPos(tokens[1].trim())) { ++count; } } } return count; }
From source file:it.drwolf.ridire.index.cwb.CWBFrequencyList.java
private String getFrequencyList(boolean deleteFLFile, List<String> semDescription, List<String> funDescription, int quantityP, String type, Integer threshold, boolean sorted) { CommandLine commandLine = CommandLine.parse(this.cwbscanExecutable); commandLine.addArgument("-q"); if (threshold != null && threshold > 0) { commandLine.addArgument("-f"); commandLine.addArgument(threshold + ""); }//w w w.j a v a 2 s . c om commandLine.addArgument("-r").addArgument(this.cqpRegistry); commandLine.addArgument("-C"); commandLine.addArgument(this.cqpCorpusName); if (type.equals("forma")) { commandLine.addArgument("word+0"); } else if (type.equals("PoS")) { commandLine.addArgument("pos+0"); } else if (type.equals("easypos")) { commandLine.addArgument("easypos+0"); } else if (type.equals("lemma")) { commandLine.addArgument("lemma+0"); } else if (type.equals("PoS-forma")) { commandLine.addArgument("pos+0"); commandLine.addArgument("word+0"); } else if (type.equals("PoS-lemma")) { commandLine.addArgument("pos+0"); commandLine.addArgument("lemma+0"); } String semFuncParam = ""; if (funDescription != null && funDescription.size() > 0 && funDescription.get(0) != null && funDescription.get(0).trim().length() > 0 || semDescription != null && semDescription.size() > 0 && semDescription.get(0) != null && semDescription.get(0).trim().length() > 0) { semFuncParam = "?"; if (funDescription != null && funDescription.size() > 0 && funDescription.get(0) != null && funDescription.get(0).trim().length() > 0) { String fd = StringUtils.join(funDescription, "\\|"); semFuncParam += "text_functional=/\\(" + fd + "\\)/ "; } if (semDescription != null && semDescription.size() > 0 && semDescription.get(0) != null && semDescription.get(0).trim().length() > 0) { String sd = StringUtils.join(semDescription, "\\|"); semFuncParam += "text_semantic=/\\(" + sd + "\\)/ "; } commandLine.addArgument(semFuncParam); } if (sorted) { commandLine.addArgument("|"); commandLine.addArgument("sort"); 
commandLine.addArgument("-nr"); commandLine.addArgument("-k"); commandLine.addArgument("1"); } if (quantityP > 0) { commandLine.addArgument("|"); commandLine.addArgument("head"); commandLine.addArgument("-" + quantityP); } File flTempFile = null; try { flTempFile = File.createTempFile("ridireFL", null); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } commandLine.addArgument(" > "); commandLine.addArgument(flTempFile.getAbsolutePath()); String c = commandLine.toString(); try { File tempSh = File.createTempFile("ridireSH", ".sh"); FileUtils.writeStringToFile(tempSh, c); tempSh.setExecutable(true); commandLine = CommandLine.parse(tempSh.getAbsolutePath()); DefaultExecutor executor = new DefaultExecutor(); executor.setExitValue(0); ExecuteWatchdog watchdog = new ExecuteWatchdog(CWBFrequencyList.TIMEOUT); executor.setWatchdog(watchdog); ByteArrayOutputStream baosStdOut = new ByteArrayOutputStream(1024); ByteArrayOutputStream baosStdErr = new ByteArrayOutputStream(1024); ExecuteStreamHandler executeStreamHandler = new PumpStreamHandler(baosStdOut, baosStdErr, null); executor.setStreamHandler(executeStreamHandler); int exitValue = 0; exitValue = executor.execute(commandLine); FileUtils.deleteQuietly(tempSh); if (exitValue == 0) { StrTokenizer strTokenizer = new StrTokenizer(); this.frequencyList = new ArrayList<FrequencyItem>(); List<String> lines = FileUtils.readLines(flTempFile); for (String line : lines) { strTokenizer.reset(line); String[] tokens = strTokenizer.getTokenArray(); if (tokens.length == 2) { FrequencyItem frequencyItem = new FrequencyItem(tokens[1], Integer.parseInt(tokens[0].trim())); this.frequencyList.add(frequencyItem); } else if (tokens.length == 3) { FrequencyItem frequencyItem = new FrequencyItem(tokens[2], tokens[1], Integer.parseInt(tokens[0].trim())); this.frequencyList.add(frequencyItem); } } if (deleteFLFile) { FileUtils.deleteQuietly(flTempFile); } } } catch (ExecuteException e) { // TODO Auto-generated 
catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return flTempFile.getAbsolutePath(); }
From source file:it.drwolf.ridire.index.sketch.AsyncSketchCreator.java
private Map<String, Map<String, Number>> createResTable(List<String> lines, StrTokenizer strTokenizer) { Map<String, Map<String, Number>> resTable = new HashMap<String, Map<String, Number>>(); for (String line : lines) { String[] tokens = strTokenizer.reset(line).getTokenArray(); if (tokens.length != 3) { continue; }//from w w w .java 2s . c o m String[] lemmas = tokens[0].split("\\s"); String[] poss = tokens[1].split("\\s"); if (lemmas.length != poss.length || poss.length < 2) { continue; } String target = tokens[2].trim(); String preArtpre = null; for (int i = 1; i < poss.length; i++) { if (poss[i].trim().matches("PRE|ARTPRE")) { preArtpre = lemmas[i].trim(); break; } } if (preArtpre == null) { continue; } Map<String, Number> tableForPre = resTable.get(preArtpre); if (tableForPre == null) { tableForPre = new HashMap<String, Number>(); } Number n = tableForPre.get(target); if (n == null) { tableForPre.put(target, 1); } else { tableForPre.put(target, n.intValue() + 1); } resTable.put(preArtpre, tableForPre); } return resTable; }
From source file:it.drwolf.ridire.index.sketch.AsyncSketchCreator.java
private void processNotTrinaryTable(HashMap<String, SketchResult> sr, String freqTable, long firstFreq, StrTokenizer strTokenizer, List<String> lines) { if (lines != null && lines.size() > 0 && lines.get(0).trim().length() > 0) { // fA = first line Number fA = Long.valueOf(lines.get(0).trim()); for (String l : lines) { String[] tokens = strTokenizer.reset(l).getTokenArray(); if (tokens.length != 2) { continue; }/*from ww w. j a v a 2 s . c om*/ String f = tokens[1]; List<Number> fBs = this.entityManager .createNativeQuery("select freq from " + freqTable + " where item=:item") .setParameter("item", f).getResultList(); if (fBs == null || fBs.size() < 1) { continue; } SketchResult res = sr.get(f); if (res == null) { res = new SketchResult(); } long fB = fBs.get(0).longValue(); if (fBs != null && fBs.size() > 0 && fB > 0) { res.setCollocata(f); long n = this.corpusSizeParams.getCorpusSize(freqTable.substring(5)).longValue(); long fAB = Long.parseLong(tokens[0]); double score = this.getSketchScore(CWBCollocatesExtractor.LOGDICE_SCORE, fA.longValue(), fB, fAB, n); res.setScore(score); res.setfA(fA.longValue()); res.setfAB(fAB); res.setfB(fB); sr.put(f, res); } } } }
From source file:it.drwolf.ridire.util.async.FrequencyListGenerator.java
private Map<String, Integer> getBareTable(List<String> corporaNames, String functionalMetadatumDescription, String semanticMetadatumDescription, String frequencyBy) throws IOException { Map<String, Integer> fl = new HashMap<String, Integer>(); Query q = new BooleanQuery(); if (corporaNames != null && corporaNames.size() > 0 && !(corporaNames.size() == 1 && corporaNames.get(0) == null)) { BooleanQuery corporaQuery = new BooleanQuery(); for (String cn : corporaNames) { if (cn != null) { corporaQuery.add(new TermQuery(new Term("corpus", cn)), Occur.SHOULD); }/* w ww .j a va 2 s . c o m*/ } ((BooleanQuery) q).add(corporaQuery, Occur.MUST); } if (functionalMetadatumDescription != null) { TermQuery funcQuery = new TermQuery(new Term("functionalMetadatum", functionalMetadatumDescription)); ((BooleanQuery) q).add(funcQuery, Occur.MUST); } if (semanticMetadatumDescription != null) { TermQuery semaQuery = new TermQuery(new Term("semanticMetadatum", semanticMetadatumDescription)); ((BooleanQuery) q).add(semaQuery, Occur.MUST); } PrefixQuery prefixQuery = new PrefixQuery(new Term("performaFL", "")); ((BooleanQuery) q).add(prefixQuery, Occur.MUST); IndexSearcher indexSearcher = this.contextsIndexManager.getIndexSearcherR(); System.out.println("Starting FL calculation"); TotalHitCountCollector totalHitCountCollector = new TotalHitCountCollector(); indexSearcher.search(q, null, totalHitCountCollector); int totalHits = totalHitCountCollector.getTotalHits(); System.out.println("Frequency list calculation. 
Docs to be processed: " + totalHits); ScoreDoc after = null; int docsProcessed = 0; for (int j = 0; j < totalHits; j += FrequencyListGenerator.BATCH_SIZE) { TopDocs topDocs = null; if (after == null) { topDocs = indexSearcher.search(q, FrequencyListGenerator.BATCH_SIZE); } else { topDocs = indexSearcher.searchAfter(after, q, FrequencyListGenerator.BATCH_SIZE); } StrTokenizer strTokenizer = new StrTokenizer(); strTokenizer.setDelimiterString(ContextAnalyzer.SEPARATOR); ScoreDoc[] scoreDocs = topDocs.scoreDocs; if (scoreDocs != null) { for (ScoreDoc scoreDoc : scoreDocs) { ++docsProcessed; after = scoreDoc; TermFreqVector termFreqVector = indexSearcher.getIndexReader().getTermFreqVector(scoreDoc.doc, "performaFL"); if (termFreqVector == null) { continue; } String[] terms = termFreqVector.getTerms(); int[] frequencies = termFreqVector.getTermFrequencies(); for (int i = 0; i < terms.length; i++) { String term = terms[i]; String[] tokenArray = strTokenizer.reset(term).getTokenArray(); if (tokenArray.length != 3) { continue; } String pos = tokenArray[1]; String lemma = tokenArray[2]; if (lemma.equals("<unknown>")) { lemma = tokenArray[0]; } if (frequencyBy.equals("forma")) { term = tokenArray[0]; } else if (frequencyBy.equals("lemma")) { term = lemma; } else if (frequencyBy.equals("PoS-lemma")) { if (pos.startsWith("VER")) { pos = "VER"; } term = pos + " / " + lemma; } else if (frequencyBy.equals("PoS-forma")) { if (pos.startsWith("VER")) { pos = "VER"; } term = pos + " / " + tokenArray[0]; } else { term = tokenArray[1]; } Integer count = fl.get(term); if (count == null) { fl.put(term, frequencies[i]); } else { fl.put(term, frequencies[i] + count); } } if (docsProcessed % 1000 == 0) { System.out.println("Frequency list calculation. Docs processed: " + docsProcessed + " on total: " + totalHits + " (" + docsProcessed * 100.0f / totalHits + "%)"); } } } } return fl; }
From source file:it.drwolf.ridire.index.cwb.scripts.VRTFilesBuilder.java
public void createVRTFile(String posFileName, StrTokenizer strTokenizer, CrawledResource cr, File destDir) { File posFile = new File(posFileName); if (posFile.exists() && posFile.canRead()) { try {/*from www. ja va2 s .com*/ List<String> posFileLines = FileUtils.readLines(posFile); if (this.haveStrangeChars(posFileLines)) { this.log.warn("File with strange chars {0}", posFileName); return; } List<String> newLines = new ArrayList<String>(); for (String l : posFileLines) { strTokenizer.reset(l); String[] tokens = strTokenizer.getTokenArray(); if (tokens.length != 3) { System.err.println("File: " + posFileName + " Stringa malformed: " + l); continue; } String nl = tokens[0] + "\t"; nl += tokens[1].replaceAll(":", "") + "\t"; nl += this.getEasyPos(tokens[1]).replaceAll(":", "") + "\t"; nl += tokens[2]; newLines.add(nl); } String functionalMetadatum = cr.getFunctionalMetadatum() != null ? cr.getFunctionalMetadatum().getDescription() : ""; String semanticMetadatum = cr.getSemanticMetadatum() != null ? cr.getSemanticMetadatum().getDescription() : ""; String url = cr.getUrl(); if (url == null) { url = ""; } String header = this.getHeaderFromResource(cr.getJob().getName(), functionalMetadatum, semanticMetadatum, url, posFile); newLines.add(0, header); newLines.add("</text>"); File vrtFile = new File(destDir, cr.getDigest() + ".vrt"); FileUtils.writeLines(vrtFile, newLines); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } else { System.err.println("Warning - File " + posFileName + " doesn't exist."); } }
From source file:it.drwolf.ridire.index.cwb.CWBPatternSearcher.java
/**
 * Renders a context string for display, keeping only the components selected in
 * {@code getToBeVisualized()}.
 *
 * <p>{@code term} is split on whitespace into words, and each word is split on
 * {@code CWBPatternSearcher.SEPARATOR} into up to three components. When the selection contains
 * {@code CWBPatternSearcher.TUTTO} the first three components are all kept; otherwise the
 * first, second and third component are kept only when the selection contains
 * {@code CWBPatternSearcher.FORMA}, {@code "PoS"} and {@code "Lemma"} respectively.
 *
 * @param term        whitespace-separated words to render
 * @param leftContext {@code true} joins the components of a word with {@code "_/_"},
 *                    {@code false} with {@code " / "}
 * @return the rendered words joined by single spaces
 */
private String getTermVisualization(String term, boolean leftContext) {
    String componentJoiner = leftContext ? "_/_" : " / ";
    List<String> words = new StrTokenizer().reset(term).getTokenList();
    List<String> renderedWords = new ArrayList<String>();
    for (String word : words) {
        String[] components = new StrTokenizer(word, CWBPatternSearcher.SEPARATOR).getTokenArray();
        List<String> kept = new ArrayList<String>();
        if (this.getToBeVisualized().contains(CWBPatternSearcher.TUTTO)) {
            // Keep at most the first three components.
            for (int i = 0; i < components.length && i < 3; i++) {
                kept.add(components[i]);
            }
        } else {
            if (this.getToBeVisualized().contains(CWBPatternSearcher.FORMA) && components.length > 0) {
                kept.add(components[0]);
            }
            if (this.getToBeVisualized().contains("PoS") && components.length > 1) {
                kept.add(components[1]);
            }
            if (this.getToBeVisualized().contains("Lemma") && components.length > 2) {
                kept.add(components[2]);
            }
        }
        renderedWords.add(StringUtils.join(kept, componentJoiner));
    }
    return StringUtils.join(renderedWords, " ");
}
From source file:it.drwolf.ridire.session.JobManager.java
/**
 * Loads the PoS-tagged text of a crawled resource and stores it through {@code setPosText}.
 *
 * <p>The file {@code <digest>.txt.pos} is looked up in the {@code JobManager.RESOURCESDIR}
 * directory next to the resource's ARC file (any {@code __<digits>} sequence is removed from
 * the ARC path first). Each line is split with a TSV tokenizer; lines yielding exactly three
 * tokens become a {@code PoSLine} (form, PoS tag, lemma, each trimmed), other lines are skipped.
 * When the file cannot be read the failure is reported on standard error and an empty list is
 * stored.
 *
 * @param cr resource whose PoS text is loaded
 */
public void retrievePoSText(CrawledResource cr) {
    File resourceDir = new File(
            FilenameUtils.getFullPath(cr.getArcFile().replaceAll("__\\d+", "")) + JobManager.RESOURCESDIR);
    File posTextFile = new File(resourceDir, cr.getDigest() + ".txt.pos");
    List<PoSLine> posLines = new ArrayList<PoSLine>();
    try {
        List<String> lines = FileUtils.readLines(posTextFile);
        StrTokenizer tokenizer = StrTokenizer.getTSVInstance();
        for (String line : lines) {
            String[] tokens = tokenizer.reset(line).getTokenArray();
            if (tokens.length != 3) {
                continue;
            }
            PoSLine poSLine = new PoSLine();
            poSLine.setForm(tokens[0].trim());
            poSLine.setPosTag(tokens[1].trim());
            poSLine.setLemma(tokens[2].trim());
            posLines.add(poSLine);
        }
    } catch (IOException e) {
        // Still best effort (an empty list is stored), but the failure is no longer hidden.
        System.err.println("Unable to read PoS file " + posTextFile.getAbsolutePath() + ": " + e.getMessage());
    }
    this.setPosText(posLines);
}