List of usage examples for the no-argument constructor of org.apache.commons.lang.text.StrTokenizer
public StrTokenizer()
From source file:it.drwolf.ridire.util.async.FrequencyListGenerator.java
private Map<String, Integer> getBareTable(List<String> corporaNames, String functionalMetadatumDescription, String semanticMetadatumDescription, String frequencyBy) throws IOException { Map<String, Integer> fl = new HashMap<String, Integer>(); Query q = new BooleanQuery(); if (corporaNames != null && corporaNames.size() > 0 && !(corporaNames.size() == 1 && corporaNames.get(0) == null)) { BooleanQuery corporaQuery = new BooleanQuery(); for (String cn : corporaNames) { if (cn != null) { corporaQuery.add(new TermQuery(new Term("corpus", cn)), Occur.SHOULD); }//from w w w.j ava 2s . c o m } ((BooleanQuery) q).add(corporaQuery, Occur.MUST); } if (functionalMetadatumDescription != null) { TermQuery funcQuery = new TermQuery(new Term("functionalMetadatum", functionalMetadatumDescription)); ((BooleanQuery) q).add(funcQuery, Occur.MUST); } if (semanticMetadatumDescription != null) { TermQuery semaQuery = new TermQuery(new Term("semanticMetadatum", semanticMetadatumDescription)); ((BooleanQuery) q).add(semaQuery, Occur.MUST); } PrefixQuery prefixQuery = new PrefixQuery(new Term("performaFL", "")); ((BooleanQuery) q).add(prefixQuery, Occur.MUST); IndexSearcher indexSearcher = this.contextsIndexManager.getIndexSearcherR(); System.out.println("Starting FL calculation"); TotalHitCountCollector totalHitCountCollector = new TotalHitCountCollector(); indexSearcher.search(q, null, totalHitCountCollector); int totalHits = totalHitCountCollector.getTotalHits(); System.out.println("Frequency list calculation. 
Docs to be processed: " + totalHits); ScoreDoc after = null; int docsProcessed = 0; for (int j = 0; j < totalHits; j += FrequencyListGenerator.BATCH_SIZE) { TopDocs topDocs = null; if (after == null) { topDocs = indexSearcher.search(q, FrequencyListGenerator.BATCH_SIZE); } else { topDocs = indexSearcher.searchAfter(after, q, FrequencyListGenerator.BATCH_SIZE); } StrTokenizer strTokenizer = new StrTokenizer(); strTokenizer.setDelimiterString(ContextAnalyzer.SEPARATOR); ScoreDoc[] scoreDocs = topDocs.scoreDocs; if (scoreDocs != null) { for (ScoreDoc scoreDoc : scoreDocs) { ++docsProcessed; after = scoreDoc; TermFreqVector termFreqVector = indexSearcher.getIndexReader().getTermFreqVector(scoreDoc.doc, "performaFL"); if (termFreqVector == null) { continue; } String[] terms = termFreqVector.getTerms(); int[] frequencies = termFreqVector.getTermFrequencies(); for (int i = 0; i < terms.length; i++) { String term = terms[i]; String[] tokenArray = strTokenizer.reset(term).getTokenArray(); if (tokenArray.length != 3) { continue; } String pos = tokenArray[1]; String lemma = tokenArray[2]; if (lemma.equals("<unknown>")) { lemma = tokenArray[0]; } if (frequencyBy.equals("forma")) { term = tokenArray[0]; } else if (frequencyBy.equals("lemma")) { term = lemma; } else if (frequencyBy.equals("PoS-lemma")) { if (pos.startsWith("VER")) { pos = "VER"; } term = pos + " / " + lemma; } else if (frequencyBy.equals("PoS-forma")) { if (pos.startsWith("VER")) { pos = "VER"; } term = pos + " / " + tokenArray[0]; } else { term = tokenArray[1]; } Integer count = fl.get(term); if (count == null) { fl.put(term, frequencies[i]); } else { fl.put(term, frequencies[i] + count); } } if (docsProcessed % 1000 == 0) { System.out.println("Frequency list calculation. Docs processed: " + docsProcessed + " on total: " + totalHits + " (" + docsProcessed * 100.0f / totalHits + "%)"); } } } } return fl; }
From source file:it.drwolf.ridire.index.cwb.CWBFrequencyList.java
private String getFrequencyList(boolean deleteFLFile, List<String> semDescription, List<String> funDescription, int quantityP, String type, Integer threshold, boolean sorted) { CommandLine commandLine = CommandLine.parse(this.cwbscanExecutable); commandLine.addArgument("-q"); if (threshold != null && threshold > 0) { commandLine.addArgument("-f"); commandLine.addArgument(threshold + ""); }/* w ww .j a v a2 s .c o m*/ commandLine.addArgument("-r").addArgument(this.cqpRegistry); commandLine.addArgument("-C"); commandLine.addArgument(this.cqpCorpusName); if (type.equals("forma")) { commandLine.addArgument("word+0"); } else if (type.equals("PoS")) { commandLine.addArgument("pos+0"); } else if (type.equals("easypos")) { commandLine.addArgument("easypos+0"); } else if (type.equals("lemma")) { commandLine.addArgument("lemma+0"); } else if (type.equals("PoS-forma")) { commandLine.addArgument("pos+0"); commandLine.addArgument("word+0"); } else if (type.equals("PoS-lemma")) { commandLine.addArgument("pos+0"); commandLine.addArgument("lemma+0"); } String semFuncParam = ""; if (funDescription != null && funDescription.size() > 0 && funDescription.get(0) != null && funDescription.get(0).trim().length() > 0 || semDescription != null && semDescription.size() > 0 && semDescription.get(0) != null && semDescription.get(0).trim().length() > 0) { semFuncParam = "?"; if (funDescription != null && funDescription.size() > 0 && funDescription.get(0) != null && funDescription.get(0).trim().length() > 0) { String fd = StringUtils.join(funDescription, "\\|"); semFuncParam += "text_functional=/\\(" + fd + "\\)/ "; } if (semDescription != null && semDescription.size() > 0 && semDescription.get(0) != null && semDescription.get(0).trim().length() > 0) { String sd = StringUtils.join(semDescription, "\\|"); semFuncParam += "text_semantic=/\\(" + sd + "\\)/ "; } commandLine.addArgument(semFuncParam); } if (sorted) { commandLine.addArgument("|"); commandLine.addArgument("sort"); 
commandLine.addArgument("-nr"); commandLine.addArgument("-k"); commandLine.addArgument("1"); } if (quantityP > 0) { commandLine.addArgument("|"); commandLine.addArgument("head"); commandLine.addArgument("-" + quantityP); } File flTempFile = null; try { flTempFile = File.createTempFile("ridireFL", null); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } commandLine.addArgument(" > "); commandLine.addArgument(flTempFile.getAbsolutePath()); String c = commandLine.toString(); try { File tempSh = File.createTempFile("ridireSH", ".sh"); FileUtils.writeStringToFile(tempSh, c); tempSh.setExecutable(true); commandLine = CommandLine.parse(tempSh.getAbsolutePath()); DefaultExecutor executor = new DefaultExecutor(); executor.setExitValue(0); ExecuteWatchdog watchdog = new ExecuteWatchdog(CWBFrequencyList.TIMEOUT); executor.setWatchdog(watchdog); ByteArrayOutputStream baosStdOut = new ByteArrayOutputStream(1024); ByteArrayOutputStream baosStdErr = new ByteArrayOutputStream(1024); ExecuteStreamHandler executeStreamHandler = new PumpStreamHandler(baosStdOut, baosStdErr, null); executor.setStreamHandler(executeStreamHandler); int exitValue = 0; exitValue = executor.execute(commandLine); FileUtils.deleteQuietly(tempSh); if (exitValue == 0) { StrTokenizer strTokenizer = new StrTokenizer(); this.frequencyList = new ArrayList<FrequencyItem>(); List<String> lines = FileUtils.readLines(flTempFile); for (String line : lines) { strTokenizer.reset(line); String[] tokens = strTokenizer.getTokenArray(); if (tokens.length == 2) { FrequencyItem frequencyItem = new FrequencyItem(tokens[1], Integer.parseInt(tokens[0].trim())); this.frequencyList.add(frequencyItem); } else if (tokens.length == 3) { FrequencyItem frequencyItem = new FrequencyItem(tokens[2], tokens[1], Integer.parseInt(tokens[0].trim())); this.frequencyList.add(frequencyItem); } } if (deleteFLFile) { FileUtils.deleteQuietly(flTempFile); } } } catch (ExecuteException e) { // TODO Auto-generated 
catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return flTempFile.getAbsolutePath(); }
From source file:it.drwolf.ridire.index.sketch.AsyncSketchCreator.java
private HashMap<String, SketchResult> extractSingleLemmaSketches(String lemma, String functionalMetadatum, String semanticMetadatum, Sketch s, IndexWriter indexWriter) { HashMap<String, SketchResult> sr = new HashMap<String, SketchResult>(); String freqTable = "freq_lemma_all"; if (functionalMetadatum != null) { freqTable = "freq_lemma_" + functionalMetadatum.trim().replaceAll("\\s", "_"); }/*w w w .ja va 2 s .c o m*/ if (semanticMetadatum != null) { freqTable = "freq_lemma_" + semanticMetadatum.trim().replaceAll("\\s", "_"); } List<Number> firstFreqList = this.entityManager .createNativeQuery("select freq from " + freqTable + " where item=:item") .setParameter("item", lemma).getResultList(); if (firstFreqList != null && firstFreqList.size() > 0 && firstFreqList.get(0).longValue() > 0) { long firstFreq = firstFreqList.get(0).longValue(); StrTokenizer strTokenizer = new StrTokenizer(); try { List<File> tableFiles = new ArrayList<File>(); String queryString = null; String stringToAdd = null; String realQuery = ""; for (GramRel gramRel : s.getGramrels()) { File resTblFile = File.createTempFile("ridireTBL", ".tbl"); tableFiles.add(resTblFile); String rel = gramRel.getRel(); realQuery = String.format(rel, lemma); String subquery = gramRel.getSubquery(); if (subquery != null) { realQuery += ";\nASUB;\n" + String.format(subquery, lemma); } queryString = this.createQueryForCQP(resTblFile, stringToAdd, functionalMetadatum, semanticMetadatum, realQuery, s.isTrinary()); File queryFile = File.createTempFile("ridireQ", ".query"); FileUtils.writeStringToFile(queryFile, queryString); long start = System.currentTimeMillis(); this.executeCQPQuery(queryFile, gramRel.isInverse()); System.out.println( "CQP exec time for " + realQuery.replaceAll("\n", " ") + " " + functionalMetadatum + " " + semanticMetadatum + " : " + (System.currentTimeMillis() - start)); if (!resTblFile.exists() || !resTblFile.canRead()) { continue; } FileUtils.deleteQuietly(queryFile); } List<String> lines = null; 
if (!s.isTrinary()) { File resTblFile = File.createTempFile("ridireTBLFINAL", ".tbl"); this.compactLines(tableFiles, resTblFile); lines = FileUtils.readLines(resTblFile); FileUtils.deleteQuietly(resTblFile); } else if (tableFiles.size() > 0) { lines = FileUtils.readLines(tableFiles.get(0)); } for (File tableFile : tableFiles) { FileUtils.deleteQuietly(tableFile); } if (s.isTrinary()) { strTokenizer.setDelimiterString("@@##"); this.processTrinaryTable(freqTable, firstFreq, strTokenizer, lines, lemma, indexWriter, s.getName(), functionalMetadatum, semanticMetadatum, s.getGoodFor()); } else { strTokenizer.setDelimiterString(" "); this.processNotTrinaryTable(sr, freqTable, firstFreq, strTokenizer, lines); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } return sr; }
From source file:it.drwolf.ridire.index.cwb.CWBPatternSearcher.java
/**
 * Renders a context string for display, keeping only the parts selected by
 * {@code getToBeVisualized()}.
 *
 * The input is split into items with a default {@code StrTokenizer}; each
 * item is split again on {@code CWBPatternSearcher.SEPARATOR} into up to
 * three parts, selected as forma (index 0), "PoS" (index 1) and "Lemma"
 * (index 2). When {@code CWBPatternSearcher.TUTTO} is selected, the first
 * three parts are all kept.
 *
 * @param term
 *            the context string to render
 * @param leftContext
 *            true to join the parts of an item with "_/_", false to join them
 *            with " / "
 * @return the rendered items joined by a single space
 */
private String getTermVisualization(String term, boolean leftContext) {
    // The joiner depends only on the argument, so it is chosen once.
    String partJoiner = leftContext ? "_/_" : " / ";
    StrTokenizer s1 = new StrTokenizer();
    List<String> t1 = s1.reset(term).getTokenList();
    List<String> total = new ArrayList<String>();
    for (String t : t1) {
        List<String> ret = new ArrayList<String>();
        String[] tokens = new StrTokenizer(t, CWBPatternSearcher.SEPARATOR).getTokenArray();
        if (this.getToBeVisualized().contains(CWBPatternSearcher.TUTTO)) {
            // Keep whatever is available of the first three parts.
            for (int i = 0; i < tokens.length && i < 3; i++) {
                ret.add(tokens[i]);
            }
        } else {
            if (this.getToBeVisualized().contains(CWBPatternSearcher.FORMA) && tokens.length > 0) {
                ret.add(tokens[0]);
            }
            if (this.getToBeVisualized().contains("PoS") && tokens.length > 1) {
                ret.add(tokens[1]);
            }
            if (this.getToBeVisualized().contains("Lemma") && tokens.length > 2) {
                ret.add(tokens[2]);
            }
        }
        total.add(StringUtils.join(ret, partJoiner));
    }
    return StringUtils.join(total, " ");
}
From source file:org.eclipse.skalli.services.permit.Permit.java
/**
 * Breaks a path into its segments.
 *
 * The path is tokenized on the '/' character, with
 * {@code StrMatcher.trimMatcher()} installed as the trimmer.
 *
 * @param path
 *            the path to break up
 * @return the tokens produced by the tokenizer, in order
 */
private static String[] split(String path) {
    return new StrTokenizer()
            .setDelimiterChar('/')
            .setTrimmerMatcher(StrMatcher.trimMatcher())
            .reset(path)
            .getTokenArray();
}
From source file:org.gbif.occurrence.util.CSVReader.java
public CSVReader(File source, String encoding, String delimiter, Character quotes, Integer headerRows) throws IOException { this.delimiter = delimiter; this.encoding = encoding; this.quoteChar = quotes; if (headerRows == null || headerRows < 0) { this.headerRows = 0; } else {/* www. ja v a2 s .co m*/ this.headerRows = headerRows; } tokenizer = new StrTokenizer(); tokenizer.setDelimiterString(delimiter); if (quotes != null) { tokenizer.setQuoteChar(quotes); } tokenizer.setIgnoreEmptyTokens(false); tokenizer.reset(); InputStream fis = null; if (source.getName().endsWith(".gz")) { fis = new GZIPInputStream(new FileInputStream(source)); } else { fis = new FileInputStream(source); } InputStreamReader reader = new InputStreamReader(fis, encoding); br = new BufferedReader(reader); row = br.readLine(); // parse header row if (row == null) { header = null; } else { tokenizer.reset(row); header = tokenizer.getTokenArray(); } // skip initial header rows? while (headerRows > 0) { headerRows--; row = br.readLine(); } }