Example usage for the org.apache.commons.lang.text.StrTokenizer constructor StrTokenizer()

List of usage examples for the org.apache.commons.lang.text.StrTokenizer constructor StrTokenizer()

Introduction

On this page you can find example usages of the no-argument constructor StrTokenizer() of org.apache.commons.lang.text.StrTokenizer.

Prototype

public StrTokenizer() 

Source Link

Document

Constructs a tokenizer splitting on space, tab, newline and formfeed as per StringTokenizer, but with no text to tokenize.

Usage

From source file:it.drwolf.ridire.util.async.FrequencyListGenerator.java

/**
 * Builds a raw frequency table from the "performaFL" term vectors of all indexed documents that match
 * the given filters.
 *
 * Matching documents are fetched in pages of {@code FrequencyListGenerator.BATCH_SIZE}. Every term of a
 * document's "performaFL" term vector is split on {@code ContextAnalyzer.SEPARATOR}; terms that do not
 * split into exactly three tokens are ignored. The frequency of each remaining term is accumulated
 * under the key chosen by {@code frequencyBy}.
 *
 * @param corporaNames                   corpus names to restrict to (OR-ed together); {@code null}, an
 *                                       empty list, or a list holding a single {@code null} means no
 *                                       corpus restriction
 * @param functionalMetadatumDescription required value of "functionalMetadatum", or {@code null} for none
 * @param semanticMetadatumDescription   required value of "semanticMetadatum", or {@code null} for none
 * @param frequencyBy                    one of "forma", "lemma", "PoS-lemma", "PoS-forma"; any other
 *                                       value (including {@code null}) counts by the second token
 * @return map from key to accumulated frequency; never {@code null}
 * @throws IOException if the index cannot be searched or read
 */
private Map<String, Integer> getBareTable(List<String> corporaNames, String functionalMetadatumDescription,
        String semanticMetadatumDescription, String frequencyBy) throws IOException {
    Map<String, Integer> fl = new HashMap<String, Integer>();
    // Declared with its concrete type so the clauses below can be added without casts.
    BooleanQuery q = new BooleanQuery();
    if (corporaNames != null && corporaNames.size() > 0
            && !(corporaNames.size() == 1 && corporaNames.get(0) == null)) {
        BooleanQuery corporaQuery = new BooleanQuery();
        for (String cn : corporaNames) {
            if (cn != null) {
                corporaQuery.add(new TermQuery(new Term("corpus", cn)), Occur.SHOULD);
            }
        }
        q.add(corporaQuery, Occur.MUST);
    }
    if (functionalMetadatumDescription != null) {
        q.add(new TermQuery(new Term("functionalMetadatum", functionalMetadatumDescription)), Occur.MUST);
    }
    if (semanticMetadatumDescription != null) {
        q.add(new TermQuery(new Term("semanticMetadatum", semanticMetadatumDescription)), Occur.MUST);
    }
    // Empty prefix on "performaFL": presumably selects every document that has a term in that field.
    q.add(new PrefixQuery(new Term("performaFL", "")), Occur.MUST);

    IndexSearcher indexSearcher = this.contextsIndexManager.getIndexSearcherR();
    System.out.println("Starting FL calculation");
    TotalHitCountCollector totalHitCountCollector = new TotalHitCountCollector();
    indexSearcher.search(q, null, totalHitCountCollector);
    int totalHits = totalHitCountCollector.getTotalHits();
    System.out.println("Frequency list calculation. Docs to be processed: " + totalHits);

    // One tokenizer is enough: it is reset for every term, so there is no need to rebuild it per batch.
    StrTokenizer strTokenizer = new StrTokenizer();
    strTokenizer.setDelimiterString(ContextAnalyzer.SEPARATOR);

    ScoreDoc after = null;
    int docsProcessed = 0;
    for (int j = 0; j < totalHits; j += FrequencyListGenerator.BATCH_SIZE) {
        TopDocs topDocs;
        if (after == null) {
            topDocs = indexSearcher.search(q, FrequencyListGenerator.BATCH_SIZE);
        } else {
            topDocs = indexSearcher.searchAfter(after, q, FrequencyListGenerator.BATCH_SIZE);
        }
        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        if (scoreDocs == null || scoreDocs.length == 0) {
            // Nothing came back, so "after" cannot advance and every further page would be empty too.
            break;
        }
        for (ScoreDoc scoreDoc : scoreDocs) {
            ++docsProcessed;
            after = scoreDoc;
            TermFreqVector termFreqVector = indexSearcher.getIndexReader().getTermFreqVector(scoreDoc.doc,
                    "performaFL");
            if (termFreqVector != null) {
                String[] terms = termFreqVector.getTerms();
                int[] frequencies = termFreqVector.getTermFrequencies();
                for (int i = 0; i < terms.length; i++) {
                    String[] tokenArray = strTokenizer.reset(terms[i]).getTokenArray();
                    if (tokenArray.length != 3) {
                        continue;
                    }
                    String key = toFrequencyKey(tokenArray, frequencyBy);
                    Integer count = fl.get(key);
                    fl.put(key, count == null ? frequencies[i] : frequencies[i] + count);
                }
            }
            if (docsProcessed % 1000 == 0) {
                System.out.println("Frequency list calculation. Docs processed: " + docsProcessed
                        + " on total: " + totalHits + " (" + docsProcessed * 100.0f / totalHits + "%)");
            }
        }
    }
    return fl;
}

/**
 * Chooses the frequency-table key for one three-token term (used as form, PoS, lemma in that order).
 * A lemma of "&lt;unknown&gt;" is replaced by the form; for the "PoS-..." modes every PoS starting
 * with "VER" is collapsed to "VER". Comparisons are constant-first so a {@code null} mode falls
 * through to the default (the raw second token) instead of throwing.
 */
private static String toFrequencyKey(String[] tokenArray, String frequencyBy) {
    String forma = tokenArray[0];
    String pos = tokenArray[1];
    String lemma = tokenArray[2];
    if (lemma.equals("<unknown>")) {
        lemma = forma;
    }
    if ("forma".equals(frequencyBy)) {
        return forma;
    }
    if ("lemma".equals(frequencyBy)) {
        return lemma;
    }
    boolean byPosLemma = "PoS-lemma".equals(frequencyBy);
    if (byPosLemma || "PoS-forma".equals(frequencyBy)) {
        String coarsePos = pos.startsWith("VER") ? "VER" : pos;
        return coarsePos + " / " + (byPosLemma ? lemma : forma);
    }
    return pos;
}

From source file:it.drwolf.ridire.index.cwb.CWBFrequencyList.java

/**
 * Runs the external executable configured in {@code cwbscanExecutable} through a generated shell
 * script, redirects its output to a temporary file and loads that file into {@code this.frequencyList}.
 *
 * Each output line is split on whitespace. Two tokens are read as (count, item); three tokens are read
 * as (count, second, third) and passed to the three-argument {@code FrequencyItem} constructor. Lines
 * with any other token count are ignored.
 *
 * @param deleteFLFile   whether to delete the result file after it has been loaded successfully
 * @param semDescription semantic filter values; used only if the first entry is non-blank
 * @param funDescription functional filter values; used only if the first entry is non-blank
 * @param quantityP      if positive, the output is piped through {@code head -quantityP}
 * @param type           attribute selection: "forma", "PoS", "easypos", "lemma", "PoS-forma" or
 *                       "PoS-lemma"; other values add no attribute argument
 * @param threshold      if non-null and positive, passed after {@code -f}
 * @param sorted         if true, the output is piped through {@code sort -nr -k 1}
 * @return absolute path of the result file (which may already be deleted when {@code deleteFLFile} is set)
 * @throws IllegalStateException if the temporary result file cannot be created
 */
private String getFrequencyList(boolean deleteFLFile, List<String> semDescription, List<String> funDescription,
        int quantityP, String type, Integer threshold, boolean sorted) {
    CommandLine commandLine = CommandLine.parse(this.cwbscanExecutable);
    commandLine.addArgument("-q");
    if (threshold != null && threshold > 0) {
        commandLine.addArgument("-f");
        commandLine.addArgument(threshold + "");
    }
    commandLine.addArgument("-r").addArgument(this.cqpRegistry);
    commandLine.addArgument("-C");
    commandLine.addArgument(this.cqpCorpusName);
    if (type.equals("forma")) {
        commandLine.addArgument("word+0");
    } else if (type.equals("PoS")) {
        commandLine.addArgument("pos+0");
    } else if (type.equals("easypos")) {
        commandLine.addArgument("easypos+0");
    } else if (type.equals("lemma")) {
        commandLine.addArgument("lemma+0");
    } else if (type.equals("PoS-forma")) {
        commandLine.addArgument("pos+0");
        commandLine.addArgument("word+0");
    } else if (type.equals("PoS-lemma")) {
        commandLine.addArgument("pos+0");
        commandLine.addArgument("lemma+0");
    }

    boolean filterByFunctional = hasNonBlankFirstEntry(funDescription);
    boolean filterBySemantic = hasNonBlankFirstEntry(semDescription);
    if (filterByFunctional || filterBySemantic) {
        String semFuncParam = "?";
        if (filterByFunctional) {
            String fd = StringUtils.join(funDescription, "\\|");
            semFuncParam += "text_functional=/\\(" + fd + "\\)/ ";
        }
        if (filterBySemantic) {
            String sd = StringUtils.join(semDescription, "\\|");
            semFuncParam += "text_semantic=/\\(" + sd + "\\)/ ";
        }
        commandLine.addArgument(semFuncParam);
    }
    if (sorted) {
        commandLine.addArgument("|");
        commandLine.addArgument("sort");
        commandLine.addArgument("-nr");
        commandLine.addArgument("-k");
        commandLine.addArgument("1");
    }
    if (quantityP > 0) {
        commandLine.addArgument("|");
        commandLine.addArgument("head");
        commandLine.addArgument("-" + quantityP);
    }

    File flTempFile;
    try {
        flTempFile = File.createTempFile("ridireFL", null);
    } catch (IOException e) {
        // Without a result file nothing below can work; fail with the real cause instead of a later
        // NullPointerException on flTempFile.
        throw new IllegalStateException("Cannot create temporary frequency list file", e);
    }
    commandLine.addArgument(" > ");
    commandLine.addArgument(flTempFile.getAbsolutePath());

    // The command contains pipes and a redirection, so it is written to a script and run through it.
    String c = commandLine.toString();
    File tempSh = null;
    try {
        tempSh = File.createTempFile("ridireSH", ".sh");
        FileUtils.writeStringToFile(tempSh, c);
        tempSh.setExecutable(true);
        DefaultExecutor executor = new DefaultExecutor();
        executor.setExitValue(0);
        executor.setWatchdog(new ExecuteWatchdog(CWBFrequencyList.TIMEOUT));
        ByteArrayOutputStream baosStdOut = new ByteArrayOutputStream(1024);
        ByteArrayOutputStream baosStdErr = new ByteArrayOutputStream(1024);
        ExecuteStreamHandler executeStreamHandler = new PumpStreamHandler(baosStdOut, baosStdErr, null);
        executor.setStreamHandler(executeStreamHandler);
        int exitValue = executor.execute(CommandLine.parse(tempSh.getAbsolutePath()));
        if (exitValue == 0) {
            StrTokenizer strTokenizer = new StrTokenizer();
            this.frequencyList = new ArrayList<FrequencyItem>();
            for (String line : FileUtils.readLines(flTempFile)) {
                String[] tokens = strTokenizer.reset(line).getTokenArray();
                if (tokens.length == 2) {
                    this.frequencyList.add(new FrequencyItem(tokens[1], Integer.parseInt(tokens[0].trim())));
                } else if (tokens.length == 3) {
                    this.frequencyList
                            .add(new FrequencyItem(tokens[2], tokens[1], Integer.parseInt(tokens[0].trim())));
                }
            }
            if (deleteFLFile) {
                FileUtils.deleteQuietly(flTempFile);
            }
        }
    } catch (IOException e) {
        // Best effort, as before: report the failure and still return the result file path.
        // ExecuteException is an IOException, so this single handler covers both.
        e.printStackTrace();
    } finally {
        // Remove the generated script on failure as well; deleteQuietly accepts null.
        FileUtils.deleteQuietly(tempSh);
    }
    return flTempFile.getAbsolutePath();
}

/** Tells whether the list is non-null, non-empty and its first entry contains non-whitespace text. */
private static boolean hasNonBlankFirstEntry(List<String> values) {
    return values != null && values.size() > 0 && values.get(0) != null
            && values.get(0).trim().length() > 0;
}

From source file:it.drwolf.ridire.index.sketch.AsyncSketchCreator.java

/**
 * Computes the sketch results for one lemma by running one CQP query per grammatical relation of the
 * given sketch and processing the resulting table files.
 *
 * The lemma frequency is first looked up in a frequency table: "freq_lemma_all" by default, or
 * "freq_lemma_" plus the metadatum (trimmed, whitespace replaced by "_"). If both metadata are given,
 * the semantic one decides the table. Nothing is done when the lemma has no positive frequency.
 *
 * For trinary sketches only the first table file is read and handed to {@code processTrinaryTable};
 * otherwise all table files are merged by {@code compactLines} and handed to
 * {@code processNotTrinaryTable}, which receives the returned map.
 *
 * @param lemma               lemma to build the sketch for
 * @param functionalMetadatum optional functional restriction, may be {@code null}
 * @param semanticMetadatum   optional semantic restriction, may be {@code null}
 * @param s                   sketch definition supplying the grammatical relations
 * @param indexWriter         passed through to {@code processTrinaryTable}
 * @return the map passed to {@code processNotTrinaryTable}; empty for trinary sketches, for lemmas
 *         without frequency, or after an I/O error
 */
private HashMap<String, SketchResult> extractSingleLemmaSketches(String lemma, String functionalMetadatum,
        String semanticMetadatum, Sketch s, IndexWriter indexWriter) {
    HashMap<String, SketchResult> sr = new HashMap<String, SketchResult>();
    String freqTable = "freq_lemma_all";
    if (functionalMetadatum != null) {
        freqTable = "freq_lemma_" + functionalMetadatum.trim().replaceAll("\\s", "_");
    }
    if (semanticMetadatum != null) {
        freqTable = "freq_lemma_" + semanticMetadatum.trim().replaceAll("\\s", "_");
    }
    // NOTE(review): the table name is concatenated into the SQL text (table names cannot be bound as
    // parameters); confirm that the metadata values come from a trusted, closed vocabulary.
    List<Number> firstFreqList = this.entityManager
            .createNativeQuery("select freq from " + freqTable + " where item=:item")
            .setParameter("item", lemma).getResultList();
    if (firstFreqList == null || firstFreqList.size() == 0 || firstFreqList.get(0).longValue() <= 0) {
        return sr;
    }
    long firstFreq = firstFreqList.get(0).longValue();
    StrTokenizer strTokenizer = new StrTokenizer();
    try {
        List<File> tableFiles = new ArrayList<File>();
        String stringToAdd = null;
        for (GramRel gramRel : s.getGramrels()) {
            File resTblFile = File.createTempFile("ridireTBL", ".tbl");
            tableFiles.add(resTblFile);
            String realQuery = String.format(gramRel.getRel(), lemma);
            String subquery = gramRel.getSubquery();
            if (subquery != null) {
                realQuery += ";\nASUB;\n" + String.format(subquery, lemma);
            }
            String queryString = this.createQueryForCQP(resTblFile, stringToAdd, functionalMetadatum,
                    semanticMetadatum, realQuery, s.isTrinary());
            File queryFile = File.createTempFile("ridireQ", ".query");
            try {
                FileUtils.writeStringToFile(queryFile, queryString);
                long start = System.currentTimeMillis();
                this.executeCQPQuery(queryFile, gramRel.isInverse());
                System.out.println(
                        "CQP exec time for " + realQuery.replaceAll("\n", " ") + " " + functionalMetadatum + " "
                                + semanticMetadatum + " : " + (System.currentTimeMillis() - start));
            } finally {
                // The query file is only needed while CQP runs. Delete it even when the query failed
                // or produced no readable table, so temporary files do not pile up.
                FileUtils.deleteQuietly(queryFile);
            }
        }
        List<String> lines = null;
        if (!s.isTrinary()) {
            File resTblFile = File.createTempFile("ridireTBLFINAL", ".tbl");
            this.compactLines(tableFiles, resTblFile);
            lines = FileUtils.readLines(resTblFile);
            FileUtils.deleteQuietly(resTblFile);
        } else if (tableFiles.size() > 0) {
            lines = FileUtils.readLines(tableFiles.get(0));
        }
        for (File tableFile : tableFiles) {
            FileUtils.deleteQuietly(tableFile);
        }
        // NOTE(review): lines stays null for a trinary sketch without grammatical relations; verify
        // that processTrinaryTable tolerates that.
        if (s.isTrinary()) {
            strTokenizer.setDelimiterString("@@##");
            this.processTrinaryTable(freqTable, firstFreq, strTokenizer, lines, lemma, indexWriter,
                    s.getName(), functionalMetadatum, semanticMetadatum, s.getGoodFor());
        } else {
            strTokenizer.setDelimiterString(" ");
            this.processNotTrinaryTable(sr, freqTable, firstFreq, strTokenizer, lines);
        }
    } catch (IOException e) {
        // Best effort, as before: report the failure and return whatever was collected so far.
        e.printStackTrace();
    }
    return sr;
}

From source file:it.drwolf.ridire.index.cwb.CWBPatternSearcher.java

/**
 * Renders a whitespace-separated sequence of annotated tokens for display.
 *
 * Each token is split on {@code CWBPatternSearcher.SEPARATOR} into up to three fields. When
 * {@code getToBeVisualized()} contains {@code CWBPatternSearcher.TUTTO}, all available fields are
 * shown; otherwise field 0 is shown for {@code CWBPatternSearcher.FORMA}, field 1 for "PoS" and
 * field 2 for "Lemma". The shown fields of one token are joined with "_/_" for left context and
 * " / " otherwise; tokens are joined with a single space.
 *
 * @param term        whitespace-separated annotated tokens
 * @param leftContext whether the text belongs to the left context (selects the "_/_" joiner)
 * @return the display string
 */
private String getTermVisualization(String term, boolean leftContext) {
    String fieldJoiner = leftContext ? "_/_" : " / ";
    List<String> total = new ArrayList<String>();
    for (String t : new StrTokenizer().reset(term).getTokenList()) {
        String[] tokens = new StrTokenizer(t, CWBPatternSearcher.SEPARATOR).getTokenArray();
        List<String> ret = new ArrayList<String>();
        if (this.getToBeVisualized().contains(CWBPatternSearcher.TUTTO)) {
            // Show everything: copy up to the first three fields, in order.
            for (int i = 0; i < tokens.length && i < 3; i++) {
                ret.add(tokens[i]);
            }
        } else {
            if (this.getToBeVisualized().contains(CWBPatternSearcher.FORMA) && tokens.length > 0) {
                ret.add(tokens[0]);
            }
            if (this.getToBeVisualized().contains("PoS") && tokens.length > 1) {
                ret.add(tokens[1]);
            }
            if (this.getToBeVisualized().contains("Lemma") && tokens.length > 2) {
                ret.add(tokens[2]);
            }
        }
        total.add(StringUtils.join(ret, fieldJoiner));
    }
    return StringUtils.join(total, " ");
}

From source file:org.eclipse.skalli.services.permit.Permit.java

/**
 * Splits a path on '/' into its segments, using the tokenizer's trim matcher to strip
 * surrounding whitespace from each segment.
 *
 * @param path the path to split
 * @return the segments in order
 */
private static String[] split(String path) {
    return new StrTokenizer()
            .setDelimiterChar('/')
            .setTrimmerMatcher(StrMatcher.trimMatcher())
            .reset(path)
            .getTokenArray();
}

From source file:org.gbif.occurrence.util.CSVReader.java

/**
 * Opens a delimited text file, transparently decompressing it when the file name ends with ".gz",
 * parses the first line as the header and positions the reader after the header rows.
 *
 * @param source     file to read
 * @param encoding   character encoding of the file
 * @param delimiter  column delimiter string
 * @param quotes     quote character, or {@code null} to leave the tokenizer's quote setting untouched
 * @param headerRows number of header rows to skip; {@code null} or a negative value is treated as 0
 * @throws IOException if the file cannot be opened, decoded or read
 */
public CSVReader(File source, String encoding, String delimiter, Character quotes, Integer headerRows)
        throws IOException {
    this.delimiter = delimiter;
    this.encoding = encoding;
    this.quoteChar = quotes;
    // Normalise once and reuse the normalised value below: looping on the raw parameter would
    // unbox null and throw a NullPointerException.
    int rowsToSkip = (headerRows == null || headerRows < 0) ? 0 : headerRows;
    this.headerRows = rowsToSkip;

    tokenizer = new StrTokenizer();
    tokenizer.setDelimiterString(delimiter);
    if (quotes != null) {
        tokenizer.setQuoteChar(quotes);
    }
    // Keep empty columns so that column positions stay aligned with the header.
    tokenizer.setIgnoreEmptyTokens(false);
    tokenizer.reset();

    InputStream fis = new FileInputStream(source);
    boolean initialised = false;
    try {
        if (source.getName().endsWith(".gz")) {
            fis = new GZIPInputStream(fis);
        }
        br = new BufferedReader(new InputStreamReader(fis, encoding));
        row = br.readLine();
        // parse header row
        if (row == null) {
            header = null;
        } else {
            tokenizer.reset(row);
            header = tokenizer.getTokenArray();
        }
        // skip initial header rows
        while (rowsToSkip > 0) {
            rowsToSkip--;
            row = br.readLine();
        }
        initialised = true;
    } finally {
        if (!initialised) {
            // Construction failed (bad gzip data, unknown encoding, read error): release the file
            // handle, since no caller will ever get an object to close.
            try {
                fis.close();
            } catch (IOException ignored) {
                // the original failure is the one worth reporting
            }
        }
    }
}