Example usage for org.apache.commons.lang.text StrTokenizer reset

List of usage examples for org.apache.commons.lang.text StrTokenizer reset

Introduction

On this page you can find example usages of the reset method of org.apache.commons.lang.text.StrTokenizer.

Prototype

public StrTokenizer reset(char[] input) 

Source Link

Document

Reset this tokenizer, giving it a new input string to parse.

Usage

From source file:com.savy3.util.DBConfiguration.java

/**
 * Converts a String back to connection parameters.
 * @param input String from configuration
 * @return JDBC connection parameters/*ww w . j a  v  a  2 s.  c o  m*/
 */
protected static Properties propertiesFromString(String input) {
    if (input != null && !input.isEmpty()) {
        Properties result = new Properties();
        StrTokenizer propertyTokenizer = StrTokenizer.getCSVInstance(input);
        StrTokenizer valueTokenizer = StrTokenizer.getCSVInstance();
        valueTokenizer.setDelimiterChar('=');
        while (propertyTokenizer.hasNext()) {
            valueTokenizer.reset(propertyTokenizer.nextToken());
            String[] values = valueTokenizer.getTokenArray();
            if (values.length == 2) {
                result.put(values[0], values[1]);
            }
        }
        return result;
    } else {
        return null;
    }
}

From source file:it.drwolf.ridire.session.async.Mapper.java

/**
 * Counts the lines of a PoS-tagged resource file that carry a valid part-of-speech tag.
 *
 * <p>Each line is tokenized with a TSV tokenizer; only lines with exactly three tokens are
 * considered, and the second token (trimmed) is checked with {@code Mapper.isValidPos}.
 *
 * @param posTagResourceFileName path of the file to read
 * @return number of three-token lines whose second token is accepted by {@code isValidPos}
 * @throws IOException if the file cannot be read
 */
@SuppressWarnings("unchecked")
public static Integer countWordsFromPoSTagResource(String posTagResourceFileName) throws IOException {
    List<String> rows = FileUtils.readLines(new File(posTagResourceFileName));
    StrTokenizer tsv = StrTokenizer.getTSVInstance();
    int validWords = 0;
    for (String row : rows) {
        String[] fields = tsv.reset(row).getTokenArray();
        if (fields.length != 3) {
            continue;
        }
        if (Mapper.isValidPos(fields[1].trim())) {
            validWords++;
        }
    }
    return validWords;
}

From source file:it.drwolf.ridire.session.async.WordCounter.java

/**
 * Counts the lines of a PoS-tagged resource file that carry a valid part-of-speech tag.
 *
 * <p>Each line is tokenized with a TSV tokenizer; only lines with exactly three tokens are
 * considered, and the second token (trimmed) is checked with {@code isValidPos}.
 *
 * @param posTagResourceFile file to read
 * @return number of three-token lines whose second token is accepted by {@code isValidPos}
 * @throws IOException if the file cannot be read
 */
public Integer countWordsFromPoSTagResource(File posTagResourceFile) throws IOException {
    List<String> rows = FileUtils.readLines(posTagResourceFile);
    StrTokenizer tsv = StrTokenizer.getTSVInstance();
    int validWords = 0;
    for (String row : rows) {
        String[] fields = tsv.reset(row).getTokenArray();
        if (fields.length != 3) {
            continue;
        }
        if (this.isValidPos(fields[1].trim())) {
            validWords++;
        }
    }
    return validWords;
}

From source file:it.drwolf.ridire.index.cwb.CWBFrequencyList.java

/**
 * Builds a command line for the executable configured in {@code cwbscanExecutable}, runs it
 * through a temporary shell script, and loads the produced output into
 * {@code this.frequencyList}.
 *
 * <p>The command output is redirected to a temporary file. When the script exits with 0, each
 * line of that file is tokenized: two-token lines become {@code FrequencyItem(token1, count)},
 * three-token lines become {@code FrequencyItem(token2, token1, count)}, where {@code count}
 * is the first token parsed as an int. Other lines are ignored.
 *
 * <p>Execution and read failures are printed and otherwise ignored; in that case
 * {@code this.frequencyList} is left as it was, or partially filled if reading failed midway.
 *
 * @param deleteFLFile   whether to delete the output file after it was parsed successfully
 * @param semDescription semantic metadata values used as filter; ignored when null, empty,
 *                       or when its first element is null/blank
 * @param funDescription functional metadata values used as filter; same rules as above
 * @param quantityP      when positive, output is piped through {@code head -quantityP}
 * @param type           one of "forma", "PoS", "easypos", "lemma", "PoS-forma", "PoS-lemma";
 *                       any other value adds no attribute argument; must not be null
 * @param threshold      when non-null and positive, passed with {@code -f}
 * @param sorted         when true, output is piped through {@code sort -nr -k 1}
 * @return absolute path of the temporary output file (returned even if it has been deleted
 *         or the command failed)
 * @throws IllegalStateException if the temporary output file cannot be created
 */
private String getFrequencyList(boolean deleteFLFile, List<String> semDescription, List<String> funDescription,
        int quantityP, String type, Integer threshold, boolean sorted) {
    CommandLine commandLine = CommandLine.parse(this.cwbscanExecutable);
    commandLine.addArgument("-q");
    if (threshold != null && threshold > 0) {
        commandLine.addArgument("-f");
        commandLine.addArgument(threshold + "");
    }
    commandLine.addArgument("-r").addArgument(this.cqpRegistry);
    commandLine.addArgument("-C");
    commandLine.addArgument(this.cqpCorpusName);
    if (type.equals("forma")) {
        commandLine.addArgument("word+0");
    } else if (type.equals("PoS")) {
        commandLine.addArgument("pos+0");
    } else if (type.equals("easypos")) {
        commandLine.addArgument("easypos+0");
    } else if (type.equals("lemma")) {
        commandLine.addArgument("lemma+0");
    } else if (type.equals("PoS-forma")) {
        commandLine.addArgument("pos+0");
        commandLine.addArgument("word+0");
    } else if (type.equals("PoS-lemma")) {
        commandLine.addArgument("pos+0");
        commandLine.addArgument("lemma+0");
    }
    // A description list counts as a filter only when its first element is non-blank.
    boolean hasFunctional = funDescription != null && funDescription.size() > 0
            && funDescription.get(0) != null && funDescription.get(0).trim().length() > 0;
    boolean hasSemantic = semDescription != null && semDescription.size() > 0
            && semDescription.get(0) != null && semDescription.get(0).trim().length() > 0;
    if (hasFunctional || hasSemantic) {
        String semFuncParam = "?";
        if (hasFunctional) {
            String fd = StringUtils.join(funDescription, "\\|");
            semFuncParam += "text_functional=/\\(" + fd + "\\)/ ";
        }
        if (hasSemantic) {
            String sd = StringUtils.join(semDescription, "\\|");
            semFuncParam += "text_semantic=/\\(" + sd + "\\)/ ";
        }
        commandLine.addArgument(semFuncParam);
    }
    if (sorted) {
        commandLine.addArgument("|");
        commandLine.addArgument("sort");
        commandLine.addArgument("-nr");
        commandLine.addArgument("-k");
        commandLine.addArgument("1");
    }
    if (quantityP > 0) {
        commandLine.addArgument("|");
        commandLine.addArgument("head");
        commandLine.addArgument("-" + quantityP);
    }
    File flTempFile;
    try {
        flTempFile = File.createTempFile("ridireFL", null);
    } catch (IOException e) {
        // Nothing below can work without the output file; fail with the real cause
        // instead of a NullPointerException on the next dereference.
        throw new IllegalStateException("Cannot create temporary frequency list file", e);
    }
    commandLine.addArgument(" > ");
    commandLine.addArgument(flTempFile.getAbsolutePath());
    // The pipes and the redirect only work inside a shell, so the rendered command line
    // is written to a script and the script is what actually gets executed.
    String c = commandLine.toString();
    File tempSh = null;
    try {
        tempSh = File.createTempFile("ridireSH", ".sh");
        FileUtils.writeStringToFile(tempSh, c);
        tempSh.setExecutable(true);
        CommandLine scriptCommand = CommandLine.parse(tempSh.getAbsolutePath());
        DefaultExecutor executor = new DefaultExecutor();
        executor.setExitValue(0);
        ExecuteWatchdog watchdog = new ExecuteWatchdog(CWBFrequencyList.TIMEOUT);
        executor.setWatchdog(watchdog);
        ByteArrayOutputStream baosStdOut = new ByteArrayOutputStream(1024);
        ByteArrayOutputStream baosStdErr = new ByteArrayOutputStream(1024);
        ExecuteStreamHandler executeStreamHandler = new PumpStreamHandler(baosStdOut, baosStdErr, null);
        executor.setStreamHandler(executeStreamHandler);
        int exitValue = executor.execute(scriptCommand);
        if (exitValue == 0) {
            StrTokenizer strTokenizer = new StrTokenizer();
            this.frequencyList = new ArrayList<FrequencyItem>();
            List<String> lines = FileUtils.readLines(flTempFile);
            for (String line : lines) {
                strTokenizer.reset(line);
                String[] tokens = strTokenizer.getTokenArray();
                if (tokens.length == 2) {
                    FrequencyItem frequencyItem = new FrequencyItem(tokens[1],
                            Integer.parseInt(tokens[0].trim()));
                    this.frequencyList.add(frequencyItem);
                } else if (tokens.length == 3) {
                    FrequencyItem frequencyItem = new FrequencyItem(tokens[2], tokens[1],
                            Integer.parseInt(tokens[0].trim()));
                    this.frequencyList.add(frequencyItem);
                }
            }
            if (deleteFLFile) {
                FileUtils.deleteQuietly(flTempFile);
            }
        }
    } catch (ExecuteException e) {
        // Non-zero exit or failed launch: report it and still return the output path.
        e.printStackTrace();
    } catch (IOException e) {
        // Script creation or output reading failed: report it and still return the output path.
        e.printStackTrace();
    } finally {
        // Always remove the script, including when execution throws.
        FileUtils.deleteQuietly(tempSh);
    }
    return flTempFile.getAbsolutePath();
}

From source file:it.drwolf.ridire.index.sketch.AsyncSketchCreator.java

/**
 * Aggregates occurrence counts from tokenized lines into a two-level table.
 *
 * <p>A line is used only when it yields exactly three tokens: a whitespace-separated lemma
 * sequence, a whitespace-separated PoS sequence of the same length (at least two items), and
 * a target. The lemma at the first position (index 1 or later) whose PoS is {@code PRE} or
 * {@code ARTPRE} becomes the outer key; the trimmed target becomes the inner key, whose
 * counter is incremented. Lines without such a position are skipped.
 *
 * @param lines        raw lines to analyse
 * @param strTokenizer tokenizer used (and reset) for every line
 * @return map from the matched lemma to a map from target to number of occurrences
 */
private Map<String, Map<String, Number>> createResTable(List<String> lines, StrTokenizer strTokenizer) {
    Map<String, Map<String, Number>> table = new HashMap<String, Map<String, Number>>();
    for (String row : lines) {
        strTokenizer.reset(row);
        String[] columns = strTokenizer.getTokenArray();
        if (columns.length != 3) {
            continue;
        }
        String[] lemmas = columns[0].split("\\s");
        String[] tags = columns[1].split("\\s");
        if (lemmas.length != tags.length || tags.length < 2) {
            continue;
        }
        String target = columns[2].trim();
        // Position 0 is never inspected; scan forward to the first PRE/ARTPRE tag.
        int hit = 1;
        while (hit < tags.length && !tags[hit].trim().matches("PRE|ARTPRE")) {
            hit++;
        }
        if (hit == tags.length) {
            continue;
        }
        String key = lemmas[hit].trim();
        Map<String, Number> counts = table.get(key);
        if (counts == null) {
            counts = new HashMap<String, Number>();
            table.put(key, counts);
        }
        Number previous = counts.get(target);
        int updated = (previous == null) ? 1 : previous.intValue() + 1;
        counts.put(target, updated);
    }
    return table;
}

From source file:it.drwolf.ridire.index.sketch.AsyncSketchCreator.java

/**
 * Fills {@code sr} with a scored {@code SketchResult} for every two-token line of
 * {@code lines}.
 *
 * <p>The first line, trimmed, is parsed as the frequency {@code fA}. For each line that
 * tokenizes into exactly two tokens (joint frequency, item), the item's own frequency
 * {@code fB} is read from {@code freqTable}; when a positive {@code fB} is found, the result
 * for that item is created or updated and stored in {@code sr}. Nothing happens when
 * {@code lines} is null, empty, or starts with a blank line.
 *
 * <p>Note: {@code firstFreq} is not read by this method, and the table name is concatenated
 * into the native query text.
 *
 * @param sr           result map, keyed by item, updated in place
 * @param freqTable    name of the table queried for item frequencies; its substring from
 *                     index 5 is used to look up the corpus size
 * @param firstFreq    unused
 * @param strTokenizer tokenizer used (and reset) for every line
 * @param lines        raw lines to process
 */
private void processNotTrinaryTable(HashMap<String, SketchResult> sr, String freqTable, long firstFreq,
        StrTokenizer strTokenizer, List<String> lines) {
    if (lines == null || lines.size() < 1 || lines.get(0).trim().length() < 1) {
        return;
    }
    // fA = first line
    long fA = Long.parseLong(lines.get(0).trim());
    for (String row : lines) {
        strTokenizer.reset(row);
        String[] fields = strTokenizer.getTokenArray();
        if (fields.length != 2) {
            continue;
        }
        String item = fields[1];
        List<Number> itemFreqs = this.entityManager
                .createNativeQuery("select freq from " + freqTable + " where item=:item")
                .setParameter("item", item).getResultList();
        if (itemFreqs == null || itemFreqs.size() < 1) {
            continue;
        }
        SketchResult result = sr.get(item);
        if (result == null) {
            result = new SketchResult();
        }
        long fB = itemFreqs.get(0).longValue();
        if (fB <= 0) {
            continue;
        }
        result.setCollocata(item);
        long corpusSize = this.corpusSizeParams.getCorpusSize(freqTable.substring(5)).longValue();
        long fAB = Long.parseLong(fields[0]);
        double score = this.getSketchScore(CWBCollocatesExtractor.LOGDICE_SCORE, fA, fB, fAB, corpusSize);
        result.setScore(score);
        result.setfA(fA);
        result.setfAB(fAB);
        result.setfB(fB);
        sr.put(item, result);
    }
}

From source file:it.drwolf.ridire.util.async.FrequencyListGenerator.java

/**
 * Computes a frequency table from the term vectors of the {@code performaFL} field of all
 * documents matching the given filters.
 *
 * <p>Documents are fetched in batches of {@code FrequencyListGenerator.BATCH_SIZE}. Every
 * term is split on {@code ContextAnalyzer.SEPARATOR}; terms that do not yield exactly three
 * tokens (form, PoS, lemma) are skipped. A lemma equal to {@code <unknown>} is replaced by
 * the form. The table key depends on {@code frequencyBy}:
 * "forma" uses the form, "lemma" the lemma, "PoS-lemma" / "PoS-forma" use
 * {@code PoS + " / " + lemma-or-form} with any PoS starting with "VER" shortened to "VER",
 * and any other value uses the unmodified PoS.
 *
 * @param corporaNames                   corpora to restrict to (any of them); no restriction
 *                                       when null, empty, or a single null element
 * @param functionalMetadatumDescription required functional metadatum, or null for none
 * @param semanticMetadatumDescription   required semantic metadatum, or null for none
 * @param frequencyBy                    key selection mode, see above; must not be null
 * @return map from key to summed term frequency
 * @throws IOException if the index cannot be read
 */
private Map<String, Integer> getBareTable(List<String> corporaNames, String functionalMetadatumDescription,
        String semanticMetadatumDescription, String frequencyBy) throws IOException {
    Map<String, Integer> fl = new HashMap<String, Integer>();
    BooleanQuery query = new BooleanQuery();
    boolean hasCorpora = corporaNames != null && corporaNames.size() > 0;
    if (hasCorpora && (corporaNames.size() != 1 || corporaNames.get(0) != null)) {
        BooleanQuery anyCorpus = new BooleanQuery();
        for (String corpusName : corporaNames) {
            if (corpusName == null) {
                continue;
            }
            anyCorpus.add(new TermQuery(new Term("corpus", corpusName)), Occur.SHOULD);
        }
        query.add(anyCorpus, Occur.MUST);
    }
    if (functionalMetadatumDescription != null) {
        query.add(new TermQuery(new Term("functionalMetadatum", functionalMetadatumDescription)), Occur.MUST);
    }
    if (semanticMetadatumDescription != null) {
        query.add(new TermQuery(new Term("semanticMetadatum", semanticMetadatumDescription)), Occur.MUST);
    }
    // Empty prefix on the field: the document must have at least one performaFL term.
    query.add(new PrefixQuery(new Term("performaFL", "")), Occur.MUST);
    IndexSearcher indexSearcher = this.contextsIndexManager.getIndexSearcherR();
    System.out.println("Starting FL calculation");
    TotalHitCountCollector hitCounter = new TotalHitCountCollector();
    indexSearcher.search(query, null, hitCounter);
    int totalHits = hitCounter.getTotalHits();
    System.out.println("Frequency list calculation. Docs to be processed: " + totalHits);
    ScoreDoc after = null;
    int docsProcessed = 0;
    for (int offset = 0; offset < totalHits; offset += FrequencyListGenerator.BATCH_SIZE) {
        // First batch is a plain search; later batches continue after the last seen document.
        TopDocs batch = (after == null)
                ? indexSearcher.search(query, FrequencyListGenerator.BATCH_SIZE)
                : indexSearcher.searchAfter(after, query, FrequencyListGenerator.BATCH_SIZE);
        StrTokenizer strTokenizer = new StrTokenizer();
        strTokenizer.setDelimiterString(ContextAnalyzer.SEPARATOR);
        ScoreDoc[] hits = batch.scoreDocs;
        if (hits == null) {
            continue;
        }
        for (ScoreDoc hit : hits) {
            ++docsProcessed;
            after = hit;
            TermFreqVector vector = indexSearcher.getIndexReader().getTermFreqVector(hit.doc, "performaFL");
            if (vector == null) {
                continue;
            }
            String[] terms = vector.getTerms();
            int[] frequencies = vector.getTermFrequencies();
            for (int i = 0; i < terms.length; i++) {
                String[] parts = strTokenizer.reset(terms[i]).getTokenArray();
                if (parts.length != 3) {
                    continue;
                }
                String form = parts[0];
                String pos = parts[1];
                String lemma = parts[2];
                if (lemma.equals("<unknown>")) {
                    lemma = form;
                }
                String key;
                if (frequencyBy.equals("forma")) {
                    key = form;
                } else if (frequencyBy.equals("lemma")) {
                    key = lemma;
                } else if (frequencyBy.equals("PoS-lemma")) {
                    key = (pos.startsWith("VER") ? "VER" : pos) + " / " + lemma;
                } else if (frequencyBy.equals("PoS-forma")) {
                    key = (pos.startsWith("VER") ? "VER" : pos) + " / " + form;
                } else {
                    key = pos;
                }
                Integer seen = fl.get(key);
                fl.put(key, seen == null ? frequencies[i] : frequencies[i] + seen);
            }
            if (docsProcessed % 1000 == 0) {
                System.out.println("Frequency list calculation. Docs processed: " + docsProcessed
                        + " on total: " + totalHits + " (" + docsProcessed * 100.0f / totalHits + "%)");
            }
        }
    }
    return fl;
}

From source file:it.drwolf.ridire.index.cwb.scripts.VRTFilesBuilder.java

/**
 * Converts a PoS file into a VRT file named {@code <digest>.vrt} inside {@code destDir}.
 *
 * <p>Every line of the PoS file that tokenizes into exactly three tokens is rewritten as four
 * tab-separated columns: token 0, token 1 without colons, the value of {@code getEasyPos}
 * for token 1 without colons, and token 2. Malformed lines are reported on standard error
 * and skipped. The output is preceded by a header built by {@code getHeaderFromResource} and
 * closed with {@code </text>}.
 *
 * <p>Nothing is written when the PoS file is missing or unreadable, or when
 * {@code haveStrangeChars} rejects its content. I/O errors are printed and otherwise ignored.
 *
 * @param posFileName  path of the PoS file to read
 * @param strTokenizer tokenizer used (and reset) for every line
 * @param cr           resource supplying metadata, URL, job name and digest
 * @param destDir      directory receiving the VRT file
 */
public void createVRTFile(String posFileName, StrTokenizer strTokenizer, CrawledResource cr, File destDir) {
    File posFile = new File(posFileName);
    if (!posFile.exists() || !posFile.canRead()) {
        System.err.println("Warning - File " + posFileName + " doesn't exist.");
        return;
    }
    try {
        List<String> posFileLines = FileUtils.readLines(posFile);
        if (this.haveStrangeChars(posFileLines)) {
            this.log.warn("File with strange chars {0}", posFileName);
            return;
        }
        List<String> vrtLines = new ArrayList<String>();
        for (String row : posFileLines) {
            String[] fields = strTokenizer.reset(row).getTokenArray();
            if (fields.length != 3) {
                System.err.println("File: " + posFileName + " Stringa malformed: " + row);
                continue;
            }
            StringBuilder vrtRow = new StringBuilder();
            vrtRow.append(fields[0]).append("\t");
            vrtRow.append(fields[1].replaceAll(":", "")).append("\t");
            vrtRow.append(this.getEasyPos(fields[1]).replaceAll(":", "")).append("\t");
            vrtRow.append(fields[2]);
            vrtLines.add(vrtRow.toString());
        }
        // Missing metadata and URL are rendered as empty strings in the header.
        String functionalMetadatum = "";
        if (cr.getFunctionalMetadatum() != null) {
            functionalMetadatum = cr.getFunctionalMetadatum().getDescription();
        }
        String semanticMetadatum = "";
        if (cr.getSemanticMetadatum() != null) {
            semanticMetadatum = cr.getSemanticMetadatum().getDescription();
        }
        String rawUrl = cr.getUrl();
        String url = (rawUrl != null) ? rawUrl : "";
        String header = this.getHeaderFromResource(cr.getJob().getName(), functionalMetadatum,
                semanticMetadatum, url, posFile);
        vrtLines.add(0, header);
        vrtLines.add("</text>");
        FileUtils.writeLines(new File(destDir, cr.getDigest() + ".vrt"), vrtLines);
    } catch (IOException e) {
        // Reading or writing failed: report it and leave without a (complete) VRT file.
        e.printStackTrace();
    }
}

From source file:it.drwolf.ridire.index.cwb.CWBPatternSearcher.java

/**
 * Renders a context string for display, keeping only the selected parts of each word.
 *
 * <p>{@code term} is split with a default-configured tokenizer; every resulting word is split
 * again on {@code CWBPatternSearcher.SEPARATOR} into up to three parts (positions 0, 1, 2).
 * When {@code getToBeVisualized()} contains {@code TUTTO}, the first three available parts
 * are kept; otherwise position 0 is kept for {@code FORMA}, position 1 for "PoS" and
 * position 2 for "Lemma". The kept parts of a word are joined with {@code "_/_"} for the left
 * context and {@code " / "} otherwise, and the words are joined with a single space.
 *
 * @param term        context text to render
 * @param leftContext whether the text is the left context (selects the part separator)
 * @return the rendered text
 */
private String getTermVisualization(String term, boolean leftContext) {
    String partSeparator = leftContext ? "_/_" : " / ";
    List<String> words = new StrTokenizer().reset(term).getTokenList();
    List<String> rendered = new ArrayList<String>();
    for (String word : words) {
        List<String> shown = new ArrayList<String>();
        String[] tokens = new StrTokenizer(word, CWBPatternSearcher.SEPARATOR).getTokenArray();
        if (this.getToBeVisualized().contains(CWBPatternSearcher.TUTTO)) {
            for (int i = 0; i < tokens.length && i < 3; i++) {
                shown.add(tokens[i]);
            }
        } else {
            if (this.getToBeVisualized().contains(CWBPatternSearcher.FORMA) && tokens.length > 0) {
                shown.add(tokens[0]);
            }
            if (this.getToBeVisualized().contains("PoS") && tokens.length > 1) {
                shown.add(tokens[1]);
            }
            if (this.getToBeVisualized().contains("Lemma") && tokens.length > 2) {
                shown.add(tokens[2]);
            }
        }
        rendered.add(StringUtils.join(shown, partSeparator));
    }
    return StringUtils.join(rendered, " ");
}

From source file:it.drwolf.ridire.session.JobManager.java

/**
 * Loads the PoS-tagged text of a crawled resource and stores it via {@code setPosText}.
 *
 * <p>The file is {@code <digest>.txt.pos} in the {@code JobManager.RESOURCESDIR} directory
 * next to the resource's arc file (with any {@code __<digits>} sequence removed from the arc
 * file path). Each line that tokenizes (TSV) into exactly three tokens becomes a
 * {@code PoSLine} with trimmed form, PoS tag and lemma; other lines are skipped.
 *
 * <p>If the file cannot be read, the failure is not reported and an empty list is stored.
 *
 * @param cr resource whose PoS text is loaded
 */
public void retrievePoSText(CrawledResource cr) {
    String arcPath = cr.getArcFile().replaceAll("__\\d+", "");
    File resourceDir = new File(FilenameUtils.getFullPath(arcPath) + JobManager.RESOURCESDIR);
    File posTextFile = new File(resourceDir, cr.getDigest() + ".txt.pos");
    List<PoSLine> posLines = new ArrayList<PoSLine>();
    try {
        List<String> rows = FileUtils.readLines(posTextFile);
        StrTokenizer tsv = StrTokenizer.getTSVInstance();
        for (String row : rows) {
            String[] fields = tsv.reset(row).getTokenArray();
            if (fields.length != 3) {
                continue;
            }
            PoSLine entry = new PoSLine();
            entry.setForm(fields[0].trim());
            entry.setPosTag(fields[1].trim());
            entry.setLemma(fields[2].trim());
            posLines.add(entry);
        }
    } catch (IOException ignored) {
        // Read failures are silently skipped here; the list stays empty because the
        // whole file is read before any line is parsed.
    }
    this.setPosText(posLines);
}