List of usage examples for org.apache.lucene.analysis.TokenStream#incrementToken()
public abstract boolean incrementToken() throws IOException;
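The examples below all follow the same consumption contract: obtain a TokenStream, call reset(), loop while incrementToken() returns true while reading attributes such as CharTermAttribute, then call end() and close(). As a minimal sketch of that contract (the StandardAnalyzer, field name, and sample text here are illustrative assumptions, not taken from the examples that follow):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenSketch {
    public static void main(String[] args) throws IOException {
        // Hypothetical analyzer, field name, and text, chosen only for illustration.
        try (Analyzer analyzer = new StandardAnalyzer()) {
            TokenStream ts = analyzer.tokenStream("body", "Hello TokenStream incrementToken example");
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                   // mandatory before the first incrementToken()
            while (ts.incrementToken()) { // returns false once the stream is exhausted
                System.out.println(termAtt.toString());
            }
            ts.end();                     // records end-of-stream attribute state
            ts.close();                   // releases resources held by the stream
        }
    }
}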
From source file:nmsu.cs.DocParser.java
License:Open Source License
/**
 * @param docs
 * @param pub2bugsIds
 * @param dim1size
 * @param dim2size
 * @return
 * @throws IOException
 */
public static Map<Integer, Map<Integer, Map<Integer, Double>>> calculateAspectDocumentMap(
        Map<Integer, Doc> docs, Map<String, Integer> vocabulary, List<String> vocabInverse,
        BidiMap pub2bugsIds, final int dim1size, final int dim2size, final int dim3size) {
    // double[][][] apsectDocMatrix = new double[dim1size][dim2size][dim3size];
    // word -> aspect -> document -> occur number
    Map<Integer, Map<Integer, Map<Integer, Double>>> w2a2d2num = new HashMap<Integer, Map<Integer, Map<Integer, Double>>>();
    try {
        for (Object pubid_ : pub2bugsIds.keySet()) {
            int pubid = (Integer) pubid_; // document id
            int bugid = getPubid2BugsId(pubid, pub2bugsIds);
            Doc doc = docs.get(pubid);
            List<String> textList = doc.getText();
            // aspect id
            for (int i = 0; i < textList.size(); i++) {
                // TODO: remove URL, @XXX and non-ASCII characters -- DONE
                String text = textList.get(i);
                // remove url
                String t1 = text.replaceAll(
                        "(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]", "");
                // remove non-ascii
                String t2 = t1.replaceAll("[^\\x00-\\x7F]", "");
                // stemmed text token stream
                TokenStream tokenStream = getStemmer().tokenStream("", new StringReader(t2));
                while (tokenStream.incrementToken()) {
                    String token = tokenStream.reflectAsString(false);
                    int termBegin = 5;
                    int termEnd = token.indexOf(",");
                    final String wordstem = token.substring(termBegin, termEnd);
                    // w id
                    Integer vocabIndex = vocabulary.get(wordstem);
                    if (vocabIndex != null) {
                        // instance.add(vocabIndex, bugid, 1.0);
                        // apsectDocMatrix[vocabIndex][i][bugid] += 1.0;
                        // initialize w2a2d2count map
                        Double oldValue = Util.get3Map(w2a2d2num, vocabIndex, i, bugid);
                        Util.update3Map(w2a2d2num, vocabIndex, i, bugid, oldValue + 1);
                    }
                }
            }
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return w2a2d2num;
}
From source file:nmsu.cs.DocParser.java
License:Open Source License
public static double[][][] calculateAspectDocumentMatrix(Map<Integer, Doc> docs,
        Map<String, Integer> vocabulary, List<String> vocabInverse, BidiMap pub2bugsIds,
        final int dim1size, final int dim2size, final int dim3size) {
    // word -> aspect -> document -> occur number
    double[][][] apsectDocMatrix = new double[dim1size][dim2size][dim3size];
    try {
        for (Object pubid_ : pub2bugsIds.keySet()) {
            int pubid = (Integer) pubid_; // document id
            int bugid = getPubid2BugsId(pubid, pub2bugsIds);
            Doc doc = docs.get(pubid);
            List<String> textList = doc.getText();
            // aspect id
            for (int i = 0; i < textList.size(); i++) {
                // TODO: remove URL, @XXX and non-ASCII characters -- DONE
                String text = textList.get(i);
                // remove url
                String t1 = text.replaceAll(
                        "(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]", "");
                // remove non-ascii
                String t2 = t1.replaceAll("[^\\x00-\\x7F]", "");
                // stemmed text token stream
                TokenStream tokenStream = getStemmer().tokenStream("", new StringReader(t2));
                // for (Token tok = tokenStream.next(); tok != null; tok = tokenStream.next()) {
                while (tokenStream.incrementToken()) {
                    String token = tokenStream.reflectAsString(false);
                    int termBegin = 5;
                    int termEnd = token.indexOf(",");
                    final String wordstem = token.substring(termBegin, termEnd);
                    // w id
                    Integer vocabIndex = vocabulary.get(wordstem);
                    if (vocabIndex != null) {
                        // instance.add(vocabIndex, bugid, 1.0);
                        apsectDocMatrix[vocabIndex][i][bugid] += 1.0;
                    }
                }
            }
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return apsectDocMatrix;
}
From source file:nmsu.cs.DocParser.java
License:Open Source License
/**
 * @param docs
 * @param vocabulary
 * @param vocabInverse
 * @param pub2bugsIds
 * @param dim1size
 * @param dim2size
 * @return
 */
public static double[][] calculateWordDocumentMatrix(Map<Integer, Doc> docs, Map<String, Integer> vocabulary,
        List<String> vocabInverse, BidiMap pub2bugsIds, final int dim1size, final int dim2size) {
    double[][] wordDocMatrix = new double[dim1size][dim2size];
    try {
        assert (dim1size == vocabInverse.size()) : "dimensions do not match in dim 1: dim1size=" + dim1size;
        // assert (dim2size == Collections.<Integer>max(pub2bugsIds.values()) + 1) :
        //     "max of pub2bugsIds does not match in dim 2: dim1size=" + dim2size +
        //     " Collections.<Integer>max(pub2bugsIds.values()=" + Collections.<Integer>max(pub2bugsIds.values());
        for (Object pubid_ : pub2bugsIds.keySet()) {
            int pubid = (Integer) pubid_;
            int bugid = getPubid2BugsId(pubid, pub2bugsIds);
            Doc doc = docs.get(pubid);
            String text = doc.getFullText();
            assert (text.length() > 1);
            // remove url
            String t1 = text.replaceAll(
                    "(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]", "");
            // remove non-ascii
            String t2 = t1.replaceAll("[^\\x00-\\x7F]", "");
            TokenStream tokenStream = getStemmer().tokenStream("", new StringReader(t2));
            // for (Token tok = tokenStream.next(); tok != null; tok = tokenStream.next()) {
            //     final String wordstem = tok.termText();
            while (tokenStream.incrementToken()) {
                String token = tokenStream.reflectAsString(false);
                int termBegin = 5;
                int termEnd = token.indexOf(",");
                final String wordstem = token.substring(termBegin, termEnd);
                Integer vocabIndex = vocabulary.get(wordstem);
                if (vocabIndex != null) {
                    // instance.add(vocabIndex, bugid, 1.0);
                    wordDocMatrix[vocabIndex][bugid] += 1.0;
                }
            }
        }
        return wordDocMatrix;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
From source file:nmsu.cs.DocParser.java
License:Open Source License
public static Map<Integer, Map<Integer, Double>> calculateWordDocumentMap(Map<Integer, Doc> docs,
        Map<String, Integer> vocabulary, List<String> vocabInverse, BidiMap pub2bugsIds,
        final int dim1size, final int dim2size) {
    Map<Integer, Map<Integer, Double>> wordDocMap = new HashMap<Integer, Map<Integer, Double>>();
    try {
        assert (dim1size == vocabInverse.size()) : "dimensions do not match in dim 1: dim1size=" + dim1size;
        // assert (dim2size == Collections.<Integer>max(pub2bugsIds.values()) + 1) :
        //     "max of pub2bugsIds does not match in dim 2: dim1size=" + dim2size +
        //     " Collections.<Integer>max(pub2bugsIds.values()=" + Collections.<Integer>max(pub2bugsIds.values());
        for (Object pubid_ : pub2bugsIds.keySet()) {
            int pubid = (Integer) pubid_;
            int bugid = getPubid2BugsId(pubid, pub2bugsIds);
            Doc doc = docs.get(pubid);
            String text = doc.getFullText();
            assert (text.length() > 1);
            // remove url
            String t1 = text.replaceAll(
                    "(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]", "");
            // remove non-ascii
            String t2 = t1.replaceAll("[^\\x00-\\x7F]", "");
            TokenStream tokenStream = getStemmer().tokenStream("", new StringReader(t2));
            // for (Token tok = tokenStream.next(); tok != null; tok = tokenStream.next()) {
            //     final String wordstem = tok.termText();
            while (tokenStream.incrementToken()) {
                String token = tokenStream.reflectAsString(false);
                int termBegin = 5;
                int termEnd = token.indexOf(",");
                final String wordstem = token.substring(termBegin, termEnd);
                Integer vocabIndex = vocabulary.get(wordstem);
                if (vocabIndex != null) {
                    // instance.add(vocabIndex, bugid, 1.0);
                    Double oldValue = Util.get2Map(wordDocMap, vocabIndex, bugid);
                    Util.update2Map(wordDocMap, vocabIndex, bugid, oldValue + 1.0);
                }
            }
        }
        return wordDocMap;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
From source file:nmsu.cs.DocParser.java
License:Open Source License
/**
 * Huiping noted 2012-02-29
 *
 * Extract vocabularies from all documents and build TF array, DF array.
 */
public void createVocabulary(Map<Integer, Doc> docs, int minWordsOccurence,
        //Map<String, Integer> vocabularyMap,
        List<String> vocabInverse) {
    vocabularyMap.clear();
    docInverse = hashDocs(new ArrayList<Doc>(docs.values()));
    int corpus_length = 0;
    try {
        // calculate origVocabulary
        // map from vocabulary to its index position
        origVocabulary = new HashMap<String, Integer>();
        // word ordered by word index
        origVocabInverse = new ArrayList<String>();
        // list of documents containing word, ordered by word index
        setOrigVocabInverse2Doc(new ArrayList<List<Integer>>());
        // each element holds a list of the frequency with which a word occurs in the document.
        // order is the same as above
        origVocabInverse2DocFrequency = new ArrayList<List<Integer>>();
        assert (docs.size() > 0);
        for (int pubid : docs.keySet()) {
            Doc doc = docs.get(pubid);
            String text = doc.getFullText();
            assert (text.length() > 1);
            // remove url
            String t1 = text.replaceAll(
                    "(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]", "");
            // remove non-ascii
            String t2 = t1.replaceAll("[^\\x00-\\x7F]", "");
            TokenStream tokenStream = getStemmer().tokenStream("", new StringReader(t2));
            // for (Token tok = tokenStream.next(); tok != null; tok = tokenStream.next()) {
            //     final String wordstem = tok.termText();
            while (tokenStream.incrementToken()) {
                corpus_length++;
                String token = tokenStream.reflectAsString(false);
                int termBegin = 5;
                int termEnd = token.indexOf(",");
                final String wordstem = token.substring(termBegin, termEnd);
                // get the vocab index
                Integer vocabIndex = origVocabulary.get(wordstem);
                if (vocabIndex == null) {
                    vocabIndex = origVocabulary.size();
                    origVocabulary.put(wordstem, vocabIndex); // hash
                    origVocabInverse.add(wordstem); // add to list
                    getOrigVocabInverse2Doc().add(new ArrayList<Integer>(Collections.singleton(pubid))); // add to document hash
                    origVocabInverse2DocFrequency.add(new ArrayList<Integer>(Collections.singleton(1))); // add to document hash
                    assert (vocabIndex == origVocabInverse.size() - 1) : "vocabIndex = " + vocabIndex
                            + " origVocabInverse.size()-1=" + (origVocabInverse.size() - 1);
                    assert (vocabIndex == getOrigVocabInverse2Doc().size() - 1) : "vocabIndex =" + vocabIndex
                            + " origVocabInverse2Doc.size()-1=" + (getOrigVocabInverse2Doc().size() - 1);
                    assert (getOrigVocabInverse2Doc().get(vocabIndex).size() == 1)
                            : "origVocabInverse2Doc.get(vocabIndex).size()="
                                    + getOrigVocabInverse2Doc().get(vocabIndex).size();
                    assert (origVocabInverse2DocFrequency.get(vocabIndex).size() == 1)
                            : "origVocabInverse2DocFrequency.get(vocabIndex).size()="
                                    + origVocabInverse2DocFrequency.get(vocabIndex).size();
                } else {
                    assert (origVocabInverse.get(origVocabulary.get(wordstem)).equals(wordstem))
                            : "Inverse vocabulary broken. wordstem=" + wordstem
                                    + " vocabulary.get(wordstem)=" + origVocabulary.get(wordstem)
                                    + " origVocabInverse.get(" + origVocabulary.get(wordstem) + ")="
                                    + origVocabInverse.get(origVocabulary.get(wordstem)) + ")";
                    // add 1 to the frequency list of (vocabIndex, doc)
                    List<Integer> doclist = getOrigVocabInverse2Doc().get(vocabIndex);
                    List<Integer> freqlist = origVocabInverse2DocFrequency.get(vocabIndex);
                    int docindex1 = doclist.indexOf(pubid);
                    if (docindex1 < 0) {
                        docindex1 = doclist.size();
                        doclist.add(pubid);
                    }
                    while (docindex1 >= freqlist.size()) {
                        freqlist.add(0);
                    }
                    int docindex = docindex1;
                    final int oldFreq = freqlist.get(docindex);
                    freqlist.set(docindex, oldFreq + 1);
                    assert (getOrigVocabInverse2Doc().size() == origVocabInverse2DocFrequency.size());
                    assert (origVocabInverse2DocFrequency.get(vocabIndex).get(docindex) > 0);
                }
            }
        }
        // calculate the (real) vocabulary used by the inference algorithm:
        // throw out words that occur only within one document (and thus achieve no coupling)
        assert (getOrigVocabInverse2Doc().size() > 1);
        assert (origVocabInverse.size() > 1);
        // vocabulary = new HashMap<String, Integer>();
        // vocabInverse = new ArrayList<String>();
        for (int w = 0; w < getOrigVocabInverse2Doc().size(); w++) {
            if (getOrigVocabInverse2Doc().get(w).size() >= minWordsOccurence) {
                int vocaIndex = vocabInverse.size();
                vocabInverse.add(origVocabInverse.get(w));
                vocabularyMap.put(origVocabInverse.get(w), vocaIndex);
            }
        }
        // System.out.println(Debugger.getCallerPosition()+"vocabInverse (" + vocabInverse.size() + ") " + vocabInverse);
        // System.out.println(Debugger.getCallerPosition()+"origVocabInverse (" + origVocabInverse.size() + ")" + origVocabInverse);
        // System.out.println(Debugger.getCallerPosition()+Arrays.toString(vocabInverse.toArray()));
        assert (vocabInverse.size() > 0);
    } catch (IOException e) {
        throw new RuntimeException(e); // TODO: handle
    }
    Constant.tokenNum = vocabularyMap.size();
    System.out.println(Debugger.getCallerPosition() + " corpus length is " + corpus_length);
}
From source file:org.aksw.palmetto.corpus.lucene.SimpleAnalyzerTest.java
License:Open Source License
public void test(boolean lowercase) throws Exception {
    SimpleAnalyzer analyzer = new SimpleAnalyzer(lowercase);
    TokenStream stream = analyzer.tokenStream("test", text);
    CharTermAttribute token;
    int count = 0;
    stream.reset();
    while (stream.incrementToken()) {
        Assert.assertTrue(count < expectedTokens.length);
        token = stream.getAttribute(CharTermAttribute.class);
        if (lowercase) {
            Assert.assertEquals(expectedTokens[count].toLowerCase(), token.toString());
        } else {
            Assert.assertEquals(expectedTokens[count], token.toString());
        }
        ++count;
    }
    Assert.assertEquals(expectedTokens.length, count);
    analyzer.close();
}
From source file:org.alfresco.repo.search.impl.lucene.analysis.MLAnalayserTest.java
License:Open Source License
/**
 * Check that the TokenStream yields the exact tokens specified.
 * Note that order is not checked, since the map of locales will not provide a
 * predictable ordering when enumerated.
 *
 * The expected list of tokens may contain the same token more than once and
 * the number of instances will have to match the number found in the stream.
 *
 * @param ts TokenStream to inspect.
 * @param expectedTokens List of tokens expected from the stream.
 * @throws IOException
 */
private void verifyTokenStream(TokenStream ts, List<String> expectedTokens) throws IOException {
    final int expectedCount = expectedTokens.size();
    int count = 0;
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    try {
        ts.reset();
        while (ts.incrementToken()) {
            count++;
            System.out.println("Token: " + termAtt.toString());
            if (expectedTokens.contains(termAtt.toString())) {
                // remove an instance of the term text so that it is not matched again
                expectedTokens.remove(termAtt.toString());
            } else {
                fail("Unexpected token: " + termAtt.toString());
            }
        }
        ts.end();
    } finally {
        ts.close();
    }
    assertEquals("Incorrect number of tokens generated.", expectedCount, count);
}
From source file:org.alfresco.repo.search.impl.lucene.analysis.PathTokenFilterTest.java
License:Open Source License
private void tokenise(TokenStream ts, String[] tokens) throws IOException {
    int i = 0;
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
    try {
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println("token: " + ts.reflectAsString(true));
            String termText = termAtt.toString();
            if (typeAtt.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAMESPACE)) {
                assert (i % 2 == 0);
                assertEquals(termText, tokens[i++]);
            } else if (typeAtt.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAMESPACE_PREFIX)) {
                assert (i % 2 == 0);
                assertEquals(termText, tokens[i++]);
            } else if (typeAtt.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAME)) {
                assert (i % 2 == 1);
                assertEquals(termText, tokens[i++]);
            }
        }
        ts.end();
    } finally {
        ts.close();
    }
    if (i != tokens.length) {
        fail("Invalid number of tokens, found " + i + " and expected " + tokens.length);
    }
}
From source file:org.alfresco.solr.AlfrescoFieldType.java
License:Open Source License
public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn) {
    if (part == null || analyzerIn == null)
        return null;
    TokenStream source = null;
    try {
        source = analyzerIn.tokenStream(field, part);
        source.reset();
        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();
        if (!source.incrementToken())
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "analyzer returned no terms for multiTerm term: " + part);
        if (source.incrementToken())
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "analyzer returned too many terms for multiTerm term: " + part);
        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "error analyzing range part: " + part, e);
    } finally {
        IOUtils.closeWhileHandlingException(source);
    }
}
From source file:org.alfresco.solr.query.Solr4QueryParser.java
License:Open Source License
private ArrayList<String> getTokens(IndexableField indexableField) throws IOException {
    ArrayList<String> tokens = new ArrayList<String>();
    TokenStream ts = indexableField.tokenStream(schema.getIndexAnalyzer(), null);
    CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        String token = new String(termAttribute.buffer(), 0, termAttribute.length());
        tokens.add(token);
    }
    ts.end();
    ts.close();
    return tokens;
}