Example usage for org.apache.lucene.analysis TokenStream reflectAsString

List of usage examples for org.apache.lucene.analysis TokenStream reflectAsString

Introduction

On this page you can find example usage for org.apache.lucene.analysis TokenStream reflectAsString.

Prototype

public final String reflectAsString(final boolean prependAttClass) 

Document

This method returns the current attribute values as a string, built by calling the #reflectWith(AttributeReflector) method, in the following format:
  • iff prependAttClass=true : "AttributeClass#key=value,AttributeClass#key=value"
  • iff prependAttClass=false : "key=value,key=value"
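
For illustration, the minimal sketch below prints the reflected string for each token. It is not taken from the usage examples that follow; StandardAnalyzer, the field name, and the sample text are arbitrary choices, and the exact attributes that appear depend on the analysis chain and Lucene version.

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

public class ReflectAsStringDemo {
    public static void main(String[] args) throws IOException {
        try (StandardAnalyzer analyzer = new StandardAnalyzer();
                TokenStream ts = analyzer.tokenStream("field", "Hello Lucene")) {
            ts.reset();
            while (ts.incrementToken()) {
                // e.g. "org.apache.lucene.analysis.tokenattributes.CharTermAttribute#term=hello,..."
                System.out.println(ts.reflectAsString(true));
                // e.g. "term=hello,bytes=[68 65 6c 6c 6f],startOffset=0,endOffset=5,..."
                System.out.println(ts.reflectAsString(false));
            }
            ts.end();
        }
    }
}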

Usage

From source file:nmsu.cs.DocParser.java

License:Open Source License

/**
 * @param docs
 * @param pub2bugsIds
 * @param dim1size
 * @param dim2size
 * @return
 * @throws IOException 
 */
public static Map<Integer, Map<Integer, Map<Integer, Double>>> calculateAspectDocumentMap(
        Map<Integer, Doc> docs, Map<String, Integer> vocabulary, List<String> vocabInverse, BidiMap pub2bugsIds,
        final int dim1size, final int dim2size, final int dim3size) {

    //       double[][][] apsectDocMatrix = new double[dim1size][dim2size][dim3size];
    //word -> aspect -> document -> occur number
    Map<Integer, Map<Integer, Map<Integer, Double>>> w2a2d2num = new HashMap<Integer, Map<Integer, Map<Integer, Double>>>();

    try {
        for (Object pubid_ : pub2bugsIds.keySet()) {
            int pubid = (Integer) pubid_;
            //document id
            int bugid = getPubid2BugsId(pubid, pub2bugsIds);

            Doc doc = docs.get(pubid);
            List<String> textList = doc.getText();
            //aspect id
            for (int i = 0; i < textList.size(); i++) {
                //TODO:: remove URL, @XXX and non-ASCII characters
                //DONE
                String text = textList.get(i);
                //remove url
                String t1 = text.replaceAll(
                        "(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]", "");
                //remove non-ascii
                String t2 = t1.replaceAll("[^\\x00-\\x7F]", "");

                //               stemmed text token stream
                TokenStream tokenStream = getStemmer().tokenStream("", new StringReader(t2));

                while (tokenStream.incrementToken()) {
                    String token = tokenStream.reflectAsString(false);
                    int termBegin = 5;
                    int termEnd = token.indexOf(",");
                    final String wordstem = token.substring(termBegin, termEnd);

                    //w id
                    Integer vocabIndex = vocabulary.get(wordstem);
                    if (vocabIndex != null) {
                        //instance.add(vocabIndex, bugid, 1.0);
                        //                          apsectDocMatrix[vocabIndex][i][bugid] +=1.0;

                        //initialize w2a2d2count map
                        Double oldValue = Util.get3Map(w2a2d2num, vocabIndex, i, bugid);
                        Util.update3Map(w2a2d2num, vocabIndex, i, bugid, oldValue + 1);
                    }
                }
            }
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return w2a2d2num;
}
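
In the loop above, substring(termBegin, termEnd) works because, with prependAttClass=false, the reflected string for this stream begins with term=<stem>: index 5 is the length of "term=", and the first comma ends the value. A more direct way to read the term is to ask the stream for its CharTermAttribute, as in the sketch below; it is not part of the original source, the analyzer parameter stands in for getStemmer(), and recent Lucene versions also require reset() before the first incrementToken() and end()/close() afterwards.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Sketch only: collect the stemmed terms via CharTermAttribute instead of
// slicing them out of reflectAsString(false).
static List<String> stems(Analyzer analyzer, String text) throws IOException {
    List<String> stems = new ArrayList<>();
    try (TokenStream ts = analyzer.tokenStream("", text)) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();                        // required before incrementToken() in Lucene 4+
        while (ts.incrementToken()) {
            stems.add(termAtt.toString()); // same value the substring trick extracts
        }
        ts.end();                          // close() is handled by try-with-resources
    }
    return stems;
}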

From source file:nmsu.cs.DocParser.java

License:Open Source License

public static double[][][] calculateAspectDocumentMatrix(Map<Integer, Doc> docs,
        Map<String, Integer> vocabulary, List<String> vocabInverse, BidiMap pub2bugsIds, final int dim1size,
        final int dim2size, final int dim3size) {

    double[][][] apsectDocMatrix = new double[dim1size][dim2size][dim3size];
    //word -> aspect -> document -> occur number

    try {
        for (Object pubid_ : pub2bugsIds.keySet()) {
            int pubid = (Integer) pubid_;
            //document id
            int bugid = getPubid2BugsId(pubid, pub2bugsIds);

            Doc doc = docs.get(pubid);
            List<String> textList = doc.getText();
            //aspect id
            for (int i = 0; i < textList.size(); i++) {
                //TODO:: remove URL, @XXX and non-ASCII characters
                //DONE
                String text = textList.get(i);
                //remove url
                String t1 = text.replaceAll(
                        "(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]", "");
                //remove non-ascii
                String t2 = t1.replaceAll("[^\\x00-\\x7F]", "");

                //               stemmed text token stream
                TokenStream tokenStream = getStemmer().tokenStream("", new StringReader(t2));

                //                    for (Token tok = tokenStream.next(); tok != null; tok = tokenStream.next()) {
                while (tokenStream.incrementToken()) {
                    String token = tokenStream.reflectAsString(false);
                    int termBegin = 5;
                    int termEnd = token.indexOf(",");
                    final String wordstem = token.substring(termBegin, termEnd);

                    //w id
                    Integer vocabIndex = vocabulary.get(wordstem);
                    if (vocabIndex != null) {
                        //instance.add(vocabIndex, bugid, 1.0);
                        apsectDocMatrix[vocabIndex][i][bugid] += 1.0;
                    }
                }
            }
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return apsectDocMatrix;
}

From source file:nmsu.cs.DocParser.java

License:Open Source License

/**
 * @param docs
 * @param vocabulary
 * @param vocabInverse
 * @param pub2bugsIds
 * @param dim1size
 * @param dim2size
 * @return
 */
public static double[][] calculateWordDocumentMatrix(Map<Integer, Doc> docs, Map<String, Integer> vocabulary,
        List<String> vocabInverse, BidiMap pub2bugsIds, final int dim1size, final int dim2size) {

    double[][] wordDocMatrix = new double[dim1size][dim2size];
    try {
        assert (dim1size == vocabInverse.size()) : "dimensions do not match in dim 1: dim1size=" + dim1size;
        //assert (dim2size == Collections.<Integer>max(pub2bugsIds.values()) + 1) :
        //        "max of pub2bugsIds does not match in dim 2: dim1size=" + dim2size + 
        //        "  Collections.<Integer>max(pub2bugsIds.values()=" + Collections.<Integer>max(pub2bugsIds.values());

        for (Object pubid_ : pub2bugsIds.keySet()) {
            int pubid = (Integer) pubid_;
            int bugid = getPubid2BugsId(pubid, pub2bugsIds);

            Doc doc = docs.get(pubid);
            String text = doc.getFullText();
            assert (text.length() > 1);

            //remove url
            String t1 = text
                    .replaceAll("(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]", "");
            //remove non-ascii
            String t2 = t1.replaceAll("[^\\x00-\\x7F]", "");

            TokenStream tokenStream = getStemmer().tokenStream("", new StringReader(t2));
            //             for (Token tok = tokenStream.next(); tok != null; tok = tokenStream.next()) {
            //                final String wordstem = tok.termText();
            while (tokenStream.incrementToken()) {
                String token = tokenStream.reflectAsString(false);
                int termBegin = 5;
                int termEnd = token.indexOf(",");
                final String wordstem = token.substring(termBegin, termEnd);
                Integer vocabIndex = vocabulary.get(wordstem);
                if (vocabIndex != null) {
                    //instance.add(vocabIndex, bugid, 1.0);
                    wordDocMatrix[vocabIndex][bugid] += 1.0;
                }
            }
        }

        return wordDocMatrix;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}

From source file:nmsu.cs.DocParser.java

License:Open Source License

public static Map<Integer, Map<Integer, Double>> calculateWordDocumentMap(Map<Integer, Doc> docs,
        Map<String, Integer> vocabulary, List<String> vocabInverse, BidiMap pub2bugsIds, final int dim1size,
        final int dim2size) {

    Map<Integer, Map<Integer, Double>> wordDocMap = new HashMap<Integer, Map<Integer, Double>>();
    try {
        assert (dim1size == vocabInverse.size()) : "dimensions do not match in dim 1: dim1size=" + dim1size;
        //assert (dim2size == Collections.<Integer>max(pub2bugsIds.values()) + 1) :
        //        "max of pub2bugsIds does not match in dim 2: dim1size=" + dim2size + 
        //        "  Collections.<Integer>max(pub2bugsIds.values()=" + Collections.<Integer>max(pub2bugsIds.values());

        for (Object pubid_ : pub2bugsIds.keySet()) {
            int pubid = (Integer) pubid_;
            int bugid = getPubid2BugsId(pubid, pub2bugsIds);

            Doc doc = docs.get(pubid);
            String text = doc.getFullText();
            assert (text.length() > 1);

            //remove url
            String t1 = text
                    .replaceAll("(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]", "");
            //remove non-ascii
            String t2 = t1.replaceAll("[^\\x00-\\x7F]", "");

            TokenStream tokenStream = getStemmer().tokenStream("", new StringReader(t2));
            //             for (Token tok = tokenStream.next(); tok != null; tok = tokenStream.next()) {
            //                final String wordstem = tok.termText();
            while (tokenStream.incrementToken()) {
                String token = tokenStream.reflectAsString(false);
                int termBegin = 5;
                int termEnd = token.indexOf(",");
                final String wordstem = token.substring(termBegin, termEnd);
                Integer vocabIndex = vocabulary.get(wordstem);
                if (vocabIndex != null) {
                    //instance.add(vocabIndex, bugid, 1.0);
                    Double oldValue = Util.get2Map(wordDocMap, vocabIndex, bugid);
                    Util.update2Map(wordDocMap, vocabIndex, bugid, oldValue + 1.0);
                }
            }
        }

        return wordDocMap;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}

From source file:nmsu.cs.DocParser.java

License:Open Source License

/**
 * Huiping noted 2012-02-29
 * 
 * Extract vocabularies from all documents and Build TF array, DF array
 * 
 */
public void createVocabulary(Map<Integer, Doc> docs, int minWordsOccurence,
        //Map<String, Integer> vocabularyMap,
        List<String> vocabInverse) {
    vocabularyMap.clear();
    docInverse = hashDocs(new ArrayList<Doc>(docs.values()));
    int corpus_length = 0;
    try {
        // calculate origVocabulary
        //Map from vocabularity to its index position
        origVocabulary = new HashMap<String, Integer>();
        //word ordered by word index 
        origVocabInverse = new ArrayList<String>();
        //list of documents containing word, ordered by word index
        setOrigVocabInverse2Doc(new ArrayList<List<Integer>>());
        // each element holds a list of the frequency with which a word occurs in the document.
        // order is the same as above
        origVocabInverse2DocFrequency = new ArrayList<List<Integer>>();

        assert (docs.size() > 0);

        for (int pubid : docs.keySet()) {
            Doc doc = docs.get(pubid);
            String text = doc.getFullText();
            assert (text.length() > 1);

            //remove url
            String t1 = text
                    .replaceAll("(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]", "");
            //remove non-ascii
            String t2 = t1.replaceAll("[^\\x00-\\x7F]", "");

            TokenStream tokenStream = getStemmer().tokenStream("", new StringReader(t2));
            //             for (Token tok = tokenStream.next(); tok != null; tok = tokenStream.next()) {
            //                final String wordstem = tok.termText();
            while (tokenStream.incrementToken()) {
                corpus_length++;
                String token = tokenStream.reflectAsString(false);
                int termBegin = 5;
                int termEnd = token.indexOf(",");
                final String wordstem = token.substring(termBegin, termEnd);
                // get the vocab index
                Integer vocabIndex = origVocabulary.get(wordstem);
                if (vocabIndex == null) {
                    vocabIndex = origVocabulary.size();
                    origVocabulary.put(wordstem, vocabIndex); // hash

                    origVocabInverse.add(wordstem); // add to list
                    getOrigVocabInverse2Doc().add(new ArrayList<Integer>(Collections.singleton(pubid))); // add to document hash
                    origVocabInverse2DocFrequency.add(new ArrayList<Integer>(Collections.singleton(1))); // add to document hash
                    assert (vocabIndex == origVocabInverse.size() - 1) : "vocabIndex = " + vocabIndex
                            + " origVocabInverse.size()-1=" + (origVocabInverse.size() - 1);
                    assert (vocabIndex == getOrigVocabInverse2Doc().size() - 1) : "vocabIndex =" + vocabIndex
                            + " origVocabInverse2Doc.size()-1=" + (getOrigVocabInverse2Doc().size() - 1);
                    assert (getOrigVocabInverse2Doc().get(vocabIndex)
                            .size() == 1) : "origVocabInverse2Doc.get(vocabIndex).size()="
                                    + getOrigVocabInverse2Doc().get(vocabIndex).size();
                    assert (origVocabInverse2DocFrequency.get(vocabIndex)
                            .size() == 1) : "origVocabInverse2DocFrequency.get(vocabIndex).size()="
                                    + origVocabInverse2DocFrequency.get(vocabIndex).size();
                } else {
                    assert (origVocabInverse.get(origVocabulary.get(wordstem))
                            .equals(wordstem)) : "Inverse vocabulary broken. wordstem=" + wordstem
                                    + " vocabulary.get(wordstem)=" + origVocabulary.get(wordstem)
                                    + " origVocabInverse.get(" + origVocabulary.get(wordstem) + ")="
                                    + origVocabInverse.get(origVocabulary.get(wordstem)) + ")";
                    // add 1 to the frequencyList of (vocabIndex,doc)
                    List<Integer> doclist = getOrigVocabInverse2Doc().get(vocabIndex);
                    List<Integer> freqlist = origVocabInverse2DocFrequency.get(vocabIndex);
                    int docindex1 = doclist.indexOf(pubid);
                    if (docindex1 < 0) {
                        docindex1 = doclist.size();
                        doclist.add(pubid);
                    }

                    while (docindex1 >= freqlist.size()) {
                        freqlist.add(0);
                    }
                    int docindex = docindex1;
                    final int oldFreq = freqlist.get(docindex);
                    freqlist.set(docindex, oldFreq + 1);

                    assert (getOrigVocabInverse2Doc().size() == origVocabInverse2DocFrequency.size());
                    assert (origVocabInverse2DocFrequency.get(vocabIndex).get(docindex) > 0);

                }
            }
        }

        // calculate (real) vocabulary (use by the inference algorithm)
        // throw out words that occur only within one document (and thus achieve no coupling)

        assert (getOrigVocabInverse2Doc().size() > 1);
        assert (origVocabInverse.size() > 1);

        //vocabulary = new HashMap<String, Integer>();
        //vocabInverse = new ArrayList<String>();
        for (int w = 0; w < getOrigVocabInverse2Doc().size(); w++) {
            if (getOrigVocabInverse2Doc().get(w).size() >= minWordsOccurence) {
                int vocaIndex = vocabInverse.size();
                vocabInverse.add(origVocabInverse.get(w));
                vocabularyMap.put(origVocabInverse.get(w), vocaIndex);
            }
        }

        //            System.out.println(Debugger.getCallerPosition()+"vocabInverse (" + vocabInverse.size() + ") " + vocabInverse);
        //            System.out.println(Debugger.getCallerPosition()+"origVocabInverse (" + origVocabInverse.size() + ")" + origVocabInverse);
        //System.out.println(Debugger.getCallerPosition()+Arrays.toString(vocabInverse.toArray()));

        assert (vocabInverse.size() > 0);
    } catch (IOException e) {
        throw new RuntimeException(e); //todo handle
    }

    Constant.tokenNum = vocabularyMap.size();
    System.out.println(Debugger.getCallerPosition() + " corpus length is " + corpus_length);
}

From source file:org.alfresco.repo.search.impl.lucene.analysis.PathTokenFilterTest.java

License:Open Source License

private void tokenise(TokenStream ts, String[] tokens) throws IOException {
    int i = 0;

    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);

    try {
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println("token: " + ts.reflectAsString(true));

            String termText = termAtt.toString();

            if (typeAtt.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAMESPACE)) {
                assert (i % 2 == 0);
                assertEquals(termText, tokens[i++]);
            } else if (typeAtt.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAMESPACE_PREFIX)) {
                assert (i % 2 == 0);
                assertEquals(termText, tokens[i++]);
            } else if (typeAtt.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAME)) {
                assert (i % 2 == 1);
                assertEquals(termText, tokens[i++]);
            }
        }
        ts.end();
    } finally {
        ts.close();
    }

    if (i != tokens.length) {
        fail("Invalid number of tokens, found " + i + " and expected " + tokens.length);
    }
}

From source file:org.grouplens.samantha.modeler.featurizer.FeatureExtractorUtilities.java

License:Open Source License

static public Map<String, Integer> getTermFreq(Analyzer analyzer, String text, String termField) {
    TokenStream ts = analyzer.tokenStream(termField, text);
    Map<String, Integer> termFreq = new HashMap<>();
    try {
        ts.reset();
        while (ts.incrementToken()) {
            String term = ts.reflectAsString(false);
            int cnt = termFreq.getOrDefault(term, 0);
            termFreq.put(term, cnt + 1);
        }
        ts.close();
    } catch (IOException e) {
        logger.error("{}", e.getMessage());
        throw new BadRequestException(e);
    }
    return termFreq;
}
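
Note that with prependAttClass=false the reflected string contains every attribute on the stream, not just the term, so if the analysis chain adds offset or position attributes the map keys built above will differ even for identical terms. When plain per-term counts are wanted, a sketch along the following lines (not part of the original source; the method name is hypothetical) keys on the CharTermAttribute instead:

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Sketch only: frequencies keyed on the bare term rather than the full reflected string.
static Map<String, Integer> termFrequencies(Analyzer analyzer, String text, String field) throws IOException {
    Map<String, Integer> termFreq = new HashMap<>();
    try (TokenStream ts = analyzer.tokenStream(field, text)) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            termFreq.merge(termAtt.toString(), 1, Integer::sum);
        }
        ts.end();   // try-with-resources then calls close(), even if an exception was thrown
    }
    return termFreq;
}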