List of usage examples for org.apache.lucene.analysis TokenStream addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
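Before the collected examples, a minimal, self-contained sketch of the usual addAttribute pattern, written against a recent Lucene API; the class name, the field name "body", and the sample text are placeholders, not taken from any of the sources below. Attributes are requested before the stream is consumed, and the same attribute instances are updated on every call to incrementToken().

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class AddAttributeSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // addAttribute() is called before consuming the stream; it returns the
        // existing instance if the attribute has already been added.
        try (TokenStream stream = analyzer.tokenStream("body", "The quick brown fox")) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
            stream.reset();                  // mandatory before incrementToken()
            while (stream.incrementToken()) {
                System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + "]");
            }
            stream.end();                    // record final offset state
        }                                    // try-with-resources closes the stream
        analyzer.close();
    }
}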
From source file: lia.chapter4.AnalyzerUtils.java
License: Apache License
public static void displayPositionIncrements(Analyzer analyzer, String text) throws IOException {
    TokenStream stream = analyzer.tokenStream("text", text);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    stream.reset(); // required before incrementToken() in current Lucene versions
    while (stream.incrementToken()) {
        System.out.println("posIncr=" + posIncr.getPositionIncrement());
    }
}
From source file: lia.chapter4.SimpleAnalyzer.java
License: Apache License
public static void main(String[] args) throws IOException {
    TokenStream tokenStream = new SimpleAnalyzer().tokenStream("text", "The quick brown fox..");
    OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        int startOffset = offsetAttribute.startOffset();
        System.out.println(startOffset);
        int endOffset = offsetAttribute.endOffset();
        System.out.println(endOffset);
        String term = charTermAttribute.toString();
        System.out.println(term);
    }

    /*AnalyzerUtils.displayTokensWithFullDetails(new SimpleAnalyzer(), "The quick brown fox....");*/
}
From source file: lucandra.IndexWriter.java
License: Apache License
@SuppressWarnings("unchecked")
public void addDocument(Document doc, Analyzer analyzer) throws CorruptIndexException, IOException {

    List<String> allIndexedTerms = new ArrayList<String>();

    // check for special field name
    String docId = doc.get(CassandraUtils.documentIdField);

    if (docId == null)
        docId = Long.toHexString((long) (System.nanoTime() + (Math.random() * System.nanoTime())));

    int position = 0;

    for (Fieldable field : (List<Fieldable>) doc.getFields()) {

        // Untokenized fields go in without a termPosition
        if (field.isIndexed() && !field.isTokenized()) {

            String term = CassandraUtils.createColumnName(field.name(), field.stringValue());
            allIndexedTerms.add(term);

            String key = indexName + CassandraUtils.delimeter + term;

            Map<String, List<Number>> termMap = new HashMap<String, List<Number>>();
            termMap.put(CassandraUtils.termFrequencyKey, CassandraUtils.emptyArray);
            termMap.put(CassandraUtils.positionVectorKey, CassandraUtils.emptyArray);

            CassandraUtils.addToMutationMap(getMutationMap(), CassandraUtils.termVecColumnFamily,
                    docId.getBytes("UTF-8"), CassandraUtils.hashKey(key), null, termMap);

        } else if (field.isIndexed()) {

            TokenStream tokens = field.tokenStreamValue();

            if (tokens == null) {
                tokens = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
            }

            // collect term information per field
            Map<String, Map<String, List<Number>>> allTermInformation = new HashMap<String, Map<String, List<Number>>>();

            int lastOffset = 0;
            if (position > 0) {
                position += analyzer.getPositionIncrementGap(field.name());
            }

            // Build the termPositions vector for all terms
            tokens.reset(); // reset the TokenStream to the first token

            // set up token attributes we are working on

            // offsets
            OffsetAttribute offsetAttribute = null;
            if (field.isStoreOffsetWithTermVector())
                offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class);

            // positions
            PositionIncrementAttribute posIncrAttribute = null;
            if (field.isStorePositionWithTermVector())
                posIncrAttribute = (PositionIncrementAttribute) tokens
                        .addAttribute(PositionIncrementAttribute.class);

            TermAttribute termAttribute = (TermAttribute) tokens.addAttribute(TermAttribute.class);

            // store normalizations of field per term per document rather
            // than per field.
            // this adds more to write but less to read on other side
            Integer tokensInField = new Integer(0);

            while (tokens.incrementToken()) {
                tokensInField++;
                String term = CassandraUtils.createColumnName(field.name(), termAttribute.term());

                allIndexedTerms.add(term);

                // fetch all collected information for this term
                Map<String, List<Number>> termInfo = allTermInformation.get(term);

                if (termInfo == null) {
                    termInfo = new HashMap<String, List<Number>>();
                    allTermInformation.put(term, termInfo);
                }

                // term frequency
                {
                    List<Number> termFrequency = termInfo.get(CassandraUtils.termFrequencyKey);

                    if (termFrequency == null) {
                        termFrequency = new ArrayList<Number>();
                        termFrequency.add(new Integer(0));
                        termInfo.put(CassandraUtils.termFrequencyKey, termFrequency);
                    }

                    // increment
                    termFrequency.set(0, termFrequency.get(0).intValue() + 1);
                }

                // position vector
                if (field.isStorePositionWithTermVector()) {
                    position += (posIncrAttribute.getPositionIncrement() - 1);

                    List<Number> positionVector = termInfo.get(CassandraUtils.positionVectorKey);

                    if (positionVector == null) {
                        positionVector = new ArrayList<Number>();
                        termInfo.put(CassandraUtils.positionVectorKey, positionVector);
                    }

                    positionVector.add(++position);
                }

                // term offsets
                if (field.isStoreOffsetWithTermVector()) {

                    List<Number> offsetVector = termInfo.get(CassandraUtils.offsetVectorKey);
                    if (offsetVector == null) {
                        offsetVector = new ArrayList<Number>();
                        termInfo.put(CassandraUtils.offsetVectorKey, offsetVector);
                    }

                    offsetVector.add(lastOffset + offsetAttribute.startOffset());
                    offsetVector.add(lastOffset + offsetAttribute.endOffset());
                }
            }

            List<Number> bnorm = null;
            if (!field.getOmitNorms()) {
                bnorm = new ArrayList<Number>();
                float norm = doc.getBoost();
                norm *= field.getBoost();
                norm *= similarity.lengthNorm(field.name(), tokensInField);
                bnorm.add(Similarity.encodeNorm(norm));
            }

            for (Map.Entry<String, Map<String, List<Number>>> term : allTermInformation.entrySet()) {

                // Terms are stored within a unique key combination
                // This is required since cassandra loads all columns
                // in a key/column family into memory
                String key = indexName + CassandraUtils.delimeter + term.getKey();

                // Mix in the norm for this field alongside each term
                // more writes but faster on read side.
                if (!field.getOmitNorms()) {
                    term.getValue().put(CassandraUtils.normsKey, bnorm);
                }

                CassandraUtils.addToMutationMap(getMutationMap(), CassandraUtils.termVecColumnFamily,
                        docId.getBytes("UTF-8"), CassandraUtils.hashKey(key), null, term.getValue());
            }
        }

        // Stores each field as a column under this doc key
        if (field.isStored()) {

            byte[] _value = field.isBinary() ? field.getBinaryValue() : field.stringValue().getBytes("UTF-8");

            // first byte flags if binary or not
            byte[] value = new byte[_value.length + 1];
            System.arraycopy(_value, 0, value, 0, _value.length);

            value[value.length - 1] = (byte) (field.isBinary() ? Byte.MAX_VALUE : Byte.MIN_VALUE);

            String key = indexName + CassandraUtils.delimeter + docId;

            CassandraUtils.addToMutationMap(getMutationMap(), CassandraUtils.docColumnFamily,
                    field.name().getBytes("UTF-8"), CassandraUtils.hashKey(key), value, null);
        }
    }

    // Finally, store meta-data so we can delete this document
    String key = indexName + CassandraUtils.delimeter + docId;
    CassandraUtils.addToMutationMap(getMutationMap(), CassandraUtils.docColumnFamily,
            CassandraUtils.documentMetaField.getBytes("UTF-8"), CassandraUtils.hashKey(key),
            CassandraUtils.toBytes(allIndexedTerms), null);

    if (autoCommit)
        CassandraUtils.robustBatchInsert(client, getMutationMap());
}
From source file: lucli.LuceneMethods.java
License: Apache License
private void invertDocument(Document doc) throws IOException {

    Map tokenMap = new HashMap();
    final int maxFieldLength = 10000;

    Analyzer analyzer = createAnalyzer();
    Iterator fields = doc.getFields().iterator();
    final Token reusableToken = new Token();
    while (fields.hasNext()) {
        Field field = (Field) fields.next();
        String fieldName = field.name();

        if (field.isIndexed()) {
            if (field.isTokenized()) { // tokenized field
                Reader reader; // find or make Reader
                if (field.readerValue() != null)
                    reader = field.readerValue();
                else if (field.stringValue() != null)
                    reader = new StringReader(field.stringValue());
                else
                    throw new IllegalArgumentException("field must have either String or Reader value");

                int position = 0;
                // Tokenize field and add to postingTable
                TokenStream stream = analyzer.tokenStream(fieldName, reader);
                TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
                PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream
                        .addAttribute(PositionIncrementAttribute.class);

                try {
                    while (stream.incrementToken()) {
                        position += (posIncrAtt.getPositionIncrement() - 1);
                        position++;
                        String name = termAtt.term();
                        Integer Count = (Integer) tokenMap.get(name);
                        if (Count == null) { // not in there yet
                            tokenMap.put(name, new Integer(1)); // first one
                        } else {
                            int count = Count.intValue();
                            tokenMap.put(name, new Integer(count + 1));
                        }
                        if (position > maxFieldLength)
                            break;
                    }
                } finally {
                    stream.close();
                }
            }
        }
    }
    Entry[] sortedHash = getSortedMapEntries(tokenMap);
    for (int ii = 0; ii < sortedHash.length && ii < 10; ii++) {
        Entry currentEntry = sortedHash[ii];
        message((ii + 1) + ":" + currentEntry.getKey() + " " + currentEntry.getValue());
    }
}
From source file: mahout.classifier.Classifier.java
License: Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Arguments: [model] [label index] [dictionary] [document frequency] [tweet file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tweetsPath = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);

    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));

    // analyzer used to extract words from tweets
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from the tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }
        ts.end();
        ts.close(); // release the reused stream before the next analyzer.tokenStream() call

        // create vector wordId => weight using tf-idf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }

        // The classifier returns one score per label; the label with the
        // highest score is the one the tweet is most likely associated with.
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
        }
        System.out.println(labels.get(bestCategoryId) + "\t" + tweet);
    }
    analyzer.close();
    reader.close();
}
From source file: me.smoe.adar.analyzer.luence.AnalyzerToy.java
License: Apache License
public static void analyzerByStop(String sentence) throws Exception {
    Analyzer analyzer = new StopAnalyzer();

    TokenStream tokenStream = analyzer.tokenStream(StringUtils.EMPTY, new StringReader(sentence));
    tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();

    while (tokenStream.incrementToken()) {
        CharTermAttribute charTermAttribute = (CharTermAttribute) tokenStream
                .getAttribute(CharTermAttribute.class);
        System.out.print(charTermAttribute.toString() + " ,");
    }

    analyzer.close();
}
From source file: me.smoe.adar.analyzer.luence.AnalyzerToy.java
License: Apache License
public static Set<String> analyzerByStandard(String sentence) throws Exception {
    Analyzer analyzer = new StandardAnalyzer();
    try {
        TokenStream tokenStream = analyzer.tokenStream(StringUtils.EMPTY, new StringReader(sentence));
        tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();

        Set<String> words = new HashSet<>();
        while (tokenStream.incrementToken()) {
            words.add(((CharTermAttribute) tokenStream.getAttribute(CharTermAttribute.class)).toString());
        }
        return words;
    } finally {
        analyzer.close();
    }
}
From source file: me.smoe.adar.utils.cam.o.common.SentenceAnalyzer.java
License: Apache License
public static Set<String> analyzer(String sentence) throws Exception {
    if (StringUtils.isEmpty(sentence)) {
        return Collections.emptySet();
    }

    Analyzer analyzer = new StandardAnalyzer();
    try {
        TokenStream tokenStream = analyzer.tokenStream(StringUtils.EMPTY, new StringReader(sentence));
        tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();

        Set<String> words = new LinkedHashSet<>();
        while (tokenStream.incrementToken()) {
            String word = ((CharTermAttribute) tokenStream.getAttribute(CharTermAttribute.class)).toString();
            if (word.length() <= 1) {
                continue;
            }
            words.add(word);
        }
        return words;
    } finally {
        analyzer.close();
    }
}
From source file: modnlp.idx.inverted.TokeniserJPLucene.java
License: Open Source License
public void tokenise() throws IOException {

    String ignregexp = "--+|\\.\\.+|\\.+\\p{Space}"; // delete full stops and dashes (typically not used).
    if (ignoredElements != null && ignoredElements.length() > 0)
        ignregexp = ignregexp + "|< *" + ignoredElements + "[^>]*?/>" + "|< *" + ignoredElements + ".*?>.*?</"
                + ignoredElements + " *>";
    if (!tagIndexing)
        ignregexp = ignregexp + "|<.*?>";
    //ignregexp = ignregexp+"|\\W\\W+";

    Pattern p = Pattern.compile(ignregexp);
    Matcher igns = p.matcher(originalText);

    StringBuffer tx = new StringBuffer(originalText);

    int ct = 1;
    while (igns.find()) {
        int s = igns.start();
        int e = igns.end();
        if (verbose)
            PrintUtil.printNoMove("Processing exclusions ...", ct++);
        //System.err.println("replacing\n-----------"+originalText.substring(s,e)+"\n--------------");
        char sp[] = new char[e - s];
        for (int j = 0; j < sp.length; j++) {
            sp[j] = ' ';
        }
        tx.replace(s, e, new String(sp));
    }
    if (verbose)
        PrintUtil.donePrinting();
    ct = 1;
    //verbose = false;

    String text = new String(tx);
    //System.out.println("-->"+text+"<--");

    Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(text), null, true,
            org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH);
    TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
    //stream = new JapanesePartOfSpeechStopFilter(true, stream, stoptags);
    stream = new CJKWidthFilter(stream);
    //stream = new StopFilter(matchVersion, stream, stopwords);
    stream = new JapaneseKatakanaStemFilter(stream);
    //stream = new LowerCaseFilter(matchVersion, stream);

    OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);

    while (stream.incrementToken()) {
        int startOffset = offsetAttribute.startOffset();
        int endOffset = offsetAttribute.endOffset();
        String token = charTermAttribute.toString();
        tokenMap.putPos(token, startOffset);
        //System.out.println(token.str+" \t\tS="+token.start+" E="+token.end);
    }
    if (verbose)
        PrintUtil.donePrinting();
    ct = 1;
}
From source file: modnlp.idx.inverted.TokeniserJPLucene.java
License: Open Source License
public List<String> split(String s) {
    ArrayList<String> ret = new ArrayList<String>();

    try {
        Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(s), null, true,
                org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH);
        TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
        //stream = new JapanesePartOfSpeechStopFilter(true, stream, stoptags);
        stream = new CJKWidthFilter(stream);
        //stream = new StopFilter(matchVersion, stream, stopwords);
        stream = new JapaneseKatakanaStemFilter(stream);
        //stream = new LowerCaseFilter(matchVersion, stream);

        OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class);
        CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);

        while (stream.incrementToken()) {
            int startOffset = offsetAttribute.startOffset();
            int endOffset = offsetAttribute.endOffset();
            String token = charTermAttribute.toString();
            ret.add(token);
            //System.out.println(token.str+" \t\tS="+token.start+" E="+token.end);
        }
    } catch (java.io.IOException e) {
        System.err.println(e);
    }
    return ret;
}